From a1b3ec23f8884a77cd2def23ace3904d1e2d8788 Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 12 Apr 2019 15:04:10 +0200 Subject: [PATCH 01/73] bonding: fix event handling for stacked bonds [ Upstream commit 92480b3977fd3884649d404cbbaf839b70035699 ] When a bond is enslaved to another bond, bond_netdev_event() only handles the event as if the bond is a master, and skips treating the bond as a slave. This leads to a refcount leak on the slave, since we don't remove the adjacency to its master and the master holds a reference on the slave. Reproducer: ip link add bondL type bond ip link add bondU type bond ip link set bondL master bondU ip link del bondL No "Fixes:" tag, this code is older than git history. Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/bonding/bond_main.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 99e60bb5fe07..1edd4ff5382c 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3169,8 +3169,12 @@ static int bond_netdev_event(struct notifier_block *this, return NOTIFY_DONE; if (event_dev->flags & IFF_MASTER) { + int ret; + netdev_dbg(event_dev, "IFF_MASTER\n"); - return bond_master_netdev_event(event, event_dev); + ret = bond_master_netdev_event(event, event_dev); + if (ret != NOTIFY_DONE) + return ret; } if (event_dev->flags & IFF_SLAVE) { From 9fac50c39253b6bef74c2cfc2244a93c4d5a0896 Mon Sep 17 00:00:00 2001 From: "Gustavo A. R. Silva" Date: Mon, 15 Apr 2019 15:57:23 -0500 Subject: [PATCH 02/73] net: atm: Fix potential Spectre v1 vulnerabilities [ Upstream commit 899537b73557aafbdd11050b501cf54b4f5c45af ] arg is controlled by user-space, hence leading to a potential exploitation of the Spectre variant 1 vulnerability. This issue was detected with the help of Smatch: net/atm/lec.c:715 lec_mcast_attach() warn: potential spectre issue 'dev_lec' [r] (local cap) Fix this by sanitizing arg before using it to index dev_lec. Notice that given that speculation windows are large, the policy is to kill the speculation on the first load and not worry if it can be completed with a dependent load/store [1]. [1] https://lore.kernel.org/lkml/20180423164740.GY17484@dhcp22.suse.cz/ Signed-off-by: Gustavo A. R. Silva Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/atm/lec.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/atm/lec.c b/net/atm/lec.c index 9f2365694ad4..85ce89c8a35c 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -710,7 +710,10 @@ static int lec_vcc_attach(struct atm_vcc *vcc, void __user *arg) static int lec_mcast_attach(struct atm_vcc *vcc, int arg) { - if (arg < 0 || arg >= MAX_LEC_ITF || !dev_lec[arg]) + if (arg < 0 || arg >= MAX_LEC_ITF) + return -EINVAL; + arg = array_index_nospec(arg, MAX_LEC_ITF); + if (!dev_lec[arg]) return -EINVAL; vcc->proto_data = dev_lec[arg]; return lec_mcast_make(netdev_priv(dev_lec[arg]), vcc); @@ -728,6 +731,7 @@ static int lecd_attach(struct atm_vcc *vcc, int arg) i = arg; if (arg >= MAX_LEC_ITF) return -EINVAL; + i = array_index_nospec(arg, MAX_LEC_ITF); if (!dev_lec[i]) { int size; From a1a9b69e2605db1edea34bb3a6051cf845b26bc1 Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 11 Apr 2019 13:56:39 +0300 Subject: [PATCH 03/73] net: bridge: fix per-port af_packet sockets [ Upstream commit 3b2e2904deb314cc77a2192f506f2fd44e3d10d0 ] When the commit below was introduced it changed two visible things: - the skb was no longer passed through the protocol handlers with the original device - the skb was passed up the stack with skb->dev = bridge The first change broke af_packet sockets on bridge ports. For example we use them for hostapd which listens for ETH_P_PAE packets on the ports. We discussed two possible fixes: - create a clone and pass it through NF_HOOK(), act on the original skb based on the result - somehow signal to the caller from the okfn() that it was called, meaning the skb is ok to be passed, which this patch is trying to implement via returning 1 from the bridge link-local okfn() Note that we rely on the fact that NF_QUEUE/STOLEN would return 0 and drop/error would return < 0 thus the okfn() is called only when the return was 1, so we signal to the caller that it was called by preserving the return value from nf_hook(). Fixes: 8626c56c8279 ("bridge: fix potential use-after-free when hook returns QUEUE or STOLEN verdict") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_input.c | 23 ++++++++++++++--------- 1 file changed, 14 insertions(+), 9 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 7637f58c1226..10fa84056cb5 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -236,13 +236,10 @@ static void __br_handle_local_finish(struct sk_buff *skb) /* note: already called with rcu_read_lock */ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_buff *skb) { - struct net_bridge_port *p = br_port_get_rcu(skb->dev); - __br_handle_local_finish(skb); - BR_INPUT_SKB_CB(skb)->brdev = p->br->dev; - br_pass_frame_up(skb); - return 0; + /* return 1 to signal the okfn() was called so it's ok to use the skb */ + return 1; } /* @@ -318,10 +315,18 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb) goto forward; } - /* Deliver packet to local host only */ - NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev), - NULL, skb, skb->dev, NULL, br_handle_local_finish); - return RX_HANDLER_CONSUMED; + /* The else clause should be hit when nf_hook(): + * - returns < 0 (drop/error) + * - returns = 0 (stolen/nf_queue) + * Thus return 1 from the okfn() to signal the skb is ok to pass + */ + if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, + dev_net(skb->dev), NULL, skb, skb->dev, NULL, + br_handle_local_finish) == 1) { + return RX_HANDLER_PASS; + } else { + return RX_HANDLER_CONSUMED; + } } forward: From c1e7e01e2edb90678e26f100fae0dc50024d139b Mon Sep 17 00:00:00 2001 From: Nikolay Aleksandrov Date: Thu, 11 Apr 2019 15:08:25 +0300 Subject: [PATCH 04/73] net: bridge: multicast: use rcu to access port list from br_multicast_start_querier [ Upstream commit c5b493ce192bd7a4e7bd073b5685aad121eeef82 ] br_multicast_start_querier() walks over the port list but it can be called from a timer with only multicast_lock held which doesn't protect the port list, so use RCU to walk over it. Fixes: c83b8fab06fc ("bridge: Restart queries when last querier expires") Signed-off-by: Nikolay Aleksandrov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_multicast.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 8dc5c8d69bcd..e83048cb53ce 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2119,7 +2119,8 @@ static void br_multicast_start_querier(struct net_bridge *br, __br_multicast_open(br, query); - list_for_each_entry(port, &br->port_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(port, &br->port_list, list) { if (port->state == BR_STATE_DISABLED || port->state == BR_STATE_BLOCKING) continue; @@ -2131,6 +2132,7 @@ static void br_multicast_start_querier(struct net_bridge *br, br_multicast_enable(&port->ip6_own_query); #endif } + rcu_read_unlock(); } int br_multicast_toggle(struct net_bridge *br, unsigned long val) From 8835f1c7d0fb0faf6df93c1715f13b9ef04a9e99 Mon Sep 17 00:00:00 2001 From: Lorenzo Bianconi Date: Tue, 9 Apr 2019 11:47:20 +0200 Subject: [PATCH 05/73] net: fou: do not use guehdr after iptunnel_pull_offloads in gue_udp_recv [ Upstream commit 988dc4a9a3b66be75b30405a5494faf0dc7cffb6 ] gue tunnels run iptunnel_pull_offloads on received skbs. This can determine a possible use-after-free accessing guehdr pointer since the packet will be 'uncloned' running pskb_expand_head if it is a cloned gso skb (e.g if the packet has been sent though a veth device) Fixes: a09a4c8dd1ec ("tunnels: Remove encapsulation offloads on decap") Signed-off-by: Lorenzo Bianconi Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/fou.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c index c9ec1603666b..665f11d7388e 100644 --- a/net/ipv4/fou.c +++ b/net/ipv4/fou.c @@ -120,6 +120,7 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) struct guehdr *guehdr; void *data; u16 doffset = 0; + u8 proto_ctype; if (!fou) return 1; @@ -211,13 +212,14 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb) if (unlikely(guehdr->control)) return gue_control_message(skb, guehdr); + proto_ctype = guehdr->proto_ctype; __skb_pull(skb, sizeof(struct udphdr) + hdrlen); skb_reset_transport_header(skb); if (iptunnel_pull_offloads(skb)) goto drop; - return -guehdr->proto_ctype; + return -proto_ctype; drop: kfree_skb(skb); From 07b1747f11af7cb3ed0903cd8bf3b5a382452ec4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 16 Apr 2019 10:55:20 -0700 Subject: [PATCH 06/73] tcp: tcp_grow_window() needs to respect tcp_space() [ Upstream commit 50ce163a72d817a99e8974222dcf2886d5deb1ae ] For some reason, tcp_grow_window() correctly tests if enough room is present before attempting to increase tp->rcv_ssthresh, but does not prevent it to grow past tcp_space() This is causing hard to debug issues, like failing the (__tcp_select_window(sk) >= tp->rcv_wnd) test in __tcp_ack_snd_check(), causing ACK delays and possibly slow flows. Depending on tcp_rmem[2], MTU, skb->len/skb->truesize ratio, we can see the problem happening on "netperf -t TCP_RR -- -r 2000,2000" after about 60 round trips, when the active side no longer sends immediate acks. This bug predates git history. Signed-off-by: Eric Dumazet Acked-by: Soheil Hassas Yeganeh Acked-by: Neal Cardwell Acked-by: Wei Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index c8227e07d574..657d33e2ff6a 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -389,11 +389,12 @@ static int __tcp_grow_window(const struct sock *sk, const struct sk_buff *skb) static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); + int room; + + room = min_t(int, tp->window_clamp, tcp_space(sk)) - tp->rcv_ssthresh; /* Check #1 */ - if (tp->rcv_ssthresh < tp->window_clamp && - (int)tp->rcv_ssthresh < tcp_space(sk) && - !tcp_under_memory_pressure(sk)) { + if (room > 0 && !tcp_under_memory_pressure(sk)) { int incr; /* Check #2. Increase window, if skb with such overhead @@ -406,8 +407,7 @@ static void tcp_grow_window(struct sock *sk, const struct sk_buff *skb) if (incr) { incr = max_t(int, incr, 2 * skb->len); - tp->rcv_ssthresh = min(tp->rcv_ssthresh + incr, - tp->window_clamp); + tp->rcv_ssthresh += min(room, incr); inet_csk(sk)->icsk_ack.quick |= 1; } } From b205097fae230acfe7a928e7f4875148568667bd Mon Sep 17 00:00:00 2001 From: Hangbin Liu Date: Mon, 8 Apr 2019 16:45:17 +0800 Subject: [PATCH 07/73] team: set slave to promisc if team is already in promisc mode [ Upstream commit 43c2adb9df7ddd6560fd3546d925b42cef92daa0 ] After adding a team interface to bridge, the team interface will enter promisc mode. Then if we add a new slave to team0, the slave will keep promisc off. Fix it by setting slave to promisc on if team master is already in promisc mode, also do the same for allmulti. v2: add promisc and allmulti checking when delete ports Fixes: 3d249d4ca7d0 ("net: introduce ethernet teaming device") Signed-off-by: Hangbin Liu Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/team/team.c | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index bb96153f496e..fea141e71705 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -1245,6 +1245,23 @@ static int team_port_add(struct team *team, struct net_device *port_dev) goto err_option_port_add; } + /* set promiscuity level to new slave */ + if (dev->flags & IFF_PROMISC) { + err = dev_set_promiscuity(port_dev, 1); + if (err) + goto err_set_slave_promisc; + } + + /* set allmulti level to new slave */ + if (dev->flags & IFF_ALLMULTI) { + err = dev_set_allmulti(port_dev, 1); + if (err) { + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(port_dev, -1); + goto err_set_slave_promisc; + } + } + netif_addr_lock_bh(dev); dev_uc_sync_multiple(port_dev, dev); dev_mc_sync_multiple(port_dev, dev); @@ -1261,6 +1278,9 @@ static int team_port_add(struct team *team, struct net_device *port_dev) return 0; +err_set_slave_promisc: + __team_option_inst_del_port(team, port); + err_option_port_add: team_upper_dev_unlink(team, port); @@ -1306,6 +1326,12 @@ static int team_port_del(struct team *team, struct net_device *port_dev) team_port_disable(team, port); list_del_rcu(&port->list); + + if (dev->flags & IFF_PROMISC) + dev_set_promiscuity(port_dev, -1); + if (dev->flags & IFF_ALLMULTI) + dev_set_allmulti(port_dev, -1); + team_upper_dev_unlink(team, port); netdev_rx_handler_unregister(port_dev); team_port_disable_netpoll(port); From f052bafe5476347fe9ae0665c4d2297ad1301e38 Mon Sep 17 00:00:00 2001 From: Jason Wang Date: Tue, 9 Apr 2019 12:10:25 +0800 Subject: [PATCH 08/73] vhost: reject zero size iova range [ Upstream commit 813dbeb656d6c90266f251d8bd2b02d445afa63f ] We used to accept zero size iova range which will lead a infinite loop in translate_desc(). Fixing this by failing the request in this case. Reported-by: syzbot+d21e6e297322a900c128@syzkaller.appspotmail.com Fixes: 6b1e6cc7 ("vhost: new device IOTLB API") Signed-off-by: Jason Wang Acked-by: Michael S. Tsirkin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/vhost/vhost.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index d7c22ae5c368..0e93ac888a5f 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -918,8 +918,12 @@ static int vhost_new_umem_range(struct vhost_umem *umem, u64 start, u64 size, u64 end, u64 userspace_addr, int perm) { - struct vhost_umem_node *tmp, *node = kmalloc(sizeof(*node), GFP_ATOMIC); + struct vhost_umem_node *tmp, *node; + if (!size) + return -EFAULT; + + node = kmalloc(sizeof(*node), GFP_ATOMIC); if (!node) return -ENOMEM; From 3d988fcddbe7b8673a231958bd2fba61b5a7ced9 Mon Sep 17 00:00:00 2001 From: Stephen Suryaputra Date: Fri, 12 Apr 2019 16:19:27 -0400 Subject: [PATCH 09/73] ipv4: recompile ip options in ipv4_link_failure [ Upstream commit ed0de45a1008991fdaa27a0152befcb74d126a8b ] Recompile IP options since IPCB may not be valid anymore when ipv4_link_failure is called from arp_error_report. Refer to the commit 3da1ed7ac398 ("net: avoid use IPCB in cipso_v4_error") and the commit before that (9ef6b42ad6fd) for a similar issue. Signed-off-by: Stephen Suryaputra Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a1bf87711bfa..9af1c0d4dea9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1195,8 +1195,16 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) static void ipv4_link_failure(struct sk_buff *skb) { struct rtable *rt; + struct ip_options opt; - icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0); + /* Recompile ip options since IPCB may not be valid anymore. + */ + memset(&opt, 0, sizeof(opt)); + opt.optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); + if (__ip_options_compile(dev_net(skb->dev), &opt, skb, NULL)) + return; + + __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); rt = skb_rtable(skb); if (rt) From 0a7e8300b7f4cf72a6f1c0c2799d92149465c414 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 13 Apr 2019 17:32:21 -0700 Subject: [PATCH 10/73] ipv4: ensure rcu_read_lock() in ipv4_link_failure() [ Upstream commit c543cb4a5f07e09237ec0fc2c60c9f131b2c79ad ] fib_compute_spec_dst() needs to be called under rcu protection. syzbot reported : WARNING: suspicious RCU usage 5.1.0-rc4+ #165 Not tainted include/linux/inetdevice.h:220 suspicious rcu_dereference_check() usage! other info that might help us debug this: rcu_scheduler_active = 2, debug_locks = 1 1 lock held by swapper/0/0: #0: 0000000051b67925 ((&n->timer)){+.-.}, at: lockdep_copy_map include/linux/lockdep.h:170 [inline] #0: 0000000051b67925 ((&n->timer)){+.-.}, at: call_timer_fn+0xda/0x720 kernel/time/timer.c:1315 stack backtrace: CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.1.0-rc4+ #165 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x172/0x1f0 lib/dump_stack.c:113 lockdep_rcu_suspicious+0x153/0x15d kernel/locking/lockdep.c:5162 __in_dev_get_rcu include/linux/inetdevice.h:220 [inline] fib_compute_spec_dst+0xbbd/0x1030 net/ipv4/fib_frontend.c:294 spec_dst_fill net/ipv4/ip_options.c:245 [inline] __ip_options_compile+0x15a7/0x1a10 net/ipv4/ip_options.c:343 ipv4_link_failure+0x172/0x400 net/ipv4/route.c:1195 dst_link_failure include/net/dst.h:427 [inline] arp_error_report+0xd1/0x1c0 net/ipv4/arp.c:297 neigh_invalidate+0x24b/0x570 net/core/neighbour.c:995 neigh_timer_handler+0xc35/0xf30 net/core/neighbour.c:1081 call_timer_fn+0x190/0x720 kernel/time/timer.c:1325 expire_timers kernel/time/timer.c:1362 [inline] __run_timers kernel/time/timer.c:1681 [inline] __run_timers kernel/time/timer.c:1649 [inline] run_timer_softirq+0x652/0x1700 kernel/time/timer.c:1694 __do_softirq+0x266/0x95a kernel/softirq.c:293 invoke_softirq kernel/softirq.c:374 [inline] irq_exit+0x180/0x1d0 kernel/softirq.c:414 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0x14a/0x570 arch/x86/kernel/apic/apic.c:1062 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:807 Fixes: ed0de45a1008 ("ipv4: recompile ip options in ipv4_link_failure") Signed-off-by: Eric Dumazet Reported-by: syzbot Cc: Stephen Suryaputra Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/route.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 9af1c0d4dea9..c64f062d6323 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1194,14 +1194,20 @@ static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie) static void ipv4_link_failure(struct sk_buff *skb) { - struct rtable *rt; struct ip_options opt; + struct rtable *rt; + int res; /* Recompile ip options since IPCB may not be valid anymore. */ memset(&opt, 0, sizeof(opt)); opt.optlen = ip_hdr(skb)->ihl*4 - sizeof(struct iphdr); - if (__ip_options_compile(dev_net(skb->dev), &opt, skb, NULL)) + + rcu_read_lock(); + res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL); + rcu_read_unlock(); + + if (res) return; __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt); From 6d5d30e03309fc22a48fc522b2e3f3c02c130092 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 11 Apr 2019 12:26:32 +0200 Subject: [PATCH 11/73] net: thunderx: raise XDP MTU to 1508 [ Upstream commit 5ee15c101f29e0093ffb5448773ccbc786eb313b ] The thunderx driver splits frames bigger than 1530 bytes to multiple pages, making impossible to run an eBPF program on it. This leads to a maximum MTU of 1508 if QinQ is in use. The thunderx driver forbids to load an eBPF program if the MTU is higher than 1500 bytes. Raise the limit to 1508 so it is possible to use L2 protocols which need some more headroom. Fixes: 05c773f52b96e ("net: thunderx: Add basic XDP support") Signed-off-by: Matteo Croce Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index 59b62b49ad48..bbe8add918b7 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -29,6 +29,13 @@ #define DRV_NAME "thunder-nicvf" #define DRV_VERSION "1.0" +/* NOTE: Packets bigger than 1530 are split across multiple pages and XDP needs + * the buffer to be contiguous. Allow XDP to be set up only if we don't exceed + * this value, keeping headroom for the 14 byte Ethernet header and two + * VLAN tags (for QinQ) + */ +#define MAX_XDP_MTU (1530 - ETH_HLEN - VLAN_HLEN * 2) + /* Supported devices */ static const struct pci_device_id nicvf_id_table[] = { { PCI_DEVICE_SUB(PCI_VENDOR_ID_CAVIUM, @@ -1702,8 +1709,10 @@ static int nicvf_xdp_setup(struct nicvf *nic, struct bpf_prog *prog) bool bpf_attached = false; int ret = 0; - /* For now just support only the usual MTU sized frames */ - if (prog && (dev->mtu > 1500)) { + /* For now just support only the usual MTU sized frames, + * plus some headroom for VLAN, QinQ. + */ + if (prog && dev->mtu > MAX_XDP_MTU) { netdev_warn(dev, "Jumbo frames not yet supported with XDP, current MTU %d.\n", dev->mtu); return -EOPNOTSUPP; From 4a692bc1d5fa706d9170404c34b3efa190ebf039 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Thu, 11 Apr 2019 12:26:33 +0200 Subject: [PATCH 12/73] net: thunderx: don't allow jumbo frames with XDP [ Upstream commit 1f227d16083b2e280b7dde4ca78883d75593f2fd ] The thunderx driver forbids to load an eBPF program if the MTU is too high, but this can be circumvented by loading the eBPF, then raising the MTU. Fix this by limiting the MTU if an eBPF program is already loaded. Fixes: 05c773f52b96e ("net: thunderx: Add basic XDP support") Signed-off-by: Matteo Croce Acked-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/cavium/thunder/nicvf_main.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c b/drivers/net/ethernet/cavium/thunder/nicvf_main.c index bbe8add918b7..98734a37b6f6 100644 --- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c +++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c @@ -1461,6 +1461,15 @@ static int nicvf_change_mtu(struct net_device *netdev, int new_mtu) struct nicvf *nic = netdev_priv(netdev); int orig_mtu = netdev->mtu; + /* For now just support only the usual MTU sized frames, + * plus some headroom for VLAN, QinQ. + */ + if (nic->xdp_prog && new_mtu > MAX_XDP_MTU) { + netdev_warn(netdev, "Jumbo frames not yet supported with XDP, current MTU %d.\n", + netdev->mtu); + return -EINVAL; + } + netdev->mtu = new_mtu; if (!netif_running(netdev)) From 53fc31a4853e30d6e8f142b824f724da27ff3e40 Mon Sep 17 00:00:00 2001 From: Aurelien Aptel Date: Fri, 29 Mar 2019 10:49:12 +0100 Subject: [PATCH 13/73] CIFS: keep FileInfo handle live during oplock break commit b98749cac4a695f084a5ff076f4510b23e353ecd upstream. In the oplock break handler, writing pending changes from pages puts the FileInfo handle. If the refcount reaches zero it closes the handle and waits for any oplock break handler to return, thus causing a deadlock. To prevent this situation: * We add a wait flag to cifsFileInfo_put() to decide whether we should wait for running/pending oplock break handlers * We keep an additionnal reference of the SMB FileInfo handle so that for the rest of the handler putting the handle won't close it. - The ref is bumped everytime we queue the handler via the cifs_queue_oplock_break() helper. - The ref is decremented at the end of the handler This bug was triggered by xfstest 464. Also important fix to address the various reports of oops in smb2_push_mandatory_locks Signed-off-by: Aurelien Aptel Signed-off-by: Steve French Reviewed-by: Pavel Shilovsky CC: Stable Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/file.c | 30 +++++++++++++++++++++++++----- fs/cifs/misc.c | 25 +++++++++++++++++++++++-- fs/cifs/smb2misc.c | 6 +++--- 4 files changed, 53 insertions(+), 10 deletions(-) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index f29cdb1cdeb7..7b7ab10a9db1 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1189,6 +1189,7 @@ cifsFileInfo_get_locked(struct cifsFileInfo *cifs_file) } struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); #define CIFS_CACHE_READ_FLG 1 @@ -1693,6 +1694,7 @@ GLOBAL_EXTERN spinlock_t gidsidlock; #endif /* CONFIG_CIFS_ACL */ void cifs_oplock_break(struct work_struct *work); +void cifs_queue_oplock_break(struct cifsFileInfo *cfile); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index cd69c1e9750f..48ea9dfd5f02 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -358,12 +358,30 @@ cifsFileInfo_get(struct cifsFileInfo *cifs_file) return cifs_file; } -/* - * Release a reference on the file private data. This may involve closing - * the filehandle out on the server. Must be called without holding - * tcon->open_file_lock and cifs_file->file_info_lock. +/** + * cifsFileInfo_put - release a reference of file priv data + * + * Always potentially wait for oplock handler. See _cifsFileInfo_put(). */ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) +{ + _cifsFileInfo_put(cifs_file, true); +} + +/** + * _cifsFileInfo_put - release a reference of file priv data + * + * This may involve closing the filehandle @cifs_file out on the + * server. Must be called without holding tcon->open_file_lock and + * cifs_file->file_info_lock. + * + * If @wait_for_oplock_handler is true and we are releasing the last + * reference, wait for any running oplock break handler of the file + * and cancel any pending one. If calling this function from the + * oplock break handler, you need to pass false. + * + */ +void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_handler) { struct inode *inode = d_inode(cifs_file->dentry); struct cifs_tcon *tcon = tlink_tcon(cifs_file->tlink); @@ -411,7 +429,8 @@ void cifsFileInfo_put(struct cifsFileInfo *cifs_file) spin_unlock(&tcon->open_file_lock); - oplock_break_cancelled = cancel_work_sync(&cifs_file->oplock_break); + oplock_break_cancelled = wait_oplock_handler ? + cancel_work_sync(&cifs_file->oplock_break) : false; if (!tcon->need_reconnect && !cifs_file->invalidHandle) { struct TCP_Server_Info *server = tcon->ses->server; @@ -4136,6 +4155,7 @@ void cifs_oplock_break(struct work_struct *work) cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + _cifsFileInfo_put(cfile, false /* do not wait for ourself */); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index bcab30d4a6c7..76f1649ab444 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -486,8 +486,7 @@ is_valid_oplock_break(char *buffer, struct TCP_Server_Info *srv) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &pCifsInode->flags); - queue_work(cifsoplockd_wq, - &netfile->oplock_break); + cifs_queue_oplock_break(netfile); netfile->oplock_break_cancelled = false; spin_unlock(&tcon->open_file_lock); @@ -584,6 +583,28 @@ void cifs_put_writer(struct cifsInodeInfo *cinode) spin_unlock(&cinode->writers_lock); } +/** + * cifs_queue_oplock_break - queue the oplock break handler for cfile + * + * This function is called from the demultiplex thread when it + * receives an oplock break for @cfile. + * + * Assumes the tcon->open_file_lock is held. + * Assumes cfile->file_info_lock is NOT held. + */ +void cifs_queue_oplock_break(struct cifsFileInfo *cfile) +{ + /* + * Bump the handle refcount now while we hold the + * open_file_lock to enforce the validity of it for the oplock + * break handler. The matching put is done at the end of the + * handler. + */ + cifsFileInfo_get(cfile); + + queue_work(cifsoplockd_wq, &cfile->oplock_break); +} + void cifs_done_oplock_break(struct cifsInodeInfo *cinode) { clear_bit(CIFS_INODE_PENDING_OPLOCK_BREAK, &cinode->flags); diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index a97a0e0b1a74..31f01f09d25a 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -517,7 +517,7 @@ smb2_tcon_has_lease(struct cifs_tcon *tcon, struct smb2_lease_break *rsp, clear_bit(CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); - queue_work(cifsoplockd_wq, &cfile->oplock_break); + cifs_queue_oplock_break(cfile); kfree(lw); return true; } @@ -661,8 +661,8 @@ smb2_is_valid_oplock_break(char *buffer, struct TCP_Server_Info *server) CIFS_INODE_DOWNGRADE_OPLOCK_TO_L2, &cinode->flags); spin_unlock(&cfile->file_info_lock); - queue_work(cifsoplockd_wq, - &cfile->oplock_break); + + cifs_queue_oplock_break(cfile); spin_unlock(&tcon->open_file_lock); spin_unlock(&cifs_tcp_ses_lock); From bc1919400960a50e50ff55b0fde97342e4be621b Mon Sep 17 00:00:00 2001 From: Sean Christopherson Date: Tue, 2 Apr 2019 08:10:47 -0700 Subject: [PATCH 14/73] KVM: x86: Don't clear EFER during SMM transitions for 32-bit vCPU commit 8f4dc2e77cdfaf7e644ef29693fa229db29ee1de upstream. Neither AMD nor Intel CPUs have an EFER field in the legacy SMRAM save state area, i.e. don't save/restore EFER across SMM transitions. KVM somewhat models this, e.g. doesn't clear EFER on entry to SMM if the guest doesn't support long mode. But during RSM, KVM unconditionally clears EFER so that it can get back to pure 32-bit mode in order to start loading CRs with their actual non-SMM values. Clear EFER only when it will be written when loading the non-SMM state so as to preserve bits that can theoretically be set on 32-bit vCPUs, e.g. KVM always emulates EFER_SCE. And because CR4.PAE is cleared only to play nice with EFER, wrap that code in the long mode check as well. Note, this may result in a compiler warning about cr4 being consumed uninitialized. Re-read CR4 even though it's technically unnecessary, as doing so allows for more readable code and RSM emulation is not a performance critical path. Fixes: 660a5d517aaab ("KVM: x86: save/load state on SMM switch") Cc: stable@vger.kernel.org Signed-off-by: Sean Christopherson Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/emulate.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 5f758568fc44..2bcadfc5b2f0 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -2588,15 +2588,13 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) * CR0/CR3/CR4/EFER. It's all a bit more complicated if the vCPU * supports long mode. */ - cr4 = ctxt->ops->get_cr(ctxt, 4); if (emulator_has_longmode(ctxt)) { struct desc_struct cs_desc; /* Zero CR4.PCIDE before CR0.PG. */ - if (cr4 & X86_CR4_PCIDE) { + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PCIDE) ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PCIDE); - cr4 &= ~X86_CR4_PCIDE; - } /* A 32-bit code segment is required to clear EFER.LMA. */ memset(&cs_desc, 0, sizeof(cs_desc)); @@ -2610,13 +2608,16 @@ static int em_rsm(struct x86_emulate_ctxt *ctxt) if (cr0 & X86_CR0_PE) ctxt->ops->set_cr(ctxt, 0, cr0 & ~(X86_CR0_PG | X86_CR0_PE)); - /* Now clear CR4.PAE (which must be done before clearing EFER.LME). */ - if (cr4 & X86_CR4_PAE) - ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); + if (emulator_has_longmode(ctxt)) { + /* Clear CR4.PAE before clearing EFER.LME. */ + cr4 = ctxt->ops->get_cr(ctxt, 4); + if (cr4 & X86_CR4_PAE) + ctxt->ops->set_cr(ctxt, 4, cr4 & ~X86_CR4_PAE); - /* And finally go back to 32-bit mode. */ - efer = 0; - ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + /* And finally go back to 32-bit mode. */ + efer = 0; + ctxt->ops->set_msr(ctxt, MSR_EFER, efer); + } smbase = ctxt->ops->get_smbase(ctxt); if (emulator_has_longmode(ctxt)) From 8091c2ae4a1949f0f2cb8b48a6ccaf383661a76b Mon Sep 17 00:00:00 2001 From: Vitaly Kuznetsov Date: Wed, 3 Apr 2019 16:06:42 +0200 Subject: [PATCH 15/73] KVM: x86: svm: make sure NMI is injected after nmi_singlestep commit 99c221796a810055974b54c02e8f53297e48d146 upstream. I noticed that apic test from kvm-unit-tests always hangs on my EPYC 7401P, the hanging test nmi-after-sti is trying to deliver 30000 NMIs and tracing shows that we're sometimes able to deliver a few but never all. When we're trying to inject an NMI we may fail to do so immediately for various reasons, however, we still need to inject it so enable_nmi_window() arms nmi_singlestep mode. #DB occurs as expected, but we're not checking for pending NMIs before entering the guest and unless there's a different event to process, the NMI will never get delivered. Make KVM_REQ_EVENT request on the vCPU from db_interception() to make sure pending NMIs are checked and possibly injected. Signed-off-by: Vitaly Kuznetsov Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/svm.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index c387047e926a..3a57816ea498 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -2211,6 +2211,7 @@ static int pf_interception(struct vcpu_svm *svm) static int db_interception(struct vcpu_svm *svm) { struct kvm_run *kvm_run = svm->vcpu.run; + struct kvm_vcpu *vcpu = &svm->vcpu; if (!(svm->vcpu.guest_debug & (KVM_GUESTDBG_SINGLESTEP | KVM_GUESTDBG_USE_HW_BP)) && @@ -2221,6 +2222,8 @@ static int db_interception(struct vcpu_svm *svm) if (svm->nmi_singlestep) { disable_nmi_singlestep(svm); + /* Make sure we check for pending NMIs upon entry */ + kvm_make_request(KVM_REQ_EVENT, vcpu); } if (svm->vcpu.guest_debug & From b69e4d5f1ab43ea3c5e41ac841606eaffdc61402 Mon Sep 17 00:00:00 2001 From: Leonard Pollak Date: Wed, 13 Feb 2019 11:19:52 +0100 Subject: [PATCH 16/73] Staging: iio: meter: fixed typo commit 0a8a29be499cbb67df79370aaf5109085509feb8 upstream. This patch fixes an obvious typo, which will cause erroneously returning the Peak Voltage instead of the Peak Current. Signed-off-by: Leonard Pollak Cc: Acked-by: Michael Hennerich Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/meter/ade7854.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/iio/meter/ade7854.c b/drivers/staging/iio/meter/ade7854.c index 70612da64a8b..7ae774ef9da3 100644 --- a/drivers/staging/iio/meter/ade7854.c +++ b/drivers/staging/iio/meter/ade7854.c @@ -269,7 +269,7 @@ static IIO_DEV_ATTR_VPEAK(0644, static IIO_DEV_ATTR_IPEAK(0644, ade7854_read_32bit, ade7854_write_32bit, - ADE7854_VPEAK); + ADE7854_IPEAK); static IIO_DEV_ATTR_APHCAL(0644, ade7854_read_16bit, ade7854_write_16bit, From 63aafa6501a5640032107fe9647818a930b18222 Mon Sep 17 00:00:00 2001 From: Mircea Caprioru Date: Wed, 20 Feb 2019 13:08:20 +0200 Subject: [PATCH 17/73] staging: iio: ad7192: Fix ad7193 channel address commit 7ce0f216221856a17fc4934b39284678a5fef2e9 upstream. This patch fixes the differential channels addresses for the ad7193. Signed-off-by: Mircea Caprioru Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/adc/ad7192.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c index 31a195d1bf05..f58c80327ba5 100644 --- a/drivers/staging/iio/adc/ad7192.c +++ b/drivers/staging/iio/adc/ad7192.c @@ -109,10 +109,10 @@ #define AD7192_CH_AIN3 BIT(6) /* AIN3 - AINCOM */ #define AD7192_CH_AIN4 BIT(7) /* AIN4 - AINCOM */ -#define AD7193_CH_AIN1P_AIN2M 0x000 /* AIN1(+) - AIN2(-) */ -#define AD7193_CH_AIN3P_AIN4M 0x001 /* AIN3(+) - AIN4(-) */ -#define AD7193_CH_AIN5P_AIN6M 0x002 /* AIN5(+) - AIN6(-) */ -#define AD7193_CH_AIN7P_AIN8M 0x004 /* AIN7(+) - AIN8(-) */ +#define AD7193_CH_AIN1P_AIN2M 0x001 /* AIN1(+) - AIN2(-) */ +#define AD7193_CH_AIN3P_AIN4M 0x002 /* AIN3(+) - AIN4(-) */ +#define AD7193_CH_AIN5P_AIN6M 0x004 /* AIN5(+) - AIN6(-) */ +#define AD7193_CH_AIN7P_AIN8M 0x008 /* AIN7(+) - AIN8(-) */ #define AD7193_CH_TEMP 0x100 /* Temp senseor */ #define AD7193_CH_AIN2P_AIN2M 0x200 /* AIN2(+) - AIN2(-) */ #define AD7193_CH_AIN1 0x401 /* AIN1 - AINCOM */ From b7ad00c028ab8ab238cc72906b8fdb6ed3c874ea Mon Sep 17 00:00:00 2001 From: Sergey Larin Date: Sat, 2 Mar 2019 19:54:55 +0300 Subject: [PATCH 18/73] iio: gyro: mpu3050: fix chip ID reading commit 409a51e0a4a5f908763191fae2c29008632eb712 upstream. According to the datasheet, the last bit of CHIP_ID register controls I2C bus, and the first one is unused. Handle this correctly. Note that there are chips out there that have a value such that the id check currently fails. Signed-off-by: Sergey Larin Reviewed-by: Linus Walleij Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/gyro/mpu3050-core.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/drivers/iio/gyro/mpu3050-core.c b/drivers/iio/gyro/mpu3050-core.c index e0d241a9aa30..a7be4670bf8f 100644 --- a/drivers/iio/gyro/mpu3050-core.c +++ b/drivers/iio/gyro/mpu3050-core.c @@ -29,7 +29,8 @@ #include "mpu3050.h" -#define MPU3050_CHIP_ID 0x69 +#define MPU3050_CHIP_ID 0x68 +#define MPU3050_CHIP_ID_MASK 0x7E /* * Register map: anything suffixed *_H is a big-endian high byte and always @@ -1178,8 +1179,9 @@ int mpu3050_common_probe(struct device *dev, goto err_power_down; } - if (val != MPU3050_CHIP_ID) { - dev_err(dev, "unsupported chip id %02x\n", (u8)val); + if ((val & MPU3050_CHIP_ID_MASK) != MPU3050_CHIP_ID) { + dev_err(dev, "unsupported chip id %02x\n", + (u8)(val & MPU3050_CHIP_ID_MASK)); ret = -ENODEV; goto err_power_down; } From f245ce44d58e88c74d37d73b23ac9617789038d2 Mon Sep 17 00:00:00 2001 From: Mike Looijmans Date: Wed, 13 Feb 2019 08:41:47 +0100 Subject: [PATCH 19/73] iio/gyro/bmg160: Use millidegrees for temperature scale commit 40a7198a4a01037003c7ca714f0d048a61e729ac upstream. Standard unit for temperature is millidegrees Celcius, whereas this driver was reporting in degrees. Fix the scale factor in the driver. Signed-off-by: Mike Looijmans Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/gyro/bmg160_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/iio/gyro/bmg160_core.c b/drivers/iio/gyro/bmg160_core.c index 821919dd245b..b5a5517e3ce1 100644 --- a/drivers/iio/gyro/bmg160_core.c +++ b/drivers/iio/gyro/bmg160_core.c @@ -583,11 +583,10 @@ static int bmg160_read_raw(struct iio_dev *indio_dev, case IIO_CHAN_INFO_LOW_PASS_FILTER_3DB_FREQUENCY: return bmg160_get_filter(data, val); case IIO_CHAN_INFO_SCALE: - *val = 0; switch (chan->type) { case IIO_TEMP: - *val2 = 500000; - return IIO_VAL_INT_PLUS_MICRO; + *val = 500; + return IIO_VAL_INT; case IIO_ANGL_VEL: { int i; @@ -595,6 +594,7 @@ static int bmg160_read_raw(struct iio_dev *indio_dev, for (i = 0; i < ARRAY_SIZE(bmg160_scale_table); ++i) { if (bmg160_scale_table[i].dps_range == data->dps_range) { + *val = 0; *val2 = bmg160_scale_table[i].scale; return IIO_VAL_INT_PLUS_MICRO; } From 0336305753a66205f0b74ab9c7236cd436238870 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Wed, 13 Mar 2019 12:40:02 +0100 Subject: [PATCH 20/73] iio: cros_ec: Fix the maths for gyro scale calculation commit 3d02d7082e5823598090530c3988a35f69689943 upstream. Calculation did not use IIO_DEGREE_TO_RAD and implemented a variant to avoid precision loss as we aim a nano value. The offset added to avoid rounding error, though, doesn't give us a close result to the expected value. E.g. For 1000dps, the result should be: (1000 * pi ) / 180 >> 15 ~= 0.000532632218 But with current calculation we get $ cat scale 0.000547890 Fix the calculation by just doing the maths involved for a nano value val * pi * 10e12 / (180 * 2^15) so we get a closer result. $ cat scale 0.000532632 Fixes: c14dca07a31d ("iio: cros_ec_sensors: add ChromeOS EC Contiguous Sensors driver") Signed-off-by: Gwendal Grignou Signed-off-by: Enric Balletbo i Serra Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c index 38e8783e4b05..287fbe08264d 100644 --- a/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c +++ b/drivers/iio/common/cros_ec_sensors/cros_ec_sensors.c @@ -104,9 +104,10 @@ static int cros_ec_sensors_read(struct iio_dev *indio_dev, * Do not use IIO_DEGREE_TO_RAD to avoid precision * loss. Round to the nearest integer. */ - *val = div_s64(val64 * 314159 + 9000000ULL, 1000); - *val2 = 18000 << (CROS_EC_SENSOR_BITS - 1); - ret = IIO_VAL_FRACTIONAL; + *val = 0; + *val2 = div_s64(val64 * 3141592653ULL, + 180 << (CROS_EC_SENSOR_BITS - 1)); + ret = IIO_VAL_INT_PLUS_NANO; break; case MOTIONSENSE_TYPE_MAG: /* From ad0f65cd55b88f2e4d49968f9d5aba26fe3e8c1c Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 19 Mar 2019 12:47:00 +0200 Subject: [PATCH 21/73] iio: ad_sigma_delta: select channel when reading register commit fccfb9ce70ed4ea7a145f77b86de62e38178517f upstream. The desired channel has to be selected in order to correctly fill the buffer with the corresponding data. The `ad_sd_write_reg()` already does this, but for the `ad_sd_read_reg_raw()` this was omitted. Fixes: af3008485ea03 ("iio:adc: Add common code for ADI Sigma Delta devices") Signed-off-by: Dragos Bogdan Signed-off-by: Alexandru Ardelean Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ad_sigma_delta.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index 22c4c17cd996..a1d072ecb717 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -121,6 +121,7 @@ static int ad_sd_read_reg_raw(struct ad_sigma_delta *sigma_delta, if (sigma_delta->info->has_registers) { data[0] = reg << sigma_delta->info->addr_shift; data[0] |= sigma_delta->info->read_mask; + data[0] |= sigma_delta->comm; spi_message_add_tail(&t[0], &m); } spi_message_add_tail(&t[1], &m); From 8ba5d597593acc318e426d7931d32719039b5138 Mon Sep 17 00:00:00 2001 From: Jean-Francois Dagenais Date: Wed, 6 Mar 2019 15:56:06 -0500 Subject: [PATCH 22/73] iio: dac: mcp4725: add missing powerdown bits in store eeprom commit 06003531502d06bc89d32528f6ec96bf978790f9 upstream. When issuing the write DAC register and write eeprom command, the two powerdown bits (PD0 and PD1) are assumed by the chip to be present in the bytes sent. Leaving them at 0 implies "powerdown disabled" which is a different state that the current one. By adding the current state of the powerdown in the i2c write, the chip will correctly power-on exactly like as it is at the moment of store_eeprom call. This is documented in MCP4725's datasheet, FIGURE 6-2: "Write Commands for DAC Input Register and EEPROM" and MCP4726's datasheet, FIGURE 6-3: "Write All Memory Command". Signed-off-by: Jean-Francois Dagenais Acked-by: Peter Meerwald-Stadler Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/dac/mcp4725.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/dac/mcp4725.c b/drivers/iio/dac/mcp4725.c index 6ab1f23e5a79..fe3e42defb33 100644 --- a/drivers/iio/dac/mcp4725.c +++ b/drivers/iio/dac/mcp4725.c @@ -98,6 +98,7 @@ static ssize_t mcp4725_store_eeprom(struct device *dev, inoutbuf[0] = 0x60; /* write EEPROM */ inoutbuf[0] |= data->ref_mode << 3; + inoutbuf[0] |= data->powerdown ? ((data->powerdown_mode + 1) << 1) : 0; inoutbuf[1] = data->dac_value >> 4; inoutbuf[2] = (data->dac_value & 0xf) << 4; From 1ae7297b737d39fe42cec4de3151e54aefec3666 Mon Sep 17 00:00:00 2001 From: Lars-Peter Clausen Date: Wed, 20 Feb 2019 17:11:32 +0200 Subject: [PATCH 23/73] iio: Fix scan mask selection commit 20ea39ef9f2f911bd01c69519e7d69cfec79fde3 upstream. The trialmask is expected to have all bits set to 0 after allocation. Currently kmalloc_array() is used which does not zero the memory and so random bits are set. This results in random channels being enabled when they shouldn't. Replace kmalloc_array() with kcalloc() which has the same interface but zeros the memory. Note the fix is actually required earlier than the below fixes tag, but will require a manual backport due to move from kmalloc to kmalloc_array. Signed-off-by: Lars-Peter Clausen Signed-off-by: Alexandru Ardelean Fixes commit 057ac1acdfc4 ("iio: Use kmalloc_array() in iio_scan_mask_set()"). Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/industrialio-buffer.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/iio/industrialio-buffer.c b/drivers/iio/industrialio-buffer.c index 78482d456c3b..d50125766093 100644 --- a/drivers/iio/industrialio-buffer.c +++ b/drivers/iio/industrialio-buffer.c @@ -320,9 +320,8 @@ static int iio_scan_mask_set(struct iio_dev *indio_dev, const unsigned long *mask; unsigned long *trialmask; - trialmask = kmalloc_array(BITS_TO_LONGS(indio_dev->masklength), - sizeof(*trialmask), - GFP_KERNEL); + trialmask = kcalloc(BITS_TO_LONGS(indio_dev->masklength), + sizeof(*trialmask), GFP_KERNEL); if (trialmask == NULL) return -ENOMEM; if (!indio_dev->masklength) { From fe400daf26deaf94cb182fec48b80079d305097e Mon Sep 17 00:00:00 2001 From: Georg Ottinger Date: Wed, 30 Jan 2019 14:42:02 +0100 Subject: [PATCH 24/73] iio: adc: at91: disable adc channel interrupt in timeout case commit 09c6bdee51183a575bf7546890c8c137a75a2b44 upstream. Having a brief look at at91_adc_read_raw() it is obvious that in the case of a timeout the setting of AT91_ADC_CHDR and AT91_ADC_IDR registers is omitted. If 2 different channels are queried we can end up with a situation where two interrupts are enabled, but only one interrupt is cleared in the interrupt handler. Resulting in a interrupt loop and a system hang. Signed-off-by: Georg Ottinger Acked-by: Ludovic Desroches Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/at91_adc.c | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/drivers/iio/adc/at91_adc.c b/drivers/iio/adc/at91_adc.c index cd686179aa92..492f6c8ba735 100644 --- a/drivers/iio/adc/at91_adc.c +++ b/drivers/iio/adc/at91_adc.c @@ -705,23 +705,29 @@ static int at91_adc_read_raw(struct iio_dev *idev, ret = wait_event_interruptible_timeout(st->wq_data_avail, st->done, msecs_to_jiffies(1000)); - if (ret == 0) - ret = -ETIMEDOUT; - if (ret < 0) { - mutex_unlock(&st->lock); - return ret; - } - - *val = st->last_value; + /* Disable interrupts, regardless if adc conversion was + * successful or not + */ at91_adc_writel(st, AT91_ADC_CHDR, AT91_ADC_CH(chan->channel)); at91_adc_writel(st, AT91_ADC_IDR, BIT(chan->channel)); - st->last_value = 0; - st->done = false; + if (ret > 0) { + /* a valid conversion took place */ + *val = st->last_value; + st->last_value = 0; + st->done = false; + ret = IIO_VAL_INT; + } else if (ret == 0) { + /* conversion timeout */ + dev_err(&idev->dev, "ADC Channel %d timeout.\n", + chan->channel); + ret = -ETIMEDOUT; + } + mutex_unlock(&st->lock); - return IIO_VAL_INT; + return ret; case IIO_CHAN_INFO_SCALE: *val = st->vref_mv; From 0386fd65c2db218972a1965471d9c0428a4f5aba Mon Sep 17 00:00:00 2001 From: Fabrice Gasnier Date: Mon, 25 Mar 2019 14:01:23 +0100 Subject: [PATCH 25/73] iio: core: fix a possible circular locking dependency commit 7f75591fc5a123929a29636834d1bcb8b5c9fee3 upstream. This fixes a possible circular locking dependency detected warning seen with: - CONFIG_PROVE_LOCKING=y - consumer/provider IIO devices (ex: "voltage-divider" consumer of "adc") When using the IIO consumer interface, e.g. iio_channel_get(), the consumer device will likely call iio_read_channel_raw() or similar that rely on 'info_exist_lock' mutex. typically: ... mutex_lock(&chan->indio_dev->info_exist_lock); if (chan->indio_dev->info == NULL) { ret = -ENODEV; goto err_unlock; } ret = do_some_ops() err_unlock: mutex_unlock(&chan->indio_dev->info_exist_lock); return ret; ... Same mutex is also hold in iio_device_unregister(). The following deadlock warning happens when: - the consumer device has called an API like iio_read_channel_raw() at least once. - the consumer driver is unregistered, removed (unbind from sysfs) ====================================================== WARNING: possible circular locking dependency detected 4.19.24 #577 Not tainted ------------------------------------------------------ sh/372 is trying to acquire lock: (kn->count#30){++++}, at: kernfs_remove_by_name_ns+0x3c/0x84 but task is already holding lock: (&dev->info_exist_lock){+.+.}, at: iio_device_unregister+0x18/0x60 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #1 (&dev->info_exist_lock){+.+.}: __mutex_lock+0x70/0xa3c mutex_lock_nested+0x1c/0x24 iio_read_channel_raw+0x1c/0x60 iio_read_channel_info+0xa8/0xb0 dev_attr_show+0x1c/0x48 sysfs_kf_seq_show+0x84/0xec seq_read+0x154/0x528 __vfs_read+0x2c/0x15c vfs_read+0x8c/0x110 ksys_read+0x4c/0xac ret_fast_syscall+0x0/0x28 0xbedefb60 -> #0 (kn->count#30){++++}: lock_acquire+0xd8/0x268 __kernfs_remove+0x288/0x374 kernfs_remove_by_name_ns+0x3c/0x84 remove_files+0x34/0x78 sysfs_remove_group+0x40/0x9c sysfs_remove_groups+0x24/0x34 device_remove_attrs+0x38/0x64 device_del+0x11c/0x360 cdev_device_del+0x14/0x2c iio_device_unregister+0x24/0x60 release_nodes+0x1bc/0x200 device_release_driver_internal+0x1a0/0x230 unbind_store+0x80/0x130 kernfs_fop_write+0x100/0x1e4 __vfs_write+0x2c/0x160 vfs_write+0xa4/0x17c ksys_write+0x4c/0xac ret_fast_syscall+0x0/0x28 0xbe906840 other info that might help us debug this: Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&dev->info_exist_lock); lock(kn->count#30); lock(&dev->info_exist_lock); lock(kn->count#30); *** DEADLOCK *** ... cdev_device_del() can be called without holding the lock. It should be safe as info_exist_lock prevents kernelspace consumers to use the exported routines during/after provider removal. cdev_device_del() is for userspace. Help to reproduce: See example: Documentation/devicetree/bindings/iio/afe/voltage-divider.txt sysv { compatible = "voltage-divider"; io-channels = <&adc 0>; output-ohms = <22>; full-ohms = <222>; }; First, go to iio:deviceX for the "voltage-divider", do one read: $ cd /sys/bus/iio/devices/iio:deviceX $ cat in_voltage0_raw Then, unbind the consumer driver. It triggers above deadlock warning. $ cd /sys/bus/platform/drivers/iio-rescale/ $ echo sysv > unbind Note I don't actually expect stable will pick this up all the way back into IIO being in staging, but if's probably valid that far back. Signed-off-by: Fabrice Gasnier Fixes: ac917a81117c ("staging:iio:core set the iio_dev.info pointer to null on unregister") Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/industrialio-core.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index e565fd4fc414..97b7266ee0ff 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -1741,10 +1741,10 @@ EXPORT_SYMBOL(iio_device_register); **/ void iio_device_unregister(struct iio_dev *indio_dev) { - mutex_lock(&indio_dev->info_exist_lock); - cdev_device_del(&indio_dev->chrdev, &indio_dev->dev); + mutex_lock(&indio_dev->info_exist_lock); + iio_device_unregister_debugfs(indio_dev); iio_disable_all_buffers(indio_dev); From de970d4efc3bfb944d4c835bed5b73b8f96a10c5 Mon Sep 17 00:00:00 2001 From: "he, bo" Date: Wed, 6 Mar 2019 10:32:20 +0800 Subject: [PATCH 26/73] io: accel: kxcjk1013: restore the range after resume. commit fe2d3df639a7940a125a33d6460529b9689c5406 upstream. On some laptops, kxcjk1013 is powered off when system enters S3. We need restore the range regiter during resume. Otherwise, the sensor doesn't work properly after S3. Signed-off-by: he, bo Signed-off-by: Chen, Hu Reviewed-by: Hans de Goede Cc: Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/accel/kxcjk-1013.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/iio/accel/kxcjk-1013.c b/drivers/iio/accel/kxcjk-1013.c index 784636800361..780f886ccbfe 100644 --- a/drivers/iio/accel/kxcjk-1013.c +++ b/drivers/iio/accel/kxcjk-1013.c @@ -1340,6 +1340,8 @@ static int kxcjk1013_resume(struct device *dev) mutex_lock(&data->mutex); ret = kxcjk1013_set_mode(data, OPERATION); + if (ret == 0) + ret = kxcjk1013_set_range(data, data->range); mutex_unlock(&data->mutex); return ret; From 586f669a2f06818d0c509b17dbd5ab8e216c3ba0 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 15 Apr 2019 12:10:14 +0100 Subject: [PATCH 27/73] staging: comedi: vmk80xx: Fix use of uninitialized semaphore commit 08b7c2f9208f0e2a32159e4e7a4831b7adb10a3e upstream. If `vmk80xx_auto_attach()` returns an error, the core comedi module code will call `vmk80xx_detach()` to clean up. If `vmk80xx_auto_attach()` successfully allocated the comedi device private data, `vmk80xx_detach()` assumes that a `struct semaphore limit_sem` contained in the private data has been initialized and uses it. Unfortunately, there are a couple of places where `vmk80xx_auto_attach()` can return an error after allocating the device private data but before initializing the semaphore, so this assumption is invalid. Fix it by initializing the semaphore just after allocating the private data in `vmk80xx_auto_attach()` before any other errors can be returned. I believe this was the cause of the following syzbot crash report : usb 1-1: config 0 has no interface number 0 usb 1-1: New USB device found, idVendor=10cf, idProduct=8068, bcdDevice=e6.8d usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 usb 1-1: config 0 descriptor?? vmk80xx 1-1:0.117: driver 'vmk80xx' failed to auto-configure device. INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 12 Comm: kworker/0:1 Not tainted 5.1.0-rc4-319354-g9a33b36 #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe8/0x16e lib/dump_stack.c:113 assign_lock_key kernel/locking/lockdep.c:786 [inline] register_lock_class+0x11b8/0x1250 kernel/locking/lockdep.c:1095 __lock_acquire+0xfb/0x37c0 kernel/locking/lockdep.c:3582 lock_acquire+0x10d/0x2f0 kernel/locking/lockdep.c:4211 __raw_spin_lock_irqsave include/linux/spinlock_api_smp.h:110 [inline] _raw_spin_lock_irqsave+0x44/0x60 kernel/locking/spinlock.c:152 down+0x12/0x80 kernel/locking/semaphore.c:58 vmk80xx_detach+0x59/0x100 drivers/staging/comedi/drivers/vmk80xx.c:829 comedi_device_detach+0xed/0x800 drivers/staging/comedi/drivers.c:204 comedi_device_cleanup.part.0+0x68/0x140 drivers/staging/comedi/comedi_fops.c:156 comedi_device_cleanup drivers/staging/comedi/comedi_fops.c:187 [inline] comedi_free_board_dev.part.0+0x16/0x90 drivers/staging/comedi/comedi_fops.c:190 comedi_free_board_dev drivers/staging/comedi/comedi_fops.c:189 [inline] comedi_release_hardware_device+0x111/0x140 drivers/staging/comedi/comedi_fops.c:2880 comedi_auto_config.cold+0x124/0x1b0 drivers/staging/comedi/drivers.c:1068 usb_probe_interface+0x31d/0x820 drivers/usb/core/driver.c:361 really_probe+0x2da/0xb10 drivers/base/dd.c:509 driver_probe_device+0x21d/0x350 drivers/base/dd.c:671 __device_attach_driver+0x1d8/0x290 drivers/base/dd.c:778 bus_for_each_drv+0x163/0x1e0 drivers/base/bus.c:454 __device_attach+0x223/0x3a0 drivers/base/dd.c:844 bus_probe_device+0x1f1/0x2a0 drivers/base/bus.c:514 device_add+0xad2/0x16e0 drivers/base/core.c:2106 usb_set_configuration+0xdf7/0x1740 drivers/usb/core/message.c:2021 generic_probe+0xa2/0xda drivers/usb/core/generic.c:210 usb_probe_device+0xc0/0x150 drivers/usb/core/driver.c:266 really_probe+0x2da/0xb10 drivers/base/dd.c:509 driver_probe_device+0x21d/0x350 drivers/base/dd.c:671 __device_attach_driver+0x1d8/0x290 drivers/base/dd.c:778 bus_for_each_drv+0x163/0x1e0 drivers/base/bus.c:454 __device_attach+0x223/0x3a0 drivers/base/dd.c:844 bus_probe_device+0x1f1/0x2a0 drivers/base/bus.c:514 device_add+0xad2/0x16e0 drivers/base/core.c:2106 usb_new_device.cold+0x537/0xccf drivers/usb/core/hub.c:2534 hub_port_connect drivers/usb/core/hub.c:5089 [inline] hub_port_connect_change drivers/usb/core/hub.c:5204 [inline] port_event drivers/usb/core/hub.c:5350 [inline] hub_event+0x138e/0x3b00 drivers/usb/core/hub.c:5432 process_one_work+0x90f/0x1580 kernel/workqueue.c:2269 worker_thread+0x9b/0xe20 kernel/workqueue.c:2415 kthread+0x313/0x420 kernel/kthread.c:253 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 Reported-by: syzbot+54c2f58f15fe6876b6ad@syzkaller.appspotmail.com Signed-off-by: Ian Abbott Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/vmk80xx.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c index a004aed0147a..ff6f69ffcd62 100644 --- a/drivers/staging/comedi/drivers/vmk80xx.c +++ b/drivers/staging/comedi/drivers/vmk80xx.c @@ -809,6 +809,8 @@ static int vmk80xx_auto_attach(struct comedi_device *dev, devpriv->model = board->model; + sema_init(&devpriv->limit_sem, 8); + ret = vmk80xx_find_usb_endpoints(dev); if (ret) return ret; @@ -817,8 +819,6 @@ static int vmk80xx_auto_attach(struct comedi_device *dev, if (ret) return ret; - sema_init(&devpriv->limit_sem, 8); - usb_set_intfdata(intf, devpriv); if (devpriv->model == VMK8055_MODEL) From 9de6de262c33c8700a2af6209ba732595eb0f94e Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 15 Apr 2019 12:52:30 +0100 Subject: [PATCH 28/73] staging: comedi: vmk80xx: Fix possible double-free of ->usb_rx_buf commit 663d294b4768bfd89e529e069bffa544a830b5bf upstream. `vmk80xx_alloc_usb_buffers()` is called from `vmk80xx_auto_attach()` to allocate RX and TX buffers for USB transfers. It allocates `devpriv->usb_rx_buf` followed by `devpriv->usb_tx_buf`. If the allocation of `devpriv->usb_tx_buf` fails, it frees `devpriv->usb_rx_buf`, leaving the pointer set dangling, and returns an error. Later, `vmk80xx_detach()` will be called from the core comedi module code to clean up. `vmk80xx_detach()` also frees both `devpriv->usb_rx_buf` and `devpriv->usb_tx_buf`, but `devpriv->usb_rx_buf` may have already been freed, leading to a double-free error. Fix it by removing the call to `kfree(devpriv->usb_rx_buf)` from `vmk80xx_alloc_usb_buffers()`, relying on `vmk80xx_detach()` to free the memory. Signed-off-by: Ian Abbott Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/vmk80xx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/staging/comedi/drivers/vmk80xx.c b/drivers/staging/comedi/drivers/vmk80xx.c index ff6f69ffcd62..1800eb3ae017 100644 --- a/drivers/staging/comedi/drivers/vmk80xx.c +++ b/drivers/staging/comedi/drivers/vmk80xx.c @@ -691,10 +691,8 @@ static int vmk80xx_alloc_usb_buffers(struct comedi_device *dev) size = usb_endpoint_maxp(devpriv->ep_tx); devpriv->usb_tx_buf = kzalloc(size, GFP_KERNEL); - if (!devpriv->usb_tx_buf) { - kfree(devpriv->usb_rx_buf); + if (!devpriv->usb_tx_buf) return -ENOMEM; - } return 0; } From 72cd1a3275cfc4a57d4e0799237666def6fc2510 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 15 Apr 2019 12:43:01 +0100 Subject: [PATCH 29/73] staging: comedi: ni_usb6501: Fix use of uninitialized mutex commit 660cf4ce9d0f3497cc7456eaa6d74c8b71d6282c upstream. If `ni6501_auto_attach()` returns an error, the core comedi module code will call `ni6501_detach()` to clean up. If `ni6501_auto_attach()` successfully allocated the comedi device private data, `ni6501_detach()` assumes that a `struct mutex mut` contained in the private data has been initialized and uses it. Unfortunately, there are a couple of places where `ni6501_auto_attach()` can return an error after allocating the device private data but before initializing the mutex, so this assumption is invalid. Fix it by initializing the mutex just after allocating the private data in `ni6501_auto_attach()` before any other errors can be retturned. Also move the call to `usb_set_intfdata()` just to keep the code a bit neater (either position for the call is fine). I believe this was the cause of the following syzbot crash report : usb 1-1: New USB device strings: Mfr=0, Product=0, SerialNumber=0 usb 1-1: config 0 descriptor?? usb 1-1: string descriptor 0 read error: -71 comedi comedi0: Wrong number of endpoints ni6501 1-1:0.233: driver 'ni6501' failed to auto-configure device. INFO: trying to register non-static key. the code is fine but needs lockdep annotation. turning off the locking correctness validator. CPU: 0 PID: 585 Comm: kworker/0:3 Not tainted 5.1.0-rc4-319354-g9a33b36 #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0xe8/0x16e lib/dump_stack.c:113 assign_lock_key kernel/locking/lockdep.c:786 [inline] register_lock_class+0x11b8/0x1250 kernel/locking/lockdep.c:1095 __lock_acquire+0xfb/0x37c0 kernel/locking/lockdep.c:3582 lock_acquire+0x10d/0x2f0 kernel/locking/lockdep.c:4211 __mutex_lock_common kernel/locking/mutex.c:925 [inline] __mutex_lock+0xfe/0x12b0 kernel/locking/mutex.c:1072 ni6501_detach+0x5b/0x110 drivers/staging/comedi/drivers/ni_usb6501.c:567 comedi_device_detach+0xed/0x800 drivers/staging/comedi/drivers.c:204 comedi_device_cleanup.part.0+0x68/0x140 drivers/staging/comedi/comedi_fops.c:156 comedi_device_cleanup drivers/staging/comedi/comedi_fops.c:187 [inline] comedi_free_board_dev.part.0+0x16/0x90 drivers/staging/comedi/comedi_fops.c:190 comedi_free_board_dev drivers/staging/comedi/comedi_fops.c:189 [inline] comedi_release_hardware_device+0x111/0x140 drivers/staging/comedi/comedi_fops.c:2880 comedi_auto_config.cold+0x124/0x1b0 drivers/staging/comedi/drivers.c:1068 usb_probe_interface+0x31d/0x820 drivers/usb/core/driver.c:361 really_probe+0x2da/0xb10 drivers/base/dd.c:509 driver_probe_device+0x21d/0x350 drivers/base/dd.c:671 __device_attach_driver+0x1d8/0x290 drivers/base/dd.c:778 bus_for_each_drv+0x163/0x1e0 drivers/base/bus.c:454 __device_attach+0x223/0x3a0 drivers/base/dd.c:844 bus_probe_device+0x1f1/0x2a0 drivers/base/bus.c:514 device_add+0xad2/0x16e0 drivers/base/core.c:2106 usb_set_configuration+0xdf7/0x1740 drivers/usb/core/message.c:2021 generic_probe+0xa2/0xda drivers/usb/core/generic.c:210 usb_probe_device+0xc0/0x150 drivers/usb/core/driver.c:266 really_probe+0x2da/0xb10 drivers/base/dd.c:509 driver_probe_device+0x21d/0x350 drivers/base/dd.c:671 __device_attach_driver+0x1d8/0x290 drivers/base/dd.c:778 bus_for_each_drv+0x163/0x1e0 drivers/base/bus.c:454 __device_attach+0x223/0x3a0 drivers/base/dd.c:844 bus_probe_device+0x1f1/0x2a0 drivers/base/bus.c:514 device_add+0xad2/0x16e0 drivers/base/core.c:2106 usb_new_device.cold+0x537/0xccf drivers/usb/core/hub.c:2534 hub_port_connect drivers/usb/core/hub.c:5089 [inline] hub_port_connect_change drivers/usb/core/hub.c:5204 [inline] port_event drivers/usb/core/hub.c:5350 [inline] hub_event+0x138e/0x3b00 drivers/usb/core/hub.c:5432 process_one_work+0x90f/0x1580 kernel/workqueue.c:2269 worker_thread+0x9b/0xe20 kernel/workqueue.c:2415 kthread+0x313/0x420 kernel/kthread.c:253 ret_from_fork+0x3a/0x50 arch/x86/entry/entry_64.S:352 Reported-by: syzbot+cf4f2b6c24aff0a3edf6@syzkaller.appspotmail.com Signed-off-by: Ian Abbott Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_usb6501.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/staging/comedi/drivers/ni_usb6501.c b/drivers/staging/comedi/drivers/ni_usb6501.c index 9a0a96329a55..9b33e53a3329 100644 --- a/drivers/staging/comedi/drivers/ni_usb6501.c +++ b/drivers/staging/comedi/drivers/ni_usb6501.c @@ -527,6 +527,9 @@ static int ni6501_auto_attach(struct comedi_device *dev, if (!devpriv) return -ENOMEM; + mutex_init(&devpriv->mut); + usb_set_intfdata(intf, devpriv); + ret = ni6501_find_endpoints(dev); if (ret) return ret; @@ -535,9 +538,6 @@ static int ni6501_auto_attach(struct comedi_device *dev, if (ret) return ret; - mutex_init(&devpriv->mut); - usb_set_intfdata(intf, devpriv); - ret = comedi_alloc_subdevices(dev, 2); if (ret) return ret; From 9ae4c50f5e5284808370b47e22a2d0f7dcb38e34 Mon Sep 17 00:00:00 2001 From: Ian Abbott Date: Mon, 15 Apr 2019 12:43:02 +0100 Subject: [PATCH 30/73] staging: comedi: ni_usb6501: Fix possible double-free of ->usb_rx_buf commit af4b54a2e5ba18259ff9aac445bf546dd60d037e upstream. `ni6501_alloc_usb_buffers()` is called from `ni6501_auto_attach()` to allocate RX and TX buffers for USB transfers. It allocates `devpriv->usb_rx_buf` followed by `devpriv->usb_tx_buf`. If the allocation of `devpriv->usb_tx_buf` fails, it frees `devpriv->usb_rx_buf`, leaving the pointer set dangling, and returns an error. Later, `ni6501_detach()` will be called from the core comedi module code to clean up. `ni6501_detach()` also frees both `devpriv->usb_rx_buf` and `devpriv->usb_tx_buf`, but `devpriv->usb_rx_buf` may have already beed freed, leading to a double-free error. Fix it bu removing the call to `kfree(devpriv->usb_rx_buf)` from `ni6501_alloc_usb_buffers()`, relying on `ni6501_detach()` to free the memory. Signed-off-by: Ian Abbott Cc: stable Signed-off-by: Greg Kroah-Hartman --- drivers/staging/comedi/drivers/ni_usb6501.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/staging/comedi/drivers/ni_usb6501.c b/drivers/staging/comedi/drivers/ni_usb6501.c index 9b33e53a3329..009c5277387b 100644 --- a/drivers/staging/comedi/drivers/ni_usb6501.c +++ b/drivers/staging/comedi/drivers/ni_usb6501.c @@ -472,10 +472,8 @@ static int ni6501_alloc_usb_buffers(struct comedi_device *dev) size = usb_endpoint_maxp(devpriv->ep_tx); devpriv->usb_tx_buf = kzalloc(size, GFP_KERNEL); - if (!devpriv->usb_tx_buf) { - kfree(devpriv->usb_rx_buf); + if (!devpriv->usb_tx_buf) return -ENOMEM; - } return 0; } From cfb5fb042ee1c8dbfbce53016f331c97c4f29411 Mon Sep 17 00:00:00 2001 From: Hui Wang Date: Wed, 17 Apr 2019 16:10:32 +0800 Subject: [PATCH 31/73] ALSA: hda/realtek - add two more pin configuration sets to quirk table commit b26e36b7ef36a8a3a147b1609b2505f8a4ecf511 upstream. We have two Dell laptops which have the codec 10ec0236 and 10ec0256 respectively, the headset mic on them can't work, need to apply the quirk of ALC255_FIXUP_DELL1_MIC_NO_PRESENCE. So adding their pin configurations in the pin quirk table. Cc: Signed-off-by: Hui Wang Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_realtek.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 9637d0bbdeb5..b9e720cb6f02 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6743,6 +6743,8 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x12, 0x90a60140}, {0x14, 0x90170150}, {0x21, 0x02211020}), + SND_HDA_PIN_QUIRK(0x10ec0236, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x21, 0x02211020}), SND_HDA_PIN_QUIRK(0x10ec0255, 0x1028, "Dell", ALC255_FIXUP_DELL2_MIC_NO_PRESENCE, {0x14, 0x90170110}, {0x21, 0x02211020}), @@ -6853,6 +6855,10 @@ static const struct snd_hda_pin_quirk alc269_pin_fixup_tbl[] = { {0x21, 0x0221101f}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, ALC256_STANDARD_PINS), + SND_HDA_PIN_QUIRK(0x10ec0256, 0x1028, "Dell", ALC255_FIXUP_DELL1_MIC_NO_PRESENCE, + {0x14, 0x90170110}, + {0x1b, 0x01011020}, + {0x21, 0x0221101f}), SND_HDA_PIN_QUIRK(0x10ec0256, 0x1043, "ASUS", ALC256_FIXUP_ASUS_MIC, {0x14, 0x90170110}, {0x1b, 0x90a70130}, From d11a33e9ba584bb6f5cc74df9d74b26156ba9bb2 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 16 Apr 2019 17:06:33 +0200 Subject: [PATCH 32/73] ALSA: core: Fix card races between register and disconnect commit 2a3f7221acddfe1caa9ff09b3a8158c39b2fdeac upstream. There is a small race window in the card disconnection code that allows the registration of another card with the very same card id. This leads to a warning in procfs creation as caught by syzkaller. The problem is that we delete snd_cards and snd_cards_lock entries at the very beginning of the disconnection procedure. This makes the slot available to be assigned for another card object while the disconnection procedure is being processed. Then it becomes possible to issue a procfs registration with the existing file name although we check the conflict beforehand. The fix is simply to move the snd_cards and snd_cards_lock clearances at the end of the disconnection procedure. The references to these entries are merely either from the global proc files like /proc/asound/cards or from the card registration / disconnection, so it should be fine to shift at the very end. Reported-by: syzbot+48df349490c36f9f54ab@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/init.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/sound/core/init.c b/sound/core/init.c index 32ebe2f6bc59..dcb9199f5e4f 100644 --- a/sound/core/init.c +++ b/sound/core/init.c @@ -406,14 +406,7 @@ int snd_card_disconnect(struct snd_card *card) card->shutdown = 1; spin_unlock(&card->files_lock); - /* phase 1: disable fops (user space) operations for ALSA API */ - mutex_lock(&snd_card_mutex); - snd_cards[card->number] = NULL; - clear_bit(card->number, snd_cards_lock); - mutex_unlock(&snd_card_mutex); - - /* phase 2: replace file->f_op with special dummy operations */ - + /* replace file->f_op with special dummy operations */ spin_lock(&card->files_lock); list_for_each_entry(mfile, &card->files_list, list) { /* it's critical part, use endless loop */ @@ -429,7 +422,7 @@ int snd_card_disconnect(struct snd_card *card) } spin_unlock(&card->files_lock); - /* phase 3: notify all connected devices about disconnection */ + /* notify all connected devices about disconnection */ /* at this point, they cannot respond to any calls except release() */ #if IS_ENABLED(CONFIG_SND_MIXER_OSS) @@ -445,6 +438,13 @@ int snd_card_disconnect(struct snd_card *card) device_del(&card->card_dev); card->registered = false; } + + /* disable fops (user space) operations for ALSA API */ + mutex_lock(&snd_card_mutex); + snd_cards[card->number] = NULL; + clear_bit(card->number, snd_cards_lock); + mutex_unlock(&snd_card_mutex); + #ifdef CONFIG_PM wake_up(&card->power_sleep); #endif From 49c67980e52ba3bfc7675c777d4896e9ea741efe Mon Sep 17 00:00:00 2001 From: Jaesoo Lee Date: Tue, 9 Apr 2019 17:02:22 -0700 Subject: [PATCH 33/73] scsi: core: set result when the command cannot be dispatched commit be549d49115422f846b6d96ee8fd7173a5f7ceb0 upstream. When SCSI blk-mq is enabled, there is a bug in handling errors in scsi_queue_rq. Specifically, the bug is not setting result field of scsi_request correctly when the dispatch of the command has been failed. Since the upper layer code including the sg_io ioctl expects to receive any error status from result field of scsi_request, the error is silently ignored and this could cause data corruptions for some applications. Fixes: d285203cf647 ("scsi: add support for a blk-mq based I/O path.") Cc: Signed-off-by: Jaesoo Lee Reviewed-by: Hannes Reinecke Reviewed-by: Bart Van Assche Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_lib.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_lib.c b/drivers/scsi/scsi_lib.c index 37d366696d21..c89f0e129f58 100644 --- a/drivers/scsi/scsi_lib.c +++ b/drivers/scsi/scsi_lib.c @@ -2050,8 +2050,12 @@ out: blk_mq_delay_run_hw_queue(hctx, SCSI_QUEUE_DELAY); break; default: + if (unlikely(!scsi_device_online(sdev))) + scsi_req(req)->result = DID_NO_CONNECT << 16; + else + scsi_req(req)->result = DID_ERROR << 16; /* - * Make sure to release all allocated ressources when + * Make sure to release all allocated resources when * we hit an error, as we will never see this command * again. */ From bc6b83db8f5ae658f1db0855bcc03c8366b9084e Mon Sep 17 00:00:00 2001 From: Saurav Kashyap Date: Thu, 18 Apr 2019 03:40:12 -0700 Subject: [PATCH 34/73] Revert "scsi: fcoe: clear FC_RP_STARTED flags when receiving a LOGO" commit 0228034d8e5915b98c33db35a98f5e909e848ae9 upstream. This patch clears FC_RP_STARTED flag during logoff, because of this re-login(flogi) didn't happen to the switch. This reverts commit 1550ec458e0cf1a40a170ab1f4c46e3f52860f65. Fixes: 1550ec458e0c ("scsi: fcoe: clear FC_RP_STARTED flags when receiving a LOGO") Cc: # v4.18+ Signed-off-by: Saurav Kashyap Reviewed-by: Hannes Reinecke Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/libfc/fc_rport.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/scsi/libfc/fc_rport.c b/drivers/scsi/libfc/fc_rport.c index 89b1f1af2fd4..31d31aad3de1 100644 --- a/drivers/scsi/libfc/fc_rport.c +++ b/drivers/scsi/libfc/fc_rport.c @@ -2164,7 +2164,6 @@ static void fc_rport_recv_logo_req(struct fc_lport *lport, struct fc_frame *fp) FC_RPORT_DBG(rdata, "Received LOGO request while in state %s\n", fc_rport_state(rdata)); - rdata->flags &= ~FC_RP_STARTED; fc_rport_enter_delete(rdata, RPORT_EV_STOP); mutex_unlock(&rdata->rp_mutex); kref_put(&rdata->kref, fc_rport_destroy); From c197e4693aca226675ffdce69ee5638842fbdfe6 Mon Sep 17 00:00:00 2001 From: "Suthikulpanit, Suravee" Date: Wed, 20 Mar 2019 08:12:28 +0000 Subject: [PATCH 35/73] Revert "svm: Fix AVIC incomplete IPI emulation" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 4a58038b9e420276157785afa0a0bbb4b9bc2265 upstream. This reverts commit bb218fbcfaaa3b115d4cd7a43c0ca164f3a96e57. As Oren Twaig pointed out the old discussion: https://patchwork.kernel.org/patch/8292231/ that the change coud potentially cause an extra IPI to be sent to the destination vcpu because the AVIC hardware already set the IRR bit before the incomplete IPI #VMEXIT with id=1 (target vcpu is not running). Since writting to ICR and ICR2 will also set the IRR. If something triggers the destination vcpu to get scheduled before the emulation finishes, then this could result in an additional IPI. Also, the issue mentioned in the commit bb218fbcfaaa was misdiagnosed. Cc: Radim Krčmář Cc: Paolo Bonzini Reported-by: Oren Twaig Signed-off-by: Suravee Suthikulpanit Cc: stable@vger.kernel.org Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/svm.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 3a57816ea498..1296e44fd969 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -4017,14 +4017,25 @@ static int avic_incomplete_ipi_interception(struct vcpu_svm *svm) kvm_lapic_reg_write(apic, APIC_ICR, icrl); break; case AVIC_IPI_FAILURE_TARGET_NOT_RUNNING: { + int i; + struct kvm_vcpu *vcpu; + struct kvm *kvm = svm->vcpu.kvm; struct kvm_lapic *apic = svm->vcpu.arch.apic; /* - * Update ICR high and low, then emulate sending IPI, - * which is handled when writing APIC_ICR. + * At this point, we expect that the AVIC HW has already + * set the appropriate IRR bits on the valid target + * vcpus. So, we just need to kick the appropriate vcpu. */ - kvm_lapic_reg_write(apic, APIC_ICR2, icrh); - kvm_lapic_reg_write(apic, APIC_ICR, icrl); + kvm_for_each_vcpu(i, vcpu, kvm) { + bool m = kvm_apic_match_dest(vcpu, apic, + icrl & KVM_APIC_SHORT_MASK, + GET_APIC_DEST_FIELD(icrh), + icrl & KVM_APIC_DEST_MASK); + + if (m && !avic_vcpu_is_running(vcpu)) + kvm_vcpu_wake_up(vcpu); + } break; } case AVIC_IPI_FAILURE_INVALID_TARGET: From bb461ad8e6e0653fc6bd0f26d9173bab0aec235b Mon Sep 17 00:00:00 2001 From: Andrea Arcangeli Date: Thu, 18 Apr 2019 17:50:52 -0700 Subject: [PATCH 36/73] coredump: fix race condition between mmget_not_zero()/get_task_mm() and core dumping commit 04f5866e41fb70690e28397487d8bd8eea7d712a upstream. The core dumping code has always run without holding the mmap_sem for writing, despite that is the only way to ensure that the entire vma layout will not change from under it. Only using some signal serialization on the processes belonging to the mm is not nearly enough. This was pointed out earlier. For example in Hugh's post from Jul 2017: https://lkml.kernel.org/r/alpine.LSU.2.11.1707191716030.2055@eggly.anvils "Not strictly relevant here, but a related note: I was very surprised to discover, only quite recently, how handle_mm_fault() may be called without down_read(mmap_sem) - when core dumping. That seems a misguided optimization to me, which would also be nice to correct" In particular because the growsdown and growsup can move the vm_start/vm_end the various loops the core dump does around the vma will not be consistent if page faults can happen concurrently. Pretty much all users calling mmget_not_zero()/get_task_mm() and then taking the mmap_sem had the potential to introduce unexpected side effects in the core dumping code. Adding mmap_sem for writing around the ->core_dump invocation is a viable long term fix, but it requires removing all copy user and page faults and to replace them with get_dump_page() for all binary formats which is not suitable as a short term fix. For the time being this solution manually covers the places that can confuse the core dump either by altering the vma layout or the vma flags while it runs. Once ->core_dump runs under mmap_sem for writing the function mmget_still_valid() can be dropped. Allowing mmap_sem protected sections to run in parallel with the coredump provides some minor parallelism advantage to the swapoff code (which seems to be safe enough by never mangling any vma field and can keep doing swapins in parallel to the core dumping) and to some other corner case. In order to facilitate the backporting I added "Fixes: 86039bd3b4e6" however the side effect of this same race condition in /proc/pid/mem should be reproducible since before 2.6.12-rc2 so I couldn't add any other "Fixes:" because there's no hash beyond the git genesis commit. Because find_extend_vma() is the only location outside of the process context that could modify the "mm" structures under mmap_sem for reading, by adding the mmget_still_valid() check to it, all other cases that take the mmap_sem for reading don't need the new check after mmget_not_zero()/get_task_mm(). The expand_stack() in page fault context also doesn't need the new check, because all tasks under core dumping are frozen. Link: http://lkml.kernel.org/r/20190325224949.11068-1-aarcange@redhat.com Fixes: 86039bd3b4e6 ("userfaultfd: add new syscall to provide memory externalization") Signed-off-by: Andrea Arcangeli Reported-by: Jann Horn Suggested-by: Oleg Nesterov Acked-by: Peter Xu Reviewed-by: Mike Rapoport Reviewed-by: Oleg Nesterov Reviewed-by: Jann Horn Acked-by: Jason Gunthorpe Acked-by: Michal Hocko Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/proc/task_mmu.c | 18 ++++++++++++++++++ fs/userfaultfd.c | 9 +++++++++ include/linux/sched/mm.h | 21 +++++++++++++++++++++ mm/mmap.c | 7 ++++++- 4 files changed, 54 insertions(+), 1 deletion(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 5e63c459dc61..309d24118f9a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1160,6 +1160,24 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, count = -EINTR; goto out_mm; } + /* + * Avoid to modify vma->vm_flags + * without locked ops while the + * coredump reads the vm_flags. + */ + if (!mmget_still_valid(mm)) { + /* + * Silently return "count" + * like if get_task_mm() + * failed. FIXME: should this + * function have returned + * -ESRCH if get_task_mm() + * failed like if + * get_proc_task() fails? + */ + up_write(&mm->mmap_sem); + goto out_mm; + } for (vma = mm->mmap; vma; vma = vma->vm_next) { vma->vm_flags &= ~VM_SOFTDIRTY; vma_set_page_prot(vma); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 5f10052d2671..7a908d683258 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -627,6 +627,8 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, /* the various vma->vm_userfaultfd_ctx still points to it */ down_write(&mm->mmap_sem); + /* no task can run (and in turn coredump) yet */ + VM_WARN_ON(!mmget_still_valid(mm)); for (vma = mm->mmap; vma; vma = vma->vm_next) if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) { vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; @@ -867,6 +869,8 @@ static int userfaultfd_release(struct inode *inode, struct file *file) * taking the mmap_sem for writing. */ down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto skip_mm; prev = NULL; for (vma = mm->mmap; vma; vma = vma->vm_next) { cond_resched(); @@ -889,6 +893,7 @@ static int userfaultfd_release(struct inode *inode, struct file *file) vma->vm_flags = new_flags; vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX; } +skip_mm: up_write(&mm->mmap_sem); mmput(mm); wakeup: @@ -1327,6 +1332,8 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; @@ -1514,6 +1521,8 @@ static int userfaultfd_unregister(struct userfaultfd_ctx *ctx, goto out; down_write(&mm->mmap_sem); + if (!mmget_still_valid(mm)) + goto out_unlock; vma = find_vma_prev(mm, start, &prev); if (!vma) goto out_unlock; diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..ef4ae0a545fe 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -57,6 +57,27 @@ static inline void mmdrop_async(struct mm_struct *mm) } } +/* + * This has to be called after a get_task_mm()/mmget_not_zero() + * followed by taking the mmap_sem for writing before modifying the + * vmas or anything the coredump pretends not to change from under it. + * + * NOTE: find_extend_vma() called from GUP context is the only place + * that can modify the "mm" (notably the vm_start/end) under mmap_sem + * for reading and outside the context of the process, so it is also + * the only case that holds the mmap_sem for reading that must call + * this function. Generally if the mmap_sem is hold for reading + * there's no need of this check after get_task_mm()/mmget_not_zero(). + * + * This function can be obsoleted and the check can be removed, after + * the coredump code will hold the mmap_sem for writing before + * invoking the ->core_dump methods. + */ +static inline bool mmget_still_valid(struct mm_struct *mm) +{ + return likely(!mm->core_state); +} + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/mm/mmap.c b/mm/mmap.c index 00dab291e61d..59fd53b41c9c 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -45,6 +45,7 @@ #include #include #include +#include #include #include @@ -2448,7 +2449,8 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) vma = find_vma_prev(mm, addr, &prev); if (vma && (vma->vm_start <= addr)) return vma; - if (!prev || expand_stack(prev, addr)) + /* don't alter vm_end if the coredump is running */ + if (!prev || !mmget_still_valid(mm) || expand_stack(prev, addr)) return NULL; if (prev->vm_flags & VM_LOCKED) populate_vma_page_range(prev, addr, prev->vm_end, NULL); @@ -2474,6 +2476,9 @@ find_extend_vma(struct mm_struct *mm, unsigned long addr) return vma; if (!(vma->vm_flags & VM_GROWSDOWN)) return NULL; + /* don't alter vm_start if the coredump is running */ + if (!mmget_still_valid(mm)) + return NULL; start = vma->vm_start; if (expand_stack(vma, addr)) return NULL; From 4de25ac0e2d22198395ae6f2ddd133fb824ea9e5 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Sun, 31 Mar 2019 13:04:11 -0700 Subject: [PATCH 37/73] crypto: x86/poly1305 - fix overflow during partial reduction commit 678cce4019d746da6c680c48ba9e6d417803e127 upstream. The x86_64 implementation of Poly1305 produces the wrong result on some inputs because poly1305_4block_avx2() incorrectly assumes that when partially reducing the accumulator, the bits carried from limb 'd4' to limb 'h0' fit in a 32-bit integer. This is true for poly1305-generic which processes only one block at a time. However, it's not true for the AVX2 implementation, which processes 4 blocks at a time and therefore can produce intermediate limbs about 4x larger. Fix it by making the relevant calculations use 64-bit arithmetic rather than 32-bit. Note that most of the carries already used 64-bit arithmetic, but the d4 -> h0 carry was different for some reason. To be safe I also made the same change to the corresponding SSE2 code, though that only operates on 1 or 2 blocks at a time. I don't think it's really needed for poly1305_block_sse2(), but it doesn't hurt because it's already x86_64 code. It *might* be needed for poly1305_2block_sse2(), but overflows aren't easy to reproduce there. This bug was originally detected by my patches that improve testmgr to fuzz algorithms against their generic implementation. But also add a test vector which reproduces it directly (in the AVX2 case). Fixes: b1ccc8f4b631 ("crypto: poly1305 - Add a four block AVX2 variant for x86_64") Fixes: c70f4abef07a ("crypto: poly1305 - Add a SSE2 SIMD variant for x86_64") Cc: # v4.3+ Cc: Martin Willi Cc: Jason A. Donenfeld Signed-off-by: Eric Biggers Reviewed-by: Martin Willi Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/poly1305-avx2-x86_64.S | 14 +++++--- arch/x86/crypto/poly1305-sse2-x86_64.S | 22 ++++++++----- crypto/testmgr.h | 44 +++++++++++++++++++++++++- 3 files changed, 67 insertions(+), 13 deletions(-) diff --git a/arch/x86/crypto/poly1305-avx2-x86_64.S b/arch/x86/crypto/poly1305-avx2-x86_64.S index 3b6e70d085da..8457cdd47f75 100644 --- a/arch/x86/crypto/poly1305-avx2-x86_64.S +++ b/arch/x86/crypto/poly1305-avx2-x86_64.S @@ -323,6 +323,12 @@ ENTRY(poly1305_4block_avx2) vpaddq t2,t1,t1 vmovq t1x,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -361,16 +367,16 @@ ENTRY(poly1305_4block_avx2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/arch/x86/crypto/poly1305-sse2-x86_64.S b/arch/x86/crypto/poly1305-sse2-x86_64.S index c88c670cb5fc..5851c7418fb7 100644 --- a/arch/x86/crypto/poly1305-sse2-x86_64.S +++ b/arch/x86/crypto/poly1305-sse2-x86_64.S @@ -253,16 +253,16 @@ ENTRY(poly1305_block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx @@ -520,6 +520,12 @@ ENTRY(poly1305_2block_sse2) paddq t2,t1 movq t1,d4 + # Now do a partial reduction mod (2^130)-5, carrying h0 -> h1 -> h2 -> + # h3 -> h4 -> h0 -> h1 to get h0,h2,h3,h4 < 2^26 and h1 < 2^26 + a small + # amount. Careful: we must not assume the carry bits 'd0 >> 26', + # 'd1 >> 26', 'd2 >> 26', 'd3 >> 26', and '(d4 >> 26) * 5' fit in 32-bit + # integers. It's true in a single-block implementation, but not here. + # d1 += d0 >> 26 mov d0,%rax shr $26,%rax @@ -558,16 +564,16 @@ ENTRY(poly1305_2block_sse2) # h0 += (d4 >> 26) * 5 mov d4,%rax shr $26,%rax - lea (%eax,%eax,4),%eax - add %eax,%ebx + lea (%rax,%rax,4),%rax + add %rax,%rbx # h4 = d4 & 0x3ffffff mov d4,%rax and $0x3ffffff,%eax mov %eax,h4 # h1 += h0 >> 26 - mov %ebx,%eax - shr $26,%eax + mov %rbx,%rax + shr $26,%rax add %eax,h1 # h0 = h0 & 0x3ffffff andl $0x3ffffff,%ebx diff --git a/crypto/testmgr.h b/crypto/testmgr.h index fbc0fab5e79e..12835f072614 100644 --- a/crypto/testmgr.h +++ b/crypto/testmgr.h @@ -4660,7 +4660,49 @@ static const struct hash_testvec poly1305_tv_template[] = { .psize = 80, .digest = "\x13\x00\x00\x00\x00\x00\x00\x00" "\x00\x00\x00\x00\x00\x00\x00\x00", - }, + }, { /* Regression test for overflow in AVX2 implementation */ + .plaintext = "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff\xff\xff\xff\xff" + "\xff\xff\xff\xff", + .psize = 300, + .digest = "\xfb\x5e\x96\xd8\x61\xd5\xc7\xc8" + "\x78\xe5\x87\xcc\x2d\x5a\x22\xe1", + } }; /* From 1f2b61e4657950b96722a5a6c6dd535347d55e8a Mon Sep 17 00:00:00 2001 From: Nathan Chancellor Date: Wed, 17 Apr 2019 00:21:21 -0700 Subject: [PATCH 38/73] arm64: futex: Restore oldval initialization to work around buggy compilers commit ff8acf929014b7f87315588e0daf8597c8aa9d1c upstream. Commit 045afc24124d ("arm64: futex: Fix FUTEX_WAKE_OP atomic ops with non-zero result value") removed oldval's zero initialization in arch_futex_atomic_op_inuser because it is not necessary. Unfortunately, Android's arm64 GCC 4.9.4 [1] does not agree: ../kernel/futex.c: In function 'do_futex': ../kernel/futex.c:1658:17: warning: 'oldval' may be used uninitialized in this function [-Wmaybe-uninitialized] return oldval == cmparg; ^ In file included from ../kernel/futex.c:73:0: ../arch/arm64/include/asm/futex.h:53:6: note: 'oldval' was declared here int oldval, ret, tmp; ^ GCC fails to follow that when ret is non-zero, futex_atomic_op_inuser returns right away, avoiding the uninitialized use that it claims. Restoring the zero initialization works around this issue. [1]: https://android.googlesource.com/platform/prebuilts/gcc/linux-x86/aarch64/aarch64-linux-android-4.9/ Cc: stable@vger.kernel.org Fixes: 045afc24124d ("arm64: futex: Fix FUTEX_WAKE_OP atomic ops with non-zero result value") Reviewed-by: Greg Kroah-Hartman Signed-off-by: Nathan Chancellor Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/futex.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/include/asm/futex.h b/arch/arm64/include/asm/futex.h index b447b4db423a..fd1e722f3821 100644 --- a/arch/arm64/include/asm/futex.h +++ b/arch/arm64/include/asm/futex.h @@ -50,7 +50,7 @@ do { \ static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) { - int oldval, ret, tmp; + int oldval = 0, ret, tmp; u32 __user *uaddr = __uaccess_mask_ptr(_uaddr); pagefault_disable(); From 877e9c51c96cd5ad92b45e36447e275374efc7af Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:49:52 +0900 Subject: [PATCH 39/73] x86/kprobes: Verify stack frame on kretprobe commit 3ff9c075cc767b3060bdac12da72fc94dd7da1b8 upstream. Verify the stack frame pointer on kretprobe trampoline handler, If the stack frame pointer does not match, it skips the wrong entry and tries to find correct one. This can happen if user puts the kretprobe on the function which can be used in the path of ftrace user-function call. Such functions should not be probed, so this adds a warning message that reports which function should be blacklisted. Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094059185.6137.15527904013362842072.stgit@devbox Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/kprobes/core.c | 26 ++++++++++++++++++++++++++ include/linux/kprobes.h | 1 + 2 files changed, 27 insertions(+) diff --git a/arch/x86/kernel/kprobes/core.c b/arch/x86/kernel/kprobes/core.c index 65452d555f05..56cf6c263254 100644 --- a/arch/x86/kernel/kprobes/core.c +++ b/arch/x86/kernel/kprobes/core.c @@ -553,6 +553,7 @@ void arch_prepare_kretprobe(struct kretprobe_instance *ri, struct pt_regs *regs) unsigned long *sara = stack_addr(regs); ri->ret_addr = (kprobe_opcode_t *) *sara; + ri->fp = sara; /* Replace the return addr with trampoline addr */ *sara = (unsigned long) &kretprobe_trampoline; @@ -754,15 +755,21 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) unsigned long flags, orig_ret_address = 0; unsigned long trampoline_address = (unsigned long)&kretprobe_trampoline; kprobe_opcode_t *correct_ret_addr = NULL; + void *frame_pointer; + bool skipped = false; INIT_HLIST_HEAD(&empty_rp); kretprobe_hash_lock(current, &head, &flags); /* fixup registers */ #ifdef CONFIG_X86_64 regs->cs = __KERNEL_CS; + /* On x86-64, we use pt_regs->sp for return address holder. */ + frame_pointer = ®s->sp; #else regs->cs = __KERNEL_CS | get_kernel_rpl(); regs->gs = 0; + /* On x86-32, we use pt_regs->flags for return address holder. */ + frame_pointer = ®s->flags; #endif regs->ip = trampoline_address; regs->orig_ax = ~0UL; @@ -784,8 +791,25 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + /* + * Return probes must be pushed on this hash list correct + * order (same as return order) so that it can be poped + * correctly. However, if we find it is pushed it incorrect + * order, this means we find a function which should not be + * probed, because the wrong order entry is pushed on the + * path of processing other kretprobe itself. + */ + if (ri->fp != frame_pointer) { + if (!skipped) + pr_warn("kretprobe is stacked incorrectly. Trying to fixup.\n"); + skipped = true; + continue; + } orig_ret_address = (unsigned long)ri->ret_addr; + if (skipped) + pr_warn("%ps must be blacklisted because of incorrect kretprobe order\n", + ri->rp->kp.addr); if (orig_ret_address != trampoline_address) /* @@ -803,6 +827,8 @@ __visible __used void *trampoline_handler(struct pt_regs *regs) if (ri->task != current) /* another task is sharing our hash bucket */ continue; + if (ri->fp != frame_pointer) + continue; orig_ret_address = (unsigned long)ri->ret_addr; if (ri->rp && ri->rp->handler) { diff --git a/include/linux/kprobes.h b/include/linux/kprobes.h index bd2684700b74..520702b82134 100644 --- a/include/linux/kprobes.h +++ b/include/linux/kprobes.h @@ -198,6 +198,7 @@ struct kretprobe_instance { struct kretprobe *rp; kprobe_opcode_t *ret_addr; struct task_struct *task; + void *fp; char data[0]; }; From 18a0a7c1f9d5150dd6da3adf93e2275717f6c6d6 Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Sun, 24 Feb 2019 01:50:20 +0900 Subject: [PATCH 40/73] kprobes: Mark ftrace mcount handler functions nokprobe commit fabe38ab6b2bd9418350284c63825f13b8a6abba upstream. Mark ftrace mcount handler functions nokprobe since probing on these functions with kretprobe pushes return address incorrectly on kretprobe shadow stack. Reported-by: Francis Deslauriers Tested-by: Andrea Righi Signed-off-by: Masami Hiramatsu Acked-by: Steven Rostedt Acked-by: Steven Rostedt (VMware) Cc: Linus Torvalds Cc: Mathieu Desnoyers Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org Link: http://lkml.kernel.org/r/155094062044.6137.6419622920568680640.stgit@devbox Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 9937d7cf2a64..3e92852c8b23 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -33,6 +33,7 @@ #include #include #include +#include #include @@ -6035,7 +6036,7 @@ void ftrace_reset_array_ops(struct trace_array *tr) tr->ops->func = ftrace_stub; } -static inline void +static nokprobe_inline void __ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, struct ftrace_ops *ignored, struct pt_regs *regs) { @@ -6098,11 +6099,13 @@ static void ftrace_ops_list_func(unsigned long ip, unsigned long parent_ip, { __ftrace_ops_list_func(ip, parent_ip, NULL, regs); } +NOKPROBE_SYMBOL(ftrace_ops_list_func); #else static void ftrace_ops_no_ops(unsigned long ip, unsigned long parent_ip) { __ftrace_ops_list_func(ip, parent_ip, NULL, NULL); } +NOKPROBE_SYMBOL(ftrace_ops_no_ops); #endif /* @@ -6132,6 +6135,7 @@ static void ftrace_ops_assist_func(unsigned long ip, unsigned long parent_ip, preempt_enable_notrace(); trace_clear_recursion(bit); } +NOKPROBE_SYMBOL(ftrace_ops_assist_func); /** * ftrace_ops_get_func - get the function a trampoline should call From 5e8002fb22201466d27022c03a5911b30034ff7a Mon Sep 17 00:00:00 2001 From: Masami Hiramatsu Date: Mon, 15 Apr 2019 15:01:25 +0900 Subject: [PATCH 41/73] kprobes: Fix error check when reusing optimized probes commit 5f843ed415581cfad4ef8fefe31c138a8346ca8a upstream. The following commit introduced a bug in one of our error paths: 819319fc9346 ("kprobes: Return error if we fail to reuse kprobe instead of BUG_ON()") it missed to handle the return value of kprobe_optready() as error-value. In reality, the kprobe_optready() returns a bool result, so "true" case must be passed instead of 0. This causes some errors on kprobe boot-time selftests on ARM: [ ] Beginning kprobe tests... [ ] Probe ARM code [ ] kprobe [ ] kretprobe [ ] ARM instruction simulation [ ] Check decoding tables [ ] Run test cases [ ] FAIL: test_case_handler not run [ ] FAIL: Test andge r10, r11, r14, asr r7 [ ] FAIL: Scenario 11 ... [ ] FAIL: Scenario 7 [ ] Total instruction simulation tests=1631, pass=1433 fail=198 [ ] kprobe tests failed This can happen if an optimized probe is unregistered and next kprobe is registered on same address until the previous probe is not reclaimed. If this happens, a hidden aggregated probe may be kept in memory, and no new kprobe can probe same address. Also, in that case register_kprobe() will return "1" instead of minus error value, which can mislead caller logic. Signed-off-by: Masami Hiramatsu Cc: Anil S Keshavamurthy Cc: David S . Miller Cc: Linus Torvalds Cc: Naveen N . Rao Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: stable@vger.kernel.org # v5.0+ Fixes: 819319fc9346 ("kprobes: Return error if we fail to reuse kprobe instead of BUG_ON()") Link: http://lkml.kernel.org/r/155530808559.32517.539898325433642204.stgit@devnote2 Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/kprobes.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/kprobes.c b/kernel/kprobes.c index 5cbad4fb9107..ec11bb986a8b 100644 --- a/kernel/kprobes.c +++ b/kernel/kprobes.c @@ -703,7 +703,6 @@ static void unoptimize_kprobe(struct kprobe *p, bool force) static int reuse_unused_kprobe(struct kprobe *ap) { struct optimized_kprobe *op; - int ret; BUG_ON(!kprobe_unused(ap)); /* @@ -717,9 +716,8 @@ static int reuse_unused_kprobe(struct kprobe *ap) /* Enable the probe again */ ap->flags &= ~KPROBE_FLAG_DISABLED; /* Optimize it again (remove from op->list) */ - ret = kprobe_optready(ap); - if (ret) - return ret; + if (!kprobe_optready(ap)) + return -EINVAL; optimize_kprobe(ap); return 0; From 8a80544c5e6f50e7796380381f285488af47287f Mon Sep 17 00:00:00 2001 From: Vijayakumar Durai Date: Wed, 27 Mar 2019 11:03:17 +0100 Subject: [PATCH 42/73] rt2x00: do not increment sequence number while re-transmitting commit 746ba11f170603bf1eaade817553a6c2e9135bbe upstream. Currently rt2x00 devices retransmit the management frames with incremented sequence number if hardware is assigning the sequence. This is HW bug fixed already for non-QOS data frames, but it should be fixed for management frames except beacon. Without fix retransmitted frames have wrong SN: AlphaNet_e8:fb:36 Vivotek_52:31:51 Authentication, SN=1648, FN=0, Flags=........C Frame is not being retransmitted 1648 1 AlphaNet_e8:fb:36 Vivotek_52:31:51 Authentication, SN=1649, FN=0, Flags=....R...C Frame is being retransmitted 1649 1 AlphaNet_e8:fb:36 Vivotek_52:31:51 Authentication, SN=1650, FN=0, Flags=....R...C Frame is being retransmitted 1650 1 With the fix SN stays correctly the same: 88:6a:e3:e8:f9:a2 8c:f5:a3:88:76:87 Authentication, SN=1450, FN=0, Flags=........C 88:6a:e3:e8:f9:a2 8c:f5:a3:88:76:87 Authentication, SN=1450, FN=0, Flags=....R...C 88:6a:e3:e8:f9:a2 8c:f5:a3:88:76:87 Authentication, SN=1450, FN=0, Flags=....R...C Cc: stable@vger.kernel.org Signed-off-by: Vijayakumar Durai [sgruszka: simplify code, change comments and changelog] Signed-off-by: Stanislaw Gruszka Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ralink/rt2x00/rt2x00.h | 1 - drivers/net/wireless/ralink/rt2x00/rt2x00mac.c | 10 ---------- drivers/net/wireless/ralink/rt2x00/rt2x00queue.c | 15 +++++++++------ 3 files changed, 9 insertions(+), 17 deletions(-) diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00.h b/drivers/net/wireless/ralink/rt2x00/rt2x00.h index 1f38c338ca7a..2a25996d058d 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00.h +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00.h @@ -672,7 +672,6 @@ enum rt2x00_state_flags { CONFIG_CHANNEL_HT40, CONFIG_POWERSAVING, CONFIG_HT_DISABLED, - CONFIG_QOS_DISABLED, CONFIG_MONITORING, /* diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c index 6fe0c6abe0d6..84728c281f46 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00mac.c @@ -670,18 +670,8 @@ void rt2x00mac_bss_info_changed(struct ieee80211_hw *hw, rt2x00dev->intf_associated--; rt2x00leds_led_assoc(rt2x00dev, !!rt2x00dev->intf_associated); - - clear_bit(CONFIG_QOS_DISABLED, &rt2x00dev->flags); } - /* - * Check for access point which do not support 802.11e . We have to - * generate data frames sequence number in S/W for such AP, because - * of H/W bug. - */ - if (changes & BSS_CHANGED_QOS && !bss_conf->qos) - set_bit(CONFIG_QOS_DISABLED, &rt2x00dev->flags); - /* * When the erp information has changed, we should perform * additional configuration steps. For all other changes we are done. diff --git a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c index e1660b92b20c..1b0f2da8a10d 100644 --- a/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c +++ b/drivers/net/wireless/ralink/rt2x00/rt2x00queue.c @@ -200,15 +200,18 @@ static void rt2x00queue_create_tx_descriptor_seq(struct rt2x00_dev *rt2x00dev, if (!rt2x00_has_cap_flag(rt2x00dev, REQUIRE_SW_SEQNO)) { /* * rt2800 has a H/W (or F/W) bug, device incorrectly increase - * seqno on retransmited data (non-QOS) frames. To workaround - * the problem let's generate seqno in software if QOS is - * disabled. + * seqno on retransmitted data (non-QOS) and management frames. + * To workaround the problem let's generate seqno in software. + * Except for beacons which are transmitted periodically by H/W + * hence hardware has to assign seqno for them. */ - if (test_bit(CONFIG_QOS_DISABLED, &rt2x00dev->flags)) - __clear_bit(ENTRY_TXD_GENERATE_SEQ, &txdesc->flags); - else + if (ieee80211_is_beacon(hdr->frame_control)) { + __set_bit(ENTRY_TXD_GENERATE_SEQ, &txdesc->flags); /* H/W will generate sequence number */ return; + } + + __clear_bit(ENTRY_TXD_GENERATE_SEQ, &txdesc->flags); } /* From 5efba8d96466b824aa9d205fe6eabcec2c6aa960 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Fri, 1 Mar 2019 14:48:37 +0100 Subject: [PATCH 43/73] mac80211: do not call driver wake_tx_queue op during reconfig commit 4856bfd230985e43e84c26473c91028ff0a533bd upstream. There are several scenarios in which mac80211 can call drv_wake_tx_queue after ieee80211_restart_hw has been called and has not yet completed. Driver private structs are considered uninitialized until mac80211 has uploaded the vifs, stations and keys again, so using private tx queue data during that time is not safe. The driver can also not rely on drv_reconfig_complete to figure out when it is safe to accept drv_wake_tx_queue calls again, because it is only called after all tx queues are woken again. To fix this, bail out early in drv_wake_tx_queue if local->in_reconfig is set. Cc: stable@vger.kernel.org Signed-off-by: Felix Fietkau Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/driver-ops.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 4d82fe7d627c..284276b3e0b4 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -1164,6 +1164,9 @@ static inline void drv_wake_tx_queue(struct ieee80211_local *local, { struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->txq.vif); + if (local->in_reconfig) + return; + if (!check_sdata_in_driver(sdata)) return; From 59809557e84e22d1ae75703a237b020a4acc9508 Mon Sep 17 00:00:00 2001 From: Kim Phillips Date: Thu, 21 Mar 2019 21:15:22 +0000 Subject: [PATCH 44/73] perf/x86/amd: Add event map for AMD Family 17h MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3fe3331bb285700ab2253dbb07f8e478fcea2f1b upstream. Family 17h differs from prior families by: - Does not support an L2 cache miss event - It has re-enumerated PMC counters for: - L2 cache references - front & back end stalled cycles So we add a new amd_f17h_perfmon_event_map[] so that the generic perf event names will resolve to the correct h/w events on family 17h and above processors. Reference sections 2.1.13.3.3 (stalls) and 2.1.13.3.6 (L2): https://www.amd.com/system/files/TechDocs/54945_PPR_Family_17h_Models_00h-0Fh.pdf Signed-off-by: Kim Phillips Cc: # v4.9+ Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Borislav Petkov Cc: H. Peter Anvin Cc: Janakarajan Natarajan Cc: Jiri Olsa Cc: Linus Torvalds Cc: Martin Liška Cc: Namhyung Kim Cc: Peter Zijlstra Cc: Pu Wen Cc: Suravee Suthikulpanit Cc: Thomas Gleixner Cc: linux-kernel@vger.kernel.org Fixes: e40ed1542dd7 ("perf/x86: Add perf support for AMD family-17h processors") [ Improved the formatting a bit. ] Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/amd/core.c | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/arch/x86/events/amd/core.c b/arch/x86/events/amd/core.c index 3e5dd85b019a..263af6312329 100644 --- a/arch/x86/events/amd/core.c +++ b/arch/x86/events/amd/core.c @@ -117,22 +117,39 @@ static __initconst const u64 amd_hw_cache_event_ids }; /* - * AMD Performance Monitor K7 and later. + * AMD Performance Monitor K7 and later, up to and including Family 16h: */ static const u64 amd_perfmon_event_map[PERF_COUNT_HW_MAX] = { - [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, - [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, - [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, - [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, - [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, - [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, - [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ - [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0x077d, + [PERF_COUNT_HW_CACHE_MISSES] = 0x077e, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x00d0, /* "Decoder empty" event */ + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x00d1, /* "Dispatch stalls" event */ +}; + +/* + * AMD Performance Monitor Family 17h and later: + */ +static const u64 amd_f17h_perfmon_event_map[PERF_COUNT_HW_MAX] = +{ + [PERF_COUNT_HW_CPU_CYCLES] = 0x0076, + [PERF_COUNT_HW_INSTRUCTIONS] = 0x00c0, + [PERF_COUNT_HW_CACHE_REFERENCES] = 0xff60, + [PERF_COUNT_HW_BRANCH_INSTRUCTIONS] = 0x00c2, + [PERF_COUNT_HW_BRANCH_MISSES] = 0x00c3, + [PERF_COUNT_HW_STALLED_CYCLES_FRONTEND] = 0x0287, + [PERF_COUNT_HW_STALLED_CYCLES_BACKEND] = 0x0187, }; static u64 amd_pmu_event_map(int hw_event) { + if (boot_cpu_data.x86 >= 0x17) + return amd_f17h_perfmon_event_map[hw_event]; + return amd_perfmon_event_map[hw_event]; } From 607d291cfc7b5df5df5194fe8a32d22ec1960aa4 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Fri, 29 Mar 2019 17:47:43 -0700 Subject: [PATCH 45/73] x86/cpu/bugs: Use __initconst for 'const' init data commit 1de7edbb59c8f1b46071f66c5c97b8a59569eb51 upstream. Some of the recently added const tables use __initdata which causes section attribute conflicts. Use __initconst instead. Fixes: fa1202ef2243 ("x86/speculation: Add command line control") Signed-off-by: Andi Kleen Signed-off-by: Thomas Gleixner Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/20190330004743.29541-9-andi@firstfloor.org Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/bugs.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c index ec7aedba3d74..5567705e0601 100644 --- a/arch/x86/kernel/cpu/bugs.c +++ b/arch/x86/kernel/cpu/bugs.c @@ -271,7 +271,7 @@ static const struct { const char *option; enum spectre_v2_user_cmd cmd; bool secure; -} v2_user_options[] __initdata = { +} v2_user_options[] __initconst = { { "auto", SPECTRE_V2_USER_CMD_AUTO, false }, { "off", SPECTRE_V2_USER_CMD_NONE, false }, { "on", SPECTRE_V2_USER_CMD_FORCE, true }, @@ -406,7 +406,7 @@ static const struct { const char *option; enum spectre_v2_mitigation_cmd cmd; bool secure; -} mitigation_options[] __initdata = { +} mitigation_options[] __initconst = { { "off", SPECTRE_V2_CMD_NONE, false }, { "on", SPECTRE_V2_CMD_FORCE, true }, { "retpoline", SPECTRE_V2_CMD_RETPOLINE, false }, @@ -642,7 +642,7 @@ static const char * const ssb_strings[] = { static const struct { const char *option; enum ssb_mitigation_cmd cmd; -} ssb_mitigation_options[] __initdata = { +} ssb_mitigation_options[] __initconst = { { "auto", SPEC_STORE_BYPASS_CMD_AUTO }, /* Platform decides */ { "on", SPEC_STORE_BYPASS_CMD_ON }, /* Disable Speculative Store Bypass */ { "off", SPEC_STORE_BYPASS_CMD_NONE }, /* Don't touch Speculative Store Bypass */ From 55e7e51f750b91adbce9784cd0283f79b6d07af3 Mon Sep 17 00:00:00 2001 From: Kan Liang Date: Tue, 2 Apr 2019 12:44:58 -0700 Subject: [PATCH 46/73] perf/x86: Fix incorrect PEBS_REGS commit 9d5dcc93a6ddfc78124f006ccd3637ce070ef2fc upstream. PEBS_REGS used as mask for the supported registers for large PEBS. However, the mask cannot filter the sample_regs_user/sample_regs_intr correctly. (1ULL << PERF_REG_X86_*) should be used to replace PERF_REG_X86_*, which is only the index. Rename PEBS_REGS to PEBS_GP_REGS, because the mask is only for general purpose registers. Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: acme@kernel.org Cc: jolsa@kernel.org Fixes: 2fe1bc1f501d ("perf/x86: Enable free running PEBS for REGS_USER/INTR") Link: https://lkml.kernel.org/r/20190402194509.2832-2-kan.liang@linux.intel.com [ Renamed it to PEBS_GP_REGS - as 'GPRS' is used elsewhere ;-) ] Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/perf_event.h | 38 ++++++++++++++++++------------------ 2 files changed, 20 insertions(+), 20 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index dc8f8b3e6cec..99d45660242e 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3001,7 +3001,7 @@ static unsigned long intel_pmu_free_running_flags(struct perf_event *event) flags &= ~PERF_SAMPLE_TIME; if (!event->attr.exclude_kernel) flags &= ~PERF_SAMPLE_REGS_USER; - if (event->attr.sample_regs_user & ~PEBS_REGS) + if (event->attr.sample_regs_user & ~PEBS_GP_REGS) flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index 84b3841c131d..bfe16631fd1d 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -95,25 +95,25 @@ struct amd_nb { PERF_SAMPLE_TRANSACTION | PERF_SAMPLE_PHYS_ADDR | \ PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -#define PEBS_REGS \ - (PERF_REG_X86_AX | \ - PERF_REG_X86_BX | \ - PERF_REG_X86_CX | \ - PERF_REG_X86_DX | \ - PERF_REG_X86_DI | \ - PERF_REG_X86_SI | \ - PERF_REG_X86_SP | \ - PERF_REG_X86_BP | \ - PERF_REG_X86_IP | \ - PERF_REG_X86_FLAGS | \ - PERF_REG_X86_R8 | \ - PERF_REG_X86_R9 | \ - PERF_REG_X86_R10 | \ - PERF_REG_X86_R11 | \ - PERF_REG_X86_R12 | \ - PERF_REG_X86_R13 | \ - PERF_REG_X86_R14 | \ - PERF_REG_X86_R15) +#define PEBS_GP_REGS \ + ((1ULL << PERF_REG_X86_AX) | \ + (1ULL << PERF_REG_X86_BX) | \ + (1ULL << PERF_REG_X86_CX) | \ + (1ULL << PERF_REG_X86_DX) | \ + (1ULL << PERF_REG_X86_DI) | \ + (1ULL << PERF_REG_X86_SI) | \ + (1ULL << PERF_REG_X86_SP) | \ + (1ULL << PERF_REG_X86_BP) | \ + (1ULL << PERF_REG_X86_IP) | \ + (1ULL << PERF_REG_X86_FLAGS) | \ + (1ULL << PERF_REG_X86_R8) | \ + (1ULL << PERF_REG_X86_R9) | \ + (1ULL << PERF_REG_X86_R10) | \ + (1ULL << PERF_REG_X86_R11) | \ + (1ULL << PERF_REG_X86_R12) | \ + (1ULL << PERF_REG_X86_R13) | \ + (1ULL << PERF_REG_X86_R14) | \ + (1ULL << PERF_REG_X86_R15)) /* * Per register state. From 3b921dc46fe13fa20d7bc83df3eaa7511bb2717e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sun, 14 Apr 2019 19:51:06 +0200 Subject: [PATCH 47/73] x86/speculation: Prevent deadlock on ssb_state::lock commit 2f5fb19341883bb6e37da351bc3700489d8506a7 upstream. Mikhail reported a lockdep splat related to the AMD specific ssb_state lock: CPU0 CPU1 lock(&st->lock); local_irq_disable(); lock(&(&sighand->siglock)->rlock); lock(&st->lock); lock(&(&sighand->siglock)->rlock); *** DEADLOCK *** The connection between sighand->siglock and st->lock comes through seccomp, which takes st->lock while holding sighand->siglock. Make sure interrupts are disabled when __speculation_ctrl_update() is invoked via prctl() -> speculation_ctrl_update(). Add a lockdep assert to catch future offenders. Fixes: 1f50ddb4f418 ("x86/speculation: Handle HT correctly on AMD") Reported-by: Mikhail Gavrilov Signed-off-by: Thomas Gleixner Tested-by: Mikhail Gavrilov Cc: Thomas Lendacky Cc: stable@vger.kernel.org Link: https://lkml.kernel.org/r/alpine.DEB.2.21.1904141948200.4917@nanos.tec.linutronix.de Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/process.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index a98d1cdd6299..d2ef967bfafb 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -465,10 +465,12 @@ static unsigned long speculation_ctrl_update_tif(struct task_struct *tsk) void speculation_ctrl_update(unsigned long tif) { + unsigned long flags; + /* Forced update. Make sure all relevant TIF flags are different */ - preempt_disable(); + local_irq_save(flags); __speculation_ctrl_update(~tif, tif); - preempt_enable(); + local_irq_restore(flags); } /* Called from seccomp/prctl update */ From 4cec35e8e251eb953869c199c2a33e841e422fc4 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 19 Apr 2018 18:41:55 +0200 Subject: [PATCH 48/73] crypto: crypto4xx - properly set IV after de- and encrypt [ Upstream commit fc340115ffb8235c1bbd200c28855e6373d0dd1a ] This patch fixes cts(cbc(aes)) test when cbc-aes-ppc4xx is used. alg: skcipher: Test 1 failed (invalid result) on encryption for cts(cbc-aes-ppc4xx) 00000000: 4b 10 75 fc 2f 14 1b 6a 27 35 37 33 d1 b7 70 05 00000010: 97 alg: skcipher: Failed to load transform for cts(cbc(aes)): -2 The CTS cipher mode expect the IV (req->iv) of skcipher_request to contain the last ciphertext block after the {en,de}crypt operation is complete. Fix this issue for the AMCC Crypto4xx hardware engine. The tcrypt test case for cts(cbc(aes)) is now correctly passed. name : cts(cbc(aes)) driver : cts(cbc-aes-ppc4xx) module : cts priority : 300 refcnt : 1 selftest : passed internal : no type : skcipher async : yes blocksize : 16 min keysize : 16 max keysize : 32 ivsize : 16 chunksize : 16 walksize : 16 Signed-off-by: Christian Lamparter Signed-off-by: Herbert Xu Signed-off-by: Sasha Levin --- drivers/crypto/amcc/crypto4xx_alg.c | 3 ++- drivers/crypto/amcc/crypto4xx_core.c | 9 +++++++++ 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/amcc/crypto4xx_alg.c b/drivers/crypto/amcc/crypto4xx_alg.c index 4afca3968773..e3b8bebfdd30 100644 --- a/drivers/crypto/amcc/crypto4xx_alg.c +++ b/drivers/crypto/amcc/crypto4xx_alg.c @@ -138,7 +138,8 @@ static int crypto4xx_setkey_aes(struct crypto_ablkcipher *cipher, sa = (struct dynamic_sa_ctl *) ctx->sa_in; ctx->hash_final = 0; - set_dynamic_sa_command_0(sa, SA_NOT_SAVE_HASH, SA_NOT_SAVE_IV, + set_dynamic_sa_command_0(sa, SA_NOT_SAVE_HASH, (cm == CRYPTO_MODE_CBC ? + SA_SAVE_IV : SA_NOT_SAVE_IV), SA_LOAD_HASH_FROM_SA, SA_LOAD_IV_FROM_STATE, SA_NO_HEADER_PROC, SA_HASH_ALG_NULL, SA_CIPHER_ALG_AES, SA_PAD_TYPE_ZERO, diff --git a/drivers/crypto/amcc/crypto4xx_core.c b/drivers/crypto/amcc/crypto4xx_core.c index 3f9eee7e555f..8d4d8db244e9 100644 --- a/drivers/crypto/amcc/crypto4xx_core.c +++ b/drivers/crypto/amcc/crypto4xx_core.c @@ -645,6 +645,15 @@ static u32 crypto4xx_ablkcipher_done(struct crypto4xx_device *dev, addr = dma_map_page(dev->core_dev->device, sg_page(dst), dst->offset, dst->length, DMA_FROM_DEVICE); } + + if (pd_uinfo->sa_va->sa_command_0.bf.save_iv == SA_SAVE_IV) { + struct crypto_skcipher *skcipher = crypto_skcipher_reqtfm(req); + + crypto4xx_memcpy_from_le32((u32 *)req->iv, + pd_uinfo->sr_va->save_iv, + crypto_skcipher_ivsize(skcipher)); + } + crypto4xx_ret_sg_desc(dev, pd_uinfo); if (ablk_req->base.complete != NULL) ablk_req->base.complete(&ablk_req->base, 0); From f83cf258b928b94a908d88e692f14ef459f927db Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 15 Nov 2018 15:53:41 +0200 Subject: [PATCH 49/73] mmc: sdhci: Fix data command CRC error handling [ Upstream commit 4bf780996669280171c9cd58196512849b93434e ] Existing data command CRC error handling is non-standard and does not work with some Intel host controllers. Specifically, the assumption that the host controller will continue operating normally after the error interrupt, is not valid. Change the driver to handle the error in the same manner as a data CRC error, taking care to ensure that the data line reset is done for single or multi-block transfers, and it is done before unmapping DMA. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci.c | 40 +++++++++++++++------------------------- 1 file changed, 15 insertions(+), 25 deletions(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 0edcc2763f3c..2d59b0389567 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -1002,8 +1002,7 @@ static bool sdhci_needs_reset(struct sdhci_host *host, struct mmc_request *mrq) return (!(host->flags & SDHCI_DEVICE_DEAD) && ((mrq->cmd && mrq->cmd->error) || (mrq->sbc && mrq->sbc->error) || - (mrq->data && ((mrq->data->error && !mrq->data->stop) || - (mrq->data->stop && mrq->data->stop->error))) || + (mrq->data && mrq->data->stop && mrq->data->stop->error) || (host->quirks & SDHCI_QUIRK_RESET_AFTER_REQUEST))); } @@ -1055,6 +1054,16 @@ static void sdhci_finish_data(struct sdhci_host *host) host->data = NULL; host->data_cmd = NULL; + /* + * The controller needs a reset of internal state machines upon error + * conditions. + */ + if (data->error) { + if (!host->cmd || host->cmd == data_cmd) + sdhci_do_reset(host, SDHCI_RESET_CMD); + sdhci_do_reset(host, SDHCI_RESET_DATA); + } + if ((host->flags & (SDHCI_REQ_USE_DMA | SDHCI_USE_ADMA)) == (SDHCI_REQ_USE_DMA | SDHCI_USE_ADMA)) sdhci_adma_table_post(host, data); @@ -1079,17 +1088,6 @@ static void sdhci_finish_data(struct sdhci_host *host) if (data->stop && (data->error || !data->mrq->sbc)) { - - /* - * The controller needs a reset of internal state machines - * upon error conditions. - */ - if (data->error) { - if (!host->cmd || host->cmd == data_cmd) - sdhci_do_reset(host, SDHCI_RESET_CMD); - sdhci_do_reset(host, SDHCI_RESET_DATA); - } - /* * 'cap_cmd_during_tfr' request must not use the command line * after mmc_command_done() has been called. It is upper layer's @@ -2560,7 +2558,7 @@ static void sdhci_timeout_data_timer(unsigned long data) * * \*****************************************************************************/ -static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask) +static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) { if (!host->cmd) { /* @@ -2583,20 +2581,12 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask) else host->cmd->error = -EILSEQ; - /* - * If this command initiates a data phase and a response - * CRC error is signalled, the card can start transferring - * data - the card may have received the command without - * error. We must not terminate the mmc_request early. - * - * If the card did not receive the command or returned an - * error which prevented it sending data, the data phase - * will time out. - */ + /* Treat data command CRC error the same as data CRC error */ if (host->cmd->data && (intmask & (SDHCI_INT_CRC | SDHCI_INT_TIMEOUT)) == SDHCI_INT_CRC) { host->cmd = NULL; + *intmask_p |= SDHCI_INT_DATA_CRC; return; } @@ -2824,7 +2814,7 @@ static irqreturn_t sdhci_irq(int irq, void *dev_id) } if (intmask & SDHCI_INT_CMD_MASK) - sdhci_cmd_irq(host, intmask & SDHCI_INT_CMD_MASK); + sdhci_cmd_irq(host, intmask & SDHCI_INT_CMD_MASK, &intmask); if (intmask & SDHCI_INT_DATA_MASK) sdhci_data_irq(host, intmask & SDHCI_INT_DATA_MASK); From 45fd8679ea86bffb352132a1df4917c3d11375aa Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 15 Nov 2018 15:53:42 +0200 Subject: [PATCH 50/73] mmc: sdhci: Rename SDHCI_ACMD12_ERR and SDHCI_INT_ACMD12ERR [ Upstream commit 869f8a69bb3a4aec4eb914a330d4ba53a9eed495 ] The SDHCI_ACMD12_ERR register is used for auto-CMD23 and auto-CMD12 errors, as is the SDHCI_INT_ACMD12ERR interrupt bit. Rename them to SDHCI_AUTO_CMD_STATUS and SDHCI_INT_AUTO_CMD_ERR respectively. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci-esdhc-imx.c | 12 ++++++------ drivers/mmc/host/sdhci.c | 4 ++-- drivers/mmc/host/sdhci.h | 4 ++-- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index ff5c4ad37a3a..8c0b80a54e4d 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -425,7 +425,7 @@ static u16 esdhc_readw_le(struct sdhci_host *host, int reg) val = readl(host->ioaddr + ESDHC_MIX_CTRL); else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) /* the std tuning bits is in ACMD12_ERR for imx6sl */ - val = readl(host->ioaddr + SDHCI_ACMD12_ERR); + val = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS); } if (val & ESDHC_MIX_CTRL_EXE_TUNE) @@ -490,7 +490,7 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg) } writel(new_val , host->ioaddr + ESDHC_MIX_CTRL); } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) { - u32 v = readl(host->ioaddr + SDHCI_ACMD12_ERR); + u32 v = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS); u32 m = readl(host->ioaddr + ESDHC_MIX_CTRL); if (val & SDHCI_CTRL_TUNED_CLK) { v |= ESDHC_MIX_CTRL_SMPCLK_SEL; @@ -508,7 +508,7 @@ static void esdhc_writew_le(struct sdhci_host *host, u16 val, int reg) v &= ~ESDHC_MIX_CTRL_EXE_TUNE; } - writel(v, host->ioaddr + SDHCI_ACMD12_ERR); + writel(v, host->ioaddr + SDHCI_AUTO_CMD_STATUS); writel(m, host->ioaddr + ESDHC_MIX_CTRL); } return; @@ -937,9 +937,9 @@ static void esdhc_reset_tuning(struct sdhci_host *host) writel(ctrl, host->ioaddr + ESDHC_MIX_CTRL); writel(0, host->ioaddr + ESDHC_TUNE_CTRL_STATUS); } else if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) { - ctrl = readl(host->ioaddr + SDHCI_ACMD12_ERR); + ctrl = readl(host->ioaddr + SDHCI_AUTO_CMD_STATUS); ctrl &= ~ESDHC_MIX_CTRL_SMPCLK_SEL; - writel(ctrl, host->ioaddr + SDHCI_ACMD12_ERR); + writel(ctrl, host->ioaddr + SDHCI_AUTO_CMD_STATUS); } } } @@ -1303,7 +1303,7 @@ static int sdhci_esdhc_imx_probe(struct platform_device *pdev) /* clear tuning bits in case ROM has set it already */ writel(0x0, host->ioaddr + ESDHC_MIX_CTRL); - writel(0x0, host->ioaddr + SDHCI_ACMD12_ERR); + writel(0x0, host->ioaddr + SDHCI_AUTO_CMD_STATUS); writel(0x0, host->ioaddr + ESDHC_TUNE_CTRL_STATUS); } diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 2d59b0389567..8abbe40a9a05 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -82,8 +82,8 @@ void sdhci_dumpregs(struct sdhci_host *host) SDHCI_DUMP("Int enab: 0x%08x | Sig enab: 0x%08x\n", sdhci_readl(host, SDHCI_INT_ENABLE), sdhci_readl(host, SDHCI_SIGNAL_ENABLE)); - SDHCI_DUMP("AC12 err: 0x%08x | Slot int: 0x%08x\n", - sdhci_readw(host, SDHCI_ACMD12_ERR), + SDHCI_DUMP("ACmd stat: 0x%08x | Slot int: 0x%08x\n", + sdhci_readw(host, SDHCI_AUTO_CMD_STATUS), sdhci_readw(host, SDHCI_SLOT_INT_STATUS)); SDHCI_DUMP("Caps: 0x%08x | Caps_1: 0x%08x\n", sdhci_readl(host, SDHCI_CAPABILITIES), diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 1d7d61e25dbf..860b2c729e68 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -144,7 +144,7 @@ #define SDHCI_INT_DATA_CRC 0x00200000 #define SDHCI_INT_DATA_END_BIT 0x00400000 #define SDHCI_INT_BUS_POWER 0x00800000 -#define SDHCI_INT_ACMD12ERR 0x01000000 +#define SDHCI_INT_AUTO_CMD_ERR 0x01000000 #define SDHCI_INT_ADMA_ERROR 0x02000000 #define SDHCI_INT_NORMAL_MASK 0x00007FFF @@ -166,7 +166,7 @@ #define SDHCI_CQE_INT_MASK (SDHCI_CQE_INT_ERR_MASK | SDHCI_INT_CQE) -#define SDHCI_ACMD12_ERR 0x3C +#define SDHCI_AUTO_CMD_STATUS 0x3C #define SDHCI_HOST_CONTROL2 0x3E #define SDHCI_CTRL_UHS_MASK 0x0007 From c728ffc5150c0e7b6c895f281a3d37c1bef52a10 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Thu, 15 Nov 2018 15:53:43 +0200 Subject: [PATCH 51/73] mmc: sdhci: Handle auto-command errors [ Upstream commit af849c86109d79222e549826068bbf4e7f9a2472 ] If the host controller supports auto-commands then enable the auto-command error interrupt and handle it. In the case of auto-CMD23, the error is treated the same as manual CMD23 error. In the case of auto-CMD12, commands-during-transfer are not permitted, so the error handling is treated the same as a data error. Signed-off-by: Adrian Hunter Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin --- drivers/mmc/host/sdhci.c | 35 +++++++++++++++++++++++++++++++++++ drivers/mmc/host/sdhci.h | 7 ++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 8abbe40a9a05..9540fda7fc6b 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -790,6 +790,11 @@ static void sdhci_set_transfer_irqs(struct sdhci_host *host) else host->ier = (host->ier & ~dma_irqs) | pio_irqs; + if (host->flags & (SDHCI_AUTO_CMD23 | SDHCI_AUTO_CMD12)) + host->ier |= SDHCI_INT_AUTO_CMD_ERR; + else + host->ier &= ~SDHCI_INT_AUTO_CMD_ERR; + sdhci_writel(host, host->ier, SDHCI_INT_ENABLE); sdhci_writel(host, host->ier, SDHCI_SIGNAL_ENABLE); } @@ -2560,6 +2565,21 @@ static void sdhci_timeout_data_timer(unsigned long data) static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) { + /* Handle auto-CMD12 error */ + if (intmask & SDHCI_INT_AUTO_CMD_ERR && host->data_cmd) { + struct mmc_request *mrq = host->data_cmd->mrq; + u16 auto_cmd_status = sdhci_readw(host, SDHCI_AUTO_CMD_STATUS); + int data_err_bit = (auto_cmd_status & SDHCI_AUTO_CMD_TIMEOUT) ? + SDHCI_INT_DATA_TIMEOUT : + SDHCI_INT_DATA_CRC; + + /* Treat auto-CMD12 error the same as data error */ + if (!mrq->sbc && (host->flags & SDHCI_AUTO_CMD12)) { + *intmask_p |= data_err_bit; + return; + } + } + if (!host->cmd) { /* * SDHCI recovers from errors by resetting the cmd and data @@ -2594,6 +2614,21 @@ static void sdhci_cmd_irq(struct sdhci_host *host, u32 intmask, u32 *intmask_p) return; } + /* Handle auto-CMD23 error */ + if (intmask & SDHCI_INT_AUTO_CMD_ERR) { + struct mmc_request *mrq = host->cmd->mrq; + u16 auto_cmd_status = sdhci_readw(host, SDHCI_AUTO_CMD_STATUS); + int err = (auto_cmd_status & SDHCI_AUTO_CMD_TIMEOUT) ? + -ETIMEDOUT : + -EILSEQ; + + if (mrq->sbc && (host->flags & SDHCI_AUTO_CMD23)) { + mrq->sbc->error = err; + sdhci_finish_mrq(host, mrq); + return; + } + } + if (intmask & SDHCI_INT_RESPONSE) sdhci_finish_command(host); } diff --git a/drivers/mmc/host/sdhci.h b/drivers/mmc/host/sdhci.h index 860b2c729e68..c0d5458c36d4 100644 --- a/drivers/mmc/host/sdhci.h +++ b/drivers/mmc/host/sdhci.h @@ -151,7 +151,8 @@ #define SDHCI_INT_ERROR_MASK 0xFFFF8000 #define SDHCI_INT_CMD_MASK (SDHCI_INT_RESPONSE | SDHCI_INT_TIMEOUT | \ - SDHCI_INT_CRC | SDHCI_INT_END_BIT | SDHCI_INT_INDEX) + SDHCI_INT_CRC | SDHCI_INT_END_BIT | SDHCI_INT_INDEX | \ + SDHCI_INT_AUTO_CMD_ERR) #define SDHCI_INT_DATA_MASK (SDHCI_INT_DATA_END | SDHCI_INT_DMA_END | \ SDHCI_INT_DATA_AVAIL | SDHCI_INT_SPACE_AVAIL | \ SDHCI_INT_DATA_TIMEOUT | SDHCI_INT_DATA_CRC | \ @@ -167,6 +168,10 @@ #define SDHCI_CQE_INT_MASK (SDHCI_CQE_INT_ERR_MASK | SDHCI_INT_CQE) #define SDHCI_AUTO_CMD_STATUS 0x3C +#define SDHCI_AUTO_CMD_TIMEOUT 0x00000002 +#define SDHCI_AUTO_CMD_CRC 0x00000004 +#define SDHCI_AUTO_CMD_END_BIT 0x00000008 +#define SDHCI_AUTO_CMD_INDEX 0x00000010 #define SDHCI_HOST_CONTROL2 0x3E #define SDHCI_CTRL_UHS_MASK 0x0007 From 1a7fe5cb7a04ebc00cc282bb227bb8b78f52138b Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 22 Nov 2018 13:28:41 +0900 Subject: [PATCH 52/73] modpost: file2alias: go back to simple devtable lookup commit ec91e78d378cc5d4b43805a1227d8e04e5dfa17d upstream. Commit e49ce14150c6 ("modpost: use linker section to generate table.") was not so cool as we had expected first; it ended up with ugly section hacks when commit dd2a3acaecd7 ("mod/file2alias: make modpost compile on darwin again") came in. Given a certain degree of unknowledge about the link stage of host programs, I really want to see simple, stupid table lookup so that this works in the same way regardless of the underlying executable format. Signed-off-by: Masahiro Yamada Acked-by: Mathieu Malaterre [nc: Omit rpmsg, sdw, tbsvc, and typec as they do not exist here] Signed-off-by: Nathan Chancellor Signed-off-by: Sasha Levin --- scripts/mod/file2alias.c | 136 +++++++++++++-------------------------- 1 file changed, 45 insertions(+), 91 deletions(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 29d6699d5a06..6e3ae94cf123 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -50,46 +50,6 @@ struct devtable { void *function; }; -#define ___cat(a,b) a ## b -#define __cat(a,b) ___cat(a,b) - -/* we need some special handling for this host tool running eventually on - * Darwin. The Mach-O section handling is a bit different than ELF section - * handling. The differnces in detail are: - * a) we have segments which have sections - * b) we need a API call to get the respective section symbols */ -#if defined(__MACH__) -#include - -#define INIT_SECTION(name) do { \ - unsigned long name ## _len; \ - char *__cat(pstart_,name) = getsectdata("__TEXT", \ - #name, &__cat(name,_len)); \ - char *__cat(pstop_,name) = __cat(pstart_,name) + \ - __cat(name, _len); \ - __cat(__start_,name) = (void *)__cat(pstart_,name); \ - __cat(__stop_,name) = (void *)__cat(pstop_,name); \ - } while (0) -#define SECTION(name) __attribute__((section("__TEXT, " #name))) - -struct devtable **__start___devtable, **__stop___devtable; -#else -#define INIT_SECTION(name) /* no-op for ELF */ -#define SECTION(name) __attribute__((section(#name))) - -/* We construct a table of pointers in an ELF section (pointers generally - * go unpadded by gcc). ld creates boundary syms for us. */ -extern struct devtable *__start___devtable[], *__stop___devtable[]; -#endif /* __MACH__ */ - -#if !defined(__used) -# if __GNUC__ == 3 && __GNUC_MINOR__ < 3 -# define __used __attribute__((__unused__)) -# else -# define __used __attribute__((__used__)) -# endif -#endif - /* Define a variable f that holds the value of field f of struct devid * based at address m. */ @@ -102,16 +62,6 @@ extern struct devtable *__start___devtable[], *__stop___devtable[]; #define DEF_FIELD_ADDR(m, devid, f) \ typeof(((struct devid *)0)->f) *f = ((m) + OFF_##devid##_##f) -/* Add a table entry. We test function type matches while we're here. */ -#define ADD_TO_DEVTABLE(device_id, type, function) \ - static struct devtable __cat(devtable,__LINE__) = { \ - device_id + 0*sizeof((function)((const char *)NULL, \ - (void *)NULL, \ - (char *)NULL)), \ - SIZE_##type, (function) }; \ - static struct devtable *SECTION(__devtable) __used \ - __cat(devtable_ptr,__LINE__) = &__cat(devtable,__LINE__) - #define ADD(str, sep, cond, field) \ do { \ strcat(str, sep); \ @@ -431,7 +381,6 @@ static int do_hid_entry(const char *filename, return 1; } -ADD_TO_DEVTABLE("hid", hid_device_id, do_hid_entry); /* Looks like: ieee1394:venNmoNspNverN */ static int do_ieee1394_entry(const char *filename, @@ -456,7 +405,6 @@ static int do_ieee1394_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("ieee1394", ieee1394_device_id, do_ieee1394_entry); /* Looks like: pci:vNdNsvNsdNbcNscNiN. */ static int do_pci_entry(const char *filename, @@ -500,7 +448,6 @@ static int do_pci_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("pci", pci_device_id, do_pci_entry); /* looks like: "ccw:tNmNdtNdmN" */ static int do_ccw_entry(const char *filename, @@ -524,7 +471,6 @@ static int do_ccw_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("ccw", ccw_device_id, do_ccw_entry); /* looks like: "ap:tN" */ static int do_ap_entry(const char *filename, @@ -535,7 +481,6 @@ static int do_ap_entry(const char *filename, sprintf(alias, "ap:t%02X*", dev_type); return 1; } -ADD_TO_DEVTABLE("ap", ap_device_id, do_ap_entry); /* looks like: "css:tN" */ static int do_css_entry(const char *filename, @@ -546,7 +491,6 @@ static int do_css_entry(const char *filename, sprintf(alias, "css:t%01X", type); return 1; } -ADD_TO_DEVTABLE("css", css_device_id, do_css_entry); /* Looks like: "serio:tyNprNidNexN" */ static int do_serio_entry(const char *filename, @@ -566,7 +510,6 @@ static int do_serio_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("serio", serio_device_id, do_serio_entry); /* looks like: "acpi:ACPI0003" or "acpi:PNP0C0B" or "acpi:LNXVIDEO" or * "acpi:bbsspp" (bb=base-class, ss=sub-class, pp=prog-if) @@ -604,7 +547,6 @@ static int do_acpi_entry(const char *filename, } return 1; } -ADD_TO_DEVTABLE("acpi", acpi_device_id, do_acpi_entry); /* looks like: "pnp:dD" */ static void do_pnp_device_entry(void *symval, unsigned long size, @@ -725,7 +667,6 @@ static int do_pcmcia_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("pcmcia", pcmcia_device_id, do_pcmcia_entry); static int do_vio_entry(const char *filename, void *symval, char *alias) @@ -745,7 +686,6 @@ static int do_vio_entry(const char *filename, void *symval, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("vio", vio_device_id, do_vio_entry); #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0])) @@ -818,7 +758,6 @@ static int do_input_entry(const char *filename, void *symval, do_input(alias, *swbit, 0, INPUT_DEVICE_ID_SW_MAX); return 1; } -ADD_TO_DEVTABLE("input", input_device_id, do_input_entry); static int do_eisa_entry(const char *filename, void *symval, char *alias) @@ -830,7 +769,6 @@ static int do_eisa_entry(const char *filename, void *symval, strcat(alias, "*"); return 1; } -ADD_TO_DEVTABLE("eisa", eisa_device_id, do_eisa_entry); /* Looks like: parisc:tNhvNrevNsvN */ static int do_parisc_entry(const char *filename, void *symval, @@ -850,7 +788,6 @@ static int do_parisc_entry(const char *filename, void *symval, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("parisc", parisc_device_id, do_parisc_entry); /* Looks like: sdio:cNvNdN. */ static int do_sdio_entry(const char *filename, @@ -867,7 +804,6 @@ static int do_sdio_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("sdio", sdio_device_id, do_sdio_entry); /* Looks like: ssb:vNidNrevN. */ static int do_ssb_entry(const char *filename, @@ -884,7 +820,6 @@ static int do_ssb_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("ssb", ssb_device_id, do_ssb_entry); /* Looks like: bcma:mNidNrevNclN. */ static int do_bcma_entry(const char *filename, @@ -903,7 +838,6 @@ static int do_bcma_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("bcma", bcma_device_id, do_bcma_entry); /* Looks like: virtio:dNvN */ static int do_virtio_entry(const char *filename, void *symval, @@ -919,7 +853,6 @@ static int do_virtio_entry(const char *filename, void *symval, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("virtio", virtio_device_id, do_virtio_entry); /* * Looks like: vmbus:guid @@ -942,7 +875,6 @@ static int do_vmbus_entry(const char *filename, void *symval, return 1; } -ADD_TO_DEVTABLE("vmbus", hv_vmbus_device_id, do_vmbus_entry); /* Looks like: i2c:S */ static int do_i2c_entry(const char *filename, void *symval, @@ -953,7 +885,6 @@ static int do_i2c_entry(const char *filename, void *symval, return 1; } -ADD_TO_DEVTABLE("i2c", i2c_device_id, do_i2c_entry); /* Looks like: spi:S */ static int do_spi_entry(const char *filename, void *symval, @@ -964,7 +895,6 @@ static int do_spi_entry(const char *filename, void *symval, return 1; } -ADD_TO_DEVTABLE("spi", spi_device_id, do_spi_entry); static const struct dmifield { const char *prefix; @@ -1019,7 +949,6 @@ static int do_dmi_entry(const char *filename, void *symval, strcat(alias, ":"); return 1; } -ADD_TO_DEVTABLE("dmi", dmi_system_id, do_dmi_entry); static int do_platform_entry(const char *filename, void *symval, char *alias) @@ -1028,7 +957,6 @@ static int do_platform_entry(const char *filename, sprintf(alias, PLATFORM_MODULE_PREFIX "%s", *name); return 1; } -ADD_TO_DEVTABLE("platform", platform_device_id, do_platform_entry); static int do_mdio_entry(const char *filename, void *symval, char *alias) @@ -1053,7 +981,6 @@ static int do_mdio_entry(const char *filename, return 1; } -ADD_TO_DEVTABLE("mdio", mdio_device_id, do_mdio_entry); /* Looks like: zorro:iN. */ static int do_zorro_entry(const char *filename, void *symval, @@ -1064,7 +991,6 @@ static int do_zorro_entry(const char *filename, void *symval, ADD(alias, "i", id != ZORRO_WILDCARD, id); return 1; } -ADD_TO_DEVTABLE("zorro", zorro_device_id, do_zorro_entry); /* looks like: "pnp:dD" */ static int do_isapnp_entry(const char *filename, @@ -1080,7 +1006,6 @@ static int do_isapnp_entry(const char *filename, (function >> 12) & 0x0f, (function >> 8) & 0x0f); return 1; } -ADD_TO_DEVTABLE("isapnp", isapnp_device_id, do_isapnp_entry); /* Looks like: "ipack:fNvNdN". */ static int do_ipack_entry(const char *filename, @@ -1096,7 +1021,6 @@ static int do_ipack_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("ipack", ipack_device_id, do_ipack_entry); /* * Append a match expression for a single masked hex digit. @@ -1167,7 +1091,6 @@ static int do_amba_entry(const char *filename, return 1; } -ADD_TO_DEVTABLE("amba", amba_id, do_amba_entry); /* * looks like: "mipscdmm:tN" @@ -1183,7 +1106,6 @@ static int do_mips_cdmm_entry(const char *filename, sprintf(alias, "mipscdmm:t%02X*", type); return 1; } -ADD_TO_DEVTABLE("mipscdmm", mips_cdmm_device_id, do_mips_cdmm_entry); /* LOOKS like cpu:type:x86,venVVVVfamFFFFmodMMMM:feature:*,FEAT,* * All fields are numbers. It would be nicer to use strings for vendor @@ -1208,7 +1130,6 @@ static int do_x86cpu_entry(const char *filename, void *symval, sprintf(alias + strlen(alias), "%04X*", feature); return 1; } -ADD_TO_DEVTABLE("x86cpu", x86_cpu_id, do_x86cpu_entry); /* LOOKS like cpu:type:*:feature:*FEAT* */ static int do_cpu_entry(const char *filename, void *symval, char *alias) @@ -1218,7 +1139,6 @@ static int do_cpu_entry(const char *filename, void *symval, char *alias) sprintf(alias, "cpu:type:*:feature:*%04X*", feature); return 1; } -ADD_TO_DEVTABLE("cpu", cpu_feature, do_cpu_entry); /* Looks like: mei:S:uuid:N:* */ static int do_mei_entry(const char *filename, void *symval, @@ -1237,7 +1157,6 @@ static int do_mei_entry(const char *filename, void *symval, return 1; } -ADD_TO_DEVTABLE("mei", mei_cl_device_id, do_mei_entry); /* Looks like: rapidio:vNdNavNadN */ static int do_rio_entry(const char *filename, @@ -1257,7 +1176,6 @@ static int do_rio_entry(const char *filename, add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("rapidio", rio_device_id, do_rio_entry); /* Looks like: ulpi:vNpN */ static int do_ulpi_entry(const char *filename, void *symval, @@ -1270,7 +1188,6 @@ static int do_ulpi_entry(const char *filename, void *symval, return 1; } -ADD_TO_DEVTABLE("ulpi", ulpi_device_id, do_ulpi_entry); /* Looks like: hdaudio:vNrNaN */ static int do_hda_entry(const char *filename, void *symval, char *alias) @@ -1287,7 +1204,6 @@ static int do_hda_entry(const char *filename, void *symval, char *alias) add_wildcard(alias); return 1; } -ADD_TO_DEVTABLE("hdaudio", hda_device_id, do_hda_entry); /* Looks like: fsl-mc:vNdN */ static int do_fsl_mc_entry(const char *filename, void *symval, @@ -1299,7 +1215,6 @@ static int do_fsl_mc_entry(const char *filename, void *symval, sprintf(alias, "fsl-mc:v%08Xd%s", vendor, *obj_type); return 1; } -ADD_TO_DEVTABLE("fslmc", fsl_mc_device_id, do_fsl_mc_entry); /* Does namelen bytes of name exactly match the symbol? */ static bool sym_is(const char *name, unsigned namelen, const char *symbol) @@ -1332,6 +1247,44 @@ static void do_table(void *symval, unsigned long size, } } +static const struct devtable devtable[] = { + {"hid", SIZE_hid_device_id, do_hid_entry}, + {"ieee1394", SIZE_ieee1394_device_id, do_ieee1394_entry}, + {"pci", SIZE_pci_device_id, do_pci_entry}, + {"ccw", SIZE_ccw_device_id, do_ccw_entry}, + {"ap", SIZE_ap_device_id, do_ap_entry}, + {"css", SIZE_css_device_id, do_css_entry}, + {"serio", SIZE_serio_device_id, do_serio_entry}, + {"acpi", SIZE_acpi_device_id, do_acpi_entry}, + {"pcmcia", SIZE_pcmcia_device_id, do_pcmcia_entry}, + {"vio", SIZE_vio_device_id, do_vio_entry}, + {"input", SIZE_input_device_id, do_input_entry}, + {"eisa", SIZE_eisa_device_id, do_eisa_entry}, + {"parisc", SIZE_parisc_device_id, do_parisc_entry}, + {"sdio", SIZE_sdio_device_id, do_sdio_entry}, + {"ssb", SIZE_ssb_device_id, do_ssb_entry}, + {"bcma", SIZE_bcma_device_id, do_bcma_entry}, + {"virtio", SIZE_virtio_device_id, do_virtio_entry}, + {"vmbus", SIZE_hv_vmbus_device_id, do_vmbus_entry}, + {"i2c", SIZE_i2c_device_id, do_i2c_entry}, + {"spi", SIZE_spi_device_id, do_spi_entry}, + {"dmi", SIZE_dmi_system_id, do_dmi_entry}, + {"platform", SIZE_platform_device_id, do_platform_entry}, + {"mdio", SIZE_mdio_device_id, do_mdio_entry}, + {"zorro", SIZE_zorro_device_id, do_zorro_entry}, + {"isapnp", SIZE_isapnp_device_id, do_isapnp_entry}, + {"ipack", SIZE_ipack_device_id, do_ipack_entry}, + {"amba", SIZE_amba_id, do_amba_entry}, + {"mipscdmm", SIZE_mips_cdmm_device_id, do_mips_cdmm_entry}, + {"x86cpu", SIZE_x86_cpu_id, do_x86cpu_entry}, + {"cpu", SIZE_cpu_feature, do_cpu_entry}, + {"mei", SIZE_mei_cl_device_id, do_mei_entry}, + {"rapidio", SIZE_rio_device_id, do_rio_entry}, + {"ulpi", SIZE_ulpi_device_id, do_ulpi_entry}, + {"hdaudio", SIZE_hda_device_id, do_hda_entry}, + {"fslmc", SIZE_fsl_mc_device_id, do_fsl_mc_entry}, +}; + /* Create MODULE_ALIAS() statements. * At this time, we cannot write the actual output C source yet, * so we write into the mod->dev_table_buf buffer. */ @@ -1386,13 +1339,14 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, else if (sym_is(name, namelen, "pnp_card")) do_pnp_card_entries(symval, sym->st_size, mod); else { - struct devtable **p; - INIT_SECTION(__devtable); + int i; - for (p = __start___devtable; p < __stop___devtable; p++) { - if (sym_is(name, namelen, (*p)->device_id)) { - do_table(symval, sym->st_size, (*p)->id_size, - (*p)->device_id, (*p)->function, mod); + for (i = 0; i < ARRAY_SIZE(devtable); i++) { + const struct devtable *p = &devtable[i]; + + if (sym_is(name, namelen, p->device_id)) { + do_table(symval, sym->st_size, p->id_size, + p->device_id, p->function, mod); break; } } From 10bd1c7ad3894bb9bf2a6d2f3669b5b674f1e59e Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 22 Nov 2018 13:28:42 +0900 Subject: [PATCH 53/73] modpost: file2alias: check prototype of handler commit f880eea68fe593342fa6e09be9bb661f3c297aec upstream. Use specific prototype instead of an opaque pointer so that the compiler can catch function prototype mismatch. Signed-off-by: Masahiro Yamada Reviewed-by: Mathieu Malaterre Signed-off-by: Nathan Chancellor Signed-off-by: Sasha Levin --- scripts/mod/file2alias.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/scripts/mod/file2alias.c b/scripts/mod/file2alias.c index 6e3ae94cf123..55b4c0dc2b93 100644 --- a/scripts/mod/file2alias.c +++ b/scripts/mod/file2alias.c @@ -47,7 +47,7 @@ typedef struct { struct devtable { const char *device_id; /* name of table, __mod___*_device_table. */ unsigned long id_size; - void *function; + int (*do_entry)(const char *filename, void *symval, char *alias); }; /* Define a variable f that holds the value of field f of struct devid @@ -1228,12 +1228,11 @@ static bool sym_is(const char *name, unsigned namelen, const char *symbol) static void do_table(void *symval, unsigned long size, unsigned long id_size, const char *device_id, - void *function, + int (*do_entry)(const char *filename, void *symval, char *alias), struct module *mod) { unsigned int i; char alias[500]; - int (*do_entry)(const char *, void *entry, char *alias) = function; device_id_check(mod->name, device_id, size, id_size, symval); /* Leave last one: it's the terminator. */ @@ -1346,7 +1345,7 @@ void handle_moddevtable(struct module *mod, struct elf_info *info, if (sym_is(name, namelen, p->device_id)) { do_table(symval, sym->st_size, p->id_size, - p->device_id, p->function, mod); + p->device_id, p->do_entry, mod); break; } } From 00a22235025c6d12ee5d67cf50698ea93181b0b6 Mon Sep 17 00:00:00 2001 From: Jarkko Sakkinen Date: Fri, 8 Feb 2019 18:30:59 +0200 Subject: [PATCH 54/73] tpm/tpm_i2c_atmel: Return -E2BIG when the transfer is incomplete [ Upstream commit 442601e87a4769a8daba4976ec3afa5222ca211d ] Return -E2BIG when the transfer is incomplete. The upper layer does not retry, so not doing that is incorrect behaviour. Cc: stable@vger.kernel.org Fixes: a2871c62e186 ("tpm: Add support for Atmel I2C TPMs") Signed-off-by: Jarkko Sakkinen Reviewed-by: Stefan Berger Reviewed-by: Jerry Snitselaar Signed-off-by: Sasha Levin --- drivers/char/tpm/tpm_i2c_atmel.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/char/tpm/tpm_i2c_atmel.c b/drivers/char/tpm/tpm_i2c_atmel.c index 32a8e27c5382..cc4e642d3180 100644 --- a/drivers/char/tpm/tpm_i2c_atmel.c +++ b/drivers/char/tpm/tpm_i2c_atmel.c @@ -69,6 +69,10 @@ static int i2c_atmel_send(struct tpm_chip *chip, u8 *buf, size_t len) if (status < 0) return status; + /* The upper layer does not support incomplete sends. */ + if (status != len) + return -E2BIG; + return 0; } From a885a0ff776680ab6cb79cf891a81da9a2c18371 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 23 Apr 2019 10:48:21 -0700 Subject: [PATCH 55/73] ipv6: frags: fix a lockdep false positive [ Upstream commit 415787d7799f4fccbe8d49cb0b8e5811be6b0389 ] lockdep does not know that the locks used by IPv4 defrag and IPv6 reassembly units are of different classes. It complains because of following chains : 1) sch_direct_xmit() (lock txq->_xmit_lock) dev_hard_start_xmit() xmit_one() dev_queue_xmit_nit() packet_rcv_fanout() ip_check_defrag() ip_defrag() spin_lock() (lock frag queue spinlock) 2) ip6_input_finish() ipv6_frag_rcv() (lock frag queue spinlock) ip6_frag_queue() icmpv6_param_prob() (lock txq->_xmit_lock at some point) We could add lockdep annotations, but we also can make sure IPv6 calls icmpv6_param_prob() only after the release of the frag queue spinlock, since this naturally makes frag queue spinlock a leaf in lock hierarchy. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/reassembly.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 2a8c680b67cd..f75e9e711c31 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -170,7 +170,8 @@ fq_find(struct net *net, __be32 id, const struct ipv6hdr *hdr, int iif) } static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, - struct frag_hdr *fhdr, int nhoff) + struct frag_hdr *fhdr, int nhoff, + u32 *prob_offset) { struct sk_buff *prev, *next; struct net_device *dev; @@ -186,11 +187,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - ((u8 *)&fhdr->frag_off - - skb_network_header(skb))); + *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); return -1; } @@ -221,10 +218,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, /* RFC2460 says always send parameter problem in * this case. -DaveM */ - __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), - IPSTATS_MIB_INHDRERRORS); - icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, - offsetof(struct ipv6hdr, payload_len)); + *prob_offset = offsetof(struct ipv6hdr, payload_len); return -1; } if (end > fq->q.len) { @@ -536,15 +530,22 @@ static int ipv6_frag_rcv(struct sk_buff *skb) iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { + u32 prob_offset = 0; int ret; spin_lock(&fq->q.lock); fq->iif = iif; - ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff); + ret = ip6_frag_queue(fq, skb, fhdr, IP6CB(skb)->nhoff, + &prob_offset); spin_unlock(&fq->q.lock); inet_frag_put(&fq->q); + if (prob_offset) { + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_INHDRERRORS); + icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); + } return ret; } From ccfa73daf762f3adac3f6c0e2f09c3c74548775f Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Tue, 23 Apr 2019 10:48:22 -0700 Subject: [PATCH 56/73] net: IP defrag: encapsulate rbtree defrag code into callable functions [ Upstream commit c23f35d19db3b36ffb9e04b08f1d91565d15f84f ] This is a refactoring patch: without changing runtime behavior, it moves rbtree-related code from IPv4-specific files/functions into .h/.c defrag files shared with IPv6 defragmentation code. v2: make handling of overlapping packets match upstream. Signed-off-by: Peter Oskolkov Cc: Eric Dumazet Cc: Florian Westphal Cc: Tom Herbert Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/inet_frag.h | 16 ++- net/ipv4/inet_fragment.c | 293 +++++++++++++++++++++++++++++++++++++ net/ipv4/ip_fragment.c | 302 +++++---------------------------------- 3 files changed, 342 insertions(+), 269 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 335cf7851f12..008f64823c41 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -77,8 +77,8 @@ struct inet_frag_queue { struct timer_list timer; spinlock_t lock; refcount_t refcnt; - struct sk_buff *fragments; /* Used in IPv6. */ - struct rb_root rb_fragments; /* Used in IPv4. */ + struct sk_buff *fragments; /* used in 6lopwpan IPv6. */ + struct rb_root rb_fragments; /* Used in IPv4/IPv6. */ struct sk_buff *fragments_tail; struct sk_buff *last_run_head; ktime_t stamp; @@ -153,4 +153,16 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, long val) extern const u8 ip_frag_ecn_table[16]; +/* Return values of inet_frag_queue_insert() */ +#define IPFRAG_OK 0 +#define IPFRAG_DUP 1 +#define IPFRAG_OVERLAP 2 +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end); +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent); +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data); +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q); + #endif diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index 6ffee9d2b0e5..481cded81b2d 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -24,6 +24,62 @@ #include #include #include +#include +#include + +/* Use skb->cb to track consecutive/adjacent fragments coming at + * the end of the queue. Nodes in the rb-tree queue will + * contain "runs" of one or more adjacent fragments. + * + * Invariants: + * - next_frag is NULL at the tail of a "run"; + * - the head of a "run" has the sum of all fragment lengths in frag_run_len. + */ +struct ipfrag_skb_cb { + union { + struct inet_skb_parm h4; + struct inet6_skb_parm h6; + }; + struct sk_buff *next_frag; + int frag_run_len; +}; + +#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) + +static void fragcb_clear(struct sk_buff *skb) +{ + RB_CLEAR_NODE(&skb->rbnode); + FRAG_CB(skb)->next_frag = NULL; + FRAG_CB(skb)->frag_run_len = skb->len; +} + +/* Append skb to the last "run". */ +static void fragrun_append_to_last(struct inet_frag_queue *q, + struct sk_buff *skb) +{ + fragcb_clear(skb); + + FRAG_CB(q->last_run_head)->frag_run_len += skb->len; + FRAG_CB(q->fragments_tail)->next_frag = skb; + q->fragments_tail = skb; +} + +/* Create a new "run" with the skb. */ +static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); + fragcb_clear(skb); + + if (q->last_run_head) + rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, + &q->last_run_head->rbnode.rb_right); + else + rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + + q->fragments_tail = skb; + q->last_run_head = skb; +} /* Given the OR values of all fragments, apply RFC 3168 5.3 requirements * Value : 0xff if frame should be dropped. @@ -122,6 +178,28 @@ static void inet_frag_destroy_rcu(struct rcu_head *head) kmem_cache_free(f->frags_cachep, q); } +unsigned int inet_frag_rbtree_purge(struct rb_root *root) +{ + struct rb_node *p = rb_first(root); + unsigned int sum = 0; + + while (p) { + struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); + + p = rb_next(p); + rb_erase(&skb->rbnode, root); + while (skb) { + struct sk_buff *next = FRAG_CB(skb)->next_frag; + + sum += skb->truesize; + kfree_skb(skb); + skb = next; + } + } + return sum; +} +EXPORT_SYMBOL(inet_frag_rbtree_purge); + void inet_frag_destroy(struct inet_frag_queue *q) { struct sk_buff *fp; @@ -224,3 +302,218 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key) return fq; } EXPORT_SYMBOL(inet_frag_find); + +int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb, + int offset, int end) +{ + struct sk_buff *last = q->fragments_tail; + + /* RFC5722, Section 4, amended by Errata ID : 3089 + * When reassembling an IPv6 datagram, if + * one or more its constituent fragments is determined to be an + * overlapping fragment, the entire datagram (and any constituent + * fragments) MUST be silently discarded. + * + * Duplicates, however, should be ignored (i.e. skb dropped, but the + * queue/fragments kept for later reassembly). + */ + if (!last) + fragrun_create(q, skb); /* First fragment. */ + else if (last->ip_defrag_offset + last->len < end) { + /* This is the common case: skb goes to the end. */ + /* Detect and discard overlaps. */ + if (offset < last->ip_defrag_offset + last->len) + return IPFRAG_OVERLAP; + if (offset == last->ip_defrag_offset + last->len) + fragrun_append_to_last(q, skb); + else + fragrun_create(q, skb); + } else { + /* Binary search. Note that skb can become the first fragment, + * but not the last (covered above). + */ + struct rb_node **rbn, *parent; + + rbn = &q->rb_fragments.rb_node; + do { + struct sk_buff *curr; + int curr_run_end; + + parent = *rbn; + curr = rb_to_skb(parent); + curr_run_end = curr->ip_defrag_offset + + FRAG_CB(curr)->frag_run_len; + if (end <= curr->ip_defrag_offset) + rbn = &parent->rb_left; + else if (offset >= curr_run_end) + rbn = &parent->rb_right; + else if (offset >= curr->ip_defrag_offset && + end <= curr_run_end) + return IPFRAG_DUP; + else + return IPFRAG_OVERLAP; + } while (*rbn); + /* Here we have parent properly set, and rbn pointing to + * one of its NULL left/right children. Insert skb. + */ + fragcb_clear(skb); + rb_link_node(&skb->rbnode, parent, rbn); + rb_insert_color(&skb->rbnode, &q->rb_fragments); + } + + skb->ip_defrag_offset = offset; + + return IPFRAG_OK; +} +EXPORT_SYMBOL(inet_frag_queue_insert); + +void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb, + struct sk_buff *parent) +{ + struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments); + struct sk_buff **nextp; + int delta; + + if (head != skb) { + fp = skb_clone(skb, GFP_ATOMIC); + if (!fp) + return NULL; + FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; + if (RB_EMPTY_NODE(&skb->rbnode)) + FRAG_CB(parent)->next_frag = fp; + else + rb_replace_node(&skb->rbnode, &fp->rbnode, + &q->rb_fragments); + if (q->fragments_tail == skb) + q->fragments_tail = fp; + skb_morph(skb, head); + FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + consume_skb(head); + head = skb; + } + WARN_ON(head->ip_defrag_offset != 0); + + delta = -head->truesize; + + /* Head of list must not be cloned. */ + if (skb_unclone(head, GFP_ATOMIC)) + return NULL; + + delta += head->truesize; + if (delta) + add_frag_mem_limit(q->net, delta); + + /* If the first fragment is fragmented itself, we split + * it to two chunks: the first with data and paged part + * and the second, holding only fragments. + */ + if (skb_has_frag_list(head)) { + struct sk_buff *clone; + int i, plen = 0; + + clone = alloc_skb(0, GFP_ATOMIC); + if (!clone) + return NULL; + skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; + skb_frag_list_init(head); + for (i = 0; i < skb_shinfo(head)->nr_frags; i++) + plen += skb_frag_size(&skb_shinfo(head)->frags[i]); + clone->data_len = head->data_len - plen; + clone->len = clone->data_len; + head->truesize += clone->truesize; + clone->csum = 0; + clone->ip_summed = head->ip_summed; + add_frag_mem_limit(q->net, clone->truesize); + skb_shinfo(head)->frag_list = clone; + nextp = &clone->next; + } else { + nextp = &skb_shinfo(head)->frag_list; + } + + return nextp; +} +EXPORT_SYMBOL(inet_frag_reasm_prepare); + +void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, + void *reasm_data) +{ + struct sk_buff **nextp = (struct sk_buff **)reasm_data; + struct rb_node *rbn; + struct sk_buff *fp; + + skb_push(head, head->data - skb_network_header(head)); + + /* Traverse the tree in order, to build frag_list. */ + fp = FRAG_CB(head)->next_frag; + rbn = rb_next(&head->rbnode); + rb_erase(&head->rbnode, &q->rb_fragments); + while (rbn || fp) { + /* fp points to the next sk_buff in the current run; + * rbn points to the next run. + */ + /* Go through the current run. */ + while (fp) { + *nextp = fp; + nextp = &fp->next; + fp->prev = NULL; + memset(&fp->rbnode, 0, sizeof(fp->rbnode)); + fp->sk = NULL; + head->data_len += fp->len; + head->len += fp->len; + if (head->ip_summed != fp->ip_summed) + head->ip_summed = CHECKSUM_NONE; + else if (head->ip_summed == CHECKSUM_COMPLETE) + head->csum = csum_add(head->csum, fp->csum); + head->truesize += fp->truesize; + fp = FRAG_CB(fp)->next_frag; + } + /* Move to the next run. */ + if (rbn) { + struct rb_node *rbnext = rb_next(rbn); + + fp = rb_to_skb(rbn); + rb_erase(rbn, &q->rb_fragments); + rbn = rbnext; + } + } + sub_frag_mem_limit(q->net, head->truesize); + + *nextp = NULL; + head->next = NULL; + head->prev = NULL; + head->tstamp = q->stamp; +} +EXPORT_SYMBOL(inet_frag_reasm_finish); + +struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q) +{ + struct sk_buff *head; + + if (q->fragments) { + head = q->fragments; + q->fragments = head->next; + } else { + struct sk_buff *skb; + + head = skb_rb_first(&q->rb_fragments); + if (!head) + return NULL; + skb = FRAG_CB(head)->next_frag; + if (skb) + rb_replace_node(&head->rbnode, &skb->rbnode, + &q->rb_fragments); + else + rb_erase(&head->rbnode, &q->rb_fragments); + memset(&head->rbnode, 0, sizeof(head->rbnode)); + barrier(); + } + if (head == q->fragments_tail) + q->fragments_tail = NULL; + + sub_frag_mem_limit(q->net, head->truesize); + + return head; +} +EXPORT_SYMBOL(inet_frag_pull_head); diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index d95b32af4a0e..5a1d39e32196 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -57,57 +57,6 @@ */ static const char ip_frag_cache_name[] = "ip4-frags"; -/* Use skb->cb to track consecutive/adjacent fragments coming at - * the end of the queue. Nodes in the rb-tree queue will - * contain "runs" of one or more adjacent fragments. - * - * Invariants: - * - next_frag is NULL at the tail of a "run"; - * - the head of a "run" has the sum of all fragment lengths in frag_run_len. - */ -struct ipfrag_skb_cb { - struct inet_skb_parm h; - struct sk_buff *next_frag; - int frag_run_len; -}; - -#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb)) - -static void ip4_frag_init_run(struct sk_buff *skb) -{ - BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb)); - - FRAG_CB(skb)->next_frag = NULL; - FRAG_CB(skb)->frag_run_len = skb->len; -} - -/* Append skb to the last "run". */ -static void ip4_frag_append_to_last_run(struct inet_frag_queue *q, - struct sk_buff *skb) -{ - RB_CLEAR_NODE(&skb->rbnode); - FRAG_CB(skb)->next_frag = NULL; - - FRAG_CB(q->last_run_head)->frag_run_len += skb->len; - FRAG_CB(q->fragments_tail)->next_frag = skb; - q->fragments_tail = skb; -} - -/* Create a new "run" with the skb. */ -static void ip4_frag_create_run(struct inet_frag_queue *q, struct sk_buff *skb) -{ - if (q->last_run_head) - rb_link_node(&skb->rbnode, &q->last_run_head->rbnode, - &q->last_run_head->rbnode.rb_right); - else - rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node); - rb_insert_color(&skb->rbnode, &q->rb_fragments); - - ip4_frag_init_run(skb); - q->fragments_tail = skb; - q->last_run_head = skb; -} - /* Describe an entry in the "incomplete datagrams" queue. */ struct ipq { struct inet_frag_queue q; @@ -212,27 +161,9 @@ static void ip_expire(struct timer_list *t) * pull the head out of the tree in order to be able to * deal with head->dev. */ - if (qp->q.fragments) { - head = qp->q.fragments; - qp->q.fragments = head->next; - } else { - head = skb_rb_first(&qp->q.rb_fragments); - if (!head) - goto out; - if (FRAG_CB(head)->next_frag) - rb_replace_node(&head->rbnode, - &FRAG_CB(head)->next_frag->rbnode, - &qp->q.rb_fragments); - else - rb_erase(&head->rbnode, &qp->q.rb_fragments); - memset(&head->rbnode, 0, sizeof(head->rbnode)); - barrier(); - } - if (head == qp->q.fragments_tail) - qp->q.fragments_tail = NULL; - - sub_frag_mem_limit(qp->q.net, head->truesize); - + head = inet_frag_pull_head(&qp->q); + if (!head) + goto out; head->dev = dev_get_by_index_rcu(net, qp->iif); if (!head->dev) goto out; @@ -345,12 +276,10 @@ static int ip_frag_reinit(struct ipq *qp) static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); - struct rb_node **rbn, *parent; - struct sk_buff *skb1, *prev_tail; - int ihl, end, skb1_run_end; + int ihl, end, flags, offset; + struct sk_buff *prev_tail; struct net_device *dev; unsigned int fragsize; - int flags, offset; int err = -ENOENT; u8 ecn; @@ -382,7 +311,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) */ if (end < qp->q.len || ((qp->q.flags & INET_FRAG_LAST_IN) && end != qp->q.len)) - goto err; + goto discard_qp; qp->q.flags |= INET_FRAG_LAST_IN; qp->q.len = end; } else { @@ -394,82 +323,33 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (end > qp->q.len) { /* Some bits beyond end -> corruption. */ if (qp->q.flags & INET_FRAG_LAST_IN) - goto err; + goto discard_qp; qp->q.len = end; } } if (end == offset) - goto err; + goto discard_qp; err = -ENOMEM; if (!pskb_pull(skb, skb_network_offset(skb) + ihl)) - goto err; + goto discard_qp; err = pskb_trim_rcsum(skb, end - offset); if (err) - goto err; + goto discard_qp; /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; /* Makes sure compiler wont do silly aliasing games */ barrier(); - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - * - * We do the same here for IPv4 (and increment an snmp counter) but - * we do not want to drop the whole queue in response to a duplicate - * fragment. - */ - - err = -EINVAL; - /* Find out where to put this fragment. */ prev_tail = qp->q.fragments_tail; - if (!prev_tail) - ip4_frag_create_run(&qp->q, skb); /* First fragment. */ - else if (prev_tail->ip_defrag_offset + prev_tail->len < end) { - /* This is the common case: skb goes to the end. */ - /* Detect and discard overlaps. */ - if (offset < prev_tail->ip_defrag_offset + prev_tail->len) - goto discard_qp; - if (offset == prev_tail->ip_defrag_offset + prev_tail->len) - ip4_frag_append_to_last_run(&qp->q, skb); - else - ip4_frag_create_run(&qp->q, skb); - } else { - /* Binary search. Note that skb can become the first fragment, - * but not the last (covered above). - */ - rbn = &qp->q.rb_fragments.rb_node; - do { - parent = *rbn; - skb1 = rb_to_skb(parent); - skb1_run_end = skb1->ip_defrag_offset + - FRAG_CB(skb1)->frag_run_len; - if (end <= skb1->ip_defrag_offset) - rbn = &parent->rb_left; - else if (offset >= skb1_run_end) - rbn = &parent->rb_right; - else if (offset >= skb1->ip_defrag_offset && - end <= skb1_run_end) - goto err; /* No new data, potential duplicate */ - else - goto discard_qp; /* Found an overlap */ - } while (*rbn); - /* Here we have parent properly set, and rbn pointing to - * one of its NULL left/right children. Insert skb. - */ - ip4_frag_init_run(skb); - rb_link_node(&skb->rbnode, parent, rbn); - rb_insert_color(&skb->rbnode, &qp->q.rb_fragments); - } + err = inet_frag_queue_insert(&qp->q, skb, offset, end); + if (err) + goto insert_error; if (dev) qp->iif = dev->ifindex; - skb->ip_defrag_offset = offset; qp->q.stamp = skb->tstamp; qp->q.meat += skb->len; @@ -494,15 +374,24 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) skb->_skb_refdst = 0UL; err = ip_frag_reasm(qp, skb, prev_tail, dev); skb->_skb_refdst = orefdst; + if (err) + inet_frag_kill(&qp->q); return err; } skb_dst_drop(skb); return -EINPROGRESS; +insert_error: + if (err == IPFRAG_DUP) { + kfree_skb(skb); + return -EINVAL; + } + err = -EINVAL; + __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); discard_qp: inet_frag_kill(&qp->q); - __IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS); + __IP_INC_STATS(net, IPSTATS_MIB_REASMFAILS); err: kfree_skb(skb); return err; @@ -514,13 +403,8 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, { struct net *net = container_of(qp->q.net, struct net, ipv4.frags); struct iphdr *iph; - struct sk_buff *fp, *head = skb_rb_first(&qp->q.rb_fragments); - struct sk_buff **nextp; /* To build frag_list. */ - struct rb_node *rbn; - int len; - int ihlen; - int delta; - int err; + void *reasm_data; + int len, err; u8 ecn; ipq_kill(qp); @@ -530,117 +414,23 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, err = -EINVAL; goto out_fail; } + /* Make the one we just received the head. */ - if (head != skb) { - fp = skb_clone(skb, GFP_ATOMIC); - if (!fp) - goto out_nomem; - FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag; - if (RB_EMPTY_NODE(&skb->rbnode)) - FRAG_CB(prev_tail)->next_frag = fp; - else - rb_replace_node(&skb->rbnode, &fp->rbnode, - &qp->q.rb_fragments); - if (qp->q.fragments_tail == skb) - qp->q.fragments_tail = fp; - skb_morph(skb, head); - FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag; - rb_replace_node(&head->rbnode, &skb->rbnode, - &qp->q.rb_fragments); - consume_skb(head); - head = skb; - } - - WARN_ON(head->ip_defrag_offset != 0); - - /* Allocate a new buffer for the datagram. */ - ihlen = ip_hdrlen(head); - len = ihlen + qp->q.len; + reasm_data = inet_frag_reasm_prepare(&qp->q, skb, prev_tail); + if (!reasm_data) + goto out_nomem; + len = ip_hdrlen(skb) + qp->q.len; err = -E2BIG; if (len > 65535) goto out_oversize; - delta = - head->truesize; + inet_frag_reasm_finish(&qp->q, skb, reasm_data); - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_nomem; + skb->dev = dev; + IPCB(skb)->frag_max_size = max(qp->max_df_size, qp->q.max_size); - delta += head->truesize; - if (delta) - add_frag_mem_limit(qp->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_nomem; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->truesize += clone->truesize; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(qp->q.net, clone->truesize); - skb_shinfo(head)->frag_list = clone; - nextp = &clone->next; - } else { - nextp = &skb_shinfo(head)->frag_list; - } - - skb_push(head, head->data - skb_network_header(head)); - - /* Traverse the tree in order, to build frag_list. */ - fp = FRAG_CB(head)->next_frag; - rbn = rb_next(&head->rbnode); - rb_erase(&head->rbnode, &qp->q.rb_fragments); - while (rbn || fp) { - /* fp points to the next sk_buff in the current run; - * rbn points to the next run. - */ - /* Go through the current run. */ - while (fp) { - *nextp = fp; - nextp = &fp->next; - fp->prev = NULL; - memset(&fp->rbnode, 0, sizeof(fp->rbnode)); - fp->sk = NULL; - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp = FRAG_CB(fp)->next_frag; - } - /* Move to the next run. */ - if (rbn) { - struct rb_node *rbnext = rb_next(rbn); - - fp = rb_to_skb(rbn); - rb_erase(rbn, &qp->q.rb_fragments); - rbn = rbnext; - } - } - sub_frag_mem_limit(qp->q.net, head->truesize); - - *nextp = NULL; - head->next = NULL; - head->prev = NULL; - head->dev = dev; - head->tstamp = qp->q.stamp; - IPCB(head)->frag_max_size = max(qp->max_df_size, qp->q.max_size); - - iph = ip_hdr(head); + iph = ip_hdr(skb); iph->tot_len = htons(len); iph->tos |= ecn; @@ -653,7 +443,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *skb, * from one very small df-fragment and one large non-df frag. */ if (qp->max_df_size == qp->q.max_size) { - IPCB(head)->flags |= IPSKB_FRAG_PMTU; + IPCB(skb)->flags |= IPSKB_FRAG_PMTU; iph->frag_off = htons(IP_DF); } else { iph->frag_off = 0; @@ -751,28 +541,6 @@ struct sk_buff *ip_check_defrag(struct net *net, struct sk_buff *skb, u32 user) } EXPORT_SYMBOL(ip_check_defrag); -unsigned int inet_frag_rbtree_purge(struct rb_root *root) -{ - struct rb_node *p = rb_first(root); - unsigned int sum = 0; - - while (p) { - struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode); - - p = rb_next(p); - rb_erase(&skb->rbnode, root); - while (skb) { - struct sk_buff *next = FRAG_CB(skb)->next_frag; - - sum += skb->truesize; - kfree_skb(skb); - skb = next; - } - } - return sum; -} -EXPORT_SYMBOL(inet_frag_rbtree_purge); - #ifdef CONFIG_SYSCTL static int dist_min; From 5d827bfe37a5cecd1de33405afde4708657643c5 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 23 Apr 2019 10:48:23 -0700 Subject: [PATCH 57/73] ipv6: remove dependency of nf_defrag_ipv6 on ipv6 module [ Upstream commit 70b095c84326640eeacfd69a411db8fc36e8ab1a ] IPV6=m DEFRAG_IPV6=m CONNTRACK=y yields: net/netfilter/nf_conntrack_proto.o: In function `nf_ct_netns_do_get': net/netfilter/nf_conntrack_proto.c:802: undefined reference to `nf_defrag_ipv6_enable' net/netfilter/nf_conntrack_proto.o:(.rodata+0x640): undefined reference to `nf_conntrack_l4proto_icmpv6' Setting DEFRAG_IPV6=y causes undefined references to ip6_rhash_params ip6_frag_init and ip6_expire_frag_queue so it would be needed to force IPV6=y too. This patch gets rid of the 'followup linker error' by removing the dependency of ipv6.ko symbols from netfilter ipv6 defrag. Shared code is placed into a header, then used from both. Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin --- include/net/ipv6.h | 29 ------ include/net/ipv6_frag.h | 104 ++++++++++++++++++++++ net/ieee802154/6lowpan/reassembly.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 17 ++-- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 3 +- net/ipv6/reassembly.c | 92 ++----------------- net/openvswitch/conntrack.c | 1 + 7 files changed, 126 insertions(+), 122 deletions(-) create mode 100644 include/net/ipv6_frag.h diff --git a/include/net/ipv6.h b/include/net/ipv6.h index fa87a62e9bd3..6294d20a5f0e 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -512,35 +512,6 @@ static inline bool ipv6_prefix_equal(const struct in6_addr *addr1, } #endif -struct inet_frag_queue; - -enum ip6_defrag_users { - IP6_DEFRAG_LOCAL_DELIVER, - IP6_DEFRAG_CONNTRACK_IN, - __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_OUT, - __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, - IP6_DEFRAG_CONNTRACK_BRIDGE_IN, - __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, -}; - -void ip6_frag_init(struct inet_frag_queue *q, const void *a); -extern const struct rhashtable_params ip6_rhash_params; - -/* - * Equivalent of ipv4 struct ip - */ -struct frag_queue { - struct inet_frag_queue q; - - int iif; - unsigned int csum; - __u16 nhoffset; - u8 ecn; -}; - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq); - static inline bool ipv6_addr_any(const struct in6_addr *a) { #if defined(CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS) && BITS_PER_LONG == 64 diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h new file mode 100644 index 000000000000..6ced1e6899b6 --- /dev/null +++ b/include/net/ipv6_frag.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IPV6_FRAG_H +#define _IPV6_FRAG_H +#include +#include +#include +#include + +enum ip6_defrag_users { + IP6_DEFRAG_LOCAL_DELIVER, + IP6_DEFRAG_CONNTRACK_IN, + __IP6_DEFRAG_CONNTRACK_IN = IP6_DEFRAG_CONNTRACK_IN + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_OUT, + __IP6_DEFRAG_CONNTRACK_OUT = IP6_DEFRAG_CONNTRACK_OUT + USHRT_MAX, + IP6_DEFRAG_CONNTRACK_BRIDGE_IN, + __IP6_DEFRAG_CONNTRACK_BRIDGE_IN = IP6_DEFRAG_CONNTRACK_BRIDGE_IN + USHRT_MAX, +}; + +/* + * Equivalent of ipv4 struct ip + */ +struct frag_queue { + struct inet_frag_queue q; + + int iif; + __u16 nhoffset; + u8 ecn; +}; + +#if IS_ENABLED(CONFIG_IPV6) +static inline void ip6frag_init(struct inet_frag_queue *q, const void *a) +{ + struct frag_queue *fq = container_of(q, struct frag_queue, q); + const struct frag_v6_compare_key *key = a; + + q->key.v6 = *key; + fq->ecn = 0; +} + +static inline u32 ip6frag_key_hashfn(const void *data, u32 len, u32 seed) +{ + return jhash2(data, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline u32 ip6frag_obj_hashfn(const void *data, u32 len, u32 seed) +{ + const struct inet_frag_queue *fq = data; + + return jhash2((const u32 *)&fq->key.v6, + sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); +} + +static inline int +ip6frag_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) +{ + const struct frag_v6_compare_key *key = arg->key; + const struct inet_frag_queue *fq = ptr; + + return !!memcmp(&fq->key, key, sizeof(*key)); +} + +static inline void +ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) +{ + struct net_device *dev = NULL; + struct sk_buff *head; + + rcu_read_lock(); + spin_lock(&fq->q.lock); + + if (fq->q.flags & INET_FRAG_COMPLETE) + goto out; + + inet_frag_kill(&fq->q); + + dev = dev_get_by_index_rcu(net, fq->iif); + if (!dev) + goto out; + + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); + __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); + + /* Don't send error if the first segment did not arrive. */ + head = fq->q.fragments; + if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + goto out; + + head->dev = dev; + skb_get(head); + spin_unlock(&fq->q.lock); + + icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); + kfree_skb(head); + goto out_rcu_unlock; + +out: + spin_unlock(&fq->q.lock); +out_rcu_unlock: + rcu_read_unlock(); + inet_frag_put(&fq->q); +} +#endif +#endif diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 2cc224106b69..ec7a5da56129 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include "6lowpan_i.h" diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 237fb04c6716..0568d49b5da4 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -33,9 +33,8 @@ #include #include -#include +#include -#include #include #include #include @@ -159,7 +158,7 @@ static void nf_ct_frag6_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, nf_frag.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } /* Creation primitives. */ @@ -641,16 +640,24 @@ static struct pernet_operations nf_ct_net_ops = { .exit = nf_ct_net_exit, }; +static const struct rhashtable_params nfct_rhash_params = { + .head_offset = offsetof(struct inet_frag_queue, node), + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, + .automatic_shrinking = true, +}; + int nf_ct_frag6_init(void) { int ret = 0; - nf_frags.constructor = ip6_frag_init; + nf_frags.constructor = ip6frag_init; nf_frags.destructor = NULL; nf_frags.qsize = sizeof(struct frag_queue); nf_frags.frag_expire = nf_ct_frag6_expire; nf_frags.frags_cache_name = nf_frags_cache_name; - nf_frags.rhash_params = ip6_rhash_params; + nf_frags.rhash_params = nfct_rhash_params; ret = inet_frags_init(&nf_frags); if (ret) goto out; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c index b326da59257f..123bfb13a5d1 100644 --- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -14,8 +14,7 @@ #include #include #include -#include -#include +#include #include #include diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index f75e9e711c31..e5ab3b7813d6 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -57,7 +57,7 @@ #include #include #include -#include +#include #include static const char ip6_frag_cache_name[] = "ip6-frags"; @@ -79,61 +79,6 @@ static struct inet_frags ip6_frags; static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev); -void ip6_frag_init(struct inet_frag_queue *q, const void *a) -{ - struct frag_queue *fq = container_of(q, struct frag_queue, q); - const struct frag_v6_compare_key *key = a; - - q->key.v6 = *key; - fq->ecn = 0; -} -EXPORT_SYMBOL(ip6_frag_init); - -void ip6_expire_frag_queue(struct net *net, struct frag_queue *fq) -{ - struct net_device *dev = NULL; - struct sk_buff *head; - - rcu_read_lock(); - spin_lock(&fq->q.lock); - - if (fq->q.flags & INET_FRAG_COMPLETE) - goto out; - - inet_frag_kill(&fq->q); - - dev = dev_get_by_index_rcu(net, fq->iif); - if (!dev) - goto out; - - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); - __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); - - /* Don't send error if the first segment did not arrive. */ - head = fq->q.fragments; - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) - goto out; - - /* But use as source device on which LAST ARRIVED - * segment was received. And do not use fq->dev - * pointer directly, device might already disappeared. - */ - head->dev = dev; - skb_get(head); - spin_unlock(&fq->q.lock); - - icmpv6_send(head, ICMPV6_TIME_EXCEED, ICMPV6_EXC_FRAGTIME, 0); - kfree_skb(head); - goto out_rcu_unlock; - -out: - spin_unlock(&fq->q.lock); -out_rcu_unlock: - rcu_read_unlock(); - inet_frag_put(&fq->q); -} -EXPORT_SYMBOL(ip6_expire_frag_queue); - static void ip6_frag_expire(struct timer_list *t) { struct inet_frag_queue *frag = from_timer(frag, t, timer); @@ -143,7 +88,7 @@ static void ip6_frag_expire(struct timer_list *t) fq = container_of(frag, struct frag_queue, q); net = container_of(fq->q.net, struct net, ipv6.frags); - ip6_expire_frag_queue(net, fq); + ip6frag_expire_frag_queue(net, fq); } static struct frag_queue * @@ -713,42 +658,19 @@ static struct pernet_operations ip6_frags_ops = { .exit = ipv6_frags_exit_net, }; -static u32 ip6_key_hashfn(const void *data, u32 len, u32 seed) -{ - return jhash2(data, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static u32 ip6_obj_hashfn(const void *data, u32 len, u32 seed) -{ - const struct inet_frag_queue *fq = data; - - return jhash2((const u32 *)&fq->key.v6, - sizeof(struct frag_v6_compare_key) / sizeof(u32), seed); -} - -static int ip6_obj_cmpfn(struct rhashtable_compare_arg *arg, const void *ptr) -{ - const struct frag_v6_compare_key *key = arg->key; - const struct inet_frag_queue *fq = ptr; - - return !!memcmp(&fq->key, key, sizeof(*key)); -} - -const struct rhashtable_params ip6_rhash_params = { +static const struct rhashtable_params ip6_rhash_params = { .head_offset = offsetof(struct inet_frag_queue, node), - .hashfn = ip6_key_hashfn, - .obj_hashfn = ip6_obj_hashfn, - .obj_cmpfn = ip6_obj_cmpfn, + .hashfn = ip6frag_key_hashfn, + .obj_hashfn = ip6frag_obj_hashfn, + .obj_cmpfn = ip6frag_obj_cmpfn, .automatic_shrinking = true, }; -EXPORT_SYMBOL(ip6_rhash_params); int __init ipv6_frag_init(void) { int ret; - ip6_frags.constructor = ip6_frag_init; + ip6_frags.constructor = ip6frag_init; ip6_frags.destructor = NULL; ip6_frags.qsize = sizeof(struct frag_queue); ip6_frags.frag_expire = ip6_frag_expire; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 285f8797c26a..0171b27a2b81 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -23,6 +23,7 @@ #include #include #include +#include #ifdef CONFIG_NF_NAT_NEEDED #include From 6925083963809d53f1d5ef58c2ef8a3792908166 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Tue, 23 Apr 2019 10:48:24 -0700 Subject: [PATCH 58/73] net: IP6 defrag: use rbtrees for IPv6 defrag [ Upstream commit d4289fcc9b16b89619ee1c54f829e05e56de8b9a ] Currently, IPv6 defragmentation code drops non-last fragments that are smaller than 1280 bytes: see commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu") This behavior is not specified in IPv6 RFCs and appears to break compatibility with some IPv6 implemenations, as reported here: https://www.spinics.net/lists/netdev/msg543846.html This patch re-uses common IP defragmentation queueing and reassembly code in IPv6, removing the 1280 byte restriction. v2: change handling of overlaps to match that of upstream. Signed-off-by: Peter Oskolkov Reported-by: Tom Herbert Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- include/net/ipv6_frag.h | 11 +- net/ipv6/reassembly.c | 247 +++++++++++----------------------------- 2 files changed, 76 insertions(+), 182 deletions(-) diff --git a/include/net/ipv6_frag.h b/include/net/ipv6_frag.h index 6ced1e6899b6..28aa9b30aece 100644 --- a/include/net/ipv6_frag.h +++ b/include/net/ipv6_frag.h @@ -82,8 +82,15 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq) __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT); /* Don't send error if the first segment did not arrive. */ - head = fq->q.fragments; - if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head) + if (!(fq->q.flags & INET_FRAG_FIRST_IN)) + goto out; + + /* sk_buff::dev and sk_buff::rbnode are unionized. So we + * pull the head out of the tree in order to be able to + * deal with head->dev. + */ + head = inet_frag_pull_head(&fq->q); + if (!head) goto out; head->dev = dev; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index e5ab3b7813d6..fe797b29ca89 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -62,13 +62,6 @@ static const char ip6_frag_cache_name[] = "ip6-frags"; -struct ip6frag_skb_cb { - struct inet6_skb_parm h; - int offset; -}; - -#define FRAG6_CB(skb) ((struct ip6frag_skb_cb *)((skb)->cb)) - static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); @@ -76,8 +69,8 @@ static u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) static struct inet_frags ip6_frags; -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, - struct net_device *dev); +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev); static void ip6_frag_expire(struct timer_list *t) { @@ -118,21 +111,26 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, struct frag_hdr *fhdr, int nhoff, u32 *prob_offset) { - struct sk_buff *prev, *next; - struct net_device *dev; - int offset, end, fragsize; struct net *net = dev_net(skb_dst(skb)->dev); + int offset, end, fragsize; + struct sk_buff *prev_tail; + struct net_device *dev; + int err = -ENOENT; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) goto err; + err = -EINVAL; offset = ntohs(fhdr->frag_off) & ~0x7; end = offset + (ntohs(ipv6_hdr(skb)->payload_len) - ((u8 *)(fhdr + 1) - (u8 *)(ipv6_hdr(skb) + 1))); if ((unsigned int)end > IPV6_MAXPLEN) { *prob_offset = (u8 *)&fhdr->frag_off - skb_network_header(skb); + /* note that if prob_offset is set, the skb is freed elsewhere, + * we do not free it here. + */ return -1; } @@ -152,7 +150,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, */ if (end < fq->q.len || ((fq->q.flags & INET_FRAG_LAST_IN) && end != fq->q.len)) - goto err; + goto discard_fq; fq->q.flags |= INET_FRAG_LAST_IN; fq->q.len = end; } else { @@ -169,70 +167,36 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, if (end > fq->q.len) { /* Some bits beyond end -> corruption. */ if (fq->q.flags & INET_FRAG_LAST_IN) - goto err; + goto discard_fq; fq->q.len = end; } } if (end == offset) - goto err; + goto discard_fq; + err = -ENOMEM; /* Point into the IP datagram 'data' part. */ if (!pskb_pull(skb, (u8 *) (fhdr + 1) - skb->data)) - goto err; - - if (pskb_trim_rcsum(skb, end - offset)) - goto err; - - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ - prev = fq->q.fragments_tail; - if (!prev || FRAG6_CB(prev)->offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = fq->q.fragments; next != NULL; next = next->next) { - if (FRAG6_CB(next)->offset >= offset) - break; /* bingo! */ - prev = next; - } - -found: - /* RFC5722, Section 4, amended by Errata ID : 3089 - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments) MUST be silently discarded. - */ - - /* Check for overlap with preceding fragment. */ - if (prev && - (FRAG6_CB(prev)->offset + prev->len) > offset) goto discard_fq; - /* Look for overlap with succeeding segment. */ - if (next && FRAG6_CB(next)->offset < end) + err = pskb_trim_rcsum(skb, end - offset); + if (err) goto discard_fq; - FRAG6_CB(skb)->offset = offset; - - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - fq->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - fq->q.fragments = skb; - + /* Note : skb->rbnode and skb->dev share the same location. */ dev = skb->dev; - if (dev) { + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + + prev_tail = fq->q.fragments_tail; + err = inet_frag_queue_insert(&fq->q, skb, offset, end); + if (err) + goto insert_error; + + if (dev) fq->iif = dev->ifindex; - skb->dev = NULL; - } + fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -252,44 +216,48 @@ found: if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && fq->q.meat == fq->q.len) { - int res; unsigned long orefdst = skb->_skb_refdst; skb->_skb_refdst = 0UL; - res = ip6_frag_reasm(fq, prev, dev); + err = ip6_frag_reasm(fq, skb, prev_tail, dev); skb->_skb_refdst = orefdst; - return res; + return err; } skb_dst_drop(skb); - return -1; + return -EINPROGRESS; +insert_error: + if (err == IPFRAG_DUP) { + kfree_skb(skb); + return -EINVAL; + } + err = -EINVAL; + __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), + IPSTATS_MIB_REASM_OVERLAPS); discard_fq: inet_frag_kill(&fq->q); -err: __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_REASMFAILS); +err: kfree_skb(skb); - return -1; + return err; } /* * Check if this packet is complete. - * Returns NULL on failure by any reason, and pointer - * to current nexthdr field in reassembled frame. * * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. */ -static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, - struct net_device *dev) +static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev) { struct net *net = container_of(fq->q.net, struct net, ipv6.frags); - struct sk_buff *fp, *head = fq->q.fragments; - int payload_len, delta; unsigned int nhoff; - int sum_truesize; + void *reasm_data; + int payload_len; u8 ecn; inet_frag_kill(&fq->q); @@ -298,120 +266,40 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, if (unlikely(ecn == 0xff)) goto out_fail; - /* Make the one we just received the head. */ - if (prev) { - head = prev->next; - fp = skb_clone(head, GFP_ATOMIC); + reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); + if (!reasm_data) + goto out_oom; - if (!fp) - goto out_oom; - - fp->next = head->next; - if (!fp->next) - fq->q.fragments_tail = fp; - prev->next = fp; - - skb_morph(head, fq->q.fragments); - head->next = fq->q.fragments->next; - - consume_skb(fq->q.fragments); - fq->q.fragments = head; - } - - WARN_ON(head == NULL); - WARN_ON(FRAG6_CB(head)->offset != 0); - - /* Unfragmented part is taken from the first segment. */ - payload_len = ((head->data - skb_network_header(head)) - + payload_len = ((skb->data - skb_network_header(skb)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) goto out_oversize; - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - goto out_oom; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(fq->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (!clone) - goto out_oom; - clone->next = head->next; - head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - add_frag_mem_limit(fq->q.net, clone->truesize); - } - /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ nhoff = fq->nhoffset; - skb_network_header(head)[nhoff] = skb_transport_header(head)[0]; - memmove(head->head + sizeof(struct frag_hdr), head->head, - (head->data - head->head) - sizeof(struct frag_hdr)); - if (skb_mac_header_was_set(head)) - head->mac_header += sizeof(struct frag_hdr); - head->network_header += sizeof(struct frag_hdr); + skb_network_header(skb)[nhoff] = skb_transport_header(skb)[0]; + memmove(skb->head + sizeof(struct frag_hdr), skb->head, + (skb->data - skb->head) - sizeof(struct frag_hdr)); + if (skb_mac_header_was_set(skb)) + skb->mac_header += sizeof(struct frag_hdr); + skb->network_header += sizeof(struct frag_hdr); - skb_reset_transport_header(head); - skb_push(head, head->data - skb_network_header(head)); + skb_reset_transport_header(skb); - sum_truesize = head->truesize; - for (fp = head->next; fp;) { - bool headstolen; - int delta; - struct sk_buff *next = fp->next; + inet_frag_reasm_finish(&fq->q, skb, reasm_data); - sum_truesize += fp->truesize; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - - if (skb_try_coalesce(head, fp, &headstolen, &delta)) { - kfree_skb_partial(fp, headstolen); - } else { - if (!skb_shinfo(head)->frag_list) - skb_shinfo(head)->frag_list = fp; - head->data_len += fp->len; - head->len += fp->len; - head->truesize += fp->truesize; - } - fp = next; - } - sub_frag_mem_limit(fq->q.net, sum_truesize); - - head->next = NULL; - head->dev = dev; - head->tstamp = fq->q.stamp; - ipv6_hdr(head)->payload_len = htons(payload_len); - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); - IP6CB(head)->nhoff = nhoff; - IP6CB(head)->flags |= IP6SKB_FRAGMENTED; - IP6CB(head)->frag_max_size = fq->q.max_size; + skb->dev = dev; + ipv6_hdr(skb)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); + IP6CB(skb)->nhoff = nhoff; + IP6CB(skb)->flags |= IP6SKB_FRAGMENTED; + IP6CB(skb)->frag_max_size = fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ - skb_postpush_rcsum(head, skb_network_header(head), - skb_network_header_len(head)); + skb_postpush_rcsum(skb, skb_network_header(skb), + skb_network_header_len(skb)); rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS); @@ -419,6 +307,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, fq->q.fragments = NULL; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; + fq->q.last_run_head = NULL; return 1; out_oversize: @@ -430,6 +319,7 @@ out_fail: rcu_read_lock(); __IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMFAILS); rcu_read_unlock(); + inet_frag_kill(&fq->q); return -1; } @@ -468,10 +358,6 @@ static int ipv6_frag_rcv(struct sk_buff *skb) return 1; } - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && - fhdr->frag_off & htons(IP6_MF)) - goto fail_hdr; - iif = skb->dev ? skb->dev->ifindex : 0; fq = fq_find(net, fhdr->identification, hdr, iif); if (fq) { @@ -489,6 +375,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb) if (prob_offset) { __IP6_INC_STATS(net, ip6_dst_idev(skb_dst(skb)), IPSTATS_MIB_INHDRERRORS); + /* icmpv6_param_prob() calls kfree_skb(skb) */ icmpv6_param_prob(skb, ICMPV6_HDR_FIELD, prob_offset); } return ret; From 74cec2565b14c3bd6d1c77d5e3ef9eaffab70404 Mon Sep 17 00:00:00 2001 From: Peter Oskolkov Date: Tue, 23 Apr 2019 10:48:25 -0700 Subject: [PATCH 59/73] net: IP6 defrag: use rbtrees in nf_conntrack_reasm.c [ Upstream commit 997dd96471641e147cb2c33ad54284000d0f5e35 ] Currently, IPv6 defragmentation code drops non-last fragments that are smaller than 1280 bytes: see commit 0ed4229b08c1 ("ipv6: defrag: drop non-last frags smaller than min mtu") This behavior is not specified in IPv6 RFCs and appears to break compatibility with some IPv6 implemenations, as reported here: https://www.spinics.net/lists/netdev/msg543846.html This patch re-uses common IP defragmentation queueing and reassembly code in IP6 defragmentation in nf_conntrack, removing the 1280 byte restriction. Signed-off-by: Peter Oskolkov Reported-by: Tom Herbert Cc: Eric Dumazet Cc: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Sasha Levin --- net/ipv6/netfilter/nf_conntrack_reasm.c | 258 +++++++----------------- 1 file changed, 70 insertions(+), 188 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 0568d49b5da4..cb1b4772dac0 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -51,14 +51,6 @@ static const char nf_frags_cache_name[] = "nf-frags"; -struct nf_ct_frag6_skb_cb -{ - struct inet6_skb_parm h; - int offset; -}; - -#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb)) - static struct inet_frags nf_frags; #ifdef CONFIG_SYSCTL @@ -144,6 +136,9 @@ static void __net_exit nf_ct_frags6_sysctl_unregister(struct net *net) } #endif +static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev); + static inline u8 ip6_frag_ecn(const struct ipv6hdr *ipv6h) { return 1 << (ipv6_get_dsfield(ipv6h) & INET_ECN_MASK); @@ -185,9 +180,10 @@ static struct frag_queue *fq_find(struct net *net, __be32 id, u32 user, static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, const struct frag_hdr *fhdr, int nhoff) { - struct sk_buff *prev, *next; unsigned int payload_len; - int offset, end; + struct net_device *dev; + struct sk_buff *prev; + int offset, end, err; u8 ecn; if (fq->q.flags & INET_FRAG_COMPLETE) { @@ -262,55 +258,19 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, goto err; } - /* Find out which fragments are in front and at the back of us - * in the chain of fragments so far. We must know where to put - * this fragment, right? - */ + /* Note : skb->rbnode and skb->dev share the same location. */ + dev = skb->dev; + /* Makes sure compiler wont do silly aliasing games */ + barrier(); + prev = fq->q.fragments_tail; - if (!prev || NFCT_FRAG6_CB(prev)->offset < offset) { - next = NULL; - goto found; - } - prev = NULL; - for (next = fq->q.fragments; next != NULL; next = next->next) { - if (NFCT_FRAG6_CB(next)->offset >= offset) - break; /* bingo! */ - prev = next; - } + err = inet_frag_queue_insert(&fq->q, skb, offset, end); + if (err) + goto insert_error; -found: - /* RFC5722, Section 4: - * When reassembling an IPv6 datagram, if - * one or more its constituent fragments is determined to be an - * overlapping fragment, the entire datagram (and any constituent - * fragments, including those not yet received) MUST be silently - * discarded. - */ + if (dev) + fq->iif = dev->ifindex; - /* Check for overlap with preceding fragment. */ - if (prev && - (NFCT_FRAG6_CB(prev)->offset + prev->len) > offset) - goto discard_fq; - - /* Look for overlap with succeeding segment. */ - if (next && NFCT_FRAG6_CB(next)->offset < end) - goto discard_fq; - - NFCT_FRAG6_CB(skb)->offset = offset; - - /* Insert this fragment in the chain of fragments. */ - skb->next = next; - if (!next) - fq->q.fragments_tail = skb; - if (prev) - prev->next = skb; - else - fq->q.fragments = skb; - - if (skb->dev) { - fq->iif = skb->dev->ifindex; - skb->dev = NULL; - } fq->q.stamp = skb->tstamp; fq->q.meat += skb->len; fq->ecn |= ecn; @@ -326,11 +286,25 @@ found: fq->q.flags |= INET_FRAG_FIRST_IN; } - return 0; + if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && + fq->q.meat == fq->q.len) { + unsigned long orefdst = skb->_skb_refdst; -discard_fq: + skb->_skb_refdst = 0UL; + err = nf_ct_frag6_reasm(fq, skb, prev, dev); + skb->_skb_refdst = orefdst; + return err; + } + + skb_dst_drop(skb); + return -EINPROGRESS; + +insert_error: + if (err == IPFRAG_DUP) + goto err; inet_frag_kill(&fq->q); err: + skb_dst_drop(skb); return -EINVAL; } @@ -340,147 +314,67 @@ err: * It is called with locked fq, and caller must check that * queue is eligible for reassembly i.e. it is not COMPLETE, * the last and the first frames arrived and all the bits are here. - * - * returns true if *prev skb has been transformed into the reassembled - * skb, false otherwise. */ -static bool -nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev) +static int nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *skb, + struct sk_buff *prev_tail, struct net_device *dev) { - struct sk_buff *fp, *head = fq->q.fragments; - int payload_len, delta; + void *reasm_data; + int payload_len; u8 ecn; inet_frag_kill(&fq->q); - WARN_ON(head == NULL); - WARN_ON(NFCT_FRAG6_CB(head)->offset != 0); - ecn = ip_frag_ecn_table[fq->ecn]; if (unlikely(ecn == 0xff)) - return false; + goto err; - /* Unfragmented part is taken from the first segment. */ - payload_len = ((head->data - skb_network_header(head)) - + reasm_data = inet_frag_reasm_prepare(&fq->q, skb, prev_tail); + if (!reasm_data) + goto err; + + payload_len = ((skb->data - skb_network_header(skb)) - sizeof(struct ipv6hdr) + fq->q.len - sizeof(struct frag_hdr)); if (payload_len > IPV6_MAXPLEN) { net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n", payload_len); - return false; - } - - delta = - head->truesize; - - /* Head of list must not be cloned. */ - if (skb_unclone(head, GFP_ATOMIC)) - return false; - - delta += head->truesize; - if (delta) - add_frag_mem_limit(fq->q.net, delta); - - /* If the first fragment is fragmented itself, we split - * it to two chunks: the first with data and paged part - * and the second, holding only fragments. */ - if (skb_has_frag_list(head)) { - struct sk_buff *clone; - int i, plen = 0; - - clone = alloc_skb(0, GFP_ATOMIC); - if (clone == NULL) - return false; - - clone->next = head->next; - head->next = clone; - skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list; - skb_frag_list_init(head); - for (i = 0; i < skb_shinfo(head)->nr_frags; i++) - plen += skb_frag_size(&skb_shinfo(head)->frags[i]); - clone->len = clone->data_len = head->data_len - plen; - head->data_len -= clone->len; - head->len -= clone->len; - clone->csum = 0; - clone->ip_summed = head->ip_summed; - - add_frag_mem_limit(fq->q.net, clone->truesize); - } - - /* morph head into last received skb: prev. - * - * This allows callers of ipv6 conntrack defrag to continue - * to use the last skb(frag) passed into the reasm engine. - * The last skb frag 'silently' turns into the full reassembled skb. - * - * Since prev is also part of q->fragments we have to clone it first. - */ - if (head != prev) { - struct sk_buff *iter; - - fp = skb_clone(prev, GFP_ATOMIC); - if (!fp) - return false; - - fp->next = prev->next; - - iter = head; - while (iter) { - if (iter->next == prev) { - iter->next = fp; - break; - } - iter = iter->next; - } - - skb_morph(prev, head); - prev->next = head->next; - consume_skb(head); - head = prev; + goto err; } /* We have to remove fragment header from datagram and to relocate * header in order to calculate ICV correctly. */ - skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0]; - memmove(head->head + sizeof(struct frag_hdr), head->head, - (head->data - head->head) - sizeof(struct frag_hdr)); - head->mac_header += sizeof(struct frag_hdr); - head->network_header += sizeof(struct frag_hdr); + skb_network_header(skb)[fq->nhoffset] = skb_transport_header(skb)[0]; + memmove(skb->head + sizeof(struct frag_hdr), skb->head, + (skb->data - skb->head) - sizeof(struct frag_hdr)); + skb->mac_header += sizeof(struct frag_hdr); + skb->network_header += sizeof(struct frag_hdr); - skb_shinfo(head)->frag_list = head->next; - skb_reset_transport_header(head); - skb_push(head, head->data - skb_network_header(head)); + skb_reset_transport_header(skb); - for (fp = head->next; fp; fp = fp->next) { - head->data_len += fp->len; - head->len += fp->len; - if (head->ip_summed != fp->ip_summed) - head->ip_summed = CHECKSUM_NONE; - else if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_add(head->csum, fp->csum); - head->truesize += fp->truesize; - fp->sk = NULL; - } - sub_frag_mem_limit(fq->q.net, head->truesize); + inet_frag_reasm_finish(&fq->q, skb, reasm_data); - head->ignore_df = 1; - head->next = NULL; - head->dev = dev; - head->tstamp = fq->q.stamp; - ipv6_hdr(head)->payload_len = htons(payload_len); - ipv6_change_dsfield(ipv6_hdr(head), 0xff, ecn); - IP6CB(head)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; + skb->ignore_df = 1; + skb->dev = dev; + ipv6_hdr(skb)->payload_len = htons(payload_len); + ipv6_change_dsfield(ipv6_hdr(skb), 0xff, ecn); + IP6CB(skb)->frag_max_size = sizeof(struct ipv6hdr) + fq->q.max_size; /* Yes, and fold redundant checksum back. 8) */ - if (head->ip_summed == CHECKSUM_COMPLETE) - head->csum = csum_partial(skb_network_header(head), - skb_network_header_len(head), - head->csum); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->csum = csum_partial(skb_network_header(skb), + skb_network_header_len(skb), + skb->csum); fq->q.fragments = NULL; fq->q.rb_fragments = RB_ROOT; fq->q.fragments_tail = NULL; + fq->q.last_run_head = NULL; - return true; + return 0; + +err: + inet_frag_kill(&fq->q); + return -EINVAL; } /* @@ -549,7 +443,6 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff) int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) { u16 savethdr = skb->transport_header; - struct net_device *dev = skb->dev; int fhoff, nhoff, ret; struct frag_hdr *fhdr; struct frag_queue *fq; @@ -572,10 +465,6 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) hdr = ipv6_hdr(skb); fhdr = (struct frag_hdr *)skb_transport_header(skb); - if (skb->len - skb_network_offset(skb) < IPV6_MIN_MTU && - fhdr->frag_off & htons(IP6_MF)) - return -EINVAL; - skb_orphan(skb); fq = fq_find(net, fhdr->identification, user, hdr, skb->dev ? skb->dev->ifindex : 0); @@ -587,24 +476,17 @@ int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user) spin_lock_bh(&fq->q.lock); ret = nf_ct_frag6_queue(fq, skb, fhdr, nhoff); - if (ret < 0) { - if (ret == -EPROTO) { - skb->transport_header = savethdr; - ret = 0; - } - goto out_unlock; + if (ret == -EPROTO) { + skb->transport_header = savethdr; + ret = 0; } /* after queue has assumed skb ownership, only 0 or -EINPROGRESS * must be returned. */ - ret = -EINPROGRESS; - if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) && - fq->q.meat == fq->q.len && - nf_ct_frag6_reasm(fq, skb, dev)) - ret = 0; + if (ret) + ret = -EINPROGRESS; -out_unlock: spin_unlock_bh(&fq->q.lock); inet_frag_put(&fq->q); return ret; From 56714d4517b45acfb3747980fda39e018deb756a Mon Sep 17 00:00:00 2001 From: Matthias Kaehlcke Date: Tue, 23 Apr 2019 12:04:22 -0700 Subject: [PATCH 60/73] Revert "kbuild: use -Oz instead of -Os when using clang" commit a75bb4eb9e565b9f5115e2e8c07377ce32cbe69a upstream. The clang option -Oz enables *aggressive* optimization for size, which doesn't necessarily result in smaller images, but can have negative impact on performance. Switch back to the less aggressive -Os. This reverts commit 6748cb3c299de1ffbe56733647b01dbcc398c419. Suggested-by: Peter Zijlstra Signed-off-by: Matthias Kaehlcke Reviewed-by: Nick Desaulniers Signed-off-by: Masahiro Yamada Signed-off-by: Nathan Chancellor Signed-off-by: Sasha Levin --- Makefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/Makefile b/Makefile index fcfef30ca9a6..d36e66ff60aa 100644 --- a/Makefile +++ b/Makefile @@ -653,8 +653,7 @@ KBUILD_CFLAGS += $(call cc-disable-warning, int-in-bool-context) KBUILD_CFLAGS += $(call cc-disable-warning, attribute-alias) ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE -KBUILD_CFLAGS += $(call cc-option,-Oz,-Os) -KBUILD_CFLAGS += $(call cc-disable-warning,maybe-uninitialized,) +KBUILD_CFLAGS += -Os $(call cc-disable-warning,maybe-uninitialized,) else ifdef CONFIG_PROFILE_ALL_BRANCHES KBUILD_CFLAGS += -O2 $(call cc-disable-warning,maybe-uninitialized,) From d069fe4844f8d799d771659a745fe91870c93fda Mon Sep 17 00:00:00 2001 From: Phil Auld Date: Tue, 23 Apr 2019 19:51:06 -0400 Subject: [PATCH 61/73] sched/fair: Limit sched_cfs_period_timer() loop to avoid hard lockup [ Upstream commit 2e8e19226398db8265a8e675fcc0118b9e80c9e8 ] With extremely short cfs_period_us setting on a parent task group with a large number of children the for loop in sched_cfs_period_timer() can run until the watchdog fires. There is no guarantee that the call to hrtimer_forward_now() will ever return 0. The large number of children can make do_sched_cfs_period_timer() take longer than the period. NMI watchdog: Watchdog detected hard LOCKUP on cpu 24 RIP: 0010:tg_nop+0x0/0x10 walk_tg_tree_from+0x29/0xb0 unthrottle_cfs_rq+0xe0/0x1a0 distribute_cfs_runtime+0xd3/0xf0 sched_cfs_period_timer+0xcb/0x160 ? sched_cfs_slack_timer+0xd0/0xd0 __hrtimer_run_queues+0xfb/0x270 hrtimer_interrupt+0x122/0x270 smp_apic_timer_interrupt+0x6a/0x140 apic_timer_interrupt+0xf/0x20 To prevent this we add protection to the loop that detects when the loop has run too many times and scales the period and quota up, proportionally, so that the timer can complete before then next period expires. This preserves the relative runtime quota while preventing the hard lockup. A warning is issued reporting this state and the new values. Signed-off-by: Phil Auld Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Anton Blanchard Cc: Ben Segall Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190319130005.25492-1-pauld@redhat.com Signed-off-by: Ingo Molnar Signed-off-by: Sasha Levin --- kernel/sched/fair.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 9829ede00498..a5d163903835 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4672,12 +4672,15 @@ static enum hrtimer_restart sched_cfs_slack_timer(struct hrtimer *timer) return HRTIMER_NORESTART; } +extern const u64 max_cfs_quota_period; + static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b = container_of(timer, struct cfs_bandwidth, period_timer); int overrun; int idle = 0; + int count = 0; raw_spin_lock(&cfs_b->lock); for (;;) { @@ -4685,6 +4688,28 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) if (!overrun) break; + if (++count > 3) { + u64 new, old = ktime_to_ns(cfs_b->period); + + new = (old * 147) / 128; /* ~115% */ + new = min(new, max_cfs_quota_period); + + cfs_b->period = ns_to_ktime(new); + + /* since max is 1s, this is limited to 1e9^2, which fits in u64 */ + cfs_b->quota *= new; + cfs_b->quota = div64_u64(cfs_b->quota, old); + + pr_warn_ratelimited( + "cfs_period_timer[cpu%d]: period too short, scaling up (new cfs_period_us %lld, cfs_quota_us = %lld)\n", + smp_processor_id(), + div_u64(new, NSEC_PER_USEC), + div_u64(cfs_b->quota, NSEC_PER_USEC)); + + /* reset count so we don't come right back in here */ + count = 0; + } + idle = do_sched_cfs_period_timer(cfs_b, overrun); } if (idle) From 80ef021be19d3cdebe162c73b13d0f249d5bd2f3 Mon Sep 17 00:00:00 2001 From: Jann Horn Date: Tue, 19 Mar 2019 02:36:59 +0100 Subject: [PATCH 62/73] device_cgroup: fix RCU imbalance in error case commit 0fcc4c8c044e117ac126ab6df4138ea9a67fa2a9 upstream. When dev_exception_add() returns an error (due to a failed memory allocation), make sure that we move the RCU preemption count back to where it was before we were called. We dropped the RCU read lock inside the loop body, so we can't just "break". sparse complains about this, too: $ make -s C=2 security/device_cgroup.o ./include/linux/rcupdate.h:647:9: warning: context imbalance in 'propagate_exception' - unexpected unlock Fixes: d591fb56618f ("device_cgroup: simplify cgroup tree walk in propagate_exception()") Cc: stable@vger.kernel.org Signed-off-by: Jann Horn Acked-by: Michal Hocko Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- security/device_cgroup.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 5ef7e5240563..ea014df89428 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -569,7 +569,7 @@ static int propagate_exception(struct dev_cgroup *devcg_root, devcg->behavior == DEVCG_DEFAULT_ALLOW) { rc = dev_exception_add(devcg, ex); if (rc) - break; + return rc; } else { /* * in the other possible cases: From 98ae85677ebfac2fa2243a9c954d96ea58c08e85 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Thu, 18 Apr 2019 17:50:20 -0700 Subject: [PATCH 63/73] mm/vmstat.c: fix /proc/vmstat format for CONFIG_DEBUG_TLBFLUSH=y CONFIG_SMP=n commit e8277b3b52240ec1caad8e6df278863e4bf42eac upstream. Commit 58bc4c34d249 ("mm/vmstat.c: skip NR_TLB_REMOTE_FLUSH* properly") depends on skipping vmstat entries with empty name introduced in 7aaf77272358 ("mm: don't show nr_indirectly_reclaimable in /proc/vmstat") but reverted in b29940c1abd7 ("mm: rename and change semantics of nr_indirectly_reclaimable_bytes"). So skipping no longer works and /proc/vmstat has misformatted lines " 0". This patch simply shows debug counters "nr_tlb_remote_*" for UP. Link: http://lkml.kernel.org/r/155481488468.467.4295519102880913454.stgit@buzz Fixes: 58bc4c34d249 ("mm/vmstat.c: skip NR_TLB_REMOTE_FLUSH* properly") Signed-off-by: Konstantin Khlebnikov Acked-by: Vlastimil Babka Cc: Roman Gushchin Cc: Jann Horn Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/vmstat.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/vmstat.c b/mm/vmstat.c index 6389e876c7a7..28c45c26f901 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1201,13 +1201,8 @@ const char * const vmstat_text[] = { #endif #endif /* CONFIG_MEMORY_BALLOON */ #ifdef CONFIG_DEBUG_TLBFLUSH -#ifdef CONFIG_SMP "nr_tlb_remote_flush", "nr_tlb_remote_flush_received", -#else - "", /* nr_tlb_remote_flush */ - "", /* nr_tlb_remote_flush_received */ -#endif /* CONFIG_SMP */ "nr_tlb_local_flush_all", "nr_tlb_local_flush_one", #endif /* CONFIG_DEBUG_TLBFLUSH */ From 216f6570d18bcd06975205b8af1708ea10a1baf6 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 16 Apr 2019 15:25:00 +0200 Subject: [PATCH 64/73] ALSA: info: Fix racy addition/deletion of nodes commit 8c2f870890fd28e023b0fcf49dcee333f2c8bad7 upstream. The ALSA proc helper manages the child nodes in a linked list, but its addition and deletion is done without any lock. This leads to a corruption if they are operated concurrently. Usually this isn't a problem because the proc entries are added sequentially in the driver probe procedure itself. But the card registrations are done often asynchronously, and the crash could be actually reproduced with syzkaller. This patch papers over it by protecting the link addition and deletion with the parent's mutex. There is "access" mutex that is used for the file access, and this can be reused for this purpose as well. Reported-by: syzbot+48df349490c36f9f54ab@syzkaller.appspotmail.com Cc: Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/info.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/sound/core/info.c b/sound/core/info.c index bcf6a48cc70d..5fb00437507b 100644 --- a/sound/core/info.c +++ b/sound/core/info.c @@ -722,8 +722,11 @@ snd_info_create_entry(const char *name, struct snd_info_entry *parent) INIT_LIST_HEAD(&entry->children); INIT_LIST_HEAD(&entry->list); entry->parent = parent; - if (parent) + if (parent) { + mutex_lock(&parent->access); list_add_tail(&entry->list, &parent->children); + mutex_unlock(&parent->access); + } return entry; } @@ -805,7 +808,12 @@ void snd_info_free_entry(struct snd_info_entry * entry) list_for_each_entry_safe(p, n, &entry->children, list) snd_info_free_entry(p); - list_del(&entry->list); + p = entry->parent; + if (p) { + mutex_lock(&p->access); + list_del(&entry->list); + mutex_unlock(&p->access); + } kfree(entry->name); if (entry->private_free) entry->private_free(entry); From 47ad82a34560ea70e85d2eb56be0ada03dc4fd35 Mon Sep 17 00:00:00 2001 From: Matteo Croce Date: Mon, 18 Mar 2019 02:32:36 +0100 Subject: [PATCH 65/73] percpu: stop printing kernel addresses commit 00206a69ee32f03e6f40837684dcbe475ea02266 upstream. Since commit ad67b74d2469d9b8 ("printk: hash addresses printed with %p"), at boot "____ptrval____" is printed instead of actual addresses: percpu: Embedded 38 pages/cpu @(____ptrval____) s124376 r0 d31272 u524288 Instead of changing the print to "%px", and leaking kernel addresses, just remove the print completely, cfr. e.g. commit 071929dbdd865f77 ("arm64: Stop printing the virtual memory layout"). Signed-off-by: Matteo Croce Signed-off-by: Dennis Zhou Signed-off-by: Greg Kroah-Hartman --- mm/percpu.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/percpu.c b/mm/percpu.c index 3074148b7e0d..0c06e2f549a7 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -2507,8 +2507,8 @@ int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size, ai->groups[group].base_offset = areas[group] - base; } - pr_info("Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n", - PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size, + pr_info("Embedded %zu pages/cpu s%zu r%zu d%zu u%zu\n", + PFN_DOWN(size_sum), ai->static_size, ai->reserved_size, ai->dyn_size, ai->unit_size); rc = pcpu_setup_first_chunk(ai, base); @@ -2629,8 +2629,8 @@ int __init pcpu_page_first_chunk(size_t reserved_size, } /* we're ready, commit */ - pr_info("%d %s pages/cpu @%p s%zu r%zu d%zu\n", - unit_pages, psize_str, vm.addr, ai->static_size, + pr_info("%d %s pages/cpu s%zu r%zu d%zu\n", + unit_pages, psize_str, ai->static_size, ai->reserved_size, ai->dyn_size); rc = pcpu_setup_first_chunk(ai, vm.addr); From c5e44540ca312c4bbaafb1950c825a930c9adfdf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 25 Sep 2018 10:55:59 -0300 Subject: [PATCH 66/73] tools include: Adopt linux/bits.h commit ba4aa02b417f08a0bee5e7b8ed70cac788a7c854 upstream. So that we reduce the difference of tools/include/linux/bitops.h to the original kernel file, include/linux/bitops.h, trying to remove the need to define BITS_PER_LONG, to avoid clashes with asm/bitsperlong.h. And the things removed from tools/include/linux/bitops.h are really in linux/bits.h, so that we can have a copy and then tools/perf/check_headers.sh will tell us when new stuff gets added to linux/bits.h so that we can check if it is useful and if any adjustment needs to be done to the tools/{include,arch}/ copies. Cc: Adrian Hunter Cc: Alexander Sverdlin Cc: David Ahern Cc: Jiri Olsa Cc: Namhyung Kim Cc: Wang Nan Link: https://lkml.kernel.org/n/tip-y1sqyydvfzo0bjjoj4zsl562@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: Greg Kroah-Hartman --- tools/include/linux/bitops.h | 7 ++----- tools/include/linux/bits.h | 26 ++++++++++++++++++++++++++ tools/perf/check-headers.sh | 1 + 3 files changed, 29 insertions(+), 5 deletions(-) create mode 100644 tools/include/linux/bits.h diff --git a/tools/include/linux/bitops.h b/tools/include/linux/bitops.h index acc704bd3998..0b0ef3abc966 100644 --- a/tools/include/linux/bitops.h +++ b/tools/include/linux/bitops.h @@ -3,8 +3,6 @@ #define _TOOLS_LINUX_BITOPS_H_ #include -#include - #ifndef __WORDSIZE #define __WORDSIZE (__SIZEOF_LONG__ * 8) #endif @@ -12,10 +10,9 @@ #ifndef BITS_PER_LONG # define BITS_PER_LONG __WORDSIZE #endif +#include +#include -#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) -#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) -#define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #define BITS_TO_U64(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u64)) #define BITS_TO_U32(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(u32)) diff --git a/tools/include/linux/bits.h b/tools/include/linux/bits.h new file mode 100644 index 000000000000..2b7b532c1d51 --- /dev/null +++ b/tools/include/linux/bits.h @@ -0,0 +1,26 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_BITS_H +#define __LINUX_BITS_H +#include + +#define BIT(nr) (1UL << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) +#define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) +#define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) +#define BITS_PER_BYTE 8 + +/* + * Create a contiguous bitmask starting at bit position @l and ending at + * position @h. For example + * GENMASK_ULL(39, 21) gives us the 64bit vector 0x000000ffffe00000. + */ +#define GENMASK(h, l) \ + (((~0UL) - (1UL << (l)) + 1) & (~0UL >> (BITS_PER_LONG - 1 - (h)))) + +#define GENMASK_ULL(h, l) \ + (((~0ULL) - (1ULL << (l)) + 1) & \ + (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h)))) + +#endif /* __LINUX_BITS_H */ diff --git a/tools/perf/check-headers.sh b/tools/perf/check-headers.sh index 50cd6228f506..df1dbee8d98d 100755 --- a/tools/perf/check-headers.sh +++ b/tools/perf/check-headers.sh @@ -11,6 +11,7 @@ include/uapi/linux/sched.h include/uapi/linux/stat.h include/uapi/linux/vhost.h include/uapi/sound/asound.h +include/linux/bits.h include/linux/hash.h include/uapi/linux/hw_breakpoint.h arch/x86/include/asm/disabled-features.h From 4fa17fc730ba3b7f0cb69937df919a7a226067be Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 8 Jan 2018 10:41:39 -0800 Subject: [PATCH 67/73] iomap: report collisions between directio and buffered writes to userspace commit 5a9d929d6e13278df62bd9e3d3ceae8c87ad1eea upstream. If two programs simultaneously try to write to the same part of a file via direct IO and buffered IO, there's a chance that the post-diowrite pagecache invalidation will fail on the dirty page. When this happens, the dio write succeeded, which means that the page cache is no longer coherent with the disk! Programs are not supposed to mix IO types and this is a clear case of data corruption, so store an EIO which will be reflected to userspace during the next fsync. Replace the WARN_ON with a ratelimited pr_crit so that the developers have /some/ kind of breadcrumb to track down the offending program(s) and file(s) involved. Signed-off-by: Darrick J. Wong Reviewed-by: Liu Bo Signed-off-by: Greg Kroah-Hartman Cc: Zubin Mithra --- fs/direct-io.c | 24 +++++++++++++++++++++++- fs/iomap.c | 12 ++++++++++-- include/linux/fs.h | 1 + 3 files changed, 34 insertions(+), 3 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 2c90d541f527..30bf22c989de 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -219,6 +219,27 @@ static inline struct page *dio_get_page(struct dio *dio, return dio->pages[sdio->head]; } +/* + * Warn about a page cache invalidation failure during a direct io write. + */ +void dio_warn_stale_pagecache(struct file *filp) +{ + static DEFINE_RATELIMIT_STATE(_rs, 86400 * HZ, DEFAULT_RATELIMIT_BURST); + char pathname[128]; + struct inode *inode = file_inode(filp); + char *path; + + errseq_set(&inode->i_mapping->wb_err, -EIO); + if (__ratelimit(&_rs)) { + path = file_path(filp, pathname, sizeof(pathname)); + if (IS_ERR(path)) + path = "(unknown)"; + pr_crit("Page cache invalidation failure on direct I/O. Possible data corruption due to collision with buffered I/O!\n"); + pr_crit("File: %s PID: %d Comm: %.20s\n", path, current->pid, + current->comm); + } +} + /** * dio_complete() - called when all DIO BIO I/O has been completed * @offset: the byte offset in the file of the completed operation @@ -290,7 +311,8 @@ static ssize_t dio_complete(struct dio *dio, ssize_t ret, unsigned int flags) err = invalidate_inode_pages2_range(dio->inode->i_mapping, offset >> PAGE_SHIFT, (offset + ret - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(dio->iocb->ki_filp); } if (!(dio->flags & DIO_SKIP_DIO_COUNT)) diff --git a/fs/iomap.c b/fs/iomap.c index 8f7673a69273..467d98bf7054 100644 --- a/fs/iomap.c +++ b/fs/iomap.c @@ -753,7 +753,8 @@ static ssize_t iomap_dio_complete(struct iomap_dio *dio) err = invalidate_inode_pages2_range(inode->i_mapping, offset >> PAGE_SHIFT, (offset + dio->size - 1) >> PAGE_SHIFT); - WARN_ON_ONCE(err); + if (err) + dio_warn_stale_pagecache(iocb->ki_filp); } inode_dio_end(file_inode(iocb->ki_filp)); @@ -1010,9 +1011,16 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_free_dio; + /* + * Try to invalidate cache pages for the range we're direct + * writing. If this invalidation fails, tough, the write will + * still work, but racing two incompatible write paths is a + * pretty crazy thing to do, so we don't support it 100%. + */ ret = invalidate_inode_pages2_range(mapping, start >> PAGE_SHIFT, end >> PAGE_SHIFT); - WARN_ON_ONCE(ret); + if (ret) + dio_warn_stale_pagecache(iocb->ki_filp); ret = 0; if (iov_iter_rw(iter) == WRITE && !dio->wait_for_completion && diff --git a/include/linux/fs.h b/include/linux/fs.h index f6a577edec67..dafac283b0ff 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2965,6 +2965,7 @@ enum { }; void dio_end_io(struct bio *bio); +void dio_warn_stale_pagecache(struct file *filp); ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, From 2411a27e7475273aa10179d94bb00e958f6fbec5 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:02 -0800 Subject: [PATCH 68/73] xfs: add the ability to join a held buffer to a defer_ops commit b7b2846fe26f2c0d7f317c874a13d3ecf22670ff upstream. In certain cases, defer_ops callers will lock a buffer and want to hold the lock across transaction rolls. Similar to ijoined inodes, we want to dirty & join the buffer with each transaction roll in defer_finish so that afterwards the caller still owns the buffer lock and we haven't inadvertently pinned the log. Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Alex Lyakas Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_defer.c | 39 ++++++++++++++++++++++++++++++++++++--- fs/xfs/libxfs/xfs_defer.h | 5 ++++- 2 files changed, 40 insertions(+), 4 deletions(-) diff --git a/fs/xfs/libxfs/xfs_defer.c b/fs/xfs/libxfs/xfs_defer.c index 072ebfe1d6ae..087fea02c389 100644 --- a/fs/xfs/libxfs/xfs_defer.c +++ b/fs/xfs/libxfs/xfs_defer.c @@ -249,6 +249,10 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_log_inode(*tp, dop->dop_inodes[i], XFS_ILOG_CORE); + /* Hold the (previously bjoin'd) buffer locked across the roll. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) + xfs_trans_dirty_buf(*tp, dop->dop_bufs[i]); + trace_xfs_defer_trans_roll((*tp)->t_mountp, dop); /* Roll the transaction. */ @@ -264,6 +268,12 @@ xfs_defer_trans_roll( for (i = 0; i < XFS_DEFER_OPS_NR_INODES && dop->dop_inodes[i]; i++) xfs_trans_ijoin(*tp, dop->dop_inodes[i], 0); + /* Rejoin the buffers and dirty them so the log moves forward. */ + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS && dop->dop_bufs[i]; i++) { + xfs_trans_bjoin(*tp, dop->dop_bufs[i]); + xfs_trans_bhold(*tp, dop->dop_bufs[i]); + } + return error; } @@ -295,6 +305,31 @@ xfs_defer_ijoin( } } + ASSERT(0); + return -EFSCORRUPTED; +} + +/* + * Add this buffer to the deferred op. Each joined buffer is relogged + * each time we roll the transaction. + */ +int +xfs_defer_bjoin( + struct xfs_defer_ops *dop, + struct xfs_buf *bp) +{ + int i; + + for (i = 0; i < XFS_DEFER_OPS_NR_BUFS; i++) { + if (dop->dop_bufs[i] == bp) + return 0; + else if (dop->dop_bufs[i] == NULL) { + dop->dop_bufs[i] = bp; + return 0; + } + } + + ASSERT(0); return -EFSCORRUPTED; } @@ -493,9 +528,7 @@ xfs_defer_init( struct xfs_defer_ops *dop, xfs_fsblock_t *fbp) { - dop->dop_committed = false; - dop->dop_low = false; - memset(&dop->dop_inodes, 0, sizeof(dop->dop_inodes)); + memset(dop, 0, sizeof(struct xfs_defer_ops)); *fbp = NULLFSBLOCK; INIT_LIST_HEAD(&dop->dop_intake); INIT_LIST_HEAD(&dop->dop_pending); diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h index d4f046dd44bd..045beacdd37d 100644 --- a/fs/xfs/libxfs/xfs_defer.h +++ b/fs/xfs/libxfs/xfs_defer.h @@ -59,6 +59,7 @@ enum xfs_defer_ops_type { }; #define XFS_DEFER_OPS_NR_INODES 2 /* join up to two inodes */ +#define XFS_DEFER_OPS_NR_BUFS 2 /* join up to two buffers */ struct xfs_defer_ops { bool dop_committed; /* did any trans commit? */ @@ -66,8 +67,9 @@ struct xfs_defer_ops { struct list_head dop_intake; /* unlogged pending work */ struct list_head dop_pending; /* logged pending work */ - /* relog these inodes with each roll */ + /* relog these with each roll */ struct xfs_inode *dop_inodes[XFS_DEFER_OPS_NR_INODES]; + struct xfs_buf *dop_bufs[XFS_DEFER_OPS_NR_BUFS]; }; void xfs_defer_add(struct xfs_defer_ops *dop, enum xfs_defer_ops_type type, @@ -77,6 +79,7 @@ void xfs_defer_cancel(struct xfs_defer_ops *dop); void xfs_defer_init(struct xfs_defer_ops *dop, xfs_fsblock_t *fbp); bool xfs_defer_has_unfinished_work(struct xfs_defer_ops *dop); int xfs_defer_ijoin(struct xfs_defer_ops *dop, struct xfs_inode *ip); +int xfs_defer_bjoin(struct xfs_defer_ops *dop, struct xfs_buf *bp); /* Description of a deferred type. */ struct xfs_defer_op_type { From 4024e3bde13e46489634cd2734d1518b1d92aef4 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 7 Dec 2017 19:07:02 -0800 Subject: [PATCH 69/73] xfs: hold xfs_buf locked between shortform->leaf conversion and the addition of an attribute commit 6e643cd094de3bd0f97edcc1db0089afa24d909f upstream. The new attribute leaf buffer is not held locked across the transaction roll between the shortform->leaf modification and the addition of the new entry. As a result, the attribute buffer modification being made is not atomic from an operational perspective. Hence the AIL push can grab it in the transient state of "just created" after the initial transaction is rolled, because the buffer has been released. This leads to xfs_attr3_leaf_verify() asserting that hdr.count is zero, treating this as in-memory corruption, and shutting down the filesystem. Darrick ported the original patch to 4.15 and reworked it use the xfs_defer_bjoin helper and hold/join the buffer correctly across the second transaction roll. Signed-off-by: Alex Lyakas Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Alex Lyakas Signed-off-by: Greg Kroah-Hartman --- fs/xfs/libxfs/xfs_attr.c | 20 +++++++++++++++----- fs/xfs/libxfs/xfs_attr_leaf.c | 9 ++++++--- fs/xfs/libxfs/xfs_attr_leaf.h | 3 ++- 3 files changed, 23 insertions(+), 9 deletions(-) diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c index ea66f04f46f7..e4265db08e4b 100644 --- a/fs/xfs/libxfs/xfs_attr.c +++ b/fs/xfs/libxfs/xfs_attr.c @@ -212,6 +212,7 @@ xfs_attr_set( int flags) { struct xfs_mount *mp = dp->i_mount; + struct xfs_buf *leaf_bp = NULL; struct xfs_da_args args; struct xfs_defer_ops dfops; struct xfs_trans_res tres; @@ -327,9 +328,16 @@ xfs_attr_set( * GROT: another possible req'mt for a double-split btree op. */ xfs_defer_init(args.dfops, args.firstblock); - error = xfs_attr_shortform_to_leaf(&args); + error = xfs_attr_shortform_to_leaf(&args, &leaf_bp); if (error) goto out_defer_cancel; + /* + * Prevent the leaf buffer from being unlocked so that a + * concurrent AIL push cannot grab the half-baked leaf + * buffer and run into problems with the write verifier. + */ + xfs_trans_bhold(args.trans, leaf_bp); + xfs_defer_bjoin(args.dfops, leaf_bp); xfs_defer_ijoin(args.dfops, dp); error = xfs_defer_finish(&args.trans, args.dfops); if (error) @@ -337,13 +345,14 @@ xfs_attr_set( /* * Commit the leaf transformation. We'll need another (linked) - * transaction to add the new attribute to the leaf. + * transaction to add the new attribute to the leaf, which + * means that we have to hold & join the leaf buffer here too. */ - error = xfs_trans_roll_inode(&args.trans, dp); if (error) goto out; - + xfs_trans_bjoin(args.trans, leaf_bp); + leaf_bp = NULL; } if (xfs_bmap_one_block(dp, XFS_ATTR_FORK)) @@ -374,8 +383,9 @@ xfs_attr_set( out_defer_cancel: xfs_defer_cancel(&dfops); - args.trans = NULL; out: + if (leaf_bp) + xfs_trans_brelse(args.trans, leaf_bp); if (args.trans) xfs_trans_cancel(args.trans); xfs_iunlock(dp, XFS_ILOCK_EXCL); diff --git a/fs/xfs/libxfs/xfs_attr_leaf.c b/fs/xfs/libxfs/xfs_attr_leaf.c index 40e53a4fc0a6..73a541755d5b 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.c +++ b/fs/xfs/libxfs/xfs_attr_leaf.c @@ -739,10 +739,13 @@ xfs_attr_shortform_getvalue(xfs_da_args_t *args) } /* - * Convert from using the shortform to the leaf. + * Convert from using the shortform to the leaf. On success, return the + * buffer so that we can keep it locked until we're totally done with it. */ int -xfs_attr_shortform_to_leaf(xfs_da_args_t *args) +xfs_attr_shortform_to_leaf( + struct xfs_da_args *args, + struct xfs_buf **leaf_bp) { xfs_inode_t *dp; xfs_attr_shortform_t *sf; @@ -821,7 +824,7 @@ xfs_attr_shortform_to_leaf(xfs_da_args_t *args) sfe = XFS_ATTR_SF_NEXTENTRY(sfe); } error = 0; - + *leaf_bp = bp; out: kmem_free(tmpbuffer); return error; diff --git a/fs/xfs/libxfs/xfs_attr_leaf.h b/fs/xfs/libxfs/xfs_attr_leaf.h index f7dda0c237b0..894124efb421 100644 --- a/fs/xfs/libxfs/xfs_attr_leaf.h +++ b/fs/xfs/libxfs/xfs_attr_leaf.h @@ -48,7 +48,8 @@ void xfs_attr_shortform_create(struct xfs_da_args *args); void xfs_attr_shortform_add(struct xfs_da_args *args, int forkoff); int xfs_attr_shortform_lookup(struct xfs_da_args *args); int xfs_attr_shortform_getvalue(struct xfs_da_args *args); -int xfs_attr_shortform_to_leaf(struct xfs_da_args *args); +int xfs_attr_shortform_to_leaf(struct xfs_da_args *args, + struct xfs_buf **leaf_bp); int xfs_attr_shortform_remove(struct xfs_da_args *args); int xfs_attr_shortform_allfit(struct xfs_buf *bp, struct xfs_inode *dp); int xfs_attr_shortform_bytesfit(struct xfs_inode *dp, int bytes); From d6b11409047f4aa9b1fec4e665b5cd42fcf3bbdb Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sat, 27 Oct 2018 09:10:48 -0700 Subject: [PATCH 70/73] i2c-hid: properly terminate i2c_hid_dmi_desc_override_table[] array MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b59dfdaef173677b0b7e10f375226c0a1114fd20 upstream. Commit 9ee3e06610fd ("HID: i2c-hid: override HID descriptors for certain devices") added a new dmi_system_id quirk table to override certain HID report descriptors for some systems that lack them. But the table wasn't properly terminated, causing the dmi matching to walk off into la-la-land, and starting to treat random data as dmi descriptor pointers, causing boot-time oopses if you were at all unlucky. Terminate the array. We really should have some way to just statically check that arrays that should be terminated by an empty entry actually are so. But the HID people really should have caught this themselves, rather than have me deal with an oops during the merge window. Tssk, tssk. Cc: Julian Sax Cc: Benjamin Tissoires Cc: Jiri Kosina Signed-off-by: Linus Torvalds Cc: Ambrož Bizjak Signed-off-by: Greg Kroah-Hartman --- drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c b/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c index 1d645c9ab417..cac262a912c1 100644 --- a/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c +++ b/drivers/hid/i2c-hid/i2c-hid-dmi-quirks.c @@ -337,7 +337,8 @@ static const struct dmi_system_id i2c_hid_dmi_desc_override_table[] = { DMI_EXACT_MATCH(DMI_PRODUCT_NAME, "FlexBook edge11 - M-FBE11"), }, .driver_data = (void *)&sipodev_desc - } + }, + { } /* Terminate list */ }; From 599c8a408f875477624384edd60ce0c821ff7c8a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 25 Apr 2019 10:00:43 +0200 Subject: [PATCH 71/73] Revert "locking/lockdep: Add debug_locks check in __lock_downgrade()" This reverts commit 4a195a0bc2e954b91085d5c82eb20c51835ee7b0 which was commit 71492580571467fb7177aade19c18ce7486267f5 upstream. Tetsuo rightly points out that the backport here is incorrect, as it touches the __lock_set_class function instead of the intended __lock_downgrade function. Reported-by: Tetsuo Handa Cc: Waiman Long Cc: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Paul E. McKenney Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Will Deacon Cc: Ingo Molnar Cc: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- kernel/locking/lockdep.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index e57be7031cb3..bf694c709b96 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -3650,9 +3650,6 @@ __lock_set_class(struct lockdep_map *lock, const char *name, unsigned int depth; int i; - if (unlikely(!debug_locks)) - return 0; - depth = curr->lockdep_depth; /* * This function is about (re)setting the class of a held lock, From 62c1af5fb31a72c229d17f812fa1fc37f69d4dbf Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 5 Apr 2019 18:39:38 -0700 Subject: [PATCH 72/73] kernel/sysctl.c: fix out-of-bounds access when setting file-max commit 9002b21465fa4d829edfc94a5a441005cffaa972 upstream. Commit 32a5ad9c2285 ("sysctl: handle overflow for file-max") hooked up min/max values for the file-max sysctl parameter via the .extra1 and .extra2 fields in the corresponding struct ctl_table entry. Unfortunately, the minimum value points at the global 'zero' variable, which is an int. This results in a KASAN splat when accessed as a long by proc_doulongvec_minmax on 64-bit architectures: | BUG: KASAN: global-out-of-bounds in __do_proc_doulongvec_minmax+0x5d8/0x6a0 | Read of size 8 at addr ffff2000133d1c20 by task systemd/1 | | CPU: 0 PID: 1 Comm: systemd Not tainted 5.1.0-rc3-00012-g40b114779944 #2 | Hardware name: linux,dummy-virt (DT) | Call trace: | dump_backtrace+0x0/0x228 | show_stack+0x14/0x20 | dump_stack+0xe8/0x124 | print_address_description+0x60/0x258 | kasan_report+0x140/0x1a0 | __asan_report_load8_noabort+0x18/0x20 | __do_proc_doulongvec_minmax+0x5d8/0x6a0 | proc_doulongvec_minmax+0x4c/0x78 | proc_sys_call_handler.isra.19+0x144/0x1d8 | proc_sys_write+0x34/0x58 | __vfs_write+0x54/0xe8 | vfs_write+0x124/0x3c0 | ksys_write+0xbc/0x168 | __arm64_sys_write+0x68/0x98 | el0_svc_common+0x100/0x258 | el0_svc_handler+0x48/0xc0 | el0_svc+0x8/0xc | | The buggy address belongs to the variable: | zero+0x0/0x40 | | Memory state around the buggy address: | ffff2000133d1b00: 00 00 00 00 00 00 00 00 fa fa fa fa 04 fa fa fa | ffff2000133d1b80: fa fa fa fa 04 fa fa fa fa fa fa fa 04 fa fa fa | >ffff2000133d1c00: fa fa fa fa 04 fa fa fa fa fa fa fa 00 00 00 00 | ^ | ffff2000133d1c80: fa fa fa fa 00 fa fa fa fa fa fa fa 00 00 00 00 | ffff2000133d1d00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 Fix the splat by introducing a unsigned long 'zero_ul' and using that instead. Link: http://lkml.kernel.org/r/20190403153409.17307-1-will.deacon@arm.com Fixes: 32a5ad9c2285 ("sysctl: handle overflow for file-max") Signed-off-by: Will Deacon Acked-by: Christian Brauner Cc: Kees Cook Cc: Alexey Dobriyan Cc: Matteo Croce Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 34a3b8a262a9..f13601a616ad 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -124,6 +124,7 @@ static int zero; static int __maybe_unused one = 1; static int __maybe_unused two = 2; static int __maybe_unused four = 4; +static unsigned long zero_ul; static unsigned long one_ul = 1; static unsigned long long_max = LONG_MAX; static int one_hundred = 100; @@ -1682,7 +1683,7 @@ static struct ctl_table fs_table[] = { .maxlen = sizeof(files_stat.max_files), .mode = 0644, .proc_handler = proc_doulongvec_minmax, - .extra1 = &zero, + .extra1 = &zero_ul, .extra2 = &long_max, }, { From fa5941f45d7ed070118b7c209b7f2c3a034293bd Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 27 Apr 2019 09:35:42 +0200 Subject: [PATCH 73/73] Linux 4.14.114 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index d36e66ff60aa..47a9f9883bdd 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 VERSION = 4 PATCHLEVEL = 14 -SUBLEVEL = 113 +SUBLEVEL = 114 EXTRAVERSION = NAME = Petit Gorille