Files
kondors1995 2d5c481f29 bpf: squash revert spoofing and some backports:
Squashed commit of the following:

commit 259593385c05a430c4685b611c0e43b4272c22f8
Author: John Galt <johngaltfirstrun@gmail.com>
Date:   Fri Dec 13 08:30:37 2024 -0500

    bpf: squash revert spoofing and some backports:

    Squashed commit of the following:

    commit 8ac5df9c8bc9575059fff6cea0c40463b96fc129
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:58:17 2024 -0500

        Revert "BACKPORT: bpf: add skb_load_bytes_relative helper"

        This reverts commit 029893dcc5d67af16fdf0723bacaae37ec567f67.

    commit dbcbceafe848744ec188f74e87e9717916d359ea
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:58:13 2024 -0500

        Revert "BACKPORT: bpf: encapsulate verifier log state into a structure"

        This reverts commit d861145b97d247cbd9fe1400df52155f48639126.

    commit 478f4dfee0406b54525e68764cc9ba48af1624fc
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:58:10 2024 -0500

        Revert "BACKPORT: bpf: Rename bpf_verifer_log"

        This reverts commit 5d088635de1bf2d6ae9ea94e3dd1c601d30c0cce.

    commit 7bc7c24beb82168b49337530cb56b5dfeeafe19a
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:58:07 2024 -0500

        Revert "BACKPORT: bpf: btf: Introduce BPF Type Format (BTF)"

        This reverts commit 93d34e26514b4d9d15fd176706f57634b2e97485.

    commit 7106457ba90a459b6241fdd44df658c1b52c0e4b
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:58:03 2024 -0500

        Revert "bpf: Update logging functions to work with BTF"

        This reverts commit 97e6c528eb2f76c58a3b6a4c1e7fbeafcd97633a.

    commit 08e68c7ba56f5e78fd1afcd5a2164716a75b0fe3
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:58:00 2024 -0500

        Revert "bpf: btf: Validate type reference"

        This reverts commit c7b7eecbc1134e5d8865af2cc0692fc7156175d5.

    commit 7763cf0831970a64ed62f9b7362fca02ab6e83f1
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:51 2024 -0500

        Revert "bpf: btf: Check members of struct/union"

        This reverts commit 9a77b51cad6f04866ca067ca0e70a89b9f59ed56.

    commit eb033235f666b5f66995f4cf89702de7ab4721f8
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:47 2024 -0500

        Revert "bpf: btf: Add pretty print capability for data with BTF type info"

        This reverts commit 745692103435221d6e39bc177811769995540525.

    commit c32995674ace91e06c591d2f63177585e81adc75
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:43 2024 -0500

        Revert "BACKPORT: bpf: btf: Add BPF_BTF_LOAD command"

        This reverts commit 4e0afd38e20e5aa2df444361309bc07251ca6b2a.

    commit 1310bc8d4aca0015c8723e7624121eddf76b3244
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:38 2024 -0500

        Revert "bpf: btf: Add BPF_OBJ_GET_INFO_BY_FD support to BTF fd"

        This reverts commit d4b5d76d9101b97e6fe5181bcefe7f601ed19926.

    commit 881a49445608712bdb0a0f0c959838bdbc725f62
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:34 2024 -0500

        Revert "BACKPORT: bpf: btf: Clean up btf.h in uapi"

        This reverts commit 26b661822933d41b3feb59bb284334bfbbc82af4.

    commit e2109fd858ebd5fe392c8bf579b9350fbca35a35
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:29 2024 -0500

        Revert "bpf: btf: Avoid WARN_ON when CONFIG_REFCOUNT_FULL=y"

        This reverts commit 9abf878903404e649fef4ad0b189eec1c13d29fe.

    commit 088a7d9137f03da4e0fc1d72add3901823081ccd
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:23 2024 -0500

        Revert "bpf: Fix compiler warning on info.map_ids for 32bit platform"

        This reverts commit a3a278e1f6cf167d538ac52f4ad60bb9cf8d4129.

    commit 6e14aed6b63f2b266982454d83678445c062cf39
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:13 2024 -0500

        Revert "bpf: btf: Change how section is supported in btf_header"

        This reverts commit 4b60ffd683eb623a184b46761777838d7c49e707.

    commit 151a60855c23bf0317734031481d779efb369d6c
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:08 2024 -0500

        Revert "bpf: btf: Check array->index_type"

        This reverts commit b00e10f1a073fadce178b6fb62496722e16db303.

    commit 49775e9074a54ac5f60f518e6fc5a26172996eae
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:57:01 2024 -0500

        Revert "bpf: btf: Remove unused bits from uapi/linux/btf.h"

        This reverts commit c90c6ad34f7a8f565f351d21c2d5b9706838767d.

    commit b6d6c6ab28e4b018da6ce9e64125e63f4191d3d9
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:58 2024 -0500

        Revert "bpf: btf: Avoid variable length array"

        This reverts commit fe7d1f7750242e77a73839d173ac36c3e39d4171.

    commit a45bedecb9b1175fef96f2d64fba2d61777dbf35
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:49 2024 -0500

        Revert "bpf: btf: avoid -Wreturn-type warning"

        This reverts commit 78214f1e390bf1d69d9ae4ee80072ac85c34619e.

    commit 445efb8465b9fa5706d81098417f15656265322e
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:46 2024 -0500

        Revert "bpf: btf: Check array t->size"

        This reverts commit aed532e7466f77885a362e4b863bf90c41e834ba.

    commit 8aada590d525de735cf39196d88722e727c141e9
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:42 2024 -0500

        Revert "bpf: btf: Ensure t->type == 0 for BTF_KIND_FWD"

        This reverts commit 8c8b601dcc2e62e1276b73dfee8b49e40fb65944.

    commit ed67ad09e866c9c30897488088bbb4555ea3dc80
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:38 2024 -0500

        Revert "bpf: btf: Fix bitfield extraction for big endian"

        This reverts commit b0696a226c52868d64963f01665dd1a640a92f2b.

    commit 5cc64db782daf86cdf7ac77133ca94181bb29146
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:35 2024 -0500

        Revert "bpf: btf: Clean up BTF_INT_BITS() in uapi btf.h"

        This reverts commit 0f008594540b09c667ea88fc87cf289b8db334da.

    commit 3a5c6b9010426449c08ecdcc10e758431b1e515f
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:31 2024 -0500

        Revert "bpf: btf: Ensure the member->offset is in the right order"

        This reverts commit c5e361ecd6d45a7cdbffda02e4691a7a37198bdd.

    commit bd6173c1ac458b08d6cedaf06e6e53c93e6b0cc5
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:26 2024 -0500

        Revert "bpf: fix bpf_skb_load_bytes_relative pkt length check"

        This reverts commit 9ea14969874cd7896588df435c890f6f2f547821.

    commit 0b61d26b25a65d9ded4611426c6da9c78e41567c
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:22 2024 -0500

        Revert "bpf: btf: Fix end boundary calculation for type section"

        This reverts commit 08ef221c7fb604cb60c490fa999ec7254d492f05.

    commit 72fb2b9bb5b90f60ab71915fe4e57eeee3308163
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:18 2024 -0500

        Revert "bpf: btf: Fix a missing check bug"

        This reverts commit 594687e3e01e26086f3b0173e5eda9b9f0b672f8.

    commit 575a34ceba4013ad0230038f29f6ea0b3ba41a7e
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:15 2024 -0500

        Revert "bpf, btf: fix a missing check bug in btf_parse"

        This reverts commit 6bf31bbc438663756e92fb0aad4f5a35fd730fb0.

    commit bcca98c0bc5e19b38af3ddcd0feee80ad26e1f96
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:11 2024 -0500

        Revert "bpf: fix BTF limits"

        This reverts commit e351b26ae671dfacd82f27c1c5f66cf8089d930d.

    commit f71c484e340041d8828c94b39a233ea587d8cc09
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:07 2024 -0500

        Revert "bpf/btf: Fix BTF verification of enum members in struct/union"

        This reverts commit 861e65b744c171d59850e61a01715f194f25e45c.

    commit eca310722a2624d33cd49884aa18c36d435b10f8
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:56:02 2024 -0500

        Revert "bpf: btf: fix truncated last_member_type_id in btf_struct_resolve"

        This reverts commit d6cd1eac41b10e606ec7f445162a0617c01be973.

    commit caae5c99a3ca7bed0e318b31b6aa7ca8260a1c52
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:58 2024 -0500

        Revert "BACKPORT: net: bpf: rename ndo_xdp to ndo_bpf"

        This reverts commit 2a1ddcb6a384745195d57b4e4cdda2a55d2cbe47.

    commit f90bdcdaa095a4f10268bb740470a3e0893be21b
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:54 2024 -0500

        Revert "BACKPORT: bpf: offload: add infrastructure for loading programs for a specific netdev"

        This reverts commit a9516d402726094eafccce26a99cf5110d188be9.

    commit c6e0ce9019c06d9a45c030a2bc38eed320afd45a
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:50 2024 -0500

        Revert "bpf: offload: rename the ifindex field"

        This reverts commit 36bc9c7351a1dc78b3e71571998af381e876b4cb.

    commit 88b6a4d41b69df804b846a8ebdca410517e08343
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:46 2024 -0500

        Revert "BACKPORT: bpf: Check attach type at prog load time"

        This reverts commit fe5a0d514e4970d86983458136d4a2f6caeee365.

    commit 9ccfaa66a5ea042331f0aacdb3667e23c8ed363e
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:43 2024 -0500

        Revert "BACKPORT: bpf: introduce BPF_PROG_QUERY command"

        This reverts commit a5720688858170f1054f9549b5a628db1c252a88.

    commit adab2743b3fa0853d0351b33b0a286de745025e5
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:37 2024 -0500

        Revert "BACKPORT: bpf: Hooks for sys_bind"

        This reverts commit e484887c7e7aa026521ddc1773233368a6304b24.

    commit d462e09db98ad89b3a836f9b9a925812b0d8cfe7
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:33 2024 -0500

        Revert "BACKPORT: net: Introduce __inet_bind() and __inet6_bind"

        This reverts commit 41a3131c3e94c28fd084dd6f4358baee3824fd17.

    commit cdf7f55dc65b4bdf7ecfc924be77c6a039709b3d
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:29 2024 -0500

        Revert "BACKPORT: bpf: Hooks for sys_connect"

        This reverts commit f26fe7233e2885ef489707ab5a5a5dda9f081b80.

    commit 97685d5058f76ba4ea6dd2db157f4537f3a8953d
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:23 2024 -0500

        Revert "BACKPORT: bpf: Post-hooks for sys_bind"

        This reverts commit 284ac5bc7c70dac338301445e94e1ad40fb40fdb.

    commit d03d9c05036d3109eae643f473cc5a5ad0a80721
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:19 2024 -0500

        Revert "kernel: bpf: devmap: Create __dev_map_alloc_node"

        This reverts commit db726149fa9abfd1ca9add3e2db6b1524f7e90a3.

    commit 8c34bcb3e4c6630799764871b4af2e5f9344a371
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:15 2024 -0500

        Revert "BACKPORT: xdp: Add devmap_hash map type for looking up devices by hashed index"

        This reverts commit c4d4e1d201d8433e06b2ac66041d7105095a0204.

    commit ef277c7b3a08fd59943eb2b47af64afc513de008
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:11 2024 -0500

        Revert "BACKPORT: devmap: Allow map lookups from eBPF"

        This reverts commit 24d196375871c72de0de977de79afede5a7d1780.

    commit 4fcd87869c55c28ed59bff916d640147601816d2
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:07 2024 -0500

        Revert "gen_headers_{arm, arm64}: Add btf.h to the list"

        This reverts commit 37edfe7c90bac355885ffec3327b338a34619792.

    commit b89560e0b405b58ecc5fc12c15ad4f56147760d6
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:55:03 2024 -0500

        Revert "syscall: Fake uname to 4.19 for bpfloader/netd"

        This reverts commit 186e74af61269602d0c068d98928b1f25e03eba2.

    commit fd49f8c35eb7875d6810a5a52877ebc59bfd4530
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:54:59 2024 -0500

        Revert "syscall: Fake uname to 4.19 also for netbpfload"

        This reverts commit 34b9a1ab387d7dc83ede613b2c12b3741ea08edb.

    commit b853fcf2ff892664d0ff522ca7fd530bc94c023e
    Author: John Galt <johngaltfirstrun@gmail.com>
    Date:   Fri Dec 13 07:54:53 2024 -0500

        Revert "syscall: Increase bpf fake uname to 5.4"

        This reverts commit 9cdc014e11b410a7f03d8c968a35ee0dd6a28fff.

    # Conflicts:
    #	net/ipv4/af_inet.c
    #	net/ipv6/af_inet6.c

commit 4a0143fa36d300485650dc447b580151a69a3be2
Author: kondors1995 <normandija1945@gmail.com>
Date:   Wed Dec 18 13:48:16 2024 +0200

    Revert "syscall: Fake uname to 4.19 for bpfloader/netd"

    This reverts commit 417f37c97f.

commit 6f512c5c7341a51d7bbc9cdd93814764cae8868f
Author: kondors1995 <normandija1945@gmail.com>
Date:   Wed Dec 18 13:48:16 2024 +0200

    Revert "syscall: Fake uname to 4.19 also for netbpfload"

    This reverts commit a4c61c3d97.

commit 41f326616251f0122d81e518082ef7faaad4b2e5
Author: kondors1995 <normandija1945@gmail.com>
Date:   Wed Dec 18 13:48:15 2024 +0200

    Revert "syscall: Increase bpf fake uname to 5.4"

    This reverts commit 4a906017d4.

commit a0d3db72a836096cf533516d56c81a43150976ed
Author: kondors1995 <normandija1945@gmail.com>
Date:   Wed Dec 18 13:46:12 2024 +0200

    Revert "bpf: Hooks for sys_sendmsg"

    This reverts commit 735c155332.

commit 246eb3d90b95e0ab5aee8d5a9e9cd639c7beb174
Author: kondors1995 <normandija1945@gmail.com>
Date:   Wed Dec 18 13:45:08 2024 +0200

    Revert "syscall: Increase fake uname to 6.6.40"

    This reverts commit 92494b9920.

commit c56eaa5b7f170f58f2ade14bb71aaad2964b9018
Author: kondors1995 <normandija1945@gmail.com>
Date:   Mon Dec 9 21:35:20 2024 +0200

    raphael_defconfig: increase sbalance pooling rate to 10s

commit 54d190b8af
Author: Sultan Alsawaf <sultan@kerneltoast.com>
Date:   Wed Dec 4 15:53:22 2024 -0800

    sbalance: Fix severe misattribution of movable IRQs to the last active CPU

    Due to a horrible omission in the big IRQ list traversal, all movable IRQs
    are misattributed to the last active CPU in the system since that's what
    `bd` is last set to in the loop prior. This horribly breaks SBalance's
    notion of balance, producing nonsensical balancing decisions and failing to
    balance IRQs even when they are heavily imbalanced.

    Fix the massive breakage by adding the missing line of code to set `bd` to
    the CPU an IRQ actually belongs to, so that it's added to the correct CPU's
    movable IRQs list.

    Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>

commit f2fa2db581
Author: Sultan Alsawaf <sultan@kerneltoast.com>
Date:   Wed Dec 4 14:31:52 2024 -0800

    sbalance: Don't race with CPU hotplug

    When a CPU is hotplugged, cpu_active_mask is modified without any RCU
    synchronization. As a result, the only synchronization for cpu_active_mask
    provided by the hotplug code is the CPU hotplug lock.

    Furthermore, since IRQ balance is majorly disrupted during CPU hotplug due
    to mass IRQ migration off a dying CPU, SBalance just shouldn't operate
    while a CPU hotplug is in progress.

    Take the CPU hotplug lock in balance_irqs() to prevent races and mishaps
    during CPU hotplugs.

    Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>

commit a4e81ff60a
Author: Sultan Alsawaf <sultan@kerneltoast.com>
Date:   Wed Dec 4 14:16:48 2024 -0800

    sbalance: Convert various IRQ counter types to unsigned ints

    These counted values are actually unsigned ints, not unsigned longs.
    Convert them to unsigned ints since there's no reason for them to be longs.

    Signed-off-by: Sultan Alsawaf <sultan@kerneltoast.com>
2024-12-19 17:34:31 +02:00

421 lines
13 KiB
C

/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
* License as published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
/* Devmaps primary use is as a backend map for XDP BPF helper call
* bpf_redirect_map(). Because XDP is mostly concerned with performance we
* spent some effort to ensure the datapath with redirect maps does not use
* any locking. This is a quick note on the details.
*
* We have three possible paths to get into the devmap control plane bpf
* syscalls, bpf programs, and driver side xmit/flush operations. A bpf syscall
* will invoke an update, delete, or lookup operation. To ensure updates and
* deletes appear atomic from the datapath side xchg() is used to modify the
* netdev_map array. Then because the datapath does a lookup into the netdev_map
* array (read-only) from an RCU critical section we use call_rcu() to wait for
* an rcu grace period before free'ing the old data structures. This ensures the
* datapath always has a valid copy. However, the datapath does a "flush"
* operation that pushes any pending packets in the driver outside the RCU
* critical section. Each bpf_dtab_netdev tracks these pending operations using
* an atomic per-cpu bitmap. The bpf_dtab_netdev object will not be destroyed
* until all bits are cleared indicating outstanding flush operations have
* completed.
*
* BPF syscalls may race with BPF program calls on any of the update, delete
* or lookup operations. As noted above the xchg() operation also keep the
* netdev_map consistent in this case. From the devmap side BPF programs
* calling into these operations are the same as multiple user space threads
* making system calls.
*
* Finally, any of the above may race with a netdev_unregister notifier. The
* unregister notifier must search for net devices in the map structure that
* contain a reference to the net device and remove them. This is a two step
* process (a) dereference the bpf_dtab_netdev object in netdev_map and (b)
* check to see if the ifindex is the same as the net_device being removed.
* When removing the dev a cmpxchg() is used to ensure the correct dev is
* removed, in the case of a concurrent update or delete operation it is
* possible that the initially referenced dev is no longer in the map. As the
* notifier hook walks the map we know that new dev references can not be
* added by the user because core infrastructure ensures dev_get_by_index()
* calls will fail at this point.
*/
#include <linux/bpf.h>
#include <linux/filter.h>
#define DEV_CREATE_FLAG_MASK \
(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
struct bpf_dtab_netdev {
struct net_device *dev;
struct bpf_dtab *dtab;
unsigned int bit;
struct rcu_head rcu;
};
struct bpf_dtab {
struct bpf_map map;
struct bpf_dtab_netdev **netdev_map;
unsigned long __percpu *flush_needed;
struct list_head list;
};
static DEFINE_SPINLOCK(dev_map_lock);
static LIST_HEAD(dev_map_list);
static u64 dev_map_bitmap_size(const union bpf_attr *attr)
{
return BITS_TO_LONGS((u64) attr->max_entries) * sizeof(unsigned long);
}
static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
{
struct bpf_dtab *dtab;
int err = -EINVAL;
u64 cost;
if (!capable(CAP_NET_ADMIN))
return ERR_PTR(-EPERM);
/* check sanity of attributes */
if (attr->max_entries == 0 || attr->key_size != 4 ||
attr->value_size != 4 || attr->map_flags & ~DEV_CREATE_FLAG_MASK)
return ERR_PTR(-EINVAL);
dtab = kzalloc(sizeof(*dtab), GFP_USER);
if (!dtab)
return ERR_PTR(-ENOMEM);
/* mandatory map attributes */
dtab->map.map_type = attr->map_type;
dtab->map.key_size = attr->key_size;
dtab->map.value_size = attr->value_size;
dtab->map.max_entries = attr->max_entries;
dtab->map.map_flags = attr->map_flags;
dtab->map.numa_node = bpf_map_attr_numa_node(attr);
/* make sure page count doesn't overflow */
cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
cost += dev_map_bitmap_size(attr) * num_possible_cpus();
if (cost >= U32_MAX - PAGE_SIZE)
goto free_dtab;
dtab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;
/* if map size is larger than memlock limit, reject it early */
err = bpf_map_precharge_memlock(dtab->map.pages);
if (err)
goto free_dtab;
err = -ENOMEM;
/* A per cpu bitfield with a bit per possible net device */
dtab->flush_needed = __alloc_percpu_gfp(dev_map_bitmap_size(attr),
__alignof__(unsigned long),
GFP_KERNEL | __GFP_NOWARN);
if (!dtab->flush_needed)
goto free_dtab;
dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
sizeof(struct bpf_dtab_netdev *),
dtab->map.numa_node);
if (!dtab->netdev_map)
goto free_dtab;
spin_lock(&dev_map_lock);
list_add_tail_rcu(&dtab->list, &dev_map_list);
spin_unlock(&dev_map_lock);
return &dtab->map;
free_dtab:
free_percpu(dtab->flush_needed);
kfree(dtab);
return ERR_PTR(err);
}
static void dev_map_free(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
int i, cpu;
/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
* so the programs (can be more than one that used this map) were
* disconnected from events. Wait for outstanding critical sections in
* these programs to complete. The rcu critical section only guarantees
* no further reads against netdev_map. It does __not__ ensure pending
* flush operations (if any) are complete.
*/
spin_lock(&dev_map_lock);
list_del_rcu(&dtab->list);
spin_unlock(&dev_map_lock);
synchronize_rcu();
/* Make sure prior __dev_map_entry_free() have completed. */
rcu_barrier();
/* To ensure all pending flush operations have completed wait for flush
* bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
* Because the above synchronize_rcu() ensures the map is disconnected
* from the program we can assume no new bits will be set.
*/
for_each_online_cpu(cpu) {
unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
while (!bitmap_empty(bitmap, dtab->map.max_entries))
cond_resched();
}
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev;
dev = dtab->netdev_map[i];
if (!dev)
continue;
dev_put(dev->dev);
kfree(dev);
}
free_percpu(dtab->flush_needed);
bpf_map_area_free(dtab->netdev_map);
kfree(dtab);
}
static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
u32 index = key ? *(u32 *)key : U32_MAX;
u32 *next = next_key;
if (index >= dtab->map.max_entries) {
*next = 0;
return 0;
}
if (index == dtab->map.max_entries - 1)
return -ENOENT;
*next = index + 1;
return 0;
}
void __dev_map_insert_ctx(struct bpf_map *map, u32 bit)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
__set_bit(bit, bitmap);
}
/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
* from the driver before returning from its napi->poll() routine. The poll()
* routine is called either from busy_poll context or net_rx_action signaled
* from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
* net device can be torn down. On devmap tear down we ensure the ctx bitmap
* is zeroed before completing to ensure all flush operations have completed.
*/
void __dev_map_flush(struct bpf_map *map)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
u32 bit;
for_each_set_bit(bit, bitmap, map->max_entries) {
struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
struct net_device *netdev;
/* This is possible if the dev entry is removed by user space
* between xdp redirect and flush op.
*/
if (unlikely(!dev))
continue;
__clear_bit(bit, bitmap);
netdev = dev->dev;
if (likely(netdev->netdev_ops->ndo_xdp_flush))
netdev->netdev_ops->ndo_xdp_flush(netdev);
}
}
/* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
* update happens in parallel here a dev_put wont happen until after reading the
* ifindex.
*/
struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *dev;
if (key >= map->max_entries)
return NULL;
dev = READ_ONCE(dtab->netdev_map[key]);
return dev ? dev->dev : NULL;
}
static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
{
struct net_device *dev = __dev_map_lookup_elem(map, *(u32 *)key);
return dev ? &dev->ifindex : NULL;
}
static void dev_map_flush_old(struct bpf_dtab_netdev *dev)
{
if (dev->dev->netdev_ops->ndo_xdp_flush) {
struct net_device *fl = dev->dev;
unsigned long *bitmap;
int cpu;
for_each_online_cpu(cpu) {
bitmap = per_cpu_ptr(dev->dtab->flush_needed, cpu);
__clear_bit(dev->bit, bitmap);
fl->netdev_ops->ndo_xdp_flush(dev->dev);
}
}
}
static void __dev_map_entry_free(struct rcu_head *rcu)
{
struct bpf_dtab_netdev *dev;
dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
dev_map_flush_old(dev);
dev_put(dev->dev);
kfree(dev);
}
static int dev_map_delete_elem(struct bpf_map *map, void *key)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct bpf_dtab_netdev *old_dev;
int k = *(u32 *)key;
if (k >= map->max_entries)
return -EINVAL;
/* Use call_rcu() here to ensure any rcu critical sections have
* completed, but this does not guarantee a flush has happened
* yet. Because driver side rcu_read_lock/unlock only protects the
* running XDP program. However, for pending flush operations the
* dev and ctx are stored in another per cpu map. And additionally,
* the driver tear down ensures all soft irqs are complete before
* removing the net device in the case of dev_put equals zero.
*/
old_dev = xchg(&dtab->netdev_map[k], NULL);
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
return 0;
}
static int dev_map_update_elem(struct bpf_map *map, void *key, void *value,
u64 map_flags)
{
struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
struct net *net = current->nsproxy->net_ns;
struct bpf_dtab_netdev *dev, *old_dev;
u32 i = *(u32 *)key;
u32 ifindex = *(u32 *)value;
if (unlikely(map_flags > BPF_EXIST))
return -EINVAL;
if (unlikely(i >= dtab->map.max_entries))
return -E2BIG;
if (unlikely(map_flags == BPF_NOEXIST))
return -EEXIST;
if (!ifindex) {
dev = NULL;
} else {
dev = kmalloc_node(sizeof(*dev), GFP_ATOMIC | __GFP_NOWARN,
map->numa_node);
if (!dev)
return -ENOMEM;
dev->dev = dev_get_by_index(net, ifindex);
if (!dev->dev) {
kfree(dev);
return -EINVAL;
}
dev->bit = i;
dev->dtab = dtab;
}
/* Use call_rcu() here to ensure rcu critical sections have completed
* Remembering the driver side flush operation will happen before the
* net device is removed.
*/
old_dev = xchg(&dtab->netdev_map[i], dev);
if (old_dev)
call_rcu(&old_dev->rcu, __dev_map_entry_free);
return 0;
}
const struct bpf_map_ops dev_map_ops = {
.map_alloc = dev_map_alloc,
.map_free = dev_map_free,
.map_get_next_key = dev_map_get_next_key,
.map_lookup_elem = dev_map_lookup_elem,
.map_update_elem = dev_map_update_elem,
.map_delete_elem = dev_map_delete_elem,
};
static int dev_map_notification(struct notifier_block *notifier,
ulong event, void *ptr)
{
struct net_device *netdev = netdev_notifier_info_to_dev(ptr);
struct bpf_dtab *dtab;
int i;
switch (event) {
case NETDEV_UNREGISTER:
/* This rcu_read_lock/unlock pair is needed because
* dev_map_list is an RCU list AND to ensure a delete
* operation does not free a netdev_map entry while we
* are comparing it against the netdev being unregistered.
*/
rcu_read_lock();
list_for_each_entry_rcu(dtab, &dev_map_list, list) {
for (i = 0; i < dtab->map.max_entries; i++) {
struct bpf_dtab_netdev *dev, *odev;
dev = READ_ONCE(dtab->netdev_map[i]);
if (!dev || netdev != dev->dev)
continue;
odev = cmpxchg(&dtab->netdev_map[i], dev, NULL);
if (dev == odev)
call_rcu(&dev->rcu,
__dev_map_entry_free);
}
}
rcu_read_unlock();
break;
default:
break;
}
return NOTIFY_OK;
}
static struct notifier_block dev_map_notifier = {
.notifier_call = dev_map_notification,
};
static int __init dev_map_init(void)
{
register_netdevice_notifier(&dev_map_notifier);
return 0;
}
subsys_initcall(dev_map_init);