Diffstat (limited to 'kernel/bpf')
-rw-r--r-- | kernel/bpf/devmap.c | 84
1 file changed, 83 insertions(+), 1 deletion(-)
diff --git a/kernel/bpf/devmap.c b/kernel/bpf/devmap.c
index 36dc13de..b2ef04a 100644
--- a/kernel/bpf/devmap.c
+++ b/kernel/bpf/devmap.c
@@ -53,6 +53,7 @@ struct bpf_dtab_netdev {
 struct bpf_dtab {
 	struct bpf_map map;
 	struct bpf_dtab_netdev **netdev_map;
+	unsigned long int __percpu *flush_needed;
 };

 static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
@@ -87,6 +88,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)

 	/* make sure page count doesn't overflow */
 	cost = (u64) dtab->map.max_entries * sizeof(struct bpf_dtab_netdev *);
+	cost += BITS_TO_LONGS(attr->max_entries) * sizeof(unsigned long);
 	if (cost >= U32_MAX - PAGE_SIZE)
 		goto free_dtab;

@@ -97,6 +99,14 @@
 	if (err)
 		goto free_dtab;

+	/* A per cpu bitfield with a bit per possible net device */
+	dtab->flush_needed = __alloc_percpu(
+				BITS_TO_LONGS(attr->max_entries) *
+				sizeof(unsigned long),
+				__alignof__(unsigned long));
+	if (!dtab->flush_needed)
+		goto free_dtab;
+
 	dtab->netdev_map = bpf_map_area_alloc(dtab->map.max_entries *
 					      sizeof(struct bpf_dtab_netdev *));
 	if (!dtab->netdev_map)
@@ -105,6 +115,7 @@ static struct bpf_map *dev_map_alloc(union bpf_attr *attr)
 	return &dtab->map;

 free_dtab:
+	free_percpu(dtab->flush_needed);
 	kfree(dtab);
 	return ERR_PTR(err);
 }
@@ -112,7 +123,7 @@ free_dtab:
 static void dev_map_free(struct bpf_map *map)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
-	int i;
+	int i, cpu;

 	/* At this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
 	 * so the programs (can be more than one that used this map) were
@@ -123,6 +134,18 @@ static void dev_map_free(struct bpf_map *map)
 	 */
 	synchronize_rcu();

+	/* To ensure all pending flush operations have completed wait for flush
+	 * bitmap to indicate all flush_needed bits to be zero on _all_ cpus.
+	 * Because the above synchronize_rcu() ensures the map is disconnected
+	 * from the program we can assume no new bits will be set.
+	 */
+	for_each_online_cpu(cpu) {
+		unsigned long *bitmap = per_cpu_ptr(dtab->flush_needed, cpu);
+
+		while (!bitmap_empty(bitmap, dtab->map.max_entries))
+			cpu_relax();
+	}
+
 	for (i = 0; i < dtab->map.max_entries; i++) {
 		struct bpf_dtab_netdev *dev;

@@ -137,6 +160,7 @@ static void dev_map_free(struct bpf_map *map)
 	/* At this point bpf program is detached and all pending operations
 	 * _must_ be complete
 	 */
+	free_percpu(dtab->flush_needed);
 	bpf_map_area_free(dtab->netdev_map);
 	kfree(dtab);
 }
@@ -159,6 +183,14 @@ static int dev_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
 	return 0;
 }

+void __dev_map_insert_ctx(struct bpf_map *map, u32 key)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+
+	__set_bit(key, bitmap);
+}
+
 struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 {
 	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
@@ -171,6 +203,39 @@ struct net_device *__dev_map_lookup_elem(struct bpf_map *map, u32 key)
 	return dev ? dev->dev : NULL;
 }

+/* __dev_map_flush is called from xdp_do_flush_map() which _must_ be signaled
+ * from the driver before returning from its napi->poll() routine. The poll()
+ * routine is called either from busy_poll context or net_rx_action signaled
+ * from NET_RX_SOFTIRQ. Either way the poll routine must complete before the
+ * net device can be torn down. On devmap tear down we ensure the ctx bitmap
+ * is zeroed before completing to ensure all flush operations have completed.
+ */
+void __dev_map_flush(struct bpf_map *map)
+{
+	struct bpf_dtab *dtab = container_of(map, struct bpf_dtab, map);
+	unsigned long *bitmap = this_cpu_ptr(dtab->flush_needed);
+	u32 bit;
+
+	for_each_set_bit(bit, bitmap, map->max_entries) {
+		struct bpf_dtab_netdev *dev = READ_ONCE(dtab->netdev_map[bit]);
+		struct net_device *netdev;
+
+		/* This is possible if the dev entry is removed by user space
+		 * between xdp redirect and flush op.
+		 */
+		if (unlikely(!dev))
+			continue;
+
+		netdev = dev->dev;
+
+		__clear_bit(bit, bitmap);
+		if (unlikely(!netdev || !netdev->netdev_ops->ndo_xdp_flush))
+			continue;
+
+		netdev->netdev_ops->ndo_xdp_flush(netdev);
+	}
+}
+
 /* rcu_read_lock (from syscall and BPF contexts) ensures that if a delete and/or
  * update happens in parallel here a dev_put wont happen until after reading the
  * ifindex.
@@ -188,11 +253,28 @@ static void *dev_map_lookup_elem(struct bpf_map *map, void *key)
 	return dev ? &dev->dev->ifindex : NULL;
 }

+static void dev_map_flush_old(struct bpf_dtab_netdev *old_dev)
+{
+	if (old_dev->dev->netdev_ops->ndo_xdp_flush) {
+		struct net_device *fl = old_dev->dev;
+		unsigned long *bitmap;
+		int cpu;
+
+		for_each_online_cpu(cpu) {
+			bitmap = per_cpu_ptr(old_dev->dtab->flush_needed, cpu);
+			__clear_bit(old_dev->key, bitmap);
+
+			fl->netdev_ops->ndo_xdp_flush(old_dev->dev);
+		}
+	}
+}
+
 static void __dev_map_entry_free(struct rcu_head *rcu)
 {
 	struct bpf_dtab_netdev *old_dev;

 	old_dev = container_of(rcu, struct bpf_dtab_netdev, rcu);
+	dev_map_flush_old(old_dev);
 	dev_put(old_dev->dev);
 	kfree(old_dev);
 }
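
Note: the comment above __dev_map_flush() spells out the contract this patch relies on: a driver that redirects frames must call xdp_do_flush_map() before returning from its napi->poll() routine, so the bits set on this CPU by __dev_map_insert_ctx() are drained in the same poll cycle. Below is a minimal, hypothetical sketch of where those calls sit in a driver poll loop; the mydrv_* names and ring structure are invented for illustration, while bpf_prog_run_xdp(), xdp_do_redirect(), xdp_do_flush_map() and napi_complete_done() are existing kernel APIs.

/* Hypothetical driver rx-poll fragment. The mydrv_* helpers and struct are
 * placeholders for driver-specific code; only the XDP calls relate to the
 * devmap flush mechanism added by this patch.
 */
#include <linux/netdevice.h>
#include <linux/filter.h>
#include <linux/bpf.h>

struct mydrv_rx_ring {				/* illustrative driver state */
	struct napi_struct napi;
	struct net_device *netdev;
	struct bpf_prog *xdp_prog;
};

static bool mydrv_rx_pending(struct mydrv_rx_ring *ring);		/* driver-specific */
static void mydrv_fill_xdp_buff(struct mydrv_rx_ring *ring,
				struct xdp_buff *xdp);			/* driver-specific */

static int mydrv_napi_poll(struct napi_struct *napi, int budget)
{
	struct mydrv_rx_ring *ring = container_of(napi, struct mydrv_rx_ring, napi);
	int done = 0;

	while (done < budget && mydrv_rx_pending(ring)) {
		struct xdp_buff xdp;
		u32 act;

		mydrv_fill_xdp_buff(ring, &xdp);
		act = bpf_prog_run_xdp(ring->xdp_prog, &xdp);

		if (act == XDP_REDIRECT)
			/* On a devmap target the redirect path marks the
			 * chosen slot in this CPU's flush_needed bitmap via
			 * __dev_map_insert_ctx().
			 */
			xdp_do_redirect(ring->netdev, &xdp, ring->xdp_prog);
		/* XDP_PASS / XDP_TX / XDP_DROP handling elided */
		done++;
	}

	/* Must run before poll() returns: __dev_map_flush() walks the per-cpu
	 * bitmap and calls ndo_xdp_flush() once per marked device.
	 */
	xdp_do_flush_map();

	if (done < budget)
		napi_complete_done(napi, done);
	return done;
}

Deferring the flush to the end of the poll loop is what the per-cpu flush_needed bitmap buys: many frames can be redirected to the same device during one poll, but ndo_xdp_flush() is invoked only once per marked device.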
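On the allocation side, dev_map_alloc() above sizes both the netdev_map array and the per-cpu flush_needed bitmap from the max_entries value supplied at map creation. A rough user-space sketch of creating and populating such a map follows, assuming the libbpf wrappers of that period (bpf_create_map(), bpf_map_update_elem()); the interface name "eth0" and max_entries of 64 are arbitrary assumptions.

/* Hypothetical user-space sketch: create a BPF_MAP_TYPE_DEVMAP and point
 * slot 0 at an interface. Error handling is minimal; values are examples.
 */
#include <stdio.h>
#include <net/if.h>
#include <linux/bpf.h>
#include <bpf/bpf.h>

int main(void)
{
	__u32 key = 0;
	__u32 ifindex = if_nametoindex("eth0");	/* assumed device name */
	int map_fd;

	/* devmap: 4-byte key (slot index) -> 4-byte value (ifindex) */
	map_fd = bpf_create_map(BPF_MAP_TYPE_DEVMAP, sizeof(__u32),
				sizeof(__u32), 64 /* max_entries */, 0);
	if (map_fd < 0) {
		perror("bpf_create_map");
		return 1;
	}

	if (bpf_map_update_elem(map_fd, &key, &ifindex, 0)) {
		perror("bpf_map_update_elem");
		return 1;
	}

	printf("devmap fd %d, slot %u -> ifindex %u\n", map_fd, key, ifindex);
	return 0;
}

A devmap uses 4-byte keys and 4-byte values, which is why both sizes above are sizeof(__u32); max_entries determines both the netdev_map array length and the BITS_TO_LONGS() sizing of the per-cpu bitmap in dev_map_alloc().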