Diffstat (limited to 'net/core')
-rw-r--r-- | net/core/Makefile          |   2
-rw-r--r-- | net/core/dev.c             | 260
-rw-r--r-- | net/core/dev_ioctl.c       | 132
-rw-r--r-- | net/core/devlink.c         | 596
-rw-r--r-- | net/core/dst.c             |  14
-rw-r--r-- | net/core/ethtool.c         |  14
-rw-r--r-- | net/core/filter.c          | 359
-rw-r--r-- | net/core/flow_dissector.c  |  69
-rw-r--r-- | net/core/gen_estimator.c   |   4
-rw-r--r-- | net/core/gen_stats.c       |   9
-rw-r--r-- | net/core/link_watch.c      |   2
-rw-r--r-- | net/core/neighbour.c       |   1
-rw-r--r-- | net/core/net-procfs.c      |   4
-rw-r--r-- | net/core/net-sysfs.c       |  56
-rw-r--r-- | net/core/net_namespace.c   |  83
-rw-r--r-- | net/core/pktgen.c          | 281
-rw-r--r-- | net/core/rtnetlink.c       | 482
-rw-r--r-- | net/core/skbuff.c          |  14
-rw-r--r-- | net/core/sock.c            |  56
-rw-r--r-- | net/core/sock_reuseport.c  |   4
-rw-r--r-- | net/core/sysctl_net_core.c |  60
-rw-r--r-- | net/core/xdp.c             |  73
22 files changed, 1902 insertions, 673 deletions
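The dev.c, filter.c and (new) xdp.c hunks below register an xdp_rxq_info structure for every RX queue and expose it to BPF programs through two new xdp_md fields, ingress_ifindex and rx_queue_index. As a rough illustration only — the SEC() annotation, the bpf_helpers.h include and bpf_printk() assume a libbpf-style toolchain that is not part of this patch — an XDP program can read the new fields like this:

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>

SEC("xdp")
int xdp_per_queue(struct xdp_md *ctx)
{
        /* Both loads go through the xdp_rxq_info that
         * netif_alloc_rx_queues() now registers per RX queue.
         */
        __u32 ifindex = ctx->ingress_ifindex;
        __u32 queue = ctx->rx_queue_index;

        /* Hypothetical policy: drop traffic arriving on queue 0. */
        if (queue == 0)
                return XDP_DROP;

        bpf_printk("ifindex %u queue %u", ifindex, queue);
        return XDP_PASS;
}

char _license[] SEC("license") = "GPL";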
diff --git a/net/core/Makefile b/net/core/Makefile index 1fd0a9c..6dbbba8 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o \ - fib_notifier.o + fib_notifier.o xdp.o obj-y += net-sysfs.o obj-$(CONFIG_PROC_FS) += net-procfs.o diff --git a/net/core/dev.c b/net/core/dev.c index 613fb40..dda9d7b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1554,6 +1554,23 @@ void dev_disable_lro(struct net_device *dev) } EXPORT_SYMBOL(dev_disable_lro); +/** + * dev_disable_gro_hw - disable HW Generic Receive Offload on a device + * @dev: device + * + * Disable HW Generic Receive Offload (GRO_HW) on a net device. Must be + * called under RTNL. This is needed if Generic XDP is installed on + * the device. + */ +static void dev_disable_gro_hw(struct net_device *dev) +{ + dev->wanted_features &= ~NETIF_F_GRO_HW; + netdev_update_features(dev); + + if (unlikely(dev->features & NETIF_F_GRO_HW)) + netdev_WARN(dev, "failed to disable GRO_HW!\n"); +} + static int call_netdevice_notifier(struct notifier_block *nb, unsigned long val, struct net_device *dev) { @@ -1677,7 +1694,6 @@ EXPORT_SYMBOL(unregister_netdevice_notifier); /** * call_netdevice_notifiers_info - call all network notifier blocks * @val: value passed unmodified to notifier function - * @dev: net_device pointer passed unmodified to notifier function * @info: notifier information data * * Call all network notifier blocks. Parameters and return value @@ -2815,7 +2831,7 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb, segs = skb_mac_gso_segment(skb, features); - if (unlikely(skb_needs_check(skb, tx_path))) + if (unlikely(skb_needs_check(skb, tx_path) && !IS_ERR(segs))) skb_warn_bad_offload(skb); return segs; @@ -3054,7 +3070,7 @@ int skb_csum_hwoffload_help(struct sk_buff *skb, } EXPORT_SYMBOL(skb_csum_hwoffload_help); -static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev) +static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device *dev, bool *again) { netdev_features_t features; @@ -3078,9 +3094,6 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device __skb_linearize(skb)) goto out_kfree_skb; - if (validate_xmit_xfrm(skb, features)) - goto out_kfree_skb; - /* If packet is not checksummed and device does not * support checksumming for this protocol, complete * checksumming here. 
@@ -3097,6 +3110,8 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device } } + skb = validate_xmit_xfrm(skb, features, again); + return skb; out_kfree_skb: @@ -3106,7 +3121,7 @@ out_null: return NULL; } -struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev) +struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *dev, bool *again) { struct sk_buff *next, *head = NULL, *tail; @@ -3117,7 +3132,7 @@ struct sk_buff *validate_xmit_skb_list(struct sk_buff *skb, struct net_device *d /* in case skb wont be segmented, point to itself */ skb->prev = skb; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, again); if (!skb) continue; @@ -3185,6 +3200,21 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, int rc; qdisc_calculate_pkt_len(skb, q); + + if (q->flags & TCQ_F_NOLOCK) { + if (unlikely(test_bit(__QDISC_STATE_DEACTIVATED, &q->state))) { + __qdisc_drop(skb, &to_free); + rc = NET_XMIT_DROP; + } else { + rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; + __qdisc_run(q); + } + + if (unlikely(to_free)) + kfree_skb_list(to_free); + return rc; + } + /* * Heuristic to force contended enqueues to serialize on a * separate lock before trying to get qdisc main lock. @@ -3215,9 +3245,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); - } else - qdisc_run_end(q); + } + qdisc_run_end(q); rc = NET_XMIT_SUCCESS; } else { rc = q->enqueue(skb, q, &to_free) & NET_XMIT_MASK; @@ -3227,6 +3257,7 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, contended = false; } __qdisc_run(q); + qdisc_run_end(q); } } spin_unlock(root_lock); @@ -3399,8 +3430,7 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev, else queue_index = __netdev_pick_tx(dev, skb); - if (!accel_priv) - queue_index = netdev_cap_txqueue(dev, queue_index); + queue_index = netdev_cap_txqueue(dev, queue_index); } skb_set_queue_mapping(skb, queue_index); @@ -3439,6 +3469,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) struct netdev_queue *txq; struct Qdisc *q; int rc = -ENOMEM; + bool again = false; skb_reset_mac_header(skb); @@ -3500,7 +3531,7 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv) XMIT_RECURSION_LIMIT)) goto recursion_alert; - skb = validate_xmit_skb(skb, dev); + skb = validate_xmit_skb(skb, dev, &again); if (!skb) goto out; @@ -3896,9 +3927,33 @@ drop: return NET_RX_DROP; } +static struct netdev_rx_queue *netif_get_rxqueue(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + struct netdev_rx_queue *rxqueue; + + rxqueue = dev->_rx; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); + + return rxqueue; /* Return first rxqueue */ + } + rxqueue += index; + } + return rxqueue; +} + static u32 netif_receive_generic_xdp(struct sk_buff *skb, struct bpf_prog *xdp_prog) { + struct netdev_rx_queue *rxqueue; u32 metalen, act = XDP_DROP; struct xdp_buff xdp; void *orig_data; @@ -3942,6 +3997,9 @@ static u32 netif_receive_generic_xdp(struct sk_buff *skb, xdp.data_hard_start = skb->data - skb_headroom(skb); orig_data = xdp.data; + rxqueue = netif_get_rxqueue(skb); + xdp.rxq = &rxqueue->xdp_rxq; + act = bpf_prog_run_xdp(xdp_prog, &xdp); off = xdp.data - 
orig_data; @@ -4166,21 +4224,26 @@ static __latent_entropy void net_tx_action(struct softirq_action *h) while (head) { struct Qdisc *q = head; - spinlock_t *root_lock; + spinlock_t *root_lock = NULL; head = head->next_sched; - root_lock = qdisc_lock(q); - spin_lock(root_lock); + if (!(q->flags & TCQ_F_NOLOCK)) { + root_lock = qdisc_lock(q); + spin_lock(root_lock); + } /* We need to make sure head->next_sched is read * before clearing __QDISC_STATE_SCHED */ smp_mb__before_atomic(); clear_bit(__QDISC_STATE_SCHED, &q->state); qdisc_run(q); - spin_unlock(root_lock); + if (root_lock) + spin_unlock(root_lock); } } + + xfrm_dev_backlog(sd); } #if IS_ENABLED(CONFIG_BRIDGE) && IS_ENABLED(CONFIG_ATM_LANE) @@ -4568,6 +4631,7 @@ static int generic_xdp_install(struct net_device *dev, struct netdev_bpf *xdp) } else if (new && !old) { static_key_slow_inc(&generic_xdp_needed); dev_disable_lro(dev); + dev_disable_gro_hw(dev); } break; @@ -6371,6 +6435,7 @@ rollback: * netdev_upper_dev_link - Add a link to the upper device * @dev: device * @upper_dev: new upper device + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. The caller must hold * the RTNL lock. On a failure a negative errno code is returned. @@ -6392,6 +6457,7 @@ EXPORT_SYMBOL(netdev_upper_dev_link); * @upper_dev: new upper device * @upper_priv: upper device private * @upper_info: upper info to be passed down via notifier + * @extack: netlink extended ack * * Adds a link to device which is upper to this one. In this case, only * one master upper device can be linked, although other non-master devices @@ -6982,6 +7048,35 @@ int dev_set_mtu(struct net_device *dev, int new_mtu) EXPORT_SYMBOL(dev_set_mtu); /** + * dev_change_tx_queue_len - Change TX queue length of a netdevice + * @dev: device + * @new_len: new tx queue length + */ +int dev_change_tx_queue_len(struct net_device *dev, unsigned long new_len) +{ + unsigned int orig_len = dev->tx_queue_len; + int res; + + if (new_len != (unsigned int)new_len) + return -ERANGE; + + if (new_len != orig_len) { + dev->tx_queue_len = new_len; + res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); + res = notifier_to_errno(res); + if (res) { + netdev_err(dev, + "refused to change device tx_queue_len\n"); + dev->tx_queue_len = orig_len; + return res; + } + return dev_qdisc_change_tx_queue_len(dev); + } + + return 0; +} + +/** * dev_set_group - Change group this device belongs to * @dev: device * @new_group: group this device should belong to @@ -7096,17 +7191,21 @@ int dev_change_proto_down(struct net_device *dev, bool proto_down) } EXPORT_SYMBOL(dev_change_proto_down); -u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op, u32 *prog_id) +void __dev_xdp_query(struct net_device *dev, bpf_op_t bpf_op, + struct netdev_bpf *xdp) { - struct netdev_bpf xdp; - - memset(&xdp, 0, sizeof(xdp)); - xdp.command = XDP_QUERY_PROG; + memset(xdp, 0, sizeof(*xdp)); + xdp->command = XDP_QUERY_PROG; /* Query must always succeed. 
*/ - WARN_ON(bpf_op(dev, &xdp) < 0); - if (prog_id) - *prog_id = xdp.prog_id; + WARN_ON(bpf_op(dev, xdp) < 0); +} + +static u8 __dev_xdp_attached(struct net_device *dev, bpf_op_t bpf_op) +{ + struct netdev_bpf xdp; + + __dev_xdp_query(dev, bpf_op, &xdp); return xdp.prog_attached; } @@ -7129,6 +7228,27 @@ static int dev_xdp_install(struct net_device *dev, bpf_op_t bpf_op, return bpf_op(dev, &xdp); } +static void dev_xdp_uninstall(struct net_device *dev) +{ + struct netdev_bpf xdp; + bpf_op_t ndo_bpf; + + /* Remove generic XDP */ + WARN_ON(dev_xdp_install(dev, generic_xdp_install, NULL, 0, NULL)); + + /* Remove from the driver */ + ndo_bpf = dev->netdev_ops->ndo_bpf; + if (!ndo_bpf) + return; + + __dev_xdp_query(dev, ndo_bpf, &xdp); + if (xdp.prog_attached == XDP_ATTACHED_NONE) + return; + + /* Program removal should always succeed */ + WARN_ON(dev_xdp_install(dev, ndo_bpf, NULL, xdp.prog_flags, NULL)); +} + /** * dev_change_xdp_fd - set or clear a bpf program for a device rx path * @dev: device @@ -7157,10 +7277,10 @@ int dev_change_xdp_fd(struct net_device *dev, struct netlink_ext_ack *extack, bpf_chk = generic_xdp_install; if (fd >= 0) { - if (bpf_chk && __dev_xdp_attached(dev, bpf_chk, NULL)) + if (bpf_chk && __dev_xdp_attached(dev, bpf_chk)) return -EEXIST; if ((flags & XDP_FLAGS_UPDATE_IF_NOEXIST) && - __dev_xdp_attached(dev, bpf_op, NULL)) + __dev_xdp_attached(dev, bpf_op)) return -EBUSY; prog = bpf_prog_get_type_dev(fd, BPF_PROG_TYPE_XDP, @@ -7259,6 +7379,7 @@ static void rollback_registered_many(struct list_head *head) /* Shutdown queueing discipline. */ dev_shutdown(dev); + dev_xdp_uninstall(dev); /* Notify protocols, that we are about to destroy * this device. They should clean all the things. @@ -7268,7 +7389,7 @@ static void rollback_registered_many(struct list_head *head) if (!dev->rtnl_link_ops || dev->rtnl_link_state == RTNL_LINK_INITIALIZED) skb = rtmsg_ifinfo_build_skb(RTM_DELLINK, dev, ~0U, 0, - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); /* * Flush the unicast and multicast chains @@ -7402,6 +7523,18 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, features &= ~dev->gso_partial_features; } + if (!(features & NETIF_F_RXCSUM)) { + /* NETIF_F_GRO_HW implies doing RXCSUM since every packet + * successfully merged by hardware must also have the + * checksum verified by hardware. If the user does not + * want to enable RXCSUM, logically, we should disable GRO_HW. 
+ */ + if (features & NETIF_F_GRO_HW) { + netdev_dbg(dev, "Dropping NETIF_F_GRO_HW since no RXCSUM feature.\n"); + features &= ~NETIF_F_GRO_HW; + } + } + return features; } @@ -7535,12 +7668,12 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); -#ifdef CONFIG_SYSFS static int netif_alloc_rx_queues(struct net_device *dev) { unsigned int i, count = dev->num_rx_queues; struct netdev_rx_queue *rx; size_t sz = count * sizeof(*rx); + int err = 0; BUG_ON(count < 1); @@ -7550,11 +7683,38 @@ static int netif_alloc_rx_queues(struct net_device *dev) dev->_rx = rx; - for (i = 0; i < count; i++) + for (i = 0; i < count; i++) { rx[i].dev = dev; + + /* XDP RX-queue setup */ + err = xdp_rxq_info_reg(&rx[i].xdp_rxq, dev, i); + if (err < 0) + goto err_rxq_info; + } return 0; + +err_rxq_info: + /* Rollback successful reg's and free other resources */ + while (i--) + xdp_rxq_info_unreg(&rx[i].xdp_rxq); + kvfree(dev->_rx); + dev->_rx = NULL; + return err; +} + +static void netif_free_rx_queues(struct net_device *dev) +{ + unsigned int i, count = dev->num_rx_queues; + + /* netif_alloc_rx_queues alloc failed, resources have been unreg'ed */ + if (!dev->_rx) + return; + + for (i = 0; i < count; i++) + xdp_rxq_info_unreg(&dev->_rx[i].xdp_rxq); + + kvfree(dev->_rx); } -#endif static void netdev_init_one_queue(struct net_device *dev, struct netdev_queue *queue, void *_unused) @@ -8115,12 +8275,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, return NULL; } -#ifdef CONFIG_SYSFS if (rxqs < 1) { pr_err("alloc_netdev: Unable to allocate device with zero RX queues\n"); return NULL; } -#endif alloc_size = sizeof(struct net_device); if (sizeof_priv) { @@ -8177,12 +8335,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name, if (netif_alloc_netdev_queues(dev)) goto free_all; -#ifdef CONFIG_SYSFS dev->num_rx_queues = rxqs; dev->real_num_rx_queues = rxqs; if (netif_alloc_rx_queues(dev)) goto free_all; -#endif strcpy(dev->name, name); dev->name_assign_type = name_assign_type; @@ -8218,13 +8374,10 @@ EXPORT_SYMBOL(alloc_netdev_mqs); void free_netdev(struct net_device *dev) { struct napi_struct *p, *n; - struct bpf_prog *prog; might_sleep(); netif_free_tx_queues(dev); -#ifdef CONFIG_SYSFS - kvfree(dev->_rx); -#endif + netif_free_rx_queues(dev); kfree(rcu_dereference_protected(dev->ingress_queue, 1)); @@ -8237,12 +8390,6 @@ void free_netdev(struct net_device *dev) free_percpu(dev->pcpu_refcnt); dev->pcpu_refcnt = NULL; - prog = rcu_dereference_protected(dev->xdp_prog, 1); - if (prog) { - bpf_prog_put(prog); - static_key_slow_dec(&generic_xdp_needed); - } - /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { netdev_freemem(dev); @@ -8355,7 +8502,7 @@ EXPORT_SYMBOL(unregister_netdev); int dev_change_net_namespace(struct net_device *dev, struct net *net, const char *pat) { - int err, new_nsid; + int err, new_nsid, new_ifindex; ASSERT_RTNL(); @@ -8411,11 +8558,16 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char call_netdevice_notifiers(NETDEV_UNREGISTER, dev); rcu_barrier(); call_netdevice_notifiers(NETDEV_UNREGISTER_FINAL, dev); - if (dev->rtnl_link_ops && dev->rtnl_link_ops->get_link_net) - new_nsid = peernet2id_alloc(dev_net(dev), net); + + new_nsid = peernet2id_alloc(dev_net(dev), net); + /* If there is an ifindex conflict assign a new one */ + if (__dev_get_by_index(net, dev->ifindex)) + new_ifindex = dev_new_index(net); else - 
new_nsid = peernet2id(dev_net(dev), net); - rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid); + new_ifindex = dev->ifindex; + + rtmsg_ifinfo_newnet(RTM_DELLINK, dev, ~0U, GFP_KERNEL, &new_nsid, + new_ifindex); /* * Flush the unicast and multicast chains @@ -8429,10 +8581,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Actually switch the network namespace */ dev_net_set(dev, net); - - /* If there is an ifindex conflict assign a new one */ - if (__dev_get_by_index(net, dev->ifindex)) - dev->ifindex = dev_new_index(net); + dev->ifindex = new_ifindex; /* Send a netdev-add uevent to the new namespace */ kobject_uevent(&dev->dev.kobj, KOBJ_ADD); @@ -8830,6 +8979,9 @@ static int __init net_dev_init(void) skb_queue_head_init(&sd->input_pkt_queue); skb_queue_head_init(&sd->process_queue); +#ifdef CONFIG_XFRM_OFFLOAD + skb_queue_head_init(&sd->xfrm_backlog); +#endif INIT_LIST_HEAD(&sd->poll_list); sd->output_queue_tailp = &sd->output_queue; #ifdef CONFIG_RPS diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7e690d0..0ab1af0 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -18,26 +18,10 @@ * match. --pb */ -static int dev_ifname(struct net *net, struct ifreq __user *arg) +static int dev_ifname(struct net *net, struct ifreq *ifr) { - struct ifreq ifr; - int error; - - /* - * Fetch the caller's info block. - */ - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - ifr.ifr_name[IFNAMSIZ-1] = 0; - - error = netdev_get_name(net, ifr.ifr_name, ifr.ifr_ifindex); - if (error) - return error; - - if (copy_to_user(arg, &ifr, sizeof(struct ifreq))) - return -EFAULT; - return 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; + return netdev_get_name(net, ifr->ifr_name, ifr->ifr_ifindex); } static gifconf_func_t *gifconf_list[NPROTO]; @@ -66,9 +50,8 @@ EXPORT_SYMBOL(register_gifconf); * Thus we will need a 'compatibility mode'. */ -static int dev_ifconf(struct net *net, char __user *arg) +int dev_ifconf(struct net *net, struct ifconf *ifc, int size) { - struct ifconf ifc; struct net_device *dev; char __user *pos; int len; @@ -79,11 +62,8 @@ static int dev_ifconf(struct net *net, char __user *arg) * Fetch the caller's info block. */ - if (copy_from_user(&ifc, arg, sizeof(struct ifconf))) - return -EFAULT; - - pos = ifc.ifc_buf; - len = ifc.ifc_len; + pos = ifc->ifc_buf; + len = ifc->ifc_len; /* * Loop over the interfaces, and write an info block for each. @@ -95,10 +75,10 @@ static int dev_ifconf(struct net *net, char __user *arg) if (gifconf_list[i]) { int done; if (!pos) - done = gifconf_list[i](dev, NULL, 0); + done = gifconf_list[i](dev, NULL, 0, size); else done = gifconf_list[i](dev, pos + total, - len - total); + len - total, size); if (done < 0) return -EFAULT; total += done; @@ -109,12 +89,12 @@ static int dev_ifconf(struct net *net, char __user *arg) /* * All done. Write the updated control block back to the caller. */ - ifc.ifc_len = total; + ifc->ifc_len = total; /* * Both BSD and Solaris return 0 here, so we do too. */ - return copy_to_user(arg, &ifc, sizeof(struct ifconf)) ? -EFAULT : 0; + return 0; } /* @@ -406,53 +386,24 @@ EXPORT_SYMBOL(dev_load); * positive or a negative errno code on error. 
*/ -int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) +int dev_ioctl(struct net *net, unsigned int cmd, struct ifreq *ifr, bool *need_copyout) { - struct ifreq ifr; int ret; char *colon; - /* One special case: SIOCGIFCONF takes ifconf argument - and requires shared lock, because it sleeps writing - to user space. - */ - - if (cmd == SIOCGIFCONF) { - rtnl_lock(); - ret = dev_ifconf(net, (char __user *) arg); - rtnl_unlock(); - return ret; - } + if (need_copyout) + *need_copyout = true; if (cmd == SIOCGIFNAME) - return dev_ifname(net, (struct ifreq __user *)arg); - - /* - * Take care of Wireless Extensions. Unfortunately struct iwreq - * isn't a proper subset of struct ifreq (it's 8 byte shorter) - * so we need to treat it specially, otherwise applications may - * fault if the struct they're passing happens to land at the - * end of a mapped page. - */ - if (cmd >= SIOCIWFIRST && cmd <= SIOCIWLAST) { - struct iwreq iwr; - - if (copy_from_user(&iwr, arg, sizeof(iwr))) - return -EFAULT; - - iwr.ifr_name[sizeof(iwr.ifr_name) - 1] = 0; + return dev_ifname(net, ifr); - return wext_handle_ioctl(net, &iwr, cmd, arg); - } - - if (copy_from_user(&ifr, arg, sizeof(struct ifreq))) - return -EFAULT; - - ifr.ifr_name[IFNAMSIZ-1] = 0; + ifr->ifr_name[IFNAMSIZ-1] = 0; - colon = strchr(ifr.ifr_name, ':'); + colon = strchr(ifr->ifr_name, ':'); if (colon) *colon = 0; + dev_load(net, ifr->ifr_name); + /* * See which interface the caller is talking about. */ @@ -472,31 +423,19 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCGIFMAP: case SIOCGIFINDEX: case SIOCGIFTXQLEN: - dev_load(net, ifr.ifr_name); rcu_read_lock(); - ret = dev_ifsioc_locked(net, &ifr, cmd); + ret = dev_ifsioc_locked(net, ifr, cmd); rcu_read_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; case SIOCETHTOOL: - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ethtool(net, &ifr); + ret = dev_ethtool(net, ifr); rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; /* @@ -510,17 +449,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) case SIOCSIFNAME: if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) return -EPERM; - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); - if (!ret) { - if (colon) - *colon = ':'; - if (copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; - } + if (colon) + *colon = ':'; return ret; /* @@ -561,10 +494,11 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) /* fall through */ case SIOCBONDSLAVEINFOQUERY: case SIOCBONDINFOQUERY: - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); + if (need_copyout) + *need_copyout = false; return ret; case SIOCGIFMEM: @@ -584,13 +518,9 @@ int dev_ioctl(struct net *net, unsigned int cmd, void __user *arg) cmd == SIOCGHWTSTAMP || (cmd >= SIOCDEVPRIVATE && cmd <= SIOCDEVPRIVATE + 15)) { - dev_load(net, ifr.ifr_name); rtnl_lock(); - ret = dev_ifsioc(net, &ifr, cmd); + ret = dev_ifsioc(net, ifr, cmd); rtnl_unlock(); - if (!ret && copy_to_user(arg, &ifr, - sizeof(struct ifreq))) - ret = -EFAULT; return ret; } return -ENOTTY; diff --git a/net/core/devlink.c b/net/core/devlink.c index 7d430c1..18d385e 
100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -92,12 +92,6 @@ static LIST_HEAD(devlink_list); */ static DEFINE_MUTEX(devlink_mutex); -/* devlink_port_mutex - * - * Shared lock to guard lists of ports in all devlink devices. - */ -static DEFINE_MUTEX(devlink_port_mutex); - static struct net *devlink_net(const struct devlink *devlink) { return read_pnet(&devlink->_net); @@ -335,15 +329,18 @@ devlink_sb_tc_index_get_from_info(struct devlink_sb *devlink_sb, #define DEVLINK_NL_FLAG_NEED_DEVLINK BIT(0) #define DEVLINK_NL_FLAG_NEED_PORT BIT(1) #define DEVLINK_NL_FLAG_NEED_SB BIT(2) -#define DEVLINK_NL_FLAG_LOCK_PORTS BIT(3) - /* port is not needed but we need to ensure they don't - * change in the middle of command - */ + +/* The per devlink instance lock is taken by default in the pre-doit + * operation, yet several commands do not require this. The global + * devlink lock is taken and protects from disruption by user-calls. + */ +#define DEVLINK_NL_FLAG_NO_LOCK BIT(3) static int devlink_nl_pre_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { struct devlink *devlink; + int err; mutex_lock(&devlink_mutex); devlink = devlink_get_from_info(info); @@ -351,44 +348,47 @@ static int devlink_nl_pre_doit(const struct genl_ops *ops, mutex_unlock(&devlink_mutex); return PTR_ERR(devlink); } + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_lock(&devlink->lock); if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_DEVLINK) { info->user_ptr[0] = devlink; } else if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) { struct devlink_port *devlink_port; - mutex_lock(&devlink_port_mutex); devlink_port = devlink_port_get_from_info(devlink, info); if (IS_ERR(devlink_port)) { - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_port); + err = PTR_ERR(devlink_port); + goto unlock; } info->user_ptr[0] = devlink_port; } - if (ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) { - mutex_lock(&devlink_port_mutex); - } if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_SB) { struct devlink_sb *devlink_sb; devlink_sb = devlink_sb_get_from_info(devlink, info); if (IS_ERR(devlink_sb)) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) - mutex_unlock(&devlink_port_mutex); - mutex_unlock(&devlink_mutex); - return PTR_ERR(devlink_sb); + err = PTR_ERR(devlink_sb); + goto unlock; } info->user_ptr[1] = devlink_sb; } return 0; + +unlock: + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); + mutex_unlock(&devlink_mutex); + return err; } static void devlink_nl_post_doit(const struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) { - if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT || - ops->internal_flags & DEVLINK_NL_FLAG_LOCK_PORTS) - mutex_unlock(&devlink_port_mutex); + struct devlink *devlink; + + devlink = devlink_get_from_info(info); + if (~ops->internal_flags & DEVLINK_NL_FLAG_NO_LOCK) + mutex_unlock(&devlink->lock); mutex_unlock(&devlink_mutex); } @@ -614,10 +614,10 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_port, &devlink->port_list, list) { if (idx < start) { idx++; @@ -628,13 +628,15 @@ static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) 
+ if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -801,6 +803,7 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk))) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { if (idx < start) { idx++; @@ -811,10 +814,13 @@ static int devlink_nl_cmd_sb_get_dumpit(struct sk_buff *msg, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, NLM_F_MULTI); - if (err) + if (err) { + mutex_unlock(&devlink->lock); goto out; + } idx++; } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -935,14 +941,18 @@ static int devlink_nl_cmd_sb_pool_get_dumpit(struct sk_buff *msg, if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: mutex_unlock(&devlink_mutex); @@ -1123,22 +1133,24 @@ static int devlink_nl_cmd_sb_port_pool_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_port_pool_get) continue; + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_port_pool_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1347,23 +1359,26 @@ devlink_nl_cmd_sb_tc_pool_bind_get_dumpit(struct sk_buff *msg, int err; mutex_lock(&devlink_mutex); - mutex_lock(&devlink_port_mutex); list_for_each_entry(devlink, &devlink_list, list) { if (!net_eq(devlink_net(devlink), sock_net(msg->sk)) || !devlink->ops || !devlink->ops->sb_tc_pool_bind_get) continue; + + mutex_lock(&devlink->lock); list_for_each_entry(devlink_sb, &devlink->sb_list, list) { err = __sb_tc_pool_bind_get_dumpit(msg, start, &idx, devlink, devlink_sb, NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq); - if (err && err != -EOPNOTSUPP) + if (err && err != -EOPNOTSUPP) { + mutex_unlock(&devlink->lock); goto out; + } } + mutex_unlock(&devlink->lock); } out: - mutex_unlock(&devlink_port_mutex); mutex_unlock(&devlink_mutex); cb->args[0] = idx; @@ -1679,6 +1694,12 @@ static int devlink_dpipe_table_put(struct sk_buff *skb, table->counters_enabled)) goto nla_put_failure; + if (table->resource_valid) { + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_ID, + table->resource_id, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_DPIPE_TABLE_RESOURCE_UNITS, + table->resource_units, DEVLINK_ATTR_PAD); + } if (devlink_dpipe_matches_put(table, skb)) goto nla_put_failure; @@ -2273,6 +2294,273 @@ static int devlink_nl_cmd_dpipe_table_counters_set(struct sk_buff *skb, counters_enable); } +static struct devlink_resource * 
+devlink_resource_find(struct devlink *devlink, + struct devlink_resource *resource, u64 resource_id) +{ + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + struct devlink_resource *child_resource; + + if (resource->id == resource_id) + return resource; + + child_resource = devlink_resource_find(devlink, resource, + resource_id); + if (child_resource) + return child_resource; + } + return NULL; +} + +static void +devlink_resource_validate_children(struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + bool size_valid = true; + u64 parts_size = 0; + + if (list_empty(&resource->resource_list)) + goto out; + + list_for_each_entry(child_resource, &resource->resource_list, list) + parts_size += child_resource->size_new; + + if (parts_size > resource->size) + size_valid = false; +out: + resource->size_valid = size_valid; +} + +static int devlink_nl_cmd_resource_set(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + u64 resource_id; + u64 size; + int err; + + if (!info->attrs[DEVLINK_ATTR_RESOURCE_ID] || + !info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]) + return -EINVAL; + resource_id = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_ID]); + + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) + return -EINVAL; + + if (!resource->resource_ops->size_validate) + return -EINVAL; + + size = nla_get_u64(info->attrs[DEVLINK_ATTR_RESOURCE_SIZE]); + err = resource->resource_ops->size_validate(devlink, size, + info->extack); + if (err) + return err; + + resource->size_new = size; + devlink_resource_validate_children(resource); + if (resource->parent) + devlink_resource_validate_children(resource->parent); + return 0; +} + +static void +devlink_resource_size_params_put(struct devlink_resource *resource, + struct sk_buff *skb) +{ + struct devlink_resource_size_params *size_params; + + size_params = resource->size_params; + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_GRAN, + size_params->size_granularity, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MAX, + size_params->size_max, DEVLINK_ATTR_PAD); + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_MIN, + size_params->size_min, DEVLINK_ATTR_PAD); + nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_UNIT, size_params->unit); +} + +static int devlink_resource_put(struct devlink *devlink, struct sk_buff *skb, + struct devlink_resource *resource) +{ + struct devlink_resource *child_resource; + struct nlattr *child_resource_attr; + struct nlattr *resource_attr; + + resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE); + if (!resource_attr) + return -EMSGSIZE; + + if (nla_put_string(skb, DEVLINK_ATTR_RESOURCE_NAME, resource->name) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE, resource->size, + DEVLINK_ATTR_PAD) || + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_ID, resource->id, + DEVLINK_ATTR_PAD)) + goto nla_put_failure; + if (resource->size != resource->size_new) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_SIZE_NEW, + resource->size_new, DEVLINK_ATTR_PAD); + if (resource->resource_ops && resource->resource_ops->occ_get) + nla_put_u64_64bit(skb, DEVLINK_ATTR_RESOURCE_OCC, + resource->resource_ops->occ_get(devlink), + DEVLINK_ATTR_PAD); + devlink_resource_size_params_put(resource, skb); + if (list_empty(&resource->resource_list)) + goto out; + + 
if (nla_put_u8(skb, DEVLINK_ATTR_RESOURCE_SIZE_VALID, + resource->size_valid)) + goto nla_put_failure; + + child_resource_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!child_resource_attr) + goto nla_put_failure; + + list_for_each_entry(child_resource, &resource->resource_list, list) { + if (devlink_resource_put(devlink, skb, child_resource)) + goto resource_put_failure; + } + + nla_nest_end(skb, child_resource_attr); +out: + nla_nest_end(skb, resource_attr); + return 0; + +resource_put_failure: + nla_nest_cancel(skb, child_resource_attr); +nla_put_failure: + nla_nest_cancel(skb, resource_attr); + return -EMSGSIZE; +} + +static int devlink_resource_fill(struct genl_info *info, + enum devlink_command cmd, int flags) +{ + struct devlink *devlink = info->user_ptr[0]; + struct devlink_resource *resource; + struct nlattr *resources_attr; + struct sk_buff *skb = NULL; + struct nlmsghdr *nlh; + bool incomplete; + void *hdr; + int i; + int err; + + resource = list_first_entry(&devlink->resource_list, + struct devlink_resource, list); +start_again: + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + return err; + + hdr = genlmsg_put(skb, info->snd_portid, info->snd_seq, + &devlink_nl_family, NLM_F_MULTI, cmd); + if (!hdr) { + nlmsg_free(skb); + return -EMSGSIZE; + } + + if (devlink_nl_put_handle(skb, devlink)) + goto nla_put_failure; + + resources_attr = nla_nest_start(skb, DEVLINK_ATTR_RESOURCE_LIST); + if (!resources_attr) + goto nla_put_failure; + + incomplete = false; + i = 0; + list_for_each_entry_from(resource, &devlink->resource_list, list) { + err = devlink_resource_put(devlink, skb, resource); + if (err) { + if (!i) + goto err_resource_put; + incomplete = true; + break; + } + i++; + } + nla_nest_end(skb, resources_attr); + genlmsg_end(skb, hdr); + if (incomplete) + goto start_again; +send_done: + nlh = nlmsg_put(skb, info->snd_portid, info->snd_seq, + NLMSG_DONE, 0, flags | NLM_F_MULTI); + if (!nlh) { + err = devlink_dpipe_send_and_alloc_skb(&skb, info); + if (err) + goto err_skb_send_alloc; + goto send_done; + } + return genlmsg_reply(skb, info); + +nla_put_failure: + err = -EMSGSIZE; +err_resource_put: +err_skb_send_alloc: + genlmsg_cancel(skb, hdr); + nlmsg_free(skb); + return err; +} + +static int devlink_nl_cmd_resource_dump(struct sk_buff *skb, + struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + + if (list_empty(&devlink->resource_list)) + return -EOPNOTSUPP; + + return devlink_resource_fill(info, DEVLINK_CMD_RESOURCE_DUMP, 0); +} + +static int +devlink_resources_validate(struct devlink *devlink, + struct devlink_resource *resource, + struct genl_info *info) +{ + struct list_head *resource_list; + int err = 0; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + list_for_each_entry(resource, resource_list, list) { + if (!resource->size_valid) + return -EINVAL; + err = devlink_resources_validate(devlink, resource, info); + if (err) + return err; + } + return err; +} + +static int devlink_nl_cmd_reload(struct sk_buff *skb, struct genl_info *info) +{ + struct devlink *devlink = info->user_ptr[0]; + int err; + + if (!devlink->ops->reload) + return -EOPNOTSUPP; + + err = devlink_resources_validate(devlink, NULL, info); + if (err) { + NL_SET_ERR_MSG_MOD(info->extack, "resources size validation failed"); + return err; + } + return devlink->ops->reload(devlink); +} + static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_BUS_NAME] = { 
.type = NLA_NUL_STRING }, [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING }, @@ -2291,6 +2579,8 @@ static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = { [DEVLINK_ATTR_ESWITCH_ENCAP_MODE] = { .type = NLA_U8 }, [DEVLINK_ATTR_DPIPE_TABLE_NAME] = { .type = NLA_NUL_STRING }, [DEVLINK_ATTR_DPIPE_TABLE_COUNTERS_ENABLED] = { .type = NLA_U8 }, + [DEVLINK_ATTR_RESOURCE_ID] = { .type = NLA_U64}, + [DEVLINK_ATTR_RESOURCE_SIZE] = { .type = NLA_U64}, }; static const struct genl_ops devlink_nl_ops[] = { @@ -2322,14 +2612,16 @@ static const struct genl_ops devlink_nl_ops[] = { .doit = devlink_nl_cmd_port_split_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_PORT_UNSPLIT, .doit = devlink_nl_cmd_port_unsplit_doit, .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, }, { .cmd = DEVLINK_CMD_SB_GET, @@ -2397,8 +2689,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_SB_OCC_MAX_CLEAR, @@ -2406,8 +2697,7 @@ static const struct genl_ops devlink_nl_ops[] = { .policy = devlink_nl_policy, .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | - DEVLINK_NL_FLAG_NEED_SB | - DEVLINK_NL_FLAG_LOCK_PORTS, + DEVLINK_NL_FLAG_NEED_SB, }, { .cmd = DEVLINK_CMD_ESWITCH_GET, @@ -2451,6 +2741,28 @@ static const struct genl_ops devlink_nl_ops[] = { .flags = GENL_ADMIN_PERM, .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, }, + { + .cmd = DEVLINK_CMD_RESOURCE_SET, + .doit = devlink_nl_cmd_resource_set, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RESOURCE_DUMP, + .doit = devlink_nl_cmd_resource_dump, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK, + }, + { + .cmd = DEVLINK_CMD_RELOAD, + .doit = devlink_nl_cmd_reload, + .policy = devlink_nl_policy, + .flags = GENL_ADMIN_PERM, + .internal_flags = DEVLINK_NL_FLAG_NEED_DEVLINK | + DEVLINK_NL_FLAG_NO_LOCK, + }, }; static struct genl_family devlink_nl_family __ro_after_init = { @@ -2488,6 +2800,8 @@ struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size) INIT_LIST_HEAD(&devlink->port_list); INIT_LIST_HEAD(&devlink->sb_list); INIT_LIST_HEAD_RCU(&devlink->dpipe_table_list); + INIT_LIST_HEAD(&devlink->resource_list); + mutex_init(&devlink->lock); return devlink; } EXPORT_SYMBOL_GPL(devlink_alloc); @@ -2550,16 +2864,16 @@ int devlink_port_register(struct devlink *devlink, struct devlink_port *devlink_port, unsigned int port_index) { - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); if (devlink_port_index_exists(devlink, port_index)) { - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); return -EEXIST; } devlink_port->devlink = devlink; devlink_port->index = port_index; devlink_port->registered = true; list_add_tail(&devlink_port->list, &devlink->port_list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW); return 0; } @@ -2572,10 +2886,12 @@ 
EXPORT_SYMBOL_GPL(devlink_port_register); */ void devlink_port_unregister(struct devlink_port *devlink_port) { + struct devlink *devlink = devlink_port->devlink; + devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); - mutex_lock(&devlink_port_mutex); + mutex_lock(&devlink->lock); list_del(&devlink_port->list); - mutex_unlock(&devlink_port_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_port_unregister); @@ -2651,7 +2967,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, struct devlink_sb *devlink_sb; int err = 0; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); if (devlink_sb_index_exists(devlink, sb_index)) { err = -EEXIST; goto unlock; @@ -2670,7 +2986,7 @@ int devlink_sb_register(struct devlink *devlink, unsigned int sb_index, devlink_sb->egress_tc_count = egress_tc_count; list_add_tail(&devlink_sb->list, &devlink->sb_list); unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return err; } EXPORT_SYMBOL_GPL(devlink_sb_register); @@ -2679,11 +2995,11 @@ void devlink_sb_unregister(struct devlink *devlink, unsigned int sb_index) { struct devlink_sb *devlink_sb; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink_sb = devlink_sb_get_by_index(devlink, sb_index); WARN_ON(!devlink_sb); list_del(&devlink_sb->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree(devlink_sb); } EXPORT_SYMBOL_GPL(devlink_sb_unregister); @@ -2699,9 +3015,9 @@ EXPORT_SYMBOL_GPL(devlink_sb_unregister); int devlink_dpipe_headers_register(struct devlink *devlink, struct devlink_dpipe_headers *dpipe_headers) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = dpipe_headers; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); @@ -2715,9 +3031,9 @@ EXPORT_SYMBOL_GPL(devlink_dpipe_headers_register); */ void devlink_dpipe_headers_unregister(struct devlink *devlink) { - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); devlink->dpipe_headers = NULL; - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_headers_unregister); @@ -2783,9 +3099,9 @@ int devlink_dpipe_table_register(struct devlink *devlink, table->priv = priv; table->counter_control_extern = counter_control_extern; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); list_add_tail_rcu(&table->list, &devlink->dpipe_table_list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); return 0; } EXPORT_SYMBOL_GPL(devlink_dpipe_table_register); @@ -2801,20 +3117,182 @@ void devlink_dpipe_table_unregister(struct devlink *devlink, { struct devlink_dpipe_table *table; - mutex_lock(&devlink_mutex); + mutex_lock(&devlink->lock); table = devlink_dpipe_table_find(&devlink->dpipe_table_list, table_name); if (!table) goto unlock; list_del_rcu(&table->list); - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); kfree_rcu(table, rcu); return; unlock: - mutex_unlock(&devlink_mutex); + mutex_unlock(&devlink->lock); } EXPORT_SYMBOL_GPL(devlink_dpipe_table_unregister); +/** + * devlink_resource_register - devlink resource register + * + * @devlink: devlink + * @resource_name: resource's name + * @top_hierarchy: top hierarchy + * @reload_required: reload is required for new configuration to + * apply + * @resource_size: resource's size + * @resource_id: resource's id + * @parent_reosurce_id: resource's parent id + * @size params: size parameters + * @resource_ops: resource ops + 
*/ +int devlink_resource_register(struct devlink *devlink, + const char *resource_name, + bool top_hierarchy, + u64 resource_size, + u64 resource_id, + u64 parent_resource_id, + struct devlink_resource_size_params *size_params, + const struct devlink_resource_ops *resource_ops) +{ + struct devlink_resource *resource; + struct list_head *resource_list; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (resource) { + err = -EINVAL; + goto out; + } + + resource = kzalloc(sizeof(*resource), GFP_KERNEL); + if (!resource) { + err = -ENOMEM; + goto out; + } + + if (top_hierarchy) { + resource_list = &devlink->resource_list; + } else { + struct devlink_resource *parent_resource; + + parent_resource = devlink_resource_find(devlink, NULL, + parent_resource_id); + if (parent_resource) { + resource_list = &parent_resource->resource_list; + resource->parent = parent_resource; + } else { + kfree(resource); + err = -EINVAL; + goto out; + } + } + + resource->name = resource_name; + resource->size = resource_size; + resource->size_new = resource_size; + resource->id = resource_id; + resource->resource_ops = resource_ops; + resource->size_valid = true; + resource->size_params = size_params; + INIT_LIST_HEAD(&resource->resource_list); + list_add_tail(&resource->list, resource_list); +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_register); + +/** + * devlink_resources_unregister - free all resources + * + * @devlink: devlink + * @resource: resource + */ +void devlink_resources_unregister(struct devlink *devlink, + struct devlink_resource *resource) +{ + struct devlink_resource *tmp, *child_resource; + struct list_head *resource_list; + + if (resource) + resource_list = &resource->resource_list; + else + resource_list = &devlink->resource_list; + + if (!resource) + mutex_lock(&devlink->lock); + + list_for_each_entry_safe(child_resource, tmp, resource_list, list) { + devlink_resources_unregister(devlink, child_resource); + list_del(&child_resource->list); + kfree(child_resource); + } + + if (!resource) + mutex_unlock(&devlink->lock); +} +EXPORT_SYMBOL_GPL(devlink_resources_unregister); + +/** + * devlink_resource_size_get - get and update size + * + * @devlink: devlink + * @resource_id: the requested resource id + * @p_resource_size: ptr to update + */ +int devlink_resource_size_get(struct devlink *devlink, + u64 resource_id, + u64 *p_resource_size) +{ + struct devlink_resource *resource; + int err = 0; + + mutex_lock(&devlink->lock); + resource = devlink_resource_find(devlink, NULL, resource_id); + if (!resource) { + err = -EINVAL; + goto out; + } + *p_resource_size = resource->size_new; + resource->size = resource->size_new; +out: + mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_resource_size_get); + +/** + * devlink_dpipe_table_resource_set - set the resource id + * + * @devlink: devlink + * @table_name: table name + * @resource_id: resource id + * @resource_units: number of resource's units consumed per table's entry + */ +int devlink_dpipe_table_resource_set(struct devlink *devlink, + const char *table_name, u64 resource_id, + u64 resource_units) +{ + struct devlink_dpipe_table *table; + int err = 0; + + mutex_lock(&devlink->lock); + table = devlink_dpipe_table_find(&devlink->dpipe_table_list, + table_name); + if (!table) { + err = -EINVAL; + goto out; + } + table->resource_id = resource_id; + table->resource_units = resource_units; + table->resource_valid = true; +out: 
+ mutex_unlock(&devlink->lock); + return err; +} +EXPORT_SYMBOL_GPL(devlink_dpipe_table_resource_set); + static int __init devlink_module_init(void) { return genl_register_family(&devlink_nl_family); diff --git a/net/core/dst.c b/net/core/dst.c index 662a2d4..007aa0b 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -21,6 +21,7 @@ #include <linux/sched.h> #include <linux/prefetch.h> #include <net/lwtunnel.h> +#include <net/xfrm.h> #include <net/dst.h> #include <net/dst_metadata.h> @@ -62,15 +63,12 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, struct net_device *dev, int initial_ref, int initial_obsolete, unsigned short flags) { - dst->child = NULL; dst->dev = dev; if (dev) dev_hold(dev); dst->ops = ops; dst_init_metrics(dst, dst_default_metrics.metrics, true); dst->expires = 0UL; - dst->path = dst; - dst->from = NULL; #ifdef CONFIG_XFRM dst->xfrm = NULL; #endif @@ -88,7 +86,6 @@ void dst_init(struct dst_entry *dst, struct dst_ops *ops, dst->__use = 0; dst->lastuse = jiffies; dst->flags = flags; - dst->next = NULL; if (!(flags & DST_NOCOUNT)) dst_entries_add(ops, 1); } @@ -116,12 +113,17 @@ EXPORT_SYMBOL(dst_alloc); struct dst_entry *dst_destroy(struct dst_entry * dst) { - struct dst_entry *child; + struct dst_entry *child = NULL; smp_rmb(); - child = dst->child; +#ifdef CONFIG_XFRM + if (dst->xfrm) { + struct xfrm_dst *xdst = (struct xfrm_dst *) dst; + child = xdst->child; + } +#endif if (!(dst->flags & DST_NOCOUNT)) dst_entries_add(dst->ops, -1); diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 8225416..107b122 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -73,6 +73,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_LLTX_BIT] = "tx-lockless", [NETIF_F_NETNS_LOCAL_BIT] = "netns-local", [NETIF_F_GRO_BIT] = "rx-gro", + [NETIF_F_GRO_HW_BIT] = "rx-gro-hw", [NETIF_F_LRO_BIT] = "rx-lro", [NETIF_F_TSO_BIT] = "tx-tcp-segmentation", @@ -1692,14 +1693,23 @@ static int ethtool_get_ringparam(struct net_device *dev, void __user *useraddr) static int ethtool_set_ringparam(struct net_device *dev, void __user *useraddr) { - struct ethtool_ringparam ringparam; + struct ethtool_ringparam ringparam, max = { .cmd = ETHTOOL_GRINGPARAM }; - if (!dev->ethtool_ops->set_ringparam) + if (!dev->ethtool_ops->set_ringparam || !dev->ethtool_ops->get_ringparam) return -EOPNOTSUPP; if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) return -EFAULT; + dev->ethtool_ops->get_ringparam(dev, &max); + + /* ensure new ring parameters are within the maximums */ + if (ringparam.rx_pending > max.rx_max_pending || + ringparam.rx_mini_pending > max.rx_mini_max_pending || + ringparam.rx_jumbo_pending > max.rx_jumbo_max_pending || + ringparam.tx_pending > max.tx_max_pending) + return -EINVAL; + return dev->ethtool_ops->set_ringparam(dev, &ringparam); } diff --git a/net/core/filter.c b/net/core/filter.c index 1c0eb43..08ab4c6 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -401,8 +401,8 @@ do_pass: /* Classic BPF expects A and X to be reset first. These need * to be guaranteed to be the first two instructions. */ - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); - *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *new_insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_X, BPF_REG_X); /* All programs must keep CTX in callee saved BPF_REG_CTX. 
* In eBPF case it's done by the compiler, here we need to @@ -459,8 +459,15 @@ do_pass: break; if (fp->code == (BPF_ALU | BPF_DIV | BPF_X) || - fp->code == (BPF_ALU | BPF_MOD | BPF_X)) + fp->code == (BPF_ALU | BPF_MOD | BPF_X)) { *insn++ = BPF_MOV32_REG(BPF_REG_X, BPF_REG_X); + /* Error with exception code on div/mod by 0. + * For cBPF programs, this was always return 0. + */ + *insn++ = BPF_JMP_IMM(BPF_JNE, BPF_REG_X, 0, 2); + *insn++ = BPF_ALU32_REG(BPF_XOR, BPF_REG_A, BPF_REG_A); + *insn++ = BPF_EXIT_INSN(); + } *insn = BPF_RAW_INSN(fp->code, BPF_REG_A, BPF_REG_X, 0, fp->k); break; @@ -2686,8 +2693,9 @@ static int __xdp_generic_ok_fwd_dev(struct sk_buff *skb, struct net_device *fwd) return 0; } -int xdp_do_generic_redirect_map(struct net_device *dev, struct sk_buff *skb, - struct bpf_prog *xdp_prog) +static int xdp_do_generic_redirect_map(struct net_device *dev, + struct sk_buff *skb, + struct bpf_prog *xdp_prog) { struct redirect_info *ri = this_cpu_ptr(&redirect_info); unsigned long map_owner = ri->map_owner; @@ -2864,7 +2872,7 @@ static const struct bpf_func_proto bpf_skb_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; static unsigned short bpf_tunnel_key_af(u64 flags) @@ -3015,6 +3023,8 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE; if (flags & BPF_F_DONT_FRAGMENT) info->key.tun_flags |= TUNNEL_DONT_FRAGMENT; + if (flags & BPF_F_ZERO_CSUM_TX) + info->key.tun_flags &= ~TUNNEL_CSUM; info->key.tun_id = cpu_to_be64(from->tunnel_id); info->key.tos = from->tunnel_tos; @@ -3028,8 +3038,6 @@ BPF_CALL_4(bpf_skb_set_tunnel_key, struct sk_buff *, skb, IPV6_FLOWLABEL_MASK; } else { info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4); - if (flags & BPF_F_ZERO_CSUM_TX) - info->key.tun_flags &= ~TUNNEL_CSUM; } return 0; @@ -3153,7 +3161,7 @@ static const struct bpf_func_proto bpf_xdp_event_output_proto = { .arg2_type = ARG_CONST_MAP_PTR, .arg3_type = ARG_ANYTHING, .arg4_type = ARG_PTR_TO_MEM, - .arg5_type = ARG_CONST_SIZE, + .arg5_type = ARG_CONST_SIZE_OR_ZERO, }; BPF_CALL_1(bpf_get_socket_cookie, struct sk_buff *, skb) @@ -3231,6 +3239,29 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ret = -EINVAL; } #ifdef CONFIG_INET +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + if (optlen != sizeof(int) || sk->sk_family != AF_INET6) + return -EINVAL; + + val = *((int *)optval); + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + if (val < -1 || val > 0xff) { + ret = -EINVAL; + } else { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (val == -1) + val = 0; + np->tclass = val; + } + break; + default: + ret = -EINVAL; + } +#endif } else if (level == SOL_TCP && sk->sk_prot->setsockopt == tcp_setsockopt) { if (optname == TCP_CONGESTION) { @@ -3240,7 +3271,8 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, strncpy(name, optval, min_t(long, optlen, TCP_CA_NAME_MAX-1)); name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, reinit); + ret = tcp_set_congestion_control(sk, name, false, + reinit); } else { struct tcp_sock *tp = tcp_sk(sk); @@ -3306,6 +3338,22 @@ BPF_CALL_5(bpf_getsockopt, struct bpf_sock_ops_kern *, bpf_sock, } else { goto err_clear; } +#if IS_ENABLED(CONFIG_IPV6) + } else if (level == SOL_IPV6) { + struct ipv6_pinfo *np = inet6_sk(sk); + + if (optlen != sizeof(int) || 
sk->sk_family != AF_INET6) + goto err_clear; + + /* Only some options are supported */ + switch (optname) { + case IPV6_TCLASS: + *((int *)optval) = (int)np->tclass; + break; + default: + goto err_clear; + } +#endif } else { goto err_clear; } @@ -3327,6 +3375,33 @@ static const struct bpf_func_proto bpf_getsockopt_proto = { .arg5_type = ARG_CONST_SIZE, }; +BPF_CALL_2(bpf_sock_ops_cb_flags_set, struct bpf_sock_ops_kern *, bpf_sock, + int, argval) +{ + struct sock *sk = bpf_sock->sk; + int val = argval & BPF_SOCK_OPS_ALL_CB_FLAGS; + + if (!sk_fullsock(sk)) + return -EINVAL; + +#ifdef CONFIG_INET + if (val) + tcp_sk(sk)->bpf_sock_ops_cb_flags = val; + + return argval & (~BPF_SOCK_OPS_ALL_CB_FLAGS); +#else + return -EINVAL; +#endif +} + +static const struct bpf_func_proto bpf_sock_ops_cb_flags_set_proto = { + .func = bpf_sock_ops_cb_flags_set, + .gpl_only = false, + .ret_type = RET_INTEGER, + .arg1_type = ARG_PTR_TO_CTX, + .arg2_type = ARG_ANYTHING, +}; + static const struct bpf_func_proto * bpf_base_func_proto(enum bpf_func_id func_id) { @@ -3459,6 +3534,8 @@ xdp_func_proto(enum bpf_func_id func_id) return &bpf_xdp_event_output_proto; case BPF_FUNC_get_smp_processor_id: return &bpf_get_smp_processor_id_proto; + case BPF_FUNC_csum_diff: + return &bpf_csum_diff_proto; case BPF_FUNC_xdp_adjust_head: return &bpf_xdp_adjust_head_proto; case BPF_FUNC_xdp_adjust_meta: @@ -3507,6 +3584,8 @@ static const struct bpf_func_proto * return &bpf_setsockopt_proto; case BPF_FUNC_getsockopt: return &bpf_getsockopt_proto; + case BPF_FUNC_sock_ops_cb_flags_set: + return &bpf_sock_ops_cb_flags_set_proto; case BPF_FUNC_sock_map_update: return &bpf_sock_map_update_proto; default: @@ -3823,34 +3902,44 @@ void bpf_warn_invalid_xdp_action(u32 act) } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); -static bool __is_valid_sock_ops_access(int off, int size) +static bool sock_ops_is_valid_access(int off, int size, + enum bpf_access_type type, + struct bpf_insn_access_aux *info) { + const int size_default = sizeof(__u32); + if (off < 0 || off >= sizeof(struct bpf_sock_ops)) return false; + /* The verifier guarantees that size > 0. */ if (off % size != 0) return false; - if (size != sizeof(__u32)) - return false; - return true; -} - -static bool sock_ops_is_valid_access(int off, int size, - enum bpf_access_type type, - struct bpf_insn_access_aux *info) -{ if (type == BPF_WRITE) { switch (off) { - case offsetof(struct bpf_sock_ops, op) ... 
- offsetof(struct bpf_sock_ops, replylong[3]): + case offsetof(struct bpf_sock_ops, reply): + case offsetof(struct bpf_sock_ops, sk_txhash): + if (size != size_default) + return false; break; default: return false; } + } else { + switch (off) { + case bpf_ctx_range_till(struct bpf_sock_ops, bytes_received, + bytes_acked): + if (size != sizeof(__u64)) + return false; + break; + default: + if (size != size_default) + return false; + break; + } } - return __is_valid_sock_ops_access(off, size); + return true; } static int sk_skb_prologue(struct bpf_insn *insn_buf, bool direct_write, @@ -4305,6 +4394,24 @@ static u32 xdp_convert_ctx_access(enum bpf_access_type type, si->dst_reg, si->src_reg, offsetof(struct xdp_buff, data_end)); break; + case offsetof(struct xdp_md, ingress_ifindex): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_rxq_info, dev), + si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, dev)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct net_device, ifindex)); + break; + case offsetof(struct xdp_md, rx_queue_index): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(struct xdp_buff, rxq), + si->dst_reg, si->src_reg, + offsetof(struct xdp_buff, rxq)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct xdp_rxq_info, + queue_index)); + break; } return insn - insn_buf; @@ -4439,6 +4546,211 @@ static u32 sock_ops_convert_ctx_access(enum bpf_access_type type, *insn++ = BPF_LDX_MEM(BPF_H, si->dst_reg, si->dst_reg, offsetof(struct sock_common, skc_num)); break; + + case offsetof(struct bpf_sock_ops, is_fullsock): + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, + is_fullsock), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, + is_fullsock)); + break; + + case offsetof(struct bpf_sock_ops, state): + BUILD_BUG_ON(FIELD_SIZEOF(struct sock_common, skc_state) != 1); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_B, si->dst_reg, si->dst_reg, + offsetof(struct sock_common, skc_state)); + break; + + case offsetof(struct bpf_sock_ops, rtt_min): + BUILD_BUG_ON(FIELD_SIZEOF(struct tcp_sock, rtt_min) != + sizeof(struct minmax)); + BUILD_BUG_ON(sizeof(struct minmax) < + sizeof(struct minmax_sample)); + + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( + struct bpf_sock_ops_kern, sk), + si->dst_reg, si->src_reg, + offsetof(struct bpf_sock_ops_kern, sk)); + *insn++ = BPF_LDX_MEM(BPF_W, si->dst_reg, si->dst_reg, + offsetof(struct tcp_sock, rtt_min) + + FIELD_SIZEOF(struct minmax_sample, t)); + break; + +/* Helper macro for adding read access to tcp_sock or sock fields. 
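+ * For illustration only (the exact instructions are emitted by the
+ * macro below): a read such as
+ * SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock)
+ * behaves roughly as
+ *
+ *	dst = ctx->is_fullsock;
+ *	if (dst != 0) {
+ *		dst = ctx->sk;
+ *		dst = ((struct tcp_sock *)dst)->snd_cwnd;
+ *	}
+ *
+ * i.e. on a non-full socket the program reads back 0 instead of
+ * dereferencing a socket that has no tcp_sock fields.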
*/ +#define SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, si->dst_reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + si->dst_reg, si->src_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF(OBJ, \ + OBJ_FIELD), \ + si->dst_reg, si->dst_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + } while (0) + +/* Helper macro for adding write access to tcp_sock or sock fields. + * The macro is called with two registers, dst_reg which contains a pointer + * to ctx (context) and src_reg which contains the value that should be + * stored. However, we need an additional register since we cannot overwrite + * dst_reg because it may be used later in the program. + * Instead we "borrow" one of the other registers. We first save its value + * into a new (temp) field in bpf_sock_ops_kern, use it, and then restore + * it at the end of the macro. + */ +#define SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ) \ + do { \ + int reg = BPF_REG_9; \ + BUILD_BUG_ON(FIELD_SIZEOF(OBJ, OBJ_FIELD) > \ + FIELD_SIZEOF(struct bpf_sock_ops, BPF_FIELD)); \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + if (si->dst_reg == reg || si->src_reg == reg) \ + reg--; \ + *insn++ = BPF_STX_MEM(BPF_DW, si->dst_reg, reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, \ + is_fullsock), \ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + is_fullsock)); \ + *insn++ = BPF_JMP_IMM(BPF_JEQ, reg, 0, 2); \ + *insn++ = BPF_LDX_MEM(BPF_FIELD_SIZEOF( \ + struct bpf_sock_ops_kern, sk),\ + reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, sk));\ + *insn++ = BPF_STX_MEM(BPF_FIELD_SIZEOF(OBJ, OBJ_FIELD), \ + reg, si->src_reg, \ + offsetof(OBJ, OBJ_FIELD)); \ + *insn++ = BPF_LDX_MEM(BPF_DW, reg, si->dst_reg, \ + offsetof(struct bpf_sock_ops_kern, \ + temp)); \ + } while (0) + +#define SOCK_OPS_GET_OR_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ, TYPE) \ + do { \ + if (TYPE == BPF_WRITE) \ + SOCK_OPS_SET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + else \ + SOCK_OPS_GET_FIELD(BPF_FIELD, OBJ_FIELD, OBJ); \ + } while (0) + + case offsetof(struct bpf_sock_ops, snd_cwnd): + SOCK_OPS_GET_FIELD(snd_cwnd, snd_cwnd, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, srtt_us): + SOCK_OPS_GET_FIELD(srtt_us, srtt_us, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bpf_sock_ops_cb_flags): + SOCK_OPS_GET_FIELD(bpf_sock_ops_cb_flags, bpf_sock_ops_cb_flags, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_ssthresh): + SOCK_OPS_GET_FIELD(snd_ssthresh, snd_ssthresh, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rcv_nxt): + SOCK_OPS_GET_FIELD(rcv_nxt, rcv_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_nxt): + SOCK_OPS_GET_FIELD(snd_nxt, snd_nxt, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, snd_una): + SOCK_OPS_GET_FIELD(snd_una, snd_una, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, mss_cache): + SOCK_OPS_GET_FIELD(mss_cache, mss_cache, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, ecn_flags): + SOCK_OPS_GET_FIELD(ecn_flags, 
ecn_flags, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_delivered): + SOCK_OPS_GET_FIELD(rate_delivered, rate_delivered, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, rate_interval_us): + SOCK_OPS_GET_FIELD(rate_interval_us, rate_interval_us, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, packets_out): + SOCK_OPS_GET_FIELD(packets_out, packets_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, retrans_out): + SOCK_OPS_GET_FIELD(retrans_out, retrans_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, total_retrans): + SOCK_OPS_GET_FIELD(total_retrans, total_retrans, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_in): + SOCK_OPS_GET_FIELD(segs_in, segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_in): + SOCK_OPS_GET_FIELD(data_segs_in, data_segs_in, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, segs_out): + SOCK_OPS_GET_FIELD(segs_out, segs_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, data_segs_out): + SOCK_OPS_GET_FIELD(data_segs_out, data_segs_out, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, lost_out): + SOCK_OPS_GET_FIELD(lost_out, lost_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sacked_out): + SOCK_OPS_GET_FIELD(sacked_out, sacked_out, struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, sk_txhash): + SOCK_OPS_GET_OR_SET_FIELD(sk_txhash, sk_txhash, + struct sock, type); + break; + + case offsetof(struct bpf_sock_ops, bytes_received): + SOCK_OPS_GET_FIELD(bytes_received, bytes_received, + struct tcp_sock); + break; + + case offsetof(struct bpf_sock_ops, bytes_acked): + SOCK_OPS_GET_FIELD(bytes_acked, bytes_acked, struct tcp_sock); + break; + } return insn - insn_buf; } @@ -4475,6 +4787,7 @@ const struct bpf_verifier_ops sk_filter_verifier_ops = { }; const struct bpf_prog_ops sk_filter_prog_ops = { + .test_run = bpf_prog_test_run_skb, }; const struct bpf_verifier_ops tc_cls_act_verifier_ops = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 544bddf..559db9e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -24,6 +24,7 @@ #include <linux/tcp.h> #include <net/flow_dissector.h> #include <scsi/fc/fc_fcoe.h> +#include <uapi/linux/batadv_packet.h> static void dissector_set_key(struct flow_dissector *flow_dissector, enum flow_dissector_key_id key_id) @@ -133,10 +134,10 @@ skb_flow_dissect_set_enc_addr_type(enum flow_dissector_key_id type, ctrl->addr_type = type; } -static void -__skb_flow_dissect_tunnel_info(const struct sk_buff *skb, - struct flow_dissector *flow_dissector, - void *target_container) +void +skb_flow_dissect_tunnel_info(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container) { struct ip_tunnel_info *info; struct ip_tunnel_key *key; @@ -212,6 +213,7 @@ __skb_flow_dissect_tunnel_info(const struct sk_buff *skb, tp->dst = key->tp_dst; } } +EXPORT_SYMBOL(skb_flow_dissect_tunnel_info); static enum flow_dissect_ret __skb_flow_dissect_mpls(const struct sk_buff *skb, @@ -436,6 +438,57 @@ __skb_flow_dissect_gre(const struct sk_buff *skb, return FLOW_DISSECT_RET_PROTO_AGAIN; } +/** + * __skb_flow_dissect_batadv() - dissect batman-adv header + * @skb: sk_buff with the batman-adv header + * @key_control: flow dissector's control key + * @data: raw buffer pointer to the packet, if NULL use skb->data + * @p_proto: pointer used to 
update the protocol to process next + * @p_nhoff: pointer used to update inner network header offset + * @hlen: packet header length + * @flags: any combination of FLOW_DISSECTOR_F_* + * + * An attempt is made to dissect ETH_P_BATMAN packets. Only + * &struct batadv_unicast packets are actually processed, because they contain an + * inner ethernet header and are usually followed by an actual network header. This + * allows the flow dissector to continue processing the packet. + * + * Return: FLOW_DISSECT_RET_PROTO_AGAIN when &struct batadv_unicast was found, + * FLOW_DISSECT_RET_OUT_GOOD when the dissector should stop after encapsulation, + * otherwise FLOW_DISSECT_RET_OUT_BAD + */ +static enum flow_dissect_ret +__skb_flow_dissect_batadv(const struct sk_buff *skb, + struct flow_dissector_key_control *key_control, + void *data, __be16 *p_proto, int *p_nhoff, int hlen, + unsigned int flags) +{ + struct { + struct batadv_unicast_packet batadv_unicast; + struct ethhdr eth; + } *hdr, _hdr; + + hdr = __skb_header_pointer(skb, *p_nhoff, sizeof(_hdr), data, hlen, + &_hdr); + if (!hdr) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.version != BATADV_COMPAT_VERSION) + return FLOW_DISSECT_RET_OUT_BAD; + + if (hdr->batadv_unicast.packet_type != BATADV_UNICAST) + return FLOW_DISSECT_RET_OUT_BAD; + + *p_proto = hdr->eth.h_proto; + *p_nhoff += sizeof(*hdr); + + key_control->flags |= FLOW_DIS_ENCAPSULATION; + if (flags & FLOW_DISSECTOR_F_STOP_AT_ENCAP) + return FLOW_DISSECT_RET_OUT_GOOD; + + return FLOW_DISSECT_RET_PROTO_AGAIN; +} + static void __skb_flow_dissect_tcp(const struct sk_buff *skb, struct flow_dissector *flow_dissector, @@ -576,9 +629,6 @@ bool __skb_flow_dissect(const struct sk_buff *skb, FLOW_DISSECTOR_KEY_BASIC, target_container); - __skb_flow_dissect_tunnel_info(skb, flow_dissector, - target_container); - if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_ETH_ADDRS)) { struct ethhdr *eth = eth_hdr(skb); @@ -817,6 +867,11 @@ proto_again: nhoff, hlen); break; + case htons(ETH_P_BATMAN): + fdret = __skb_flow_dissect_batadv(skb, key_control, data, + &proto, &nhoff, hlen, flags); + break; + default: fdret = FLOW_DISSECT_RET_OUT_BAD; break; diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 9834cfa..0a3f88f 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -159,7 +159,11 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, est->intvl_log = intvl_log; est->cpu_bstats = cpu_bstats; + if (stats_lock) + local_bh_disable(); est_fetch_counters(est, &b); + if (stats_lock) + local_bh_enable(); est->last_bytes = b.bytes; est->last_packets = b.packets; old = rcu_dereference_protected(*rate_est, 1); diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index 87f2855..b2b2323b 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -252,10 +252,10 @@ __gnet_stats_copy_queue_cpu(struct gnet_stats_queue *qstats, } } -static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, - const struct gnet_stats_queue __percpu *cpu, - const struct gnet_stats_queue *q, - __u32 qlen) +void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, + const struct gnet_stats_queue __percpu *cpu, + const struct gnet_stats_queue *q, + __u32 qlen) { if (cpu) { __gnet_stats_copy_queue_cpu(qstats, cpu); @@ -269,6 +269,7 @@ static void __gnet_stats_copy_queue(struct gnet_stats_queue *qstats, qstats->qlen = qlen; } +EXPORT_SYMBOL(__gnet_stats_copy_queue); /** * gnet_stats_copy_queue - copy queue statistics into statistics TLV diff --git 
a/net/core/link_watch.c b/net/core/link_watch.c index 9828616..e38e641 100644 --- a/net/core/link_watch.c +++ b/net/core/link_watch.c @@ -92,7 +92,7 @@ static bool linkwatch_urgent_event(struct net_device *dev) if (dev->ifindex != dev_get_iflink(dev)) return true; - if (dev->priv_flags & IFF_TEAM_PORT) + if (netif_is_lag_port(dev) || netif_is_lag_master(dev)) return true; return netif_carrier_ok(dev) && qdisc_tx_changing(dev); diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 7f83171..7b7a14a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2862,7 +2862,6 @@ static int neigh_stat_seq_open(struct inode *inode, struct file *file) }; static const struct file_operations neigh_stat_seq_fops = { - .owner = THIS_MODULE, .open = neigh_stat_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-procfs.c b/net/core/net-procfs.c index 615ccab..e010bb8 100644 --- a/net/core/net-procfs.c +++ b/net/core/net-procfs.c @@ -182,7 +182,6 @@ static int dev_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_seq_fops = { - .owner = THIS_MODULE, .open = dev_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -202,7 +201,6 @@ static int softnet_seq_open(struct inode *inode, struct file *file) } static const struct file_operations softnet_seq_fops = { - .owner = THIS_MODULE, .open = softnet_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -306,7 +304,6 @@ static int ptype_seq_open(struct inode *inode, struct file *file) } static const struct file_operations ptype_seq_fops = { - .owner = THIS_MODULE, .open = ptype_seq_open, .read = seq_read, .llseek = seq_lseek, @@ -387,7 +384,6 @@ static int dev_mc_seq_open(struct inode *inode, struct file *file) } static const struct file_operations dev_mc_seq_fops = { - .owner = THIS_MODULE, .open = dev_mc_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 799b752..60a5ad2 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -295,10 +295,31 @@ static ssize_t carrier_changes_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_changes)); + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); +static ssize_t carrier_up_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); +} +static DEVICE_ATTR_RO(carrier_up_count); + +static ssize_t carrier_down_count_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct net_device *netdev = to_net_dev(dev); + + return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); +} +static DEVICE_ATTR_RO(carrier_down_count); + /* read-write attributes */ static int change_mtu(struct net_device *dev, unsigned long new_mtu) @@ -325,29 +346,6 @@ static ssize_t flags_store(struct device *dev, struct device_attribute *attr, } NETDEVICE_SHOW_RW(flags, fmt_hex); -static int change_tx_queue_len(struct net_device *dev, unsigned long new_len) -{ - unsigned int orig_len = dev->tx_queue_len; - int res; - - if (new_len != (unsigned int)new_len) - return -ERANGE; - - if (new_len != orig_len) { - dev->tx_queue_len = new_len; - res = call_netdevice_notifiers(NETDEV_CHANGE_TX_QUEUE_LEN, dev); - res = notifier_to_errno(res); - if (res) { - netdev_err(dev, - "refused to change 
device tx_queue_len\n"); - dev->tx_queue_len = orig_len; - return -EFAULT; - } - } - - return 0; -} - static ssize_t tx_queue_len_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t len) @@ -355,7 +353,7 @@ static ssize_t tx_queue_len_store(struct device *dev, if (!capable(CAP_NET_ADMIN)) return -EPERM; - return netdev_store(dev, attr, buf, len, change_tx_queue_len); + return netdev_store(dev, attr, buf, len, dev_change_tx_queue_len); } NETDEVICE_SHOW_RW(tx_queue_len, fmt_dec); @@ -547,6 +545,8 @@ static struct attribute *net_class_attrs[] __ro_after_init = { &dev_attr_phys_port_name.attr, &dev_attr_phys_switch_id.attr, &dev_attr_proto_down.attr, + &dev_attr_carrier_up_count.attr, + &dev_attr_carrier_down_count.attr, NULL, }; ATTRIBUTE_GROUPS(net_class); @@ -961,7 +961,7 @@ net_rx_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct kobject *kobj = &dev->_rx[i].kobj; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) kobj->uevent_suppress = 1; if (dev->sysfs_rx_queue_group) sysfs_remove_group(kobj, dev->sysfs_rx_queue_group); @@ -1367,7 +1367,7 @@ netdev_queue_update_kobjects(struct net_device *dev, int old_num, int new_num) while (--i >= new_num) { struct netdev_queue *queue = dev->_tx + i; - if (!atomic_read(&dev_net(dev)->count)) + if (!refcount_read(&dev_net(dev)->count)) queue->kobj.uevent_suppress = 1; #ifdef CONFIG_BQL sysfs_remove_group(&queue->kobj, &dql_group); @@ -1558,7 +1558,7 @@ void netdev_unregister_kobject(struct net_device *ndev) { struct device *dev = &ndev->dev; - if (!atomic_read(&dev_net(ndev)->count)) + if (!refcount_read(&dev_net(ndev)->count)) dev_set_uevent_suppress(dev, 1); kobject_get(&dev->kobj); diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 60a71be..3cad5f5 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -35,7 +35,7 @@ LIST_HEAD(net_namespace_list); EXPORT_SYMBOL_GPL(net_namespace_list); struct net init_net = { - .count = ATOMIC_INIT(1), + .count = REFCOUNT_INIT(1), .dev_base_head = LIST_HEAD_INIT(init_net.dev_base_head), }; EXPORT_SYMBOL(init_net); @@ -221,17 +221,26 @@ static void rtnl_net_notifyid(struct net *net, int cmd, int id); */ int peernet2id_alloc(struct net *net, struct net *peer) { - bool alloc; + bool alloc = false, alive = false; int id; - if (atomic_read(&net->count) == 0) + if (refcount_read(&net->count) == 0) return NETNSA_NSID_NOT_ASSIGNED; spin_lock_bh(&net->nsid_lock); - alloc = atomic_read(&peer->count) == 0 ? false : true; + /* + * When peer is obtained from RCU lists, we may race with + * its cleanup. Check whether it's alive, and this guarantees + * we never hash a peer back to net->netns_ids, after it has + * just been idr_remove()'d from there in cleanup_net(). 
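+ * Illustrative shape of the guard (not verbatim; the nsid spinlock is
+ * held around this block in the code):
+ *
+ *	if (maybe_get_net(peer)) {	ref taken only if count > 0
+ *		alive = alloc = true;
+ *		... allocate/look up the id ...
+ *		put_net(peer);		drop the temporary ref
+ *	}
+ *	else: peer is already dying, so no new id is assigned to it.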
+ */ + if (maybe_get_net(peer)) + alive = alloc = true; id = __peernet2id_alloc(net, peer, &alloc); spin_unlock_bh(&net->nsid_lock); if (alloc && id >= 0) rtnl_net_notifyid(net, RTM_NEWNSID, id); + if (alive) + put_net(peer); return id; } EXPORT_SYMBOL_GPL(peernet2id_alloc); @@ -264,11 +273,9 @@ struct net *get_net_ns_by_id(struct net *net, int id) return NULL; rcu_read_lock(); - spin_lock_bh(&net->nsid_lock); peer = idr_find(&net->netns_ids, id); if (peer) peer = maybe_get_net(peer); - spin_unlock_bh(&net->nsid_lock); rcu_read_unlock(); return peer; @@ -284,7 +291,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) int error = 0; LIST_HEAD(net_exit_list); - atomic_set(&net->count, 1); + refcount_set(&net->count, 1); refcount_set(&net->passive, 1); net->dev_base_seq = 1; net->user_ns = user_ns; @@ -432,13 +439,40 @@ struct net *copy_net_ns(unsigned long flags, return net; } +static void unhash_nsid(struct net *net, struct net *last) +{ + struct net *tmp; + /* This function is only called from cleanup_net() work, + * and this work is the only process that may delete + * a net from net_namespace_list. So, while the code below + * is executing, the list may only grow. Thus, we do not + * use for_each_net_rcu() or rtnl_lock(). + */ + for_each_net(tmp) { + int id; + + spin_lock_bh(&tmp->nsid_lock); + id = __peernet2id(tmp, net); + if (id >= 0) + idr_remove(&tmp->netns_ids, id); + spin_unlock_bh(&tmp->nsid_lock); + if (id >= 0) + rtnl_net_notifyid(tmp, RTM_DELNSID, id); + if (tmp == last) + break; + } + spin_lock_bh(&net->nsid_lock); + idr_destroy(&net->netns_ids); + spin_unlock_bh(&net->nsid_lock); +} + static DEFINE_SPINLOCK(cleanup_list_lock); static LIST_HEAD(cleanup_list); /* Must hold cleanup_list_lock to touch */ static void cleanup_net(struct work_struct *work) { const struct pernet_operations *ops; - struct net *net, *tmp; + struct net *net, *tmp, *last; struct list_head net_kill_list; LIST_HEAD(net_exit_list); @@ -451,26 +485,25 @@ static void cleanup_net(struct work_struct *work) /* Don't let anyone else find us. */ rtnl_lock(); - list_for_each_entry(net, &net_kill_list, cleanup_list) { + list_for_each_entry(net, &net_kill_list, cleanup_list) list_del_rcu(&net->list); - list_add_tail(&net->exit_list, &net_exit_list); - for_each_net(tmp) { - int id; - - spin_lock_bh(&tmp->nsid_lock); - id = __peernet2id(tmp, net); - if (id >= 0) - idr_remove(&tmp->netns_ids, id); - spin_unlock_bh(&tmp->nsid_lock); - if (id >= 0) - rtnl_net_notifyid(tmp, RTM_DELNSID, id); - } - spin_lock_bh(&net->nsid_lock); - idr_destroy(&net->netns_ids); - spin_unlock_bh(&net->nsid_lock); + /* Cache last net. After we unlock rtnl, no new net + * added to net_namespace_list can assign an nsid pointing + * to a net from net_kill_list (see peernet2id_alloc()). + * So, we skip them in unhash_nsid(). + * + * Note that unhash_nsid() does not delete nsid links + * between net_kill_list's nets, as they've already been + * deleted from net_namespace_list. But this would be + * useless anyway, as the netns_ids are destroyed there. + */ + last = list_last_entry(&net_namespace_list, struct net, list); + rtnl_unlock(); + list_for_each_entry(net, &net_kill_list, cleanup_list) { + unhash_nsid(net, last); + list_add_tail(&net->exit_list, &net_exit_list); } - rtnl_unlock(); /* * Another CPU might be rcu-iterating the list, wait for it. 
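The refcount guard above is easiest to see in isolation. The sketch below is a minimal, self-contained user-space analogue of the maybe_get_net() pattern, assuming C11 atomics; the names obj and maybe_get are illustrative, not kernel API:

/* Take a reference only if the object is still alive, i.e. the
 * count has not already dropped to zero. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct obj { atomic_int count; };

static bool maybe_get(struct obj *o)
{
	int c = atomic_load(&o->count);

	/* Only increment if the current count is non-zero; retry on
	 * concurrent modification. */
	while (c != 0) {
		if (atomic_compare_exchange_weak(&o->count, &c, c + 1))
			return true;
	}
	return false;
}

int main(void)
{
	struct obj alive = { .count = 1 }, dead = { .count = 0 };

	printf("alive: %d, dead: %d\n", maybe_get(&alive), maybe_get(&dead));
	return 0;
}

A caller that gets back false must treat the object as already dying and never publish a new reference to it, which is exactly what peernet2id_alloc() now does.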
diff --git a/net/core/pktgen.c b/net/core/pktgen.c index f95a150..b8ab5c8 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -184,25 +184,44 @@ #define func_enter() pr_debug("entering %s\n", __func__); +#define PKT_FLAGS \ + pf(IPV6) /* Interface in IPV6 Mode */ \ + pf(IPSRC_RND) /* IP-Src Random */ \ + pf(IPDST_RND) /* IP-Dst Random */ \ + pf(TXSIZE_RND) /* Transmit size is random */ \ + pf(UDPSRC_RND) /* UDP-Src Random */ \ + pf(UDPDST_RND) /* UDP-Dst Random */ \ + pf(UDPCSUM) /* Include UDP checksum */ \ + pf(NO_TIMESTAMP) /* Don't timestamp packets (default TS) */ \ + pf(MPLS_RND) /* Random MPLS labels */ \ + pf(QUEUE_MAP_RND) /* queue map Random */ \ + pf(QUEUE_MAP_CPU) /* queue map mirrors smp_processor_id() */ \ + pf(FLOW_SEQ) /* Sequential flows */ \ + pf(IPSEC) /* ipsec on for flows */ \ + pf(MACSRC_RND) /* MAC-Src Random */ \ + pf(MACDST_RND) /* MAC-Dst Random */ \ + pf(VID_RND) /* Random VLAN ID */ \ + pf(SVID_RND) /* Random SVLAN ID */ \ + pf(NODE) /* Node memory alloc*/ \ + +#define pf(flag) flag##_SHIFT, +enum pkt_flags { + PKT_FLAGS +}; +#undef pf + /* Device flag bits */ -#define F_IPSRC_RND (1<<0) /* IP-Src Random */ -#define F_IPDST_RND (1<<1) /* IP-Dst Random */ -#define F_UDPSRC_RND (1<<2) /* UDP-Src Random */ -#define F_UDPDST_RND (1<<3) /* UDP-Dst Random */ -#define F_MACSRC_RND (1<<4) /* MAC-Src Random */ -#define F_MACDST_RND (1<<5) /* MAC-Dst Random */ -#define F_TXSIZE_RND (1<<6) /* Transmit size is random */ -#define F_IPV6 (1<<7) /* Interface in IPV6 Mode */ -#define F_MPLS_RND (1<<8) /* Random MPLS labels */ -#define F_VID_RND (1<<9) /* Random VLAN ID */ -#define F_SVID_RND (1<<10) /* Random SVLAN ID */ -#define F_FLOW_SEQ (1<<11) /* Sequential flows */ -#define F_IPSEC_ON (1<<12) /* ipsec on for flows */ -#define F_QUEUE_MAP_RND (1<<13) /* queue map Random */ -#define F_QUEUE_MAP_CPU (1<<14) /* queue map mirrors smp_processor_id() */ -#define F_NODE (1<<15) /* Node memory alloc*/ -#define F_UDPCSUM (1<<16) /* Include UDP checksum */ -#define F_NO_TIMESTAMP (1<<17) /* Don't timestamp packets (default TS) */ +#define pf(flag) static const __u32 F_##flag = (1<<flag##_SHIFT); +PKT_FLAGS +#undef pf + +#define pf(flag) __stringify(flag), +static char *pkt_flag_names[] = { + PKT_FLAGS +}; +#undef pf + +#define NR_PKT_FLAGS ARRAY_SIZE(pkt_flag_names) /* Thread control flag bits */ #define T_STOP (1<<0) /* Stop run */ @@ -399,7 +418,7 @@ struct pktgen_dev { __u8 ipsmode; /* IPSEC mode (config) */ __u8 ipsproto; /* IPSEC type (config) */ __u32 spi; - struct dst_entry dst; + struct xfrm_dst xdst; struct dst_ops dstops; #endif char result[512]; @@ -523,7 +542,6 @@ static int pgctrl_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_fops = { - .owner = THIS_MODULE, .open = pgctrl_open, .read = seq_read, .llseek = seq_lseek, @@ -535,6 +553,7 @@ static int pktgen_if_show(struct seq_file *seq, void *v) { const struct pktgen_dev *pkt_dev = seq->private; ktime_t stopped; + unsigned int i; u64 idle; seq_printf(seq, @@ -596,7 +615,6 @@ static int pktgen_if_show(struct seq_file *seq, void *v) pkt_dev->src_mac_count, pkt_dev->dst_mac_count); if (pkt_dev->nr_labels) { - unsigned int i; seq_puts(seq, " mpls: "); for (i = 0; i < pkt_dev->nr_labels; i++) seq_printf(seq, "%08x%s", ntohl(pkt_dev->labels[i]), @@ -632,68 +650,21 @@ static int pktgen_if_show(struct seq_file *seq, void *v) seq_puts(seq, " Flags: "); - if (pkt_dev->flags & F_IPV6) - seq_puts(seq, "IPV6 "); - - if (pkt_dev->flags & F_IPSRC_RND) - seq_puts(seq, "IPSRC_RND "); - - if 
(pkt_dev->flags & F_IPDST_RND) - seq_puts(seq, "IPDST_RND "); - - if (pkt_dev->flags & F_TXSIZE_RND) - seq_puts(seq, "TXSIZE_RND "); - - if (pkt_dev->flags & F_UDPSRC_RND) - seq_puts(seq, "UDPSRC_RND "); - - if (pkt_dev->flags & F_UDPDST_RND) - seq_puts(seq, "UDPDST_RND "); - - if (pkt_dev->flags & F_UDPCSUM) - seq_puts(seq, "UDPCSUM "); - - if (pkt_dev->flags & F_NO_TIMESTAMP) - seq_puts(seq, "NO_TIMESTAMP "); - - if (pkt_dev->flags & F_MPLS_RND) - seq_puts(seq, "MPLS_RND "); - - if (pkt_dev->flags & F_QUEUE_MAP_RND) - seq_puts(seq, "QUEUE_MAP_RND "); + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (i == F_FLOW_SEQ) + if (!pkt_dev->cflows) + continue; - if (pkt_dev->flags & F_QUEUE_MAP_CPU) - seq_puts(seq, "QUEUE_MAP_CPU "); - - if (pkt_dev->cflows) { - if (pkt_dev->flags & F_FLOW_SEQ) - seq_puts(seq, "FLOW_SEQ "); /*in sequence flows*/ - else - seq_puts(seq, "FLOW_RND "); - } + if (pkt_dev->flags & (1 << i)) + seq_printf(seq, "%s ", pkt_flag_names[i]); + else if (i == F_FLOW_SEQ) + seq_puts(seq, "FLOW_RND "); #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) { - seq_puts(seq, "IPSEC "); - if (pkt_dev->spi) + if (i == F_IPSEC && pkt_dev->spi) seq_printf(seq, "spi:%u", pkt_dev->spi); - } #endif - - if (pkt_dev->flags & F_MACSRC_RND) - seq_puts(seq, "MACSRC_RND "); - - if (pkt_dev->flags & F_MACDST_RND) - seq_puts(seq, "MACDST_RND "); - - if (pkt_dev->flags & F_VID_RND) - seq_puts(seq, "VID_RND "); - - if (pkt_dev->flags & F_SVID_RND) - seq_puts(seq, "SVID_RND "); - - if (pkt_dev->flags & F_NODE) - seq_puts(seq, "NODE_ALLOC "); + } seq_puts(seq, "\n"); @@ -859,6 +830,35 @@ static ssize_t get_labels(const char __user *buffer, struct pktgen_dev *pkt_dev) return i; } +static __u32 pktgen_read_flag(const char *f, bool *disable) +{ + __u32 i; + + if (f[0] == '!') { + *disable = true; + f++; + } + + for (i = 0; i < NR_PKT_FLAGS; i++) { + if (!IS_ENABLED(CONFIG_XFRM) && i == IPSEC_SHIFT) + continue; + + /* allow only disabling ipv6 flag */ + if (!*disable && i == IPV6_SHIFT) + continue; + + if (strcmp(f, pkt_flag_names[i]) == 0) + return 1 << i; + } + + if (strcmp(f, "FLOW_RND") == 0) { + *disable = !*disable; + return F_FLOW_SEQ; + } + + return 0; +} + static ssize_t pktgen_if_write(struct file *file, const char __user * user_buffer, size_t count, loff_t * offset) @@ -1216,7 +1216,10 @@ static ssize_t pktgen_if_write(struct file *file, return count; } if (!strcmp(name, "flag")) { + __u32 flag; char f[32]; + bool disable = false; + memset(f, 0, 32); len = strn_len(&user_buffer[i], sizeof(f) - 1); if (len < 0) @@ -1225,107 +1228,15 @@ static ssize_t pktgen_if_write(struct file *file, if (copy_from_user(f, &user_buffer[i], len)) return -EFAULT; i += len; - if (strcmp(f, "IPSRC_RND") == 0) - pkt_dev->flags |= F_IPSRC_RND; - - else if (strcmp(f, "!IPSRC_RND") == 0) - pkt_dev->flags &= ~F_IPSRC_RND; - - else if (strcmp(f, "TXSIZE_RND") == 0) - pkt_dev->flags |= F_TXSIZE_RND; - - else if (strcmp(f, "!TXSIZE_RND") == 0) - pkt_dev->flags &= ~F_TXSIZE_RND; - - else if (strcmp(f, "IPDST_RND") == 0) - pkt_dev->flags |= F_IPDST_RND; - - else if (strcmp(f, "!IPDST_RND") == 0) - pkt_dev->flags &= ~F_IPDST_RND; - - else if (strcmp(f, "UDPSRC_RND") == 0) - pkt_dev->flags |= F_UDPSRC_RND; - - else if (strcmp(f, "!UDPSRC_RND") == 0) - pkt_dev->flags &= ~F_UDPSRC_RND; - - else if (strcmp(f, "UDPDST_RND") == 0) - pkt_dev->flags |= F_UDPDST_RND; - - else if (strcmp(f, "!UDPDST_RND") == 0) - pkt_dev->flags &= ~F_UDPDST_RND; - - else if (strcmp(f, "MACSRC_RND") == 0) - pkt_dev->flags |= F_MACSRC_RND; - - else if 
(strcmp(f, "!MACSRC_RND") == 0) - pkt_dev->flags &= ~F_MACSRC_RND; - - else if (strcmp(f, "MACDST_RND") == 0) - pkt_dev->flags |= F_MACDST_RND; - - else if (strcmp(f, "!MACDST_RND") == 0) - pkt_dev->flags &= ~F_MACDST_RND; - - else if (strcmp(f, "MPLS_RND") == 0) - pkt_dev->flags |= F_MPLS_RND; - - else if (strcmp(f, "!MPLS_RND") == 0) - pkt_dev->flags &= ~F_MPLS_RND; - else if (strcmp(f, "VID_RND") == 0) - pkt_dev->flags |= F_VID_RND; + flag = pktgen_read_flag(f, &disable); - else if (strcmp(f, "!VID_RND") == 0) - pkt_dev->flags &= ~F_VID_RND; - - else if (strcmp(f, "SVID_RND") == 0) - pkt_dev->flags |= F_SVID_RND; - - else if (strcmp(f, "!SVID_RND") == 0) - pkt_dev->flags &= ~F_SVID_RND; - - else if (strcmp(f, "FLOW_SEQ") == 0) - pkt_dev->flags |= F_FLOW_SEQ; - - else if (strcmp(f, "QUEUE_MAP_RND") == 0) - pkt_dev->flags |= F_QUEUE_MAP_RND; - - else if (strcmp(f, "!QUEUE_MAP_RND") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_RND; - - else if (strcmp(f, "QUEUE_MAP_CPU") == 0) - pkt_dev->flags |= F_QUEUE_MAP_CPU; - - else if (strcmp(f, "!QUEUE_MAP_CPU") == 0) - pkt_dev->flags &= ~F_QUEUE_MAP_CPU; -#ifdef CONFIG_XFRM - else if (strcmp(f, "IPSEC") == 0) - pkt_dev->flags |= F_IPSEC_ON; -#endif - - else if (strcmp(f, "!IPV6") == 0) - pkt_dev->flags &= ~F_IPV6; - - else if (strcmp(f, "NODE_ALLOC") == 0) - pkt_dev->flags |= F_NODE; - - else if (strcmp(f, "!NODE_ALLOC") == 0) - pkt_dev->flags &= ~F_NODE; - - else if (strcmp(f, "UDPCSUM") == 0) - pkt_dev->flags |= F_UDPCSUM; - - else if (strcmp(f, "!UDPCSUM") == 0) - pkt_dev->flags &= ~F_UDPCSUM; - - else if (strcmp(f, "NO_TIMESTAMP") == 0) - pkt_dev->flags |= F_NO_TIMESTAMP; - - else if (strcmp(f, "!NO_TIMESTAMP") == 0) - pkt_dev->flags &= ~F_NO_TIMESTAMP; - - else { + if (flag) { + if (disable) + pkt_dev->flags &= ~flag; + else + pkt_dev->flags |= flag; + } else { sprintf(pg_result, "Flag -:%s:- unknown\nAvailable flags, (prepend ! to un-set flag):\n%s", f, @@ -1804,7 +1715,6 @@ static int pktgen_if_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_if_fops = { - .owner = THIS_MODULE, .open = pktgen_if_open, .read = seq_read, .llseek = seq_lseek, @@ -1942,7 +1852,6 @@ static int pktgen_thread_open(struct inode *inode, struct file *file) } static const struct file_operations pktgen_thread_fops = { - .owner = THIS_MODULE, .open = pktgen_thread_open, .read = seq_read, .llseek = seq_lseek, @@ -2544,7 +2453,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->flows[flow].cur_daddr = pkt_dev->cur_daddr; #ifdef CONFIG_XFRM - if (pkt_dev->flags & F_IPSEC_ON) + if (pkt_dev->flags & F_IPSEC) get_ipsec_sa(pkt_dev, flow); #endif pkt_dev->nflows++; @@ -2609,7 +2518,7 @@ static int pktgen_output_ipsec(struct sk_buff *skb, struct pktgen_dev *pkt_dev) * supports both transport/tunnel mode + ESP/AH type. */ if ((x->props.mode == XFRM_MODE_TUNNEL) && (pkt_dev->spi != 0)) - skb->_skb_refdst = (unsigned long)&pkt_dev->dst | SKB_DST_NOREF; + skb->_skb_refdst = (unsigned long)&pkt_dev->xdst.u.dst | SKB_DST_NOREF; rcu_read_lock_bh(); err = x->outer_mode->output(x, skb); @@ -2649,7 +2558,7 @@ static void free_SAs(struct pktgen_dev *pkt_dev) static int process_ipsec(struct pktgen_dev *pkt_dev, struct sk_buff *skb, __be16 protocol) { - if (pkt_dev->flags & F_IPSEC_ON) { + if (pkt_dev->flags & F_IPSEC) { struct xfrm_state *x = pkt_dev->flows[pkt_dev->curfl].x; int nhead = 0; if (x) { @@ -3742,10 +3651,10 @@ static int pktgen_add_device(struct pktgen_thread *t, const char *ifname) * performance under such circumstance. 
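 * The rework below embeds a full struct xfrm_dst rather than a bare
 * struct dst_entry. This matters because the xfrm output path may
 * treat the skb's dst as a struct xfrm_dst, roughly (illustrative,
 * not verbatim code):
 *
 *	struct xfrm_dst *xdst = (struct xfrm_dst *)skb_dst(skb);
 *	... xdst->child ...
 *
 * With only a dst_entry embedded, such accesses would read past the
 * end of the pktgen_dev allocation.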
*/ pkt_dev->dstops.family = AF_INET; - pkt_dev->dst.dev = pkt_dev->odev; - dst_init_metrics(&pkt_dev->dst, pktgen_dst_metrics, false); - pkt_dev->dst.child = &pkt_dev->dst; - pkt_dev->dst.ops = &pkt_dev->dstops; + pkt_dev->xdst.u.dst.dev = pkt_dev->odev; + dst_init_metrics(&pkt_dev->xdst.u.dst, pktgen_dst_metrics, false); + pkt_dev->xdst.child = &pkt_dev->xdst.u.dst; + pkt_dev->xdst.u.dst.ops = &pkt_dev->dstops; #endif return add_dev_to_thread(t, pkt_dev); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 778d7f0..204297d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -62,7 +62,9 @@ struct rtnl_link { rtnl_doit_func doit; rtnl_dumpit_func dumpit; + struct module *owner; unsigned int flags; + struct rcu_head rcu; }; static DEFINE_MUTEX(rtnl_mutex); @@ -127,8 +129,7 @@ bool lockdep_rtnl_is_held(void) EXPORT_SYMBOL(lockdep_rtnl_is_held); #endif /* #ifdef CONFIG_PROVE_LOCKING */ -static struct rtnl_link __rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; -static refcount_t rtnl_msg_handlers_ref[RTNL_FAMILY_MAX + 1]; +static struct rtnl_link *__rcu *rtnl_msg_handlers[RTNL_FAMILY_MAX + 1]; static inline int rtm_msgindex(int msgtype) { @@ -144,72 +145,127 @@ static inline int rtm_msgindex(int msgtype) return msgindex; } -/** - * __rtnl_register - Register a rtnetlink message type - * @protocol: Protocol family or PF_UNSPEC - * @msgtype: rtnetlink message type - * @doit: Function pointer called for each request message - * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message - * @flags: rtnl_link_flags to modifiy behaviour of doit/dumpit functions - * - * Registers the specified function pointers (at least one of them has - * to be non-NULL) to be called whenever a request message for the - * specified protocol family and message type is received. - * - * The special protocol family PF_UNSPEC may be used to define fallback - * function pointers for the case when no entry for the specific protocol - * family exists. - * - * Returns 0 on success or a negative error code. 
- */ -int __rtnl_register(int protocol, int msgtype, - rtnl_doit_func doit, rtnl_dumpit_func dumpit, - unsigned int flags) +static struct rtnl_link *rtnl_get_link(int protocol, int msgtype) +{ + struct rtnl_link **tab; + + if (protocol >= ARRAY_SIZE(rtnl_msg_handlers)) + protocol = PF_UNSPEC; + + tab = rcu_dereference_rtnl(rtnl_msg_handlers[protocol]); + if (!tab) + tab = rcu_dereference_rtnl(rtnl_msg_handlers[PF_UNSPEC]); + + return tab[msgtype]; +} + +static int rtnl_register_internal(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) { - struct rtnl_link *tab; + struct rtnl_link *link, *old; + struct rtnl_link __rcu **tab; int msgindex; + int ret = -ENOBUFS; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); - tab = rcu_dereference_raw(rtnl_msg_handlers[protocol]); + rtnl_lock(); + tab = rtnl_msg_handlers[protocol]; if (tab == NULL) { - tab = kcalloc(RTM_NR_MSGTYPES, sizeof(*tab), GFP_KERNEL); - if (tab == NULL) - return -ENOBUFS; + tab = kcalloc(RTM_NR_MSGTYPES, sizeof(void *), GFP_KERNEL); + if (!tab) + goto unlock; + /* ensures we see the 0 stores */ rcu_assign_pointer(rtnl_msg_handlers[protocol], tab); } + old = rtnl_dereference(tab[msgindex]); + if (old) { + link = kmemdup(old, sizeof(*old), GFP_KERNEL); + if (!link) + goto unlock; + } else { + link = kzalloc(sizeof(*link), GFP_KERNEL); + if (!link) + goto unlock; + } + + WARN_ON(link->owner && link->owner != owner); + link->owner = owner; + + WARN_ON(doit && link->doit && link->doit != doit); if (doit) - tab[msgindex].doit = doit; + link->doit = doit; + WARN_ON(dumpit && link->dumpit && link->dumpit != dumpit); if (dumpit) - tab[msgindex].dumpit = dumpit; - tab[msgindex].flags |= flags; + link->dumpit = dumpit; - return 0; + link->flags |= flags; + + /* publish protocol:msgtype */ + rcu_assign_pointer(tab[msgindex], link); + ret = 0; + if (old) + kfree_rcu(old, rcu); +unlock: + rtnl_unlock(); + return ret; } -EXPORT_SYMBOL_GPL(__rtnl_register); + +/** + * rtnl_register_module - Register a rtnetlink message type + * + * @owner: module registering the hook (THIS_MODULE) + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + * + * Like rtnl_register, but for use by removable modules. + */ +int rtnl_register_module(struct module *owner, + int protocol, int msgtype, + rtnl_doit_func doit, rtnl_dumpit_func dumpit, + unsigned int flags) +{ + return rtnl_register_internal(owner, protocol, msgtype, + doit, dumpit, flags); +} +EXPORT_SYMBOL_GPL(rtnl_register_module); /** * rtnl_register - Register a rtnetlink message type + * @protocol: Protocol family or PF_UNSPEC + * @msgtype: rtnetlink message type + * @doit: Function pointer called for each request message + * @dumpit: Function pointer called for each dump request (NLM_F_DUMP) message + * @flags: rtnl_link_flags to modify behaviour of doit/dumpit functions + * + * Registers the specified function pointers (at least one of them has + * to be non-NULL) to be called whenever a request message for the + * specified protocol family and message type is received. * - * Identical to __rtnl_register() but panics on failure. 
This is useful - * as failure of this function is very unlikely, it can only happen due - * to lack of memory when allocating the chain to store all message - * handlers for a protocol. Meant for use in init functions where lack - * of memory implies no sense in continuing. + * The special protocol family PF_UNSPEC may be used to define fallback + * function pointers for the case when no entry for the specific protocol + * family exists. */ void rtnl_register(int protocol, int msgtype, rtnl_doit_func doit, rtnl_dumpit_func dumpit, unsigned int flags) { - if (__rtnl_register(protocol, msgtype, doit, dumpit, flags) < 0) - panic("Unable to register rtnetlink message handler, " - "protocol = %d, message type = %d\n", - protocol, msgtype); + int err; + + err = rtnl_register_internal(NULL, protocol, msgtype, doit, dumpit, + flags); + if (err) + pr_err("Unable to register rtnetlink message handler, " + "protocol = %d, message type = %d\n", protocol, msgtype); } -EXPORT_SYMBOL_GPL(rtnl_register); /** * rtnl_unregister - Unregister a rtnetlink message type @@ -220,24 +276,25 @@ EXPORT_SYMBOL_GPL(rtnl_register); */ int rtnl_unregister(int protocol, int msgtype) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); msgindex = rtm_msgindex(msgtype); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); - if (!handlers) { + tab = rtnl_dereference(rtnl_msg_handlers[protocol]); + if (!tab) { rtnl_unlock(); return -ENOENT; } - handlers[msgindex].doit = NULL; - handlers[msgindex].dumpit = NULL; - handlers[msgindex].flags = 0; + link = tab[msgindex]; + rcu_assign_pointer(tab[msgindex], NULL); rtnl_unlock(); + kfree_rcu(link, rcu); + return 0; } EXPORT_SYMBOL_GPL(rtnl_unregister); @@ -251,20 +308,27 @@ EXPORT_SYMBOL_GPL(rtnl_unregister); */ void rtnl_unregister_all(int protocol) { - struct rtnl_link *handlers; + struct rtnl_link **tab, *link; + int msgindex; BUG_ON(protocol < 0 || protocol > RTNL_FAMILY_MAX); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[protocol]); + tab = rtnl_msg_handlers[protocol]; RCU_INIT_POINTER(rtnl_msg_handlers[protocol], NULL); + for (msgindex = 0; msgindex < RTM_NR_MSGTYPES; msgindex++) { + link = tab[msgindex]; + if (!link) + continue; + + rcu_assign_pointer(tab[msgindex], NULL); + kfree_rcu(link, rcu); + } rtnl_unlock(); synchronize_net(); - while (refcount_read(&rtnl_msg_handlers_ref[protocol]) > 1) - schedule(); - kfree(handlers); + kfree(tab); } EXPORT_SYMBOL_GPL(rtnl_unregister_all); @@ -840,6 +904,10 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev, nla_total_size_64bit(sizeof(__u64)) + /* IFLA_VF_STATS_MULTICAST */ nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_RX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + + /* IFLA_VF_STATS_TX_DROPPED */ + nla_total_size_64bit(sizeof(__u64)) + nla_total_size(sizeof(struct ifla_vf_trust))); return size; } else @@ -920,8 +988,11 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + rtnl_xdp_size() /* IFLA_XDP */ + nla_total_size(4) /* IFLA_EVENT */ + nla_total_size(4) /* IFLA_NEW_NETNSID */ + + nla_total_size(4) /* IFLA_NEW_IFINDEX */ + nla_total_size(1) /* IFLA_PROTO_DOWN */ + nla_total_size(4) /* IFLA_IF_NETNSID */ + + nla_total_size(4) /* IFLA_CARRIER_UP_COUNT */ + + nla_total_size(4) /* IFLA_CARRIER_DOWN_COUNT */ + 0; } @@ -1194,7 +1265,11 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb, nla_put_u64_64bit(skb, IFLA_VF_STATS_BROADCAST, 
vf_stats.broadcast, IFLA_VF_STATS_PAD) || nla_put_u64_64bit(skb, IFLA_VF_STATS_MULTICAST, - vf_stats.multicast, IFLA_VF_STATS_PAD)) { + vf_stats.multicast, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_RX_DROPPED, + vf_stats.rx_dropped, IFLA_VF_STATS_PAD) || + nla_put_u64_64bit(skb, IFLA_VF_STATS_TX_DROPPED, + vf_stats.tx_dropped, IFLA_VF_STATS_PAD)) { nla_nest_cancel(skb, vfstats); goto nla_put_vf_failure; } @@ -1261,6 +1336,7 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) { const struct net_device_ops *ops = dev->netdev_ops; const struct bpf_prog *generic_xdp_prog; + struct netdev_bpf xdp; ASSERT_RTNL(); @@ -1273,7 +1349,10 @@ static u8 rtnl_xdp_attached_mode(struct net_device *dev, u32 *prog_id) if (!ops->ndo_bpf) return XDP_ATTACHED_NONE; - return __dev_xdp_attached(dev, ops->ndo_bpf, prog_id); + __dev_xdp_query(dev, ops->ndo_bpf, &xdp); + *prog_id = xdp.prog_id; + + return xdp.prog_attached; } static int rtnl_xdp_fill(struct sk_buff *skb, struct net_device *dev) @@ -1433,7 +1512,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, struct net *src_net, int type, u32 pid, u32 seq, u32 change, unsigned int flags, u32 ext_filter_mask, - u32 event, int *new_nsid, int tgt_netnsid) + u32 event, int *new_nsid, int new_ifindex, + int tgt_netnsid) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -1475,8 +1555,13 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_string(skb, IFLA_QDISC, dev->qdisc->ops->id)) || nla_put_ifalias(skb, dev) || nla_put_u32(skb, IFLA_CARRIER_CHANGES, - atomic_read(&dev->carrier_changes)) || - nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down)) + atomic_read(&dev->carrier_up_count) + + atomic_read(&dev->carrier_down_count)) || + nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down) || + nla_put_u32(skb, IFLA_CARRIER_UP_COUNT, + atomic_read(&dev->carrier_up_count)) || + nla_put_u32(skb, IFLA_CARRIER_DOWN_COUNT, + atomic_read(&dev->carrier_down_count))) goto nla_put_failure; if (event != IFLA_EVENT_NONE) { @@ -1525,6 +1610,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, if (new_nsid && nla_put_s32(skb, IFLA_NEW_NETNSID, *new_nsid) < 0) goto nla_put_failure; + if (new_ifindex && + nla_put_s32(skb, IFLA_NEW_IFINDEX, new_ifindex) < 0) + goto nla_put_failure; + rcu_read_lock(); if (rtnl_fill_link_af(skb, dev, ext_filter_mask)) @@ -1569,6 +1658,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_PROMISCUITY] = { .type = NLA_U32 }, [IFLA_NUM_TX_QUEUES] = { .type = NLA_U32 }, [IFLA_NUM_RX_QUEUES] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SEGS] = { .type = NLA_U32 }, + [IFLA_GSO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_PHYS_PORT_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, [IFLA_CARRIER_CHANGES] = { .type = NLA_U32 }, /* ignored */ [IFLA_PHYS_SWITCH_ID] = { .type = NLA_BINARY, .len = MAX_PHYS_ITEM_ID_LEN }, @@ -1578,6 +1669,8 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_EVENT] = { .type = NLA_U32 }, [IFLA_GROUP] = { .type = NLA_U32 }, [IFLA_IF_NETNSID] = { .type = NLA_S32 }, + [IFLA_CARRIER_UP_COUNT] = { .type = NLA_U32 }, + [IFLA_CARRIER_DOWN_COUNT] = { .type = NLA_U32 }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -1766,7 +1859,7 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) NETLINK_CB(cb->skb).portid, cb->nlh->nlmsg_seq, 0, flags, - ext_filter_mask, 0, NULL, + ext_filter_mask, 0, NULL, 0, netnsid); if (err < 0) { @@ -1815,6 +1908,49 @@ struct net *rtnl_link_get_net(struct net 
*src_net, struct nlattr *tb[]) } EXPORT_SYMBOL(rtnl_link_get_net); +/* Figure out which network namespace we are talking about by + * examining the link attributes in the following order: + * + * 1. IFLA_NET_NS_PID + * 2. IFLA_NET_NS_FD + * 3. IFLA_IF_NETNSID + */ +static struct net *rtnl_link_get_net_by_nlattr(struct net *src_net, + struct nlattr *tb[]) +{ + struct net *net; + + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) + return rtnl_link_get_net(src_net, tb); + + if (!tb[IFLA_IF_NETNSID]) + return get_net(src_net); + + net = get_net_ns_by_id(src_net, nla_get_u32(tb[IFLA_IF_NETNSID])); + if (!net) + return ERR_PTR(-EINVAL); + + return net; +} + +static struct net *rtnl_link_get_net_capable(const struct sk_buff *skb, + struct net *src_net, + struct nlattr *tb[], int cap) +{ + struct net *net; + + net = rtnl_link_get_net_by_nlattr(src_net, tb); + if (IS_ERR(net)) + return net; + + if (!netlink_ns_capable(skb, net->user_ns, cap)) { + put_net(net); + return ERR_PTR(-EPERM); + } + + return net; +} + static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[]) { if (dev) { @@ -2077,17 +2213,14 @@ static int do_setlink(const struct sk_buff *skb, const struct net_device_ops *ops = dev->netdev_ops; int err; - if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD]) { - struct net *net = rtnl_link_get_net(dev_net(dev), tb); + if (tb[IFLA_NET_NS_PID] || tb[IFLA_NET_NS_FD] || tb[IFLA_IF_NETNSID]) { + struct net *net = rtnl_link_get_net_capable(skb, dev_net(dev), + tb, CAP_NET_ADMIN); if (IS_ERR(net)) { err = PTR_ERR(net); goto errout; } - if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN)) { - put_net(net); - err = -EPERM; - goto errout; - } + err = dev_change_net_namespace(dev, net, ifname); put_net(net); if (err) @@ -2204,17 +2337,37 @@ static int do_setlink(const struct sk_buff *skb, if (tb[IFLA_TXQLEN]) { unsigned int value = nla_get_u32(tb[IFLA_TXQLEN]); - unsigned int orig_len = dev->tx_queue_len; - - if (dev->tx_queue_len ^ value) { - dev->tx_queue_len = value; - err = call_netdevice_notifiers( - NETDEV_CHANGE_TX_QUEUE_LEN, dev); - err = notifier_to_errno(err); - if (err) { - dev->tx_queue_len = orig_len; - goto errout; - } + + err = dev_change_tx_queue_len(dev, value); + if (err) + goto errout; + status |= DO_SETLINK_MODIFIED; + } + + if (tb[IFLA_GSO_MAX_SIZE]) { + u32 max_size = nla_get_u32(tb[IFLA_GSO_MAX_SIZE]); + + if (max_size > GSO_MAX_SIZE) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_size ^ max_size) { + netif_set_gso_max_size(dev, max_size); + status |= DO_SETLINK_MODIFIED; + } + } + + if (tb[IFLA_GSO_MAX_SEGS]) { + u32 max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); + + if (max_segs > GSO_MAX_SEGS) { + err = -EINVAL; + goto errout; + } + + if (dev->gso_max_segs ^ max_segs) { + dev->gso_max_segs = max_segs; status |= DO_SETLINK_MODIFIED; } } @@ -2400,9 +2553,6 @@ static int rtnl_setlink(struct sk_buff *skb, struct nlmsghdr *nlh, if (err < 0) goto errout; - if (tb[IFLA_IF_NETNSID]) - return -EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); else @@ -2487,36 +2637,53 @@ static int rtnl_dellink(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct net_device *dev; + struct net *tgt_net = net; + struct net_device *dev = NULL; struct ifinfomsg *ifm; char ifname[IFNAMSIZ]; struct nlattr *tb[IFLA_MAX+1]; int err; + int netnsid = -1; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy, extack); if (err < 0) return err; - if (tb[IFLA_IF_NETNSID]) - return 
-EOPNOTSUPP; - if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_IF_NETNSID]) { + netnsid = nla_get_s32(tb[IFLA_IF_NETNSID]); + tgt_net = get_target_net(NETLINK_CB(skb).sk, netnsid); + if (IS_ERR(tgt_net)) + return PTR_ERR(tgt_net); + } + + err = -EINVAL; ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) - dev = __dev_get_by_index(net, ifm->ifi_index); + dev = __dev_get_by_index(tgt_net, ifm->ifi_index); else if (tb[IFLA_IFNAME]) - dev = __dev_get_by_name(net, ifname); + dev = __dev_get_by_name(tgt_net, ifname); else if (tb[IFLA_GROUP]) - return rtnl_group_dellink(net, nla_get_u32(tb[IFLA_GROUP])); + err = rtnl_group_dellink(tgt_net, nla_get_u32(tb[IFLA_GROUP])); else - return -EINVAL; + goto out; - if (!dev) - return -ENODEV; + if (!dev) { + if (tb[IFLA_IFNAME] || ifm->ifi_index > 0) + err = -ENODEV; - return rtnl_delete_link(dev); + goto out; + } + + err = rtnl_delete_link(dev); + +out: + if (netnsid >= 0) + put_net(tgt_net); + + return err; } int rtnl_configure_link(struct net_device *dev, const struct ifinfomsg *ifm) @@ -2583,6 +2750,10 @@ struct net_device *rtnl_create_link(struct net *net, dev->link_mode = nla_get_u8(tb[IFLA_LINKMODE]); if (tb[IFLA_GROUP]) dev_set_group(dev, nla_get_u32(tb[IFLA_GROUP])); + if (tb[IFLA_GSO_MAX_SIZE]) + netif_set_gso_max_size(dev, nla_get_u32(tb[IFLA_GSO_MAX_SIZE])); + if (tb[IFLA_GSO_MAX_SEGS]) + dev->gso_max_segs = nla_get_u32(tb[IFLA_GSO_MAX_SEGS]); return dev; } @@ -2781,14 +2952,10 @@ replay: name_assign_type = NET_NAME_ENUM; } - dest_net = rtnl_link_get_net(net, tb); + dest_net = rtnl_link_get_net_capable(skb, net, tb, CAP_NET_ADMIN); if (IS_ERR(dest_net)) return PTR_ERR(dest_net); - err = -EPERM; - if (!netlink_ns_capable(skb, dest_net->user_ns, CAP_NET_ADMIN)) - goto out; - if (tb[IFLA_LINK_NETNSID]) { int id = nla_get_s32(tb[IFLA_LINK_NETNSID]); @@ -2915,7 +3082,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr *nlh, err = rtnl_fill_ifinfo(nskb, dev, net, RTM_NEWLINK, NETLINK_CB(skb).portid, nlh->nlmsg_seq, 0, 0, ext_filter_mask, - 0, NULL, netnsid); + 0, NULL, 0, netnsid); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -2973,18 +3140,26 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) s_idx = 1; for (idx = 1; idx <= RTNL_FAMILY_MAX; idx++) { + struct rtnl_link **tab; int type = cb->nlh->nlmsg_type-RTM_BASE; - struct rtnl_link *handlers; + struct rtnl_link *link; rtnl_dumpit_func dumpit; if (idx < s_idx || idx == PF_PACKET) continue; - handlers = rtnl_dereference(rtnl_msg_handlers[idx]); - if (!handlers) + if (type < 0 || type >= RTM_NR_MSGTYPES) continue; - dumpit = READ_ONCE(handlers[type].dumpit); + tab = rcu_dereference_rtnl(rtnl_msg_handlers[idx]); + if (!tab) + continue; + + link = tab[type]; + if (!link) + continue; + + dumpit = link->dumpit; if (!dumpit) continue; @@ -3003,7 +3178,8 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, unsigned int change, - u32 event, gfp_t flags, int *new_nsid) + u32 event, gfp_t flags, int *new_nsid, + int new_ifindex) { struct net *net = dev_net(dev); struct sk_buff *skb; @@ -3016,7 +3192,7 @@ struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, err = rtnl_fill_ifinfo(skb, dev, dev_net(dev), type, 0, 0, change, 0, 0, event, - new_nsid, -1); + new_nsid, new_ifindex, -1); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ 
-3039,14 +3215,15 @@ void rtmsg_ifinfo_send(struct sk_buff *skb, struct net_device *dev, gfp_t flags) static void rtmsg_ifinfo_event(int type, struct net_device *dev, unsigned int change, u32 event, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { struct sk_buff *skb; if (dev->reg_state != NETREG_REGISTERED) return; - skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid); + skb = rtmsg_ifinfo_build_skb(type, dev, change, event, flags, new_nsid, + new_ifindex); if (skb) rtmsg_ifinfo_send(skb, dev, flags); } @@ -3054,14 +3231,15 @@ static void rtmsg_ifinfo_event(int type, struct net_device *dev, void rtmsg_ifinfo(int type, struct net_device *dev, unsigned int change, gfp_t flags) { - rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, NULL); + rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, + NULL, 0); } void rtmsg_ifinfo_newnet(int type, struct net_device *dev, unsigned int change, - gfp_t flags, int *new_nsid) + gfp_t flags, int *new_nsid, int new_ifindex) { rtmsg_ifinfo_event(type, dev, change, rtnl_get_event(0), flags, - new_nsid); + new_nsid, new_ifindex); } static int nlmsg_populate_fdb_fill(struct sk_buff *skb, @@ -4314,7 +4492,8 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); - struct rtnl_link *handlers; + struct rtnl_link *link; + struct module *owner; int err = -EOPNOTSUPP; rtnl_doit_func doit; unsigned int flags; @@ -4338,79 +4517,85 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN)) return -EPERM; - if (family >= ARRAY_SIZE(rtnl_msg_handlers)) - family = PF_UNSPEC; - rcu_read_lock(); - handlers = rcu_dereference(rtnl_msg_handlers[family]); - if (!handlers) { - family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); - } - if (kind == 2 && nlh->nlmsg_flags&NLM_F_DUMP) { struct sock *rtnl; rtnl_dumpit_func dumpit; u16 min_dump_alloc = 0; - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) { + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[PF_UNSPEC]); - if (!handlers) - goto err_unlock; - - dumpit = READ_ONCE(handlers[type].dumpit); - if (!dumpit) + link = rtnl_get_link(family, type); + if (!link || !link->dumpit) goto err_unlock; } - - refcount_inc(&rtnl_msg_handlers_ref[family]); + owner = link->owner; + dumpit = link->dumpit; if (type == RTM_GETLINK - RTM_BASE) min_dump_alloc = rtnl_calcit(skb, nlh); + err = 0; + /* need to do this before rcu_read_unlock() */ + if (!try_module_get(owner)) + err = -EPROTONOSUPPORT; + rcu_read_unlock(); rtnl = net->rtnl; - { + if (err == 0) { struct netlink_dump_control c = { .dump = dumpit, .min_dump_alloc = min_dump_alloc, + .module = owner, }; err = netlink_dump_start(rtnl, skb, nlh, &c); + /* netlink_dump_start() will keep a reference on + * module if dump is still in progress. 
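+ * The resulting lifetime rule, roughly (illustrative ordering, not
+ * extra code to add):
+ *
+ *	try_module_get(owner);		taken under rcu_read_lock()
+ *	rcu_read_unlock();
+ *	netlink_dump_start(...);	takes its own ref via c.module
+ *	module_put(owner);		ours is no longer needed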
+ */ + module_put(owner); } - refcount_dec(&rtnl_msg_handlers_ref[family]); return err; } - doit = READ_ONCE(handlers[type].doit); - if (!doit) { + link = rtnl_get_link(family, type); + if (!link || !link->doit) { family = PF_UNSPEC; - handlers = rcu_dereference(rtnl_msg_handlers[family]); + link = rtnl_get_link(PF_UNSPEC, type); + if (!link || !link->doit) + goto out_unlock; + } + + owner = link->owner; + if (!try_module_get(owner)) { + err = -EPROTONOSUPPORT; + goto out_unlock; } - flags = READ_ONCE(handlers[type].flags); + flags = link->flags; if (flags & RTNL_FLAG_DOIT_UNLOCKED) { - refcount_inc(&rtnl_msg_handlers_ref[family]); - doit = READ_ONCE(handlers[type].doit); + doit = link->doit; rcu_read_unlock(); if (doit) err = doit(skb, nlh, extack); - refcount_dec(&rtnl_msg_handlers_ref[family]); + module_put(owner); return err; } - rcu_read_unlock(); rtnl_lock(); - handlers = rtnl_dereference(rtnl_msg_handlers[family]); - if (handlers) { - doit = READ_ONCE(handlers[type].doit); - if (doit) - err = doit(skb, nlh, extack); - } + link = rtnl_get_link(family, type); + if (link && link->doit) + err = link->doit(skb, nlh, extack); rtnl_unlock(); + + module_put(owner); + + return err; + +out_unlock: + rcu_read_unlock(); return err; err_unlock: @@ -4454,7 +4639,7 @@ static int rtnetlink_event(struct notifier_block *this, unsigned long event, voi case NETDEV_CHANGELOWERSTATE: case NETDEV_CHANGE_TX_QUEUE_LEN: rtmsg_ifinfo_event(RTM_NEWLINK, dev, 0, rtnl_get_event(event), - GFP_KERNEL, NULL); + GFP_KERNEL, NULL, 0); break; default: break; @@ -4498,11 +4683,6 @@ static struct pernet_operations rtnetlink_net_ops = { void __init rtnetlink_init(void) { - int i; - - for (i = 0; i < ARRAY_SIZE(rtnl_msg_handlers_ref); i++) - refcount_set(&rtnl_msg_handlers_ref[i], 1); - if (register_pernet_subsys(&rtnetlink_net_ops)) panic("rtnetlink_init: cannot initialize rtnetlink\n"); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 08f5740..01e8285 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3656,6 +3656,10 @@ normal: skb_shinfo(nskb)->tx_flags |= skb_shinfo(head_skb)->tx_flags & SKBTX_SHARED_FRAG; + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) + goto err; + while (pos < offset + len) { if (i >= nfrags) { BUG_ON(skb_headlen(list_skb)); @@ -3667,6 +3671,11 @@ normal: BUG_ON(!nfrags); + if (skb_orphan_frags(frag_skb, GFP_ATOMIC) || + skb_zerocopy_clone(nskb, frag_skb, + GFP_ATOMIC)) + goto err; + list_skb = list_skb->next; } @@ -3678,11 +3687,6 @@ normal: goto err; } - if (unlikely(skb_orphan_frags(frag_skb, GFP_ATOMIC))) - goto err; - if (skb_zerocopy_clone(nskb, frag_skb, GFP_ATOMIC)) - goto err; - *nskb_frag = *frag; __skb_frag_ref(nskb_frag); size = skb_frag_size(nskb_frag); diff --git a/net/core/sock.c b/net/core/sock.c index 1211159..1033f8a 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -145,6 +145,8 @@ static DEFINE_MUTEX(proto_list_mutex); static LIST_HEAD(proto_list); +static void sock_inuse_add(struct net *net, int val); + /** * sk_ns_capable - General socket capability test * @sk: Socket to use a capability on or through @@ -1531,8 +1533,11 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, sk->sk_kern_sock = kern; sock_lock_init(sk); sk->sk_net_refcnt = kern ? 
0 : 1; - if (likely(sk->sk_net_refcnt)) + if (likely(sk->sk_net_refcnt)) { get_net(net); + sock_inuse_add(net, 1); + } + sock_net_set(sk, net); refcount_set(&sk->sk_wmem_alloc, 1); @@ -1595,6 +1600,9 @@ void sk_destruct(struct sock *sk) static void __sk_free(struct sock *sk) { + if (likely(sk->sk_net_refcnt)) + sock_inuse_add(sock_net(sk), -1); + if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt)) sock_diag_broadcast_destroy(sk); else @@ -1716,6 +1724,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); + if (likely(newsk->sk_net_refcnt)) + sock_inuse_add(sock_net(newsk), 1); /* * Before updating sk_refcnt, we must commit prior changes to memory @@ -3045,7 +3055,7 @@ static DECLARE_BITMAP(proto_inuse_idx, PROTO_INUSE_NR); void sock_prot_inuse_add(struct net *net, struct proto *prot, int val) { - __this_cpu_add(net->core.inuse->val[prot->inuse_idx], val); + __this_cpu_add(net->core.prot_inuse->val[prot->inuse_idx], val); } EXPORT_SYMBOL_GPL(sock_prot_inuse_add); @@ -3055,21 +3065,50 @@ int sock_prot_inuse_get(struct net *net, struct proto *prot) int res = 0; for_each_possible_cpu(cpu) - res += per_cpu_ptr(net->core.inuse, cpu)->val[idx]; + res += per_cpu_ptr(net->core.prot_inuse, cpu)->val[idx]; return res >= 0 ? res : 0; } EXPORT_SYMBOL_GPL(sock_prot_inuse_get); +static void sock_inuse_add(struct net *net, int val) +{ + this_cpu_add(*net->core.sock_inuse, val); +} + +int sock_inuse_get(struct net *net) +{ + int cpu, res = 0; + + for_each_possible_cpu(cpu) + res += *per_cpu_ptr(net->core.sock_inuse, cpu); + + return res; +} + +EXPORT_SYMBOL_GPL(sock_inuse_get); + static int __net_init sock_inuse_init_net(struct net *net) { - net->core.inuse = alloc_percpu(struct prot_inuse); - return net->core.inuse ? 
0 : -ENOMEM; + net->core.prot_inuse = alloc_percpu(struct prot_inuse); + if (net->core.prot_inuse == NULL) + return -ENOMEM; + + net->core.sock_inuse = alloc_percpu(int); + if (net->core.sock_inuse == NULL) + goto out; + + return 0; + +out: + free_percpu(net->core.prot_inuse); + return -ENOMEM; } static void __net_exit sock_inuse_exit_net(struct net *net) { - free_percpu(net->core.inuse); + free_percpu(net->core.prot_inuse); + free_percpu(net->core.sock_inuse); } static struct pernet_operations net_inuse_ops = { @@ -3112,6 +3151,10 @@ static inline void assign_proto_idx(struct proto *prot) static inline void release_proto_idx(struct proto *prot) { } + +static void sock_inuse_add(struct net *net, int val) +{ +} #endif static void req_prot_cleanup(struct request_sock_ops *rsk_prot) @@ -3319,7 +3362,6 @@ static int proto_seq_open(struct inode *inode, struct file *file) } static const struct file_operations proto_seq_fops = { - .owner = THIS_MODULE, .open = proto_seq_open, .read = seq_read, .llseek = seq_lseek, diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5eeb1d2..c5bb52b 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -235,7 +235,9 @@ struct sock *reuseport_select_sock(struct sock *sk, if (prog && skb) sk2 = run_bpf(reuse, socks, prog, skb, hdr_len); - else + + /* no bpf or invalid bpf result: fall back to hash usage */ + if (!sk2) sk2 = reuse->socks[reciprocal_scale(hash, socks)]; } diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index a47ad6c..f2d0462 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -25,6 +25,7 @@ static int zero = 0; static int one = 1; +static int two __maybe_unused = 2; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; @@ -250,6 +251,46 @@ static int proc_do_rss_key(struct ctl_table *table, int write, return proc_dostring(&fake_table, write, buffer, lenp, ppos); } +#ifdef CONFIG_BPF_JIT +static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + int ret, jit_enable = *(int *)table->data; + struct ctl_table tmp = *table; + + if (write && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + tmp.data = &jit_enable; + ret = proc_dointvec_minmax(&tmp, write, buffer, lenp, ppos); + if (write && !ret) { + if (jit_enable < 2 || + (jit_enable == 2 && bpf_dump_raw_ok())) { + *(int *)table->data = jit_enable; + if (jit_enable == 2) + pr_warn("bpf_jit_enable = 2 was set! 
NEVER use this in production, only for JIT debugging!\n"); + } else { + ret = -EPERM; + } + } + return ret; +} + +# ifdef CONFIG_HAVE_EBPF_JIT +static int +proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, + loff_t *ppos) +{ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + return proc_dointvec_minmax(table, write, buffer, lenp, ppos); +} +# endif +#endif + static struct ctl_table net_core_table[] = { #ifdef CONFIG_NET { @@ -325,13 +366,14 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_enable, .maxlen = sizeof(int), .mode = 0644, -#ifndef CONFIG_BPF_JIT_ALWAYS_ON - .proc_handler = proc_dointvec -#else - .proc_handler = proc_dointvec_minmax, + .proc_handler = proc_dointvec_minmax_bpf_enable, +# ifdef CONFIG_BPF_JIT_ALWAYS_ON .extra1 = &one, .extra2 = &one, -#endif +# else + .extra1 = &zero, + .extra2 = &two, +# endif }, # ifdef CONFIG_HAVE_EBPF_JIT { @@ -339,14 +381,18 @@ static struct ctl_table net_core_table[] = { .data = &bpf_jit_harden, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &two, }, { .procname = "bpf_jit_kallsyms", .data = &bpf_jit_kallsyms, .maxlen = sizeof(int), .mode = 0600, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &zero, + .extra2 = &one, }, # endif #endif diff --git a/net/core/xdp.c b/net/core/xdp.c new file mode 100644 index 0000000..097a0f7 --- /dev/null +++ b/net/core/xdp.c @@ -0,0 +1,73 @@ +/* net/core/xdp.c + * + * Copyright (c) 2017 Jesper Dangaard Brouer, Red Hat Inc. + * Released under terms in GPL version 2. See COPYING. + */ +#include <linux/types.h> +#include <linux/mm.h> + +#include <net/xdp.h> + +#define REG_STATE_NEW 0x0 +#define REG_STATE_REGISTERED 0x1 +#define REG_STATE_UNREGISTERED 0x2 +#define REG_STATE_UNUSED 0x3 + +void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq) +{ + /* Simplify driver cleanup code paths, allow unreg "unused" */ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) + return; + + WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG"); + + xdp_rxq->reg_state = REG_STATE_UNREGISTERED; + xdp_rxq->dev = NULL; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg); + +static void xdp_rxq_info_init(struct xdp_rxq_info *xdp_rxq) +{ + memset(xdp_rxq, 0, sizeof(*xdp_rxq)); +} + +/* Returns 0 on success, negative on failure */ +int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq, + struct net_device *dev, u32 queue_index) +{ + if (xdp_rxq->reg_state == REG_STATE_UNUSED) { + WARN(1, "Driver promised not to register this"); + return -EINVAL; + } + + if (xdp_rxq->reg_state == REG_STATE_REGISTERED) { + WARN(1, "Missing unregister, handled but fix driver"); + xdp_rxq_info_unreg(xdp_rxq); + } + + if (!dev) { + WARN(1, "Missing net_device from driver"); + return -ENODEV; + } + + /* State either UNREGISTERED or NEW */ + xdp_rxq_info_init(xdp_rxq); + xdp_rxq->dev = dev; + xdp_rxq->queue_index = queue_index; + + xdp_rxq->reg_state = REG_STATE_REGISTERED; + return 0; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_reg); + +void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq) +{ + xdp_rxq->reg_state = REG_STATE_UNUSED; +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_unused); + +bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq) +{ + return (xdp_rxq->reg_state == REG_STATE_REGISTERED); +} +EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg); |
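
A minimal driver-side sketch of how the new xdp_rxq_info API above is meant to be used, assuming a hypothetical driver that embeds struct xdp_rxq_info in its per-queue RX-ring state (the my_* names are illustrative, not part of this patch):

#include <linux/netdevice.h>
#include <net/xdp.h>

/* Hypothetical per-ring state; only the xdp_rxq member comes from this patch */
struct my_rx_ring {
	struct net_device *netdev;
	u32 queue_index;
	struct xdp_rxq_info xdp_rxq;
};

static int my_setup_rx_ring(struct my_rx_ring *ring)
{
	int err;

	/* Register before the ring can receive packets; xdp_rxq_info_reg()
	 * rejects a NULL net_device (-ENODEV) and WARNs-then-recovers if a
	 * previous registration was never torn down.
	 */
	err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
			       ring->queue_index);
	if (err)
		return err;

	WARN_ON(!xdp_rxq_info_is_reg(&ring->xdp_rxq));
	return 0;
}

static void my_free_rx_ring(struct my_rx_ring *ring)
{
	/* Safe to call unconditionally: unregistering a ring marked via
	 * xdp_rxq_info_unused() is an accepted no-op.
	 */
	xdp_rxq_info_unreg(&ring->xdp_rxq);
}

The REG_STATE_* tracking makes double registration and missing unregistration loud (WARN) but recoverable, which is why teardown paths can call xdp_rxq_info_unreg() without conditionals.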
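Likewise, the per-namespace socket counter added to sock.c is maintained by the static sock_inuse_add() helper and read back through the exported sock_inuse_get(). A sketch of a possible reader, assuming a seq_file whose private pointer was set to the relevant struct net by its open handler (my_sockstat_show is hypothetical; the in-tree consumer is not part of these hunks):

#include <linux/seq_file.h>
#include <net/net_namespace.h>
#include <net/sock.h>

static int my_sockstat_show(struct seq_file *seq, void *v)
{
	struct net *net = seq->private;

	/* Sums the per-cpu counts bumped in sk_alloc()/sk_clone_lock()
	 * and dropped in __sk_free(); kernel sockets (sk_net_refcnt == 0)
	 * are deliberately excluded from the accounting.
	 */
	seq_printf(seq, "sockets: used %d\n", sock_inuse_get(net));
	return 0;
}

Note that the empty sock_inuse_add() stub added beside the empty release_proto_idx() keeps callers building in configurations without the proto-inuse machinery.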