diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-04 12:26:43 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2017-05-04 12:26:43 -0700 |
commit | 4ac4d584886a4f47f8ff3bca0f32ff9a2987d3e5 (patch) | |
tree | f5fa439ec5f73d6b5cf9a5a3ffe9b53f70fa27b4 | |
parent | 8d5e72dfdf0fa29a21143fd72746c6f43295ce9f (diff) | |
parent | 842be75c77cb72ee546a2b19da9c285fb3ded660 (diff) | |
download | op-kernel-dev-4ac4d584886a4f47f8ff3bca0f32ff9a2987d3e5.zip op-kernel-dev-4ac4d584886a4f47f8ff3bca0f32ff9a2987d3e5.tar.gz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net
Pull networking fixes from David Miller:
1) The wireless rate info fix from Johannes Berg.
2) When a RAW socket is in hdrincl mode, we need to make sure that the
user provided at least a minimally sized ipv4/ipv6 header. Fix from
Alexander Potapenko.
3) We must emit IFLA_PHYS_PORT_NAME netlink attributes using
nla_put_string() so that it is NULL terminated.
4) Fix a bug in TCP fastopen handling, wherein child sockets
erroneously inherit the fastopen_req from the parent, and later can
end up derefencing freed memory or doing a double free. From Eric
Dumazet.
5) Don't clear out netdev stats at close time in tg3 driver, from
YueHaibing.
6) Fix refcount leak in xt_CT, from Gao Feng.
7) In nft_set_bitmap() don't leak dummy elements, from Liping Zhang.
8) Fix deadlock due to taking the expectation lock twice, also from
Liping Zhang.
9) Make xt_socket work again with ipv6, from Peter Tirsek.
10) Don't allow IPV6 to be used with IPVS if ipv6.disable=1, from Paolo
Abeni.
11) Make the BPF loader more flexible wrt. changes to the bpf MAP entry
layout. From Jesper Dangaard Brouer.
12) Fix ethtool reported device name in aquantia driver, from Pavel
Belous.
13) Fix build failures due to the compile time size test not working in
netfilter conntrack. From Geert Uytterhoeven.
* git://git.kernel.org/pub/scm/linux/kernel/git/davem/net: (52 commits)
cfg80211: make RATE_INFO_BW_20 the default
ipv6: initialize route null entry in addrconf_init()
qede: Fix possible misconfiguration of advertised autoneg value.
qed: Fix overriding of supported autoneg value.
qed*: Fix possible overflow for status block id field.
rtnetlink: NUL-terminate IFLA_PHYS_PORT_NAME string
netvsc: make sure napi enabled before vmbus_open
aquantia: Fix driver name reported by ethtool
ipv4, ipv6: ensure raw socket message is big enough to hold an IP header
net/sched: remove redundant null check on head
tcp: do not inherit fastopen_req from parent
forcedeth: remove unnecessary carrier status check
ibmvnic: Move queue restarting in ibmvnic_tx_complete
ibmvnic: Record SKB RX queue during poll
ibmvnic: Continue skb processing after skb completion error
ibmvnic: Check for driver reset first in ibmvnic_xmit
ibmvnic: Wait for any pending scrqs entries at driver close
ibmvnic: Clean up tx pools when closing
ibmvnic: Whitespace correction in release_rx_pools
ibmvnic: Delete napi's when releasing driver resources
...
54 files changed, 1174 insertions, 437 deletions
diff --git a/MAINTAINERS b/MAINTAINERS index f72bf45..3f9b7d5 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8782,14 +8782,16 @@ F: drivers/net/ethernet/neterion/ NETFILTER M: Pablo Neira Ayuso <pablo@netfilter.org> M: Jozsef Kadlecsik <kadlec@blackhole.kfki.hu> +M: Florian Westphal <fw@strlen.de> L: netfilter-devel@vger.kernel.org L: coreteam@netfilter.org W: http://www.netfilter.org/ W: http://www.iptables.org/ +W: http://www.nftables.org/ Q: http://patchwork.ozlabs.org/project/netfilter-devel/list/ T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git T: git git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next.git -S: Supported +S: Maintained F: include/linux/netfilter* F: include/linux/netfilter/ F: include/net/netfilter/ diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h index 5f99237..2149864 100644 --- a/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h +++ b/drivers/net/ethernet/aquantia/atlantic/aq_cfg.h @@ -68,7 +68,7 @@ #define AQ_CFG_DRV_AUTHOR "aQuantia" #define AQ_CFG_DRV_DESC "aQuantia Corporation(R) Network Driver" -#define AQ_CFG_DRV_NAME "aquantia" +#define AQ_CFG_DRV_NAME "atlantic" #define AQ_CFG_DRV_VERSION __stringify(NIC_MAJOR_DRIVER_VERSION)"."\ __stringify(NIC_MINOR_DRIVER_VERSION)"."\ __stringify(NIC_BUILD_DRIVER_VERSION)"."\ diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c index f395b95..537d571 100644 --- a/drivers/net/ethernet/broadcom/tg3.c +++ b/drivers/net/ethernet/broadcom/tg3.c @@ -11729,10 +11729,6 @@ static int tg3_close(struct net_device *dev) tg3_stop(tp); - /* Clear stats across close / open calls */ - memset(&tp->net_stats_prev, 0, sizeof(tp->net_stats_prev)); - memset(&tp->estats_prev, 0, sizeof(tp->estats_prev)); - if (pci_device_is_present(tp->pdev)) { tg3_power_down_prepare(tp); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 4fcd2f0..4f2d329 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -194,7 +194,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, if (!ltb->buff) return; - if (!adapter->failover) + if (adapter->reset_reason != VNIC_RESET_FAILOVER && + adapter->reset_reason != VNIC_RESET_MOBILITY) send_request_unmap(adapter, ltb->map_id); dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); } @@ -292,9 +293,6 @@ static void replenish_pools(struct ibmvnic_adapter *adapter) { int i; - if (adapter->migrated) - return; - adapter->replenish_task_cycles++; for (i = 0; i < be32_to_cpu(adapter->login_rsp_buf->num_rxadd_subcrqs); i++) { @@ -350,7 +348,7 @@ static void release_rx_pools(struct ibmvnic_adapter *adapter) free_long_term_buff(adapter, &rx_pool->long_term_buff); if (!rx_pool->rx_buff) - continue; + continue; for (j = 0; j < rx_pool->size; j++) { if (rx_pool->rx_buff[j].skb) { @@ -554,11 +552,20 @@ static int ibmvnic_login(struct net_device *netdev) static void release_resources(struct ibmvnic_adapter *adapter) { + int i; + release_tx_pools(adapter); release_rx_pools(adapter); release_stats_token(adapter); release_error_buffers(adapter); + + if (adapter->napi) { + for (i = 0; i < adapter->req_rx_queues; i++) { + if (&adapter->napi[i]) + netif_napi_del(&adapter->napi[i]); + } + } } static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) @@ -569,11 +576,6 @@ static int set_link_state(struct ibmvnic_adapter *adapter, u8 link_state) bool resend; int rc; - if (adapter->logical_link_state == link_state) { - netdev_dbg(netdev, "Link state already %d\n", link_state); - return 0; - } - netdev_err(netdev, "setting link state %d\n", link_state); memset(&crq, 0, sizeof(crq)); crq.logical_link_state.first = IBMVNIC_CRQ_CMD; @@ -624,22 +626,10 @@ static int set_real_num_queues(struct net_device *netdev) return rc; } -static int ibmvnic_open(struct net_device *netdev) +static int init_resources(struct ibmvnic_adapter *adapter) { - struct ibmvnic_adapter *adapter = netdev_priv(netdev); - struct device *dev = &adapter->vdev->dev; - int rc = 0; - int i; - - if (adapter->is_closed) { - rc = ibmvnic_init(adapter); - if (rc) - return rc; - } - - rc = ibmvnic_login(netdev); - if (rc) - return rc; + struct net_device *netdev = adapter->netdev; + int i, rc; rc = set_real_num_queues(netdev); if (rc) @@ -647,7 +637,7 @@ static int ibmvnic_open(struct net_device *netdev) rc = init_sub_crq_irqs(adapter); if (rc) { - dev_err(dev, "failed to initialize sub crq irqs\n"); + netdev_err(netdev, "failed to initialize sub crq irqs\n"); return -1; } @@ -659,90 +649,184 @@ static int ibmvnic_open(struct net_device *netdev) adapter->napi = kcalloc(adapter->req_rx_queues, sizeof(struct napi_struct), GFP_KERNEL); if (!adapter->napi) - goto ibmvnic_open_fail; + return -ENOMEM; + for (i = 0; i < adapter->req_rx_queues; i++) { netif_napi_add(netdev, &adapter->napi[i], ibmvnic_poll, NAPI_POLL_WEIGHT); - napi_enable(&adapter->napi[i]); } send_map_query(adapter); rc = init_rx_pools(netdev); if (rc) - goto ibmvnic_open_fail; + return rc; rc = init_tx_pools(netdev); - if (rc) - goto ibmvnic_open_fail; + return rc; +} + +static int __ibmvnic_open(struct net_device *netdev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + enum vnic_state prev_state = adapter->state; + int i, rc; + adapter->state = VNIC_OPENING; replenish_pools(adapter); + for (i = 0; i < adapter->req_rx_queues; i++) + napi_enable(&adapter->napi[i]); + /* We're ready to receive frames, enable the sub-crq interrupts and * set the logical link state to up */ - for (i = 0; i < adapter->req_rx_queues; i++) - enable_scrq_irq(adapter, adapter->rx_scrq[i]); + for (i = 0; i < adapter->req_rx_queues; i++) { + if (prev_state == VNIC_CLOSED) + enable_irq(adapter->rx_scrq[i]->irq); + else + enable_scrq_irq(adapter, adapter->rx_scrq[i]); + } - for (i = 0; i < adapter->req_tx_queues; i++) - enable_scrq_irq(adapter, adapter->tx_scrq[i]); + for (i = 0; i < adapter->req_tx_queues; i++) { + if (prev_state == VNIC_CLOSED) + enable_irq(adapter->tx_scrq[i]->irq); + else + enable_scrq_irq(adapter, adapter->tx_scrq[i]); + } rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_UP); - if (rc) - goto ibmvnic_open_fail; + if (rc) { + for (i = 0; i < adapter->req_rx_queues; i++) + napi_disable(&adapter->napi[i]); + release_resources(adapter); + return rc; + } netif_tx_start_all_queues(netdev); - adapter->is_closed = false; - return 0; + if (prev_state == VNIC_CLOSED) { + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + } -ibmvnic_open_fail: - for (i = 0; i < adapter->req_rx_queues; i++) - napi_disable(&adapter->napi[i]); - release_resources(adapter); - return -ENOMEM; + adapter->state = VNIC_OPEN; + return rc; } -static void disable_sub_crqs(struct ibmvnic_adapter *adapter) +static int ibmvnic_open(struct net_device *netdev) { - int i; + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int rc; - if (adapter->tx_scrq) { - for (i = 0; i < adapter->req_tx_queues; i++) - if (adapter->tx_scrq[i]) - disable_irq(adapter->tx_scrq[i]->irq); + mutex_lock(&adapter->reset_lock); + + if (adapter->state != VNIC_CLOSED) { + rc = ibmvnic_login(netdev); + if (rc) { + mutex_unlock(&adapter->reset_lock); + return rc; + } + + rc = init_resources(adapter); + if (rc) { + netdev_err(netdev, "failed to initialize resources\n"); + release_resources(adapter); + mutex_unlock(&adapter->reset_lock); + return rc; + } } - if (adapter->rx_scrq) { - for (i = 0; i < adapter->req_rx_queues; i++) - if (adapter->rx_scrq[i]) - disable_irq(adapter->rx_scrq[i]->irq); + rc = __ibmvnic_open(netdev); + mutex_unlock(&adapter->reset_lock); + + return rc; +} + +static void clean_tx_pools(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_tx_pool *tx_pool; + u64 tx_entries; + int tx_scrqs; + int i, j; + + if (!adapter->tx_pool) + return; + + tx_scrqs = be32_to_cpu(adapter->login_rsp_buf->num_txsubm_subcrqs); + tx_entries = adapter->req_tx_entries_per_subcrq; + + /* Free any remaining skbs in the tx buffer pools */ + for (i = 0; i < tx_scrqs; i++) { + tx_pool = &adapter->tx_pool[i]; + if (!tx_pool) + continue; + + for (j = 0; j < tx_entries; j++) { + if (tx_pool->tx_buff[j].skb) { + dev_kfree_skb_any(tx_pool->tx_buff[j].skb); + tx_pool->tx_buff[j].skb = NULL; + } + } } } -static int ibmvnic_close(struct net_device *netdev) +static int __ibmvnic_close(struct net_device *netdev) { struct ibmvnic_adapter *adapter = netdev_priv(netdev); int rc = 0; int i; - adapter->closing = true; - disable_sub_crqs(adapter); + adapter->state = VNIC_CLOSING; + netif_tx_stop_all_queues(netdev); if (adapter->napi) { for (i = 0; i < adapter->req_rx_queues; i++) napi_disable(&adapter->napi[i]); } - if (!adapter->failover) - netif_tx_stop_all_queues(netdev); + clean_tx_pools(adapter); + + if (adapter->tx_scrq) { + for (i = 0; i < adapter->req_tx_queues; i++) + if (adapter->tx_scrq[i]->irq) + disable_irq(adapter->tx_scrq[i]->irq); + } rc = set_link_state(adapter, IBMVNIC_LOGICAL_LNK_DN); + if (rc) + return rc; - release_resources(adapter); + if (adapter->rx_scrq) { + for (i = 0; i < adapter->req_rx_queues; i++) { + int retries = 10; + + while (pending_scrq(adapter, adapter->rx_scrq[i])) { + retries--; + mdelay(100); + + if (retries == 0) + break; + } + + if (adapter->rx_scrq[i]->irq) + disable_irq(adapter->rx_scrq[i]->irq); + } + } + + adapter->state = VNIC_CLOSED; + return rc; +} + +static int ibmvnic_close(struct net_device *netdev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(netdev); + int rc; + + mutex_lock(&adapter->reset_lock); + rc = __ibmvnic_close(netdev); + mutex_unlock(&adapter->reset_lock); - adapter->is_closed = true; - adapter->closing = false; return rc; } @@ -901,13 +985,7 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) int index = 0; int ret = 0; - tx_pool = &adapter->tx_pool[queue_num]; - tx_scrq = adapter->tx_scrq[queue_num]; - txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb)); - handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + - be32_to_cpu(adapter->login_rsp_buf-> - off_txsubm_subcrqs)); - if (adapter->migrated) { + if (adapter->resetting) { if (!netif_subqueue_stopped(netdev, skb)) netif_stop_subqueue(netdev, queue_num); dev_kfree_skb_any(skb); @@ -918,6 +996,12 @@ static int ibmvnic_xmit(struct sk_buff *skb, struct net_device *netdev) goto out; } + tx_pool = &adapter->tx_pool[queue_num]; + tx_scrq = adapter->tx_scrq[queue_num]; + txq = netdev_get_tx_queue(netdev, skb_get_queue_mapping(skb)); + handle_array = (u64 *)((u8 *)(adapter->login_rsp_buf) + + be32_to_cpu(adapter->login_rsp_buf->off_txsubm_subcrqs)); + index = tx_pool->free_map[tx_pool->consumer_index]; offset = index * adapter->req_mtu; dst = tx_pool->long_term_buff.buff + offset; @@ -1099,18 +1183,185 @@ static int ibmvnic_set_mac(struct net_device *netdev, void *p) return 0; } -static void ibmvnic_tx_timeout(struct net_device *dev) +/** + * do_reset returns zero if we are able to keep processing reset events, or + * non-zero if we hit a fatal error and must halt. + */ +static int do_reset(struct ibmvnic_adapter *adapter, + struct ibmvnic_rwi *rwi, u32 reset_state) { - struct ibmvnic_adapter *adapter = netdev_priv(dev); - int rc; + struct net_device *netdev = adapter->netdev; + int i, rc; + + netif_carrier_off(netdev); + adapter->reset_reason = rwi->reset_reason; + + if (rwi->reset_reason == VNIC_RESET_MOBILITY) { + rc = ibmvnic_reenable_crq_queue(adapter); + if (rc) + return 0; + } - /* Adapter timed out, resetting it */ + rc = __ibmvnic_close(netdev); + if (rc) + return rc; + + /* remove the closed state so when we call open it appears + * we are coming from the probed state. + */ + adapter->state = VNIC_PROBED; + + release_resources(adapter); release_sub_crqs(adapter); - rc = ibmvnic_reset_crq(adapter); + release_crq_queue(adapter); + + rc = ibmvnic_init(adapter); if (rc) - dev_err(&adapter->vdev->dev, "Adapter timeout, reset failed\n"); - else - ibmvnic_send_crq_init(adapter); + return 0; + + /* If the adapter was in PROBE state prior to the reset, exit here. */ + if (reset_state == VNIC_PROBED) + return 0; + + rc = ibmvnic_login(netdev); + if (rc) { + adapter->state = VNIC_PROBED; + return 0; + } + + rtnl_lock(); + rc = init_resources(adapter); + rtnl_unlock(); + if (rc) + return rc; + + if (reset_state == VNIC_CLOSED) + return 0; + + rc = __ibmvnic_open(netdev); + if (rc) { + if (list_empty(&adapter->rwi_list)) + adapter->state = VNIC_CLOSED; + else + adapter->state = reset_state; + + return 0; + } + + netif_carrier_on(netdev); + + /* kick napi */ + for (i = 0; i < adapter->req_rx_queues; i++) + napi_schedule(&adapter->napi[i]); + + return 0; +} + +static struct ibmvnic_rwi *get_next_rwi(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_rwi *rwi; + + mutex_lock(&adapter->rwi_lock); + + if (!list_empty(&adapter->rwi_list)) { + rwi = list_first_entry(&adapter->rwi_list, struct ibmvnic_rwi, + list); + list_del(&rwi->list); + } else { + rwi = NULL; + } + + mutex_unlock(&adapter->rwi_lock); + return rwi; +} + +static void free_all_rwi(struct ibmvnic_adapter *adapter) +{ + struct ibmvnic_rwi *rwi; + + rwi = get_next_rwi(adapter); + while (rwi) { + kfree(rwi); + rwi = get_next_rwi(adapter); + } +} + +static void __ibmvnic_reset(struct work_struct *work) +{ + struct ibmvnic_rwi *rwi; + struct ibmvnic_adapter *adapter; + struct net_device *netdev; + u32 reset_state; + int rc; + + adapter = container_of(work, struct ibmvnic_adapter, ibmvnic_reset); + netdev = adapter->netdev; + + mutex_lock(&adapter->reset_lock); + adapter->resetting = true; + reset_state = adapter->state; + + rwi = get_next_rwi(adapter); + while (rwi) { + rc = do_reset(adapter, rwi, reset_state); + kfree(rwi); + if (rc) + break; + + rwi = get_next_rwi(adapter); + } + + if (rc) { + free_all_rwi(adapter); + return; + } + + adapter->resetting = false; + mutex_unlock(&adapter->reset_lock); +} + +static void ibmvnic_reset(struct ibmvnic_adapter *adapter, + enum ibmvnic_reset_reason reason) +{ + struct ibmvnic_rwi *rwi, *tmp; + struct net_device *netdev = adapter->netdev; + struct list_head *entry; + + if (adapter->state == VNIC_REMOVING || + adapter->state == VNIC_REMOVED) { + netdev_dbg(netdev, "Adapter removing, skipping reset\n"); + return; + } + + mutex_lock(&adapter->rwi_lock); + + list_for_each(entry, &adapter->rwi_list) { + tmp = list_entry(entry, struct ibmvnic_rwi, list); + if (tmp->reset_reason == reason) { + netdev_err(netdev, "Matching reset found, skipping\n"); + mutex_unlock(&adapter->rwi_lock); + return; + } + } + + rwi = kzalloc(sizeof(*rwi), GFP_KERNEL); + if (!rwi) { + mutex_unlock(&adapter->rwi_lock); + ibmvnic_close(netdev); + return; + } + + rwi->reset_reason = reason; + list_add_tail(&rwi->list, &adapter->rwi_list); + mutex_unlock(&adapter->rwi_lock); + schedule_work(&adapter->ibmvnic_reset); +} + +static void ibmvnic_tx_timeout(struct net_device *dev) +{ + struct ibmvnic_adapter *adapter = netdev_priv(dev); + + ibmvnic_reset(adapter, VNIC_RESET_TIMEOUT); } static void remove_buff_from_pool(struct ibmvnic_adapter *adapter, @@ -1153,7 +1404,7 @@ restart_poll: /* free the entry */ next->rx_comp.first = 0; remove_buff_from_pool(adapter, rx_buff); - break; + continue; } length = be32_to_cpu(next->rx_comp.len); @@ -1177,6 +1428,7 @@ restart_poll: skb_put(skb, length); skb->protocol = eth_type_trans(skb, netdev); + skb_record_rx_queue(skb, scrq_num); if (flags & IBMVNIC_IP_CHKSUM_GOOD && flags & IBMVNIC_TCP_UDP_CHKSUM_GOOD) { @@ -1557,19 +1809,8 @@ restart_loop: } if (txbuff->last_frag) { - if (atomic_sub_return(next->tx_comp.num_comps, - &scrq->used) <= - (adapter->req_tx_entries_per_subcrq / 2) && - netif_subqueue_stopped(adapter->netdev, - txbuff->skb)) { - netif_wake_subqueue(adapter->netdev, - scrq->pool_index); - netdev_dbg(adapter->netdev, - "Started queue %d\n", - scrq->pool_index); - } - dev_kfree_skb_any(txbuff->skb); + txbuff->skb = NULL; } adapter->tx_pool[pool].free_map[adapter->tx_pool[pool]. @@ -1580,6 +1821,15 @@ restart_loop: } /* remove tx_comp scrq*/ next->tx_comp.first = 0; + + if (atomic_sub_return(next->tx_comp.num_comps, &scrq->used) <= + (adapter->req_tx_entries_per_subcrq / 2) && + __netif_subqueue_stopped(adapter->netdev, + scrq->pool_index)) { + netif_wake_subqueue(adapter->netdev, scrq->pool_index); + netdev_info(adapter->netdev, "Started queue %d\n", + scrq->pool_index); + } } enable_scrq_irq(adapter, scrq); @@ -1853,7 +2103,8 @@ static int pending_scrq(struct ibmvnic_adapter *adapter, { union sub_crq *entry = &scrq->msgs[scrq->cur]; - if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || adapter->closing) + if (entry->generic.first & IBMVNIC_CRQ_CMD_RSP || + adapter->state == VNIC_CLOSING) return 1; else return 0; @@ -1991,18 +2242,6 @@ static int ibmvnic_send_crq_init(struct ibmvnic_adapter *adapter) return ibmvnic_send_crq(adapter, &crq); } -static int ibmvnic_send_crq_init_complete(struct ibmvnic_adapter *adapter) -{ - union ibmvnic_crq crq; - - memset(&crq, 0, sizeof(crq)); - crq.generic.first = IBMVNIC_CRQ_INIT_CMD; - crq.generic.cmd = IBMVNIC_CRQ_INIT_COMPLETE; - netdev_dbg(adapter->netdev, "Sending CRQ init complete\n"); - - return ibmvnic_send_crq(adapter, &crq); -} - static int send_version_xchg(struct ibmvnic_adapter *adapter) { union ibmvnic_crq crq; @@ -2500,6 +2739,9 @@ static void handle_error_indication(union ibmvnic_crq *crq, if (be32_to_cpu(crq->error_indication.error_id)) request_error_information(adapter, crq); + + if (crq->error_indication.flags & IBMVNIC_FATAL_ERROR) + ibmvnic_reset(adapter, VNIC_RESET_FATAL); } static void handle_change_mac_rsp(union ibmvnic_crq *crq, @@ -2888,26 +3130,6 @@ out: } } -static void ibmvnic_xport_event(struct work_struct *work) -{ - struct ibmvnic_adapter *adapter = container_of(work, - struct ibmvnic_adapter, - ibmvnic_xport); - struct device *dev = &adapter->vdev->dev; - long rc; - - release_sub_crqs(adapter); - if (adapter->migrated) { - rc = ibmvnic_reenable_crq_queue(adapter); - if (rc) - dev_err(dev, "Error after enable rc=%ld\n", rc); - adapter->migrated = false; - rc = ibmvnic_send_crq_init(adapter); - if (rc) - dev_err(dev, "Error sending init rc=%ld\n", rc); - } -} - static void ibmvnic_handle_crq(union ibmvnic_crq *crq, struct ibmvnic_adapter *adapter) { @@ -2925,12 +3147,6 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, switch (gen_crq->cmd) { case IBMVNIC_CRQ_INIT: dev_info(dev, "Partner initialized\n"); - /* Send back a response */ - rc = ibmvnic_send_crq_init_complete(adapter); - if (!rc) - schedule_work(&adapter->vnic_crq_init); - else - dev_err(dev, "Can't send initrsp rc=%ld\n", rc); break; case IBMVNIC_CRQ_INIT_COMPLETE: dev_info(dev, "Partner initialization complete\n"); @@ -2941,19 +3157,18 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, } return; case IBMVNIC_CRQ_XPORT_EVENT: + netif_carrier_off(netdev); if (gen_crq->cmd == IBMVNIC_PARTITION_MIGRATED) { - dev_info(dev, "Re-enabling adapter\n"); - adapter->migrated = true; - schedule_work(&adapter->ibmvnic_xport); + dev_info(dev, "Migrated, re-enabling adapter\n"); + ibmvnic_reset(adapter, VNIC_RESET_MOBILITY); } else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) { dev_info(dev, "Backing device failover detected\n"); - netif_carrier_off(netdev); - adapter->failover = true; + ibmvnic_reset(adapter, VNIC_RESET_FAILOVER); } else { /* The adapter lost the connection */ dev_err(dev, "Virtual Adapter failed (rc=%d)\n", gen_crq->cmd); - schedule_work(&adapter->ibmvnic_xport); + ibmvnic_reset(adapter, VNIC_RESET_FATAL); } return; case IBMVNIC_CRQ_CMD_RSP: @@ -3234,64 +3449,6 @@ map_failed: return retrc; } -static void handle_crq_init_rsp(struct work_struct *work) -{ - struct ibmvnic_adapter *adapter = container_of(work, - struct ibmvnic_adapter, - vnic_crq_init); - struct device *dev = &adapter->vdev->dev; - struct net_device *netdev = adapter->netdev; - unsigned long timeout = msecs_to_jiffies(30000); - bool restart = false; - int rc; - - if (adapter->failover) { - release_sub_crqs(adapter); - if (netif_running(netdev)) { - netif_tx_disable(netdev); - ibmvnic_close(netdev); - restart = true; - } - } - - reinit_completion(&adapter->init_done); - send_version_xchg(adapter); - if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { - dev_err(dev, "Passive init timeout\n"); - goto task_failed; - } - - netdev->mtu = adapter->req_mtu - ETH_HLEN; - - if (adapter->failover) { - adapter->failover = false; - if (restart) { - rc = ibmvnic_open(netdev); - if (rc) - goto restart_failed; - } - netif_carrier_on(netdev); - return; - } - - rc = register_netdev(netdev); - if (rc) { - dev_err(dev, - "failed to register netdev rc=%d\n", rc); - goto register_failed; - } - dev_info(dev, "ibmvnic registered\n"); - - return; - -restart_failed: - dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc); -register_failed: - release_sub_crqs(adapter); -task_failed: - dev_err(dev, "Passive initialization was not successful\n"); -} - static int ibmvnic_init(struct ibmvnic_adapter *adapter) { struct device *dev = &adapter->vdev->dev; @@ -3346,10 +3503,10 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) return -ENOMEM; adapter = netdev_priv(netdev); + adapter->state = VNIC_PROBING; dev_set_drvdata(&dev->dev, netdev); adapter->vdev = dev; adapter->netdev = netdev; - adapter->failover = false; ether_addr_copy(adapter->mac_addr, mac_addr_p); ether_addr_copy(netdev->dev_addr, adapter->mac_addr); @@ -3358,14 +3515,17 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) netdev->ethtool_ops = &ibmvnic_ethtool_ops; SET_NETDEV_DEV(netdev, &dev->dev); - INIT_WORK(&adapter->vnic_crq_init, handle_crq_init_rsp); - INIT_WORK(&adapter->ibmvnic_xport, ibmvnic_xport_event); - spin_lock_init(&adapter->stats_lock); INIT_LIST_HEAD(&adapter->errors); spin_lock_init(&adapter->error_list_lock); + INIT_WORK(&adapter->ibmvnic_reset, __ibmvnic_reset); + INIT_LIST_HEAD(&adapter->rwi_list); + mutex_init(&adapter->reset_lock); + mutex_init(&adapter->rwi_lock); + adapter->resetting = false; + rc = ibmvnic_init(adapter); if (rc) { free_netdev(netdev); @@ -3373,7 +3533,6 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } netdev->mtu = adapter->req_mtu - ETH_HLEN; - adapter->is_closed = false; rc = register_netdev(netdev); if (rc) { @@ -3383,6 +3542,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) } dev_info(&dev->dev, "ibmvnic registered\n"); + adapter->state = VNIC_PROBED; return 0; } @@ -3391,12 +3551,17 @@ static int ibmvnic_remove(struct vio_dev *dev) struct net_device *netdev = dev_get_drvdata(&dev->dev); struct ibmvnic_adapter *adapter = netdev_priv(netdev); + adapter->state = VNIC_REMOVING; unregister_netdev(netdev); + mutex_lock(&adapter->reset_lock); release_resources(adapter); release_sub_crqs(adapter); release_crq_queue(adapter); + adapter->state = VNIC_REMOVED; + + mutex_unlock(&adapter->reset_lock); free_netdev(netdev); dev_set_drvdata(&dev->dev, NULL); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index a69979f..4702b48 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -913,6 +913,25 @@ struct ibmvnic_error_buff { __be32 error_id; }; +enum vnic_state {VNIC_PROBING = 1, + VNIC_PROBED, + VNIC_OPENING, + VNIC_OPEN, + VNIC_CLOSING, + VNIC_CLOSED, + VNIC_REMOVING, + VNIC_REMOVED}; + +enum ibmvnic_reset_reason {VNIC_RESET_FAILOVER = 1, + VNIC_RESET_MOBILITY, + VNIC_RESET_FATAL, + VNIC_RESET_TIMEOUT}; + +struct ibmvnic_rwi { + enum ibmvnic_reset_reason reset_reason; + struct list_head list; +}; + struct ibmvnic_adapter { struct vio_dev *vdev; struct net_device *netdev; @@ -922,7 +941,6 @@ struct ibmvnic_adapter { dma_addr_t ip_offload_tok; struct ibmvnic_control_ip_offload_buffer ip_offload_ctrl; dma_addr_t ip_offload_ctrl_tok; - bool migrated; u32 msg_enable; /* Statistics */ @@ -962,7 +980,6 @@ struct ibmvnic_adapter { u64 promisc; struct ibmvnic_tx_pool *tx_pool; - bool closing; struct completion init_done; int init_done_rc; @@ -1007,9 +1024,11 @@ struct ibmvnic_adapter { __be64 tx_rx_desc_req; u8 map_id; - struct work_struct vnic_crq_init; - struct work_struct ibmvnic_xport; struct tasklet_struct tasklet; - bool failover; - bool is_closed; + enum vnic_state state; + enum ibmvnic_reset_reason reset_reason; + struct mutex reset_lock, rwi_lock; + struct list_head rwi_list; + struct work_struct ibmvnic_reset; + bool resetting; }; diff --git a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c index db20376..82bd6b0 100644 --- a/drivers/net/ethernet/netronome/nfp/nfp_net_common.c +++ b/drivers/net/ethernet/netronome/nfp/nfp_net_common.c @@ -2532,11 +2532,11 @@ nfp_net_check_config(struct nfp_net *nn, struct nfp_net_dp *dp, if (!dp->xdp_prog) return 0; if (dp->fl_bufsz > PAGE_SIZE) { - NL_MOD_TRY_SET_ERR_MSG(extack, "MTU too large w/ XDP enabled"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large w/ XDP enabled"); return -EINVAL; } if (dp->num_tx_rings > nn->max_tx_rings) { - NL_MOD_TRY_SET_ERR_MSG(extack, "Insufficient number of TX rings w/ XDP enabled"); + NL_SET_ERR_MSG_MOD(extack, "Insufficient number of TX rings w/ XDP enabled"); return -EINVAL; } diff --git a/drivers/net/ethernet/nvidia/forcedeth.c b/drivers/net/ethernet/nvidia/forcedeth.c index 978d329..aa912f4 100644 --- a/drivers/net/ethernet/nvidia/forcedeth.c +++ b/drivers/net/ethernet/nvidia/forcedeth.c @@ -4248,11 +4248,9 @@ static int nv_get_link_ksettings(struct net_device *dev, /* We do not track link speed / duplex setting if the * interface is disabled. Force a link check */ if (nv_update_linkspeed(dev)) { - if (!netif_carrier_ok(dev)) - netif_carrier_on(dev); + netif_carrier_on(dev); } else { - if (netif_carrier_ok(dev)) - netif_carrier_off(dev); + netif_carrier_off(dev); } } diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev.c b/drivers/net/ethernet/qlogic/qed/qed_dev.c index 5f31140..bb70522 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev.c +++ b/drivers/net/ethernet/qlogic/qed/qed_dev.c @@ -2536,6 +2536,9 @@ static int qed_hw_get_nvm_info(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt) DP_NOTICE(p_hwfn, "Unknown Speed in 0x%08x\n", link_temp); } + p_hwfn->mcp_info->link_capabilities.default_speed_autoneg = + link->speed.autoneg; + link_temp &= NVM_CFG1_PORT_DRV_FLOW_CONTROL_MASK; link_temp >>= NVM_CFG1_PORT_DRV_FLOW_CONTROL_OFFSET; link->pause.autoneg = !!(link_temp & @@ -3586,7 +3589,7 @@ static int qed_set_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id) + u16 coalesce, u16 qid, u16 sb_id) { struct ustorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; @@ -3607,7 +3610,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid); + rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); if (rc) return rc; @@ -3628,7 +3631,7 @@ out: } int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id) + u16 coalesce, u16 qid, u16 sb_id) { struct xstorm_eth_queue_zone eth_qzone; u8 timeset, timer_res; @@ -3649,7 +3652,7 @@ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, } timeset = (u8)(coalesce >> timer_res); - rc = qed_fw_l2_queue(p_hwfn, (u16)qid, &fw_qid); + rc = qed_fw_l2_queue(p_hwfn, qid, &fw_qid); if (rc) return rc; diff --git a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h index cefe3ee..12d16c0 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_dev_api.h +++ b/drivers/net/ethernet/qlogic/qed/qed_dev_api.h @@ -454,7 +454,7 @@ int qed_final_cleanup(struct qed_hwfn *p_hwfn, * @return int */ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id); + u16 coalesce, u16 qid, u16 sb_id); /** * @brief qed_set_txq_coalesce - Configure coalesce parameters for a Tx queue @@ -471,7 +471,7 @@ int qed_set_rxq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, * @return int */ int qed_set_txq_coalesce(struct qed_hwfn *p_hwfn, struct qed_ptt *p_ptt, - u16 coalesce, u8 qid, u16 sb_id); + u16 coalesce, u16 qid, u16 sb_id); const char *qed_hw_get_resc_name(enum qed_resources res_id); #endif diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index 59992cf..b7ad36b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -1372,7 +1372,7 @@ static void qed_fill_link(struct qed_hwfn *hwfn, /* TODO - at the moment assume supported and advertised speed equal */ if_link->supported_caps = QED_LM_FIBRE_BIT; - if (params.speed.autoneg) + if (link_caps.default_speed_autoneg) if_link->supported_caps |= QED_LM_Autoneg_BIT; if (params.pause.autoneg || (params.pause.forced_rx && params.pause.forced_tx)) @@ -1382,6 +1382,10 @@ static void qed_fill_link(struct qed_hwfn *hwfn, if_link->supported_caps |= QED_LM_Pause_BIT; if_link->advertised_caps = if_link->supported_caps; + if (params.speed.autoneg) + if_link->advertised_caps |= QED_LM_Autoneg_BIT; + else + if_link->advertised_caps &= ~QED_LM_Autoneg_BIT; if (params.speed.advertised_speeds & NVM_CFG1_PORT_DRV_SPEED_CAPABILITY_MASK_1G) if_link->advertised_caps |= QED_LM_1000baseT_Half_BIT | @@ -1521,7 +1525,7 @@ static void qed_get_coalesce(struct qed_dev *cdev, u16 *rx_coal, u16 *tx_coal) } static int qed_set_coalesce(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id) + u16 qid, u16 sb_id) { struct qed_hwfn *hwfn; struct qed_ptt *ptt; diff --git a/drivers/net/ethernet/qlogic/qed/qed_mcp.h b/drivers/net/ethernet/qlogic/qed/qed_mcp.h index 5ae35d6..2b09b85 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_mcp.h +++ b/drivers/net/ethernet/qlogic/qed/qed_mcp.h @@ -61,6 +61,7 @@ struct qed_mcp_link_params { struct qed_mcp_link_capabilities { u32 speed_capabilities; + bool default_speed_autoneg; }; struct qed_mcp_link_state { diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 4dcfe96..172b292 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -493,6 +493,11 @@ static int qede_set_link_ksettings(struct net_device *dev, params.override_flags |= QED_LINK_OVERRIDE_SPEED_ADV_SPEEDS; params.override_flags |= QED_LINK_OVERRIDE_SPEED_AUTONEG; if (base->autoneg == AUTONEG_ENABLE) { + if (!(current_link.supported_caps & QED_LM_Autoneg_BIT)) { + DP_INFO(edev, "Auto negotiation is not supported\n"); + return -EOPNOTSUPP; + } + params.autoneg = true; params.forced_speed = 0; QEDE_ETHTOOL_TO_DRV_CAPS(params.adv_speeds, cmd, advertising) @@ -706,8 +711,7 @@ static int qede_set_coalesce(struct net_device *dev, { struct qede_dev *edev = netdev_priv(dev); int i, rc = 0; - u16 rxc, txc; - u8 sb_id; + u16 rxc, txc, sb_id; if (!netif_running(dev)) { DP_INFO(edev, "Interface is down\n"); @@ -729,7 +733,7 @@ static int qede_set_coalesce(struct net_device *dev, for_each_queue(i) { sb_id = edev->fp_array[i].sb_info->igu_sb_id; rc = edev->ops->common->set_coalesce(edev->cdev, rxc, txc, - (u8)i, sb_id); + (u16)i, sb_id); if (rc) { DP_INFO(edev, "Set coalesce error, rc = %d\n", rc); return rc; diff --git a/drivers/net/ethernet/smsc/smsc911x.c b/drivers/net/ethernet/smsc/smsc911x.c index fa5ca09..ea1bbc3 100644 --- a/drivers/net/ethernet/smsc/smsc911x.c +++ b/drivers/net/ethernet/smsc/smsc911x.c @@ -25,7 +25,7 @@ * LAN9215, LAN9216, LAN9217, LAN9218 * LAN9210, LAN9211 * LAN9220, LAN9221 - * LAN89218 + * LAN89218,LAN9250 * */ @@ -1450,6 +1450,8 @@ static int smsc911x_soft_reset(struct smsc911x_data *pdata) unsigned int timeout; unsigned int temp; int ret; + unsigned int reset_offset = HW_CFG; + unsigned int reset_mask = HW_CFG_SRST_; /* * Make sure to power-up the PHY chip before doing a reset, otherwise @@ -1476,15 +1478,23 @@ static int smsc911x_soft_reset(struct smsc911x_data *pdata) } } + if ((pdata->idrev & 0xFFFF0000) == LAN9250) { + /* special reset for LAN9250 */ + reset_offset = RESET_CTL; + reset_mask = RESET_CTL_DIGITAL_RST_; + } + /* Reset the LAN911x */ - smsc911x_reg_write(pdata, HW_CFG, HW_CFG_SRST_); + smsc911x_reg_write(pdata, reset_offset, reset_mask); + + /* verify reset bit is cleared */ timeout = 10; do { udelay(10); - temp = smsc911x_reg_read(pdata, HW_CFG); - } while ((--timeout) && (temp & HW_CFG_SRST_)); + temp = smsc911x_reg_read(pdata, reset_offset); + } while ((--timeout) && (temp & reset_mask)); - if (unlikely(temp & HW_CFG_SRST_)) { + if (unlikely(temp & reset_mask)) { SMSC_WARN(pdata, drv, "Failed to complete reset"); return -EIO; } @@ -2253,28 +2263,29 @@ static int smsc911x_init(struct net_device *dev) pdata->idrev = smsc911x_reg_read(pdata, ID_REV); switch (pdata->idrev & 0xFFFF0000) { - case 0x01180000: - case 0x01170000: - case 0x01160000: - case 0x01150000: - case 0x218A0000: + case LAN9118: + case LAN9117: + case LAN9116: + case LAN9115: + case LAN89218: /* LAN911[5678] family */ pdata->generation = pdata->idrev & 0x0000FFFF; break; - case 0x118A0000: - case 0x117A0000: - case 0x116A0000: - case 0x115A0000: + case LAN9218: + case LAN9217: + case LAN9216: + case LAN9215: /* LAN921[5678] family */ pdata->generation = 3; break; - case 0x92100000: - case 0x92110000: - case 0x92200000: - case 0x92210000: - /* LAN9210/LAN9211/LAN9220/LAN9221 */ + case LAN9210: + case LAN9211: + case LAN9220: + case LAN9221: + case LAN9250: + /* LAN9210/LAN9211/LAN9220/LAN9221/LAN9250 */ pdata->generation = 4; break; diff --git a/drivers/net/ethernet/smsc/smsc911x.h b/drivers/net/ethernet/smsc/smsc911x.h index 54d6489..8d75508 100644 --- a/drivers/net/ethernet/smsc/smsc911x.h +++ b/drivers/net/ethernet/smsc/smsc911x.h @@ -20,6 +20,22 @@ #ifndef __SMSC911X_H__ #define __SMSC911X_H__ +/*Chip ID*/ +#define LAN9115 0x01150000 +#define LAN9116 0x01160000 +#define LAN9117 0x01170000 +#define LAN9118 0x01180000 +#define LAN9215 0x115A0000 +#define LAN9216 0x116A0000 +#define LAN9217 0x117A0000 +#define LAN9218 0x118A0000 +#define LAN9210 0x92100000 +#define LAN9211 0x92110000 +#define LAN9220 0x92200000 +#define LAN9221 0x92210000 +#define LAN9250 0x92500000 +#define LAN89218 0x218A0000 + #define TX_FIFO_LOW_THRESHOLD ((u32)1600) #define SMSC911X_EEPROM_SIZE ((u32)128) #define USE_DEBUG 0 @@ -303,6 +319,9 @@ #define E2P_DATA_EEPROM_DATA_ 0x000000FF #define LAN_REGISTER_EXTENT 0x00000100 +#define RESET_CTL 0x1F8 +#define RESET_CTL_DIGITAL_RST_ 0x00000001 + /* * MAC Control and Status Register (Indirect Address) * Offset (through the MAC_CSR CMD and DATA port) diff --git a/drivers/net/hyperv/netvsc.c b/drivers/net/hyperv/netvsc.c index 15749d3..652453d 100644 --- a/drivers/net/hyperv/netvsc.c +++ b/drivers/net/hyperv/netvsc.c @@ -1322,6 +1322,10 @@ int netvsc_device_add(struct hv_device *device, nvchan->channel = device->channel; } + /* Enable NAPI handler before init callbacks */ + netif_napi_add(ndev, &net_device->chan_table[0].napi, + netvsc_poll, NAPI_POLL_WEIGHT); + /* Open the channel */ ret = vmbus_open(device->channel, ring_size * PAGE_SIZE, ring_size * PAGE_SIZE, NULL, 0, @@ -1329,6 +1333,7 @@ int netvsc_device_add(struct hv_device *device, net_device->chan_table); if (ret != 0) { + netif_napi_del(&net_device->chan_table[0].napi); netdev_err(ndev, "unable to open channel: %d\n", ret); goto cleanup; } @@ -1336,9 +1341,6 @@ int netvsc_device_add(struct hv_device *device, /* Channel is opened */ netdev_dbg(ndev, "hv_netvsc channel opened successfully\n"); - /* Enable NAPI handler for init callbacks */ - netif_napi_add(ndev, &net_device->chan_table[0].napi, - netvsc_poll, NAPI_POLL_WEIGHT); napi_enable(&net_device->chan_table[0].napi); /* Writing nvdev pointer unlocks netvsc_send(), make sure chn_table is diff --git a/drivers/net/hyperv/rndis_filter.c b/drivers/net/hyperv/rndis_filter.c index ab92c3c..f9d5b0b 100644 --- a/drivers/net/hyperv/rndis_filter.c +++ b/drivers/net/hyperv/rndis_filter.c @@ -1018,7 +1018,7 @@ static void netvsc_sc_open(struct vmbus_channel *new_sc) if (ret == 0) napi_enable(&nvchan->napi); else - netdev_err(ndev, "sub channel open failed (%d)\n", ret); + netif_napi_del(&nvchan->napi); if (refcount_dec_and_test(&nvscdev->sc_offered)) complete(&nvscdev->channel_init_wait); diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index a3ed811..d716576 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -1201,6 +1201,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x2357, 0x0201, 4)}, /* TP-LINK HSUPA Modem MA180 */ {QMI_FIXED_INTF(0x2357, 0x9000, 4)}, /* TP-LINK MA260 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1040, 2)}, /* Telit LE922A */ + {QMI_FIXED_INTF(0x1bc7, 0x1100, 3)}, /* Telit ME910 */ {QMI_FIXED_INTF(0x1bc7, 0x1200, 5)}, /* Telit LE920 */ {QMI_QUIRK_SET_DTR(0x1bc7, 0x1201, 2)}, /* Telit LE920, LE920A4 */ {QMI_FIXED_INTF(0x1c9e, 0x9b01, 3)}, /* XS Stick W100-2 from 4G Systems */ diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 3d0bc48..1c6d392 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -1891,17 +1891,17 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_TSO6) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO)) { - NL_SET_ERR_MSG(extack, "can't set XDP while host is implementing LRO, disable LRO first"); + NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO, disable LRO first"); return -EOPNOTSUPP; } if (vi->mergeable_rx_bufs && !vi->any_header_sg) { - NL_SET_ERR_MSG(extack, "XDP expects header/data in single page, any_header_sg required"); + NL_SET_ERR_MSG_MOD(extack, "XDP expects header/data in single page, any_header_sg required"); return -EINVAL; } if (dev->mtu > max_sz) { - NL_SET_ERR_MSG(extack, "MTU too large to enable XDP"); + NL_SET_ERR_MSG_MOD(extack, "MTU too large to enable XDP"); netdev_warn(dev, "XDP requires MTU less than %lu\n", max_sz); return -EINVAL; } @@ -1912,7 +1912,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, /* XDP requires extra queues for XDP_TX */ if (curr_qp + xdp_qp > vi->max_queue_pairs) { - NL_SET_ERR_MSG(extack, "Too few free TX rings available"); + NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available"); netdev_warn(dev, "request %i queues but max is %i\n", curr_qp + xdp_qp, vi->max_queue_pairs); return -ENOMEM; diff --git a/include/linux/netlink.h b/include/linux/netlink.h index c20395e..5fff5ba 100644 --- a/include/linux/netlink.h +++ b/include/linux/netlink.h @@ -86,19 +86,16 @@ struct netlink_ext_ack { * Currently string formatting is not supported (due * to the lack of an output buffer.) */ -#define NL_SET_ERR_MSG(extack, msg) do { \ - static const char _msg[] = (msg); \ - \ - (extack)->_msg = _msg; \ +#define NL_SET_ERR_MSG(extack, msg) do { \ + static const char __msg[] = (msg); \ + struct netlink_ext_ack *__extack = (extack); \ + \ + if (__extack) \ + __extack->_msg = __msg; \ } while (0) -#define NL_MOD_TRY_SET_ERR_MSG(extack, msg) do { \ - static const char _msg[] = KBUILD_MODNAME ": " msg; \ - struct netlink_ext_ack *_extack = (extack); \ - \ - if (_extack) \ - _extack->_msg = _msg; \ -} while (0) +#define NL_SET_ERR_MSG_MOD(extack, msg) \ + NL_SET_ERR_MSG((extack), KBUILD_MODNAME ": " msg) extern void netlink_kernel_release(struct sock *sk); extern int __netlink_change_ngroups(struct sock *sk, unsigned int groups); diff --git a/include/linux/qed/qed_if.h b/include/linux/qed/qed_if.h index 5544d7b..c70ac13 100644 --- a/include/linux/qed/qed_if.h +++ b/include/linux/qed/qed_if.h @@ -635,7 +635,7 @@ struct qed_common_ops { * @return 0 on success, error otherwise. */ int (*set_coalesce)(struct qed_dev *cdev, u16 rx_coal, u16 tx_coal, - u8 qid, u16 sb_id); + u16 qid, u16 sb_id); /** * @brief set_led - Configure LED mode diff --git a/include/net/cfg80211.h b/include/net/cfg80211.h index 6e90f1a..15d6599 100644 --- a/include/net/cfg80211.h +++ b/include/net/cfg80211.h @@ -1013,9 +1013,9 @@ enum rate_info_flags { * @RATE_INFO_BW_160: 160 MHz bandwidth */ enum rate_info_bw { + RATE_INFO_BW_20 = 0, RATE_INFO_BW_5, RATE_INFO_BW_10, - RATE_INFO_BW_20, RATE_INFO_BW_40, RATE_INFO_BW_80, RATE_INFO_BW_160, diff --git a/include/net/ip6_route.h b/include/net/ip6_route.h index 9dc2c18..f5e625f 100644 --- a/include/net/ip6_route.h +++ b/include/net/ip6_route.h @@ -84,6 +84,7 @@ struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6, struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table, int ifindex, struct flowi6 *fl6, int flags); +void ip6_route_init_special_entries(void); int ip6_route_init(void); void ip6_route_cleanup(void); diff --git a/include/uapi/linux/netfilter/nf_conntrack_common.h b/include/uapi/linux/netfilter/nf_conntrack_common.h index a8072cc..dc947e5 100644 --- a/include/uapi/linux/netfilter/nf_conntrack_common.h +++ b/include/uapi/linux/netfilter/nf_conntrack_common.h @@ -84,10 +84,6 @@ enum ip_conntrack_status { IPS_DYING_BIT = 9, IPS_DYING = (1 << IPS_DYING_BIT), - /* Bits that cannot be altered from userland. */ - IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | - IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING), - /* Connection has fixed timeout. */ IPS_FIXED_TIMEOUT_BIT = 10, IPS_FIXED_TIMEOUT = (1 << IPS_FIXED_TIMEOUT_BIT), @@ -103,6 +99,15 @@ enum ip_conntrack_status { /* Conntrack got a helper explicitly attached via CT target. */ IPS_HELPER_BIT = 13, IPS_HELPER = (1 << IPS_HELPER_BIT), + + /* Be careful here, modifying these bits can make things messy, + * so don't let users modify them directly. + */ + IPS_UNCHANGEABLE_MASK = (IPS_NAT_DONE_MASK | IPS_NAT_MASK | + IPS_EXPECTED | IPS_CONFIRMED | IPS_DYING | + IPS_SEQ_ADJUST | IPS_TEMPLATE), + + __IPS_MAX_BIT = 14, }; /* Connection tracking event types */ diff --git a/lib/test_bpf.c b/lib/test_bpf.c index a0f6628..889bc31 100644 --- a/lib/test_bpf.c +++ b/lib/test_bpf.c @@ -4769,8 +4769,8 @@ static struct bpf_test tests[] = { BPF_LD_IMM64(R1, 3), BPF_LD_IMM64(R2, 2), BPF_JMP_REG(BPF_JGE, R1, R2, 2), - BPF_LD_IMM64(R0, 0xffffffffffffffffUL), - BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeUL), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), BPF_EXIT_INSN(), }, INTERNAL, @@ -4784,7 +4784,7 @@ static struct bpf_test tests[] = { BPF_LD_IMM64(R1, 3), BPF_LD_IMM64(R2, 2), BPF_JMP_REG(BPF_JGE, R1, R2, 0), - BPF_LD_IMM64(R0, 0xffffffffffffffffUL), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), BPF_EXIT_INSN(), }, INTERNAL, @@ -4798,8 +4798,8 @@ static struct bpf_test tests[] = { BPF_LD_IMM64(R1, 3), BPF_LD_IMM64(R2, 2), BPF_JMP_REG(BPF_JGE, R1, R2, 4), - BPF_LD_IMM64(R0, 0xffffffffffffffffUL), - BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeUL), + BPF_LD_IMM64(R0, 0xffffffffffffffffULL), + BPF_LD_IMM64(R0, 0xeeeeeeeeeeeeeeeeULL), BPF_EXIT_INSN(), }, INTERNAL, diff --git a/net/bridge/netfilter/ebt_dnat.c b/net/bridge/netfilter/ebt_dnat.c index 4e0b0c3..e0bb624c 100644 --- a/net/bridge/netfilter/ebt_dnat.c +++ b/net/bridge/netfilter/ebt_dnat.c @@ -9,6 +9,7 @@ */ #include <linux/module.h> #include <net/sock.h> +#include "../br_private.h" #include <linux/netfilter.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter_bridge/ebtables.h> @@ -18,11 +19,30 @@ static unsigned int ebt_dnat_tg(struct sk_buff *skb, const struct xt_action_param *par) { const struct ebt_nat_info *info = par->targinfo; + struct net_device *dev; if (!skb_make_writable(skb, 0)) return EBT_DROP; ether_addr_copy(eth_hdr(skb)->h_dest, info->mac); + + if (is_multicast_ether_addr(info->mac)) { + if (is_broadcast_ether_addr(info->mac)) + skb->pkt_type = PACKET_BROADCAST; + else + skb->pkt_type = PACKET_MULTICAST; + } else { + if (xt_hooknum(par) != NF_BR_BROUTING) + dev = br_port_get_rcu(xt_in(par))->br->dev; + else + dev = xt_in(par); + + if (ether_addr_equal(info->mac, dev->dev_addr)) + skb->pkt_type = PACKET_HOST; + else + skb->pkt_type = PACKET_OTHERHOST; + } + return info->target; } diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 6e67315..bcb0f610 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1054,7 +1054,7 @@ static int rtnl_phys_port_name_fill(struct sk_buff *skb, struct net_device *dev) return err; } - if (nla_put(skb, IFLA_PHYS_PORT_NAME, strlen(name), name)) + if (nla_put_string(skb, IFLA_PHYS_PORT_NAME, name)) return -EMSGSIZE; return 0; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 9d94397..bdffad8 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -358,6 +358,9 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct iphdr)) + return -EINVAL; + if (flags&MSG_PROBE) goto out; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 8f6373b..717be4d 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -523,6 +523,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newicsk->icsk_ack.last_seg_size = skb->len - newtp->tcp_header_len; newtp->rx_opt.mss_clamp = req->mss; tcp_ecn_openreq_child(newtp, req); + newtp->fastopen_req = NULL; newtp->fastopen_rsk = NULL; newtp->syn_data_acked = 0; newtp->rack.mstamp.v64 = 0; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index b09ac38..77a4bd5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -3328,7 +3328,8 @@ static int fixup_permanent_addr(struct inet6_dev *idev, idev->dev, 0, 0); } - addrconf_dad_start(ifp); + if (ifp->state == INET6_IFADDR_STATE_PREDAD) + addrconf_dad_start(ifp); return 0; } @@ -3683,7 +3684,7 @@ restart: if (keep) { /* set state to skip the notifier below */ state = INET6_IFADDR_STATE_DEAD; - ifa->state = 0; + ifa->state = INET6_IFADDR_STATE_PREDAD; if (!(ifa->flags & IFA_F_NODAD)) ifa->flags |= IFA_F_TENTATIVE; @@ -6572,6 +6573,8 @@ int __init addrconf_init(void) goto errlo; } + ip6_route_init_special_entries(); + for (i = 0; i < IN6_ADDR_HSIZE; i++) INIT_HLIST_HEAD(&inet6_addr_lst[i]); diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c index bf3ad3e..b2b4f03 100644 --- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c @@ -235,7 +235,7 @@ int nf_nat_icmpv6_reply_translation(struct sk_buff *skb, inside->icmp6.icmp6_cksum = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr, skb->len - hdrlen, IPPROTO_ICMPV6, - csum_partial(&inside->icmp6, + skb_checksum(skb, hdrlen, skb->len - hdrlen, 0)); } diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 0da6a12..1f992d9 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -632,6 +632,8 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, ipv6_local_error(sk, EMSGSIZE, fl6, rt->dst.dev->mtu); return -EMSGSIZE; } + if (length < sizeof(struct ipv6hdr)) + return -EINVAL; if (flags&MSG_PROBE) goto out; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index a1bf426..2f11366 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -4027,6 +4027,21 @@ static struct notifier_block ip6_route_dev_notifier = { .priority = 0, }; +void __init ip6_route_init_special_entries(void) +{ + /* Registering of the loopback is done before this portion of code, + * the loopback reference in rt6_info will not be taken, do it + * manually for init_net */ + init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #ifdef CONFIG_IPV6_MULTIPLE_TABLES + init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; + init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); + #endif +} + int __init ip6_route_init(void) { int ret; @@ -4053,17 +4068,6 @@ int __init ip6_route_init(void) ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; - /* Registering of the loopback is done before this portion of code, - * the loopback reference in rt6_info will not be taken, do it - * manually for init_net */ - init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #ifdef CONFIG_IPV6_MULTIPLE_TABLES - init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev; - init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev); - #endif ret = fib6_init(); if (ret) goto out_register_subsys; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 668d964..1fa3c23 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3078,6 +3078,17 @@ nla_put_failure: return skb->len; } +static bool ip_vs_is_af_valid(int af) +{ + if (af == AF_INET) + return true; +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6 && ipv6_mod_enabled()) + return true; +#endif + return false; +} + static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *usvc, struct nlattr *nla, int full_entry, @@ -3105,11 +3116,7 @@ static int ip_vs_genl_parse_service(struct netns_ipvs *ipvs, memset(usvc, 0, sizeof(*usvc)); usvc->af = nla_get_u16(nla_af); -#ifdef CONFIG_IP_VS_IPV6 - if (usvc->af != AF_INET && usvc->af != AF_INET6) -#else - if (usvc->af != AF_INET) -#endif + if (!ip_vs_is_af_valid(usvc->af)) return -EAFNOSUPPORT; if (nla_fwmark) { @@ -3612,6 +3619,11 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) if (udest.af == 0) udest.af = svc->af; + if (!ip_vs_is_af_valid(udest.af)) { + ret = -EAFNOSUPPORT; + goto out; + } + if (udest.af != svc->af && cmd != IPVS_CMD_DEL_DEST) { /* The synchronization protocol is incompatible * with mixed family services diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index f9245db..3c8f1ed 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -1853,7 +1853,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_set_hashsize); module_param_call(hashsize, nf_conntrack_set_hashsize, param_get_uint, &nf_conntrack_htable_size, 0600); -static unsigned int total_extension_size(void) +static __always_inline unsigned int total_extension_size(void) { /* remember to add new extensions below */ BUILD_BUG_ON(NF_CT_EXT_NUM > 9); diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 4b9dfe3..3a60efa 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -385,7 +385,7 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; unsigned int h = helper_hash(&me->tuple); struct nf_conntrack_helper *cur; - int ret = 0; + int ret = 0, i; BUG_ON(me->expect_policy == NULL); BUG_ON(me->expect_class_max >= NF_CT_MAX_EXPECT_CLASSES); @@ -395,10 +395,26 @@ int nf_conntrack_helper_register(struct nf_conntrack_helper *me) return -EINVAL; mutex_lock(&nf_ct_helper_mutex); - hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, &mask)) { - ret = -EEXIST; - goto out; + for (i = 0; i < nf_ct_helper_hsize; i++) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[i], hnode) { + if (!strcmp(cur->name, me->name) && + (cur->tuple.src.l3num == NFPROTO_UNSPEC || + cur->tuple.src.l3num == me->tuple.src.l3num) && + cur->tuple.dst.protonum == me->tuple.dst.protonum) { + ret = -EEXIST; + goto out; + } + } + } + + /* avoid unpredictable behaviour for auto_assign_helper */ + if (!(me->flags & NF_CT_HELPER_F_USERSPACE)) { + hlist_for_each_entry(cur, &nf_ct_helper_hash[h], hnode) { + if (nf_ct_tuple_src_mask_cmp(&cur->tuple, &me->tuple, + &mask)) { + ret = -EEXIST; + goto out; + } } } hlist_add_head_rcu(&me->hnode, &nf_ct_helper_hash[h]); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5f6f2f3..dcf561b 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -417,8 +417,7 @@ nla_put_failure: return -1; } -static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, - const struct nf_conn *ct) +static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, struct nf_conn *ct) { struct nf_conn_seqadj *seqadj = nfct_seqadj(ct); struct nf_ct_seqadj *seq; @@ -426,15 +425,20 @@ static int ctnetlink_dump_ct_seq_adj(struct sk_buff *skb, if (!(ct->status & IPS_SEQ_ADJUST) || !seqadj) return 0; + spin_lock_bh(&ct->lock); seq = &seqadj->seq[IP_CT_DIR_ORIGINAL]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_ORIG) == -1) - return -1; + goto err; seq = &seqadj->seq[IP_CT_DIR_REPLY]; if (dump_ct_seq_adj(skb, seq, CTA_SEQ_ADJ_REPLY) == -1) - return -1; + goto err; + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return -1; } static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) @@ -1417,6 +1421,24 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, } #endif +static void +__ctnetlink_change_status(struct nf_conn *ct, unsigned long on, + unsigned long off) +{ + unsigned int bit; + + /* Ignore these unchangable bits */ + on &= ~IPS_UNCHANGEABLE_MASK; + off &= ~IPS_UNCHANGEABLE_MASK; + + for (bit = 0; bit < __IPS_MAX_BIT; bit++) { + if (on & (1 << bit)) + set_bit(bit, &ct->status); + else if (off & (1 << bit)) + clear_bit(bit, &ct->status); + } +} + static int ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) { @@ -1436,10 +1458,7 @@ ctnetlink_change_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* ASSURED bit can only be set */ return -EBUSY; - /* Be careful here, modifying NAT bits can screw up things, - * so don't let users modify them directly if they don't pass - * nf_nat_range. */ - ct->status |= status & ~(IPS_NAT_DONE_MASK | IPS_NAT_MASK); + __ctnetlink_change_status(ct, status, 0); return 0; } @@ -1508,23 +1527,11 @@ static int ctnetlink_change_helper(struct nf_conn *ct, return 0; } + rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), nf_ct_protonum(ct)); if (helper == NULL) { -#ifdef CONFIG_MODULES - spin_unlock_bh(&nf_conntrack_expect_lock); - - if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_expect_lock); - return -EOPNOTSUPP; - } - - spin_lock_bh(&nf_conntrack_expect_lock); - helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), - nf_ct_protonum(ct)); - if (helper) - return -EAGAIN; -#endif + rcu_read_unlock(); return -EOPNOTSUPP; } @@ -1533,13 +1540,16 @@ static int ctnetlink_change_helper(struct nf_conn *ct, /* update private helper data if allowed. */ if (helper->from_nlattr) helper->from_nlattr(helpinfo, ct); - return 0; + err = 0; } else - return -EBUSY; + err = -EBUSY; + } else { + /* we cannot set a helper for an existing conntrack */ + err = -EOPNOTSUPP; } - /* we cannot set a helper for an existing conntrack */ - return -EOPNOTSUPP; + rcu_read_unlock(); + return err; } static int ctnetlink_change_timeout(struct nf_conn *ct, @@ -1630,25 +1640,30 @@ ctnetlink_change_seq_adj(struct nf_conn *ct, if (!seqadj) return 0; + spin_lock_bh(&ct->lock); if (cda[CTA_SEQ_ADJ_ORIG]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_ORIGINAL], cda[CTA_SEQ_ADJ_ORIG]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } if (cda[CTA_SEQ_ADJ_REPLY]) { ret = change_seq_adj(&seqadj->seq[IP_CT_DIR_REPLY], cda[CTA_SEQ_ADJ_REPLY]); if (ret < 0) - return ret; + goto err; - ct->status |= IPS_SEQ_ADJUST; + set_bit(IPS_SEQ_ADJUST_BIT, &ct->status); } + spin_unlock_bh(&ct->lock); return 0; +err: + spin_unlock_bh(&ct->lock); + return ret; } static int @@ -1959,9 +1974,7 @@ static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl, err = -EEXIST; ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - spin_lock_bh(&nf_conntrack_expect_lock); err = ctnetlink_change_conntrack(ct, cda); - spin_unlock_bh(&nf_conntrack_expect_lock); if (err == 0) { nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | @@ -2294,10 +2307,10 @@ ctnetlink_update_status(struct nf_conn *ct, const struct nlattr * const cda[]) /* This check is less strict than ctnetlink_change_status() * because callers often flip IPS_EXPECTED bits when sending * an NFQA_CT attribute to the kernel. So ignore the - * unchangeable bits but do not error out. + * unchangeable bits but do not error out. Also user programs + * are allowed to clear the bits that they are allowed to change. */ - ct->status = (status & ~IPS_UNCHANGEABLE_MASK) | - (ct->status & IPS_UNCHANGEABLE_MASK); + __ctnetlink_change_status(ct, status, ~status); return 0; } @@ -2351,11 +2364,7 @@ ctnetlink_glue_parse(const struct nlattr *attr, struct nf_conn *ct) if (ret < 0) return ret; - spin_lock_bh(&nf_conntrack_expect_lock); - ret = ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); - spin_unlock_bh(&nf_conntrack_expect_lock); - - return ret; + return ctnetlink_glue_parse_ct((const struct nlattr **)cda, ct); } static int ctnetlink_glue_exp_parse(const struct nlattr * const *cda, diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 1c6482d..5592250 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -3778,6 +3778,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, err = set->ops->insert(ctx->net, set, &elem, &ext2); if (err) { if (err == -EEXIST) { + if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) || + nft_set_ext_exists(ext, NFT_SET_EXT_OBJREF) ^ + nft_set_ext_exists(ext2, NFT_SET_EXT_OBJREF)) + return -EBUSY; if ((nft_set_ext_exists(ext, NFT_SET_EXT_DATA) && nft_set_ext_exists(ext2, NFT_SET_EXT_DATA) && memcmp(nft_set_ext_data(ext), diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 3948da3..66221ad 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -82,8 +82,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, nft_set_ext_exists(ext, NFT_SET_EXT_EXPIRATION)) { timeout = priv->timeout ? : set->timeout; *nft_set_ext_expiration(ext) = jiffies + timeout; - } else if (sexpr == NULL) - goto out; + } if (sexpr != NULL) sexpr->ops->eval(sexpr, regs, pkt); @@ -92,7 +91,7 @@ static void nft_dynset_eval(const struct nft_expr *expr, regs->verdict.code = NFT_BREAK; return; } -out: + if (!priv->invert) regs->verdict.code = NFT_BREAK; } diff --git a/net/netfilter/nft_set_bitmap.c b/net/netfilter/nft_set_bitmap.c index 8ebbc29..b988162 100644 --- a/net/netfilter/nft_set_bitmap.c +++ b/net/netfilter/nft_set_bitmap.c @@ -257,6 +257,11 @@ static int nft_bitmap_init(const struct nft_set *set, static void nft_bitmap_destroy(const struct nft_set *set) { + struct nft_bitmap *priv = nft_set_priv(set); + struct nft_bitmap_elem *be, *n; + + list_for_each_entry_safe(be, n, &priv->list, head) + nft_set_elem_destroy(set, be, true); } static bool nft_bitmap_estimate(const struct nft_set_desc *desc, u32 features, diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 14857af..f134d38 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -1051,8 +1051,10 @@ struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af, list_for_each_entry(t, &init_net.xt.tables[af], list) { if (strcmp(t->name, name)) continue; - if (!try_module_get(t->me)) + if (!try_module_get(t->me)) { + mutex_unlock(&xt[af].mutex); return NULL; + } mutex_unlock(&xt[af].mutex); if (t->table_init(net) != 0) { diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index 3cbe1bc..bb7ad82 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -168,8 +168,10 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par, goto err_put_timeout; } timeout_ext = nf_ct_timeout_ext_add(ct, timeout, GFP_ATOMIC); - if (timeout_ext == NULL) + if (!timeout_ext) { ret = -ENOMEM; + goto err_put_timeout; + } rcu_read_unlock(); return ret; @@ -201,6 +203,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, struct xt_ct_target_info_v1 *info) { struct nf_conntrack_zone zone; + struct nf_conn_help *help; struct nf_conn *ct; int ret = -EOPNOTSUPP; @@ -249,7 +252,7 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, if (info->timeout[0]) { ret = xt_ct_set_timeout(ct, par, info->timeout); if (ret < 0) - goto err3; + goto err4; } __set_bit(IPS_CONFIRMED_BIT, &ct->status); nf_conntrack_get(&ct->ct_general); @@ -257,6 +260,10 @@ out: info->ct = ct; return 0; +err4: + help = nfct_help(ct); + if (help) + module_put(help->helper->me); err3: nf_ct_tmpl_free(ct); err2: diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 770bbec..e75ef39 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -152,7 +152,7 @@ static int socket_mt_enable_defrag(struct net *net, int family) switch (family) { case NFPROTO_IPV4: return nf_defrag_ipv4_enable(net); -#ifdef XT_SOCKET_HAVE_IPV6 +#if IS_ENABLED(CONFIG_IP6_NF_IPTABLES) case NFPROTO_IPV6: return nf_defrag_ipv6_enable(net); #endif diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 42a9591..bf602e3 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -516,10 +516,38 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone, u16 proto, const struct sk_buff *skb) { struct nf_conntrack_tuple tuple; + struct nf_conntrack_expect *exp; if (!nf_ct_get_tuplepr(skb, skb_network_offset(skb), proto, net, &tuple)) return NULL; - return __nf_ct_expect_find(net, zone, &tuple); + + exp = __nf_ct_expect_find(net, zone, &tuple); + if (exp) { + struct nf_conntrack_tuple_hash *h; + + /* Delete existing conntrack entry, if it clashes with the + * expectation. This can happen since conntrack ALGs do not + * check for clashes between (new) expectations and existing + * conntrack entries. nf_conntrack_in() will check the + * expectations only if a conntrack entry can not be found, + * which can lead to OVS finding the expectation (here) in the + * init direction, but which will not be removed by the + * nf_conntrack_in() call, if a matching conntrack entry is + * found instead. In this case all init direction packets + * would be reported as new related packets, while reply + * direction packets would be reported as un-related + * established packets. + */ + h = nf_conntrack_find_get(net, zone, &tuple); + if (h) { + struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); + + nf_ct_delete(ct, 0, 0); + nf_conntrack_put(&ct->ct_general); + } + } + + return exp; } /* This replicates logic from nf_conntrack_core.c that is not exported. */ diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 2efb36c..dee469f 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -203,8 +203,7 @@ static int mall_change(struct net *net, struct sk_buff *in_skb, *arg = (unsigned long) head; rcu_assign_pointer(tp->root, new); - if (head) - call_rcu(&head->rcu, mall_destroy_rcu); + call_rcu(&head->rcu, mall_destroy_rcu); return 0; err_replace_hw_filter: diff --git a/samples/bpf/bpf_load.c b/samples/bpf/bpf_load.c index 4221dc3..74456b3 100644 --- a/samples/bpf/bpf_load.c +++ b/samples/bpf/bpf_load.c @@ -39,6 +39,9 @@ int event_fd[MAX_PROGS]; int prog_cnt; int prog_array_fd = -1; +struct bpf_map_data map_data[MAX_MAPS]; +int map_data_count = 0; + static int populate_prog_array(const char *event, int prog_fd) { int ind = atoi(event), err; @@ -186,42 +189,45 @@ static int load_and_attach(const char *event, struct bpf_insn *prog, int size) return 0; } -static int load_maps(struct bpf_map_def *maps, int nr_maps, - const char **map_names, fixup_map_cb fixup_map) +static int load_maps(struct bpf_map_data *maps, int nr_maps, + fixup_map_cb fixup_map) { int i; - /* - * Warning: Using "maps" pointing to ELF data_maps->d_buf as - * an array of struct bpf_map_def is a wrong assumption about - * the ELF maps section format. - */ + for (i = 0; i < nr_maps; i++) { - if (fixup_map) - fixup_map(&maps[i], map_names[i], i); - - if (maps[i].type == BPF_MAP_TYPE_ARRAY_OF_MAPS || - maps[i].type == BPF_MAP_TYPE_HASH_OF_MAPS) { - int inner_map_fd = map_fd[maps[i].inner_map_idx]; - - map_fd[i] = bpf_create_map_in_map(maps[i].type, - maps[i].key_size, - inner_map_fd, - maps[i].max_entries, - maps[i].map_flags); + if (fixup_map) { + fixup_map(&maps[i], i); + /* Allow userspace to assign map FD prior to creation */ + if (maps[i].fd != -1) { + map_fd[i] = maps[i].fd; + continue; + } + } + + if (maps[i].def.type == BPF_MAP_TYPE_ARRAY_OF_MAPS || + maps[i].def.type == BPF_MAP_TYPE_HASH_OF_MAPS) { + int inner_map_fd = map_fd[maps[i].def.inner_map_idx]; + + map_fd[i] = bpf_create_map_in_map(maps[i].def.type, + maps[i].def.key_size, + inner_map_fd, + maps[i].def.max_entries, + maps[i].def.map_flags); } else { - map_fd[i] = bpf_create_map(maps[i].type, - maps[i].key_size, - maps[i].value_size, - maps[i].max_entries, - maps[i].map_flags); + map_fd[i] = bpf_create_map(maps[i].def.type, + maps[i].def.key_size, + maps[i].def.value_size, + maps[i].def.max_entries, + maps[i].def.map_flags); } if (map_fd[i] < 0) { printf("failed to create a map: %d %s\n", errno, strerror(errno)); return 1; } + maps[i].fd = map_fd[i]; - if (maps[i].type == BPF_MAP_TYPE_PROG_ARRAY) + if (maps[i].def.type == BPF_MAP_TYPE_PROG_ARRAY) prog_array_fd = map_fd[i]; } return 0; @@ -251,7 +257,8 @@ static int get_sec(Elf *elf, int i, GElf_Ehdr *ehdr, char **shname, } static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, - GElf_Shdr *shdr, struct bpf_insn *insn) + GElf_Shdr *shdr, struct bpf_insn *insn, + struct bpf_map_data *maps, int nr_maps) { int i, nrels; @@ -261,6 +268,8 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, GElf_Sym sym; GElf_Rel rel; unsigned int insn_idx; + bool match = false; + int j, map_idx; gelf_getrel(data, i, &rel); @@ -274,11 +283,21 @@ static int parse_relo_and_apply(Elf_Data *data, Elf_Data *symbols, return 1; } insn[insn_idx].src_reg = BPF_PSEUDO_MAP_FD; - /* - * Warning: Using sizeof(struct bpf_map_def) here is a - * wrong assumption about ELF maps section format - */ - insn[insn_idx].imm = map_fd[sym.st_value / sizeof(struct bpf_map_def)]; + + /* Match FD relocation against recorded map_data[] offset */ + for (map_idx = 0; map_idx < nr_maps; map_idx++) { + if (maps[map_idx].elf_offset == sym.st_value) { + match = true; + break; + } + } + if (match) { + insn[insn_idx].imm = maps[map_idx].fd; + } else { + printf("invalid relo for insn[%d] no map_data match\n", + insn_idx); + return 1; + } } return 0; @@ -297,40 +316,112 @@ static int cmp_symbols(const void *l, const void *r) return 0; } -static int get_sorted_map_names(Elf *elf, Elf_Data *symbols, int maps_shndx, - int strtabidx, char **map_names) +static int load_elf_maps_section(struct bpf_map_data *maps, int maps_shndx, + Elf *elf, Elf_Data *symbols, int strtabidx) { - GElf_Sym map_symbols[MAX_MAPS]; - int i, nr_maps = 0; + int map_sz_elf, map_sz_copy; + bool validate_zero = false; + Elf_Data *data_maps; + int i, nr_maps; + GElf_Sym *sym; + Elf_Scn *scn; + int copy_sz; + + if (maps_shndx < 0) + return -EINVAL; + if (!symbols) + return -EINVAL; + + /* Get data for maps section via elf index */ + scn = elf_getscn(elf, maps_shndx); + if (scn) + data_maps = elf_getdata(scn, NULL); + if (!scn || !data_maps) { + printf("Failed to get Elf_Data from maps section %d\n", + maps_shndx); + return -EINVAL; + } - for (i = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { - assert(nr_maps < MAX_MAPS); - if (!gelf_getsym(symbols, i, &map_symbols[nr_maps])) + /* For each map get corrosponding symbol table entry */ + sym = calloc(MAX_MAPS+1, sizeof(GElf_Sym)); + for (i = 0, nr_maps = 0; i < symbols->d_size / sizeof(GElf_Sym); i++) { + assert(nr_maps < MAX_MAPS+1); + if (!gelf_getsym(symbols, i, &sym[nr_maps])) continue; - if (map_symbols[nr_maps].st_shndx != maps_shndx) + if (sym[nr_maps].st_shndx != maps_shndx) continue; + /* Only increment iif maps section */ nr_maps++; } - qsort(map_symbols, nr_maps, sizeof(GElf_Sym), cmp_symbols); + /* Align to map_fd[] order, via sort on offset in sym.st_value */ + qsort(sym, nr_maps, sizeof(GElf_Sym), cmp_symbols); + + /* Keeping compatible with ELF maps section changes + * ------------------------------------------------ + * The program size of struct bpf_map_def is known by loader + * code, but struct stored in ELF file can be different. + * + * Unfortunately sym[i].st_size is zero. To calculate the + * struct size stored in the ELF file, assume all struct have + * the same size, and simply divide with number of map + * symbols. + */ + map_sz_elf = data_maps->d_size / nr_maps; + map_sz_copy = sizeof(struct bpf_map_def); + if (map_sz_elf < map_sz_copy) { + /* + * Backward compat, loading older ELF file with + * smaller struct, keeping remaining bytes zero. + */ + map_sz_copy = map_sz_elf; + } else if (map_sz_elf > map_sz_copy) { + /* + * Forward compat, loading newer ELF file with larger + * struct with unknown features. Assume zero means + * feature not used. Thus, validate rest of struct + * data is zero. + */ + validate_zero = true; + } + /* Memcpy relevant part of ELF maps data to loader maps */ for (i = 0; i < nr_maps; i++) { - char *map_name; - - map_name = elf_strptr(elf, strtabidx, map_symbols[i].st_name); - if (!map_name) { - printf("cannot get map symbol\n"); - return -1; - } - - map_names[i] = strdup(map_name); - if (!map_names[i]) { + unsigned char *addr, *end; + struct bpf_map_def *def; + const char *map_name; + size_t offset; + + map_name = elf_strptr(elf, strtabidx, sym[i].st_name); + maps[i].name = strdup(map_name); + if (!maps[i].name) { printf("strdup(%s): %s(%d)\n", map_name, strerror(errno), errno); - return -1; + free(sym); + return -errno; + } + + /* Symbol value is offset into ELF maps section data area */ + offset = sym[i].st_value; + def = (struct bpf_map_def *)(data_maps->d_buf + offset); + maps[i].elf_offset = offset; + memset(&maps[i].def, 0, sizeof(struct bpf_map_def)); + memcpy(&maps[i].def, def, map_sz_copy); + + /* Verify no newer features were requested */ + if (validate_zero) { + addr = (unsigned char*) def + map_sz_copy; + end = (unsigned char*) def + map_sz_elf; + for (; addr < end; addr++) { + if (*addr != 0) { + free(sym); + return -EFBIG; + } + } } } + free(sym); return nr_maps; } @@ -341,7 +432,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) GElf_Ehdr ehdr; GElf_Shdr shdr, shdr_prog; Elf_Data *data, *data_prog, *data_maps = NULL, *symbols = NULL; - char *shname, *shname_prog, *map_names[MAX_MAPS] = { NULL }; + char *shname, *shname_prog; + int nr_maps = 0; /* reset global variables */ kern_version = 0; @@ -389,8 +481,12 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) } memcpy(&kern_version, data->d_buf, sizeof(int)); } else if (strcmp(shname, "maps") == 0) { + int j; + maps_shndx = i; data_maps = data; + for (j = 0; j < MAX_MAPS; j++) + map_data[j].fd = -1; } else if (shdr.sh_type == SHT_SYMTAB) { strtabidx = shdr.sh_link; symbols = data; @@ -405,27 +501,17 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) } if (data_maps) { - int nr_maps; - int prog_elf_map_sz; - - nr_maps = get_sorted_map_names(elf, symbols, maps_shndx, - strtabidx, map_names); - if (nr_maps < 0) - goto done; - - /* Deduce map struct size stored in ELF maps section */ - prog_elf_map_sz = data_maps->d_size / nr_maps; - if (prog_elf_map_sz != sizeof(struct bpf_map_def)) { - printf("Error: ELF maps sec wrong size (%d/%lu)," - " old kern.o file?\n", - prog_elf_map_sz, sizeof(struct bpf_map_def)); + nr_maps = load_elf_maps_section(map_data, maps_shndx, + elf, symbols, strtabidx); + if (nr_maps < 0) { + printf("Error: Failed loading ELF maps (errno:%d):%s\n", + nr_maps, strerror(-nr_maps)); ret = 1; goto done; } - - if (load_maps(data_maps->d_buf, nr_maps, - (const char **)map_names, fixup_map)) + if (load_maps(map_data, nr_maps, fixup_map)) goto done; + map_data_count = nr_maps; processed_sec[maps_shndx] = true; } @@ -453,7 +539,8 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) processed_sec[shdr.sh_info] = true; processed_sec[i] = true; - if (parse_relo_and_apply(data, symbols, &shdr, insns)) + if (parse_relo_and_apply(data, symbols, &shdr, insns, + map_data, nr_maps)) continue; if (memcmp(shname_prog, "kprobe/", 7) == 0 || @@ -488,8 +575,6 @@ static int do_load_bpf_file(const char *path, fixup_map_cb fixup_map) ret = 0; done: - for (i = 0; i < MAX_MAPS; i++) - free(map_names[i]); close(fd); return ret; } diff --git a/samples/bpf/bpf_load.h b/samples/bpf/bpf_load.h index 05822f8..ca0563d 100644 --- a/samples/bpf/bpf_load.h +++ b/samples/bpf/bpf_load.h @@ -15,15 +15,27 @@ struct bpf_map_def { unsigned int inner_map_idx; }; -typedef void (*fixup_map_cb)(struct bpf_map_def *map, const char *map_name, - int idx); +struct bpf_map_data { + int fd; + char *name; + size_t elf_offset; + struct bpf_map_def def; +}; + +typedef void (*fixup_map_cb)(struct bpf_map_data *map, int idx); -extern int map_fd[MAX_MAPS]; extern int prog_fd[MAX_PROGS]; extern int event_fd[MAX_PROGS]; extern char bpf_log_buf[BPF_LOG_BUF_SIZE]; extern int prog_cnt; +/* There is a one-to-one mapping between map_fd[] and map_data[]. + * The map_data[] just contains more rich info on the given map. + */ +extern int map_fd[MAX_MAPS]; +extern struct bpf_map_data map_data[MAX_MAPS]; +extern int map_data_count; + /* parses elf file compiled by llvm .c->.o * . parses 'maps' section and creates maps via BPF syscall * . parses 'license' section and passes it to syscall diff --git a/samples/bpf/map_perf_test_user.c b/samples/bpf/map_perf_test_user.c index 6ac7781..1a8894b 100644 --- a/samples/bpf/map_perf_test_user.c +++ b/samples/bpf/map_perf_test_user.c @@ -320,21 +320,21 @@ static void fill_lpm_trie(void) assert(!r); } -static void fixup_map(struct bpf_map_def *map, const char *name, int idx) +static void fixup_map(struct bpf_map_data *map, int idx) { int i; - if (!strcmp("inner_lru_hash_map", name)) { + if (!strcmp("inner_lru_hash_map", map->name)) { inner_lru_hash_idx = idx; - inner_lru_hash_size = map->max_entries; + inner_lru_hash_size = map->def.max_entries; } - if (!strcmp("array_of_lru_hashs", name)) { + if (!strcmp("array_of_lru_hashs", map->name)) { if (inner_lru_hash_idx == -1) { printf("inner_lru_hash_map must be defined before array_of_lru_hashs\n"); exit(1); } - map->inner_map_idx = inner_lru_hash_idx; + map->def.inner_map_idx = inner_lru_hash_idx; array_of_lru_hashs_idx = idx; } @@ -345,9 +345,9 @@ static void fixup_map(struct bpf_map_def *map, const char *name, int idx) /* Only change the max_entries for the enabled test(s) */ for (i = 0; i < NR_TESTS; i++) { - if (!strcmp(test_map_names[i], name) && + if (!strcmp(test_map_names[i], map->name) && (check_test_flags(i))) { - map->max_entries = num_map_entries; + map->def.max_entries = num_map_entries; } } } diff --git a/samples/bpf/tracex2_user.c b/samples/bpf/tracex2_user.c index ded9804..7fee0f1 100644 --- a/samples/bpf/tracex2_user.c +++ b/samples/bpf/tracex2_user.c @@ -4,6 +4,7 @@ #include <signal.h> #include <linux/bpf.h> #include <string.h> +#include <sys/resource.h> #include "libbpf.h" #include "bpf_load.h" @@ -112,6 +113,7 @@ static void int_exit(int sig) int main(int ac, char **argv) { + struct rlimit r = {1024*1024, RLIM_INFINITY}; char filename[256]; long key, next_key, value; FILE *f; @@ -119,6 +121,11 @@ int main(int ac, char **argv) snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + signal(SIGINT, int_exit); /* start 'ping' in the background to have some kfree_skb events */ diff --git a/samples/bpf/tracex3_user.c b/samples/bpf/tracex3_user.c index 8f7d199..fe37223 100644 --- a/samples/bpf/tracex3_user.c +++ b/samples/bpf/tracex3_user.c @@ -11,6 +11,7 @@ #include <stdbool.h> #include <string.h> #include <linux/bpf.h> +#include <sys/resource.h> #include "libbpf.h" #include "bpf_load.h" @@ -112,11 +113,17 @@ static void print_hist(int fd) int main(int ac, char **argv) { + struct rlimit r = {1024*1024, RLIM_INFINITY}; char filename[256]; int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK)"); + return 1; + } + if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; diff --git a/samples/bpf/tracex4_user.c b/samples/bpf/tracex4_user.c index 03449f77..22c644f 100644 --- a/samples/bpf/tracex4_user.c +++ b/samples/bpf/tracex4_user.c @@ -12,6 +12,8 @@ #include <string.h> #include <time.h> #include <linux/bpf.h> +#include <sys/resource.h> + #include "libbpf.h" #include "bpf_load.h" @@ -50,11 +52,17 @@ static void print_old_objects(int fd) int main(int ac, char **argv) { + struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY}; char filename[256]; int i; snprintf(filename, sizeof(filename), "%s_kern.o", argv[0]); + if (setrlimit(RLIMIT_MEMLOCK, &r)) { + perror("setrlimit(RLIMIT_MEMLOCK, RLIM_INFINITY)"); + return 1; + } + if (load_bpf_file(filename)) { printf("%s", bpf_log_buf); return 1; diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index d8d94b9..91edd05 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -13,7 +13,7 @@ LDLIBS += -lcap -lelf TEST_GEN_PROGS = test_verifier test_tag test_maps test_lru_map test_lpm_map test_progs -TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o +TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o TEST_PROGS := test_kmod.sh @@ -34,6 +34,6 @@ $(BPFOBJ): force CLANG ?= clang %.o: %.c - $(CLANG) -I../../../include/uapi -I../../../../samples/bpf/ \ - -D__x86_64__ -Wno-compare-distinct-pointer-types \ + $(CLANG) -I. -I../../../include/uapi -I../../../../samples/bpf/ \ + -Wno-compare-distinct-pointer-types \ -O2 -target bpf -c $< -o $@ diff --git a/tools/testing/selftests/bpf/gnu/stubs.h b/tools/testing/selftests/bpf/gnu/stubs.h new file mode 100644 index 0000000..719225b --- /dev/null +++ b/tools/testing/selftests/bpf/gnu/stubs.h @@ -0,0 +1 @@ +/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ diff --git a/tools/testing/selftests/bpf/test_progs.c b/tools/testing/selftests/bpf/test_progs.c index 4ed049a..b59f5ed 100644 --- a/tools/testing/selftests/bpf/test_progs.c +++ b/tools/testing/selftests/bpf/test_progs.c @@ -268,6 +268,21 @@ out: bpf_object__close(obj); } +static void test_tcp_estats(void) +{ + const char *file = "./test_tcp_estats.o"; + int err, prog_fd; + struct bpf_object *obj; + __u32 duration = 0; + + err = bpf_prog_load(file, BPF_PROG_TYPE_TRACEPOINT, &obj, &prog_fd); + CHECK(err, "", "err %d errno %d\n", err, errno); + if (err) + return; + + bpf_object__close(obj); +} + int main(void) { struct rlimit rinf = { RLIM_INFINITY, RLIM_INFINITY }; @@ -277,6 +292,7 @@ int main(void) test_pkt_access(); test_xdp(); test_l4lb(); + test_tcp_estats(); printf("Summary: %d PASSED, %d FAILED\n", pass_cnt, error_cnt); return 0; diff --git a/tools/testing/selftests/bpf/test_tcp_estats.c b/tools/testing/selftests/bpf/test_tcp_estats.c new file mode 100644 index 0000000..bee3bbe --- /dev/null +++ b/tools/testing/selftests/bpf/test_tcp_estats.c @@ -0,0 +1,258 @@ +/* Copyright (c) 2017 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ + +/* This program shows clang/llvm is able to generate code pattern + * like: + * _tcp_send_active_reset: + * 0: bf 16 00 00 00 00 00 00 r6 = r1 + * ...... + * 335: b7 01 00 00 0f 00 00 00 r1 = 15 + * 336: 05 00 48 00 00 00 00 00 goto 72 + * + * LBB0_3: + * 337: b7 01 00 00 01 00 00 00 r1 = 1 + * 338: 63 1a d0 ff 00 00 00 00 *(u32 *)(r10 - 48) = r1 + * 408: b7 01 00 00 03 00 00 00 r1 = 3 + * + * LBB0_4: + * 409: 71 a2 fe ff 00 00 00 00 r2 = *(u8 *)(r10 - 2) + * 410: bf a7 00 00 00 00 00 00 r7 = r10 + * 411: 07 07 00 00 b8 ff ff ff r7 += -72 + * 412: bf 73 00 00 00 00 00 00 r3 = r7 + * 413: 0f 13 00 00 00 00 00 00 r3 += r1 + * 414: 73 23 2d 00 00 00 00 00 *(u8 *)(r3 + 45) = r2 + * + * From the above code snippet, the code generated by the compiler + * is reasonable. The "r1" is assigned to different values in basic + * blocks "_tcp_send_active_reset" and "LBB0_3", and used in "LBB0_4". + * The verifier should be able to handle such code patterns. + */ +#include <string.h> +#include <linux/bpf.h> +#include <linux/ipv6.h> +#include <linux/version.h> +#include <sys/socket.h> +#include "bpf_helpers.h" + +#define _(P) ({typeof(P) val = 0; bpf_probe_read(&val, sizeof(val), &P); val;}) +#define TCP_ESTATS_MAGIC 0xBAADBEEF + +/* This test case needs "sock" and "pt_regs" data structure. + * Recursively, "sock" needs "sock_common" and "inet_sock". + * However, this is a unit test case only for + * verifier purpose without bpf program execution. + * We can safely mock much simpler data structures, basically + * only taking the necessary fields from kernel headers. + */ +typedef __u32 __bitwise __portpair; +typedef __u64 __bitwise __addrpair; + +struct sock_common { + unsigned short skc_family; + union { + __addrpair skc_addrpair; + struct { + __be32 skc_daddr; + __be32 skc_rcv_saddr; + }; + }; + union { + __portpair skc_portpair; + struct { + __be16 skc_dport; + __u16 skc_num; + }; + }; + struct in6_addr skc_v6_daddr; + struct in6_addr skc_v6_rcv_saddr; +}; + +struct sock { + struct sock_common __sk_common; +#define sk_family __sk_common.skc_family +#define sk_v6_daddr __sk_common.skc_v6_daddr +#define sk_v6_rcv_saddr __sk_common.skc_v6_rcv_saddr +}; + +struct inet_sock { + struct sock sk; +#define inet_daddr sk.__sk_common.skc_daddr +#define inet_dport sk.__sk_common.skc_dport + __be32 inet_saddr; + __be16 inet_sport; +}; + +struct pt_regs { + long di; +}; + +static inline struct inet_sock *inet_sk(const struct sock *sk) +{ + return (struct inet_sock *)sk; +} + +/* Define various data structures for state recording. + * Some fields are not used due to test simplification. + */ +enum tcp_estats_addrtype { + TCP_ESTATS_ADDRTYPE_IPV4 = 1, + TCP_ESTATS_ADDRTYPE_IPV6 = 2 +}; + +enum tcp_estats_event_type { + TCP_ESTATS_ESTABLISH, + TCP_ESTATS_PERIODIC, + TCP_ESTATS_TIMEOUT, + TCP_ESTATS_RETRANSMIT_TIMEOUT, + TCP_ESTATS_RETRANSMIT_OTHER, + TCP_ESTATS_SYN_RETRANSMIT, + TCP_ESTATS_SYNACK_RETRANSMIT, + TCP_ESTATS_TERM, + TCP_ESTATS_TX_RESET, + TCP_ESTATS_RX_RESET, + TCP_ESTATS_WRITE_TIMEOUT, + TCP_ESTATS_CONN_TIMEOUT, + TCP_ESTATS_ACK_LATENCY, + TCP_ESTATS_NEVENTS, +}; + +struct tcp_estats_event { + int pid; + int cpu; + unsigned long ts; + unsigned int magic; + enum tcp_estats_event_type event_type; +}; + +/* The below data structure is packed in order for + * llvm compiler to generate expected code. + */ +struct tcp_estats_conn_id { + unsigned int localaddressType; + struct { + unsigned char data[16]; + } localaddress; + struct { + unsigned char data[16]; + } remaddress; + unsigned short localport; + unsigned short remport; +} __attribute__((__packed__)); + +struct tcp_estats_basic_event { + struct tcp_estats_event event; + struct tcp_estats_conn_id conn_id; +}; + +struct bpf_map_def SEC("maps") ev_record_map = { + .type = BPF_MAP_TYPE_HASH, + .key_size = sizeof(__u32), + .value_size = sizeof(struct tcp_estats_basic_event), + .max_entries = 1024, +}; + +struct dummy_tracepoint_args { + unsigned long long pad; + struct sock *sock; +}; + +static __always_inline void tcp_estats_ev_init(struct tcp_estats_event *event, + enum tcp_estats_event_type type) +{ + event->magic = TCP_ESTATS_MAGIC; + event->ts = bpf_ktime_get_ns(); + event->event_type = type; +} + +static __always_inline void unaligned_u32_set(unsigned char *to, __u8 *from) +{ + to[0] = _(from[0]); + to[1] = _(from[1]); + to[2] = _(from[2]); + to[3] = _(from[3]); +} + +static __always_inline void conn_id_ipv4_init(struct tcp_estats_conn_id *conn_id, + __be32 *saddr, __be32 *daddr) +{ + conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV4; + + unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr); + unaligned_u32_set(conn_id->remaddress.data, (__u8 *)daddr); +} + +static __always_inline void conn_id_ipv6_init(struct tcp_estats_conn_id *conn_id, + __be32 *saddr, __be32 *daddr) +{ + conn_id->localaddressType = TCP_ESTATS_ADDRTYPE_IPV6; + + unaligned_u32_set(conn_id->localaddress.data, (__u8 *)saddr); + unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32), + (__u8 *)(saddr + 1)); + unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 2, + (__u8 *)(saddr + 2)); + unaligned_u32_set(conn_id->localaddress.data + sizeof(__u32) * 3, + (__u8 *)(saddr + 3)); + + unaligned_u32_set(conn_id->remaddress.data, + (__u8 *)(daddr)); + unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32), + (__u8 *)(daddr + 1)); + unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 2, + (__u8 *)(daddr + 2)); + unaligned_u32_set(conn_id->remaddress.data + sizeof(__u32) * 3, + (__u8 *)(daddr + 3)); +} + +static __always_inline void tcp_estats_conn_id_init(struct tcp_estats_conn_id *conn_id, + struct sock *sk) +{ + conn_id->localport = _(inet_sk(sk)->inet_sport); + conn_id->remport = _(inet_sk(sk)->inet_dport); + + if (_(sk->sk_family) == AF_INET6) + conn_id_ipv6_init(conn_id, + sk->sk_v6_rcv_saddr.s6_addr32, + sk->sk_v6_daddr.s6_addr32); + else + conn_id_ipv4_init(conn_id, + &inet_sk(sk)->inet_saddr, + &inet_sk(sk)->inet_daddr); +} + +static __always_inline void tcp_estats_init(struct sock *sk, + struct tcp_estats_event *event, + struct tcp_estats_conn_id *conn_id, + enum tcp_estats_event_type type) +{ + tcp_estats_ev_init(event, type); + tcp_estats_conn_id_init(conn_id, sk); +} + +static __always_inline void send_basic_event(struct sock *sk, + enum tcp_estats_event_type type) +{ + struct tcp_estats_basic_event ev; + __u32 key = bpf_get_prandom_u32(); + + memset(&ev, 0, sizeof(ev)); + tcp_estats_init(sk, &ev.event, &ev.conn_id, type); + bpf_map_update_elem(&ev_record_map, &key, &ev, BPF_ANY); +} + +SEC("dummy_tracepoint") +int _dummy_tracepoint(struct dummy_tracepoint_args *arg) +{ + if (!arg->sock) + return 0; + + send_basic_event(arg->sock, TCP_ESTATS_TX_RESET); + return 0; +} + +char _license[] SEC("license") = "GPL"; +__u32 _version SEC("version") = 1; /* ignored by tracepoints, required by libbpf.a */ |