diff options
120 files changed, 1358 insertions, 465 deletions
diff --git a/Documentation/networking/dsa/dsa.txt b/Documentation/networking/dsa/dsa.txt index 6d6c07c..63912ef3 100644 --- a/Documentation/networking/dsa/dsa.txt +++ b/Documentation/networking/dsa/dsa.txt @@ -67,13 +67,14 @@ Note that DSA does not currently create network interfaces for the "cpu" and Switch tagging protocols ------------------------ -DSA currently supports 4 different tagging protocols, and a tag-less mode as +DSA currently supports 5 different tagging protocols, and a tag-less mode as well. The different protocols are implemented in: net/dsa/tag_trailer.c: Marvell's 4 trailer tag mode (legacy) net/dsa/tag_dsa.c: Marvell's original DSA tag net/dsa/tag_edsa.c: Marvell's enhanced DSA tag net/dsa/tag_brcm.c: Broadcom's 4 bytes tag +net/dsa/tag_qca.c: Qualcomm's 2 bytes tag The exact format of the tag protocol is vendor specific, but in general, they all contain something which: diff --git a/MAINTAINERS b/MAINTAINERS index 851b89b..2a58eea 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -8057,6 +8057,7 @@ F: drivers/infiniband/hw/mlx4/ F: include/linux/mlx4/ MELLANOX MLX5 core VPI driver +M: Saeed Mahameed <saeedm@mellanox.com> M: Matan Barak <matanb@mellanox.com> M: Leon Romanovsky <leonro@mellanox.com> L: netdev@vger.kernel.org diff --git a/drivers/net/can/sja1000/plx_pci.c b/drivers/net/can/sja1000/plx_pci.c index 3eb7430..f8ff25c 100644 --- a/drivers/net/can/sja1000/plx_pci.c +++ b/drivers/net/can/sja1000/plx_pci.c @@ -142,6 +142,9 @@ struct plx_pci_card { #define CTI_PCI_VENDOR_ID 0x12c4 #define CTI_PCI_DEVICE_ID_CRG001 0x0900 +#define MOXA_PCI_VENDOR_ID 0x1393 +#define MOXA_PCI_DEVICE_ID 0x0100 + static void plx_pci_reset_common(struct pci_dev *pdev); static void plx9056_pci_reset_common(struct pci_dev *pdev); static void plx_pci_reset_marathon_pci(struct pci_dev *pdev); @@ -258,6 +261,14 @@ static struct plx_pci_card_info plx_pci_card_info_elcus = { /* based on PLX9030 */ }; +static struct plx_pci_card_info plx_pci_card_info_moxa = { + "MOXA", 2, + PLX_PCI_CAN_CLOCK, PLX_PCI_OCR, PLX_PCI_CDR, + {0, 0x00, 0x00}, { {0, 0x00, 0x80}, {1, 0x00, 0x80} }, + &plx_pci_reset_common + /* based on PLX9052 */ +}; + static const struct pci_device_id plx_pci_tbl[] = { { /* Adlink PCI-7841/cPCI-7841 */ @@ -357,6 +368,13 @@ static const struct pci_device_id plx_pci_tbl[] = { 0, 0, (kernel_ulong_t)&plx_pci_card_info_elcus }, + { + /* moxa */ + MOXA_PCI_VENDOR_ID, MOXA_PCI_DEVICE_ID, + PCI_ANY_ID, PCI_ANY_ID, + 0, 0, + (kernel_ulong_t)&plx_pci_card_info_moxa + }, { 0,} }; MODULE_DEVICE_TABLE(pci, plx_pci_tbl); diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c index c481f10..5390ae8 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c @@ -204,17 +204,6 @@ static u32 xgene_enet_ring_len(struct xgene_enet_desc_ring *ring) return num_msgs; } -static void xgene_enet_setup_coalescing(struct xgene_enet_desc_ring *ring) -{ - u32 data = 0x7777; - - xgene_enet_ring_wr32(ring, CSR_PBM_COAL, 0x8e); - xgene_enet_ring_wr32(ring, CSR_PBM_CTICK1, data); - xgene_enet_ring_wr32(ring, CSR_PBM_CTICK2, data << 16); - xgene_enet_ring_wr32(ring, CSR_THRESHOLD0_SET1, 0x40); - xgene_enet_ring_wr32(ring, CSR_THRESHOLD1_SET1, 0x80); -} - void xgene_enet_parse_error(struct xgene_enet_desc_ring *ring, struct xgene_enet_pdata *pdata, enum xgene_enet_err_code status) @@ -929,5 +918,4 @@ struct xgene_ring_ops xgene_ring1_ops = { .clear = xgene_enet_clear_ring, .wr_cmd = xgene_enet_wr_cmd, .len = xgene_enet_ring_len, - .coalesce = xgene_enet_setup_coalescing, }; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h index 8456337..06e598c 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.h @@ -55,8 +55,10 @@ enum xgene_enet_rm { #define PREFETCH_BUF_EN BIT(21) #define CSR_RING_ID_BUF 0x000c #define CSR_PBM_COAL 0x0014 +#define CSR_PBM_CTICK0 0x0018 #define CSR_PBM_CTICK1 0x001c #define CSR_PBM_CTICK2 0x0020 +#define CSR_PBM_CTICK3 0x0024 #define CSR_THRESHOLD0_SET1 0x0030 #define CSR_THRESHOLD1_SET1 0x0034 #define CSR_RING_NE_INT_MODE 0x017c diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c index 429f18f..8158d46 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_main.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_main.c @@ -1188,7 +1188,8 @@ static int xgene_enet_create_desc_rings(struct net_device *ndev) tx_ring->dst_ring_num = xgene_enet_dst_ring_num(cp_ring); } - pdata->ring_ops->coalesce(pdata->tx_ring[0]); + if (pdata->ring_ops->coalesce) + pdata->ring_ops->coalesce(pdata->tx_ring[0]); pdata->tx_qcnt_hi = pdata->tx_ring[0]->slots - 128; return 0; diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ring2.c b/drivers/net/ethernet/apm/xgene/xgene_enet_ring2.c index 2b76732..af51dd5 100644 --- a/drivers/net/ethernet/apm/xgene/xgene_enet_ring2.c +++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ring2.c @@ -30,7 +30,7 @@ static void xgene_enet_ring_init(struct xgene_enet_desc_ring *ring) ring_cfg[0] |= SET_VAL(X2_INTLINE, ring->id & RING_BUFNUM_MASK); ring_cfg[3] |= SET_BIT(X2_DEQINTEN); } - ring_cfg[0] |= SET_VAL(X2_CFGCRID, 1); + ring_cfg[0] |= SET_VAL(X2_CFGCRID, 2); addr >>= 8; ring_cfg[2] |= QCOHERENT | SET_VAL(RINGADDRL, addr); @@ -192,13 +192,15 @@ static u32 xgene_enet_ring_len(struct xgene_enet_desc_ring *ring) static void xgene_enet_setup_coalescing(struct xgene_enet_desc_ring *ring) { - u32 data = 0x7777; + u32 data = 0x77777777; xgene_enet_ring_wr32(ring, CSR_PBM_COAL, 0x8e); + xgene_enet_ring_wr32(ring, CSR_PBM_CTICK0, data); xgene_enet_ring_wr32(ring, CSR_PBM_CTICK1, data); - xgene_enet_ring_wr32(ring, CSR_PBM_CTICK2, data << 16); - xgene_enet_ring_wr32(ring, CSR_THRESHOLD0_SET1, 0x40); - xgene_enet_ring_wr32(ring, CSR_THRESHOLD1_SET1, 0x80); + xgene_enet_ring_wr32(ring, CSR_PBM_CTICK2, data); + xgene_enet_ring_wr32(ring, CSR_PBM_CTICK3, data); + xgene_enet_ring_wr32(ring, CSR_THRESHOLD0_SET1, 0x08); + xgene_enet_ring_wr32(ring, CSR_THRESHOLD1_SET1, 0x10); } struct xgene_ring_ops xgene_ring2_ops = { diff --git a/drivers/net/ethernet/broadcom/bgmac.c b/drivers/net/ethernet/broadcom/bgmac.c index 31ca204..49f4cafe 100644 --- a/drivers/net/ethernet/broadcom/bgmac.c +++ b/drivers/net/ethernet/broadcom/bgmac.c @@ -307,6 +307,10 @@ static void bgmac_dma_rx_enable(struct bgmac *bgmac, u32 ctl; ctl = bgmac_read(bgmac, ring->mmio_base + BGMAC_DMA_RX_CTL); + + /* preserve ONLY bits 16-17 from current hardware value */ + ctl &= BGMAC_DMA_RX_ADDREXT_MASK; + if (bgmac->feature_flags & BGMAC_FEAT_RX_MASK_SETUP) { ctl &= ~BGMAC_DMA_RX_BL_MASK; ctl |= BGMAC_DMA_RX_BL_128 << BGMAC_DMA_RX_BL_SHIFT; @@ -317,7 +321,6 @@ static void bgmac_dma_rx_enable(struct bgmac *bgmac, ctl &= ~BGMAC_DMA_RX_PT_MASK; ctl |= BGMAC_DMA_RX_PT_1 << BGMAC_DMA_RX_PT_SHIFT; } - ctl &= BGMAC_DMA_RX_ADDREXT_MASK; ctl |= BGMAC_DMA_RX_ENABLE; ctl |= BGMAC_DMA_RX_PARITY_DISABLE; ctl |= BGMAC_DMA_RX_OVERFLOW_CONT; @@ -1046,9 +1049,9 @@ static void bgmac_enable(struct bgmac *bgmac) mode = (bgmac_read(bgmac, BGMAC_DEV_STATUS) & BGMAC_DS_MM_MASK) >> BGMAC_DS_MM_SHIFT; - if (!(bgmac->feature_flags & BGMAC_FEAT_CLKCTLST) || mode != 0) + if (bgmac->feature_flags & BGMAC_FEAT_CLKCTLST || mode != 0) bgmac_set(bgmac, BCMA_CLKCTLST, BCMA_CLKCTLST_FORCEHT); - if (bgmac->feature_flags & BGMAC_FEAT_CLKCTLST && mode == 2) + if (!(bgmac->feature_flags & BGMAC_FEAT_CLKCTLST) && mode == 2) bgmac_cco_ctl_maskset(bgmac, 1, ~0, BGMAC_CHIPCTL_1_RXC_DLL_BYPASS); diff --git a/drivers/net/ethernet/broadcom/bnx2.c b/drivers/net/ethernet/broadcom/bnx2.c index b3791b3..1f7034d 100644 --- a/drivers/net/ethernet/broadcom/bnx2.c +++ b/drivers/net/ethernet/broadcom/bnx2.c @@ -49,6 +49,7 @@ #include <linux/firmware.h> #include <linux/log2.h> #include <linux/aer.h> +#include <linux/crash_dump.h> #if IS_ENABLED(CONFIG_CNIC) #define BCM_CNIC 1 @@ -4764,15 +4765,16 @@ bnx2_setup_msix_tbl(struct bnx2 *bp) BNX2_WR(bp, BNX2_PCI_GRC_WINDOW3_ADDR, BNX2_MSIX_PBA_ADDR); } -static int -bnx2_reset_chip(struct bnx2 *bp, u32 reset_code) +static void +bnx2_wait_dma_complete(struct bnx2 *bp) { u32 val; - int i, rc = 0; - u8 old_port; + int i; - /* Wait for the current PCI transaction to complete before - * issuing a reset. */ + /* + * Wait for the current PCI transaction to complete before + * issuing a reset. + */ if ((BNX2_CHIP(bp) == BNX2_CHIP_5706) || (BNX2_CHIP(bp) == BNX2_CHIP_5708)) { BNX2_WR(bp, BNX2_MISC_ENABLE_CLR_BITS, @@ -4796,6 +4798,21 @@ bnx2_reset_chip(struct bnx2 *bp, u32 reset_code) } } + return; +} + + +static int +bnx2_reset_chip(struct bnx2 *bp, u32 reset_code) +{ + u32 val; + int i, rc = 0; + u8 old_port; + + /* Wait for the current PCI transaction to complete before + * issuing a reset. */ + bnx2_wait_dma_complete(bp); + /* Wait for the firmware to tell us it is ok to issue a reset. */ bnx2_fw_sync(bp, BNX2_DRV_MSG_DATA_WAIT0 | reset_code, 1, 1); @@ -6361,6 +6378,10 @@ bnx2_open(struct net_device *dev) struct bnx2 *bp = netdev_priv(dev); int rc; + rc = bnx2_request_firmware(bp); + if (rc < 0) + goto out; + netif_carrier_off(dev); bnx2_disable_int(bp); @@ -6429,6 +6450,7 @@ open_err: bnx2_free_irq(bp); bnx2_free_mem(bp); bnx2_del_napi(bp); + bnx2_release_firmware(bp); goto out; } @@ -8575,12 +8597,15 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) pci_set_drvdata(pdev, dev); - rc = bnx2_request_firmware(bp); - if (rc < 0) - goto error; - + /* + * In-flight DMA from 1st kernel could continue going in kdump kernel. + * New io-page table has been created before bnx2 does reset at open stage. + * We have to wait for the in-flight DMA to complete to avoid it look up + * into the newly created io-page table. + */ + if (is_kdump_kernel()) + bnx2_wait_dma_complete(bp); - bnx2_reset_chip(bp, BNX2_DRV_MSG_CODE_RESET); memcpy(dev->dev_addr, bp->mac_addr, ETH_ALEN); dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_SG | @@ -8613,7 +8638,6 @@ bnx2_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) return 0; error: - bnx2_release_firmware(bp); pci_iounmap(pdev, bp->regview); pci_release_regions(pdev); pci_disable_device(pdev); diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c index a9f9f37..c690966 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c @@ -6309,6 +6309,7 @@ static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto, struct tc_to_netdev *ntc) { struct bnxt *bp = netdev_priv(dev); + bool sh = false; u8 tc; if (ntc->type != TC_SETUP_MQPRIO) @@ -6325,12 +6326,11 @@ static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto, if (netdev_get_num_tc(dev) == tc) return 0; + if (bp->flags & BNXT_FLAG_SHARED_RINGS) + sh = true; + if (tc) { int max_rx_rings, max_tx_rings, rc; - bool sh = false; - - if (bp->flags & BNXT_FLAG_SHARED_RINGS) - sh = true; rc = bnxt_get_max_rings(bp, &max_rx_rings, &max_tx_rings, sh); if (rc || bp->tx_nr_rings_per_tc * tc > max_tx_rings) @@ -6348,7 +6348,8 @@ static int bnxt_setup_tc(struct net_device *dev, u32 handle, __be16 proto, bp->tx_nr_rings = bp->tx_nr_rings_per_tc; netdev_reset_tc(dev); } - bp->cp_nr_rings = max_t(int, bp->tx_nr_rings, bp->rx_nr_rings); + bp->cp_nr_rings = sh ? max_t(int, bp->tx_nr_rings, bp->rx_nr_rings) : + bp->tx_nr_rings + bp->rx_nr_rings; bp->num_stat_ctxs = bp->cp_nr_rings; if (netif_running(bp->dev)) diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c index ec6cd18..60e2af8 100644 --- a/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c +++ b/drivers/net/ethernet/broadcom/bnxt/bnxt_sriov.c @@ -774,8 +774,8 @@ static int bnxt_vf_set_link(struct bnxt *bp, struct bnxt_vf_info *vf) if (vf->flags & BNXT_VF_LINK_UP) { /* if physical link is down, force link up on VF */ - if (phy_qcfg_resp.link == - PORT_PHY_QCFG_RESP_LINK_NO_LINK) { + if (phy_qcfg_resp.link != + PORT_PHY_QCFG_RESP_LINK_LINK) { phy_qcfg_resp.link = PORT_PHY_QCFG_RESP_LINK_LINK; phy_qcfg_resp.link_speed = cpu_to_le16( diff --git a/drivers/net/ethernet/brocade/bna/bnad.c b/drivers/net/ethernet/brocade/bna/bnad.c index f9df4b5a..f42f672 100644 --- a/drivers/net/ethernet/brocade/bna/bnad.c +++ b/drivers/net/ethernet/brocade/bna/bnad.c @@ -177,6 +177,7 @@ bnad_txcmpl_process(struct bnad *bnad, struct bna_tcb *tcb) return 0; hw_cons = *(tcb->hw_consumer_index); + rmb(); cons = tcb->consumer_index; q_depth = tcb->q_depth; @@ -3094,7 +3095,7 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev) BNA_QE_INDX_INC(prod, q_depth); tcb->producer_index = prod; - smp_mb(); + wmb(); if (unlikely(!test_bit(BNAD_TXQ_TX_STARTED, &tcb->flags))) return NETDEV_TX_OK; @@ -3102,7 +3103,6 @@ bnad_start_xmit(struct sk_buff *skb, struct net_device *netdev) skb_tx_timestamp(skb); bna_txq_prod_indx_doorbell(tcb); - smp_mb(); return NETDEV_TX_OK; } diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h index 50812a1..df1573c 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_pci_id_tbl.h @@ -178,9 +178,9 @@ CH_PCI_DEVICE_ID_TABLE_DEFINE_BEGIN CH_PCI_ID_TABLE_FENTRY(0x6005), CH_PCI_ID_TABLE_FENTRY(0x6006), CH_PCI_ID_TABLE_FENTRY(0x6007), + CH_PCI_ID_TABLE_FENTRY(0x6008), CH_PCI_ID_TABLE_FENTRY(0x6009), CH_PCI_ID_TABLE_FENTRY(0x600d), - CH_PCI_ID_TABLE_FENTRY(0x6010), CH_PCI_ID_TABLE_FENTRY(0x6011), CH_PCI_ID_TABLE_FENTRY(0x6014), CH_PCI_ID_TABLE_FENTRY(0x6015), diff --git a/drivers/net/ethernet/hisilicon/hns/hnae.c b/drivers/net/ethernet/hisilicon/hns/hnae.c index c54c6fa..b6ed818 100644 --- a/drivers/net/ethernet/hisilicon/hns/hnae.c +++ b/drivers/net/ethernet/hisilicon/hns/hnae.c @@ -332,8 +332,10 @@ struct hnae_handle *hnae_get_handle(struct device *owner_dev, return ERR_PTR(-ENODEV); handle = dev->ops->get_handle(dev, port_id); - if (IS_ERR(handle)) + if (IS_ERR(handle)) { + put_device(&dev->cls_dev); return handle; + } handle->dev = dev; handle->owner_dev = owner_dev; @@ -356,6 +358,8 @@ out_when_init_queue: for (j = i - 1; j >= 0; j--) hnae_fini_queue(handle->qs[j]); + put_device(&dev->cls_dev); + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(hnae_get_handle); @@ -377,6 +381,8 @@ void hnae_put_handle(struct hnae_handle *h) dev->ops->put_handle(h); module_put(dev->owner); + + put_device(&dev->cls_dev); } EXPORT_SYMBOL(hnae_put_handle); diff --git a/drivers/net/ethernet/ibm/ehea/ehea_main.c b/drivers/net/ethernet/ibm/ehea/ehea_main.c index 54efa9a..bd719e2 100644 --- a/drivers/net/ethernet/ibm/ehea/ehea_main.c +++ b/drivers/net/ethernet/ibm/ehea/ehea_main.c @@ -2446,6 +2446,8 @@ static int ehea_open(struct net_device *dev) netif_info(port, ifup, dev, "enabling port\n"); + netif_carrier_off(dev); + ret = ehea_up(dev); if (!ret) { port_napi_enable(port); diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 5f44c55..4f3281a 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -1505,9 +1505,8 @@ static void init_sub_crqs(struct ibmvnic_adapter *adapter, int retry) adapter->max_rx_add_entries_per_subcrq > entries_page ? entries_page : adapter->max_rx_add_entries_per_subcrq; - /* Choosing the maximum number of queues supported by firmware*/ - adapter->req_tx_queues = adapter->max_tx_queues; - adapter->req_rx_queues = adapter->max_rx_queues; + adapter->req_tx_queues = adapter->opt_tx_comp_sub_queues; + adapter->req_rx_queues = adapter->opt_rx_comp_queues; adapter->req_rx_add_queues = adapter->max_rx_add_queues; adapter->req_mtu = adapter->max_mtu; @@ -3706,7 +3705,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) struct net_device *netdev; unsigned char *mac_addr_p; struct dentry *ent; - char buf[16]; /* debugfs name buf */ + char buf[17]; /* debugfs name buf */ int rc; dev_dbg(&dev->dev, "entering ibmvnic_probe for UA 0x%x\n", @@ -3845,6 +3844,9 @@ static int ibmvnic_remove(struct vio_dev *dev) if (adapter->debugfs_dir && !IS_ERR(adapter->debugfs_dir)) debugfs_remove_recursive(adapter->debugfs_dir); + dma_unmap_single(&dev->dev, adapter->stats_token, + sizeof(struct ibmvnic_statistics), DMA_FROM_DEVICE); + if (adapter->ras_comps) dma_free_coherent(&dev->dev, adapter->ras_comp_num * diff --git a/drivers/net/ethernet/marvell/mv643xx_eth.c b/drivers/net/ethernet/marvell/mv643xx_eth.c index bf5cc55b..5b12022 100644 --- a/drivers/net/ethernet/marvell/mv643xx_eth.c +++ b/drivers/net/ethernet/marvell/mv643xx_eth.c @@ -1381,6 +1381,7 @@ static unsigned int get_rx_coal(struct mv643xx_eth_private *mp) temp = (val & 0x003fff00) >> 8; temp *= 64000000; + temp += mp->t_clk / 2; do_div(temp, mp->t_clk); return (unsigned int)temp; @@ -1417,6 +1418,7 @@ static unsigned int get_tx_coal(struct mv643xx_eth_private *mp) temp = (rdlp(mp, TX_FIFO_URGENT_THRESHOLD) & 0x3fff0) >> 4; temp *= 64000000; + temp += mp->t_clk / 2; do_div(temp, mp->t_clk); return (unsigned int)temp; diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index 12c99a2..3a47e83 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2202,7 +2202,6 @@ void mlx4_en_destroy_netdev(struct net_device *dev) if (!shutdown) free_netdev(dev); - dev->ethtool_ops = NULL; } static int mlx4_en_change_mtu(struct net_device *dev, int new_mtu) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index f4c687c..84e8b25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -1445,6 +1445,7 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, c->netdev = priv->netdev; c->mkey_be = cpu_to_be32(priv->mdev->mlx5e_res.mkey.key); c->num_tc = priv->params.num_tc; + c->xdp = !!priv->xdp_prog; if (priv->params.rx_am_enabled) rx_cq_profile = mlx5e_am_get_def_profile(priv->params.rx_cq_period_mode); @@ -1468,6 +1469,12 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, if (err) goto err_close_tx_cqs; + /* XDP SQ CQ params are same as normal TXQ sq CQ params */ + err = c->xdp ? mlx5e_open_cq(c, &cparam->tx_cq, &c->xdp_sq.cq, + priv->params.tx_cq_moderation) : 0; + if (err) + goto err_close_rx_cq; + napi_enable(&c->napi); err = mlx5e_open_sq(c, 0, &cparam->icosq, &c->icosq); @@ -1488,21 +1495,10 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, } } - if (priv->xdp_prog) { - /* XDP SQ CQ params are same as normal TXQ sq CQ params */ - err = mlx5e_open_cq(c, &cparam->tx_cq, &c->xdp_sq.cq, - priv->params.tx_cq_moderation); - if (err) - goto err_close_sqs; - - err = mlx5e_open_sq(c, 0, &cparam->xdp_sq, &c->xdp_sq); - if (err) { - mlx5e_close_cq(&c->xdp_sq.cq); - goto err_close_sqs; - } - } + err = c->xdp ? mlx5e_open_sq(c, 0, &cparam->xdp_sq, &c->xdp_sq) : 0; + if (err) + goto err_close_sqs; - c->xdp = !!priv->xdp_prog; err = mlx5e_open_rq(c, &cparam->rq, &c->rq); if (err) goto err_close_xdp_sq; @@ -1512,7 +1508,8 @@ static int mlx5e_open_channel(struct mlx5e_priv *priv, int ix, return 0; err_close_xdp_sq: - mlx5e_close_sq(&c->xdp_sq); + if (c->xdp) + mlx5e_close_sq(&c->xdp_sq); err_close_sqs: mlx5e_close_sqs(c); @@ -1522,6 +1519,10 @@ err_close_icosq: err_disable_napi: napi_disable(&c->napi); + if (c->xdp) + mlx5e_close_cq(&c->xdp_sq.cq); + +err_close_rx_cq: mlx5e_close_cq(&c->rq.cq); err_close_tx_cqs: diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c index 7fe6559e..bf1c09c 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rep.c @@ -308,7 +308,7 @@ static void mlx5e_build_rep_netdev(struct net_device *netdev) netdev->switchdev_ops = &mlx5e_rep_switchdev_ops; #endif - netdev->features |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC; + netdev->features |= NETIF_F_VLAN_CHALLENGED | NETIF_F_HW_TC | NETIF_F_NETNS_LOCAL; netdev->hw_features |= NETIF_F_HW_TC; eth_hw_addr_random(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c index ce8c54d..6bb21b3 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c @@ -237,12 +237,15 @@ static int parse_cls_flower(struct mlx5e_priv *priv, struct mlx5_flow_spec *spec skb_flow_dissector_target(f->dissector, FLOW_DISSECTOR_KEY_VLAN, f->mask); - if (mask->vlan_id) { + if (mask->vlan_id || mask->vlan_priority) { MLX5_SET(fte_match_set_lyr_2_4, headers_c, vlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_v, vlan_tag, 1); MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_vid, mask->vlan_id); MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_vid, key->vlan_id); + + MLX5_SET(fte_match_set_lyr_2_4, headers_c, first_prio, mask->vlan_priority); + MLX5_SET(fte_match_set_lyr_2_4, headers_v, first_prio, key->vlan_priority); } } diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c index c55ad8d..d239f5d 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eswitch_offloads.c @@ -57,7 +57,8 @@ mlx5_eswitch_add_offloaded_rule(struct mlx5_eswitch *esw, if (esw->mode != SRIOV_OFFLOADS) return ERR_PTR(-EOPNOTSUPP); - action = attr->action; + /* per flow vlan pop/push is emulated, don't set that into the firmware */ + action = attr->action & ~(MLX5_FLOW_CONTEXT_ACTION_VLAN_PUSH | MLX5_FLOW_CONTEXT_ACTION_VLAN_POP); if (action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { dest.type = MLX5_FLOW_DESTINATION_TYPE_VPORT; diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c index 8969604..914e546 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_core.c @@ -1690,7 +1690,7 @@ static int init_root_ns(struct mlx5_flow_steering *steering) { steering->root_ns = create_root_ns(steering, FS_FT_NIC_RX); - if (IS_ERR_OR_NULL(steering->root_ns)) + if (!steering->root_ns) goto cleanup; if (init_root_tree(steering, &root_fs, &steering->root_ns->ns.node)) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index d5433c4..3eb9315 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -1226,6 +1226,9 @@ static int init_one(struct pci_dev *pdev, pci_set_drvdata(pdev, dev); + dev->pdev = pdev; + dev->event = mlx5_core_event; + if (prof_sel < 0 || prof_sel >= ARRAY_SIZE(profile)) { mlx5_core_warn(dev, "selected profile out of range, selecting default (%d)\n", @@ -1233,8 +1236,6 @@ static int init_one(struct pci_dev *pdev, prof_sel = MLX5_DEFAULT_PROF; } dev->profile = &profile[prof_sel]; - dev->pdev = pdev; - dev->event = mlx5_core_event; INIT_LIST_HEAD(&priv->ctx_list); spin_lock_init(&priv->ctx_lock); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c index 1ec0a4c..dda5761 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.c @@ -231,7 +231,7 @@ mlxsw_sp_span_entry_create(struct mlxsw_sp_port *port) span_entry->used = true; span_entry->id = index; - span_entry->ref_count = 0; + span_entry->ref_count = 1; span_entry->local_port = local_port; return span_entry; } @@ -270,6 +270,7 @@ static struct mlxsw_sp_span_entry span_entry = mlxsw_sp_span_entry_find(port); if (span_entry) { + /* Already exists, just take a reference */ span_entry->ref_count++; return span_entry; } @@ -280,6 +281,7 @@ static struct mlxsw_sp_span_entry static int mlxsw_sp_span_entry_put(struct mlxsw_sp *mlxsw_sp, struct mlxsw_sp_span_entry *span_entry) { + WARN_ON(!span_entry->ref_count); if (--span_entry->ref_count == 0) mlxsw_sp_span_entry_destroy(mlxsw_sp, span_entry); return 0; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h index 9b22863..97bbc1d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum.h +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum.h @@ -115,7 +115,7 @@ struct mlxsw_sp_rif { struct mlxsw_sp_mid { struct list_head list; unsigned char addr[ETH_ALEN]; - u16 vid; + u16 fid; u16 mid; unsigned int ref_count; }; diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c index 4573da2..e83072d 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c @@ -594,21 +594,22 @@ static int mlxsw_sp_vrs_init(struct mlxsw_sp *mlxsw_sp) return 0; } +static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp); + static void mlxsw_sp_vrs_fini(struct mlxsw_sp *mlxsw_sp) { + mlxsw_sp_router_fib_flush(mlxsw_sp); kfree(mlxsw_sp->router.vrs); } struct mlxsw_sp_neigh_key { - unsigned char addr[sizeof(struct in6_addr)]; - struct net_device *dev; + struct neighbour *n; }; struct mlxsw_sp_neigh_entry { struct rhash_head ht_node; struct mlxsw_sp_neigh_key key; u16 rif; - struct neighbour *n; bool offloaded; struct delayed_work dw; struct mlxsw_sp_port *mlxsw_sp_port; @@ -646,19 +647,15 @@ mlxsw_sp_neigh_entry_remove(struct mlxsw_sp *mlxsw_sp, static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work); static struct mlxsw_sp_neigh_entry * -mlxsw_sp_neigh_entry_create(const void *addr, size_t addr_len, - struct net_device *dev, u16 rif, - struct neighbour *n) +mlxsw_sp_neigh_entry_create(struct neighbour *n, u16 rif) { struct mlxsw_sp_neigh_entry *neigh_entry; neigh_entry = kzalloc(sizeof(*neigh_entry), GFP_ATOMIC); if (!neigh_entry) return NULL; - memcpy(neigh_entry->key.addr, addr, addr_len); - neigh_entry->key.dev = dev; + neigh_entry->key.n = n; neigh_entry->rif = rif; - neigh_entry->n = n; INIT_DELAYED_WORK(&neigh_entry->dw, mlxsw_sp_router_neigh_update_hw); INIT_LIST_HEAD(&neigh_entry->nexthop_list); return neigh_entry; @@ -671,13 +668,11 @@ mlxsw_sp_neigh_entry_destroy(struct mlxsw_sp_neigh_entry *neigh_entry) } static struct mlxsw_sp_neigh_entry * -mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, const void *addr, - size_t addr_len, struct net_device *dev) +mlxsw_sp_neigh_entry_lookup(struct mlxsw_sp *mlxsw_sp, struct neighbour *n) { - struct mlxsw_sp_neigh_key key = {{ 0 } }; + struct mlxsw_sp_neigh_key key; - memcpy(key.addr, addr, addr_len); - key.dev = dev; + key.n = n; return rhashtable_lookup_fast(&mlxsw_sp->router.neigh_ht, &key, mlxsw_sp_neigh_ht_params); } @@ -689,26 +684,20 @@ int mlxsw_sp_router_neigh_construct(struct net_device *dev, struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_neigh_entry *neigh_entry; struct mlxsw_sp_rif *r; - u32 dip; int err; if (n->tbl != &arp_tbl) return 0; - dip = ntohl(*((__be32 *) n->primary_key)); - neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip), - n->dev); - if (neigh_entry) { - WARN_ON(neigh_entry->n != n); + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n); + if (neigh_entry) return 0; - } r = mlxsw_sp_rif_find_by_dev(mlxsw_sp, n->dev); if (WARN_ON(!r)) return -EINVAL; - neigh_entry = mlxsw_sp_neigh_entry_create(&dip, sizeof(dip), n->dev, - r->rif, n); + neigh_entry = mlxsw_sp_neigh_entry_create(n, r->rif); if (!neigh_entry) return -ENOMEM; err = mlxsw_sp_neigh_entry_insert(mlxsw_sp, neigh_entry); @@ -727,14 +716,11 @@ void mlxsw_sp_router_neigh_destroy(struct net_device *dev, struct mlxsw_sp_port *mlxsw_sp_port = netdev_priv(dev); struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; struct mlxsw_sp_neigh_entry *neigh_entry; - u32 dip; if (n->tbl != &arp_tbl) return; - dip = ntohl(*((__be32 *) n->primary_key)); - neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &dip, sizeof(dip), - n->dev); + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n); if (!neigh_entry) return; mlxsw_sp_neigh_entry_remove(mlxsw_sp, neigh_entry); @@ -817,6 +803,26 @@ static void mlxsw_sp_router_neigh_rec_process(struct mlxsw_sp *mlxsw_sp, } } +static bool mlxsw_sp_router_rauhtd_is_full(char *rauhtd_pl) +{ + u8 num_rec, last_rec_index, num_entries; + + num_rec = mlxsw_reg_rauhtd_num_rec_get(rauhtd_pl); + last_rec_index = num_rec - 1; + + if (num_rec < MLXSW_REG_RAUHTD_REC_MAX_NUM) + return false; + if (mlxsw_reg_rauhtd_rec_type_get(rauhtd_pl, last_rec_index) == + MLXSW_REG_RAUHTD_TYPE_IPV6) + return true; + + num_entries = mlxsw_reg_rauhtd_ipv4_rec_num_entries_get(rauhtd_pl, + last_rec_index); + if (++num_entries == MLXSW_REG_RAUHTD_IPV4_ENT_PER_REC) + return true; + return false; +} + static int mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp) { char *rauhtd_pl; @@ -843,7 +849,7 @@ static int mlxsw_sp_router_neighs_update_rauhtd(struct mlxsw_sp *mlxsw_sp) for (i = 0; i < num_rec; i++) mlxsw_sp_router_neigh_rec_process(mlxsw_sp, rauhtd_pl, i); - } while (num_rec); + } while (mlxsw_sp_router_rauhtd_is_full(rauhtd_pl)); rtnl_unlock(); kfree(rauhtd_pl); @@ -862,7 +868,7 @@ static void mlxsw_sp_router_neighs_update_nh(struct mlxsw_sp *mlxsw_sp) * is active regardless of the traffic. */ if (!list_empty(&neigh_entry->nexthop_list)) - neigh_event_send(neigh_entry->n, NULL); + neigh_event_send(neigh_entry->key.n, NULL); } rtnl_unlock(); } @@ -908,9 +914,9 @@ static void mlxsw_sp_router_probe_unresolved_nexthops(struct work_struct *work) rtnl_lock(); list_for_each_entry(neigh_entry, &mlxsw_sp->router.nexthop_neighs_list, nexthop_neighs_list_node) { - if (!(neigh_entry->n->nud_state & NUD_VALID) && + if (!(neigh_entry->key.n->nud_state & NUD_VALID) && !list_empty(&neigh_entry->nexthop_list)) - neigh_event_send(neigh_entry->n, NULL); + neigh_event_send(neigh_entry->key.n, NULL); } rtnl_unlock(); @@ -927,7 +933,7 @@ static void mlxsw_sp_router_neigh_update_hw(struct work_struct *work) { struct mlxsw_sp_neigh_entry *neigh_entry = container_of(work, struct mlxsw_sp_neigh_entry, dw.work); - struct neighbour *n = neigh_entry->n; + struct neighbour *n = neigh_entry->key.n; struct mlxsw_sp_port *mlxsw_sp_port = neigh_entry->mlxsw_sp_port; struct mlxsw_sp *mlxsw_sp = mlxsw_sp_port->mlxsw_sp; char rauht_pl[MLXSW_REG_RAUHT_LEN]; @@ -1030,11 +1036,8 @@ int mlxsw_sp_router_netevent_event(struct notifier_block *unused, mlxsw_sp = mlxsw_sp_port->mlxsw_sp; dip = ntohl(*((__be32 *) n->primary_key)); - neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, - &dip, - sizeof(__be32), - dev); - if (WARN_ON(!neigh_entry) || WARN_ON(neigh_entry->n != n)) { + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n); + if (WARN_ON(!neigh_entry)) { mlxsw_sp_port_dev_put(mlxsw_sp_port); return NOTIFY_DONE; } @@ -1343,33 +1346,26 @@ static int mlxsw_sp_nexthop_init(struct mlxsw_sp *mlxsw_sp, struct fib_nh *fib_nh) { struct mlxsw_sp_neigh_entry *neigh_entry; - u32 gwip = ntohl(fib_nh->nh_gw); struct net_device *dev = fib_nh->nh_dev; struct neighbour *n; u8 nud_state; - neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip, - sizeof(gwip), dev); - if (!neigh_entry) { - __be32 gwipn = htonl(gwip); - - n = neigh_create(&arp_tbl, &gwipn, dev); + /* Take a reference of neigh here ensuring that neigh would + * not be detructed before the nexthop entry is finished. + * The reference is taken either in neigh_lookup() or + * in neith_create() in case n is not found. + */ + n = neigh_lookup(&arp_tbl, &fib_nh->nh_gw, dev); + if (!n) { + n = neigh_create(&arp_tbl, &fib_nh->nh_gw, dev); if (IS_ERR(n)) return PTR_ERR(n); neigh_event_send(n, NULL); - neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, &gwip, - sizeof(gwip), dev); - if (!neigh_entry) { - neigh_release(n); - return -EINVAL; - } - } else { - /* Take a reference of neigh here ensuring that neigh would - * not be detructed before the nexthop entry is finished. - * The second branch takes the reference in neith_create() - */ - n = neigh_entry->n; - neigh_clone(n); + } + neigh_entry = mlxsw_sp_neigh_entry_lookup(mlxsw_sp, n); + if (!neigh_entry) { + neigh_release(n); + return -EINVAL; } /* If that is the first nexthop connected to that neigh, add to @@ -1403,7 +1399,7 @@ static void mlxsw_sp_nexthop_fini(struct mlxsw_sp *mlxsw_sp, if (list_empty(&nh->neigh_entry->nexthop_list)) list_del(&nh->neigh_entry->nexthop_neighs_list_node); - neigh_release(neigh_entry->n); + neigh_release(neigh_entry->key.n); } static struct mlxsw_sp_nexthop_group * @@ -1463,11 +1459,11 @@ static bool mlxsw_sp_nexthop_match(struct mlxsw_sp_nexthop *nh, for (i = 0; i < fi->fib_nhs; i++) { struct fib_nh *fib_nh = &fi->fib_nh[i]; - u32 gwip = ntohl(fib_nh->nh_gw); + struct neighbour *n = nh->neigh_entry->key.n; - if (memcmp(nh->neigh_entry->key.addr, - &gwip, sizeof(u32)) == 0 && - nh->neigh_entry->key.dev == fib_nh->nh_dev) + if (memcmp(n->primary_key, &fib_nh->nh_gw, + sizeof(fib_nh->nh_gw)) == 0 && + n->dev == fib_nh->nh_dev) return true; } return false; @@ -1874,18 +1870,18 @@ static int mlxsw_sp_router_set_abort_trap(struct mlxsw_sp *mlxsw_sp) return mlxsw_reg_write(mlxsw_sp->core, MLXSW_REG(ralue), ralue_pl); } -static void mlxsw_sp_router_fib4_abort(struct mlxsw_sp *mlxsw_sp) +static void mlxsw_sp_router_fib_flush(struct mlxsw_sp *mlxsw_sp) { struct mlxsw_resources *resources; struct mlxsw_sp_fib_entry *fib_entry; struct mlxsw_sp_fib_entry *tmp; struct mlxsw_sp_vr *vr; int i; - int err; resources = mlxsw_core_resources_get(mlxsw_sp->core); for (i = 0; i < resources->max_virtual_routers; i++) { vr = &mlxsw_sp->router.vrs[i]; + if (!vr->used) continue; @@ -1901,6 +1897,13 @@ static void mlxsw_sp_router_fib4_abort(struct mlxsw_sp *mlxsw_sp) break; } } +} + +static void mlxsw_sp_router_fib4_abort(struct mlxsw_sp *mlxsw_sp) +{ + int err; + + mlxsw_sp_router_fib_flush(mlxsw_sp); mlxsw_sp->router.aborted = true; err = mlxsw_sp_router_set_abort_trap(mlxsw_sp); if (err) @@ -1958,6 +1961,9 @@ static int mlxsw_sp_router_fib_event(struct notifier_block *nb, struct fib_entry_notifier_info *fen_info = ptr; int err; + if (!net_eq(fen_info->info.net, &init_net)) + return NOTIFY_DONE; + switch (event) { case FIB_EVENT_ENTRY_ADD: err = mlxsw_sp_router_fib4_add(mlxsw_sp, fen_info); diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c index 5e00c79..1e2c8ec 100644 --- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c +++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_switchdev.c @@ -929,12 +929,12 @@ static int mlxsw_sp_port_smid_set(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid, static struct mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp *mlxsw_sp, const unsigned char *addr, - u16 vid) + u16 fid) { struct mlxsw_sp_mid *mid; list_for_each_entry(mid, &mlxsw_sp->br_mids.list, list) { - if (ether_addr_equal(mid->addr, addr) && mid->vid == vid) + if (ether_addr_equal(mid->addr, addr) && mid->fid == fid) return mid; } return NULL; @@ -942,7 +942,7 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_get(struct mlxsw_sp *mlxsw_sp, static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, const unsigned char *addr, - u16 vid) + u16 fid) { struct mlxsw_sp_mid *mid; u16 mid_idx; @@ -958,7 +958,7 @@ static struct mlxsw_sp_mid *__mlxsw_sp_mc_alloc(struct mlxsw_sp *mlxsw_sp, set_bit(mid_idx, mlxsw_sp->br_mids.mapped); ether_addr_copy(mid->addr, addr); - mid->vid = vid; + mid->fid = fid; mid->mid = mid_idx; mid->ref_count = 0; list_add_tail(&mid->list, &mlxsw_sp->br_mids.list); @@ -991,9 +991,9 @@ static int mlxsw_sp_port_mdb_add(struct mlxsw_sp_port *mlxsw_sp_port, if (switchdev_trans_ph_prepare(trans)) return 0; - mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, mdb->vid); + mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, fid); if (!mid) { - mid = __mlxsw_sp_mc_alloc(mlxsw_sp, mdb->addr, mdb->vid); + mid = __mlxsw_sp_mc_alloc(mlxsw_sp, mdb->addr, fid); if (!mid) { netdev_err(dev, "Unable to allocate MC group\n"); return -ENOMEM; @@ -1137,7 +1137,7 @@ static int mlxsw_sp_port_mdb_del(struct mlxsw_sp_port *mlxsw_sp_port, u16 mid_idx; int err = 0; - mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, mdb->vid); + mid = __mlxsw_sp_mc_get(mlxsw_sp, mdb->addr, fid); if (!mid) { netdev_err(dev, "Unable to remove port from MC DB\n"); return -EINVAL; diff --git a/drivers/net/ethernet/qlogic/qed/qed_hsi.h b/drivers/net/ethernet/qlogic/qed/qed_hsi.h index 72eee29..2777d5b 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_hsi.h +++ b/drivers/net/ethernet/qlogic/qed/qed_hsi.h @@ -727,9 +727,6 @@ struct core_tx_bd_flags { #define CORE_TX_BD_FLAGS_L4_PROTOCOL_SHIFT 6 #define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_MASK 0x1 #define CORE_TX_BD_FLAGS_L4_PSEUDO_CSUM_MODE_SHIFT 7 -#define CORE_TX_BD_FLAGS_ROCE_FLAV_MASK 0x1 -#define CORE_TX_BD_FLAGS_ROCE_FLAV_SHIFT 12 - }; struct core_tx_bd { diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c index 63e1a1b..f95385c 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c +++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c @@ -1119,6 +1119,7 @@ static void qed_ll2_prepare_tx_packet_set_bd(struct qed_hwfn *p_hwfn, start_bd->bd_flags.as_bitfield |= CORE_TX_BD_FLAGS_START_BD_MASK << CORE_TX_BD_FLAGS_START_BD_SHIFT; SET_FIELD(start_bd->bitfield0, CORE_TX_BD_NBDS, num_of_bds); + SET_FIELD(start_bd->bitfield0, CORE_TX_BD_ROCE_FLAV, type); DMA_REGPAIR_LE(start_bd->addr, first_frag); start_bd->nbytes = cpu_to_le16(first_frag_len); diff --git a/drivers/net/ethernet/qlogic/qed/qed_main.c b/drivers/net/ethernet/qlogic/qed/qed_main.c index c418360..333c744 100644 --- a/drivers/net/ethernet/qlogic/qed/qed_main.c +++ b/drivers/net/ethernet/qlogic/qed/qed_main.c @@ -839,20 +839,19 @@ static void qed_update_pf_params(struct qed_dev *cdev, { int i; + if (IS_ENABLED(CONFIG_QED_RDMA)) { + params->rdma_pf_params.num_qps = QED_ROCE_QPS; + params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; + /* divide by 3 the MRs to avoid MF ILT overflow */ + params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; + params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; + } + for (i = 0; i < cdev->num_hwfns; i++) { struct qed_hwfn *p_hwfn = &cdev->hwfns[i]; p_hwfn->pf_params = *params; } - - if (!IS_ENABLED(CONFIG_QED_RDMA)) - return; - - params->rdma_pf_params.num_qps = QED_ROCE_QPS; - params->rdma_pf_params.min_dpis = QED_ROCE_DPIS; - /* divide by 3 the MRs to avoid MF ILT overflow */ - params->rdma_pf_params.num_mrs = RDMA_MAX_TIDS; - params->rdma_pf_params.gl_pi = QED_ROCE_PROTOCOL_INDEX; } static int qed_slowpath_start(struct qed_dev *cdev, diff --git a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c index 12251a1..7567cc4 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_ethtool.c +++ b/drivers/net/ethernet/qlogic/qede/qede_ethtool.c @@ -175,16 +175,23 @@ static void qede_get_strings_stats(struct qede_dev *edev, u8 *buf) for (i = 0, k = 0; i < QEDE_QUEUE_CNT(edev); i++) { int tc; - for (j = 0; j < QEDE_NUM_RQSTATS; j++) - sprintf(buf + (k + j) * ETH_GSTRING_LEN, - "%d: %s", i, qede_rqstats_arr[j].string); - k += QEDE_NUM_RQSTATS; - for (tc = 0; tc < edev->num_tc; tc++) { - for (j = 0; j < QEDE_NUM_TQSTATS; j++) + if (edev->fp_array[i].type & QEDE_FASTPATH_RX) { + for (j = 0; j < QEDE_NUM_RQSTATS; j++) sprintf(buf + (k + j) * ETH_GSTRING_LEN, - "%d.%d: %s", i, tc, - qede_tqstats_arr[j].string); - k += QEDE_NUM_TQSTATS; + "%d: %s", i, + qede_rqstats_arr[j].string); + k += QEDE_NUM_RQSTATS; + } + + if (edev->fp_array[i].type & QEDE_FASTPATH_TX) { + for (tc = 0; tc < edev->num_tc; tc++) { + for (j = 0; j < QEDE_NUM_TQSTATS; j++) + sprintf(buf + (k + j) * + ETH_GSTRING_LEN, + "%d.%d: %s", i, tc, + qede_tqstats_arr[j].string); + k += QEDE_NUM_TQSTATS; + } } } diff --git a/drivers/net/ethernet/qlogic/qede/qede_main.c b/drivers/net/ethernet/qlogic/qede/qede_main.c index 7def29a..85f46db 100644 --- a/drivers/net/ethernet/qlogic/qede/qede_main.c +++ b/drivers/net/ethernet/qlogic/qede/qede_main.c @@ -2839,7 +2839,7 @@ static int qede_alloc_sge_mem(struct qede_dev *edev, struct qede_rx_queue *rxq) } mapping = dma_map_page(&edev->pdev->dev, replace_buf->data, 0, - rxq->rx_buf_size, DMA_FROM_DEVICE); + PAGE_SIZE, DMA_FROM_DEVICE); if (unlikely(dma_mapping_error(&edev->pdev->dev, mapping))) { DP_NOTICE(edev, "Failed to map TPA replacement buffer\n"); diff --git a/drivers/net/ethernet/qualcomm/emac/emac-mac.c b/drivers/net/ethernet/qualcomm/emac/emac-mac.c index 6fb3bee..0b4deb3 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-mac.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-mac.c @@ -575,10 +575,11 @@ void emac_mac_start(struct emac_adapter *adpt) mac |= TXEN | RXEN; /* enable RX/TX */ - /* We don't have ethtool support yet, so force flow-control mode - * to 'full' always. - */ - mac |= TXFC | RXFC; + /* Configure MAC flow control to match the PHY's settings. */ + if (phydev->pause) + mac |= RXFC; + if (phydev->pause != phydev->asym_pause) + mac |= TXFC; /* setup link speed */ mac &= ~SPEED_MASK; @@ -1003,6 +1004,12 @@ int emac_mac_up(struct emac_adapter *adpt) writel((u32)~DIS_INT, adpt->base + EMAC_INT_STATUS); writel(adpt->irq.mask, adpt->base + EMAC_INT_MASK); + /* Enable pause frames. Without this feature, the EMAC has been shown + * to receive (and drop) frames with FCS errors at gigabit connections. + */ + adpt->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause; + adpt->phydev->advertising |= SUPPORTED_Pause | SUPPORTED_Asym_Pause; + adpt->phydev->irq = PHY_IGNORE_INTERRUPT; phy_start(adpt->phydev); diff --git a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c index 75c1b53..72fe343 100644 --- a/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c +++ b/drivers/net/ethernet/qualcomm/emac/emac-sgmii.c @@ -421,7 +421,7 @@ static const struct emac_reg_write sgmii_v2_laned[] = { /* CDR Settings */ {EMAC_SGMII_LN_UCDR_FO_GAIN_MODE0, UCDR_STEP_BY_TWO_MODE0 | UCDR_xO_GAIN_MODE(10)}, - {EMAC_SGMII_LN_UCDR_SO_GAIN_MODE0, UCDR_xO_GAIN_MODE(6)}, + {EMAC_SGMII_LN_UCDR_SO_GAIN_MODE0, UCDR_xO_GAIN_MODE(0)}, {EMAC_SGMII_LN_UCDR_SO_CONFIG, UCDR_ENABLE | UCDR_SO_SATURATION(12)}, /* TX/RX Settings */ diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers/net/ethernet/sfc/efx.c index 3cf3557..6b89e4a 100644 --- a/drivers/net/ethernet/sfc/efx.c +++ b/drivers/net/ethernet/sfc/efx.c @@ -485,6 +485,9 @@ efx_copy_channel(const struct efx_channel *old_channel) *channel = *old_channel; channel->napi_dev = NULL; + INIT_HLIST_NODE(&channel->napi_str.napi_hash_node); + channel->napi_str.napi_id = 0; + channel->napi_str.state = 0; memset(&channel->eventq, 0, sizeof(channel->eventq)); for (j = 0; j < EFX_TXQ_TYPES; j++) { diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c index 48e71fa..e2c94ec 100644 --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c @@ -880,6 +880,13 @@ static int stmmac_init_phy(struct net_device *dev) return -ENODEV; } + /* stmmac_adjust_link will change this to PHY_IGNORE_INTERRUPT to avoid + * subsequent PHY polling, make sure we force a link transition if + * we have a UP/DOWN/UP transition + */ + if (phydev->is_pseudo_fixed_link) + phydev->irq = PHY_POLL; + pr_debug("stmmac_init_phy: %s: attached to PHY (UID 0x%x)" " Link = %d\n", dev->name, phydev->phy_id, phydev->link); diff --git a/drivers/net/ethernet/ti/cpsw-phy-sel.c b/drivers/net/ethernet/ti/cpsw-phy-sel.c index 054a8dd..ba1e45f 100644 --- a/drivers/net/ethernet/ti/cpsw-phy-sel.c +++ b/drivers/net/ethernet/ti/cpsw-phy-sel.c @@ -176,9 +176,12 @@ void cpsw_phy_sel(struct device *dev, phy_interface_t phy_mode, int slave) } dev = bus_find_device(&platform_bus_type, NULL, node, match); + of_node_put(node); priv = dev_get_drvdata(dev); priv->cpsw_phy_sel(priv, phy_mode, slave); + + put_device(dev); } EXPORT_SYMBOL_GPL(cpsw_phy_sel); diff --git a/drivers/net/ethernet/ti/davinci_emac.c b/drivers/net/ethernet/ti/davinci_emac.c index 2fd94a5..84fbe57 100644 --- a/drivers/net/ethernet/ti/davinci_emac.c +++ b/drivers/net/ethernet/ti/davinci_emac.c @@ -1410,6 +1410,7 @@ static int emac_dev_open(struct net_device *ndev) int i = 0; struct emac_priv *priv = netdev_priv(ndev); struct phy_device *phydev = NULL; + struct device *phy = NULL; ret = pm_runtime_get_sync(&priv->pdev->dev); if (ret < 0) { @@ -1488,19 +1489,20 @@ static int emac_dev_open(struct net_device *ndev) /* use the first phy on the bus if pdata did not give us a phy id */ if (!phydev && !priv->phy_id) { - struct device *phy; - phy = bus_find_device(&mdio_bus_type, NULL, NULL, match_first_device); - if (phy) + if (phy) { priv->phy_id = dev_name(phy); + if (!priv->phy_id || !*priv->phy_id) + put_device(phy); + } } if (!phydev && priv->phy_id && *priv->phy_id) { phydev = phy_connect(ndev, priv->phy_id, &emac_adjust_link, PHY_INTERFACE_MODE_MII); - + put_device(phy); /* reference taken by bus_find_device */ if (IS_ERR(phydev)) { dev_err(emac_dev, "could not connect to phy %s\n", priv->phy_id); diff --git a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c index 446ea58..928c1dc 100644 --- a/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c +++ b/drivers/net/ethernet/toshiba/ps3_gelic_wireless.c @@ -1694,7 +1694,7 @@ struct gelic_wl_scan_info *gelic_wl_find_best_bss(struct gelic_wl_info *wl) pr_debug("%s: bssid matched\n", __func__); break; } else { - pr_debug("%s: bssid unmached\n", __func__); + pr_debug("%s: bssid unmatched\n", __func__); continue; } } diff --git a/drivers/net/ethernet/xscale/ixp4xx_eth.c b/drivers/net/ethernet/xscale/ixp4xx_eth.c index 7f127dc..fa32391 100644 --- a/drivers/net/ethernet/xscale/ixp4xx_eth.c +++ b/drivers/net/ethernet/xscale/ixp4xx_eth.c @@ -708,8 +708,7 @@ static int eth_poll(struct napi_struct *napi, int budget) if (!qmgr_stat_below_low_watermark(rxq) && napi_reschedule(napi)) { /* not empty again */ #if DEBUG_RX - printk(KERN_DEBUG "%s: eth_poll" - " napi_reschedule successed\n", + printk(KERN_DEBUG "%s: eth_poll napi_reschedule succeeded\n", dev->name); #endif qmgr_disable_irq(rxq); diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c index 3234fcd..d2d6f12 100644 --- a/drivers/net/macvlan.c +++ b/drivers/net/macvlan.c @@ -1278,6 +1278,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, struct net_device *lowerdev; int err; int macmode; + bool create = false; if (!tb[IFLA_LINK]) return -EINVAL; @@ -1304,12 +1305,18 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, err = macvlan_port_create(lowerdev); if (err < 0) return err; + create = true; } port = macvlan_port_get_rtnl(lowerdev); /* Only 1 macvlan device can be created in passthru mode */ - if (port->passthru) - return -EINVAL; + if (port->passthru) { + /* The macvlan port must be not created this time, + * still goto destroy_macvlan_port for readability. + */ + err = -EINVAL; + goto destroy_macvlan_port; + } vlan->lowerdev = lowerdev; vlan->dev = dev; @@ -1325,24 +1332,28 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, vlan->flags = nla_get_u16(data[IFLA_MACVLAN_FLAGS]); if (vlan->mode == MACVLAN_MODE_PASSTHRU) { - if (port->count) - return -EINVAL; + if (port->count) { + err = -EINVAL; + goto destroy_macvlan_port; + } port->passthru = true; eth_hw_addr_inherit(dev, lowerdev); } if (data && data[IFLA_MACVLAN_MACADDR_MODE]) { - if (vlan->mode != MACVLAN_MODE_SOURCE) - return -EINVAL; + if (vlan->mode != MACVLAN_MODE_SOURCE) { + err = -EINVAL; + goto destroy_macvlan_port; + } macmode = nla_get_u32(data[IFLA_MACVLAN_MACADDR_MODE]); err = macvlan_changelink_sources(vlan, macmode, data); if (err) - return err; + goto destroy_macvlan_port; } err = register_netdevice(dev); if (err < 0) - return err; + goto destroy_macvlan_port; dev->priv_flags |= IFF_MACVLAN; err = netdev_upper_dev_link(lowerdev, dev); @@ -1357,7 +1368,9 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev, unregister_netdev: unregister_netdevice(dev); - +destroy_macvlan_port: + if (create) + macvlan_port_destroy(port->dev); return err; } EXPORT_SYMBOL_GPL(macvlan_common_newlink); diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c index e977ba9..1a4bf8a 100644 --- a/drivers/net/phy/phy_device.c +++ b/drivers/net/phy/phy_device.c @@ -723,6 +723,7 @@ struct phy_device *phy_connect(struct net_device *dev, const char *bus_id, phydev = to_phy_device(d); rc = phy_connect_direct(dev, phydev, handler, interface); + put_device(d); if (rc) return ERR_PTR(rc); @@ -953,6 +954,7 @@ struct phy_device *phy_attach(struct net_device *dev, const char *bus_id, phydev = to_phy_device(d); rc = phy_attach_direct(dev, phydev, phydev->dev_flags, interface); + put_device(d); if (rc) return ERR_PTR(rc); diff --git a/drivers/net/usb/ax88179_178a.c b/drivers/net/usb/ax88179_178a.c index e6338c1..8a6675d 100644 --- a/drivers/net/usb/ax88179_178a.c +++ b/drivers/net/usb/ax88179_178a.c @@ -1656,6 +1656,19 @@ static const struct driver_info ax88178a_info = { .tx_fixup = ax88179_tx_fixup, }; +static const struct driver_info cypress_GX3_info = { + .description = "Cypress GX3 SuperSpeed to Gigabit Ethernet Controller", + .bind = ax88179_bind, + .unbind = ax88179_unbind, + .status = ax88179_status, + .link_reset = ax88179_link_reset, + .reset = ax88179_reset, + .stop = ax88179_stop, + .flags = FLAG_ETHER | FLAG_FRAMING_AX, + .rx_fixup = ax88179_rx_fixup, + .tx_fixup = ax88179_tx_fixup, +}; + static const struct driver_info dlink_dub1312_info = { .description = "D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter", .bind = ax88179_bind, @@ -1718,6 +1731,10 @@ static const struct usb_device_id products[] = { USB_DEVICE(0x0b95, 0x178a), .driver_info = (unsigned long)&ax88178a_info, }, { + /* Cypress GX3 SuperSpeed to Gigabit Ethernet Bridge Controller */ + USB_DEVICE(0x04b4, 0x3610), + .driver_info = (unsigned long)&cypress_GX3_info, +}, { /* D-Link DUB-1312 USB 3.0 to Gigabit Ethernet Adapter */ USB_DEVICE(0x2001, 0x4a00), .driver_info = (unsigned long)&dlink_dub1312_info, diff --git a/drivers/net/usb/r8152.c b/drivers/net/usb/r8152.c index 44d439f..efb84f0 100644 --- a/drivers/net/usb/r8152.c +++ b/drivers/net/usb/r8152.c @@ -1730,7 +1730,7 @@ static u8 r8152_rx_csum(struct r8152 *tp, struct rx_desc *rx_desc) u8 checksum = CHECKSUM_NONE; u32 opts2, opts3; - if (tp->version == RTL_VER_01) + if (tp->version == RTL_VER_01 || tp->version == RTL_VER_02) goto return_result; opts2 = le32_to_cpu(rx_desc->opts2); @@ -1745,7 +1745,7 @@ static u8 r8152_rx_csum(struct r8152 *tp, struct rx_desc *rx_desc) checksum = CHECKSUM_NONE; else checksum = CHECKSUM_UNNECESSARY; - } else if (RD_IPV6_CS) { + } else if (opts2 & RD_IPV6_CS) { if ((opts2 & RD_UDP_CS) && !(opts3 & UDPF)) checksum = CHECKSUM_UNNECESSARY; else if ((opts2 & RD_TCP_CS) && !(opts3 & TCPF)) @@ -3266,10 +3266,8 @@ static int rtl8152_open(struct net_device *netdev) goto out; res = usb_autopm_get_interface(tp->intf); - if (res < 0) { - free_all_mem(tp); - goto out; - } + if (res < 0) + goto out_free; mutex_lock(&tp->control); @@ -3285,10 +3283,9 @@ static int rtl8152_open(struct net_device *netdev) netif_device_detach(tp->netdev); netif_warn(tp, ifup, netdev, "intr_urb submit failed: %d\n", res); - free_all_mem(tp); - } else { - napi_enable(&tp->napi); + goto out_unlock; } + napi_enable(&tp->napi); mutex_unlock(&tp->control); @@ -3297,7 +3294,13 @@ static int rtl8152_open(struct net_device *netdev) tp->pm_notifier.notifier_call = rtl_notifier; register_pm_notifier(&tp->pm_notifier); #endif + return 0; +out_unlock: + mutex_unlock(&tp->control); + usb_autopm_put_interface(tp->intf); +out_free: + free_all_mem(tp); out: return res; } diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index fad84f3..fd8b1e6 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -2038,23 +2038,33 @@ static struct virtio_device_id id_table[] = { { 0 }, }; +#define VIRTNET_FEATURES \ + VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, \ + VIRTIO_NET_F_MAC, \ + VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, \ + VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, \ + VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, \ + VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, \ + VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, \ + VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, \ + VIRTIO_NET_F_CTRL_MAC_ADDR, \ + VIRTIO_NET_F_MTU + static unsigned int features[] = { - VIRTIO_NET_F_CSUM, VIRTIO_NET_F_GUEST_CSUM, - VIRTIO_NET_F_GSO, VIRTIO_NET_F_MAC, - VIRTIO_NET_F_HOST_TSO4, VIRTIO_NET_F_HOST_UFO, VIRTIO_NET_F_HOST_TSO6, - VIRTIO_NET_F_HOST_ECN, VIRTIO_NET_F_GUEST_TSO4, VIRTIO_NET_F_GUEST_TSO6, - VIRTIO_NET_F_GUEST_ECN, VIRTIO_NET_F_GUEST_UFO, - VIRTIO_NET_F_MRG_RXBUF, VIRTIO_NET_F_STATUS, VIRTIO_NET_F_CTRL_VQ, - VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_VLAN, - VIRTIO_NET_F_GUEST_ANNOUNCE, VIRTIO_NET_F_MQ, - VIRTIO_NET_F_CTRL_MAC_ADDR, + VIRTNET_FEATURES, +}; + +static unsigned int features_legacy[] = { + VIRTNET_FEATURES, + VIRTIO_NET_F_GSO, VIRTIO_F_ANY_LAYOUT, - VIRTIO_NET_F_MTU, }; static struct virtio_driver virtio_net_driver = { .feature_table = features, .feature_table_size = ARRAY_SIZE(features), + .feature_table_legacy = features_legacy, + .feature_table_size_legacy = ARRAY_SIZE(features_legacy), .driver.name = KBUILD_MODNAME, .driver.owner = THIS_MODULE, .id_table = id_table, diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index f3c2fa3..24532cd 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -944,7 +944,9 @@ static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev) { struct vxlan_dev *vxlan; struct vxlan_sock *sock4; - struct vxlan_sock *sock6 = NULL; +#if IS_ENABLED(CONFIG_IPV6) + struct vxlan_sock *sock6; +#endif unsigned short family = dev->default_dst.remote_ip.sa.sa_family; sock4 = rtnl_dereference(dev->vn4_sock); diff --git a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c index b777e1b..78d9966 100644 --- a/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/broadcom/brcm80211/brcmfmac/cfg80211.c @@ -4516,7 +4516,7 @@ brcmf_cfg80211_start_ap(struct wiphy *wiphy, struct net_device *ndev, /* store current 11d setting */ if (brcmf_fil_cmd_int_get(ifp, BRCMF_C_GET_REGULATORY, &ifp->vif->is_11d)) { - supports_11d = false; + is_11d = supports_11d = false; } else { country_ie = brcmf_parse_tlvs((u8 *)settings->beacon.tail, settings->beacon.tail_len, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c index 4fdc3da..b88e204 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/d3.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/d3.c @@ -1087,6 +1087,15 @@ iwl_mvm_netdetect_config(struct iwl_mvm *mvm, ret = iwl_mvm_switch_to_d3(mvm); if (ret) return ret; + } else { + /* In theory, we wouldn't have to stop a running sched + * scan in order to start another one (for + * net-detect). But in practice this doesn't seem to + * work properly, so stop any running sched_scan now. + */ + ret = iwl_mvm_scan_stop(mvm, IWL_MVM_SCAN_SCHED, true); + if (ret) + return ret; } /* rfkill release can be either for wowlan or netdetect */ @@ -1254,7 +1263,10 @@ static int __iwl_mvm_suspend(struct ieee80211_hw *hw, out: if (ret < 0) { iwl_mvm_ref(mvm, IWL_MVM_REF_UCODE_DOWN); - ieee80211_restart_hw(mvm->hw); + if (mvm->restart_fw > 0) { + mvm->restart_fw--; + ieee80211_restart_hw(mvm->hw); + } iwl_mvm_free_nd(mvm); } out_noreset: @@ -2088,6 +2100,16 @@ static int __iwl_mvm_resume(struct iwl_mvm *mvm, bool test) iwl_mvm_update_changed_regdom(mvm); if (mvm->net_detect) { + /* If this is a non-unified image, we restart the FW, + * so no need to stop the netdetect scan. If that + * fails, continue and try to get the wake-up reasons, + * but trigger a HW restart by keeping a failure code + * in ret. + */ + if (unified_image) + ret = iwl_mvm_scan_stop(mvm, IWL_MVM_SCAN_NETDETECT, + false); + iwl_mvm_query_netdetect_reasons(mvm, vif); /* has unlocked the mutex, so skip that */ goto out; @@ -2271,7 +2293,8 @@ static void iwl_mvm_d3_test_disconn_work_iter(void *_data, u8 *mac, static int iwl_mvm_d3_test_release(struct inode *inode, struct file *file) { struct iwl_mvm *mvm = inode->i_private; - int remaining_time = 10; + bool unified_image = fw_has_capa(&mvm->fw->ucode_capa, + IWL_UCODE_TLV_CAPA_CNSLDTD_D3_D0_IMG); mvm->d3_test_active = false; @@ -2282,17 +2305,21 @@ static int iwl_mvm_d3_test_release(struct inode *inode, struct file *file) mvm->trans->system_pm_mode = IWL_PLAT_PM_MODE_DISABLED; iwl_abort_notification_waits(&mvm->notif_wait); - ieee80211_restart_hw(mvm->hw); + if (!unified_image) { + int remaining_time = 10; - /* wait for restart and disconnect all interfaces */ - while (test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status) && - remaining_time > 0) { - remaining_time--; - msleep(1000); - } + ieee80211_restart_hw(mvm->hw); + + /* wait for restart and disconnect all interfaces */ + while (test_bit(IWL_MVM_STATUS_IN_HW_RESTART, &mvm->status) && + remaining_time > 0) { + remaining_time--; + msleep(1000); + } - if (remaining_time == 0) - IWL_ERR(mvm, "Timed out waiting for HW restart to finish!\n"); + if (remaining_time == 0) + IWL_ERR(mvm, "Timed out waiting for HW restart!\n"); + } ieee80211_iterate_active_interfaces_atomic( mvm->hw, IEEE80211_IFACE_ITER_NORMAL, diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c index 07da4ef..7b7d2a1 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/debugfs.c @@ -1529,8 +1529,8 @@ static ssize_t iwl_dbgfs_mem_read(struct file *file, char __user *user_buf, .data = { &cmd, }, .len = { sizeof(cmd) }, }; - size_t delta, len; - ssize_t ret; + size_t delta; + ssize_t ret, len; hcmd.id = iwl_cmd_id(*ppos >> 24 ? UMAC_RD_WR : LMAC_RD_WR, DEBUG_GROUP, 0); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c index 318efd8..1db1dc1 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c @@ -4121,7 +4121,6 @@ void iwl_mvm_sync_rx_queues_internal(struct iwl_mvm *mvm, struct iwl_mvm_internal_rxq_notif *notif, u32 size) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(notif_waitq); u32 qmask = BIT(mvm->trans->num_rx_queues) - 1; int ret; @@ -4143,7 +4142,7 @@ void iwl_mvm_sync_rx_queues_internal(struct iwl_mvm *mvm, } if (notif->sync) - ret = wait_event_timeout(notif_waitq, + ret = wait_event_timeout(mvm->rx_sync_waitq, atomic_read(&mvm->queue_sync_counter) == 0, HZ); WARN_ON_ONCE(!ret); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h index d17cbf6..c60703e 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h +++ b/drivers/net/wireless/intel/iwlwifi/mvm/mvm.h @@ -937,6 +937,7 @@ struct iwl_mvm { /* sync d0i3_tx queue and IWL_MVM_STATUS_IN_D0I3 status flag */ spinlock_t d0i3_tx_lock; wait_queue_head_t d0i3_exit_waitq; + wait_queue_head_t rx_sync_waitq; /* BT-Coex */ struct iwl_bt_coex_profile_notif last_bt_notif; diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c index 05fe6dd..4d35deb 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/ops.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/ops.c @@ -619,6 +619,7 @@ iwl_op_mode_mvm_start(struct iwl_trans *trans, const struct iwl_cfg *cfg, spin_lock_init(&mvm->refs_lock); skb_queue_head_init(&mvm->d0i3_tx); init_waitqueue_head(&mvm->d0i3_exit_waitq); + init_waitqueue_head(&mvm->rx_sync_waitq); atomic_set(&mvm->queue_sync_counter, 0); diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c index a57c6ef..6c802ce 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c @@ -547,7 +547,8 @@ void iwl_mvm_rx_queue_notif(struct iwl_mvm *mvm, struct iwl_rx_cmd_buffer *rxb, "Received expired RX queue sync message\n"); return; } - atomic_dec(&mvm->queue_sync_counter); + if (!atomic_dec_return(&mvm->queue_sync_counter)) + wake_up(&mvm->rx_sync_waitq); } switch (internal_notif->type) { diff --git a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c index f279fdd..fa97432 100644 --- a/drivers/net/wireless/intel/iwlwifi/mvm/scan.c +++ b/drivers/net/wireless/intel/iwlwifi/mvm/scan.c @@ -1199,6 +1199,9 @@ static int iwl_mvm_num_scans(struct iwl_mvm *mvm) static int iwl_mvm_check_running_scans(struct iwl_mvm *mvm, int type) { + bool unified_image = fw_has_capa(&mvm->fw->ucode_capa, + IWL_UCODE_TLV_CAPA_CNSLDTD_D3_D0_IMG); + /* This looks a bit arbitrary, but the idea is that if we run * out of possible simultaneous scans and the userspace is * trying to run a scan type that is already running, we @@ -1225,12 +1228,30 @@ static int iwl_mvm_check_running_scans(struct iwl_mvm *mvm, int type) return -EBUSY; return iwl_mvm_scan_stop(mvm, IWL_MVM_SCAN_REGULAR, true); case IWL_MVM_SCAN_NETDETECT: - /* No need to stop anything for net-detect since the - * firmware is restarted anyway. This way, any sched - * scans that were running will be restarted when we - * resume. - */ - return 0; + /* For non-unified images, there's no need to stop + * anything for net-detect since the firmware is + * restarted anyway. This way, any sched scans that + * were running will be restarted when we resume. + */ + if (!unified_image) + return 0; + + /* If this is a unified image and we ran out of scans, + * we need to stop something. Prefer stopping regular + * scans, because the results are useless at this + * point, and we should be able to keep running + * another scheduled scan while suspended. + */ + if (mvm->scan_status & IWL_MVM_SCAN_REGULAR_MASK) + return iwl_mvm_scan_stop(mvm, IWL_MVM_SCAN_REGULAR, + true); + if (mvm->scan_status & IWL_MVM_SCAN_SCHED_MASK) + return iwl_mvm_scan_stop(mvm, IWL_MVM_SCAN_SCHED, + true); + + /* fall through, something is wrong if no scan was + * running but we ran out of scans. + */ default: WARN_ON(1); break; diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c index 001be40..2f8134b 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c @@ -541,48 +541,64 @@ static const struct pci_device_id iwl_hw_card_ids[] = { MODULE_DEVICE_TABLE(pci, iwl_hw_card_ids); #ifdef CONFIG_ACPI -#define SPL_METHOD "SPLC" -#define SPL_DOMAINTYPE_MODULE BIT(0) -#define SPL_DOMAINTYPE_WIFI BIT(1) -#define SPL_DOMAINTYPE_WIGIG BIT(2) -#define SPL_DOMAINTYPE_RFEM BIT(3) +#define ACPI_SPLC_METHOD "SPLC" +#define ACPI_SPLC_DOMAIN_WIFI (0x07) -static u64 splx_get_pwr_limit(struct iwl_trans *trans, union acpi_object *splx) +static u64 splc_get_pwr_limit(struct iwl_trans *trans, union acpi_object *splc) { - union acpi_object *limits, *domain_type, *power_limit; - - if (splx->type != ACPI_TYPE_PACKAGE || - splx->package.count != 2 || - splx->package.elements[0].type != ACPI_TYPE_INTEGER || - splx->package.elements[0].integer.value != 0) { - IWL_ERR(trans, "Unsupported splx structure\n"); + union acpi_object *data_pkg, *dflt_pwr_limit; + int i; + + /* We need at least two elements, one for the revision and one + * for the data itself. Also check that the revision is + * supported (currently only revision 0). + */ + if (splc->type != ACPI_TYPE_PACKAGE || + splc->package.count < 2 || + splc->package.elements[0].type != ACPI_TYPE_INTEGER || + splc->package.elements[0].integer.value != 0) { + IWL_DEBUG_INFO(trans, + "Unsupported structure returned by the SPLC method. Ignoring.\n"); return 0; } - limits = &splx->package.elements[1]; - if (limits->type != ACPI_TYPE_PACKAGE || - limits->package.count < 2 || - limits->package.elements[0].type != ACPI_TYPE_INTEGER || - limits->package.elements[1].type != ACPI_TYPE_INTEGER) { - IWL_ERR(trans, "Invalid limits element\n"); - return 0; + /* loop through all the packages to find the one for WiFi */ + for (i = 1; i < splc->package.count; i++) { + union acpi_object *domain; + + data_pkg = &splc->package.elements[i]; + + /* Skip anything that is not a package with the right + * amount of elements (i.e. at least 2 integers). + */ + if (data_pkg->type != ACPI_TYPE_PACKAGE || + data_pkg->package.count < 2 || + data_pkg->package.elements[0].type != ACPI_TYPE_INTEGER || + data_pkg->package.elements[1].type != ACPI_TYPE_INTEGER) + continue; + + domain = &data_pkg->package.elements[0]; + if (domain->integer.value == ACPI_SPLC_DOMAIN_WIFI) + break; + + data_pkg = NULL; } - domain_type = &limits->package.elements[0]; - power_limit = &limits->package.elements[1]; - if (!(domain_type->integer.value & SPL_DOMAINTYPE_WIFI)) { - IWL_DEBUG_INFO(trans, "WiFi power is not limited\n"); + if (!data_pkg) { + IWL_DEBUG_INFO(trans, + "No element for the WiFi domain returned by the SPLC method.\n"); return 0; } - return power_limit->integer.value; + dflt_pwr_limit = &data_pkg->package.elements[1]; + return dflt_pwr_limit->integer.value; } static void set_dflt_pwr_limit(struct iwl_trans *trans, struct pci_dev *pdev) { acpi_handle pxsx_handle; acpi_handle handle; - struct acpi_buffer splx = {ACPI_ALLOCATE_BUFFER, NULL}; + struct acpi_buffer splc = {ACPI_ALLOCATE_BUFFER, NULL}; acpi_status status; pxsx_handle = ACPI_HANDLE(&pdev->dev); @@ -593,23 +609,24 @@ static void set_dflt_pwr_limit(struct iwl_trans *trans, struct pci_dev *pdev) } /* Get the method's handle */ - status = acpi_get_handle(pxsx_handle, (acpi_string)SPL_METHOD, &handle); + status = acpi_get_handle(pxsx_handle, (acpi_string)ACPI_SPLC_METHOD, + &handle); if (ACPI_FAILURE(status)) { - IWL_DEBUG_INFO(trans, "SPL method not found\n"); + IWL_DEBUG_INFO(trans, "SPLC method not found\n"); return; } /* Call SPLC with no arguments */ - status = acpi_evaluate_object(handle, NULL, NULL, &splx); + status = acpi_evaluate_object(handle, NULL, NULL, &splc); if (ACPI_FAILURE(status)) { IWL_ERR(trans, "SPLC invocation failed (0x%x)\n", status); return; } - trans->dflt_pwr_limit = splx_get_pwr_limit(trans, splx.pointer); + trans->dflt_pwr_limit = splc_get_pwr_limit(trans, splc.pointer); IWL_DEBUG_INFO(trans, "Default power limit set to %lld\n", trans->dflt_pwr_limit); - kfree(splx.pointer); + kfree(splc.pointer); } #else /* CONFIG_ACPI */ diff --git a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c index e9a278b..5f840f1 100644 --- a/drivers/net/wireless/intel/iwlwifi/pcie/tx.c +++ b/drivers/net/wireless/intel/iwlwifi/pcie/tx.c @@ -592,6 +592,7 @@ error: static int iwl_pcie_txq_init(struct iwl_trans *trans, struct iwl_txq *txq, int slots_num, u32 txq_id) { + struct iwl_trans_pcie *trans_pcie = IWL_TRANS_GET_PCIE_TRANS(trans); int ret; txq->need_update = false; @@ -606,6 +607,13 @@ static int iwl_pcie_txq_init(struct iwl_trans *trans, struct iwl_txq *txq, return ret; spin_lock_init(&txq->lock); + + if (txq_id == trans_pcie->cmd_queue) { + static struct lock_class_key iwl_pcie_cmd_queue_lock_class; + + lockdep_set_class(&txq->lock, &iwl_pcie_cmd_queue_lock_class); + } + __skb_queue_head_init(&txq->overflow_q); /* diff --git a/drivers/net/xen-netfront.c b/drivers/net/xen-netfront.c index e17879d..bf2744e 100644 --- a/drivers/net/xen-netfront.c +++ b/drivers/net/xen-netfront.c @@ -304,7 +304,7 @@ static void xennet_alloc_rx_buffers(struct netfront_queue *queue) queue->rx_skbs[id] = skb; ref = gnttab_claim_grant_reference(&queue->gref_rx_head); - BUG_ON((signed short)ref < 0); + WARN_ON_ONCE(IS_ERR_VALUE((unsigned long)(int)ref)); queue->grant_rx_ref[id] = ref; page = skb_frag_page(&skb_shinfo(skb)->frags[0]); @@ -428,7 +428,7 @@ static void xennet_tx_setup_grant(unsigned long gfn, unsigned int offset, id = get_id_from_freelist(&queue->tx_skb_freelist, queue->tx_skbs); tx = RING_GET_REQUEST(&queue->tx, queue->tx.req_prod_pvt++); ref = gnttab_claim_grant_reference(&queue->gref_tx_head); - BUG_ON((signed short)ref < 0); + WARN_ON_ONCE(IS_ERR_VALUE((unsigned long)(int)ref)); gnttab_grant_foreign_access_ref(ref, queue->info->xbdev->otherend_id, gfn, GNTMAP_readonly); diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index ca1ad9e..a064997 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -149,7 +149,7 @@ static inline bool inet6_exact_dif_match(struct net *net, struct sk_buff *skb) { #if defined(CONFIG_NET_L3_MASTER_DEV) if (!net->ipv4.sysctl_tcp_l3mdev_accept && - ipv6_l3mdev_skb(IP6CB(skb)->flags)) + skb && ipv6_l3mdev_skb(IP6CB(skb)->flags)) return true; #endif return false; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 91ee364..bf04a46 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -3354,6 +3354,21 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb); bool is_skb_forwardable(const struct net_device *dev, const struct sk_buff *skb); +static __always_inline int ____dev_forward_skb(struct net_device *dev, + struct sk_buff *skb) +{ + if (skb_orphan_frags(skb, GFP_ATOMIC) || + unlikely(!is_skb_forwardable(dev, skb))) { + atomic_long_inc(&dev->rx_dropped); + kfree_skb(skb); + return NET_RX_DROP; + } + + skb_scrub_packet(skb, true); + skb->priority = 0; + return 0; +} + void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev); extern int netdev_budget; diff --git a/include/net/ip.h b/include/net/ip.h index 5413883..d3a1078 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -47,8 +47,7 @@ struct inet_skb_parm { #define IPSKB_REROUTED BIT(4) #define IPSKB_DOREDIRECT BIT(5) #define IPSKB_FRAG_PMTU BIT(6) -#define IPSKB_FRAG_SEGS BIT(7) -#define IPSKB_L3SLAVE BIT(8) +#define IPSKB_L3SLAVE BIT(7) u16 frag_max_size; }; diff --git a/include/net/ip6_tunnel.h b/include/net/ip6_tunnel.h index 20ed969..1b1cf33 100644 --- a/include/net/ip6_tunnel.h +++ b/include/net/ip6_tunnel.h @@ -146,6 +146,7 @@ static inline void ip6tunnel_xmit(struct sock *sk, struct sk_buff *skb, { int pkt_len, err; + memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); pkt_len = skb->len - skb_inner_network_offset(skb); err = ip6_local_out(dev_net(skb_dst(skb)->dev), sk, skb); if (unlikely(net_xmit_eval(err))) diff --git a/include/net/netfilter/nf_conntrack_labels.h b/include/net/netfilter/nf_conntrack_labels.h index 4988146..1723a67 100644 --- a/include/net/netfilter/nf_conntrack_labels.h +++ b/include/net/netfilter/nf_conntrack_labels.h @@ -30,8 +30,7 @@ static inline struct nf_conn_labels *nf_ct_labels_ext_add(struct nf_conn *ct) if (net->ct.labels_used == 0) return NULL; - return nf_ct_ext_add_length(ct, NF_CT_EXT_LABELS, - sizeof(struct nf_conn_labels), GFP_ATOMIC); + return nf_ct_ext_add(ct, NF_CT_EXT_LABELS, GFP_ATOMIC); #else return NULL; #endif diff --git a/include/net/netfilter/nf_tables.h b/include/net/netfilter/nf_tables.h index 5031e07..d79d1e9 100644 --- a/include/net/netfilter/nf_tables.h +++ b/include/net/netfilter/nf_tables.h @@ -145,7 +145,7 @@ static inline enum nft_registers nft_type_to_reg(enum nft_data_types type) return type == NFT_DATA_VERDICT ? NFT_REG_VERDICT : NFT_REG_1 * NFT_REG_SIZE / NFT_REG32_SIZE; } -unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); +int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest); unsigned int nft_parse_register(const struct nlattr *attr); int nft_dump_register(struct sk_buff *skb, unsigned int attr, unsigned int reg); @@ -542,7 +542,8 @@ void *nft_set_elem_init(const struct nft_set *set, const struct nft_set_ext_tmpl *tmpl, const u32 *key, const u32 *data, u64 timeout, gfp_t gfp); -void nft_set_elem_destroy(const struct nft_set *set, void *elem); +void nft_set_elem_destroy(const struct nft_set *set, void *elem, + bool destroy_expr); /** * struct nft_set_gc_batch_head - nf_tables set garbage collection batch @@ -693,7 +694,6 @@ static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) { int err; - __module_get(src->ops->type->owner); if (src->ops->clone) { dst->ops = src->ops; err = src->ops->clone(dst, src); @@ -702,6 +702,8 @@ static inline int nft_expr_clone(struct nft_expr *dst, struct nft_expr *src) } else { memcpy(dst, src, src->ops->size); } + + __module_get(src->ops->type->owner); return 0; } diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h index 87a7f42..31acc3f 100644 --- a/include/net/sctp/sctp.h +++ b/include/net/sctp/sctp.h @@ -152,7 +152,7 @@ void sctp_unhash_endpoint(struct sctp_endpoint *); struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *, struct sctphdr *, struct sctp_association **, struct sctp_transport **); -void sctp_err_finish(struct sock *, struct sctp_association *); +void sctp_err_finish(struct sock *, struct sctp_transport *); void sctp_icmp_frag_needed(struct sock *, struct sctp_association *, struct sctp_transport *t, __u32 pmtu); void sctp_icmp_redirect(struct sock *, struct sctp_transport *, diff --git a/include/net/sock.h b/include/net/sock.h index 73c6b00..92b2697 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1596,11 +1596,11 @@ static inline void sock_put(struct sock *sk) void sock_gen_put(struct sock *sk); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested, - unsigned int trim_cap); + unsigned int trim_cap, bool refcounted); static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb, const int nested) { - return __sk_receive_skb(sk, skb, nested, 1); + return __sk_receive_skb(sk, skb, nested, 1, true); } static inline void sk_tx_queue_set(struct sock *sk, int tx_queue) diff --git a/include/net/tcp.h b/include/net/tcp.h index 5b82d4d..123979f 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -805,7 +805,7 @@ static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb) { #if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV) if (!net->ipv4.sysctl_tcp_l3mdev_accept && - ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) + skb && ipv4_l3mdev_skb(TCP_SKB_CB(skb)->header.h4.flags)) return true; #endif return false; @@ -1220,6 +1220,7 @@ static inline void tcp_prequeue_init(struct tcp_sock *tp) bool tcp_prequeue(struct sock *sk, struct sk_buff *skb); bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb); +int tcp_filter(struct sock *sk, struct sk_buff *skb); #undef STATE_TRACE diff --git a/include/uapi/linux/atm_zatm.h b/include/uapi/linux/atm_zatm.h index 5cd4d4d..9c9c6ad 100644 --- a/include/uapi/linux/atm_zatm.h +++ b/include/uapi/linux/atm_zatm.h @@ -14,7 +14,6 @@ #include <linux/atmapi.h> #include <linux/atmioc.h> -#include <linux/time.h> #define ZATM_GETPOOL _IOW('a',ATMIOC_SARPRV+1,struct atmif_sioc) /* get pool statistics */ diff --git a/include/uapi/linux/bpqether.h b/include/uapi/linux/bpqether.h index a6c35e1..05865ed 100644 --- a/include/uapi/linux/bpqether.h +++ b/include/uapi/linux/bpqether.h @@ -5,9 +5,7 @@ * Defines for the BPQETHER pseudo device driver */ -#ifndef __LINUX_IF_ETHER_H #include <linux/if_ether.h> -#endif #define SIOCSBPQETHOPT (SIOCDEVPRIVATE+0) /* reserved */ #define SIOCSBPQETHADDR (SIOCDEVPRIVATE+1) diff --git a/kernel/bpf/hashtab.c b/kernel/bpf/hashtab.c index 570eeca..ad1bc67 100644 --- a/kernel/bpf/hashtab.c +++ b/kernel/bpf/hashtab.c @@ -687,7 +687,8 @@ static void delete_all_elements(struct bpf_htab *htab) hlist_for_each_entry_safe(l, n, head, hash_node) { hlist_del_rcu(&l->hash_node); - htab_elem_free(htab, l); + if (l->state != HTAB_EXTRA_ELEM_USED) + htab_elem_free(htab, l); } } } diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c index 228f962..237f3d6 100644 --- a/kernel/bpf/syscall.c +++ b/kernel/bpf/syscall.c @@ -194,7 +194,7 @@ static int map_create(union bpf_attr *attr) err = bpf_map_charge_memlock(map); if (err) - goto free_map; + goto free_map_nouncharge; err = bpf_map_new_fd(map); if (err < 0) @@ -204,6 +204,8 @@ static int map_create(union bpf_attr *attr) return err; free_map: + bpf_map_uncharge_memlock(map); +free_map_nouncharge: map->ops->map_free(map); return err; } diff --git a/kernel/taskstats.c b/kernel/taskstats.c index b3f05ee..cbb387a 100644 --- a/kernel/taskstats.c +++ b/kernel/taskstats.c @@ -54,7 +54,11 @@ static const struct nla_policy taskstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1 [TASKSTATS_CMD_ATTR_REGISTER_CPUMASK] = { .type = NLA_STRING }, [TASKSTATS_CMD_ATTR_DEREGISTER_CPUMASK] = { .type = NLA_STRING },}; -static const struct nla_policy cgroupstats_cmd_get_policy[CGROUPSTATS_CMD_ATTR_MAX+1] = { +/* + * We have to use TASKSTATS_CMD_ATTR_MAX here, it is the maxattr in the family. + * Make sure they are always aligned. + */ +static const struct nla_policy cgroupstats_cmd_get_policy[TASKSTATS_CMD_ATTR_MAX+1] = { [CGROUPSTATS_CMD_ATTR_FD] = { .type = NLA_U32 }, }; diff --git a/net/can/bcm.c b/net/can/bcm.c index 8e999ff..8af9d25 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1549,24 +1549,31 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); + int ret = 0; if (len < sizeof(*addr)) return -EINVAL; - if (bo->bound) - return -EISCONN; + lock_sock(sk); + + if (bo->bound) { + ret = -EISCONN; + goto fail; + } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(&init_net, addr->can_ifindex); - if (!dev) - return -ENODEV; - + if (!dev) { + ret = -ENODEV; + goto fail; + } if (dev->type != ARPHRD_CAN) { dev_put(dev); - return -ENODEV; + ret = -ENODEV; + goto fail; } bo->ifindex = dev->ifindex; @@ -1577,17 +1584,24 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, bo->ifindex = 0; } - bo->bound = 1; - if (proc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_data(bo->procname, 0644, proc_dir, &bcm_proc_fops, sk); + if (!bo->bcm_proc_read) { + ret = -ENOMEM; + goto fail; + } } - return 0; + bo->bound = 1; + +fail: + release_sock(sk); + + return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/core/dev.c b/net/core/dev.c index 820bac2..6666b28 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1766,19 +1766,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } + int ret = ____dev_forward_skb(dev, skb); - skb_scrub_packet(skb, true); - skb->priority = 0; - skb->protocol = eth_type_trans(skb, dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - return 0; + return ret; } EXPORT_SYMBOL_GPL(__dev_forward_skb); @@ -2484,7 +2479,7 @@ int skb_checksum_help(struct sk_buff *skb) goto out; } - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: diff --git a/net/core/filter.c b/net/core/filter.c index 00351cd..b391209 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) return dev_forward_skb(dev, skb); } +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; @@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* skb->mac_len is not set on normal egress */ + unsigned int mlen = skb->network_header - skb->mac_header; + + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + return __bpf_redirect_no_mac(skb, dev, flags); + default: + return __bpf_redirect_common(skb, dev, flags); + } +} + BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; @@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) return -ENOMEM; } - bpf_push_mac_rcsum(clone); - - return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); + return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - bpf_push_mac_rcsum(skb); - - return ri->flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + return __bpf_redirect(skb, dev, ri->flags); } static const struct bpf_func_proto bpf_redirect_proto = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index ab193e5..69e4463 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -122,7 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_keyid *key_keyid; bool skip_vlan = false; u8 ip_proto = 0; - bool ret = false; + bool ret; if (!data) { data = skb->data; @@ -549,12 +549,17 @@ ip_proto_again: out_good: ret = true; -out_bad: + key_control->thoff = (u16)nhoff; +out: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; return ret; + +out_bad: + ret = false; + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); + goto out; } EXPORT_SYMBOL(__skb_flow_dissect); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fb7348f..db313ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -275,6 +275,7 @@ int rtnl_unregister(int protocol, int msgtype) rtnl_msg_handlers[protocol][msgindex].doit = NULL; rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + rtnl_msg_handlers[protocol][msgindex].calcit = NULL; return 0; } diff --git a/net/core/sock.c b/net/core/sock.c index c73e28f..5e3ca41 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -453,7 +453,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(sock_queue_rcv_skb); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, - const int nested, unsigned int trim_cap) + const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; @@ -487,7 +487,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, bh_unlock_sock(sk); out: - sock_put(sk); + if (refcounted) + sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); @@ -1543,6 +1544,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; + newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 345a3ae..b567c87 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -235,7 +235,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; const u8 offset = iph->ihl << 2; - const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + const struct dccp_hdr *dh; struct dccp_sock *dp; struct inet_sock *inet; const int type = icmp_hdr(skb)->type; @@ -245,11 +245,13 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - if (skb->len < offset + sizeof(*dh) || - skb->len < offset + __dccp_basic_hdr_len(dh)) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; - } + /* Only need dccph_dport & dccph_sport which are the first + * 4 bytes in dccp header. + * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us. + */ + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet_lookup_established(net, &dccp_hashinfo, iph->daddr, dh->dccph_dport, @@ -868,7 +870,7 @@ lookup: goto discard_and_relse; nf_reset(skb); - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4); + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 3828f94..715e5d1 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -70,7 +70,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; - const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + const struct dccp_hdr *dh; struct dccp_sock *dp; struct ipv6_pinfo *np; struct sock *sk; @@ -78,12 +78,13 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u64 seq; struct net *net = dev_net(skb->dev); - if (skb->len < offset + sizeof(*dh) || - skb->len < offset + __dccp_basic_hdr_len(dh)) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return; - } + /* Only need dccph_dport & dccph_sport which are the first + * 4 bytes in dccp header. + * Our caller (icmpv6_notify()) already pulled 8 bytes for us. + */ + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet6_lookup_established(net, &dccp_hashinfo, &hdr->daddr, dh->dccph_dport, @@ -738,7 +739,8 @@ lookup: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0; + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, + refcounted) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -956,6 +958,7 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 41e65804..9fe25bf 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1009,6 +1009,10 @@ void dccp_close(struct sock *sk, long timeout) __kfree_skb(skb); } + /* If socket has been already reset kill it. */ + if (sk->sk_state == DCCP_CLOSED) + goto adjudge_to_death; + if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9648c97..5ddf5cd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -533,9 +533,9 @@ EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ @@ -545,13 +545,12 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31cef36..4cff74d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2413,22 +2413,19 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, struct key_vector *l, **tp = &iter->tnode; t_key key; - /* use cache location of next-to-find key */ + /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { - pos -= iter->pos; key = iter->key; } else { - iter->pos = 0; + iter->pos = 1; key = 0; } - while ((l = leaf_walk_rcu(tp, key)) != NULL) { + pos -= iter->pos; + + while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; - - if (--pos <= 0) - break; - l = NULL; /* handle unlikely case of a key wrap */ @@ -2437,7 +2434,7 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, } if (l) - iter->key = key; /* remember it */ + iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2465,7 +2462,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) return fib_route_get_idx(iter, *pos); iter->pos = 0; - iter->key = 0; + iter->key = KEY_MAX; return SEQ_START_TOKEN; } @@ -2474,7 +2471,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; - t_key key = iter->key; + t_key key = iter->key + 1; ++*pos; @@ -2483,7 +2480,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) l = leaf_walk_rcu(&iter->tnode, key); if (l) { - iter->key = l->key + 1; + iter->key = l->key; iter->pos++; } else { iter->pos = 0; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 38abe70..48734ee 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -477,7 +477,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); + fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key_hash(net, fl4, @@ -502,7 +502,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_in->dev, + if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 8b4ffd2..9f0a7b9 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; + IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 03e7f73..105908d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -239,19 +239,23 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *segs; int ret = 0; - /* common case: fragmentation of segments is not allowed, - * or seglen is <= mtu + /* common case: seglen is <= mtu */ - if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) || - skb_gso_validate_mtu(skb, mtu)) + if (skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); - /* Slowpath - GSO segment length is exceeding the dst MTU. + /* Slowpath - GSO segment length exceeds the egress MTU. * - * This can happen in two cases: - * 1) TCP GRO packet, DF bit not set - * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly - * from host network stack. + * This can happen in several cases: + * - Forwarding of a TCP GRO skb, when DF flag is not set. + * - Forwarding of an skb that arrived on a virtualization interface + * (virtio-net/vhost/tap) with TSO/GSO size set by other network + * stack. + * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an + * interface with a smaller MTU. + * - Arriving GRO skb (or GSO skb in a virtualized environment) that is + * bridged to a NETIF_F_TSO tunnel stacked over an interface with an + * insufficent MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); @@ -1579,7 +1583,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - oif = oif ? : skb->skb_iif; + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 777bc18..fed3d29 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -63,7 +63,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; - int skb_iif = skb->skb_iif; struct iphdr *iph; int err; @@ -73,16 +72,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - if (skb_iif && !(df & htons(IP_DF))) { - /* Arrived from an ingress interface, got encapsulated, with - * fragmentation of encapulating frames allowed. - * If skb is gso, the resulting encapsulated network segments - * may exceed dst mtu. - * Allow IP Fragmentation of segments. - */ - IPCB(skb)->flags |= IPSKB_FRAG_SEGS; - } - /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 5f006e1..27089f5 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1749,7 +1749,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->dev->stats.tx_bytes += skb->len; } - IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; + IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index bf855e6..0c01a270 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -28,7 +28,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv4(pkt->net, pkt->skb, pkt->hook, &gw, oif); } @@ -59,7 +59,9 @@ static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 62d4d90..2a57566 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -753,7 +753,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + if (!n) + n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3251fe7..814af89 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1164,7 +1164,7 @@ restart: err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; + goto do_error; sg = !!(sk->sk_route_caps & NETIF_F_SG); @@ -1241,7 +1241,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 10d728b..ab37c67 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -56,6 +56,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 delayed_ack_reserved; + u32 loss_cwnd; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -96,6 +97,7 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->delayed_ack_reserved = 0; + ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); @@ -111,9 +113,10 @@ static void dctcp_init(struct sock *sk) static u32 dctcp_ssthresh(struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } @@ -308,12 +311,20 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, return 0; } +static u32 dctcp_cwnd_undo(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 61b7be3..2259114 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1564,6 +1564,21 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_add_backlog); +int tcp_filter(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = (struct tcphdr *)skb->data; + unsigned int eaten = skb->len; + int err; + + err = sk_filter_trim_cap(sk, skb, th->doff * 4); + if (!err) { + eaten -= skb->len; + TCP_SKB_CB(skb)->end_seq -= eaten; + } + return err; +} +EXPORT_SYMBOL(tcp_filter); + /* * From tcp_input.c */ @@ -1676,8 +1691,10 @@ process: nf_reset(skb); - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); skb->dev = NULL; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index bd59c34..7370ad2 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -448,7 +448,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; else - iif = l3mdev_master_ifindex(skb->dev); + iif = l3mdev_master_ifindex(skb_dst(skb)->dev); /* * Must not send error if the source does not uniquely diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 6001e78..59eb4ed 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1366,7 +1366,7 @@ emsgsize: if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index a752052..b283f29 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -88,9 +88,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, uh->len = htons(skb->len); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED - | IPSKB_REROUTED); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 8bfd470..831f86e 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -26,7 +26,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr, { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); struct in6_addr *gw = (struct in6_addr *)®s->data[priv->sreg_addr]; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv6(pkt->net, pkt->skb, pkt->hook, gw, oif); } @@ -57,7 +57,9 @@ static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 947ed1d..1b57e11 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1364,6 +1364,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_LOCAL) return; + if (dst_metric_locked(dst, RTAX_MTU)) + return; + dst_confirm(dst); mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) @@ -2758,6 +2761,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. */ if (rt->dst.dev == arg->dev && + dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { if (rt->rt6i_flags & RTF_CACHE) { /* For RTF_CACHE with rt6i_pmtu == 0 diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5a27ab4..b9f1fee 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -818,8 +818,12 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else - fl6.flowi6_oif = oif ? : skb->skb_iif; + else { + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; + + fl6.flowi6_oif = oif; + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; @@ -1225,7 +1229,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard; /* @@ -1453,8 +1457,10 @@ process: if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + hdr = ipv6_hdr(skb); skb->dev = NULL; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index c3c809b..a6e44ef 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2845,7 +2845,7 @@ static struct genl_family ip_vs_genl_family = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, + .maxattr = IPVS_CMD_ATTR_MAX, .netnsok = true, /* Make ipvsadm to work on netns */ }; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 1b07578..9350530 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -283,6 +283,7 @@ struct ip_vs_sync_buff { */ static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) { + memset(ho, 0, sizeof(*ho)); ho->init_seq = get_unaligned_be32(&no->init_seq); ho->delta = get_unaligned_be32(&no->delta); ho->previous_delta = get_unaligned_be32(&no->previous_delta); @@ -917,8 +918,10 @@ static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *pa kfree(param->pe_data); } - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); + if (opt) { + cp->in_seq = opt->in_seq; + cp->out_seq = opt->out_seq; + } atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); cp->state = state; cp->old_state = cp->state; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index df2f5a3..0f87e5d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -76,6 +76,7 @@ struct conntrack_gc_work { struct delayed_work dwork; u32 last_bucket; bool exiting; + long next_gc_run; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; @@ -83,9 +84,11 @@ static __read_mostly spinlock_t nf_conntrack_locks_all_lock; static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ #define GC_MAX_BUCKETS_DIV 64u -#define GC_MAX_BUCKETS 8192u -#define GC_INTERVAL (5 * HZ) +/* upper bound of scan intervals */ +#define GC_INTERVAL_MAX (2 * HZ) +/* maximum conntracks to evict per gc run */ #define GC_MAX_EVICTS 256u static struct conntrack_gc_work conntrack_gc_work; @@ -936,13 +939,13 @@ static noinline int early_drop(struct net *net, unsigned int _hash) static void gc_worker(struct work_struct *work) { unsigned int i, goal, buckets = 0, expired_count = 0; - unsigned long next_run = GC_INTERVAL; - unsigned int ratio, scanned = 0; struct conntrack_gc_work *gc_work; + unsigned int ratio, scanned = 0; + unsigned long next_run; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); - goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS); + goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; i = gc_work->last_bucket; do { @@ -982,17 +985,47 @@ static void gc_worker(struct work_struct *work) if (gc_work->exiting) return; + /* + * Eviction will normally happen from the packet path, and not + * from this gc worker. + * + * This worker is only here to reap expired entries when system went + * idle after a busy period. + * + * The heuristics below are supposed to balance conflicting goals: + * + * 1. Minimize time until we notice a stale entry + * 2. Maximize scan intervals to not waste cycles + * + * Normally, expired_count will be 0, this increases the next_run time + * to priorize 2) above. + * + * As soon as a timed-out entry is found, move towards 1) and increase + * the scan frequency. + * In case we have lots of evictions next scan is done immediately. + */ ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio >= 90 || expired_count == GC_MAX_EVICTS) + if (ratio >= 90 || expired_count == GC_MAX_EVICTS) { + gc_work->next_gc_run = 0; next_run = 0; + } else if (expired_count) { + gc_work->next_gc_run /= 2U; + next_run = msecs_to_jiffies(1); + } else { + if (gc_work->next_gc_run < GC_INTERVAL_MAX) + gc_work->next_gc_run += msecs_to_jiffies(1); + + next_run = gc_work->next_gc_run; + } gc_work->last_bucket = i; - schedule_delayed_work(&gc_work->dwork, next_run); + queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); + gc_work->next_gc_run = GC_INTERVAL_MAX; gc_work->exiting = false; } @@ -1885,7 +1918,7 @@ int nf_conntrack_init_start(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); conntrack_gc_work_init(&conntrack_gc_work); - schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL); + queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX); return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 336e215..7341adf 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -138,9 +138,14 @@ __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { - if (!strcmp(h->name, name) && - h->tuple.src.l3num == l3num && - h->tuple.dst.protonum == protonum) + if (strcmp(h->name, name)) + continue; + + if (h->tuple.src.l3num != NFPROTO_UNSPEC && + h->tuple.src.l3num != l3num) + continue; + + if (h->tuple.dst.protonum == protonum) return h; } } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 621b81c..c3fc14e 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1436,9 +1436,12 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, handler = &sip_handlers[i]; if (handler->request == NULL) continue; - if (*datalen < handler->len || + if (*datalen < handler->len + 2 || strncasecmp(*dptr, handler->method, handler->len)) continue; + if ((*dptr)[handler->len] != ' ' || + !isalpha((*dptr)[handler->len+1])) + continue; if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, &matchoff, &matchlen) <= 0) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 24db222..026581b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2956,12 +2956,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; +err3: + ops->destroy(set); err2: kfree(set); err1: @@ -3452,14 +3454,15 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } -void nft_set_elem_destroy(const struct nft_set *set, void *elem) +void nft_set_elem_destroy(const struct nft_set *set, void *elem, + bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_uninit(nft_set_ext_data(ext), set->dtype); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); kfree(elem); @@ -3565,6 +3568,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { + .net = ctx->net, .afi = ctx->afi, .table = ctx->table, .chain = (struct nft_chain *)binding->chain, @@ -3812,7 +3816,7 @@ void nft_set_gc_batch_release(struct rcu_head *rcu) gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i]); + nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); kfree(gcb); } EXPORT_SYMBOL_GPL(nft_set_gc_batch_release); @@ -4030,7 +4034,7 @@ static void nf_tables_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv); + nft_trans_elem(trans).priv, true); break; } kfree(trans); @@ -4171,7 +4175,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv); + nft_trans_elem(trans).priv, true); break; } kfree(trans); @@ -4421,7 +4425,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, * Otherwise a 0 is returned and the attribute value is stored in the * destination variable. */ -unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) +int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) { u32 val; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 517f087..31ca947 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -44,18 +44,22 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, ®s->data[priv->sreg_key], ®s->data[priv->sreg_data], timeout, GFP_ATOMIC); - if (elem == NULL) { - if (set->size) - atomic_dec(&set->nelems); - return NULL; - } + if (elem == NULL) + goto err1; ext = nft_set_elem_ext(set, elem); if (priv->expr != NULL && nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) - return NULL; + goto err2; return elem; + +err2: + nft_set_elem_destroy(set, elem, false); +err1: + if (set->size) + atomic_dec(&set->nelems); + return NULL; } static void nft_dynset_eval(const struct nft_expr *expr, @@ -139,6 +143,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return PTR_ERR(set); } + if (set->ops->update == NULL) + return -EOPNOTSUPP; + if (set->flags & NFT_SET_CONSTANT) return -EBUSY; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 3794cb2..a3dface 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -98,7 +98,7 @@ static bool nft_hash_update(struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; + struct nft_hash_elem *he, *prev; struct nft_hash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, @@ -112,15 +112,24 @@ static bool nft_hash_update(struct nft_set *set, const u32 *key, he = new(set, expr, regs); if (he == NULL) goto err1; - if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params)) + + prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params); + if (IS_ERR(prev)) goto err2; + + /* Another cpu may race to insert the element with the same key */ + if (prev) { + nft_set_elem_destroy(set, he, true); + he = prev; + } + out: *ext = &he->ext; return true; err2: - nft_set_elem_destroy(set, he); + nft_set_elem_destroy(set, he, true); err1: return false; } @@ -332,7 +341,7 @@ static int nft_hash_init(const struct nft_set *set, static void nft_hash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy((const struct nft_set *)arg, ptr); + nft_set_elem_destroy((const struct nft_set *)arg, ptr, true); } static void nft_hash_destroy(const struct nft_set *set) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 38b5bda..36493a7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -266,7 +266,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe); + nft_set_elem_destroy(set, rbe, true); } } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 69f78e9..b83e158 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -44,7 +44,7 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int32_t newmark; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) + if (ct == NULL || nf_ct_is_untracked(ct)) return XT_CONTINUE; switch (info->mode) { @@ -97,7 +97,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) + if (ct == NULL || nf_ct_is_untracked(ct)) return false; return ((ct->mark & info->mask) == info->mark) ^ info->invert; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index b2f0e98..a554624 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -178,11 +178,8 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } cb->args[1] = i; } else { - if (req->sdiag_protocol >= MAX_LINKS) { - read_unlock(&nl_table_lock); - rcu_read_unlock(); + if (req->sdiag_protocol >= MAX_LINKS) return -ENOENT; - } err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 23cc126..49c28e8 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -404,7 +404,7 @@ int __genl_register_family(struct genl_family *family) err = genl_validate_assign_mc_groups(family); if (err) - goto errout_locked; + goto errout_free; list_add_tail(&family->family_list, genl_family_chain(family->id)); genl_unlock_all(); @@ -417,6 +417,8 @@ int __genl_register_family(struct genl_family *family) return 0; +errout_free: + kfree(family->attrbuf); errout_locked: genl_unlock_all(); errout: diff --git a/net/sctp/input.c b/net/sctp/input.c index a2ea1d1..a01a56e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -181,9 +181,10 @@ int sctp_rcv(struct sk_buff *skb) * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB */ if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) { - if (asoc) { - sctp_association_put(asoc); + if (transport) { + sctp_transport_put(transport); asoc = NULL; + transport = NULL; } else { sctp_endpoint_put(ep); ep = NULL; @@ -269,8 +270,8 @@ int sctp_rcv(struct sk_buff *skb) bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ - if (asoc) - sctp_association_put(asoc); + if (transport) + sctp_transport_put(transport); else sctp_endpoint_put(ep); @@ -283,8 +284,8 @@ discard_it: discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ - if (asoc) - sctp_association_put(asoc); + if (transport) + sctp_transport_put(transport); else sctp_endpoint_put(ep); @@ -300,6 +301,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; + struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; @@ -351,7 +353,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) - sctp_association_put(sctp_assoc(rcvr)); + sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else @@ -363,6 +365,7 @@ done: static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; @@ -373,7 +376,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) - sctp_association_hold(sctp_assoc(rcvr)); + sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else @@ -537,15 +540,15 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, return sk; out: - sctp_association_put(asoc); + sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ -void sctp_err_finish(struct sock *sk, struct sctp_association *asoc) +void sctp_err_finish(struct sock *sk, struct sctp_transport *t) { bh_unlock_sock(sk); - sctp_association_put(asoc); + sctp_transport_put(t); } /* @@ -641,7 +644,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) } out_unlock: - sctp_err_finish(sk, asoc); + sctp_err_finish(sk, transport); } /* @@ -952,11 +955,8 @@ static struct sctp_association *__sctp_lookup_association( goto out; asoc = t->asoc; - sctp_association_hold(asoc); *pt = t; - sctp_transport_put(t); - out: return asoc; } @@ -986,7 +986,7 @@ int sctp_has_association(struct net *net, struct sctp_transport *transport; if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) { - sctp_association_put(asoc); + sctp_transport_put(transport); return 1; } @@ -1021,7 +1021,6 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; sctp_init_chunk_t *init; - struct sctp_transport *transport; struct sctp_af *af; /* @@ -1052,7 +1051,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, af->from_addr_param(paddr, params.addr, sh->source, 0); - asoc = __sctp_lookup_association(net, laddr, paddr, &transport); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp); if (asoc) return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index f473779..176af30 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -198,7 +198,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } out_unlock: - sctp_err_finish(sk, asoc); + sctp_err_finish(sk, transport); out: if (likely(idev != NULL)) in6_dev_put(idev); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9fbb6fe..f23ad91 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1214,9 +1214,12 @@ static int __sctp_connect(struct sock *sk, timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK); - err = sctp_wait_for_connect(asoc, &timeo); - if ((err == 0 || err == -EINPROGRESS) && assoc_id) + if (assoc_id) *assoc_id = asoc->assoc_id; + err = sctp_wait_for_connect(asoc, &timeo); + /* Note: the asoc may be freed after the return of + * sctp_wait_for_connect. + */ /* Don't free association on exit. */ asoc = NULL; @@ -4282,19 +4285,18 @@ static void sctp_shutdown(struct sock *sk, int how) { struct net *net = sock_net(sk); struct sctp_endpoint *ep; - struct sctp_association *asoc; if (!sctp_style(sk, TCP)) return; - if (how & SEND_SHUTDOWN) { + ep = sctp_sk(sk)->ep; + if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) { + struct sctp_association *asoc; + sk->sk_state = SCTP_SS_CLOSING; - ep = sctp_sk(sk)->ep; - if (!list_empty(&ep->asocs)) { - asoc = list_entry(ep->asocs.next, - struct sctp_association, asocs); - sctp_primitive_SHUTDOWN(net, asoc, NULL); - } + asoc = list_entry(ep->asocs.next, + struct sctp_association, asocs); + sctp_primitive_SHUTDOWN(net, asoc, NULL); } } @@ -4480,12 +4482,9 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), if (!transport || !sctp_transport_hold(transport)) goto out; - sctp_association_hold(transport->asoc); - sctp_transport_put(transport); - rcu_read_unlock(); err = cb(transport, p); - sctp_association_put(transport->asoc); + sctp_transport_put(transport); out: return err; diff --git a/net/socket.c b/net/socket.c index 5a9bf5e..272518b 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2038,6 +2038,8 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (err) break; ++datagrams; + if (msg_data_left(&msg_sys)) + break; cond_resched(); } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 145082e..5d1c14a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2812,7 +2812,8 @@ static int unix_seq_show(struct seq_file *seq, void *v) i++; } for ( ; i < len; i++) - seq_putc(seq, u->addr->name->sun_path[i]); + seq_putc(seq, u->addr->name->sun_path[i] ?: + '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); diff --git a/samples/bpf/Makefile b/samples/bpf/Makefile index 12b7304..72c5867 100644 --- a/samples/bpf/Makefile +++ b/samples/bpf/Makefile @@ -27,6 +27,7 @@ hostprogs-y += xdp2 hostprogs-y += test_current_task_under_cgroup hostprogs-y += trace_event hostprogs-y += sampleip +hostprogs-y += tc_l2_redirect test_verifier-objs := test_verifier.o libbpf.o test_maps-objs := test_maps.o libbpf.o @@ -56,6 +57,7 @@ test_current_task_under_cgroup-objs := bpf_load.o libbpf.o \ test_current_task_under_cgroup_user.o trace_event-objs := bpf_load.o libbpf.o trace_event_user.o sampleip-objs := bpf_load.o libbpf.o sampleip_user.o +tc_l2_redirect-objs := bpf_load.o libbpf.o tc_l2_redirect_user.o # Tell kbuild to always build the programs always := $(hostprogs-y) @@ -72,6 +74,7 @@ always += test_probe_write_user_kern.o always += trace_output_kern.o always += tcbpf1_kern.o always += tcbpf2_kern.o +always += tc_l2_redirect_kern.o always += lathist_kern.o always += offwaketime_kern.o always += spintest_kern.o @@ -111,6 +114,7 @@ HOSTLOADLIBES_xdp2 += -lelf HOSTLOADLIBES_test_current_task_under_cgroup += -lelf HOSTLOADLIBES_trace_event += -lelf HOSTLOADLIBES_sampleip += -lelf +HOSTLOADLIBES_tc_l2_redirect += -l elf # Allows pointing LLC/CLANG to a LLVM backend with bpf support, redefine on cmdline: # make samples/bpf/ LLC=~/git/llvm/build/bin/llc CLANG=~/git/llvm/build/bin/clang diff --git a/samples/bpf/tc_l2_redirect.sh b/samples/bpf/tc_l2_redirect.sh new file mode 100755 index 0000000..80a05591 --- /dev/null +++ b/samples/bpf/tc_l2_redirect.sh @@ -0,0 +1,173 @@ +#!/bin/bash + +[[ -z $TC ]] && TC='tc' +[[ -z $IP ]] && IP='ip' + +REDIRECT_USER='./tc_l2_redirect' +REDIRECT_BPF='./tc_l2_redirect_kern.o' + +RP_FILTER=$(< /proc/sys/net/ipv4/conf/all/rp_filter) +IPV6_FORWARDING=$(< /proc/sys/net/ipv6/conf/all/forwarding) + +function config_common { + local tun_type=$1 + + $IP netns add ns1 + $IP netns add ns2 + $IP link add ve1 type veth peer name vens1 + $IP link add ve2 type veth peer name vens2 + $IP link set dev ve1 up + $IP link set dev ve2 up + $IP link set dev ve1 mtu 1500 + $IP link set dev ve2 mtu 1500 + $IP link set dev vens1 netns ns1 + $IP link set dev vens2 netns ns2 + + $IP -n ns1 link set dev lo up + $IP -n ns1 link set dev vens1 up + $IP -n ns1 addr add 10.1.1.101/24 dev vens1 + $IP -n ns1 addr add 2401:db01::65/64 dev vens1 nodad + $IP -n ns1 route add default via 10.1.1.1 dev vens1 + $IP -n ns1 route add default via 2401:db01::1 dev vens1 + + $IP -n ns2 link set dev lo up + $IP -n ns2 link set dev vens2 up + $IP -n ns2 addr add 10.2.1.102/24 dev vens2 + $IP -n ns2 addr add 2401:db02::66/64 dev vens2 nodad + $IP -n ns2 addr add 10.10.1.102 dev lo + $IP -n ns2 addr add 2401:face::66/64 dev lo nodad + $IP -n ns2 link add ipt2 type ipip local 10.2.1.102 remote 10.2.1.1 + $IP -n ns2 link add ip6t2 type ip6tnl mode any local 2401:db02::66 remote 2401:db02::1 + $IP -n ns2 link set dev ipt2 up + $IP -n ns2 link set dev ip6t2 up + $IP netns exec ns2 $TC qdisc add dev vens2 clsact + $IP netns exec ns2 $TC filter add dev vens2 ingress bpf da obj $REDIRECT_BPF sec drop_non_tun_vip + if [[ $tun_type == "ipip" ]]; then + $IP -n ns2 route add 10.1.1.0/24 dev ipt2 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ipt2.rp_filter=0 + else + $IP -n ns2 route add 10.1.1.0/24 dev ip6t2 + $IP -n ns2 route add 2401:db01::/64 dev ip6t2 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.all.rp_filter=0 + $IP netns exec ns2 sysctl -q -w net.ipv4.conf.ip6t2.rp_filter=0 + fi + + $IP addr add 10.1.1.1/24 dev ve1 + $IP addr add 2401:db01::1/64 dev ve1 nodad + $IP addr add 10.2.1.1/24 dev ve2 + $IP addr add 2401:db02::1/64 dev ve2 nodad + + $TC qdisc add dev ve2 clsact + $TC filter add dev ve2 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_forward + + sysctl -q -w net.ipv4.conf.all.rp_filter=0 + sysctl -q -w net.ipv6.conf.all.forwarding=1 +} + +function cleanup { + set +e + [[ -z $DEBUG ]] || set +x + $IP netns delete ns1 >& /dev/null + $IP netns delete ns2 >& /dev/null + $IP link del ve1 >& /dev/null + $IP link del ve2 >& /dev/null + $IP link del ipt >& /dev/null + $IP link del ip6t >& /dev/null + sysctl -q -w net.ipv4.conf.all.rp_filter=$RP_FILTER + sysctl -q -w net.ipv6.conf.all.forwarding=$IPV6_FORWARDING + rm -f /sys/fs/bpf/tc/globals/tun_iface + [[ -z $DEBUG ]] || set -x + set -e +} + +function l2_to_ipip { + echo -n "l2_to_ipip $1: " + + local dir=$1 + + config_common ipip + + $IP link add ipt type ipip external + $IP link set dev ipt up + sysctl -q -w net.ipv4.conf.ipt.rp_filter=0 + sysctl -q -w net.ipv4.conf.ipt.forwarding=1 + + if [[ $dir == "egress" ]]; then + $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2 + $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect + sysctl -q -w net.ipv4.conf.ve1.forwarding=1 + else + $TC qdisc add dev ve1 clsact + $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_iptun_ingress_redirect + fi + + $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ipt/ifindex) + + $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null + + if [[ $dir == "egress" ]]; then + # test direct egress to ve2 (i.e. not forwarding from + # ve1 to ve2). + ping -c1 10.10.1.102 >& /dev/null + fi + + cleanup + + echo "OK" +} + +function l2_to_ip6tnl { + echo -n "l2_to_ip6tnl $1: " + + local dir=$1 + + config_common ip6tnl + + $IP link add ip6t type ip6tnl mode any external + $IP link set dev ip6t up + sysctl -q -w net.ipv4.conf.ip6t.rp_filter=0 + sysctl -q -w net.ipv4.conf.ip6t.forwarding=1 + + if [[ $dir == "egress" ]]; then + $IP route add 10.10.1.0/24 via 10.2.1.102 dev ve2 + $IP route add 2401:face::/64 via 2401:db02::66 dev ve2 + $TC filter add dev ve2 egress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect + sysctl -q -w net.ipv4.conf.ve1.forwarding=1 + else + $TC qdisc add dev ve1 clsact + $TC filter add dev ve1 ingress bpf da obj $REDIRECT_BPF sec l2_to_ip6tun_ingress_redirect + fi + + $REDIRECT_USER -U /sys/fs/bpf/tc/globals/tun_iface -i $(< /sys/class/net/ip6t/ifindex) + + $IP netns exec ns1 ping -c1 10.10.1.102 >& /dev/null + $IP netns exec ns1 ping -6 -c1 2401:face::66 >& /dev/null + + if [[ $dir == "egress" ]]; then + # test direct egress to ve2 (i.e. not forwarding from + # ve1 to ve2). + ping -c1 10.10.1.102 >& /dev/null + ping -6 -c1 2401:face::66 >& /dev/null + fi + + cleanup + + echo "OK" +} + +cleanup +test_names="l2_to_ipip l2_to_ip6tnl" +test_dirs="ingress egress" +if [[ $# -ge 2 ]]; then + test_names=$1 + test_dirs=$2 +elif [[ $# -ge 1 ]]; then + test_names=$1 +fi + +for t in $test_names; do + for d in $test_dirs; do + $t $d + done +done diff --git a/samples/bpf/tc_l2_redirect_kern.c b/samples/bpf/tc_l2_redirect_kern.c new file mode 100644 index 0000000..92a4472 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_kern.c @@ -0,0 +1,236 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <uapi/linux/bpf.h> +#include <uapi/linux/if_ether.h> +#include <uapi/linux/if_packet.h> +#include <uapi/linux/ip.h> +#include <uapi/linux/ipv6.h> +#include <uapi/linux/in.h> +#include <uapi/linux/tcp.h> +#include <uapi/linux/filter.h> +#include <uapi/linux/pkt_cls.h> +#include <net/ipv6.h> +#include "bpf_helpers.h" + +#define _htonl __builtin_bswap32 + +#define PIN_GLOBAL_NS 2 +struct bpf_elf_map { + __u32 type; + __u32 size_key; + __u32 size_value; + __u32 max_elem; + __u32 flags; + __u32 id; + __u32 pinning; +}; + +/* copy of 'struct ethhdr' without __packed */ +struct eth_hdr { + unsigned char h_dest[ETH_ALEN]; + unsigned char h_source[ETH_ALEN]; + unsigned short h_proto; +}; + +struct bpf_elf_map SEC("maps") tun_iface = { + .type = BPF_MAP_TYPE_ARRAY, + .size_key = sizeof(int), + .size_value = sizeof(int), + .pinning = PIN_GLOBAL_NS, + .max_elem = 1, +}; + +static __always_inline bool is_vip_addr(__be16 eth_proto, __be32 daddr) +{ + if (eth_proto == htons(ETH_P_IP)) + return (_htonl(0xffffff00) & daddr) == _htonl(0x0a0a0100); + else if (eth_proto == htons(ETH_P_IPV6)) + return (daddr == _htonl(0x2401face)); + + return false; +} + +SEC("l2_to_iptun_ingress_forward") +int _l2_to_iptun_ingress_forward(struct __sk_buff *skb) +{ + struct bpf_tunnel_key tkey = {}; + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + int key = 0, *ifindex; + + int ret; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + ifindex = bpf_map_lookup_elem(&tun_iface, &key); + if (!ifindex) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + char fmt4[] = "ingress forward to ifindex:%d daddr4:%x\n"; + struct iphdr *iph = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (iph->protocol != IPPROTO_IPIP) + return TC_ACT_OK; + + bpf_trace_printk(fmt4, sizeof(fmt4), *ifindex, + _htonl(iph->daddr)); + return bpf_redirect(*ifindex, BPF_F_INGRESS); + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + char fmt6[] = "ingress forward to ifindex:%d daddr6:%x::%x\n"; + struct ipv6hdr *ip6h = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (ip6h->nexthdr != IPPROTO_IPIP && + ip6h->nexthdr != IPPROTO_IPV6) + return TC_ACT_OK; + + bpf_trace_printk(fmt6, sizeof(fmt6), *ifindex, + _htonl(ip6h->daddr.s6_addr32[0]), + _htonl(ip6h->daddr.s6_addr32[3])); + return bpf_redirect(*ifindex, BPF_F_INGRESS); + } + + return TC_ACT_OK; +} + +SEC("l2_to_iptun_ingress_redirect") +int _l2_to_iptun_ingress_redirect(struct __sk_buff *skb) +{ + struct bpf_tunnel_key tkey = {}; + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + int key = 0, *ifindex; + + int ret; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + ifindex = bpf_map_lookup_elem(&tun_iface, &key); + if (!ifindex) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; + struct iphdr *iph = data + sizeof(*eth); + __be32 daddr = iph->daddr; + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (!is_vip_addr(eth->h_proto, daddr)) + return TC_ACT_OK; + + bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(daddr), *ifindex); + } else { + return TC_ACT_OK; + } + + tkey.tunnel_id = 10000; + tkey.tunnel_ttl = 64; + tkey.remote_ipv4 = 0x0a020166; /* 10.2.1.102 */ + bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), 0); + return bpf_redirect(*ifindex, 0); +} + +SEC("l2_to_ip6tun_ingress_redirect") +int _l2_to_ip6tun_ingress_redirect(struct __sk_buff *skb) +{ + struct bpf_tunnel_key tkey = {}; + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + int key = 0, *ifindex; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + ifindex = bpf_map_lookup_elem(&tun_iface, &key); + if (!ifindex) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + char fmt4[] = "e/ingress redirect daddr4:%x to ifindex:%d\n"; + struct iphdr *iph = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (!is_vip_addr(eth->h_proto, iph->daddr)) + return TC_ACT_OK; + + bpf_trace_printk(fmt4, sizeof(fmt4), _htonl(iph->daddr), + *ifindex); + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + char fmt6[] = "e/ingress redirect daddr6:%x to ifindex:%d\n"; + struct ipv6hdr *ip6h = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (!is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) + return TC_ACT_OK; + + bpf_trace_printk(fmt6, sizeof(fmt6), + _htonl(ip6h->daddr.s6_addr32[0]), *ifindex); + } else { + return TC_ACT_OK; + } + + tkey.tunnel_id = 10000; + tkey.tunnel_ttl = 64; + /* 2401:db02:0:0:0:0:0:66 */ + tkey.remote_ipv6[0] = _htonl(0x2401db02); + tkey.remote_ipv6[1] = 0; + tkey.remote_ipv6[2] = 0; + tkey.remote_ipv6[3] = _htonl(0x00000066); + bpf_skb_set_tunnel_key(skb, &tkey, sizeof(tkey), BPF_F_TUNINFO_IPV6); + return bpf_redirect(*ifindex, 0); +} + +SEC("drop_non_tun_vip") +int _drop_non_tun_vip(struct __sk_buff *skb) +{ + struct bpf_tunnel_key tkey = {}; + void *data = (void *)(long)skb->data; + struct eth_hdr *eth = data; + void *data_end = (void *)(long)skb->data_end; + + if (data + sizeof(*eth) > data_end) + return TC_ACT_OK; + + if (eth->h_proto == htons(ETH_P_IP)) { + struct iphdr *iph = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*iph) > data_end) + return TC_ACT_OK; + + if (is_vip_addr(eth->h_proto, iph->daddr)) + return TC_ACT_SHOT; + } else if (eth->h_proto == htons(ETH_P_IPV6)) { + struct ipv6hdr *ip6h = data + sizeof(*eth); + + if (data + sizeof(*eth) + sizeof(*ip6h) > data_end) + return TC_ACT_OK; + + if (is_vip_addr(eth->h_proto, ip6h->daddr.s6_addr32[0])) + return TC_ACT_SHOT; + } + + return TC_ACT_OK; +} + +char _license[] SEC("license") = "GPL"; diff --git a/samples/bpf/tc_l2_redirect_user.c b/samples/bpf/tc_l2_redirect_user.c new file mode 100644 index 0000000..4013c53 --- /dev/null +++ b/samples/bpf/tc_l2_redirect_user.c @@ -0,0 +1,73 @@ +/* Copyright (c) 2016 Facebook + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + */ +#include <linux/unistd.h> +#include <linux/bpf.h> + +#include <stdlib.h> +#include <stdio.h> +#include <unistd.h> +#include <string.h> +#include <errno.h> + +#include "libbpf.h" + +static void usage(void) +{ + printf("Usage: tc_l2_ipip_redirect [...]\n"); + printf(" -U <file> Update an already pinned BPF array\n"); + printf(" -i <ifindex> Interface index\n"); + printf(" -h Display this help\n"); +} + +int main(int argc, char **argv) +{ + const char *pinned_file = NULL; + int ifindex = -1; + int array_key = 0; + int array_fd = -1; + int ret = -1; + int opt; + + while ((opt = getopt(argc, argv, "F:U:i:")) != -1) { + switch (opt) { + /* General args */ + case 'U': + pinned_file = optarg; + break; + case 'i': + ifindex = atoi(optarg); + break; + default: + usage(); + goto out; + } + } + + if (ifindex < 0 || !pinned_file) { + usage(); + goto out; + } + + array_fd = bpf_obj_get(pinned_file); + if (array_fd < 0) { + fprintf(stderr, "bpf_obj_get(%s): %s(%d)\n", + pinned_file, strerror(errno), errno); + goto out; + } + + /* bpf_tunnel_key.remote_ipv4 expects host byte orders */ + ret = bpf_update_elem(array_fd, &array_key, &ifindex, 0); + if (ret) { + perror("bpf_update_elem"); + goto out; + } + +out: + if (array_fd != -1) + close(array_fd); + return ret; +} |