From 5933a7bbb5de66482ea8aa874a7ebaf8e67603c4 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Tue, 1 Apr 2014 09:23:01 +0300 Subject: net: vxlan: fix crash when interface is created with no group If the vxlan interface is created without explicit group definition, there are corner cases which may cause kernel panic. For instance, in the following scenario: node A: $ ip link add dev vxlan42 address 2c:c2:60:00:10:20 type vxlan id 42 $ ip addr add dev vxlan42 10.0.0.1/24 $ ip link set up dev vxlan42 $ arp -i vxlan42 -s 10.0.0.2 2c:c2:60:00:01:02 $ bridge fdb add dev vxlan42 to 2c:c2:60:00:01:02 dst $ ping 10.0.0.2 node B: $ ip link add dev vxlan42 address 2c:c2:60:00:01:02 type vxlan id 42 $ ip addr add dev vxlan42 10.0.0.2/24 $ ip link set up dev vxlan42 $ arp -i vxlan42 -s 10.0.0.1 2c:c2:60:00:10:20 node B crashes: vxlan42: 2c:c2:60:00:10:20 migrated from 4011:eca4:c0a8:6466:c0a8:6415:8e09:2118 to (invalid address) vxlan42: 2c:c2:60:00:10:20 migrated from 4011:eca4:c0a8:6466:c0a8:6415:8e09:2118 to (invalid address) BUG: unable to handle kernel NULL pointer dereference at 0000000000000046 IP: [] ip6_route_output+0x58/0x82 PGD 7bd89067 PUD 7bd4e067 PMD 0 Oops: 0000 [#1] SMP Modules linked in: CPU: 1 PID: 0 Comm: swapper/1 Not tainted 3.14.0-rc8-hvx-xen-00019-g97a5221-dirty #154 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 task: ffff88007c774f50 ti: ffff88007c79c000 task.ti: ffff88007c79c000 RIP: 0010:[] [] ip6_route_output+0x58/0x82 RSP: 0018:ffff88007fd03668 EFLAGS: 00010282 RAX: 0000000000000000 RBX: ffffffff8186a000 RCX: 0000000000000040 RDX: 0000000000000000 RSI: ffff88007b0e4a80 RDI: ffff88007fd03754 RBP: ffff88007fd03688 R08: ffff88007b0e4a80 R09: 0000000000000000 R10: 0200000a0100000a R11: 0001002200000000 R12: ffff88007fd03740 R13: ffff88007b0e4a80 R14: ffff88007b0e4a80 R15: ffff88007bba0c50 FS: 0000000000000000(0000) GS:ffff88007fd00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000046 CR3: 
000000007bb60000 CR4: 00000000000006e0 Stack: 0000000000000000 ffff88007fd037a0 ffffffff8186a000 ffff88007fd03740 ffff88007fd036c8 ffffffff814320bb 0000000000006e49 ffff88007b8b7360 ffff88007bdbf200 ffff88007bcbc000 ffff88007b8b7000 ffff88007b8b7360 Call Trace: [] ip6_dst_lookup_tail+0x2d/0xa4 [] ip6_dst_lookup+0x10/0x12 [] vxlan_xmit_one+0x32a/0x68c [] ? _raw_spin_unlock_irqrestore+0x12/0x14 [] ? lock_timer_base.isra.23+0x26/0x4b [] vxlan_xmit+0x66a/0x6a8 [] ? ipt_do_table+0x35f/0x37e [] ? selinux_ip_postroute+0x41/0x26e [] dev_hard_start_xmit+0x2ce/0x3ce [] __dev_queue_xmit+0x2d0/0x392 [] ? eth_header+0x28/0xb5 [] dev_queue_xmit+0xb/0xd [] neigh_resolve_output+0x134/0x152 [] ip_finish_output2+0x236/0x299 [] ip_finish_output+0x98/0x9d [] ip_output+0x62/0x67 [] dst_output+0xf/0x11 [] ip_local_out+0x1b/0x1f [] ip_send_skb+0x11/0x37 [] ip_push_pending_frames+0x2f/0x33 [] icmp_push_reply+0x106/0x115 [] icmp_reply+0x142/0x164 [] icmp_echo.part.16+0x46/0x48 [] ? nf_iterate+0x43/0x80 [] ? xfrm4_policy_check.constprop.11+0x52/0x52 [] icmp_echo+0x25/0x27 [] icmp_rcv+0x1d2/0x20a [] ? xfrm4_policy_check.constprop.11+0x52/0x52 [] ip_local_deliver_finish+0xd6/0x14f [] ? xfrm4_policy_check.constprop.11+0x52/0x52 [] NF_HOOK.constprop.10+0x4c/0x53 [] ip_local_deliver+0x4a/0x4f [] ip_rcv_finish+0x253/0x26a [] ? inet_add_protocol+0x3e/0x3e [] NF_HOOK.constprop.10+0x4c/0x53 [] ip_rcv+0x2a6/0x2ec [] __netif_receive_skb_core+0x43e/0x478 [] ? virtqueue_poll+0x16/0x27 [] __netif_receive_skb+0x55/0x5a [] process_backlog+0x76/0x12f [] net_rx_action+0xa2/0x1ab [] __do_softirq+0xca/0x1d1 [] irq_exit+0x3e/0x85 [] do_IRQ+0xa9/0xc4 [] common_interrupt+0x6d/0x6d [] ? 
native_safe_halt+0x6/0x8 [] default_idle+0x9/0xd [] arch_cpu_idle+0x13/0x1c [] cpu_startup_entry+0xbc/0x137 [] start_secondary+0x1a0/0x1a5 Code: 24 14 e8 f1 e5 01 00 31 d2 a8 32 0f 95 c2 49 8b 44 24 2c 49 0b 44 24 24 74 05 83 ca 04 eb 1c 4d 85 ed 74 17 49 8b 85 a8 02 00 00 <66> 8b 40 46 66 c1 e8 07 83 e0 07 c1 e0 03 09 c2 4c 89 e6 48 89 RIP [] ip6_route_output+0x58/0x82 RSP CR2: 0000000000000046 ---[ end trace 4612329caab37efd ]--- When vxlan interface is created without explicit group definition, the default_dst protocol family is initialized to AF_UNSPEC and the driver assumes IPv4 configuration. On the other side, the default_dst protocol family is used to differentiate between IPv4 and IPv6 cases and, since AF_UNSPEC != AF_INET, the processing takes the IPv6 path. Making the IPv4 assumption explicit by setting default_dst protocol family to AF_INET and preventing mixing of IPv4 and IPv6 addresses in snooped fdb entries fixes the corner case crashes. Signed-off-by: Mike Rapoport Signed-off-by: David S. 
Miller --- drivers/net/vxlan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c index 0d862a5..c55e316 100644 --- a/drivers/net/vxlan.c +++ b/drivers/net/vxlan.c @@ -871,6 +871,9 @@ static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[], if (err) return err; + if (vxlan->default_dst.remote_ip.sa.sa_family != ip.sa.sa_family) + return -EAFNOSUPPORT; + spin_lock_bh(&vxlan->hash_lock); err = vxlan_fdb_create(vxlan, addr, &ip, ndm->ndm_state, flags, port, vni, ifindex, ndm->ndm_flags); @@ -2601,9 +2604,10 @@ static int vxlan_newlink(struct net *net, struct net_device *dev, vni = nla_get_u32(data[IFLA_VXLAN_ID]); dst->remote_vni = vni; + /* Unless IPv6 is explicitly requested, assume IPv4 */ + dst->remote_ip.sa.sa_family = AF_INET; if (data[IFLA_VXLAN_GROUP]) { dst->remote_ip.sin.sin_addr.s_addr = nla_get_be32(data[IFLA_VXLAN_GROUP]); - dst->remote_ip.sa.sa_family = AF_INET; } else if (data[IFLA_VXLAN_GROUP6]) { if (!IS_ENABLED(CONFIG_IPV6)) return -EPFNOSUPPORT; -- cgit v1.1 From 79eb9d28c9b22fa419e2c3f1b2cc6e285720ae41 Mon Sep 17 00:00:00 2001 From: Alexei Starovoitov Date: Tue, 1 Apr 2014 18:26:48 -0700 Subject: net: ti: fix CPTS driver build on arm fix build errors: drivers/net/ethernet/ti/cpts.c:266:12: error: 'ETH_HLEN' undeclared (first use in this function) drivers/net/ethernet/ti/cpts.c:276:23: error: 'VLAN_HLEN' undeclared (first use in this function) Fixes: 408eccce3204 ("net: ptp: move PTP classifier in its own file") Reported-by: Fengguang Wu Signed-off-by: Alexei Starovoitov Suggested-by: Daniel Borkmann Cc: Richard Cochran Acked-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- drivers/net/ethernet/ti/cpts.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/net/ethernet/ti/cpts.c b/drivers/net/ethernet/ti/cpts.c index a3bbf59..2435139 100644 --- a/drivers/net/ethernet/ti/cpts.c +++ b/drivers/net/ethernet/ti/cpts.c @@ -26,6 +26,8 @@ #include #include #include +#include +#include #include "cpts.h" -- cgit v1.1 From 77bc6bed7121936bb2e019a8c336075f4c8eef62 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?YOSHIFUJI=20Hideaki=20/=20=E5=90=89=E8=97=A4=E8=8B=B1?= =?UTF-8?q?=E6=98=8E?= Date: Wed, 2 Apr 2014 12:48:42 +0900 Subject: isdnloop: Validate NUL-terminated strings from user. Return -EINVAL unless all of user-given strings are correctly NUL-terminated. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- drivers/isdn/isdnloop/isdnloop.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c index 02125e6..e1f8748 100644 --- a/drivers/isdn/isdnloop/isdnloop.c +++ b/drivers/isdn/isdnloop/isdnloop.c @@ -1070,6 +1070,12 @@ isdnloop_start(isdnloop_card *card, isdnloop_sdef *sdefp) return -EBUSY; if (copy_from_user((char *) &sdef, (char *) sdefp, sizeof(sdef))) return -EFAULT; + + for (i = 0; i < 3; i++) { + if (!memchr(sdef.num[i], 0, sizeof(sdef.num[i]))) + return -EINVAL; + } + spin_lock_irqsave(&card->isdnloop_lock, flags); switch (sdef.ptype) { case ISDN_PTYPE_EURO: -- cgit v1.1 From df1efc2d3037334f4a868aca9ae329d67058bbfd Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 2 Apr 2014 11:21:58 -0400 Subject: net: bnx2x: include irq.h for irqreturn_t definitions The bnx2x driver fails to build on ARM with: In file included from drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c:28:0: drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h:243:1: error: unknown type name 'irqreturn_t' irqreturn_t bnx2x_msix_sp_int(int irq, void *dev_instance); ^ drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h:251:1: error: unknown type name 'irqreturn_t' irqreturn_t 
bnx2x_interrupt(int irq, void *dev_instance); ^ Nothing in bnx2x_link.c or bnx2x_cmn.h is explicitly including the irq definitions, so we add an include of linux/irq.h to pick them up. Signed-off-by: Josh Boyer Signed-off-by: David S. Miller --- drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h index 05f4f5f..3448cc0 100644 --- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h +++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.h @@ -21,6 +21,7 @@ #include #include #include +#include #include "bnx2x.h" #include "bnx2x_sriov.h" -- cgit v1.1 From fef1f07cbf3d9b6f79beac8575554476c02c7f54 Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 2 Apr 2014 11:24:00 -0400 Subject: net: enic: include irq.h for irqreturn_t definitions The enic driver fails to build on ARM with: In file included from drivers/net/ethernet/cisco/enic/enic_res.c:40:0: drivers/net/ethernet/cisco/enic/enic.h:48:2: error: expected specifier-qualifier-list before 'irqreturn_t' irqreturn_t (*isr)(int, void *); ^ Nothing in the driver is explicitly including the irq definitions, so we add an include of linux/irq.h to pick them up. Signed-off-by: Josh Boyer Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cisco/enic/enic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/cisco/enic/enic.h b/drivers/net/ethernet/cisco/enic/enic.h index e9f7c65..e35c8e0 100644 --- a/drivers/net/ethernet/cisco/enic/enic.h +++ b/drivers/net/ethernet/cisco/enic/enic.h @@ -29,6 +29,7 @@ #include "vnic_stats.h" #include "vnic_nic.h" #include "vnic_rss.h" +#include #define DRV_NAME "enic" #define DRV_DESCRIPTION "Cisco VIC Ethernet NIC Driver" -- cgit v1.1 From acdd32be6d3ec5f872d9fd930cac54d18d6c74ac Mon Sep 17 00:00:00 2001 From: Josh Boyer Date: Wed, 2 Apr 2014 11:25:47 -0400 Subject: net: qlcnic: include irq.h for irq definitions The qlcnic driver fails to build on ARM with errors like: In file included from drivers/net/ethernet/qlogic/qlcnic/qlcnic.h:36:0, from drivers/net/ethernet/qlogic/qlcnic/qlcnic_hw.c:8: drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.h:585:1: error: unknown type name 'irqreturn_t' irqreturn_t qlcnic_83xx_clear_legacy_intr(struct qlcnic_adapter *); ^ Nothing in the driver is explicitly including the irq definitions, so we add an include of linux/irq.h to pick them up. Signed-off-by: Josh Boyer Signed-off-by: David S. Miller --- drivers/net/ethernet/qlogic/qlcnic/qlcnic.h | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h index f31bb5e..7b52a88 100644 --- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h +++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic.h @@ -23,6 +23,7 @@ #include #include #include +#include #include -- cgit v1.1 From 9074ce249321861e535cdf8de9af0930a174dda9 Mon Sep 17 00:00:00 2001 From: Zoltan Kiss Date: Wed, 2 Apr 2014 18:04:57 +0100 Subject: xen-netback: Rename map ops Rename identifiers to state explicitly that they refer to map ops. Signed-off-by: Zoltan Kiss Reviewed-by: Paul Durrant Acked-by: Ian Campbell Signed-off-by: David S. 
Miller --- drivers/net/xen-netback/netback.c | 46 ++++++++++++++++++++------------------- 1 file changed, 24 insertions(+), 22 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 3f021e0..4bb7886 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -820,13 +820,13 @@ struct xenvif_tx_cb { #define XENVIF_TX_CB(skb) ((struct xenvif_tx_cb *)(skb)->cb) -static inline void xenvif_tx_create_gop(struct xenvif *vif, - u16 pending_idx, - struct xen_netif_tx_request *txp, - struct gnttab_map_grant_ref *gop) +static inline void xenvif_tx_create_map_op(struct xenvif *vif, + u16 pending_idx, + struct xen_netif_tx_request *txp, + struct gnttab_map_grant_ref *mop) { - vif->pages_to_map[gop-vif->tx_map_ops] = vif->mmap_pages[pending_idx]; - gnttab_set_map_op(gop, idx_to_kaddr(vif, pending_idx), + vif->pages_to_map[mop-vif->tx_map_ops] = vif->mmap_pages[pending_idx]; + gnttab_set_map_op(mop, idx_to_kaddr(vif, pending_idx), GNTMAP_host_map | GNTMAP_readonly, txp->gref, vif->domid); @@ -880,7 +880,7 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif, shinfo->nr_frags++, txp++, gop++) { index = pending_index(vif->pending_cons++); pending_idx = vif->pending_ring[index]; - xenvif_tx_create_gop(vif, pending_idx, txp, gop); + xenvif_tx_create_map_op(vif, pending_idx, txp, gop); frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx); } @@ -900,7 +900,7 @@ static struct gnttab_map_grant_ref *xenvif_get_requests(struct xenvif *vif, shinfo->nr_frags++, txp++, gop++) { index = pending_index(vif->pending_cons++); pending_idx = vif->pending_ring[index]; - xenvif_tx_create_gop(vif, pending_idx, txp, gop); + xenvif_tx_create_map_op(vif, pending_idx, txp, gop); frag_set_pending_idx(&frags[shinfo->nr_frags], pending_idx); } @@ -940,9 +940,9 @@ static inline void xenvif_grant_handle_reset(struct xenvif *vif, static int xenvif_tx_check_gop(struct xenvif *vif, struct sk_buff *skb, - 
struct gnttab_map_grant_ref **gopp) + struct gnttab_map_grant_ref **gopp_map) { - struct gnttab_map_grant_ref *gop = *gopp; + struct gnttab_map_grant_ref *gop_map = *gopp_map; u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx; struct skb_shared_info *shinfo = skb_shinfo(skb); struct pending_tx_info *tx_info; @@ -951,11 +951,11 @@ static int xenvif_tx_check_gop(struct xenvif *vif, struct sk_buff *first_skb = NULL; /* Check status of header. */ - err = gop->status; + err = gop_map->status; if (unlikely(err)) xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); else - xenvif_grant_handle_set(vif, pending_idx , gop->handle); + xenvif_grant_handle_set(vif, pending_idx , gop_map->handle); /* Skip first skb fragment if it is on same page as header fragment. */ start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); @@ -968,10 +968,12 @@ check_frags: tx_info = &vif->pending_tx_info[pending_idx]; /* Check error status: if okay then remember grant handle. */ - newerr = (++gop)->status; + newerr = (++gop_map)->status; if (likely(!newerr)) { - xenvif_grant_handle_set(vif, pending_idx , gop->handle); + xenvif_grant_handle_set(vif, + pending_idx, + gop_map->handle); /* Had a previous error? Invalidate this fragment. 
*/ if (unlikely(err)) xenvif_idx_unmap(vif, pending_idx); @@ -1023,7 +1025,7 @@ check_frags: } } - *gopp = gop + 1; + *gopp_map = gop_map + 1; return err; } @@ -1292,7 +1294,7 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) } } - xenvif_tx_create_gop(vif, pending_idx, &txreq, gop); + xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop); gop++; @@ -1399,7 +1401,7 @@ static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb) static int xenvif_tx_submit(struct xenvif *vif) { - struct gnttab_map_grant_ref *gop = vif->tx_map_ops; + struct gnttab_map_grant_ref *gop_map = vif->tx_map_ops; struct sk_buff *skb; int work_done = 0; @@ -1412,7 +1414,7 @@ static int xenvif_tx_submit(struct xenvif *vif) txp = &vif->pending_tx_info[pending_idx].req; /* Check the remap error code. */ - if (unlikely(xenvif_tx_check_gop(vif, skb, &gop))) { + if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map))) { netdev_dbg(vif->dev, "netback grant failed.\n"); skb_shinfo(skb)->nr_frags = 0; kfree_skb(skb); @@ -1611,21 +1613,21 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif) /* Called after netfront has transmitted */ int xenvif_tx_action(struct xenvif *vif, int budget) { - unsigned nr_gops; + unsigned nr_mops; int work_done, ret; if (unlikely(!tx_work_todo(vif))) return 0; - nr_gops = xenvif_tx_build_gops(vif, budget); + nr_mops = xenvif_tx_build_gops(vif, budget); - if (nr_gops == 0) + if (nr_mops == 0) return 0; ret = gnttab_map_refs(vif->tx_map_ops, NULL, vif->pages_to_map, - nr_gops); + nr_mops); BUG_ON(ret); work_done = xenvif_tx_submit(vif); -- cgit v1.1 From bdab82759b8e3620096d6db46dc1cac38a52d779 Mon Sep 17 00:00:00 2001 From: Zoltan Kiss Date: Wed, 2 Apr 2014 18:04:58 +0100 Subject: xen-netback: Grant copy the header instead of map and memcpy An old inefficiency of the TX path that we are grant mapping the first slot, and then copy the header part to the linear area. 
Instead, doing a grant copy for that header straight on is more reasonable. Especially because there are ongoing efforts to make Xen avoiding TLB flush after unmap when the page were not touched in Dom0. In the original way the memcpy ruined that. The key changes: - the vif has a tx_copy_ops array again - xenvif_tx_build_gops sets up the grant copy operations - we don't have to figure out whether the header and first frag are on the same grant mapped page or not Note, we only grant copy PKT_PROT_LEN bytes from the first slot, the rest (if any) will be on the first frag, which is grant mapped. If the first slot is smaller than PKT_PROT_LEN, then we grant copy that, and later __pskb_pull_tail will pull more from the frags (if any) Signed-off-by: Zoltan Kiss Reviewed-by: Paul Durrant Acked-by: Ian Campbell Signed-off-by: David S. Miller --- drivers/net/xen-netback/common.h | 1 + drivers/net/xen-netback/netback.c | 122 +++++++++++++++++++++----------------- 2 files changed, 70 insertions(+), 53 deletions(-) diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h index 89d1d05..630a3fc 100644 --- a/drivers/net/xen-netback/common.h +++ b/drivers/net/xen-netback/common.h @@ -124,6 +124,7 @@ struct xenvif { struct pending_tx_info pending_tx_info[MAX_PENDING_REQS]; grant_handle_t grant_tx_handle[MAX_PENDING_REQS]; + struct gnttab_copy tx_copy_ops[MAX_PENDING_REQS]; struct gnttab_map_grant_ref tx_map_ops[MAX_PENDING_REQS]; struct gnttab_unmap_grant_ref tx_unmap_ops[MAX_PENDING_REQS]; /* passed to gnttab_[un]map_refs with pages under (un)mapping */ diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 4bb7886..99c8f09 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -940,35 +940,37 @@ static inline void xenvif_grant_handle_reset(struct xenvif *vif, static int xenvif_tx_check_gop(struct xenvif *vif, struct sk_buff *skb, - struct gnttab_map_grant_ref **gopp_map) + struct 
gnttab_map_grant_ref **gopp_map, + struct gnttab_copy **gopp_copy) { struct gnttab_map_grant_ref *gop_map = *gopp_map; u16 pending_idx = XENVIF_TX_CB(skb)->pending_idx; struct skb_shared_info *shinfo = skb_shinfo(skb); - struct pending_tx_info *tx_info; int nr_frags = shinfo->nr_frags; - int i, err, start; + int i, err; struct sk_buff *first_skb = NULL; /* Check status of header. */ - err = gop_map->status; - if (unlikely(err)) + err = (*gopp_copy)->status; + (*gopp_copy)++; + if (unlikely(err)) { + if (net_ratelimit()) + netdev_dbg(vif->dev, + "Grant copy of header failed! status: %d pending_idx% %u ref: %u\n", + (*gopp_copy)->status, + pending_idx, + (*gopp_copy)->source.u.ref); xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); - else - xenvif_grant_handle_set(vif, pending_idx , gop_map->handle); - - /* Skip first skb fragment if it is on same page as header fragment. */ - start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); + } check_frags: - for (i = start; i < nr_frags; i++) { + for (i = 0; i < nr_frags; i++, gop_map++) { int j, newerr; pending_idx = frag_get_pending_idx(&shinfo->frags[i]); - tx_info = &vif->pending_tx_info[pending_idx]; /* Check error status: if okay then remember grant handle. */ - newerr = (++gop_map)->status; + newerr = gop_map->status; if (likely(!newerr)) { xenvif_grant_handle_set(vif, @@ -981,18 +983,20 @@ check_frags: } /* Error on this fragment: respond to client with an error. */ + if (net_ratelimit()) + netdev_dbg(vif->dev, + "Grant map of %d. frag failed! status: %d pending_idx% %u ref: %u\n", + i, + gop_map->status, + pending_idx, + gop_map->ref); xenvif_idx_release(vif, pending_idx, XEN_NETIF_RSP_ERROR); /* Not the first error? Preceding frags already invalidated. */ if (err) continue; - /* First error: invalidate header and preceding fragments. 
*/ - if (!first_skb) - pending_idx = XENVIF_TX_CB(skb)->pending_idx; - else - pending_idx = XENVIF_TX_CB(skb)->pending_idx; - xenvif_idx_unmap(vif, pending_idx); - for (j = start; j < i; j++) { + /* First error: invalidate preceding fragments. */ + for (j = 0; j < i; j++) { pending_idx = frag_get_pending_idx(&shinfo->frags[j]); xenvif_idx_unmap(vif, pending_idx); } @@ -1006,7 +1010,6 @@ check_frags: skb = shinfo->frag_list; shinfo = skb_shinfo(skb); nr_frags = shinfo->nr_frags; - start = 0; goto check_frags; } @@ -1017,15 +1020,13 @@ check_frags: if (first_skb && err) { int j; shinfo = skb_shinfo(first_skb); - pending_idx = XENVIF_TX_CB(skb)->pending_idx; - start = (frag_get_pending_idx(&shinfo->frags[0]) == pending_idx); - for (j = start; j < shinfo->nr_frags; j++) { + for (j = 0; j < shinfo->nr_frags; j++) { pending_idx = frag_get_pending_idx(&shinfo->frags[j]); xenvif_idx_unmap(vif, pending_idx); } } - *gopp_map = gop_map + 1; + *gopp_map = gop_map; return err; } @@ -1036,9 +1037,6 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb) int i; u16 prev_pending_idx = INVALID_PENDING_IDX; - if (skb_shinfo(skb)->destructor_arg) - prev_pending_idx = XENVIF_TX_CB(skb)->pending_idx; - for (i = 0; i < nr_frags; i++) { skb_frag_t *frag = shinfo->frags + i; struct xen_netif_tx_request *txp; @@ -1048,10 +1046,10 @@ static void xenvif_fill_frags(struct xenvif *vif, struct sk_buff *skb) pending_idx = frag_get_pending_idx(frag); /* If this is not the first frag, chain it to the previous*/ - if (unlikely(prev_pending_idx == INVALID_PENDING_IDX)) + if (prev_pending_idx == INVALID_PENDING_IDX) skb_shinfo(skb)->destructor_arg = &callback_param(vif, pending_idx); - else if (likely(pending_idx != prev_pending_idx)) + else callback_param(vif, prev_pending_idx).ctx = &callback_param(vif, pending_idx); @@ -1191,7 +1189,10 @@ static bool tx_credit_exceeded(struct xenvif *vif, unsigned size) return false; } -static unsigned xenvif_tx_build_gops(struct xenvif *vif, int 
budget) +static void xenvif_tx_build_gops(struct xenvif *vif, + int budget, + unsigned *copy_ops, + unsigned *map_ops) { struct gnttab_map_grant_ref *gop = vif->tx_map_ops, *request_gop; struct sk_buff *skb; @@ -1294,22 +1295,36 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) } } - xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop); - - gop++; - XENVIF_TX_CB(skb)->pending_idx = pending_idx; __skb_put(skb, data_len); + vif->tx_copy_ops[*copy_ops].source.u.ref = txreq.gref; + vif->tx_copy_ops[*copy_ops].source.domid = vif->domid; + vif->tx_copy_ops[*copy_ops].source.offset = txreq.offset; + + vif->tx_copy_ops[*copy_ops].dest.u.gmfn = + virt_to_mfn(skb->data); + vif->tx_copy_ops[*copy_ops].dest.domid = DOMID_SELF; + vif->tx_copy_ops[*copy_ops].dest.offset = + offset_in_page(skb->data); + + vif->tx_copy_ops[*copy_ops].len = data_len; + vif->tx_copy_ops[*copy_ops].flags = GNTCOPY_source_gref; + + (*copy_ops)++; skb_shinfo(skb)->nr_frags = ret; if (data_len < txreq.size) { skb_shinfo(skb)->nr_frags++; frag_set_pending_idx(&skb_shinfo(skb)->frags[0], pending_idx); + xenvif_tx_create_map_op(vif, pending_idx, &txreq, gop); + gop++; } else { frag_set_pending_idx(&skb_shinfo(skb)->frags[0], INVALID_PENDING_IDX); + memcpy(&vif->pending_tx_info[pending_idx].req, &txreq, + sizeof(txreq)); } vif->pending_cons++; @@ -1326,11 +1341,13 @@ static unsigned xenvif_tx_build_gops(struct xenvif *vif, int budget) vif->tx.req_cons = idx; - if ((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops)) + if (((gop-vif->tx_map_ops) >= ARRAY_SIZE(vif->tx_map_ops)) || + (*copy_ops >= ARRAY_SIZE(vif->tx_copy_ops))) break; } - return gop - vif->tx_map_ops; + (*map_ops) = gop - vif->tx_map_ops; + return; } /* Consolidate skb with a frag_list into a brand new one with local pages on @@ -1402,6 +1419,7 @@ static int xenvif_handle_frag_list(struct xenvif *vif, struct sk_buff *skb) static int xenvif_tx_submit(struct xenvif *vif) { struct gnttab_map_grant_ref *gop_map = 
vif->tx_map_ops; + struct gnttab_copy *gop_copy = vif->tx_copy_ops; struct sk_buff *skb; int work_done = 0; @@ -1414,27 +1432,22 @@ static int xenvif_tx_submit(struct xenvif *vif) txp = &vif->pending_tx_info[pending_idx].req; /* Check the remap error code. */ - if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map))) { - netdev_dbg(vif->dev, "netback grant failed.\n"); + if (unlikely(xenvif_tx_check_gop(vif, skb, &gop_map, &gop_copy))) { skb_shinfo(skb)->nr_frags = 0; kfree_skb(skb); continue; } data_len = skb->len; - memcpy(skb->data, - (void *)(idx_to_kaddr(vif, pending_idx)|txp->offset), - data_len); callback_param(vif, pending_idx).ctx = NULL; if (data_len < txp->size) { /* Append the packet payload as a fragment. */ txp->offset += data_len; txp->size -= data_len; - skb_shinfo(skb)->destructor_arg = - &callback_param(vif, pending_idx); } else { /* Schedule a response immediately. */ - xenvif_idx_unmap(vif, pending_idx); + xenvif_idx_release(vif, pending_idx, + XEN_NETIF_RSP_OKAY); } if (txp->flags & XEN_NETTXF_csum_blank) @@ -1613,22 +1626,25 @@ static inline void xenvif_tx_dealloc_action(struct xenvif *vif) /* Called after netfront has transmitted */ int xenvif_tx_action(struct xenvif *vif, int budget) { - unsigned nr_mops; + unsigned nr_mops, nr_cops = 0; int work_done, ret; if (unlikely(!tx_work_todo(vif))) return 0; - nr_mops = xenvif_tx_build_gops(vif, budget); + xenvif_tx_build_gops(vif, budget, &nr_cops, &nr_mops); - if (nr_mops == 0) + if (nr_cops == 0) return 0; - ret = gnttab_map_refs(vif->tx_map_ops, - NULL, - vif->pages_to_map, - nr_mops); - BUG_ON(ret); + gnttab_batch_copy(vif->tx_copy_ops, nr_cops); + if (nr_mops != 0) { + ret = gnttab_map_refs(vif->tx_map_ops, + NULL, + vif->pages_to_map, + nr_mops); + BUG_ON(ret); + } work_done = xenvif_tx_submit(vif); -- cgit v1.1 From 0f97ede45e65ffb6eab856313e79b14b902bcfaa Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 Apr 2014 20:52:56 +0200 Subject: packet: report tx_dropped in 
packet_direct_xmit Since commit 015f0688f57c ("net: net: add a core netdev->tx_dropped counter"), we can now account for TX drops from within the core stack instead of drivers. Therefore, fix packet_direct_xmit() and increase drop count when we encounter a problem before driver's xmit function was called (we do not want to doubly account for it). Suggested-by: Eric Dumazet Signed-off-by: Daniel Borkmann Signed-off-by: David S. Miller --- net/packet/af_packet.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 01039d2..c81a971 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -275,6 +275,7 @@ static int packet_direct_xmit(struct sk_buff *skb) return ret; drop: + atomic_long_inc(&dev->tx_dropped); kfree_skb(skb); return NET_XMIT_DROP; } -- cgit v1.1 From 8e2f1a63f2217365223026422a2f8ba5967051d6 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Wed, 2 Apr 2014 20:52:57 +0200 Subject: packet: fix packet_direct_xmit for BQL enabled drivers Currently, in packet_direct_xmit() we test the assigned netdevice queue for netif_xmit_frozen_or_stopped() before doing an ndo_start_xmit(). This can have the side-effect that BQL enabled drivers which make use of netdev_tx_sent_queue() internally, set __QUEUE_STATE_STACK_XOFF from within the stack and would not fully fill the device's TX ring from packet sockets with PACKET_QDISC_BYPASS enabled. Instead, use a test without BQL bit so that bursts can be absorbed into the NICs TX ring. Fix and code suggested by Eric Dumazet, thanks! Signed-off-by: Eric Dumazet Signed-off-by: Daniel Borkmann Signed-off-by: David S. 
Miller --- include/linux/netdevice.h | 24 +++++++++++++++++++----- net/packet/af_packet.c | 2 +- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 775cc95..7ed3a3a 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -519,11 +519,18 @@ enum netdev_queue_state_t { __QUEUE_STATE_DRV_XOFF, __QUEUE_STATE_STACK_XOFF, __QUEUE_STATE_FROZEN, -#define QUEUE_STATE_ANY_XOFF ((1 << __QUEUE_STATE_DRV_XOFF) | \ - (1 << __QUEUE_STATE_STACK_XOFF)) -#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \ - (1 << __QUEUE_STATE_FROZEN)) }; + +#define QUEUE_STATE_DRV_XOFF (1 << __QUEUE_STATE_DRV_XOFF) +#define QUEUE_STATE_STACK_XOFF (1 << __QUEUE_STATE_STACK_XOFF) +#define QUEUE_STATE_FROZEN (1 << __QUEUE_STATE_FROZEN) + +#define QUEUE_STATE_ANY_XOFF (QUEUE_STATE_DRV_XOFF | QUEUE_STATE_STACK_XOFF) +#define QUEUE_STATE_ANY_XOFF_OR_FROZEN (QUEUE_STATE_ANY_XOFF | \ + QUEUE_STATE_FROZEN) +#define QUEUE_STATE_DRV_XOFF_OR_FROZEN (QUEUE_STATE_DRV_XOFF | \ + QUEUE_STATE_FROZEN) + /* * __QUEUE_STATE_DRV_XOFF is used by drivers to stop the transmit queue. The * netif_tx_* functions below are used to manipulate this flag. 
The @@ -2252,11 +2259,18 @@ static inline bool netif_xmit_stopped(const struct netdev_queue *dev_queue) return dev_queue->state & QUEUE_STATE_ANY_XOFF; } -static inline bool netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue) +static inline bool +netif_xmit_frozen_or_stopped(const struct netdev_queue *dev_queue) { return dev_queue->state & QUEUE_STATE_ANY_XOFF_OR_FROZEN; } +static inline bool +netif_xmit_frozen_or_drv_stopped(const struct netdev_queue *dev_queue) +{ + return dev_queue->state & QUEUE_STATE_DRV_XOFF_OR_FROZEN; +} + static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, unsigned int bytes) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c81a971..72e0c71 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -261,7 +261,7 @@ static int packet_direct_xmit(struct sk_buff *skb) local_bh_disable(); HARD_TX_LOCK(dev, txq, smp_processor_id()); - if (!netif_xmit_frozen_or_stopped(txq)) { + if (!netif_xmit_frozen_or_drv_stopped(txq)) { ret = ops->ndo_start_xmit(skb, dev); if (ret == NETDEV_TX_OK) txq_trans_update(txq); -- cgit v1.1 From d0290214de712150b118a532ded378a29255893b Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Wed, 2 Apr 2014 23:09:31 +0200 Subject: net: add busy_poll device feature Currently there is no way how to find out if a device supports busy polling. So add a feature and make it dependent on ndo_busy_poll existence. Signed-off-by: Jiri Pirko Signed-off-by: David S. 
Miller --- include/linux/netdev_features.h | 2 ++ net/core/dev.c | 7 +++++++ net/core/ethtool.c | 1 + 3 files changed, 10 insertions(+) diff --git a/include/linux/netdev_features.h b/include/linux/netdev_features.h index 5a09a48..c26d0ec 100644 --- a/include/linux/netdev_features.h +++ b/include/linux/netdev_features.h @@ -63,6 +63,7 @@ enum { NETIF_F_HW_VLAN_STAG_RX_BIT, /* Receive VLAN STAG HW acceleration */ NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */ NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */ + NETIF_F_BUSY_POLL_BIT, /* Busy poll */ /* * Add your fresh new feature above and remember to update @@ -118,6 +119,7 @@ enum { #define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX) #define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX) #define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD) +#define NETIF_F_BUSY_POLL __NETIF_F(BUSY_POLL) /* Features valid for ethtool to change */ /* = all defined minus driver/device-class-related */ diff --git a/net/core/dev.c b/net/core/dev.c index 7570634..75e88e0 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5696,6 +5696,13 @@ static netdev_features_t netdev_fix_features(struct net_device *dev, } } +#ifdef CONFIG_NET_RX_BUSY_POLL + if (dev->netdev_ops->ndo_busy_poll) + features |= NETIF_F_BUSY_POLL; + else +#endif + features &= ~NETIF_F_BUSY_POLL; + return features; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 30071de..640ba0e 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -97,6 +97,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] [NETIF_F_RXFCS_BIT] = "rx-fcs", [NETIF_F_RXALL_BIT] = "rx-all", [NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload", + [NETIF_F_BUSY_POLL_BIT] = "busy-poll", }; static int ethtool_get_features(struct net_device *dev, void __user *useraddr) -- cgit v1.1 From 56c8b193ea7052d191c344ef9196bb8d759e1abf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?fran=C3=A7ois=20romieu?= Date: Thu, 3 
Apr 2014 01:13:03 +0200 Subject: sxgbe: use common NET_VENDOR_FOO style. Signed-off-by: Francois Romieu Acked-by: Valdis Kletnieks Signed-off-by: David S. Miller --- drivers/net/ethernet/samsung/Kconfig | 24 ++++++++++++++++++++---- drivers/net/ethernet/samsung/sxgbe/Kconfig | 9 --------- 2 files changed, 20 insertions(+), 13 deletions(-) delete mode 100644 drivers/net/ethernet/samsung/sxgbe/Kconfig diff --git a/drivers/net/ethernet/samsung/Kconfig b/drivers/net/ethernet/samsung/Kconfig index 7902341..2360d81 100644 --- a/drivers/net/ethernet/samsung/Kconfig +++ b/drivers/net/ethernet/samsung/Kconfig @@ -3,14 +3,30 @@ # config NET_VENDOR_SAMSUNG - bool "Samsung Ethernet device" + bool "Samsung Ethernet devices" default y ---help--- - This is the driver for the SXGBE 10G Ethernet IP block found on Samsung - platforms. + If you have a network (Ethernet) chipset belonging to this class, + say Y. + + Note that the answer to this question does not directly affect + the kernel: saying N will just case the configurator to skip all + the questions about Samsung chipsets. If you say Y, you will be asked + for your specific chipset/driver in the following questions. if NET_VENDOR_SAMSUNG -source "drivers/net/ethernet/samsung/sxgbe/Kconfig" +config SXGBE_ETH + tristate "Samsung 10G/2.5G/1G SXGBE Ethernet driver" + depends on HAS_IOMEM && HAS_DMA + select PHYLIB + select CRC32 + select PTP_1588_CLOCK + ---help--- + This is the driver for the SXGBE 10G Ethernet IP block found on + Samsung platforms. + + To compile this driver as a module, choose M here: the module + will be called samsung-sxgbe. 
endif # NET_VENDOR_SAMSUNG diff --git a/drivers/net/ethernet/samsung/sxgbe/Kconfig b/drivers/net/ethernet/samsung/sxgbe/Kconfig deleted file mode 100644 index d79288c..0000000 --- a/drivers/net/ethernet/samsung/sxgbe/Kconfig +++ /dev/null @@ -1,9 +0,0 @@ -config SXGBE_ETH - tristate "Samsung 10G/2.5G/1G SXGBE Ethernet driver" - depends on HAS_IOMEM && HAS_DMA - select PHYLIB - select CRC32 - select PTP_1588_CLOCK - ---help--- - This is the driver for the SXGBE 10G Ethernet IP block found on Samsung - platforms. -- cgit v1.1 From d9bd6461681c78fbe18087c20b4bb47845cd5564 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?fran=C3=A7ois=20romieu?= Date: Thu, 3 Apr 2014 01:13:43 +0200 Subject: sxgbe: fix driver probe error path and driver removal leaks sxgbe_drv_probe: mdio and priv->hw leaks sxgbe_drv_remove: clk and priv->hw leaks Signed-off-by: Francois Romieu Acked-by: Byungho An Signed-off-by: David S. Miller --- drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c | 27 ++++++++++++++++--------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c index a72688e..27e8c82 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c @@ -2113,11 +2113,11 @@ struct sxgbe_priv_data *sxgbe_drv_probe(struct device *device, /* allocate memory resources for Descriptor rings */ ret = txring_mem_alloc(priv); if (ret) - goto error_free_netdev; + goto error_free_hw; ret = rxring_mem_alloc(priv); if (ret) - goto error_free_netdev; + goto error_free_hw; ndev->netdev_ops = &sxgbe_netdev_ops; @@ -2163,7 +2163,7 @@ struct sxgbe_priv_data *sxgbe_drv_probe(struct device *device, if (IS_ERR(priv->sxgbe_clk)) { netdev_warn(ndev, "%s: warning: cannot get CSR clock\n", __func__); - goto error_clk_get; + goto error_napi_del; } /* If a specific clk_csr value is passed from the platform @@ -2182,24 +2182,27 @@ struct sxgbe_priv_data 
*sxgbe_drv_probe(struct device *device, if (ret < 0) { netdev_dbg(ndev, "%s: MDIO bus (id: %d) registration failed\n", __func__, priv->plat->bus_id); - goto error_mdio_register; + goto error_clk_put; } ret = register_netdev(ndev); if (ret) { pr_err("%s: ERROR %i registering the device\n", __func__, ret); - goto error_netdev_register; + goto error_mdio_unregister; } sxgbe_check_ether_addr(priv); return priv; -error_mdio_register: +error_mdio_unregister: + sxgbe_mdio_unregister(ndev); +error_clk_put: clk_put(priv->sxgbe_clk); -error_clk_get: -error_netdev_register: +error_napi_del: netif_napi_del(&priv->napi); +error_free_hw: + kfree(priv->hw); error_free_netdev: free_netdev(ndev); @@ -2224,11 +2227,15 @@ int sxgbe_drv_remove(struct net_device *ndev) priv->hw->mac->enable_tx(priv->ioaddr, false); priv->hw->mac->enable_rx(priv->ioaddr, false); - netif_napi_del(&priv->napi); + unregister_netdev(ndev); sxgbe_mdio_unregister(ndev); - unregister_netdev(ndev); + clk_put(priv->sxgbe_clk); + + netif_napi_del(&priv->napi); + + kfree(priv->hw); free_netdev(ndev); -- cgit v1.1 From a5e7ac5ce134d8f72f59631011fafa7bbf7ca174 Mon Sep 17 00:00:00 2001 From: Erik Hugne Date: Thu, 3 Apr 2014 08:28:01 +0200 Subject: tipc: fix regression bug where node events are not being generated Commit 5902385a2440a55f005b266c93e0bb9398e5a62b ("tipc: obsolete the remote management feature") introduces a regression where node topology events are not being generated because the publication that triggers this: {0, , } is no longer available. This will break applications that rely on node events to discover when nodes join/leave a cluster. We fix this by advertising the node publication when TIPC enters networking mode, and withdraws it upon shutdown. Signed-off-by: Erik Hugne Reviewed-by: Jon Maloy Reviewed-by: Ying Xue Signed-off-by: David S. 
Miller --- net/tipc/net.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/tipc/net.c b/net/tipc/net.c index 0374a81..4c564eb 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -182,6 +182,8 @@ void tipc_net_start(u32 addr) tipc_bclink_init(); write_unlock_bh(&tipc_net_lock); + tipc_nametbl_publish(TIPC_CFG_SRV, tipc_own_addr, tipc_own_addr, + TIPC_ZONE_SCOPE, 0, tipc_own_addr); pr_info("Started in network mode\n"); pr_info("Own node address %s, network identity %u\n", tipc_addr_string_fill(addr_string, tipc_own_addr), tipc_net_id); @@ -192,6 +194,7 @@ void tipc_net_stop(void) if (!tipc_own_addr) return; + tipc_nametbl_withdraw(TIPC_CFG_SRV, tipc_own_addr, 0, tipc_own_addr); write_lock_bh(&tipc_net_lock); tipc_bearer_stop(); tipc_bclink_stop(); -- cgit v1.1 From 240a12d58e3935945c8c0f767134f3da1ac05371 Mon Sep 17 00:00:00 2001 From: Philipp Zabel Date: Thu, 3 Apr 2014 11:28:10 +0200 Subject: net: Micrel KSZ8864RMN 4-port managed switch support This patch adds support for the Micrel KSZ8864RMN switch to the spi_ks8995 driver. The KSZ8864RMN switch has a wider 256-byte register space. Signed-off-by: Philipp Zabel Signed-off-by: David S. 
Miller --- drivers/net/phy/spi_ks8995.c | 52 ++++++++++++++++++++++++++++++++++---------- 1 file changed, 40 insertions(+), 12 deletions(-) diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c index 4cf5fb9..22b047f 100644 --- a/drivers/net/phy/spi_ks8995.c +++ b/drivers/net/phy/spi_ks8995.c @@ -1,5 +1,5 @@ /* - * SPI driver for Micrel/Kendin KS8995M ethernet switch + * SPI driver for Micrel/Kendin KS8995M and KSZ8864RMN ethernet switches * * Copyright (C) 2008 Gabor Juhos * @@ -70,7 +70,10 @@ #define KS8995_REG_IAD1 0x76 /* Indirect Access Data 1 */ #define KS8995_REG_IAD0 0x77 /* Indirect Access Data 0 */ +#define KSZ8864_REG_ID1 0xfe /* Chip ID in bit 7 */ + #define KS8995_REGS_SIZE 0x80 +#define KSZ8864_REGS_SIZE 0x100 #define ID1_CHIPID_M 0xf #define ID1_CHIPID_S 4 @@ -94,6 +97,7 @@ struct ks8995_switch { struct spi_device *spi; struct mutex lock; struct ks8995_pdata *pdata; + struct bin_attribute regs_attr; }; static inline u8 get_chip_id(u8 val) @@ -216,11 +220,11 @@ static ssize_t ks8995_registers_read(struct file *filp, struct kobject *kobj, dev = container_of(kobj, struct device, kobj); ks8995 = dev_get_drvdata(dev); - if (unlikely(off > KS8995_REGS_SIZE)) + if (unlikely(off > ks8995->regs_attr.size)) return 0; - if ((off + count) > KS8995_REGS_SIZE) - count = KS8995_REGS_SIZE - off; + if ((off + count) > ks8995->regs_attr.size) + count = ks8995->regs_attr.size - off; if (unlikely(!count)) return count; @@ -238,11 +242,11 @@ static ssize_t ks8995_registers_write(struct file *filp, struct kobject *kobj, dev = container_of(kobj, struct device, kobj); ks8995 = dev_get_drvdata(dev); - if (unlikely(off >= KS8995_REGS_SIZE)) + if (unlikely(off >= ks8995->regs_attr.size)) return -EFBIG; - if ((off + count) > KS8995_REGS_SIZE) - count = KS8995_REGS_SIZE - off; + if ((off + count) > ks8995->regs_attr.size) + count = ks8995->regs_attr.size - off; if (unlikely(!count)) return count; @@ -251,7 +255,7 @@ static ssize_t ks8995_registers_write(struct 
file *filp, struct kobject *kobj, } -static struct bin_attribute ks8995_registers_attr = { +static const struct bin_attribute ks8995_registers_attr = { .attr = { .name = "registers", .mode = S_IRUSR | S_IWUSR, @@ -306,20 +310,44 @@ static int ks8995_probe(struct spi_device *spi) goto err_drvdata; } + memcpy(&ks->regs_attr, &ks8995_registers_attr, sizeof(ks->regs_attr)); + if (get_chip_id(ids[1]) != CHIPID_M) { + u8 val; + + /* Check if this is a KSZ8864RMN */ + err = ks8995_read(ks, &val, KSZ8864_REG_ID1, sizeof(val)); + if (err < 0) { + dev_err(&spi->dev, + "unable to read chip id register, err=%d\n", + err); + goto err_drvdata; + } + if ((val & 0x80) == 0) { + dev_err(&spi->dev, "unknown chip:%02x,0\n", ids[1]); + goto err_drvdata; + } + ks->regs_attr.size = KSZ8864_REGS_SIZE; + } + err = ks8995_reset(ks); if (err) goto err_drvdata; - err = sysfs_create_bin_file(&spi->dev.kobj, &ks8995_registers_attr); + err = sysfs_create_bin_file(&spi->dev.kobj, &ks->regs_attr); if (err) { dev_err(&spi->dev, "unable to create sysfs file, err=%d\n", err); goto err_drvdata; } - dev_info(&spi->dev, "KS89%02X device found, Chip ID:%01x, " - "Revision:%01x\n", ids[0], - get_chip_id(ids[1]), get_chip_rev(ids[1])); + if (get_chip_id(ids[1]) == CHIPID_M) { + dev_info(&spi->dev, + "KS8995 device found, Chip ID:%x, Revision:%x\n", + get_chip_id(ids[1]), get_chip_rev(ids[1])); + } else { + dev_info(&spi->dev, "KSZ8864 device found, Revision:%x\n", + get_chip_rev(ids[1])); + } return 0; -- cgit v1.1 From e33d0ba8047b049c9262fdb1fcafb93cb52ceceb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 3 Apr 2014 09:28:10 -0700 Subject: net-gro: reset skb->truesize in napi_reuse_skb() Recycling skb always had been very tough... This time it appears GRO layer can accumulate skb->truesize adjustments made by drivers when they attach a fragment to skb. skb_gro_receive() can only subtract from skb->truesize the used part of a fragment. 
I spotted this problem seeing TcpExtPruneCalled and TcpExtTCPRcvCollapsed that were unexpected with a recent kernel, where TCP receive window should be sized properly to accept traffic coming from a driver not overshooting skb->truesize. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/core/dev.c b/net/core/dev.c index 75e88e0..5777018 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4043,6 +4043,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->vlan_tci = 0; skb->dev = napi->dev; skb->skb_iif = 0; + skb->truesize = SKB_TRUESIZE(skb_end_offset(skb)); napi->skb = skb; } -- cgit v1.1 From e5ac6eafba887821044c65b6fe59d9eb8b7c7f61 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Mon, 17 Mar 2014 22:27:50 +0100 Subject: netfilter: connlimit: fix UP build cannot use ARRAY_SIZE() if spinlock_t is empty struct. Fixes: 1442e7507dd597 ("netfilter: connlimit: use keyed locks") Reported-by: kbuild test robot Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connlimit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index 458464e..a6e129e 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -377,7 +377,7 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) return -ENOMEM; } - for (i = 0; i < ARRAY_SIZE(info->data->locks); ++i) + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) spin_lock_init(&info->data->locks[i]); for (i = 0; i < ARRAY_SIZE(info->data->climit_root4); ++i) -- cgit v1.1 From e00b437b3d6d4d26ecd95108b575ee1bcfcb478f Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Thu, 20 Mar 2014 11:53:39 +0100 Subject: netfilter: connlimit: move lock array out of struct connlimit_data Eric points out that the locks can be global. 
Moreover, both Jesper and Eric note that using only 32 locks increases false sharing as only two cache lines are used. This increases locks to 256 (16 cache lines assuming 64byte cacheline and 4 bytes per spinlock). Suggested-by: Jesper Dangaard Brouer Suggested-by: Eric Dumazet Signed-off-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_connlimit.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/net/netfilter/xt_connlimit.c b/net/netfilter/xt_connlimit.c index a6e129e..fbc66bb 100644 --- a/net/netfilter/xt_connlimit.c +++ b/net/netfilter/xt_connlimit.c @@ -32,8 +32,14 @@ #include #include -#define CONNLIMIT_SLOTS 32 -#define CONNLIMIT_LOCK_SLOTS 32 +#define CONNLIMIT_SLOTS 256U + +#ifdef CONFIG_LOCKDEP +#define CONNLIMIT_LOCK_SLOTS 8U +#else +#define CONNLIMIT_LOCK_SLOTS 256U +#endif + #define CONNLIMIT_GC_MAX_NODES 8 /* we will save the tuples of all connections we care about */ @@ -49,10 +55,11 @@ struct xt_connlimit_rb { union nf_inet_addr addr; /* search key */ }; +static spinlock_t xt_connlimit_locks[CONNLIMIT_LOCK_SLOTS] __cacheline_aligned_in_smp; + struct xt_connlimit_data { struct rb_root climit_root4[CONNLIMIT_SLOTS]; struct rb_root climit_root6[CONNLIMIT_SLOTS]; - spinlock_t locks[CONNLIMIT_LOCK_SLOTS]; }; static u_int32_t connlimit_rnd __read_mostly; @@ -297,11 +304,11 @@ static int count_them(struct net *net, root = &data->climit_root4[hash]; } - spin_lock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); + spin_lock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); count = count_tree(net, root, tuple, addr, mask, family); - spin_unlock_bh(&data->locks[hash % CONNLIMIT_LOCK_SLOTS]); + spin_unlock_bh(&xt_connlimit_locks[hash % CONNLIMIT_LOCK_SLOTS]); return count; } @@ -377,9 +384,6 @@ static int connlimit_mt_check(const struct xt_mtchk_param *par) return -ENOMEM; } - for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) - spin_lock_init(&info->data->locks[i]); - for (i = 0; i < 
ARRAY_SIZE(info->data->climit_root4); ++i) info->data->climit_root4[i] = RB_ROOT; for (i = 0; i < ARRAY_SIZE(info->data->climit_root6); ++i) @@ -435,11 +439,14 @@ static struct xt_match connlimit_mt_reg __read_mostly = { static int __init connlimit_mt_init(void) { - int ret; + int ret, i; BUILD_BUG_ON(CONNLIMIT_LOCK_SLOTS > CONNLIMIT_SLOTS); BUILD_BUG_ON((CONNLIMIT_SLOTS % CONNLIMIT_LOCK_SLOTS) != 0); + for (i = 0; i < CONNLIMIT_LOCK_SLOTS; ++i) + spin_lock_init(&xt_connlimit_locks[i]); + connlimit_conn_cachep = kmem_cache_create("xt_connlimit_conn", sizeof(struct xt_connlimit_conn), 0, 0, NULL); -- cgit v1.1 From a00e76349f3564bb8129fc0510dfd93248c3084d Mon Sep 17 00:00:00 2001 From: Alexey Perevalov Date: Wed, 19 Mar 2014 10:58:42 +0400 Subject: netfilter: x_tables: allow to use cgroup match for LOCAL_IN nf hooks This simple modification allows iptables to work with INPUT chain in combination with cgroup module. It could be useful for counting ingress traffic per cgroup with nfacct netfilter module. There were no problems to count the egress traffic that way formerly. It's possible to get classified sk_buff after PREROUTING, due to socket lookup being done in early_demux (tcp_v4_early_demux). Also it works for udp as well. Trivial usage example, assuming we're in the same shell every step and we have enough permissions: 1) Classic net_cls cgroup initialization: mkdir /sys/fs/cgroup/net_cls mount -t cgroup -o net_cls net_cls /sys/fs/cgroup/net_cls 2) Set up cgroup for interesting application: mkdir /sys/fs/cgroup/net_cls/wget echo 1 > /sys/fs/cgroup/net_cls/wget/net_cls.classid echo $BASHPID > /sys/fs/cgroup/net_cls/wget/cgroup.procs 3) Create kernel counters: nfacct add wget-cgroup-in iptables -A INPUT -m cgroup ! --cgroup 1 -m nfacct --nfacct-name wget-cgroup-in nfacct add wget-cgroup-out iptables -A OUTPUT -m cgroup ! 
--cgroup 1 -m nfacct --nfacct-name wget-cgroup-out 4) Network usage: wget https://www.kernel.org/pub/linux/kernel/v3.x/testing/linux-3.14-rc6.tar.xz 5) Check results: nfacct list Cgroup approach is being used for the DataUsage (counting & blocking traffic) feature for Samsung's modification of the Tizen OS. Signed-off-by: Alexey Perevalov Acked-by: Daniel Borkmann Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_cgroup.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c index 9a8e77e7..f4e8330 100644 --- a/net/netfilter/xt_cgroup.c +++ b/net/netfilter/xt_cgroup.c @@ -54,7 +54,8 @@ static struct xt_match cgroup_mt_reg __read_mostly = { .matchsize = sizeof(struct xt_cgroup_info), .me = THIS_MODULE, .hooks = (1 << NF_INET_LOCAL_OUT) | - (1 << NF_INET_POST_ROUTING), + (1 << NF_INET_POST_ROUTING) | + (1 << NF_INET_LOCAL_IN), }; static int __init cgroup_mt_init(void) -- cgit v1.1 From b8ddd9eac8788b0aa9a9d4e09d76dc9e1667bb2c Mon Sep 17 00:00:00 2001 From: Kirill Tkhai Date: Wed, 26 Mar 2014 14:37:59 +0400 Subject: netfilter: Add {ipt,ip6t}_osf aliases for xt_osf There are no these aliases, so kernel can not request appropriate match table: $ iptables -I INPUT -p tcp -m osf --genre Windows --ttl 2 -j DROP iptables: No chain/target/match by that name. setsockopt() requests ipt_osf module, which is not present. Add the aliases. 
Signed-off-by: Kirill Tkhai Signed-off-by: Pablo Neira Ayuso --- net/netfilter/xt_osf.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c index 7174611..c529161 100644 --- a/net/netfilter/xt_osf.c +++ b/net/netfilter/xt_osf.c @@ -422,4 +422,6 @@ module_exit(xt_osf_fini); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Evgeniy Polyakov "); MODULE_DESCRIPTION("Passive OS fingerprint matching."); +MODULE_ALIAS("ipt_osf"); +MODULE_ALIAS("ip6t_osf"); MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_OSF); -- cgit v1.1 From 223b02d923ecd7c84cf9780bb3686f455d279279 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Fri, 28 Mar 2014 13:54:32 +0400 Subject: netfilter: nf_conntrack: reserve two bytes for nf_ct_ext->len "len" contains sizeof(nf_ct_ext) and size of extensions. In a worst case it can contain all extensions. Below you can find sizes for all types of extensions. Their sum is definitely bigger than 256. nf_ct_ext_types[0]->len = 24 nf_ct_ext_types[1]->len = 32 nf_ct_ext_types[2]->len = 24 nf_ct_ext_types[3]->len = 32 nf_ct_ext_types[4]->len = 152 nf_ct_ext_types[5]->len = 2 nf_ct_ext_types[6]->len = 16 nf_ct_ext_types[7]->len = 8 I have seen "len" up to 280 and my host crashes without this patch. The right way to fix this problem is reducing the size of the ecache extension (4) and Florian is going to do this, but these changes will be quite large to be appropriate for a stable tree. Fixes: 5b423f6a40a0 (netfilter: nf_conntrack: fix racy timer handling with reliable) Cc: Pablo Neira Ayuso Cc: Patrick McHardy Cc: Jozsef Kadlecsik Cc: "David S. 
Miller" Signed-off-by: Andrey Vagin Signed-off-by: Pablo Neira Ayuso --- include/net/netfilter/nf_conntrack_extend.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/netfilter/nf_conntrack_extend.h b/include/net/netfilter/nf_conntrack_extend.h index 956b175..55d1504 100644 --- a/include/net/netfilter/nf_conntrack_extend.h +++ b/include/net/netfilter/nf_conntrack_extend.h @@ -47,8 +47,8 @@ enum nf_ct_ext_id { /* Extensions: optional stuff which isn't permanently in struct. */ struct nf_ct_ext { struct rcu_head rcu; - u8 offset[NF_CT_EXT_NUM]; - u8 len; + u16 offset[NF_CT_EXT_NUM]; + u16 len; char data[0]; }; -- cgit v1.1 From a9bdd8365684810e3de804f8c51e52c26a5eccbb Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 24 Mar 2014 15:10:37 +0100 Subject: netfilter: nf_tables: set names cannot be larger than 15 bytes Currently, nf_tables trims off the set name if it exceeds 15 bytes, so explicitly reject set names that are too large. Reported-by: Giuseppe Longo Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 33045a5..43ae487 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -1946,7 +1946,8 @@ static const struct nft_set_ops *nft_select_set_ops(const struct nlattr * const static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = { [NFTA_SET_TABLE] = { .type = NLA_STRING }, - [NFTA_SET_NAME] = { .type = NLA_STRING }, + [NFTA_SET_NAME] = { .type = NLA_STRING, + .len = IFNAMSIZ - 1 }, [NFTA_SET_FLAGS] = { .type = NLA_U32 }, [NFTA_SET_KEY_TYPE] = { .type = NLA_U32 }, [NFTA_SET_KEY_LEN] = { .type = NLA_U32 }, -- cgit v1.1 From 2fec6bb6f484b1a88b4a325724234d6cfd08c918 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Mon, 31 Mar 2014 12:26:39 +0200 Subject: netfilter: nf_tables: fix wrong format in request_module() The intended format in 
request_module is %.*s instead of %*.s. Reported-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso --- net/netfilter/nf_tables_api.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 43ae487..3fd159d 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -152,8 +152,8 @@ nf_tables_chain_type_lookup(const struct nft_af_info *afi, #ifdef CONFIG_MODULES if (autoload) { nfnl_unlock(NFNL_SUBSYS_NFTABLES); - request_module("nft-chain-%u-%*.s", afi->family, - nla_len(nla)-1, (const char *)nla_data(nla)); + request_module("nft-chain-%u-%.*s", afi->family, + nla_len(nla), (const char *)nla_data(nla)); nfnl_lock(NFNL_SUBSYS_NFTABLES); type = __nf_tables_chain_type_lookup(afi->family, nla); if (type != NULL) -- cgit v1.1 From 7db8df02797e29cfa7d62a7e0b19f41e64b8433e Mon Sep 17 00:00:00 2001 From: "zheng.li" Date: Wed, 2 Apr 2014 11:01:48 +0800 Subject: bonding: Inactive slaves should keep inactive flag's value bond_open is not setting the inactive flag correctly for some modes (alb and tlb), resulting in error behavior if the bond has been administratively set down and then back up. This effect should not occur when slaves are added while the bond is up; it's something that only happens after a down/up bounce of the bond. For example, in bond tlb or alb mode, domu send some ARP request which go out from dom0 bond's active slave, then the ARP broadcast request packets go back to inactive slave from switch, because the inactive slave's inactive flag is zero, kernel will receive the packets and pass them to bridge that cause dom0's bridge map domu's MAC address to port of bond, bridge should map domu's MAC to port of vif. Signed-off-by: Zheng Li Signed-off-by: Jay Vosburgh Signed-off-by: David S. 
Miller --- drivers/net/bonding/bond_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c index 95a6ca7..d9f8546 100644 --- a/drivers/net/bonding/bond_main.c +++ b/drivers/net/bonding/bond_main.c @@ -3077,7 +3077,7 @@ static int bond_open(struct net_device *bond_dev) if (bond_has_slaves(bond)) { read_lock(&bond->curr_slave_lock); bond_for_each_slave(bond, slave, iter) { - if ((bond->params.mode == BOND_MODE_ACTIVEBACKUP) + if (USES_PRIMARY(bond->params.mode) && (slave != bond->curr_active_slave)) { bond_set_slave_inactive_flags(slave, BOND_SLAVE_NOTIFY_NOW); -- cgit v1.1 From a50988a11d1bbf58213f81d46623596bfb51a969 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Fri, 4 Apr 2014 02:16:23 +0200 Subject: net: smc911x: Remove unused local variable The ioaddr local variable is assigned to but never used in the smc911x_rx_dma_irq() function, remove it. Signed-off-by: Laurent Pinchart Signed-off-by: David S. Miller --- drivers/net/ethernet/smsc/smc911x.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/smsc/smc911x.c b/drivers/net/ethernet/smsc/smc911x.c index 66b05e6..1c44e67 100644 --- a/drivers/net/ethernet/smsc/smc911x.c +++ b/drivers/net/ethernet/smsc/smc911x.c @@ -1211,7 +1211,6 @@ static void smc911x_rx_dma_irq(int dma, void *data) { struct net_device *dev = (struct net_device *)data; - unsigned long ioaddr = dev->base_addr; struct smc911x_local *lp = netdev_priv(dev); struct sk_buff *skb = lp->current_rx_skb; unsigned long flags; -- cgit v1.1 From 65e71ff342f8c2c8a9962ebc1fbceef4c7a529e0 Mon Sep 17 00:00:00 2001 From: Sachin Kamat Date: Fri, 4 Apr 2014 11:43:35 +0530 Subject: net: bcmgenet: Remove unnecessary version.h inclusion version.h inclusion is not necessary as detected by versioncheck. Signed-off-by: Sachin Kamat Signed-off-by: David S. 
Miller --- drivers/net/ethernet/broadcom/genet/bcmgenet.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/broadcom/genet/bcmgenet.c b/drivers/net/ethernet/broadcom/genet/bcmgenet.c index adf8acb..0966bd0 100644 --- a/drivers/net/ethernet/broadcom/genet/bcmgenet.c +++ b/drivers/net/ethernet/broadcom/genet/bcmgenet.c @@ -34,7 +34,6 @@ #include #include #include -#include #include #include #include -- cgit v1.1 From 00aefceb2fffcf4ea2fbc97ef5d4f79ef2668ecc Mon Sep 17 00:00:00 2001 From: Zoltan Kiss Date: Fri, 4 Apr 2014 15:45:24 +0100 Subject: xen-netback: Trivial format string fix There is a "%" after pending_idx instead of ":". Signed-off-by: Zoltan Kiss Signed-off-by: David S. Miller --- drivers/net/xen-netback/netback.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/xen-netback/netback.c b/drivers/net/xen-netback/netback.c index 99c8f09..7666540 100644 --- a/drivers/net/xen-netback/netback.c +++ b/drivers/net/xen-netback/netback.c @@ -956,7 +956,7 @@ static int xenvif_tx_check_gop(struct xenvif *vif, if (unlikely(err)) { if (net_ratelimit()) netdev_dbg(vif->dev, - "Grant copy of header failed! status: %d pending_idx% %u ref: %u\n", + "Grant copy of header failed! status: %d pending_idx: %u ref: %u\n", (*gopp_copy)->status, pending_idx, (*gopp_copy)->source.u.ref); @@ -985,7 +985,7 @@ check_frags: /* Error on this fragment: respond to client with an error. */ if (net_ratelimit()) netdev_dbg(vif->dev, - "Grant map of %d. frag failed! status: %d pending_idx% %u ref: %u\n", + "Grant map of %d. frag failed! 
status: %d pending_idx: %u ref: %u\n", i, gop_map->status, pending_idx, -- cgit v1.1 From c58dd2dd443c26d856a168db108a0cd11c285bf3 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 4 Apr 2014 17:57:45 +0200 Subject: netfilter: Can't fail and free after table replacement All xtables variants suffer from the defect that the copy_to_user() to copy the counters to user memory may fail after the table has already been exchanged and thus exposed. Return an error at this point will result in freeing the already exposed table. Any subsequent packet processing will result in a kernel panic. We can't copy the counters before exposing the new tables as we want provide the counter state after the old table has been unhooked. Therefore convert this into a silent error. Cc: Florian Westphal Signed-off-by: Thomas Graf Signed-off-by: Pablo Neira Ayuso --- net/bridge/netfilter/ebtables.c | 5 ++--- net/ipv4/netfilter/arp_tables.c | 6 ++++-- net/ipv4/netfilter/ip_tables.c | 6 ++++-- net/ipv6/netfilter/ip6_tables.c | 6 ++++-- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 0e474b1..1059ed3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1044,10 +1044,9 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, if (repl->num_counters && copy_to_user(repl->counters, counterstmp, repl->num_counters * sizeof(struct ebt_counter))) { - ret = -EFAULT; + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ebtables: counters copy to user failed while replacing table\n"); } - else - ret = 0; /* decrease module count and free resources */ EBT_ENTRY_ITERATE(table->entries, table->entries_size, diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 59da7cd..f95b6f9 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -1044,8 +1044,10 @@ static int 
__do_replace(struct net *net, const char *name, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("arptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 718dfbd..99e810f 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1231,8 +1231,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("iptables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 710238f..e080fbb 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1241,8 +1241,10 @@ __do_replace(struct net *net, const char *name, unsigned int valid_hooks, xt_free_table_info(oldinfo); if (copy_to_user(counters_ptr, counters, - sizeof(struct xt_counters) * num_counters) != 0) - ret = -EFAULT; + sizeof(struct xt_counters) * num_counters) != 0) { + /* Silent error, can't fail, new table is already in place */ + net_warn_ratelimited("ip6tables: counters copy to user failed while replacing table\n"); + } vfree(counters); xt_table_unlock(t); return ret; -- cgit v1.1 From 5f9fde5f799df7156eeb3fa58282e9fd2f38a5f8 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Sat, 5 Apr 2014 01:04:03 +0200 Subject: net: filter: be more defensive on div/mod 
by X==0 The old interpreter behaviour was that we returned with 0 whenever we found a division by 0 would take place. In the new interpreter we would currently just skip that instead and continue execution. It's true that a value of 0 as return might not be appropriate in all cases, but current users (socket filters -> drop packet, seccomp -> SECCOMP_RET_KILL, cls_bpf -> unclassified, etc) seem fine with that behaviour. Better this than undefined BPF program behaviour as it's expected that A contains the result of the division. In future, as more use cases open up, we could further adapt this return value to our needs, if necessary. So reintroduce return of 0 for division by 0 as in the old interpreter. Also in case of K which is guaranteed to be 32bit wide, sk_chk_filter() already takes care of preventing division by 0 invoked through K, so we can generally spare us these tests. Signed-off-by: Daniel Borkmann Reviewed-by: Alexei Starovoitov Signed-off-by: David S. Miller --- net/core/filter.c | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 765556b..e08b382 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -295,43 +295,43 @@ select_insn: (*(s64 *) &A) >>= K; CONT; BPF_ALU64_BPF_MOD_BPF_X: + if (unlikely(X == 0)) + return 0; tmp = A; - if (X) - A = do_div(tmp, X); + A = do_div(tmp, X); CONT; BPF_ALU_BPF_MOD_BPF_X: + if (unlikely(X == 0)) + return 0; tmp = (u32) A; - if (X) - A = do_div(tmp, (u32) X); + A = do_div(tmp, (u32) X); CONT; BPF_ALU64_BPF_MOD_BPF_K: tmp = A; - if (K) - A = do_div(tmp, K); + A = do_div(tmp, K); CONT; BPF_ALU_BPF_MOD_BPF_K: tmp = (u32) A; - if (K) - A = do_div(tmp, (u32) K); + A = do_div(tmp, (u32) K); CONT; BPF_ALU64_BPF_DIV_BPF_X: - if (X) - do_div(A, X); + if (unlikely(X == 0)) + return 0; + do_div(A, X); CONT; BPF_ALU_BPF_DIV_BPF_X: + if (unlikely(X == 0)) + return 0; tmp = (u32) A; - if (X) - do_div(tmp, (u32) X); + do_div(tmp, (u32) 
X); A = (u32) tmp; CONT; BPF_ALU64_BPF_DIV_BPF_K: - if (K) - do_div(A, K); + do_div(A, K); CONT; BPF_ALU_BPF_DIV_BPF_K: tmp = (u32) A; - if (K) - do_div(tmp, (u32) K); + do_div(tmp, (u32) K); A = (u32) tmp; CONT; BPF_ALU_BPF_END_BPF_TO_BE: -- cgit v1.1 From c28c7a6accb680d88b8586a4b32be4e54d60715e Mon Sep 17 00:00:00 2001 From: Jean Sacren Date: Sat, 5 Apr 2014 00:29:00 -0600 Subject: sxgbe: fix duplicate #include headers The commit 1edb9ca69e8a ("net: sxgbe: add basic framework for Samsung 10Gb ethernet driver") added support for Samsung 10Gb ethernet driver(sxgbe) with a minor issue of including linux/io.h header twice in sxgbe_dma.c file. Fix the duplicate #include by deleting the top one so that all the rest good #include headers would be preserved in the alphabetical order. Signed-off-by: Jean Sacren Cc: Byungho An Cc: Girish K S Cc: Siva Reddy Kallam Cc: Vipul Pandya Acked-by: Byungho An Signed-off-by: David S. Miller --- drivers/net/ethernet/samsung/sxgbe/sxgbe_dma.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_dma.c b/drivers/net/ethernet/samsung/sxgbe/sxgbe_dma.c index 28f89c41..4d989ff 100644 --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_dma.c +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_dma.c @@ -9,7 +9,6 @@ * it under the terms of the GNU General Public License version 2 as * published by the Free Software Foundation. */ -#include #include #include #include -- cgit v1.1 From 6c6a9855560d3cfa93120f2ab07b8ea0110f953d Mon Sep 17 00:00:00 2001 From: Jean Sacren Date: Sat, 5 Apr 2014 00:29:01 -0600 Subject: mac802154: fix duplicate #include headers The commit e6278d92005e ("mac802154: use header operations to create/parse headers") included the header net/ieee802154_netdev.h which had been included by the commit b70ab2e87f17 ("ieee802154: enforce consistent endianness in the 802.15.4 stack"). Fix this duplicate #include by deleting the latter one as the required header has already been in place. 
Signed-off-by: Jean Sacren Cc: Alexander Smirnov Cc: Dmitry Eremin-Solenikov Cc: Phoebe Buckheister Cc: linux-zigbee-devel@lists.sourceforge.net Signed-off-by: David S. Miller --- net/mac802154/mib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/mac802154/mib.c b/net/mac802154/mib.c index 153bd1d..f0991f2 100644 --- a/net/mac802154/mib.c +++ b/net/mac802154/mib.c @@ -26,7 +26,6 @@ #include #include #include -#include #include "mac802154.h" -- cgit v1.1 From 39d7f3200801a993bff15ea89d83e99d0c479594 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Sat, 5 Apr 2014 13:49:26 +0200 Subject: at86rf230: fix MAX_CSMA_RETRIES parameter This patch fixes a copy&paste error for setting the MAX_CSMA_RETRIES value of the at86rf212 chip which was introduced by commit f2fdd67c6bc89de0100410efb37de69b1c98ac03 ("ieee802154: enable smart transmitter features of RF212") Signed-off-by: Alexander Aring Cc: Phoebe Buckheister Signed-off-by: David S. Miller --- drivers/net/ieee802154/at86rf230.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ieee802154/at86rf230.c b/drivers/net/ieee802154/at86rf230.c index 89417ac..430bb0d 100644 --- a/drivers/net/ieee802154/at86rf230.c +++ b/drivers/net/ieee802154/at86rf230.c @@ -852,7 +852,7 @@ at86rf212_set_csma_params(struct ieee802154_dev *dev, u8 min_be, u8 max_be, if (rc) return rc; - return at86rf230_write_subreg(lp, SR_MAX_CSMA_RETRIES, max_be); + return at86rf230_write_subreg(lp, SR_MAX_CSMA_RETRIES, retries); } static int -- cgit v1.1 From 065d7e39563b092dbb429373bd8f0f2295768cea Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Sun, 6 Apr 2014 15:56:14 +0200 Subject: tipc: Let tipc_release() return 0 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/tipc/socket.c: In function ‘tipc_release’: net/tipc/socket.c:352: warning: ‘res’ is used uninitialized in this function Introduced by commit 24be34b5a0c9114541891d29dff1152bb1a8df34 ("tipc: eliminate upcall 
function pointers between port and socket"), which removed the sole initializer of "res". Just return 0 to fix it. Signed-off-by: Geert Uytterhoeven Signed-off-by: David S. Miller --- net/tipc/socket.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 29b7f26..adc12e2 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -301,7 +301,6 @@ static int tipc_release(struct socket *sock) struct tipc_sock *tsk; struct tipc_port *port; struct sk_buff *buf; - int res; /* * Exit if socket isn't fully initialized (occurs when a failed accept() @@ -349,7 +348,7 @@ static int tipc_release(struct socket *sock) sock_put(sk); sock->sk = NULL; - return res; + return 0; } /** -- cgit v1.1 From c293fb785bdda64d88f197e6758a3c16ae83e569 Mon Sep 17 00:00:00 2001 From: Gilles Chanteperdrix Date: Sun, 6 Apr 2014 20:37:44 +0200 Subject: net/at91_ether: avoid NULL pointer dereference The at91_ether driver calls macb_mii_init passing a 'struct macb' structure whose tx_clk member is initialized to 0. However, macb_handle_link_change() expects tx_clk to be the result of a call to clk_get, and so IS_ERR(tx_clk) to be true if the clock is invalid. This causes an oops when booting Linux 3.14 on the csb637 board. The following change avoids this. Signed-off-by: Gilles Chanteperdrix Acked-by: Nicolas Ferre Signed-off-by: David S. 
Miller --- drivers/net/ethernet/cadence/at91_ether.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/cadence/at91_ether.c b/drivers/net/ethernet/cadence/at91_ether.c index ce75de9..4a79eda 100644 --- a/drivers/net/ethernet/cadence/at91_ether.c +++ b/drivers/net/ethernet/cadence/at91_ether.c @@ -342,6 +342,9 @@ static int __init at91ether_probe(struct platform_device *pdev) } clk_enable(lp->pclk); + lp->hclk = ERR_PTR(-ENOENT); + lp->tx_clk = ERR_PTR(-ENOENT); + /* Install the interrupt handler */ dev->irq = platform_get_irq(pdev, 0); res = devm_request_irq(&pdev->dev, dev->irq, at91ether_interrupt, 0, dev->name, dev); -- cgit v1.1 From 6f25cd47dcd2b9912c6e52aa833ba1614f7b5086 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Mon, 7 Apr 2014 17:18:30 +0200 Subject: pktgen: fix xmit test for BQL enabled devices Similarly as in commit 8e2f1a63f221 ("packet: fix packet_direct_xmit for BQL enabled drivers"), we test for __QUEUE_STATE_STACK_XOFF bit in pktgen's xmit, which would not fully fill the device's TX ring for BQL drivers that use netdev_tx_sent_queue(). Fix is to use, similarly as we do in packet sockets, netif_xmit_frozen_or_drv_stopped() test. Signed-off-by: Daniel Borkmann Cc: Eric Dumazet Signed-off-by: David S. 
Miller --- net/core/pktgen.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index d0dac57..d068ec2 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3340,7 +3340,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev) __netif_tx_lock_bh(txq); - if (unlikely(netif_xmit_frozen_or_stopped(txq))) { + if (unlikely(netif_xmit_frozen_or_drv_stopped(txq))) { ret = NETDEV_TX_BUSY; pkt_dev->last_ok = 0; goto unlock; -- cgit v1.1 From 6859e7df6d9045a461412777e63bd8cef12f9705 Mon Sep 17 00:00:00 2001 From: Veaceslav Falico Date: Mon, 7 Apr 2014 11:25:12 +0200 Subject: netdev: remove potentially harmful checks Currently we're checking a variable for != NULL after actually dereferencing it, in netdev_lower_get_next_private*(). It's counter-intuitive at best, and can lead to faulty usage (as it implies that the variable can be NULL), so fix it by removing the useless checks. Reported-by: Daniel Borkmann CC: "David S. Miller" CC: Eric Dumazet CC: Nicolas Dichtel CC: Jiri Pirko CC: stephen hemminger CC: Jerry Chu Signed-off-by: Veaceslav Falico Signed-off-by: David S. 
Miller --- net/core/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/core/dev.c b/net/core/dev.c index 5777018..14dac06 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4589,8 +4589,7 @@ void *netdev_lower_get_next_private(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = lower->list.next; + *iter = lower->list.next; return lower->private; } @@ -4618,8 +4617,7 @@ void *netdev_lower_get_next_private_rcu(struct net_device *dev, if (&lower->list == &dev->adj_list.lower) return NULL; - if (iter) - *iter = &lower->list; + *iter = &lower->list; return lower->private; } -- cgit v1.1 From 7563487cbf865284dcd35e9ef5a95380da046737 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Tue, 8 Apr 2014 12:23:09 +0300 Subject: isdnloop: several buffer overflows There are three buffer overflows addressed in this patch. 1) In isdnloop_fake_err() we add an 'E' to a 60 character string and then copy it into a 60 character buffer. I have made the destination buffer 64 characters and I've changed the sprintf() to a snprintf(). 2) In isdnloop_parse_cmd(), p points 6 characters into a 60 character buffer so we have 54 characters. The ->eazlist[] is 11 characters long. I have modified the code to return if the source buffer is too long. 3) In isdnloop_command() the cbuf[] array was 60 characters long but the max length of the string then can be up to 79 characters. I made the cbuf array 80 characters long and changed the sprintf() to snprintf(). I also removed the temporary "dial" buffer and changed it to use "p" directly. Unfortunately, we pass the "cbuf" string from isdnloop_command() to isdnloop_writecmd() which truncates anything over 60 characters to make it fit in card->omsg[]. (It can accept values up to 255 characters so long as there is a '\n' character every 60 characters). For now I have just fixed the memory corruption bug and left the other problems in this driver alone. 
Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- drivers/isdn/isdnloop/isdnloop.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/isdn/isdnloop/isdnloop.c b/drivers/isdn/isdnloop/isdnloop.c index e1f8748..5a4da94 100644 --- a/drivers/isdn/isdnloop/isdnloop.c +++ b/drivers/isdn/isdnloop/isdnloop.c @@ -518,9 +518,9 @@ static isdnloop_stat isdnloop_cmd_table[] = static void isdnloop_fake_err(isdnloop_card *card) { - char buf[60]; + char buf[64]; - sprintf(buf, "E%s", card->omsg); + snprintf(buf, sizeof(buf), "E%s", card->omsg); isdnloop_fake(card, buf, -1); isdnloop_fake(card, "NAK", -1); } @@ -903,6 +903,8 @@ isdnloop_parse_cmd(isdnloop_card *card) case 7: /* 0x;EAZ */ p += 3; + if (strlen(p) >= sizeof(card->eazlist[0])) + break; strcpy(card->eazlist[ch - 1], p); break; case 8: @@ -1133,7 +1135,7 @@ isdnloop_command(isdn_ctrl *c, isdnloop_card *card) { ulong a; int i; - char cbuf[60]; + char cbuf[80]; isdn_ctrl cmd; isdnloop_cdef cdef; @@ -1198,7 +1200,6 @@ isdnloop_command(isdn_ctrl *c, isdnloop_card *card) break; if ((c->arg & 255) < ISDNLOOP_BCH) { char *p; - char dial[50]; char dcode[4]; a = c->arg; @@ -1210,10 +1211,10 @@ isdnloop_command(isdn_ctrl *c, isdnloop_card *card) } else /* Normal Dial */ strcpy(dcode, "CAL"); - strcpy(dial, p); - sprintf(cbuf, "%02d;D%s_R%s,%02d,%02d,%s\n", (int) (a + 1), - dcode, dial, c->parm.setup.si1, - c->parm.setup.si2, c->parm.setup.eazmsn); + snprintf(cbuf, sizeof(cbuf), + "%02d;D%s_R%s,%02d,%02d,%s\n", (int) (a + 1), + dcode, p, c->parm.setup.si1, + c->parm.setup.si2, c->parm.setup.eazmsn); i = isdnloop_writecmd(cbuf, strlen(cbuf), 0, card); } break; -- cgit v1.1 From 52c35befb69b005c3fc5afdaae3a5717ad013411 Mon Sep 17 00:00:00 2001 From: Daniel Borkmann Date: Tue, 8 Apr 2014 17:26:13 +0200 Subject: net: sctp: wake up all assocs if sndbuf policy is per socket SCTP charges chunks for wmem accounting via skb->truesize in sctp_set_owner_w(), and sctp_wfree() respectively 
as the reverse operation. If a sender runs out of wmem, it needs to wait via sctp_wait_for_sndbuf(), and gets woken up by a call to __sctp_write_space() mostly via sctp_wfree(). __sctp_write_space() is being called per association. Although we assign sk->sk_write_space() to sctp_write_space(), which is then being done per socket, it is only used if send space is increased per socket option (SO_SNDBUF), as SOCK_USE_WRITE_QUEUE is set and therefore not invoked in sock_wfree(). Commit 4c3a5bdae293 ("sctp: Don't charge for data in sndbuf again when transmitting packet") fixed an issue where in case sctp_packet_transmit() manages to queue up more than sndbuf bytes, sctp_wait_for_sndbuf() will never be woken up again unless it is interrupted by a signal. However, a still remaining issue is that if net.sctp.sndbuf_policy=0, that is accounting per socket, and one-to-many sockets are in use, the reclaimed write space from sctp_wfree() is 'unfairly' handed back on the server to the association that is the lucky one to be woken up again via __sctp_write_space(), while the remaining associations are never woken up again (unless by a signal). The effect disappears with net.sctp.sndbuf_policy=1, that is wmem accounting per association, as it guarantees a fair share of wmem among associations. Therefore, if we have reclaimed memory in case of per socket accounting, wake all related associations to a socket in a fair manner, that is, traverse the socket association list starting from the current neighbour of the association and issue a __sctp_write_space() to everyone until we end up waking ourselves. This guarantees that no association is preferred over another and even if more associations are taken into the one-to-many session, all receivers will get messages from the server and are not stalled forever on high load. This setting still leaves the advantage of per socket accounting intact as an association can still use up global limits if unused by others. 
Fixes: 4eb701dfc618 ("[SCTP] Fix SCTP sendbuffer accouting.") Signed-off-by: Daniel Borkmann Cc: Thomas Graf Cc: Neil Horman Cc: Vlad Yasevich Acked-by: Vlad Yasevich Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/sctp/socket.c | 36 +++++++++++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 981aaf8..5f83a6a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -6593,6 +6593,40 @@ static void __sctp_write_space(struct sctp_association *asoc) } } +static void sctp_wake_up_waiters(struct sock *sk, + struct sctp_association *asoc) +{ + struct sctp_association *tmp = asoc; + + /* We do accounting for the sndbuf space per association, + * so we only need to wake our own association. + */ + if (asoc->ep->sndbuf_policy) + return __sctp_write_space(asoc); + + /* Accounting for the sndbuf space is per socket, so we + * need to wake up others, try to be fair and in case of + * other associations, let them have a go first instead + * of just doing a sctp_write_space() call. + * + * Note that we reach sctp_wake_up_waiters() only when + * associations free up queued chunks, thus we are under + * lock and the list of associations on a socket is + * guaranteed not to change. + */ + for (tmp = list_next_entry(tmp, asocs); 1; + tmp = list_next_entry(tmp, asocs)) { + /* Manually skip the head element. */ + if (&tmp->asocs == &((sctp_sk(sk))->ep->asocs)) + continue; + /* Wake up association. */ + __sctp_write_space(tmp); + /* We've reached the end. */ + if (tmp == asoc) + break; + } +} + /* Do accounting for the sndbuf space. * Decrement the used sndbuf space of the corresponding association by the * data size which was just transmitted(freed). @@ -6620,7 +6654,7 @@ static void sctp_wfree(struct sk_buff *skb) sk_mem_uncharge(sk, skb->truesize); sock_wfree(skb); - __sctp_write_space(asoc); + sctp_wake_up_waiters(sk, asoc); sctp_association_put(asoc); } -- cgit v1.1