diff options
Diffstat (limited to 'net')
54 files changed, 469 insertions, 249 deletions
diff --git a/net/can/bcm.c b/net/can/bcm.c index 8e999ff..8af9d25 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -1549,24 +1549,31 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, struct sockaddr_can *addr = (struct sockaddr_can *)uaddr; struct sock *sk = sock->sk; struct bcm_sock *bo = bcm_sk(sk); + int ret = 0; if (len < sizeof(*addr)) return -EINVAL; - if (bo->bound) - return -EISCONN; + lock_sock(sk); + + if (bo->bound) { + ret = -EISCONN; + goto fail; + } /* bind a device to this socket */ if (addr->can_ifindex) { struct net_device *dev; dev = dev_get_by_index(&init_net, addr->can_ifindex); - if (!dev) - return -ENODEV; - + if (!dev) { + ret = -ENODEV; + goto fail; + } if (dev->type != ARPHRD_CAN) { dev_put(dev); - return -ENODEV; + ret = -ENODEV; + goto fail; } bo->ifindex = dev->ifindex; @@ -1577,17 +1584,24 @@ static int bcm_connect(struct socket *sock, struct sockaddr *uaddr, int len, bo->ifindex = 0; } - bo->bound = 1; - if (proc_dir) { /* unique socket address as filename */ sprintf(bo->procname, "%lu", sock_i_ino(sk)); bo->bcm_proc_read = proc_create_data(bo->procname, 0644, proc_dir, &bcm_proc_fops, sk); + if (!bo->bcm_proc_read) { + ret = -ENOMEM; + goto fail; + } } - return 0; + bo->bound = 1; + +fail: + release_sock(sk); + + return ret; } static int bcm_recvmsg(struct socket *sock, struct msghdr *msg, size_t size, diff --git a/net/ceph/ceph_fs.c b/net/ceph/ceph_fs.c index 7d54e94..dcbe67f 100644 --- a/net/ceph/ceph_fs.c +++ b/net/ceph/ceph_fs.c @@ -34,7 +34,8 @@ void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, fl->stripe_count = le32_to_cpu(legacy->fl_stripe_count); fl->object_size = le32_to_cpu(legacy->fl_object_size); fl->pool_id = le32_to_cpu(legacy->fl_pg_pool); - if (fl->pool_id == 0) + if (fl->pool_id == 0 && fl->stripe_unit == 0 && + fl->stripe_count == 0 && fl->object_size == 0) fl->pool_id = -1; } EXPORT_SYMBOL(ceph_file_layout_from_legacy); diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index d9bf7a1..e6ae15b 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -4094,6 +4094,7 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) osd_init(&osdc->homeless_osd); osdc->homeless_osd.o_osdc = osdc; osdc->homeless_osd.o_osd = CEPH_HOMELESS_OSD; + osdc->last_linger_id = CEPH_LINGER_ID_START; osdc->linger_requests = RB_ROOT; osdc->map_checks = RB_ROOT; osdc->linger_map_checks = RB_ROOT; diff --git a/net/core/dev.c b/net/core/dev.c index 7385c1a..6deba68 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1765,19 +1765,14 @@ EXPORT_SYMBOL_GPL(is_skb_forwardable); int __dev_forward_skb(struct net_device *dev, struct sk_buff *skb) { - if (skb_orphan_frags(skb, GFP_ATOMIC) || - unlikely(!is_skb_forwardable(dev, skb))) { - atomic_long_inc(&dev->rx_dropped); - kfree_skb(skb); - return NET_RX_DROP; - } + int ret = ____dev_forward_skb(dev, skb); - skb_scrub_packet(skb, true); - skb->priority = 0; - skb->protocol = eth_type_trans(skb, dev); - skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + if (likely(!ret)) { + skb->protocol = eth_type_trans(skb, dev); + skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); + } - return 0; + return ret; } EXPORT_SYMBOL_GPL(__dev_forward_skb); @@ -2599,7 +2594,7 @@ int skb_checksum_help(struct sk_buff *skb) goto out; } - *(__sum16 *)(skb->data + offset) = csum_fold(csum); + *(__sum16 *)(skb->data + offset) = csum_fold(csum) ?: CSUM_MANGLED_0; out_set_summed: skb->ip_summed = CHECKSUM_NONE; out: diff --git a/net/core/filter.c b/net/core/filter.c index cd9e2ba..dece94f 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -1628,6 +1628,19 @@ static inline int __bpf_rx_skb(struct net_device *dev, struct sk_buff *skb) return dev_forward_skb(dev, skb); } +static inline int __bpf_rx_skb_no_mac(struct net_device *dev, + struct sk_buff *skb) +{ + int ret = ____dev_forward_skb(dev, skb); + + if (likely(!ret)) { + skb->dev = dev; + ret = netif_rx(skb); + } + + return ret; +} + static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) { int ret; @@ -1647,6 +1660,51 @@ static inline int __bpf_tx_skb(struct net_device *dev, struct sk_buff *skb) return ret; } +static int __bpf_redirect_no_mac(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + /* skb->mac_len is not set on normal egress */ + unsigned int mlen = skb->network_header - skb->mac_header; + + __skb_pull(skb, mlen); + + /* At ingress, the mac header has already been pulled once. + * At egress, skb_pospull_rcsum has to be done in case that + * the skb is originated from ingress (i.e. a forwarded skb) + * to ensure that rcsum starts at net header. + */ + if (!skb_at_tc_ingress(skb)) + skb_postpull_rcsum(skb, skb_mac_header(skb), mlen); + skb_pop_mac_header(skb); + skb_reset_mac_len(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb_no_mac(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect_common(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + bpf_push_mac_rcsum(skb); + return flags & BPF_F_INGRESS ? + __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); +} + +static int __bpf_redirect(struct sk_buff *skb, struct net_device *dev, + u32 flags) +{ + switch (dev->type) { + case ARPHRD_TUNNEL: + case ARPHRD_TUNNEL6: + case ARPHRD_SIT: + case ARPHRD_IPGRE: + case ARPHRD_VOID: + case ARPHRD_NONE: + return __bpf_redirect_no_mac(skb, dev, flags); + default: + return __bpf_redirect_common(skb, dev, flags); + } +} + BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) { struct net_device *dev; @@ -1675,10 +1733,7 @@ BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags) return -ENOMEM; } - bpf_push_mac_rcsum(clone); - - return flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, clone) : __bpf_tx_skb(dev, clone); + return __bpf_redirect(clone, dev, flags); } static const struct bpf_func_proto bpf_clone_redirect_proto = { @@ -1722,10 +1777,7 @@ int skb_do_redirect(struct sk_buff *skb) return -EINVAL; } - bpf_push_mac_rcsum(skb); - - return ri->flags & BPF_F_INGRESS ? - __bpf_rx_skb(dev, skb) : __bpf_tx_skb(dev, skb); + return __bpf_redirect(skb, dev, ri->flags); } static const struct bpf_func_proto bpf_redirect_proto = { diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 87e0181..b481a4a 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -122,7 +122,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector_key_keyid *key_keyid; bool skip_vlan = false; u8 ip_proto = 0; - bool ret = false; + bool ret; if (!data) { data = skb->data; @@ -549,12 +549,17 @@ ip_proto_again: out_good: ret = true; -out_bad: + key_control->thoff = (u16)nhoff; +out: key_basic->n_proto = proto; key_basic->ip_proto = ip_proto; - key_control->thoff = (u16)nhoff; return ret; + +out_bad: + ret = false; + key_control->thoff = min_t(u16, nhoff, skb ? skb->len : hlen); + goto out; } EXPORT_SYMBOL(__skb_flow_dissect); diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index fb7348f..db313ec 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -275,6 +275,7 @@ int rtnl_unregister(int protocol, int msgtype) rtnl_msg_handlers[protocol][msgindex].doit = NULL; rtnl_msg_handlers[protocol][msgindex].dumpit = NULL; + rtnl_msg_handlers[protocol][msgindex].calcit = NULL; return 0; } diff --git a/net/core/sock.c b/net/core/sock.c index 0397928..14e6145 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -453,7 +453,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) EXPORT_SYMBOL(sock_queue_rcv_skb); int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, - const int nested, unsigned int trim_cap) + const int nested, unsigned int trim_cap, bool refcounted) { int rc = NET_RX_SUCCESS; @@ -487,7 +487,8 @@ int __sk_receive_skb(struct sock *sk, struct sk_buff *skb, bh_unlock_sock(sk); out: - sock_put(sk); + if (refcounted) + sock_put(sk); return rc; discard_and_relse: kfree_skb(skb); @@ -1543,6 +1544,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL); newsk->sk_err = 0; + newsk->sk_err_soft = 0; newsk->sk_priority = 0; newsk->sk_incoming_cpu = raw_smp_processor_id(); atomic64_set(&newsk->sk_cookie, 0); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index a957aca..fda321d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -235,7 +235,7 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) { const struct iphdr *iph = (struct iphdr *)skb->data; const u8 offset = iph->ihl << 2; - const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + const struct dccp_hdr *dh; struct dccp_sock *dp; struct inet_sock *inet; const int type = icmp_hdr(skb)->type; @@ -245,11 +245,13 @@ static void dccp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - if (skb->len < offset + sizeof(*dh) || - skb->len < offset + __dccp_basic_hdr_len(dh)) { - __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); - return; - } + /* Only need dccph_dport & dccph_sport which are the first + * 4 bytes in dccp header. + * Our caller (icmp_socket_deliver()) already pulled 8 bytes for us. + */ + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet_lookup_established(net, &dccp_hashinfo, iph->daddr, dh->dccph_dport, @@ -862,7 +864,7 @@ lookup: goto discard_and_relse; nf_reset(skb); - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4); + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, refcounted); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 32f9f1a..adfc790 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -70,7 +70,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, u8 type, u8 code, int offset, __be32 info) { const struct ipv6hdr *hdr = (const struct ipv6hdr *)skb->data; - const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + const struct dccp_hdr *dh; struct dccp_sock *dp; struct ipv6_pinfo *np; struct sock *sk; @@ -78,12 +78,13 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, __u64 seq; struct net *net = dev_net(skb->dev); - if (skb->len < offset + sizeof(*dh) || - skb->len < offset + __dccp_basic_hdr_len(dh)) { - __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), - ICMP6_MIB_INERRORS); - return; - } + /* Only need dccph_dport & dccph_sport which are the first + * 4 bytes in dccp header. + * Our caller (icmpv6_notify()) already pulled 8 bytes for us. + */ + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_sport) > 8); + BUILD_BUG_ON(offsetofend(struct dccp_hdr, dccph_dport) > 8); + dh = (struct dccp_hdr *)(skb->data + offset); sk = __inet6_lookup_established(net, &dccp_hashinfo, &hdr->daddr, dh->dccph_dport, @@ -738,7 +739,8 @@ lookup: if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4) ? -1 : 0; + return __sk_receive_skb(sk, skb, 1, dh->dccph_doff * 4, + refcounted) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) @@ -956,6 +958,7 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .getsockopt = ipv6_getsockopt, .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6), + .bind_conflict = inet6_csk_bind_conflict, #ifdef CONFIG_COMPAT .compat_setsockopt = compat_ipv6_setsockopt, .compat_getsockopt = compat_ipv6_getsockopt, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 41e65804..9fe25bf 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -1009,6 +1009,10 @@ void dccp_close(struct sock *sk, long timeout) __kfree_skb(skb); } + /* If socket has been already reset kill it. */ + if (sk->sk_state == DCCP_CLOSED) + goto adjudge_to_death; + if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 9648c97..5ddf5cd 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -533,9 +533,9 @@ EXPORT_SYMBOL(inet_dgram_connect); static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) { - DEFINE_WAIT(wait); + DEFINE_WAIT_FUNC(wait, woken_wake_function); - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + add_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending += writebias; /* Basic assumption: if someone sets sk->sk_err, he _must_ @@ -545,13 +545,12 @@ static long inet_wait_for_connect(struct sock *sk, long timeo, int writebias) */ while ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV)) { release_sock(sk); - timeo = schedule_timeout(timeo); + timeo = wait_woken(&wait, TASK_INTERRUPTIBLE, timeo); lock_sock(sk); if (signal_pending(current) || !timeo) break; - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); } - finish_wait(sk_sleep(sk), &wait); + remove_wait_queue(sk_sleep(sk), &wait); sk->sk_write_pending -= writebias; return timeo; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31cef36..4cff74d 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -2413,22 +2413,19 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, struct key_vector *l, **tp = &iter->tnode; t_key key; - /* use cache location of next-to-find key */ + /* use cached location of previously found key */ if (iter->pos > 0 && pos >= iter->pos) { - pos -= iter->pos; key = iter->key; } else { - iter->pos = 0; + iter->pos = 1; key = 0; } - while ((l = leaf_walk_rcu(tp, key)) != NULL) { + pos -= iter->pos; + + while ((l = leaf_walk_rcu(tp, key)) && (pos-- > 0)) { key = l->key + 1; iter->pos++; - - if (--pos <= 0) - break; - l = NULL; /* handle unlikely case of a key wrap */ @@ -2437,7 +2434,7 @@ static struct key_vector *fib_route_get_idx(struct fib_route_iter *iter, } if (l) - iter->key = key; /* remember it */ + iter->key = l->key; /* remember it */ else iter->pos = 0; /* forget it */ @@ -2465,7 +2462,7 @@ static void *fib_route_seq_start(struct seq_file *seq, loff_t *pos) return fib_route_get_idx(iter, *pos); iter->pos = 0; - iter->key = 0; + iter->key = KEY_MAX; return SEQ_START_TOKEN; } @@ -2474,7 +2471,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) { struct fib_route_iter *iter = seq->private; struct key_vector *l = NULL; - t_key key = iter->key; + t_key key = iter->key + 1; ++*pos; @@ -2483,7 +2480,7 @@ static void *fib_route_seq_next(struct seq_file *seq, void *v, loff_t *pos) l = leaf_walk_rcu(&iter->tnode, key); if (l) { - iter->key = l->key + 1; + iter->key = l->key; iter->pos++; } else { iter->pos = 0; diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 53a890b..691146a 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -479,7 +479,7 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4->flowi4_proto = IPPROTO_ICMP; fl4->fl4_icmp_type = type; fl4->fl4_icmp_code = code; - fl4->flowi4_oif = l3mdev_master_ifindex(skb_in->dev); + fl4->flowi4_oif = l3mdev_master_ifindex(skb_dst(skb_in)->dev); security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key_hash(net, fl4, @@ -504,7 +504,7 @@ static struct rtable *icmp_route_lookup(struct net *net, if (err) goto relookup_failed; - if (inet_addr_type_dev_table(net, skb_in->dev, + if (inet_addr_type_dev_table(net, skb_dst(skb_in)->dev, fl4_dec.saddr) == RTN_LOCAL) { rt2 = __ip_route_output_key(net, &fl4_dec); if (IS_ERR(rt2)) diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c index 8b4ffd2..9f0a7b9 100644 --- a/net/ipv4/ip_forward.c +++ b/net/ipv4/ip_forward.c @@ -117,7 +117,7 @@ int ip_forward(struct sk_buff *skb) if (opt->is_strictroute && rt->rt_uses_gateway) goto sr_failed; - IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; + IPCB(skb)->flags |= IPSKB_FORWARDED; mtu = ip_dst_mtu_maybe_forward(&rt->dst, true); if (ip_exceeds_mtu(skb, mtu)) { IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS); diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 37dfacd..eaf720b 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -239,19 +239,23 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk, struct sk_buff *segs; int ret = 0; - /* common case: fragmentation of segments is not allowed, - * or seglen is <= mtu + /* common case: seglen is <= mtu */ - if (((IPCB(skb)->flags & IPSKB_FRAG_SEGS) == 0) || - skb_gso_validate_mtu(skb, mtu)) + if (skb_gso_validate_mtu(skb, mtu)) return ip_finish_output2(net, sk, skb); - /* Slowpath - GSO segment length is exceeding the dst MTU. + /* Slowpath - GSO segment length exceeds the egress MTU. * - * This can happen in two cases: - * 1) TCP GRO packet, DF bit not set - * 2) skb arrived via virtio-net, we thus get TSO/GSO skbs directly - * from host network stack. + * This can happen in several cases: + * - Forwarding of a TCP GRO skb, when DF flag is not set. + * - Forwarding of an skb that arrived on a virtualization interface + * (virtio-net/vhost/tap) with TSO/GSO size set by other network + * stack. + * - Local GSO skb transmitted on an NETIF_F_TSO tunnel stacked over an + * interface with a smaller MTU. + * - Arriving GRO skb (or GSO skb in a virtualized environment) that is + * bridged to a NETIF_F_TSO tunnel stacked over an interface with an + * insufficent MTU. */ features = netif_skb_features(skb); BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET); @@ -1579,7 +1583,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, } oif = arg->bound_dev_if; - oif = oif ? : skb->skb_iif; + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; flowi4_init_output(&fl4, oif, IP4_REPLY_MARK(net, skb->mark), diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index 777bc18..fed3d29 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -63,7 +63,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, int pkt_len = skb->len - skb_inner_network_offset(skb); struct net *net = dev_net(rt->dst.dev); struct net_device *dev = skb->dev; - int skb_iif = skb->skb_iif; struct iphdr *iph; int err; @@ -73,16 +72,6 @@ void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb, skb_dst_set(skb, &rt->dst); memset(IPCB(skb), 0, sizeof(*IPCB(skb))); - if (skb_iif && !(df & htons(IP_DF))) { - /* Arrived from an ingress interface, got encapsulated, with - * fragmentation of encapulating frames allowed. - * If skb is gso, the resulting encapsulated network segments - * may exceed dst mtu. - * Allow IP Fragmentation of segments. - */ - IPCB(skb)->flags |= IPSKB_FRAG_SEGS; - } - /* Push down and install the IP header. */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index f2fd13b..665505d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1754,7 +1754,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, vif->dev->stats.tx_bytes += skb->len; } - IPCB(skb)->flags |= IPSKB_FORWARDED | IPSKB_FRAG_SEGS; + IPCB(skb)->flags |= IPSKB_FORWARDED; /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally * not only before forwarding, but after forwarding on all output diff --git a/net/ipv4/netfilter/nft_dup_ipv4.c b/net/ipv4/netfilter/nft_dup_ipv4.c index 7ab544f..0af3d8d 100644 --- a/net/ipv4/netfilter/nft_dup_ipv4.c +++ b/net/ipv4/netfilter/nft_dup_ipv4.c @@ -28,7 +28,7 @@ static void nft_dup_ipv4_eval(const struct nft_expr *expr, struct in_addr gw = { .s_addr = (__force __be32)regs->data[priv->sreg_addr], }; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv4(nft_net(pkt), pkt->skb, nft_hook(pkt), &gw, oif); } @@ -59,7 +59,9 @@ static int nft_dup_ipv4_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv4 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 2355883..d37fc6f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -755,7 +755,9 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow goto reject_redirect; } - n = ipv4_neigh_lookup(&rt->dst, NULL, &new_gw); + n = __ipv4_neigh_lookup(rt->dst.dev, new_gw); + if (!n) + n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev); if (!IS_ERR(n)) { if (!(n->nud_state & NUD_VALID)) { neigh_event_send(n, NULL); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index f8f924c..b025a69 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1162,7 +1162,7 @@ restart: err = -EPIPE; if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) - goto out_err; + goto do_error; sg = !!(sk->sk_route_caps & NETIF_F_SG); @@ -1239,7 +1239,7 @@ new_segment: if (!skb_can_coalesce(skb, i, pfrag->page, pfrag->offset)) { - if (i == sysctl_max_skb_frags || !sg) { + if (i >= sysctl_max_skb_frags || !sg) { tcp_mark_push(tp, skb); goto new_segment; } diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 10d728b..ab37c67 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -56,6 +56,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 delayed_ack_reserved; + u32 loss_cwnd; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -96,6 +97,7 @@ static void dctcp_init(struct sock *sk) ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA); ca->delayed_ack_reserved = 0; + ca->loss_cwnd = 0; ca->ce_state = 0; dctcp_reset(tp, ca); @@ -111,9 +113,10 @@ static void dctcp_init(struct sock *sk) static u32 dctcp_ssthresh(struct sock *sk) { - const struct dctcp *ca = inet_csk_ca(sk); + struct dctcp *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->loss_cwnd = tp->snd_cwnd; return max(tp->snd_cwnd - ((tp->snd_cwnd * ca->dctcp_alpha) >> 11U), 2U); } @@ -308,12 +311,20 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr, return 0; } +static u32 dctcp_cwnd_undo(struct sock *sk) +{ + const struct dctcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); +} + static struct tcp_congestion_ops dctcp __read_mostly = { .init = dctcp_init, .in_ack_event = dctcp_update_alpha, .cwnd_event = dctcp_cwnd_event, .ssthresh = dctcp_ssthresh, .cong_avoid = tcp_reno_cong_avoid, + .undo_cwnd = dctcp_cwnd_undo, .set_state = dctcp_state, .get_info = dctcp_get_info, .flags = TCP_CONG_NEEDS_ECN, diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6491b7c..5555eb8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1567,6 +1567,21 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb) } EXPORT_SYMBOL(tcp_add_backlog); +int tcp_filter(struct sock *sk, struct sk_buff *skb) +{ + struct tcphdr *th = (struct tcphdr *)skb->data; + unsigned int eaten = skb->len; + int err; + + err = sk_filter_trim_cap(sk, skb, th->doff * 4); + if (!err) { + eaten -= skb->len; + TCP_SKB_CB(skb)->end_seq -= eaten; + } + return err; +} +EXPORT_SYMBOL(tcp_filter); + /* * From tcp_input.c */ @@ -1679,8 +1694,10 @@ process: nf_reset(skb); - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + iph = ip_hdr(skb); skb->dev = NULL; diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c index ab249fe..eb948ff 100644 --- a/net/ipv6/icmp.c +++ b/net/ipv6/icmp.c @@ -449,7 +449,7 @@ static void icmp6_send(struct sk_buff *skb, u8 type, u8 code, __u32 info, if (__ipv6_addr_needs_scope_id(addr_type)) iif = skb->dev->ifindex; else - iif = l3mdev_master_ifindex(skb->dev); + iif = l3mdev_master_ifindex(skb_dst(skb)->dev); /* * Must not send error if the source does not uniquely diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index ddc878d..b37054b 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1367,7 +1367,7 @@ emsgsize: if (((length > mtu) || (skb && skb_is_gso(skb))) && (sk->sk_protocol == IPPROTO_UDP) && - (rt->dst.dev->features & NETIF_F_UFO) && + (rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len && (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) { err = ip6_ufo_append_data(sk, queue, getfrag, from, length, hh_len, fragheaderlen, exthdrlen, diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c index a752052..b283f29 100644 --- a/net/ipv6/ip6_udp_tunnel.c +++ b/net/ipv6/ip6_udp_tunnel.c @@ -88,9 +88,6 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk, uh->len = htons(skb->len); - memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED - | IPSKB_REROUTED); skb_dst_set(skb, dst); udp6_set_csum(nocheck, skb, saddr, daddr, skb->len); diff --git a/net/ipv6/netfilter/nft_dup_ipv6.c b/net/ipv6/netfilter/nft_dup_ipv6.c index 26074a8..d8b5b60 100644 --- a/net/ipv6/netfilter/nft_dup_ipv6.c +++ b/net/ipv6/netfilter/nft_dup_ipv6.c @@ -26,7 +26,7 @@ static void nft_dup_ipv6_eval(const struct nft_expr *expr, { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); struct in6_addr *gw = (struct in6_addr *)®s->data[priv->sreg_addr]; - int oif = regs->data[priv->sreg_dev]; + int oif = priv->sreg_dev ? regs->data[priv->sreg_dev] : -1; nf_dup_ipv6(nft_net(pkt), pkt->skb, nft_hook(pkt), gw, oif); } @@ -57,7 +57,9 @@ static int nft_dup_ipv6_dump(struct sk_buff *skb, const struct nft_expr *expr) { struct nft_dup_ipv6 *priv = nft_expr_priv(expr); - if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr) || + if (nft_dump_register(skb, NFTA_DUP_SREG_ADDR, priv->sreg_addr)) + goto nla_put_failure; + if (priv->sreg_dev && nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev)) goto nla_put_failure; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 6aa014e..b317bb1 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1364,6 +1364,9 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, if (rt6->rt6i_flags & RTF_LOCAL) return; + if (dst_metric_locked(dst, RTAX_MTU)) + return; + dst_confirm(dst); mtu = max_t(u32, mtu, IPV6_MIN_MTU); if (mtu >= dst_mtu(dst)) @@ -2763,6 +2766,7 @@ static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg) PMTU discouvery. */ if (rt->dst.dev == arg->dev && + dst_metric_raw(&rt->dst, RTAX_MTU) && !dst_metric_locked(&rt->dst, RTAX_MTU)) { if (rt->rt6i_flags & RTF_CACHE) { /* For RTF_CACHE with rt6i_pmtu == 0 diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index aece1b1..28ec0a2 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -819,8 +819,12 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 fl6.flowi6_proto = IPPROTO_TCP; if (rt6_need_strict(&fl6.daddr) && !oif) fl6.flowi6_oif = tcp_v6_iif(skb); - else - fl6.flowi6_oif = oif ? : skb->skb_iif; + else { + if (!oif && netif_index_is_l3_master(net, skb->skb_iif)) + oif = skb->skb_iif; + + fl6.flowi6_oif = oif; + } fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark); fl6.fl6_dport = t1->dest; @@ -1227,7 +1231,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) if (skb->protocol == htons(ETH_P_IP)) return tcp_v4_do_rcv(sk, skb); - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard; /* @@ -1455,8 +1459,10 @@ process: if (tcp_v6_inbound_md5_hash(sk, skb)) goto discard_and_relse; - if (sk_filter(sk, skb)) + if (tcp_filter(sk, skb)) goto discard_and_relse; + th = (const struct tcphdr *)skb->data; + hdr = ipv6_hdr(skb); skb->dev = NULL; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 6b85ded..038c2ba 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3869,7 +3869,7 @@ static struct genl_family ip_vs_genl_family __ro_after_init = { .hdrsize = 0, .name = IPVS_GENL_NAME, .version = IPVS_GENL_VERSION, - .maxattr = IPVS_CMD_MAX, + .maxattr = IPVS_CMD_ATTR_MAX, .netnsok = true, /* Make ipvsadm to work on netns */ .module = THIS_MODULE, .ops = ip_vs_genl_ops, diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 1b07578..9350530 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -283,6 +283,7 @@ struct ip_vs_sync_buff { */ static void ntoh_seq(struct ip_vs_seq *no, struct ip_vs_seq *ho) { + memset(ho, 0, sizeof(*ho)); ho->init_seq = get_unaligned_be32(&no->init_seq); ho->delta = get_unaligned_be32(&no->delta); ho->previous_delta = get_unaligned_be32(&no->previous_delta); @@ -917,8 +918,10 @@ static void ip_vs_proc_conn(struct netns_ipvs *ipvs, struct ip_vs_conn_param *pa kfree(param->pe_data); } - if (opt) - memcpy(&cp->in_seq, opt, sizeof(*opt)); + if (opt) { + cp->in_seq = opt->in_seq; + cp->out_seq = opt->out_seq; + } atomic_set(&cp->in_pkts, sysctl_sync_threshold(ipvs)); cp->state = state; cp->old_state = cp->state; diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index e9ffe33..6a0bbfa 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -76,6 +76,7 @@ struct conntrack_gc_work { struct delayed_work dwork; u32 last_bucket; bool exiting; + long next_gc_run; }; static __read_mostly struct kmem_cache *nf_conntrack_cachep; @@ -83,9 +84,11 @@ static __read_mostly spinlock_t nf_conntrack_locks_all_lock; static __read_mostly DEFINE_SPINLOCK(nf_conntrack_locks_all_lock); static __read_mostly bool nf_conntrack_locks_all; +/* every gc cycle scans at most 1/GC_MAX_BUCKETS_DIV part of table */ #define GC_MAX_BUCKETS_DIV 64u -#define GC_MAX_BUCKETS 8192u -#define GC_INTERVAL (5 * HZ) +/* upper bound of scan intervals */ +#define GC_INTERVAL_MAX (2 * HZ) +/* maximum conntracks to evict per gc run */ #define GC_MAX_EVICTS 256u static struct conntrack_gc_work conntrack_gc_work; @@ -936,13 +939,13 @@ static noinline int early_drop(struct net *net, unsigned int _hash) static void gc_worker(struct work_struct *work) { unsigned int i, goal, buckets = 0, expired_count = 0; - unsigned long next_run = GC_INTERVAL; - unsigned int ratio, scanned = 0; struct conntrack_gc_work *gc_work; + unsigned int ratio, scanned = 0; + unsigned long next_run; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); - goal = min(nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV, GC_MAX_BUCKETS); + goal = nf_conntrack_htable_size / GC_MAX_BUCKETS_DIV; i = gc_work->last_bucket; do { @@ -982,17 +985,47 @@ static void gc_worker(struct work_struct *work) if (gc_work->exiting) return; + /* + * Eviction will normally happen from the packet path, and not + * from this gc worker. + * + * This worker is only here to reap expired entries when system went + * idle after a busy period. + * + * The heuristics below are supposed to balance conflicting goals: + * + * 1. Minimize time until we notice a stale entry + * 2. Maximize scan intervals to not waste cycles + * + * Normally, expired_count will be 0, this increases the next_run time + * to priorize 2) above. + * + * As soon as a timed-out entry is found, move towards 1) and increase + * the scan frequency. + * In case we have lots of evictions next scan is done immediately. + */ ratio = scanned ? expired_count * 100 / scanned : 0; - if (ratio >= 90 || expired_count == GC_MAX_EVICTS) + if (ratio >= 90 || expired_count == GC_MAX_EVICTS) { + gc_work->next_gc_run = 0; next_run = 0; + } else if (expired_count) { + gc_work->next_gc_run /= 2U; + next_run = msecs_to_jiffies(1); + } else { + if (gc_work->next_gc_run < GC_INTERVAL_MAX) + gc_work->next_gc_run += msecs_to_jiffies(1); + + next_run = gc_work->next_gc_run; + } gc_work->last_bucket = i; - schedule_delayed_work(&gc_work->dwork, next_run); + queue_delayed_work(system_long_wq, &gc_work->dwork, next_run); } static void conntrack_gc_work_init(struct conntrack_gc_work *gc_work) { INIT_DELAYED_WORK(&gc_work->dwork, gc_worker); + gc_work->next_gc_run = GC_INTERVAL_MAX; gc_work->exiting = false; } @@ -1884,7 +1917,7 @@ int nf_conntrack_init_start(void) nf_ct_untracked_status_or(IPS_CONFIRMED | IPS_UNTRACKED); conntrack_gc_work_init(&conntrack_gc_work); - schedule_delayed_work(&conntrack_gc_work.dwork, GC_INTERVAL); + queue_delayed_work(system_long_wq, &conntrack_gc_work.dwork, GC_INTERVAL_MAX); return 0; diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index 336e215..7341adf 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -138,9 +138,14 @@ __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) for (i = 0; i < nf_ct_helper_hsize; i++) { hlist_for_each_entry_rcu(h, &nf_ct_helper_hash[i], hnode) { - if (!strcmp(h->name, name) && - h->tuple.src.l3num == l3num && - h->tuple.dst.protonum == protonum) + if (strcmp(h->name, name)) + continue; + + if (h->tuple.src.l3num != NFPROTO_UNSPEC && + h->tuple.src.l3num != l3num) + continue; + + if (h->tuple.dst.protonum == protonum) return h; } } diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 621b81c..c3fc14e 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1436,9 +1436,12 @@ static int process_sip_request(struct sk_buff *skb, unsigned int protoff, handler = &sip_handlers[i]; if (handler->request == NULL) continue; - if (*datalen < handler->len || + if (*datalen < handler->len + 2 || strncasecmp(*dptr, handler->method, handler->len)) continue; + if ((*dptr)[handler->len] != ' ' || + !isalpha((*dptr)[handler->len+1])) + continue; if (ct_sip_get_header(ct, *dptr, 0, *datalen, SIP_HDR_CSEQ, &matchoff, &matchlen) <= 0) { diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 24db222..026581b 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -2956,12 +2956,14 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk, err = nft_trans_set_add(&ctx, NFT_MSG_NEWSET, set); if (err < 0) - goto err2; + goto err3; list_add_tail_rcu(&set->list, &table->sets); table->use++; return 0; +err3: + ops->destroy(set); err2: kfree(set); err1: @@ -3452,14 +3454,15 @@ void *nft_set_elem_init(const struct nft_set *set, return elem; } -void nft_set_elem_destroy(const struct nft_set *set, void *elem) +void nft_set_elem_destroy(const struct nft_set *set, void *elem, + bool destroy_expr) { struct nft_set_ext *ext = nft_set_elem_ext(set, elem); nft_data_uninit(nft_set_ext_key(ext), NFT_DATA_VALUE); if (nft_set_ext_exists(ext, NFT_SET_EXT_DATA)) nft_data_uninit(nft_set_ext_data(ext), set->dtype); - if (nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) + if (destroy_expr && nft_set_ext_exists(ext, NFT_SET_EXT_EXPR)) nf_tables_expr_destroy(NULL, nft_set_ext_expr(ext)); kfree(elem); @@ -3565,6 +3568,7 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set, dreg = nft_type_to_reg(set->dtype); list_for_each_entry(binding, &set->bindings, list) { struct nft_ctx bind_ctx = { + .net = ctx->net, .afi = ctx->afi, .table = ctx->table, .chain = (struct nft_chain *)binding->chain, @@ -3812,7 +3816,7 @@ void nft_set_gc_batch_release(struct rcu_head *rcu) gcb = container_of(rcu, struct nft_set_gc_batch, head.rcu); for (i = 0; i < gcb->head.cnt; i++) - nft_set_elem_destroy(gcb->head.set, gcb->elems[i]); + nft_set_elem_destroy(gcb->head.set, gcb->elems[i], true); kfree(gcb); } EXPORT_SYMBOL_GPL(nft_set_gc_batch_release); @@ -4030,7 +4034,7 @@ static void nf_tables_commit_release(struct nft_trans *trans) break; case NFT_MSG_DELSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv); + nft_trans_elem(trans).priv, true); break; } kfree(trans); @@ -4171,7 +4175,7 @@ static void nf_tables_abort_release(struct nft_trans *trans) break; case NFT_MSG_NEWSETELEM: nft_set_elem_destroy(nft_trans_elem_set(trans), - nft_trans_elem(trans).priv); + nft_trans_elem(trans).priv, true); break; } kfree(trans); @@ -4421,7 +4425,7 @@ static int nf_tables_check_loops(const struct nft_ctx *ctx, * Otherwise a 0 is returned and the attribute value is stored in the * destination variable. */ -unsigned int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) +int nft_parse_u32_check(const struct nlattr *attr, int max, u32 *dest) { u32 val; diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c index 4339e3f1..7de2f46 100644 --- a/net/netfilter/nft_dynset.c +++ b/net/netfilter/nft_dynset.c @@ -44,18 +44,22 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr, ®s->data[priv->sreg_key], ®s->data[priv->sreg_data], timeout, GFP_ATOMIC); - if (elem == NULL) { - if (set->size) - atomic_dec(&set->nelems); - return NULL; - } + if (elem == NULL) + goto err1; ext = nft_set_elem_ext(set, elem); if (priv->expr != NULL && nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0) - return NULL; + goto err2; return elem; + +err2: + nft_set_elem_destroy(set, elem, false); +err1: + if (set->size) + atomic_dec(&set->nelems); + return NULL; } static void nft_dynset_eval(const struct nft_expr *expr, @@ -139,6 +143,9 @@ static int nft_dynset_init(const struct nft_ctx *ctx, return PTR_ERR(set); } + if (set->ops->update == NULL) + return -EOPNOTSUPP; + if (set->flags & NFT_SET_CONSTANT) return -EBUSY; diff --git a/net/netfilter/nft_set_hash.c b/net/netfilter/nft_set_hash.c index 3794cb2..a3dface 100644 --- a/net/netfilter/nft_set_hash.c +++ b/net/netfilter/nft_set_hash.c @@ -98,7 +98,7 @@ static bool nft_hash_update(struct nft_set *set, const u32 *key, const struct nft_set_ext **ext) { struct nft_hash *priv = nft_set_priv(set); - struct nft_hash_elem *he; + struct nft_hash_elem *he, *prev; struct nft_hash_cmp_arg arg = { .genmask = NFT_GENMASK_ANY, .set = set, @@ -112,15 +112,24 @@ static bool nft_hash_update(struct nft_set *set, const u32 *key, he = new(set, expr, regs); if (he == NULL) goto err1; - if (rhashtable_lookup_insert_key(&priv->ht, &arg, &he->node, - nft_hash_params)) + + prev = rhashtable_lookup_get_insert_key(&priv->ht, &arg, &he->node, + nft_hash_params); + if (IS_ERR(prev)) goto err2; + + /* Another cpu may race to insert the element with the same key */ + if (prev) { + nft_set_elem_destroy(set, he, true); + he = prev; + } + out: *ext = &he->ext; return true; err2: - nft_set_elem_destroy(set, he); + nft_set_elem_destroy(set, he, true); err1: return false; } @@ -332,7 +341,7 @@ static int nft_hash_init(const struct nft_set *set, static void nft_hash_elem_destroy(void *ptr, void *arg) { - nft_set_elem_destroy((const struct nft_set *)arg, ptr); + nft_set_elem_destroy((const struct nft_set *)arg, ptr, true); } static void nft_hash_destroy(const struct nft_set *set) diff --git a/net/netfilter/nft_set_rbtree.c b/net/netfilter/nft_set_rbtree.c index 38b5bda..36493a7 100644 --- a/net/netfilter/nft_set_rbtree.c +++ b/net/netfilter/nft_set_rbtree.c @@ -266,7 +266,7 @@ static void nft_rbtree_destroy(const struct nft_set *set) while ((node = priv->root.rb_node) != NULL) { rb_erase(node, &priv->root); rbe = rb_entry(node, struct nft_rbtree_elem, node); - nft_set_elem_destroy(set, rbe); + nft_set_elem_destroy(set, rbe, true); } } diff --git a/net/netfilter/xt_connmark.c b/net/netfilter/xt_connmark.c index 69f78e9..b83e158 100644 --- a/net/netfilter/xt_connmark.c +++ b/net/netfilter/xt_connmark.c @@ -44,7 +44,7 @@ connmark_tg(struct sk_buff *skb, const struct xt_action_param *par) u_int32_t newmark; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) + if (ct == NULL || nf_ct_is_untracked(ct)) return XT_CONTINUE; switch (info->mode) { @@ -97,7 +97,7 @@ connmark_mt(const struct sk_buff *skb, struct xt_action_param *par) const struct nf_conn *ct; ct = nf_ct_get(skb, &ctinfo); - if (ct == NULL) + if (ct == NULL || nf_ct_is_untracked(ct)) return false; return ((ct->mark & info->mask) == info->mark) ^ info->invert; diff --git a/net/netlink/diag.c b/net/netlink/diag.c index b2f0e98..a554624 100644 --- a/net/netlink/diag.c +++ b/net/netlink/diag.c @@ -178,11 +178,8 @@ static int netlink_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) } cb->args[1] = i; } else { - if (req->sdiag_protocol >= MAX_LINKS) { - read_unlock(&nl_table_lock); - rcu_read_unlock(); + if (req->sdiag_protocol >= MAX_LINKS) return -ENOENT; - } err = __netlink_diag_dump(skb, cb, req->sdiag_protocol, s_num); } diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index bbd3bff..fb6e10f 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -383,6 +383,7 @@ int genl_register_family(struct genl_family *family) errout_remove: idr_remove(&genl_fam_idr, family->id); + kfree(family->attrbuf); errout_locked: genl_unlock_all(); return err; diff --git a/net/sctp/input.c b/net/sctp/input.c index a2ea1d1..a01a56e 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -181,9 +181,10 @@ int sctp_rcv(struct sk_buff *skb) * bound to another interface, via SO_BINDTODEVICE, treat it as OOTB */ if (sk->sk_bound_dev_if && (sk->sk_bound_dev_if != af->skb_iif(skb))) { - if (asoc) { - sctp_association_put(asoc); + if (transport) { + sctp_transport_put(transport); asoc = NULL; + transport = NULL; } else { sctp_endpoint_put(ep); ep = NULL; @@ -269,8 +270,8 @@ int sctp_rcv(struct sk_buff *skb) bh_unlock_sock(sk); /* Release the asoc/ep ref we took in the lookup calls. */ - if (asoc) - sctp_association_put(asoc); + if (transport) + sctp_transport_put(transport); else sctp_endpoint_put(ep); @@ -283,8 +284,8 @@ discard_it: discard_release: /* Release the asoc/ep ref we took in the lookup calls. */ - if (asoc) - sctp_association_put(asoc); + if (transport) + sctp_transport_put(transport); else sctp_endpoint_put(ep); @@ -300,6 +301,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; struct sctp_inq *inqueue = &chunk->rcvr->inqueue; + struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = NULL; int backloged = 0; @@ -351,7 +353,7 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) done: /* Release the refs we took in sctp_add_backlog */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) - sctp_association_put(sctp_assoc(rcvr)); + sctp_transport_put(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_put(sctp_ep(rcvr)); else @@ -363,6 +365,7 @@ done: static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) { struct sctp_chunk *chunk = SCTP_INPUT_CB(skb)->chunk; + struct sctp_transport *t = chunk->transport; struct sctp_ep_common *rcvr = chunk->rcvr; int ret; @@ -373,7 +376,7 @@ static int sctp_add_backlog(struct sock *sk, struct sk_buff *skb) * from us */ if (SCTP_EP_TYPE_ASSOCIATION == rcvr->type) - sctp_association_hold(sctp_assoc(rcvr)); + sctp_transport_hold(t); else if (SCTP_EP_TYPE_SOCKET == rcvr->type) sctp_endpoint_hold(sctp_ep(rcvr)); else @@ -537,15 +540,15 @@ struct sock *sctp_err_lookup(struct net *net, int family, struct sk_buff *skb, return sk; out: - sctp_association_put(asoc); + sctp_transport_put(transport); return NULL; } /* Common cleanup code for icmp/icmpv6 error handler. */ -void sctp_err_finish(struct sock *sk, struct sctp_association *asoc) +void sctp_err_finish(struct sock *sk, struct sctp_transport *t) { bh_unlock_sock(sk); - sctp_association_put(asoc); + sctp_transport_put(t); } /* @@ -641,7 +644,7 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info) } out_unlock: - sctp_err_finish(sk, asoc); + sctp_err_finish(sk, transport); } /* @@ -952,11 +955,8 @@ static struct sctp_association *__sctp_lookup_association( goto out; asoc = t->asoc; - sctp_association_hold(asoc); *pt = t; - sctp_transport_put(t); - out: return asoc; } @@ -986,7 +986,7 @@ int sctp_has_association(struct net *net, struct sctp_transport *transport; if ((asoc = sctp_lookup_association(net, laddr, paddr, &transport))) { - sctp_association_put(asoc); + sctp_transport_put(transport); return 1; } @@ -1021,7 +1021,6 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, struct sctphdr *sh = sctp_hdr(skb); union sctp_params params; sctp_init_chunk_t *init; - struct sctp_transport *transport; struct sctp_af *af; /* @@ -1052,7 +1051,7 @@ static struct sctp_association *__sctp_rcv_init_lookup(struct net *net, af->from_addr_param(paddr, params.addr, sh->source, 0); - asoc = __sctp_lookup_association(net, laddr, paddr, &transport); + asoc = __sctp_lookup_association(net, laddr, paddr, transportp); if (asoc) return asoc; } diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index f473779..176af30 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -198,7 +198,7 @@ static void sctp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } out_unlock: - sctp_err_finish(sk, asoc); + sctp_err_finish(sk, transport); out: if (likely(idev != NULL)) in6_dev_put(idev); diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9fbb6fe..f23ad91 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1214,9 +1214,12 @@ static int __sctp_connect(struct sock *sk, timeo = sock_sndtimeo(sk, f_flags & O_NONBLOCK); - err = sctp_wait_for_connect(asoc, &timeo); - if ((err == 0 || err == -EINPROGRESS) && assoc_id) + if (assoc_id) *assoc_id = asoc->assoc_id; + err = sctp_wait_for_connect(asoc, &timeo); + /* Note: the asoc may be freed after the return of + * sctp_wait_for_connect. + */ /* Don't free association on exit. */ asoc = NULL; @@ -4282,19 +4285,18 @@ static void sctp_shutdown(struct sock *sk, int how) { struct net *net = sock_net(sk); struct sctp_endpoint *ep; - struct sctp_association *asoc; if (!sctp_style(sk, TCP)) return; - if (how & SEND_SHUTDOWN) { + ep = sctp_sk(sk)->ep; + if (how & SEND_SHUTDOWN && !list_empty(&ep->asocs)) { + struct sctp_association *asoc; + sk->sk_state = SCTP_SS_CLOSING; - ep = sctp_sk(sk)->ep; - if (!list_empty(&ep->asocs)) { - asoc = list_entry(ep->asocs.next, - struct sctp_association, asocs); - sctp_primitive_SHUTDOWN(net, asoc, NULL); - } + asoc = list_entry(ep->asocs.next, + struct sctp_association, asocs); + sctp_primitive_SHUTDOWN(net, asoc, NULL); } } @@ -4480,12 +4482,9 @@ int sctp_transport_lookup_process(int (*cb)(struct sctp_transport *, void *), if (!transport || !sctp_transport_hold(transport)) goto out; - sctp_association_hold(transport->asoc); - sctp_transport_put(transport); - rcu_read_unlock(); err = cb(transport, p); - sctp_association_put(transport->asoc); + sctp_transport_put(transport); out: return err; diff --git a/net/socket.c b/net/socket.c index 4ce33c3..f9e26c6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2064,6 +2064,8 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen, if (err) break; ++datagrams; + if (msg_data_left(&msg_sys)) + break; cond_resched(); } diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index d8bd97a..3dfd769 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1616,7 +1616,7 @@ gss_validate(struct rpc_task *task, __be32 *p) { struct rpc_cred *cred = task->tk_rqstp->rq_cred; struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); - __be32 seq; + __be32 *seq = NULL; struct kvec iov; struct xdr_buf verf_buf; struct xdr_netobj mic; @@ -1631,9 +1631,12 @@ gss_validate(struct rpc_task *task, __be32 *p) goto out_bad; if (flav != RPC_AUTH_GSS) goto out_bad; - seq = htonl(task->tk_rqstp->rq_seqno); - iov.iov_base = &seq; - iov.iov_len = sizeof(seq); + seq = kmalloc(4, GFP_NOFS); + if (!seq) + goto out_bad; + *seq = htonl(task->tk_rqstp->rq_seqno); + iov.iov_base = seq; + iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_buf); mic.data = (u8 *)p; mic.len = len; @@ -1653,11 +1656,13 @@ gss_validate(struct rpc_task *task, __be32 *p) gss_put_ctx(ctx); dprintk("RPC: %5u %s: gss_verify_mic succeeded.\n", task->tk_pid, __func__); + kfree(seq); return p + XDR_QUADLEN(len); out_bad: gss_put_ctx(ctx); dprintk("RPC: %5u %s failed ret %ld.\n", task->tk_pid, __func__, PTR_ERR(ret)); + kfree(seq); return ret; } diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c index 244245b..90115ce 100644 --- a/net/sunrpc/auth_gss/gss_krb5_crypto.c +++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c @@ -166,8 +166,8 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, unsigned int usage, struct xdr_netobj *cksumout) { struct scatterlist sg[1]; - int err; - u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + int err = -1; + u8 *checksumdata; u8 rc4salt[4]; struct crypto_ahash *md5; struct crypto_ahash *hmac_md5; @@ -187,23 +187,22 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + if (!checksumdata) + return GSS_S_FAILURE; + md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC); if (IS_ERR(md5)) - return GSS_S_FAILURE; + goto out_free_cksum; hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(hmac_md5)) { - crypto_free_ahash(md5); - return GSS_S_FAILURE; - } + if (IS_ERR(hmac_md5)) + goto out_free_md5; req = ahash_request_alloc(md5, GFP_KERNEL); - if (!req) { - crypto_free_ahash(hmac_md5); - crypto_free_ahash(md5); - return GSS_S_FAILURE; - } + if (!req) + goto out_free_hmac_md5; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -232,11 +231,8 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, ahash_request_free(req); req = ahash_request_alloc(hmac_md5, GFP_KERNEL); - if (!req) { - crypto_free_ahash(hmac_md5); - crypto_free_ahash(md5); - return GSS_S_FAILURE; - } + if (!req) + goto out_free_hmac_md5; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -258,8 +254,12 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen, cksumout->len = kctx->gk5e->cksumlength; out: ahash_request_free(req); - crypto_free_ahash(md5); +out_free_hmac_md5: crypto_free_ahash(hmac_md5); +out_free_md5: + crypto_free_ahash(md5); +out_free_cksum: + kfree(checksumdata); return err ? GSS_S_FAILURE : 0; } @@ -276,8 +276,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, struct crypto_ahash *tfm; struct ahash_request *req; struct scatterlist sg[1]; - int err; - u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + int err = -1; + u8 *checksumdata; unsigned int checksumlen; if (kctx->gk5e->ctype == CKSUMTYPE_HMAC_MD5_ARCFOUR) @@ -291,15 +291,17 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + if (checksumdata == NULL) + return GSS_S_FAILURE; + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) - return GSS_S_FAILURE; + goto out_free_cksum; req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) { - crypto_free_ahash(tfm); - return GSS_S_FAILURE; - } + if (!req) + goto out_free_ahash; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -349,7 +351,10 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen, cksumout->len = kctx->gk5e->cksumlength; out: ahash_request_free(req); +out_free_ahash: crypto_free_ahash(tfm); +out_free_cksum: + kfree(checksumdata); return err ? GSS_S_FAILURE : 0; } @@ -368,8 +373,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, struct crypto_ahash *tfm; struct ahash_request *req; struct scatterlist sg[1]; - int err; - u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN]; + int err = -1; + u8 *checksumdata; unsigned int checksumlen; if (kctx->gk5e->keyed_cksum == 0) { @@ -383,16 +388,18 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, return GSS_S_FAILURE; } + checksumdata = kmalloc(GSS_KRB5_MAX_CKSUM_LEN, GFP_NOFS); + if (!checksumdata) + return GSS_S_FAILURE; + tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) - return GSS_S_FAILURE; + goto out_free_cksum; checksumlen = crypto_ahash_digestsize(tfm); req = ahash_request_alloc(tfm, GFP_KERNEL); - if (!req) { - crypto_free_ahash(tfm); - return GSS_S_FAILURE; - } + if (!req) + goto out_free_ahash; ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL); @@ -433,7 +440,10 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen, } out: ahash_request_free(req); +out_free_ahash: crypto_free_ahash(tfm); +out_free_cksum: + kfree(checksumdata); return err ? GSS_S_FAILURE : 0; } @@ -666,14 +676,17 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, u32 ret; struct scatterlist sg[1]; SKCIPHER_REQUEST_ON_STACK(req, cipher); - u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2]; + u8 *data; struct page **save_pages; u32 len = buf->len - offset; - if (len > ARRAY_SIZE(data)) { + if (len > GSS_KRB5_MAX_BLOCKSIZE * 2) { WARN_ON(0); return -ENOMEM; } + data = kmalloc(GSS_KRB5_MAX_BLOCKSIZE * 2, GFP_NOFS); + if (!data) + return -ENOMEM; /* * For encryption, we want to read from the cleartext @@ -708,6 +721,7 @@ gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf, ret = write_bytes_to_xdr_buf(buf, offset, data, len); out: + kfree(data); return ret; } diff --git a/net/sunrpc/auth_gss/svcauth_gss.c b/net/sunrpc/auth_gss/svcauth_gss.c index d67f7e1..45662d7 100644 --- a/net/sunrpc/auth_gss/svcauth_gss.c +++ b/net/sunrpc/auth_gss/svcauth_gss.c @@ -718,30 +718,37 @@ gss_write_null_verf(struct svc_rqst *rqstp) static int gss_write_verf(struct svc_rqst *rqstp, struct gss_ctx *ctx_id, u32 seq) { - __be32 xdr_seq; + __be32 *xdr_seq; u32 maj_stat; struct xdr_buf verf_data; struct xdr_netobj mic; __be32 *p; struct kvec iov; + int err = -1; svc_putnl(rqstp->rq_res.head, RPC_AUTH_GSS); - xdr_seq = htonl(seq); + xdr_seq = kmalloc(4, GFP_KERNEL); + if (!xdr_seq) + return -1; + *xdr_seq = htonl(seq); - iov.iov_base = &xdr_seq; - iov.iov_len = sizeof(xdr_seq); + iov.iov_base = xdr_seq; + iov.iov_len = 4; xdr_buf_from_iov(&iov, &verf_data); p = rqstp->rq_res.head->iov_base + rqstp->rq_res.head->iov_len; mic.data = (u8 *)(p + 1); maj_stat = gss_get_mic(ctx_id, &verf_data, &mic); if (maj_stat != GSS_S_COMPLETE) - return -1; + goto out; *p++ = htonl(mic.len); memset((u8 *)p + mic.len, 0, round_up_to_quad(mic.len) - mic.len); p += XDR_QUADLEN(mic.len); if (!xdr_ressize_check(rqstp, p)) - return -1; - return 0; + goto out; + err = 0; +out: + kfree(xdr_seq); + return err; } struct gss_domain { diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 34dd7b2..62a4827 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -2753,14 +2753,18 @@ EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout); void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt) { + rcu_read_lock(); xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch)); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put); void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt) { + rcu_read_lock(); rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch), xprt); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt); @@ -2770,9 +2774,8 @@ bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt, struct rpc_xprt_switch *xps; bool ret; - xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); - rcu_read_lock(); + xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch); ret = rpc_xprt_switch_has_addr(xps, sap); rcu_read_unlock(); return ret; diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 2109495..26b26be 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -44,18 +44,20 @@ * being done. * * When the underlying transport disconnects, MRs are left in one of - * three states: + * four states: * * INVALID: The MR was not in use before the QP entered ERROR state. - * (Or, the LOCAL_INV WR has not completed or flushed yet). - * - * STALE: The MR was being registered or unregistered when the QP - * entered ERROR state, and the pending WR was flushed. * * VALID: The MR was registered before the QP entered ERROR state. * - * When frwr_op_map encounters STALE and VALID MRs, they are recovered - * with ib_dereg_mr and then are re-initialized. Beause MR recovery + * FLUSHED_FR: The MR was being registered when the QP entered ERROR + * state, and the pending WR was flushed. + * + * FLUSHED_LI: The MR was being invalidated when the QP entered ERROR + * state, and the pending WR was flushed. + * + * When frwr_op_map encounters FLUSHED and VALID MRs, they are recovered + * with ib_dereg_mr and then are re-initialized. Because MR recovery * allocates fresh resources, it is deferred to a workqueue, and the * recovered MRs are placed back on the rb_mws list when recovery is * complete. frwr_op_map allocates another MR for the current RPC while @@ -177,12 +179,15 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r) static void frwr_op_recover_mr(struct rpcrdma_mw *mw) { + enum rpcrdma_frmr_state state = mw->frmr.fr_state; struct rpcrdma_xprt *r_xprt = mw->mw_xprt; struct rpcrdma_ia *ia = &r_xprt->rx_ia; int rc; rc = __frwr_reset_mr(ia, mw); - ib_dma_unmap_sg(ia->ri_device, mw->mw_sg, mw->mw_nents, mw->mw_dir); + if (state != FRMR_FLUSHED_LI) + ib_dma_unmap_sg(ia->ri_device, + mw->mw_sg, mw->mw_nents, mw->mw_dir); if (rc) goto out_release; @@ -262,10 +267,8 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt) } static void -__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr, - const char *wr) +__frwr_sendcompletion_flush(struct ib_wc *wc, const char *wr) { - frmr->fr_state = FRMR_IS_STALE; if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("rpcrdma: %s: %s (%u/0x%x)\n", wr, ib_wc_status_msg(wc->status), @@ -288,7 +291,8 @@ frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) { cqe = wc->wr_cqe; frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - __frwr_sendcompletion_flush(wc, frmr, "fastreg"); + frmr->fr_state = FRMR_FLUSHED_FR; + __frwr_sendcompletion_flush(wc, "fastreg"); } } @@ -308,7 +312,8 @@ frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) { cqe = wc->wr_cqe; frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - __frwr_sendcompletion_flush(wc, frmr, "localinv"); + frmr->fr_state = FRMR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); } } @@ -328,8 +333,10 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc) /* WARNING: Only wr_cqe and status are reliable at this point */ cqe = wc->wr_cqe; frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe); - if (wc->status != IB_WC_SUCCESS) - __frwr_sendcompletion_flush(wc, frmr, "localinv"); + if (wc->status != IB_WC_SUCCESS) { + frmr->fr_state = FRMR_FLUSHED_LI; + __frwr_sendcompletion_flush(wc, "localinv"); + } complete(&frmr->fr_linv_done); } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 2d8545c..20027f8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -177,18 +177,26 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } + /* svc_rdma_sendto releases this page */ page = alloc_page(RPCRDMA_DEF_GFP); if (!page) return -ENOMEM; - rqst->rq_buffer = page_address(page); + + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + if (!rqst->rq_rbuffer) { + put_page(page); + return -ENOMEM; + } return 0; } static void xprt_rdma_bc_free(struct rpc_task *task) { - /* No-op: ctxt and page have already been freed. */ + struct rpc_rqst *rqst = task->tk_rqstp; + + kfree(rqst->rq_rbuffer); } static int diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 0d35b76..6e1bba3 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -216,7 +216,8 @@ struct rpcrdma_rep { enum rpcrdma_frmr_state { FRMR_IS_INVALID, /* ready to be used */ FRMR_IS_VALID, /* in use */ - FRMR_IS_STALE, /* failed completion */ + FRMR_FLUSHED_FR, /* flushed FASTREG WR */ + FRMR_FLUSHED_LI, /* flushed LOCALINV WR */ }; struct rpcrdma_frmr { diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 7178d0a..af392d9 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2563,6 +2563,7 @@ static int bc_malloc(struct rpc_task *task) buf->len = PAGE_SIZE; rqst->rq_buffer = buf->data; + rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize; return 0; } diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 8762018..6a705d0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2812,7 +2812,8 @@ static int unix_seq_show(struct seq_file *seq, void *v) i++; } for ( ; i < len; i++) - seq_putc(seq, u->addr->name->sun_path[i]); + seq_putc(seq, u->addr->name->sun_path[i] ?: + '@'); } unix_state_unlock(s); seq_putc(seq, '\n'); |