diff options
Diffstat (limited to 'net')
86 files changed, 743 insertions, 463 deletions
diff --git a/net/atm/clip.c b/net/atm/clip.c index c12c258..127fe70 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -46,8 +46,8 @@ static struct net_device *clip_devs; static struct atm_vcc *atmarpd; -static struct neigh_table clip_tbl; static struct timer_list idle_timer; +static const struct neigh_ops clip_neigh_ops; static int to_atmarpd(enum atmarp_ctrl_type type, int itf, __be32 ip) { @@ -123,6 +123,8 @@ static int neigh_check_cb(struct neighbour *n) struct atmarp_entry *entry = neighbour_priv(n); struct clip_vcc *cv; + if (n->ops != &clip_neigh_ops) + return 0; for (cv = entry->vccs; cv; cv = cv->next) { unsigned long exp = cv->last_use + cv->idle_timeout; @@ -154,10 +156,10 @@ static int neigh_check_cb(struct neighbour *n) static void idle_timer_check(unsigned long dummy) { - write_lock(&clip_tbl.lock); - __neigh_for_each_release(&clip_tbl, neigh_check_cb); + write_lock(&arp_tbl.lock); + __neigh_for_each_release(&arp_tbl, neigh_check_cb); mod_timer(&idle_timer, jiffies + CLIP_CHECK_INTERVAL * HZ); - write_unlock(&clip_tbl.lock); + write_unlock(&arp_tbl.lock); } static int clip_arp_rcv(struct sk_buff *skb) diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ef92864..72eb187 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -71,19 +71,16 @@ static const char *const bt_slock_key_strings[BT_MAX_PROTO] = { "slock-AF_BLUETOOTH-BTPROTO_AVDTP", }; -static inline void bt_sock_reclassify_lock(struct socket *sock, int proto) +void bt_sock_reclassify_lock(struct sock *sk, int proto) { - struct sock *sk = sock->sk; - - if (!sk) - return; - + BUG_ON(!sk); BUG_ON(sock_owned_by_user(sk)); sock_lock_init_class_and_name(sk, bt_slock_key_strings[proto], &bt_slock_key[proto], bt_key_strings[proto], &bt_lock_key[proto]); } +EXPORT_SYMBOL(bt_sock_reclassify_lock); int bt_sock_register(int proto, const struct net_proto_family *ops) { @@ -145,7 +142,8 @@ static int bt_sock_create(struct net *net, struct socket *sock, int proto, if (bt_proto[proto] && try_module_get(bt_proto[proto]->owner)) { err = bt_proto[proto]->create(net, sock, proto, kern); - bt_sock_reclassify_lock(sock, proto); + if (!err) + bt_sock_reclassify_lock(sock->sk, proto); module_put(bt_proto[proto]->owner); } diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 3db4324..07bc69e 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -635,6 +635,10 @@ static int hci_conn_auth(struct hci_conn *conn, __u8 sec_level, __u8 auth_type) if (!test_and_set_bit(HCI_CONN_AUTH_PEND, &conn->pend)) { struct hci_cp_auth_requested cp; + + /* encrypt must be pending if auth is also pending */ + set_bit(HCI_CONN_ENCRYPT_PEND, &conn->pend); + cp.handle = cpu_to_le16(conn->handle); hci_send_cmd(conn->hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 845da3e..5aeb624 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -55,7 +55,7 @@ #define AUTO_OFF_TIMEOUT 2000 -int enable_hs; +bool enable_hs; static void hci_rx_work(struct work_struct *work); static void hci_cmd_work(struct work_struct *work); @@ -640,7 +640,8 @@ static int hci_dev_do_close(struct hci_dev *hdev) /* Reset device */ skb_queue_purge(&hdev->cmd_q); atomic_set(&hdev->cmd_cnt, 1); - if (!test_bit(HCI_RAW, &hdev->flags)) { + if (!test_bit(HCI_RAW, &hdev->flags) && + test_bit(HCI_QUIRK_NO_RESET, &hdev->quirks)) { set_bit(HCI_INIT, &hdev->flags); __hci_request(hdev, hci_reset_req, 0, msecs_to_jiffies(250)); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index faf0b11a..32d338c 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -1018,10 +1018,10 @@ static void l2cap_conn_del(struct hci_conn *hcon, int err) hci_chan_del(conn->hchan); if (conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work_sync(&conn->info_timer); if (test_and_clear_bit(HCI_CONN_LE_SMP_PEND, &hcon->pend)) { - __cancel_delayed_work(&conn->security_timer); + cancel_delayed_work_sync(&conn->security_timer); smp_chan_destroy(conn); } @@ -1120,7 +1120,7 @@ static struct l2cap_chan *l2cap_global_chan_by_psm(int state, __le16 psm, bdaddr return c1; } -inline int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst) +int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid, bdaddr_t *dst) { struct sock *sk = chan->sk; bdaddr_t *src = &bt_sk(sk)->src; @@ -2574,7 +2574,7 @@ static inline int l2cap_command_rej(struct l2cap_conn *conn, struct l2cap_cmd_hd if ((conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_SENT) && cmd->ident == conn->info_ident) { - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work(&conn->info_timer); conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; conn->info_ident = 0; @@ -2970,7 +2970,8 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, struct l2cap_cmd_hdr default: sk->sk_err = ECONNRESET; - __set_chan_timer(chan, L2CAP_DISC_REJ_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_REJ_TIMEOUT)); l2cap_send_disconn_req(conn, chan, ECONNRESET); goto done; } @@ -3120,7 +3121,7 @@ static inline int l2cap_information_rsp(struct l2cap_conn *conn, struct l2cap_cm conn->info_state & L2CAP_INFO_FEAT_MASK_REQ_DONE) return 0; - __cancel_delayed_work(&conn->info_timer); + cancel_delayed_work(&conn->info_timer); if (result != L2CAP_IR_SUCCESS) { conn->info_state |= L2CAP_INFO_FEAT_MASK_REQ_DONE; @@ -4478,7 +4479,8 @@ static inline void l2cap_check_encryption(struct l2cap_chan *chan, u8 encrypt) if (encrypt == 0x00) { if (chan->sec_level == BT_SECURITY_MEDIUM) { __clear_chan_timer(chan); - __set_chan_timer(chan, L2CAP_ENC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_ENC_TIMEOUT)); } else if (chan->sec_level == BT_SECURITY_HIGH) l2cap_chan_close(chan, ECONNREFUSED); } else { @@ -4499,7 +4501,7 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) if (hcon->type == LE_LINK) { smp_distribute_keys(conn, 0); - __cancel_delayed_work(&conn->security_timer); + cancel_delayed_work(&conn->security_timer); } rcu_read_lock(); @@ -4546,7 +4548,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) L2CAP_CONN_REQ, sizeof(req), &req); } else { __clear_chan_timer(chan); - __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_TIMEOUT)); } } else if (chan->state == BT_CONNECT2) { struct l2cap_conn_rsp rsp; @@ -4566,7 +4569,8 @@ int l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) } } else { l2cap_state_change(chan, BT_DISCONN); - __set_chan_timer(chan, L2CAP_DISC_TIMEOUT); + __set_chan_timer(chan, + msecs_to_jiffies(L2CAP_DISC_TIMEOUT)); res = L2CAP_CR_SEC_BLOCK; stat = L2CAP_CS_NO_INFO; } diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c index c61d967..401d942 100644 --- a/net/bluetooth/l2cap_sock.c +++ b/net/bluetooth/l2cap_sock.c @@ -849,6 +849,8 @@ static struct l2cap_chan *l2cap_sock_new_connection_cb(void *data) if (!sk) return NULL; + bt_sock_reclassify_lock(sk, BTPROTO_L2CAP); + l2cap_sock_init(sk, parent); return l2cap_pi(sk)->chan; @@ -1002,7 +1004,7 @@ static struct sock *l2cap_sock_alloc(struct net *net, struct socket *sock, int p INIT_LIST_HEAD(&bt_sk(sk)->accept_q); sk->sk_destruct = l2cap_sock_destruct; - sk->sk_sndtimeo = L2CAP_CONN_TIMEOUT; + sk->sk_sndtimeo = msecs_to_jiffies(L2CAP_CONN_TIMEOUT); sock_reset_flag(sk, SOCK_ZAPPED); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 501649bf..8a60238 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1164,12 +1164,18 @@ static int rfcomm_recv_ua(struct rfcomm_session *s, u8 dlci) break; case BT_DISCONN: - /* When socket is closed and we are not RFCOMM - * initiator rfcomm_process_rx already calls - * rfcomm_session_put() */ - if (s->sock->sk->sk_state != BT_CLOSED) - if (list_empty(&s->dlcs)) - rfcomm_session_put(s); + /* rfcomm_session_put is called later so don't do + * anything here otherwise we will mess up the session + * reference counter: + * + * (a) when we are the initiator dlc_unlink will drive + * the reference counter to 0 (there is no initial put + * after session_add) + * + * (b) when we are not the initiator rfcomm_rx_process + * will explicitly call put to balance the initial hold + * done after session add. + */ break; } } diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index f066678..22169c3 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -956,6 +956,8 @@ int rfcomm_connect_ind(struct rfcomm_session *s, u8 channel, struct rfcomm_dlc * if (!sk) goto done; + bt_sock_reclassify_lock(sk, BTPROTO_RFCOMM); + rfcomm_sock_init(sk, parent); bacpy(&bt_sk(sk)->src, &src); bacpy(&bt_sk(sk)->dst, &dst); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 568d5bf..702a1ae 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -446,8 +446,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct net_bridge *br, ip6h->nexthdr = IPPROTO_HOPOPTS; ip6h->hop_limit = 1; ipv6_addr_set(&ip6h->daddr, htonl(0xff020000), 0, 0, htonl(1)); - ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, - &ip6h->saddr); + if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0, + &ip6h->saddr)) { + kfree_skb(skb); + return NULL; + } ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest); hopopt = (u8 *)(ip6h + 1); diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 8412247..dec4f38 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -62,6 +62,15 @@ static int brnf_filter_pppoe_tagged __read_mostly = 0; #define brnf_filter_pppoe_tagged 0 #endif +#define IS_IP(skb) \ + (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IP)) + +#define IS_IPV6(skb) \ + (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_IPV6)) + +#define IS_ARP(skb) \ + (!vlan_tx_tag_present(skb) && skb->protocol == htons(ETH_P_ARP)) + static inline __be16 vlan_proto(const struct sk_buff *skb) { if (vlan_tx_tag_present(skb)) @@ -639,8 +648,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, return NF_DROP; br = p->br; - if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || - IS_PPPOE_IPV6(skb)) { + if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) { if (!brnf_call_ip6tables && !br->nf_call_ip6tables) return NF_ACCEPT; @@ -651,8 +659,7 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, if (!brnf_call_iptables && !br->nf_call_iptables) return NF_ACCEPT; - if (skb->protocol != htons(ETH_P_IP) && !IS_VLAN_IP(skb) && - !IS_PPPOE_IP(skb)) + if (!IS_IP(skb) && !IS_VLAN_IP(skb) && !IS_PPPOE_IP(skb)) return NF_ACCEPT; nf_bridge_pull_encap_header_rcsum(skb); @@ -701,7 +708,7 @@ static int br_nf_forward_finish(struct sk_buff *skb) struct nf_bridge_info *nf_bridge = skb->nf_bridge; struct net_device *in; - if (skb->protocol != htons(ETH_P_ARP) && !IS_VLAN_ARP(skb)) { + if (!IS_ARP(skb) && !IS_VLAN_ARP(skb)) { in = nf_bridge->physindev; if (nf_bridge->mask & BRNF_PKT_TYPE) { skb->pkt_type = PACKET_OTHERHOST; @@ -718,6 +725,7 @@ static int br_nf_forward_finish(struct sk_buff *skb) return 0; } + /* This is the 'purely bridged' case. For IP, we pass the packet to * netfilter with indev and outdev set to the bridge device, * but we are still able to filter on the 'real' indev/outdev @@ -744,11 +752,9 @@ static unsigned int br_nf_forward_ip(unsigned int hook, struct sk_buff *skb, if (!parent) return NF_DROP; - if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || - IS_PPPOE_IP(skb)) + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) pf = PF_INET; - else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || - IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) pf = PF_INET6; else return NF_ACCEPT; @@ -795,7 +801,7 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb, if (!brnf_call_arptables && !br->nf_call_arptables) return NF_ACCEPT; - if (skb->protocol != htons(ETH_P_ARP)) { + if (!IS_ARP(skb)) { if (!IS_VLAN_ARP(skb)) return NF_ACCEPT; nf_bridge_pull_encap_header(skb); @@ -853,11 +859,9 @@ static unsigned int br_nf_post_routing(unsigned int hook, struct sk_buff *skb, if (!realoutdev) return NF_DROP; - if (skb->protocol == htons(ETH_P_IP) || IS_VLAN_IP(skb) || - IS_PPPOE_IP(skb)) + if (IS_IP(skb) || IS_VLAN_IP(skb) || IS_PPPOE_IP(skb)) pf = PF_INET; - else if (skb->protocol == htons(ETH_P_IPV6) || IS_VLAN_IPV6(skb) || - IS_PPPOE_IPV6(skb)) + else if (IS_IPV6(skb) || IS_VLAN_IPV6(skb) || IS_PPPOE_IPV6(skb)) pf = PF_INET6; else return NF_ACCEPT; diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index dd147d7..8c836d9 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -17,9 +17,9 @@ #include "br_private_stp.h" /* since time values in bpdu are in jiffies and then scaled (1/256) - * before sending, make sure that is at least one. + * before sending, make sure that is at least one STP tick. */ -#define MESSAGE_AGE_INCR ((HZ < 256) ? 1 : (HZ/256)) +#define MESSAGE_AGE_INCR ((HZ / 256) + 1) static const char *const br_port_state_names[] = { [BR_STATE_DISABLED] = "disabled", @@ -31,7 +31,7 @@ static const char *const br_port_state_names[] = { void br_log_state(const struct net_bridge_port *p) { - br_info(p->br, "port %u(%s) entering %s state\n", + br_info(p->br, "port %u(%s) entered %s state\n", (unsigned) p->port_no, p->dev->name, br_port_state_names[p->state]); } @@ -186,7 +186,7 @@ static void br_record_config_information(struct net_bridge_port *p, p->designated_cost = bpdu->root_path_cost; p->designated_bridge = bpdu->bridge_id; p->designated_port = bpdu->port_id; - p->designated_age = jiffies + bpdu->message_age; + p->designated_age = jiffies - bpdu->message_age; mod_timer(&p->message_age_timer, jiffies + (p->br->max_age - bpdu->message_age)); diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 19308e3..f494496 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -98,14 +98,13 @@ void br_stp_disable_port(struct net_bridge_port *p) struct net_bridge *br = p->br; int wasroot; - br_log_state(p); - wasroot = br_is_root_bridge(br); br_become_designated_port(p); p->state = BR_STATE_DISABLED; p->topology_change_ack = 0; p->config_pending = 0; + br_log_state(p); br_ifinfo_notify(RTM_NEWLINK, p); del_timer(&p->message_age_timer); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 5864cc4..5fe2ff3 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1335,7 +1335,12 @@ static inline int ebt_make_matchname(const struct ebt_entry_match *m, const char *base, char __user *ubase) { char __user *hlp = ubase + ((char *)m - base); - if (copy_to_user(hlp, m->u.match->name, EBT_FUNCTION_MAXNAMELEN)) + char name[EBT_FUNCTION_MAXNAMELEN] = {}; + + /* ebtables expects 32 bytes long names but xt_match names are 29 bytes + long. Copy 29 bytes and fill remaining bytes with zeroes. */ + strncpy(name, m->u.match->name, sizeof(name)); + if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } @@ -1344,7 +1349,10 @@ static inline int ebt_make_watchername(const struct ebt_entry_watcher *w, const char *base, char __user *ubase) { char __user *hlp = ubase + ((char *)w - base); - if (copy_to_user(hlp , w->u.watcher->name, EBT_FUNCTION_MAXNAMELEN)) + char name[EBT_FUNCTION_MAXNAMELEN] = {}; + + strncpy(name, w->u.watcher->name, sizeof(name)); + if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } @@ -1355,6 +1363,7 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) int ret; char __user *hlp; const struct ebt_entry_target *t; + char name[EBT_FUNCTION_MAXNAMELEN] = {}; if (e->bitmask == 0) return 0; @@ -1368,7 +1377,8 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase) ret = EBT_WATCHER_ITERATE(e, ebt_make_watchername, base, ubase); if (ret != 0) return ret; - if (copy_to_user(hlp, t->u.target->name, EBT_FUNCTION_MAXNAMELEN)) + strncpy(name, t->u.target->name, sizeof(name)); + if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN)) return -EFAULT; return 0; } @@ -1893,10 +1903,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, switch (compat_mwt) { case EBT_COMPAT_MATCH: - match = try_then_request_module(xt_find_match(NFPROTO_BRIDGE, - name, 0), "ebt_%s", name); - if (match == NULL) - return -ENOENT; + match = xt_request_find_match(NFPROTO_BRIDGE, name, 0); if (IS_ERR(match)) return PTR_ERR(match); @@ -1915,10 +1922,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt, break; case EBT_COMPAT_WATCHER: /* fallthrough */ case EBT_COMPAT_TARGET: - wt = try_then_request_module(xt_find_target(NFPROTO_BRIDGE, - name, 0), "ebt_%s", name); - if (wt == NULL) - return -ENOENT; + wt = xt_request_find_target(NFPROTO_BRIDGE, name, 0); if (IS_ERR(wt)) return PTR_ERR(wt); off = xt_compat_target_offset(wt); diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 673728a..82c5706 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -59,8 +59,6 @@ struct cfcnfg *get_cfcnfg(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); - if (!caifn) - return NULL; return caifn->cfg; } EXPORT_SYMBOL(get_cfcnfg); @@ -69,8 +67,6 @@ static struct caif_device_entry_list *caif_device_list(struct net *net) { struct caif_net *caifn; caifn = net_generic(net, caif_net_id); - if (!caifn) - return NULL; return &caifn->caifdevs; } @@ -99,8 +95,6 @@ static struct caif_device_entry *caif_device_alloc(struct net_device *dev) struct caif_device_entry *caifd; caifdevs = caif_device_list(dev_net(dev)); - if (!caifdevs) - return NULL; caifd = kzalloc(sizeof(*caifd), GFP_KERNEL); if (!caifd) @@ -120,8 +114,6 @@ static struct caif_device_entry *caif_get(struct net_device *dev) struct caif_device_entry_list *caifdevs = caif_device_list(dev_net(dev)); struct caif_device_entry *caifd; - if (!caifdevs) - return NULL; list_for_each_entry_rcu(caifd, &caifdevs->list, list) { if (caifd->netdev == dev) @@ -321,8 +313,6 @@ void caif_enroll_dev(struct net_device *dev, struct caif_dev_common *caifdev, struct caif_device_entry_list *caifdevs; caifdevs = caif_device_list(dev_net(dev)); - if (!cfg || !caifdevs) - return; caifd = caif_device_alloc(dev); if (!caifd) return; @@ -374,8 +364,6 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, cfg = get_cfcnfg(dev_net(dev)); caifdevs = caif_device_list(dev_net(dev)); - if (!cfg || !caifdevs) - return 0; caifd = caif_get(dev); if (caifd == NULL && dev->type != ARPHRD_CAIF) @@ -507,9 +495,6 @@ static struct notifier_block caif_device_notifier = { static int caif_init_net(struct net *net) { struct caif_net *caifn = net_generic(net, caif_net_id); - if (WARN_ON(!caifn)) - return -EINVAL; - INIT_LIST_HEAD(&caifn->caifdevs.list); mutex_init(&caifn->caifdevs.lock); @@ -527,9 +512,6 @@ static void caif_exit_net(struct net *net) caif_device_list(net); struct cfcnfg *cfg = get_cfcnfg(net); - if (!cfg || !caifdevs) - return; - rtnl_lock(); mutex_lock(&caifdevs->lock); @@ -569,7 +551,7 @@ static int __init caif_device_init(void) { int result; - result = register_pernet_device(&caif_net_ops); + result = register_pernet_subsys(&caif_net_ops); if (result) return result; @@ -582,7 +564,7 @@ static int __init caif_device_init(void) static void __exit caif_device_exit(void) { - unregister_pernet_device(&caif_net_ops); + unregister_pernet_subsys(&caif_net_ops); unregister_netdevice_notifier(&caif_device_notifier); dev_remove_pack(&caif_packet_type); } diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index a986280..a97d97a 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -539,8 +539,10 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, pkt = cfpkt_fromnative(CAIF_DIR_OUT, skb); memset(skb->cb, 0, sizeof(struct caif_payload_info)); - if (cf_sk->layer.dn == NULL) + if (cf_sk->layer.dn == NULL) { + kfree_skb(skb); return -EINVAL; + } return cf_sk->layer.dn->transmit(cf_sk->layer.dn, pkt); } @@ -683,10 +685,10 @@ static int caif_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } err = transmit_skb(skb, cf_sk, msg->msg_flags&MSG_DONTWAIT, timeo); - if (err < 0) { - kfree_skb(skb); + if (err < 0) + /* skb is already freed */ goto pipe_err; - } + sent += size; } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 598aafb..ba9cfd4 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -309,7 +309,6 @@ int caif_connect_client(struct net *net, struct caif_connect_request *conn_req, int err; struct cfctrl_link_param param; struct cfcnfg *cfg = get_cfcnfg(net); - caif_assert(cfg != NULL); rcu_read_lock(); err = caif_connect_req_to_link_param(cfg, conn_req, ¶m); diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index b36f24a..94b0861 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -248,7 +248,6 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, { struct cfmuxl *muxl = container_obj(layr); struct cflayer *layer; - int idx; rcu_read_lock(); list_for_each_entry_rcu(layer, &muxl->srvl_list, node) { @@ -257,14 +256,9 @@ static void cfmuxl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, if ((ctrl == _CAIF_CTRLCMD_PHYIF_DOWN_IND || ctrl == CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND) && - layer->id != 0) { - - idx = layer->id % UP_CACHE_SIZE; - spin_lock_bh(&muxl->receive_lock); - RCU_INIT_POINTER(muxl->up_cache[idx], NULL); - list_del_rcu(&layer->node); - spin_unlock_bh(&muxl->receive_lock); - } + layer->id != 0) + cfmuxl_remove_uplayer(layr, layer->id); + /* NOTE: ctrlcmd is not allowed to block */ layer->ctrlcmd(layer, ctrl, phyid); } diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c index 97f70e5..761ad9d 100644 --- a/net/ceph/ceph_common.c +++ b/net/ceph/ceph_common.c @@ -85,8 +85,6 @@ int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid) } else { pr_info("client%lld fsid %pU\n", ceph_client_id(client), fsid); memcpy(&client->fsid, fsid, sizeof(*fsid)); - ceph_debugfs_client_init(client); - client->have_fsid = true; } return 0; } diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 0b62dea..1845cde 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -8,8 +8,8 @@ #include <linux/ceph/mon_client.h> #include <linux/ceph/libceph.h> +#include <linux/ceph/debugfs.h> #include <linux/ceph/decode.h> - #include <linux/ceph/auth.h> /* @@ -340,8 +340,19 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, client->monc.monmap = monmap; kfree(old); + if (!client->have_fsid) { + client->have_fsid = true; + mutex_unlock(&monc->mutex); + /* + * do debugfs initialization without mutex to avoid + * creating a locking dependency + */ + ceph_debugfs_client_init(client); + goto out_unlocked; + } out: mutex_unlock(&monc->mutex); +out_unlocked: wake_up_all(&client->auth_wq); } diff --git a/net/core/dev.c b/net/core/dev.c index 115dee1..6ca32f6 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3500,14 +3500,20 @@ static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; + unsigned int maclen = skb->dev->hard_header_len; for (p = napi->gro_list; p; p = p->next) { unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; diffs |= p->vlan_tci ^ skb->vlan_tci; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); + if (maclen == ETH_HLEN) + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + else if (!diffs) + diffs = memcmp(skb_mac_header(p), + skb_gro_mac_header(skb), + maclen); NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 921aa2b..3f79db1 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1190,6 +1190,8 @@ static noinline_for_stack int ethtool_flash_device(struct net_device *dev, if (!dev->ethtool_ops->flash_device) return -EOPNOTSUPP; + efl.data[ETHTOOL_FLASH_MAX_FILENAME - 1] = 0; + return dev->ethtool_ops->flash_device(dev, &efl); } @@ -1311,6 +1313,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) case ETHTOOL_GRXCSUM: case ETHTOOL_GTXCSUM: case ETHTOOL_GSG: + case ETHTOOL_GSSET_INFO: case ETHTOOL_GSTRINGS: case ETHTOOL_GTSO: case ETHTOOL_GPERMADDR: diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 0985b9b..a225089 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -1,4 +1,5 @@ #include <linux/skbuff.h> +#include <linux/export.h> #include <linux/ip.h> #include <linux/ipv6.h> #include <linux/if_vlan.h> diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e287346..2a83914 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -826,6 +826,8 @@ next_elt: write_unlock_bh(&tbl->lock); cond_resched(); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); } /* Cycle through all hash buckets every base_reachable_time/2 ticks. * ARP entry timeouts range from 1/2 base_reachable_time to 3/2 diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index aefcd7a..0e950fd 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -30,6 +30,20 @@ EXPORT_SYMBOL(init_net); #define INITIAL_NET_GEN_PTRS 13 /* +1 for len +2 for rcu_head */ +static unsigned int max_gen_ptrs = INITIAL_NET_GEN_PTRS; + +static struct net_generic *net_alloc_generic(void) +{ + struct net_generic *ng; + size_t generic_size = offsetof(struct net_generic, ptr[max_gen_ptrs]); + + ng = kzalloc(generic_size, GFP_KERNEL); + if (ng) + ng->len = max_gen_ptrs; + + return ng; +} + static int net_assign_generic(struct net *net, int id, void *data) { struct net_generic *ng, *old_ng; @@ -43,8 +57,7 @@ static int net_assign_generic(struct net *net, int id, void *data) if (old_ng->len >= id) goto assign; - ng = kzalloc(sizeof(struct net_generic) + - id * sizeof(void *), GFP_KERNEL); + ng = net_alloc_generic(); if (ng == NULL) return -ENOMEM; @@ -59,7 +72,6 @@ static int net_assign_generic(struct net *net, int id, void *data) * the old copy for kfree after a grace period. */ - ng->len = id; memcpy(&ng->ptr, &old_ng->ptr, old_ng->len * sizeof(void*)); rcu_assign_pointer(net->gen, ng); @@ -161,18 +173,6 @@ out_undo: goto out; } -static struct net_generic *net_alloc_generic(void) -{ - struct net_generic *ng; - size_t generic_size = sizeof(struct net_generic) + - INITIAL_NET_GEN_PTRS * sizeof(void *); - - ng = kzalloc(generic_size, GFP_KERNEL); - if (ng) - ng->len = INITIAL_NET_GEN_PTRS; - - return ng; -} #ifdef CONFIG_NET_NS static struct kmem_cache *net_cachep; @@ -483,6 +483,7 @@ again: } return error; } + max_gen_ptrs = max_t(unsigned int, max_gen_ptrs, *ops->id); } error = __register_pernet_operations(list, ops); if (error) { diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 556b082..ddefc51 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -194,7 +194,7 @@ static void netpoll_poll_dev(struct net_device *dev) poll_napi(dev); - if (dev->priv_flags & IFF_SLAVE) { + if (dev->flags & IFF_SLAVE) { if (dev->npinfo) { struct net_device *bond_dev = dev->master; struct sk_buff *skb; diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c index 3a9fd48..4dacc44 100644 --- a/net/core/netprio_cgroup.c +++ b/net/core/netprio_cgroup.c @@ -58,11 +58,12 @@ static int get_prioidx(u32 *prio) spin_lock_irqsave(&prioidx_map_lock, flags); prioidx = find_first_zero_bit(prioidx_map, sizeof(unsigned long) * PRIOIDX_SZ); + if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) { + spin_unlock_irqrestore(&prioidx_map_lock, flags); + return -ENOSPC; + } set_bit(prioidx, prioidx_map); spin_unlock_irqrestore(&prioidx_map_lock, flags); - if (prioidx == sizeof(unsigned long) * PRIOIDX_SZ) - return -ENOSPC; - atomic_set(&max_prioidx, prioidx); *prio = prioidx; return 0; @@ -107,7 +108,7 @@ static void extend_netdev_table(struct net_device *dev, u32 new_len) static void update_netdev_tables(void) { struct net_device *dev; - u32 max_len = atomic_read(&max_prioidx); + u32 max_len = atomic_read(&max_prioidx) + 1; struct netprio_map *map; rtnl_lock(); @@ -270,7 +271,6 @@ static int netprio_device_event(struct notifier_block *unused, { struct net_device *dev = ptr; struct netprio_map *old; - u32 max_len = atomic_read(&max_prioidx); /* * Note this is called with rtnl_lock held so we have update side @@ -278,11 +278,6 @@ static int netprio_device_event(struct notifier_block *unused, */ switch (event) { - - case NETDEV_REGISTER: - if (max_len) - extend_netdev_table(dev, max_len); - break; case NETDEV_UNREGISTER: old = rtnl_dereference(dev->priomap); RCU_INIT_POINTER(dev->priomap, NULL); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 65f80c7..4d8ce93 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -767,8 +767,8 @@ done: return i; } -static unsigned long num_arg(const char __user * user_buffer, - unsigned long maxlen, unsigned long *num) +static long num_arg(const char __user *user_buffer, unsigned long maxlen, + unsigned long *num) { int i; *num = 0; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f16444b..f965dce 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -60,7 +60,6 @@ struct rtnl_link { }; static DEFINE_MUTEX(rtnl_mutex); -static u16 min_ifinfo_dump_size; void rtnl_lock(void) { @@ -724,10 +723,11 @@ static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) } /* All VF info */ -static inline int rtnl_vfinfo_size(const struct net_device *dev) +static inline int rtnl_vfinfo_size(const struct net_device *dev, + u32 ext_filter_mask) { - if (dev->dev.parent && dev_is_pci(dev->dev.parent)) { - + if (dev->dev.parent && dev_is_pci(dev->dev.parent) && + (ext_filter_mask & RTEXT_FILTER_VF)) { int num_vfs = dev_num_vf(dev->dev.parent); size_t size = nla_total_size(sizeof(struct nlattr)); size += nla_total_size(num_vfs * sizeof(struct nlattr)); @@ -766,7 +766,8 @@ static size_t rtnl_port_size(const struct net_device *dev) return port_self_size; } -static noinline size_t if_nlmsg_size(const struct net_device *dev) +static noinline size_t if_nlmsg_size(const struct net_device *dev, + u32 ext_filter_mask) { return NLMSG_ALIGN(sizeof(struct ifinfomsg)) + nla_total_size(IFNAMSIZ) /* IFLA_IFNAME */ @@ -784,8 +785,9 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev) + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_OPERSTATE */ + nla_total_size(1) /* IFLA_LINKMODE */ - + nla_total_size(4) /* IFLA_NUM_VF */ - + rtnl_vfinfo_size(dev) /* IFLA_VFINFO_LIST */ + + nla_total_size(ext_filter_mask + & RTEXT_FILTER_VF ? 4 : 0) /* IFLA_NUM_VF */ + + rtnl_vfinfo_size(dev, ext_filter_mask) /* IFLA_VFINFO_LIST */ + rtnl_port_size(dev) /* IFLA_VF_PORTS + IFLA_PORT_SELF */ + rtnl_link_get_size(dev) /* IFLA_LINKINFO */ + rtnl_link_get_af_size(dev); /* IFLA_AF_SPEC */ @@ -868,7 +870,7 @@ static int rtnl_port_fill(struct sk_buff *skb, struct net_device *dev) static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, int type, u32 pid, u32 seq, u32 change, - unsigned int flags) + unsigned int flags, u32 ext_filter_mask) { struct ifinfomsg *ifm; struct nlmsghdr *nlh; @@ -941,10 +943,11 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev, goto nla_put_failure; copy_rtnl_link_stats64(nla_data(attr), stats); - if (dev->dev.parent) + if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF)) NLA_PUT_U32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)); - if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent) { + if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent + && (ext_filter_mask & RTEXT_FILTER_VF)) { int i; struct nlattr *vfinfo, *vf; @@ -1048,6 +1051,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct hlist_head *head; struct hlist_node *node; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; s_h = cb->args[0]; s_idx = cb->args[1]; @@ -1055,6 +1060,13 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) rcu_read_lock(); cb->seq = net->dev_base_seq; + if (nlmsg_parse(cb->nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + ifla_policy) >= 0) { + + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) { idx = 0; head = &net->dev_index_head[h]; @@ -1064,7 +1076,8 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb) if (rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK, NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0, - NLM_F_MULTI) <= 0) + NLM_F_MULTI, + ext_filter_mask) <= 0) goto out; nl_dump_check_consistent(cb, nlmsg_hdr(skb)); @@ -1100,6 +1113,7 @@ const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_VF_PORTS] = { .type = NLA_NESTED }, [IFLA_PORT_SELF] = { .type = NLA_NESTED }, [IFLA_AF_SPEC] = { .type = NLA_NESTED }, + [IFLA_EXT_MASK] = { .type = NLA_U32 }, }; EXPORT_SYMBOL(ifla_policy); @@ -1509,6 +1523,7 @@ errout: if (send_addr_notify) call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); + return err; } @@ -1839,6 +1854,7 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) struct net_device *dev = NULL; struct sk_buff *nskb; int err; + u32 ext_filter_mask = 0; err = nlmsg_parse(nlh, sizeof(*ifm), tb, IFLA_MAX, ifla_policy); if (err < 0) @@ -1847,6 +1863,9 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (tb[IFLA_IFNAME]) nla_strlcpy(ifname, tb[IFLA_IFNAME], IFNAMSIZ); + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + ifm = nlmsg_data(nlh); if (ifm->ifi_index > 0) dev = __dev_get_by_index(net, ifm->ifi_index); @@ -1858,12 +1877,12 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) if (dev == NULL) return -ENODEV; - nskb = nlmsg_new(if_nlmsg_size(dev), GFP_KERNEL); + nskb = nlmsg_new(if_nlmsg_size(dev, ext_filter_mask), GFP_KERNEL); if (nskb == NULL) return -ENOBUFS; err = rtnl_fill_ifinfo(nskb, dev, RTM_NEWLINK, NETLINK_CB(skb).pid, - nlh->nlmsg_seq, 0, 0); + nlh->nlmsg_seq, 0, 0, ext_filter_mask); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size */ WARN_ON(err == -EMSGSIZE); @@ -1874,8 +1893,32 @@ static int rtnl_getlink(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) return err; } -static u16 rtnl_calcit(struct sk_buff *skb) +static u16 rtnl_calcit(struct sk_buff *skb, struct nlmsghdr *nlh) { + struct net *net = sock_net(skb->sk); + struct net_device *dev; + struct nlattr *tb[IFLA_MAX+1]; + u32 ext_filter_mask = 0; + u16 min_ifinfo_dump_size = 0; + + if (nlmsg_parse(nlh, sizeof(struct rtgenmsg), tb, IFLA_MAX, + ifla_policy) >= 0) { + if (tb[IFLA_EXT_MASK]) + ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]); + } + + if (!ext_filter_mask) + return NLMSG_GOODSIZE; + /* + * traverse the list of net devices and compute the minimum + * buffer size based upon the filter mask. + */ + list_for_each_entry(dev, &net->dev_base_head, dev_list) { + min_ifinfo_dump_size = max_t(u16, min_ifinfo_dump_size, + if_nlmsg_size(dev, + ext_filter_mask)); + } + return min_ifinfo_dump_size; } @@ -1910,13 +1953,11 @@ void rtmsg_ifinfo(int type, struct net_device *dev, unsigned change) int err = -ENOBUFS; size_t if_info_size; - skb = nlmsg_new((if_info_size = if_nlmsg_size(dev)), GFP_KERNEL); + skb = nlmsg_new((if_info_size = if_nlmsg_size(dev, 0)), GFP_KERNEL); if (skb == NULL) goto errout; - min_ifinfo_dump_size = max_t(u16, if_info_size, min_ifinfo_dump_size); - - err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0); + err = rtnl_fill_ifinfo(skb, dev, type, 0, 0, change, 0, 0); if (err < 0) { /* -EMSGSIZE implies BUG in if_nlmsg_size() */ WARN_ON(err == -EMSGSIZE); @@ -1974,7 +2015,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) return -EOPNOTSUPP; calcit = rtnl_get_calcit(family, type); if (calcit) - min_dump_alloc = calcit(skb); + min_dump_alloc = calcit(skb, nlh); __rtnl_unlock(); rtnl = net->rtnl; diff --git a/net/core/sock.c b/net/core/sock.c index 5c5af998..02f8dfe 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1171,13 +1171,10 @@ EXPORT_SYMBOL(sock_update_classid); void sock_update_netprioidx(struct sock *sk) { - struct cgroup_netprio_state *state; if (in_interrupt()) return; - rcu_read_lock(); - state = task_netprio_state(current); - sk->sk_cgrp_prioidx = state ? state->prioidx : 0; - rcu_read_unlock(); + + sk->sk_cgrp_prioidx = task_netprioidx(current); } EXPORT_SYMBOL_GPL(sock_update_netprioidx); #endif @@ -1827,7 +1824,7 @@ suppress_allocation: /* Alas. Undo changes. */ sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM; - sk_memory_allocated_sub(sk, amt, parent_status); + sk_memory_allocated_sub(sk, amt); return 0; } @@ -1840,7 +1837,7 @@ EXPORT_SYMBOL(__sk_mem_schedule); void __sk_mem_reclaim(struct sock *sk) { sk_memory_allocated_sub(sk, - sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0); + sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT); sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1; if (sk_under_memory_pressure(sk) && diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index aa2a2c7..d183262 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -409,7 +409,7 @@ config INET_TCP_DIAG config INET_UDP_DIAG tristate "UDP: socket monitoring interface" - depends on INET_DIAG + depends on INET_DIAG && (IPV6 || IPV6=n) default n ---help--- Support for UDP socket monitoring interface used by the ss tool. diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 59402be..63e4989 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -863,7 +863,8 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { + (rt->dst.dev != dev && + pneigh_lookup(&arp_tbl, net, &tip, dev, 0)))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 2e4e244..19d66ce 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -123,11 +123,14 @@ again: smallest_size = tb->num_owners; smallest_rover = rover; if (atomic_read(&hashinfo->bsockets) > (high - low) + 1) { - spin_unlock(&head->lock); snum = smallest_rover; - goto have_snum; + goto tb_found; } } + if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb)) { + snum = rover; + goto tb_found; + } goto next; } break; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index bf4a9c4..d4d61b6 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -17,6 +17,7 @@ #include <linux/kernel.h> #include <linux/mm.h> #include <linux/net.h> +#include <linux/workqueue.h> #include <net/ip.h> #include <net/inetpeer.h> #include <net/secure_seq.h> @@ -66,6 +67,11 @@ static struct kmem_cache *peer_cachep __read_mostly; +static LIST_HEAD(gc_list); +static const int gc_delay = 60 * HZ; +static struct delayed_work gc_work; +static DEFINE_SPINLOCK(gc_lock); + #define node_height(x) x->avl_height #define peer_avl_empty ((struct inet_peer *)&peer_fake_node) @@ -102,6 +108,50 @@ int inet_peer_threshold __read_mostly = 65536 + 128; /* start to throw entries m int inet_peer_minttl __read_mostly = 120 * HZ; /* TTL under high load: 120 sec */ int inet_peer_maxttl __read_mostly = 10 * 60 * HZ; /* usual time to live: 10 min */ +static void inetpeer_gc_worker(struct work_struct *work) +{ + struct inet_peer *p, *n; + LIST_HEAD(list); + + spin_lock_bh(&gc_lock); + list_replace_init(&gc_list, &list); + spin_unlock_bh(&gc_lock); + + if (list_empty(&list)) + return; + + list_for_each_entry_safe(p, n, &list, gc_list) { + + if(need_resched()) + cond_resched(); + + if (p->avl_left != peer_avl_empty) { + list_add_tail(&p->avl_left->gc_list, &list); + p->avl_left = peer_avl_empty; + } + + if (p->avl_right != peer_avl_empty) { + list_add_tail(&p->avl_right->gc_list, &list); + p->avl_right = peer_avl_empty; + } + + n = list_entry(p->gc_list.next, struct inet_peer, gc_list); + + if (!atomic_read(&p->refcnt)) { + list_del(&p->gc_list); + kmem_cache_free(peer_cachep, p); + } + } + + if (list_empty(&list)) + return; + + spin_lock_bh(&gc_lock); + list_splice(&list, &gc_list); + spin_unlock_bh(&gc_lock); + + schedule_delayed_work(&gc_work, gc_delay); +} /* Called from ip_output.c:ip_init */ void __init inet_initpeers(void) @@ -126,6 +176,7 @@ void __init inet_initpeers(void) 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); + INIT_DELAYED_WORK_DEFERRABLE(&gc_work, inetpeer_gc_worker); } static int addr_compare(const struct inetpeer_addr *a, @@ -447,9 +498,8 @@ relookup: p->rate_last = 0; p->pmtu_expires = 0; p->pmtu_orig = 0; - p->redirect_genid = 0; memset(&p->redirect_learned, 0, sizeof(p->redirect_learned)); - + INIT_LIST_HEAD(&p->gc_list); /* Link the node. */ link_to_pool(p, base); @@ -509,3 +559,30 @@ bool inet_peer_xrlim_allow(struct inet_peer *peer, int timeout) return rc; } EXPORT_SYMBOL(inet_peer_xrlim_allow); + +void inetpeer_invalidate_tree(int family) +{ + struct inet_peer *old, *new, *prev; + struct inet_peer_base *base = family_to_base(family); + + write_seqlock_bh(&base->lock); + + old = base->root; + if (old == peer_avl_empty_rcu) + goto out; + + new = peer_avl_empty_rcu; + + prev = cmpxchg(&base->root, old, new); + if (prev == old) { + base->total = 0; + spin_lock(&gc_lock); + list_add_tail(&prev->gc_list, &gc_list); + spin_unlock(&gc_lock); + schedule_delayed_work(&gc_work, gc_delay); + } + +out: + write_sequnlock_bh(&base->lock); +} +EXPORT_SYMBOL(inetpeer_invalidate_tree); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 2b53a1f..38673d2 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -65,7 +65,7 @@ it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), and silently drop packet when it expires. It is a good - solution, but it supposes maintaing new variable in ALL + solution, but it supposes maintaining new variable in ALL skb, even if no tunneling is used. Current solution: xmit_recursion breaks dead loops. This is a percpu @@ -91,14 +91,14 @@ One of them is to parse packet trying to detect inner encapsulation made by our node. It is difficult or even impossible, especially, - taking into account fragmentation. TO be short, tt is not solution at all. + taking into account fragmentation. TO be short, ttl is not solution at all. Current solution: The solution was UNEXPECTEDLY SIMPLE. We force DF flag on tunnels with preconfigured hop limit, that is ALL. :-) Well, it does not remove the problem completely, but exponential growth of network traffic is changed to linear (branches, that exceed pmtu are pruned) and tunnel mtu - fastly degrades to value <68, where looping stops. + rapidly degrades to value <68, where looping stops. Yes, it is not good if there exists a router in the loop, which does not force DF, even when encapsulating packets have DF set. But it is not our problem! Nobody could accuse us, we made @@ -422,6 +422,10 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, if (register_netdevice(dev) < 0) goto failed_free; + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + dev_hold(dev); ipgre_tunnel_link(ign, nt); return nt; @@ -453,8 +457,8 @@ static void ipgre_err(struct sk_buff *skb, u32 info) GRE tunnels with enabled checksum. Tell them "thank you". Well, I wonder, rfc1812 was written by Cisco employee, - what the hell these idiots break standrads established - by themself??? + what the hell these idiots break standards established + by themselves??? */ const struct iphdr *iph = (const struct iphdr *)skb->data; diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index 1e60f76..42dd1a9 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -573,8 +573,8 @@ void ip_forward_options(struct sk_buff *skb) } if (srrptr + 3 <= srrspace) { opt->is_changed = 1; - ip_rt_get_source(&optptr[srrptr-1], skb, rt); ip_hdr(skb)->daddr = opt->nexthop; + ip_rt_get_source(&optptr[srrptr-1], skb, rt); optptr[2] = srrptr+4; } else if (net_ratelimit()) printk(KERN_CRIT "ip_forward(): Argh! Destination lost!\n"); diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index aea5a19..b072386 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -630,6 +630,7 @@ static int ping_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, pr_debug("ping_recvmsg(sk=%p,sk->num=%u)\n", isk, isk->inet_num); + err = -EOPNOTSUPP; if (flags & MSG_OOB) goto out; diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 3569d8e..6afc807 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -216,7 +216,6 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPPartialUndo", LINUX_MIB_TCPPARTIALUNDO), SNMP_MIB_ITEM("TCPDSACKUndo", LINUX_MIB_TCPDSACKUNDO), SNMP_MIB_ITEM("TCPLossUndo", LINUX_MIB_TCPLOSSUNDO), - SNMP_MIB_ITEM("TCPLoss", LINUX_MIB_TCPLOSS), SNMP_MIB_ITEM("TCPLostRetransmit", LINUX_MIB_TCPLOSTRETRANSMIT), SNMP_MIB_ITEM("TCPRenoFailures", LINUX_MIB_TCPRENOFAILURES), SNMP_MIB_ITEM("TCPSackFailures", LINUX_MIB_TCPSACKFAILURES), diff --git a/net/ipv4/route.c b/net/ipv4/route.c index bcacf54..0197747 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -132,7 +132,6 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ; static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20; static int ip_rt_min_advmss __read_mostly = 256; static int rt_chain_length_max __read_mostly = 20; -static int redirect_genid; static struct delayed_work expires_work; static unsigned long expires_ljiffies; @@ -937,7 +936,7 @@ static void rt_cache_invalidate(struct net *net) get_random_bytes(&shuffle, sizeof(shuffle)); atomic_add(shuffle + 1U, &net->ipv4.rt_genid); - redirect_genid++; + inetpeer_invalidate_tree(AF_INET); } /* @@ -1485,10 +1484,8 @@ void ip_rt_redirect(__be32 old_gw, __be32 daddr, __be32 new_gw, peer = rt->peer; if (peer) { - if (peer->redirect_learned.a4 != new_gw || - peer->redirect_genid != redirect_genid) { + if (peer->redirect_learned.a4 != new_gw) { peer->redirect_learned.a4 = new_gw; - peer->redirect_genid = redirect_genid; atomic_inc(&__rt_peer_genid); } check_peer_redir(&rt->dst, peer); @@ -1793,8 +1790,6 @@ static void ipv4_validate_peer(struct rtable *rt) if (peer) { check_peer_pmtu(&rt->dst, peer); - if (peer->redirect_genid != redirect_genid) - peer->redirect_learned.a4 = 0; if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) check_peer_redir(&rt->dst, peer); @@ -1958,8 +1953,7 @@ static void rt_init_metrics(struct rtable *rt, const struct flowi4 *fl4, dst_init_metrics(&rt->dst, peer->metrics, false); check_peer_pmtu(&rt->dst, peer); - if (peer->redirect_genid != redirect_genid) - peer->redirect_learned.a4 = 0; + if (peer->redirect_learned.a4 && peer->redirect_learned.a4 != rt->rt_gateway) { rt->rt_gateway = peer->redirect_learned.a4; diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index 51fdbb4..eab2a7f 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -278,6 +278,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, struct rtable *rt; __u8 rcv_wscale; bool ecn_ok = false; + struct flowi4 fl4; if (!sysctl_tcp_syncookies || !th->ack || th->rst) goto out; @@ -346,20 +347,16 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, * hasn't changed since we received the original syn, but I see * no easy way to do this. */ - { - struct flowi4 fl4; - - flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), - RT_SCOPE_UNIVERSE, IPPROTO_TCP, - inet_sk_flowi_flags(sk), - (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, - ireq->loc_addr, th->source, th->dest); - security_req_classify_flow(req, flowi4_to_flowi(&fl4)); - rt = ip_route_output_key(sock_net(sk), &fl4); - if (IS_ERR(rt)) { - reqsk_free(req); - goto out; - } + flowi4_init_output(&fl4, 0, sk->sk_mark, RT_CONN_FLAGS(sk), + RT_SCOPE_UNIVERSE, IPPROTO_TCP, + inet_sk_flowi_flags(sk), + (opt && opt->srr) ? opt->faddr : ireq->rmt_addr, + ireq->loc_addr, th->source, th->dest); + security_req_classify_flow(req, flowi4_to_flowi(&fl4)); + rt = ip_route_output_key(sock_net(sk), &fl4); + if (IS_ERR(rt)) { + reqsk_free(req); + goto out; } /* Try to redo what tcp_v4_send_synack did. */ @@ -373,5 +370,10 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb, ireq->rcv_wscale = rcv_wscale; ret = get_cookie_sock(sk, skb, req, &rt->dst); + /* ip_queue_xmit() depends on our flow being setup + * Normal sockets get it right from inet_csk_route_child_sock() + */ + if (ret) + inet_sk(ret)->cork.fl.u.ip4 = fl4; out: return ret; } diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 4aa7e9d..7a7724d 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -778,7 +778,6 @@ EXPORT_SYMBOL_GPL(net_ipv4_ctl_path); static __net_init int ipv4_sysctl_init_net(struct net *net) { struct ctl_table *table; - unsigned long limit; table = ipv4_net_table; if (!net_eq(net, &init_net)) { @@ -814,11 +813,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) net->ipv4.sysctl_rt_cache_rebuild_count = 4; - limit = nr_free_buffer_pages() / 8; - limit = max(limit, 128UL); - net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; - net->ipv4.sysctl_tcp_mem[1] = limit; - net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; + tcp_init_mem(net); net->ipv4.ipv4_hdr = register_net_sysctl_table(net, net_ipv4_ctl_path, table); diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 9bcdec3..22ef5f9 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1876,6 +1876,20 @@ void tcp_shutdown(struct sock *sk, int how) } EXPORT_SYMBOL(tcp_shutdown); +bool tcp_check_oom(struct sock *sk, int shift) +{ + bool too_many_orphans, out_of_socket_memory; + + too_many_orphans = tcp_too_many_orphans(sk, shift); + out_of_socket_memory = tcp_out_of_memory(sk); + + if (too_many_orphans && net_ratelimit()) + pr_info("TCP: too many orphaned sockets\n"); + if (out_of_socket_memory && net_ratelimit()) + pr_info("TCP: out of memory -- consider tuning tcp_mem\n"); + return too_many_orphans || out_of_socket_memory; +} + void tcp_close(struct sock *sk, long timeout) { struct sk_buff *skb; @@ -2015,10 +2029,7 @@ adjudge_to_death: } if (sk->sk_state != TCP_CLOSE) { sk_mem_reclaim(sk); - if (tcp_too_many_orphans(sk, 0)) { - if (net_ratelimit()) - printk(KERN_INFO "TCP: too many of orphaned " - "sockets\n"); + if (tcp_check_oom(sk, 0)) { tcp_set_state(sk, TCP_CLOSE); tcp_send_active_reset(sk, GFP_ATOMIC); NET_INC_STATS_BH(sock_net(sk), @@ -3216,11 +3227,21 @@ static int __init set_thash_entries(char *str) } __setup("thash_entries=", set_thash_entries); +void tcp_init_mem(struct net *net) +{ + unsigned long limit = nr_free_buffer_pages() / 8; + limit = max(limit, 128UL); + net->ipv4.sysctl_tcp_mem[0] = limit / 4 * 3; + net->ipv4.sysctl_tcp_mem[1] = limit; + net->ipv4.sysctl_tcp_mem[2] = net->ipv4.sysctl_tcp_mem[0] * 2; +} + void __init tcp_init(void) { struct sk_buff *skb = NULL; unsigned long limit; - int i, max_share, cnt; + int max_share, cnt; + unsigned int i; unsigned long jiffy = jiffies; BUILD_BUG_ON(sizeof(struct tcp_skb_cb) > sizeof(skb->cb)); @@ -3263,7 +3284,7 @@ void __init tcp_init(void) &tcp_hashinfo.bhash_size, NULL, 64 * 1024); - tcp_hashinfo.bhash_size = 1 << tcp_hashinfo.bhash_size; + tcp_hashinfo.bhash_size = 1U << tcp_hashinfo.bhash_size; for (i = 0; i < tcp_hashinfo.bhash_size; i++) { spin_lock_init(&tcp_hashinfo.bhash[i].lock); INIT_HLIST_HEAD(&tcp_hashinfo.bhash[i].chain); @@ -3276,9 +3297,10 @@ void __init tcp_init(void) sysctl_tcp_max_orphans = cnt / 2; sysctl_max_syn_backlog = max(128, cnt / 256); + tcp_init_mem(&init_net); /* Set per-socket limits to no more than 1/128 the pressure threshold */ - limit = ((unsigned long)init_net.ipv4.sysctl_tcp_mem[1]) - << (PAGE_SHIFT - 7); + limit = nr_free_buffer_pages() << (PAGE_SHIFT - 10); + limit = max(limit, 128UL); max_share = min(4UL*1024*1024, limit); sysctl_tcp_wmem[0] = SK_MEM_QUANTUM; diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 6187eb4..f45e1c2 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -63,7 +63,6 @@ static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; - ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->epoch_start = 0; @@ -72,7 +71,11 @@ static inline void bictcp_reset(struct bictcp *ca) static void bictcp_init(struct sock *sk) { - bictcp_reset(inet_csk_ca(sk)); + struct bictcp *ca = inet_csk_ca(sk); + + bictcp_reset(ca); + ca->loss_cwnd = 0; + if (initial_ssthresh) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } @@ -127,7 +130,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } /* if in slow start or link utilization is very low */ - if (ca->loss_cwnd == 0) { + if (ca->last_max_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } @@ -185,7 +188,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) { const struct tcp_sock *tp = tcp_sk(sk); const struct bictcp *ca = inet_csk_ca(sk); - return max(tp->snd_cwnd, ca->last_max_cwnd); + return max(tp->snd_cwnd, ca->loss_cwnd); } static void bictcp_state(struct sock *sk, u8 new_state) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index f376b05..a9077f4 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -107,7 +107,6 @@ static inline void bictcp_reset(struct bictcp *ca) { ca->cnt = 0; ca->last_max_cwnd = 0; - ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; ca->bic_origin_point = 0; @@ -142,7 +141,10 @@ static inline void bictcp_hystart_reset(struct sock *sk) static void bictcp_init(struct sock *sk) { - bictcp_reset(inet_csk_ca(sk)); + struct bictcp *ca = inet_csk_ca(sk); + + bictcp_reset(ca); + ca->loss_cwnd = 0; if (hystart) bictcp_hystart_reset(sk); @@ -275,7 +277,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) * The initial growth of cubic function may be too conservative * when the available bandwidth is still unknown. */ - if (ca->loss_cwnd == 0 && ca->cnt > 20) + if (ca->last_max_cwnd == 0 && ca->cnt > 20) ca->cnt = 20; /* increase cwnd 5% per RTT */ /* TCP Friendly */ @@ -342,7 +344,7 @@ static u32 bictcp_undo_cwnd(struct sock *sk) { struct bictcp *ca = inet_csk_ca(sk); - return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); + return max(tcp_sk(sk)->snd_cwnd, ca->loss_cwnd); } static void bictcp_state(struct sock *sk, u8 new_state) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 2877c3e..b5e315f 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -105,7 +105,6 @@ int sysctl_tcp_abc __read_mostly; #define FLAG_SYN_ACKED 0x10 /* This ACK acknowledged SYN. */ #define FLAG_DATA_SACKED 0x20 /* New SACK. */ #define FLAG_ECE 0x40 /* ECE in this ACK */ -#define FLAG_DATA_LOST 0x80 /* SACK detected data lossage. */ #define FLAG_SLOWPATH 0x100 /* Do not skip RFC checks for window update.*/ #define FLAG_ONLY_ORIG_SACKED 0x200 /* SACKs only non-rexmit sent before RTO */ #define FLAG_SND_UNA_ADVANCED 0x400 /* Snd_una was changed (!= FLAG_DATA_ACKED) */ @@ -1040,13 +1039,11 @@ static void tcp_skb_mark_lost_uncond_verify(struct tcp_sock *tp, * These 6 states form finite state machine, controlled by the following events: * 1. New ACK (+SACK) arrives. (tcp_sacktag_write_queue()) * 2. Retransmission. (tcp_retransmit_skb(), tcp_xmit_retransmit_queue()) - * 3. Loss detection event of one of three flavors: + * 3. Loss detection event of two flavors: * A. Scoreboard estimator decided the packet is lost. * A'. Reno "three dupacks" marks head of queue lost. - * A''. Its FACK modfication, head until snd.fack is lost. - * B. SACK arrives sacking data transmitted after never retransmitted - * hole was sent out. - * C. SACK arrives sacking SND.NXT at the moment, when the + * A''. Its FACK modification, head until snd.fack is lost. + * B. SACK arrives sacking SND.NXT at the moment, when the * segment was retransmitted. * 4. D-SACK added new rule: D-SACK changes any tag to S. * @@ -1153,7 +1150,7 @@ static int tcp_is_sackblock_valid(struct tcp_sock *tp, int is_dsack, } /* Check for lost retransmit. This superb idea is borrowed from "ratehalving". - * Event "C". Later note: FACK people cheated me again 8), we have to account + * Event "B". Later note: FACK people cheated me again 8), we have to account * for reordering! Ugly, but should help. * * Search retransmitted skbs from write_queue that were sent when snd_nxt was @@ -1310,25 +1307,26 @@ static int tcp_match_skb_to_sack(struct sock *sk, struct sk_buff *skb, return in_sack; } -static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, - struct tcp_sacktag_state *state, +/* Mark the given newly-SACKed range as such, adjusting counters and hints. */ +static u8 tcp_sacktag_one(struct sock *sk, + struct tcp_sacktag_state *state, u8 sacked, + u32 start_seq, u32 end_seq, int dup_sack, int pcount) { struct tcp_sock *tp = tcp_sk(sk); - u8 sacked = TCP_SKB_CB(skb)->sacked; int fack_count = state->fack_count; /* Account D-SACK for retransmitted packet. */ if (dup_sack && (sacked & TCPCB_RETRANS)) { if (tp->undo_marker && tp->undo_retrans && - after(TCP_SKB_CB(skb)->end_seq, tp->undo_marker)) + after(end_seq, tp->undo_marker)) tp->undo_retrans--; if (sacked & TCPCB_SACKED_ACKED) state->reord = min(fack_count, state->reord); } /* Nothing to do; acked frame is about to be dropped (was ACKed). */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->snd_una)) + if (!after(end_seq, tp->snd_una)) return sacked; if (!(sacked & TCPCB_SACKED_ACKED)) { @@ -1347,13 +1345,13 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, /* New sack for not retransmitted frame, * which was in hole. It is reordering. */ - if (before(TCP_SKB_CB(skb)->seq, + if (before(start_seq, tcp_highest_sack_seq(tp))) state->reord = min(fack_count, state->reord); /* SACK enhanced F-RTO (RFC4138; Appendix B) */ - if (!after(TCP_SKB_CB(skb)->end_seq, tp->frto_highmark)) + if (!after(end_seq, tp->frto_highmark)) state->flag |= FLAG_ONLY_ORIG_SACKED; } @@ -1371,8 +1369,7 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, /* Lost marker hint past SACKed? Tweak RFC3517 cnt */ if (!tcp_is_fack(tp) && (tp->lost_skb_hint != NULL) && - before(TCP_SKB_CB(skb)->seq, - TCP_SKB_CB(tp->lost_skb_hint)->seq)) + before(start_seq, TCP_SKB_CB(tp->lost_skb_hint)->seq)) tp->lost_cnt_hint += pcount; if (fack_count > tp->fackets_out) @@ -1391,6 +1388,9 @@ static u8 tcp_sacktag_one(const struct sk_buff *skb, struct sock *sk, return sacked; } +/* Shift newly-SACKed bytes from this skb to the immediately previous + * already-SACKed sk_buff. Mark the newly-SACKed bytes as such. + */ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, struct tcp_sacktag_state *state, unsigned int pcount, int shifted, int mss, @@ -1398,9 +1398,20 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *prev = tcp_write_queue_prev(sk, skb); + u32 start_seq = TCP_SKB_CB(skb)->seq; /* start of newly-SACKed */ + u32 end_seq = start_seq + shifted; /* end of newly-SACKed */ BUG_ON(!pcount); + /* Adjust counters and hints for the newly sacked sequence + * range but discard the return value since prev is already + * marked. We must tag the range first because the seq + * advancement below implicitly advances + * tcp_highest_sack_seq() when skb is highest_sack. + */ + tcp_sacktag_one(sk, state, TCP_SKB_CB(skb)->sacked, + start_seq, end_seq, dup_sack, pcount); + if (skb == tp->lost_skb_hint) tp->lost_cnt_hint += pcount; @@ -1427,9 +1438,6 @@ static int tcp_shifted_skb(struct sock *sk, struct sk_buff *skb, skb_shinfo(skb)->gso_type = 0; } - /* We discard results */ - tcp_sacktag_one(skb, sk, state, dup_sack, pcount); - /* Difference in this won't matter, both ACKed by the same cumul. ACK */ TCP_SKB_CB(prev)->sacked |= (TCP_SKB_CB(skb)->sacked & TCPCB_EVER_RETRANS); @@ -1577,6 +1585,10 @@ static struct sk_buff *tcp_shift_skb_data(struct sock *sk, struct sk_buff *skb, } } + /* tcp_sacktag_one() won't SACK-tag ranges below snd_una */ + if (!after(TCP_SKB_CB(skb)->seq + len, tp->snd_una)) + goto fallback; + if (!skb_shift(prev, skb, len)) goto fallback; if (!tcp_shifted_skb(sk, skb, state, pcount, len, mss, dup_sack)) @@ -1667,10 +1679,14 @@ static struct sk_buff *tcp_sacktag_walk(struct sk_buff *skb, struct sock *sk, break; if (in_sack) { - TCP_SKB_CB(skb)->sacked = tcp_sacktag_one(skb, sk, - state, - dup_sack, - tcp_skb_pcount(skb)); + TCP_SKB_CB(skb)->sacked = + tcp_sacktag_one(sk, + state, + TCP_SKB_CB(skb)->sacked, + TCP_SKB_CB(skb)->seq, + TCP_SKB_CB(skb)->end_seq, + dup_sack, + tcp_skb_pcount(skb)); if (!before(TCP_SKB_CB(skb)->seq, tcp_highest_sack_seq(tp))) @@ -1844,10 +1860,6 @@ tcp_sacktag_write_queue(struct sock *sk, const struct sk_buff *ack_skb, if (found_dup_sack && ((i + 1) == first_sack_index)) next_dup = &sp[i + 1]; - /* Event "B" in the comment above. */ - if (after(end_seq, tp->high_seq)) - state.flag |= FLAG_DATA_LOST; - /* Skip too early cached blocks */ while (tcp_sack_cache_ok(tp, cache) && !before(start_seq, cache->end_seq)) @@ -2515,8 +2527,11 @@ static void tcp_timeout_skbs(struct sock *sk) tcp_verify_left_out(tp); } -/* Mark head of queue up as lost. With RFC3517 SACK, the packets is - * is against sacked "cnt", otherwise it's against facked "cnt" +/* Detect loss in event "A" above by marking head of queue up as lost. + * For FACK or non-SACK(Reno) senders, the first "packets" number of segments + * are considered lost. For RFC3517 SACK, a segment is considered lost if it + * has at least tp->reordering SACKed seqments above it; "packets" refers to + * the maximum SACKed segments to pass before reaching this limit. */ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { @@ -2525,6 +2540,8 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) int cnt, oldcnt; int err; unsigned int mss; + /* Use SACK to deduce losses of new sequences sent during recovery */ + const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq; WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { @@ -2546,7 +2563,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) tp->lost_skb_hint = skb; tp->lost_cnt_hint = cnt; - if (after(TCP_SKB_CB(skb)->end_seq, tp->high_seq)) + if (after(TCP_SKB_CB(skb)->end_seq, loss_high)) break; oldcnt = cnt; @@ -2556,6 +2573,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) if (cnt > packets) { if ((tcp_is_sack(tp) && !tcp_is_fack(tp)) || + (TCP_SKB_CB(skb)->sacked & TCPCB_SACKED_ACKED) || (oldcnt >= packets)) break; @@ -3033,19 +3051,10 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, if (tcp_check_sack_reneging(sk, flag)) return; - /* C. Process data loss notification, provided it is valid. */ - if (tcp_is_fack(tp) && (flag & FLAG_DATA_LOST) && - before(tp->snd_una, tp->high_seq) && - icsk->icsk_ca_state != TCP_CA_Open && - tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); - } - - /* D. Check consistency of the current state. */ + /* C. Check consistency of the current state. */ tcp_verify_left_out(tp); - /* E. Check state exit conditions. State can be terminated + /* D. Check state exit conditions. State can be terminated * when high_seq is ACKed. */ if (icsk->icsk_ca_state == TCP_CA_Open) { WARN_ON(tp->retrans_out != 0); @@ -3077,7 +3086,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, } } - /* F. Process state. */ + /* E. Process state. */ switch (icsk->icsk_ca_state) { case TCP_CA_Recovery: if (!(flag & FLAG_SND_UNA_ADVANCED)) { diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1eb4ad5..fd54c5f 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -631,7 +631,7 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len = sizeof(rep.th); #ifdef CONFIG_TCP_MD5SIG - key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->daddr) : NULL; + key = sk ? tcp_v4_md5_do_lookup(sk, ip_hdr(skb)->saddr) : NULL; if (key) { rep.opt[0] = htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | @@ -651,6 +651,11 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0; + /* When socket is gone, all binding information is lost. + * routing might fail in this case. using iif for oif to + * make sure we can deliver it + */ + arg.bound_dev_if = sk ? sk->sk_bound_dev_if : inet_iif(skb); net = dev_net(skb_dst(skb)->dev); arg.tos = ip_hdr(skb)->tos; @@ -1461,9 +1466,13 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; newinet->inet_id = newtp->write_seq ^ jiffies; - if (!dst && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) - goto put_and_exit; - + if (!dst) { + dst = inet_csk_route_child_sock(sk, newsk, req); + if (!dst) + goto put_and_exit; + } else { + /* syncookie case : see end of cookie_v4_check() */ + } sk_setup_caps(newsk, dst); tcp_mtup_init(newsk); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 8c8de27..4ff3b6d 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1141,11 +1141,9 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) sk_mem_uncharge(sk, len); sock_set_flag(sk, SOCK_QUEUE_SHRUNK); - /* Any change of skb->len requires recalculation of tso - * factor and mss. - */ + /* Any change of skb->len requires recalculation of tso factor. */ if (tcp_skb_pcount(skb) > 1) - tcp_set_skb_tso_segs(sk, skb, tcp_current_mss(sk)); + tcp_set_skb_tso_segs(sk, skb, tcp_skb_mss(skb)); return 0; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index a516d1e..cd2e072 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -77,10 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, int do_reset) if (sk->sk_err_soft) shift++; - if (tcp_too_many_orphans(sk, shift)) { - if (net_ratelimit()) - printk(KERN_INFO "Out of socket memory\n"); - + if (tcp_check_oom(sk, shift)) { /* Catch exceptional cases, when connection requires reset. * 1. Last segment was sent recently. */ if ((s32)(tcp_time_stamp - tp->lsndtime) <= TCP_TIMEWAIT_LEN || diff --git a/net/ipv4/xfrm4_mode_beet.c b/net/ipv4/xfrm4_mode_beet.c index 6341818..e3db3f9 100644 --- a/net/ipv4/xfrm4_mode_beet.c +++ b/net/ipv4/xfrm4_mode_beet.c @@ -110,10 +110,7 @@ static int xfrm4_beet_input(struct xfrm_state *x, struct sk_buff *skb) skb_push(skb, sizeof(*iph)); skb_reset_network_header(skb); - - memmove(skb->data - skb->mac_len, skb_mac_header(skb), - skb->mac_len); - skb_set_mac_header(skb, -skb->mac_len); + skb_mac_header_rebuild(skb); xfrm4_beet_make_header(skb); diff --git a/net/ipv4/xfrm4_mode_tunnel.c b/net/ipv4/xfrm4_mode_tunnel.c index 534972e..ed4bf11 100644 --- a/net/ipv4/xfrm4_mode_tunnel.c +++ b/net/ipv4/xfrm4_mode_tunnel.c @@ -66,7 +66,6 @@ static int xfrm4_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { - const unsigned char *old_mac; int err = -EINVAL; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPIP) @@ -84,10 +83,9 @@ static int xfrm4_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a225d5e..6b8ebc5 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -434,6 +434,10 @@ static struct inet6_dev * ipv6_add_dev(struct net_device *dev) /* Join all-node multicast group */ ipv6_dev_mc_inc(dev, &in6addr_linklocal_allnodes); + /* Join all-router multicast group if forwarding is set */ + if (ndev->cnf.forwarding && dev && (dev->flags & IFF_MULTICAST)) + ipv6_dev_mc_inc(dev, &in6addr_linklocal_allrouters); + return ndev; } @@ -502,29 +506,31 @@ static void addrconf_forward_change(struct net *net, __s32 newf) rcu_read_unlock(); } -static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int old) +static int addrconf_fixup_forwarding(struct ctl_table *table, int *p, int newf) { struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); net = (struct net *)table->extra2; - if (p == &net->ipv6.devconf_dflt->forwarding) - return 0; + old = *p; + *p = newf; - if (!rtnl_trylock()) { - /* Restore the original values before restarting */ - *p = old; - return restart_syscall(); + if (p == &net->ipv6.devconf_dflt->forwarding) { + rtnl_unlock(); + return 0; } if (p == &net->ipv6.devconf_all->forwarding) { - __s32 newf = net->ipv6.devconf_all->forwarding; net->ipv6.devconf_dflt->forwarding = newf; addrconf_forward_change(net, newf); - } else if ((!*p) ^ (!old)) + } else if ((!newf) ^ (!old)) dev_forward_change((struct inet6_dev *)table->extra1); rtnl_unlock(); - if (*p) + if (newf) rt6_purge_dflt_routers(net); return 1; } @@ -4260,9 +4266,17 @@ int addrconf_sysctl_forward(ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; + ctl_table lctl; int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + /* + * ctl->data points to idev->cnf.forwarding, we should + * not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); if (write) ret = addrconf_fixup_forwarding(ctl, valp, val); @@ -4300,26 +4314,27 @@ static void addrconf_disable_change(struct net *net, __s32 newf) rcu_read_unlock(); } -static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int old) +static int addrconf_disable_ipv6(struct ctl_table *table, int *p, int newf) { struct net *net; + int old; + + if (!rtnl_trylock()) + return restart_syscall(); net = (struct net *)table->extra2; + old = *p; + *p = newf; - if (p == &net->ipv6.devconf_dflt->disable_ipv6) + if (p == &net->ipv6.devconf_dflt->disable_ipv6) { + rtnl_unlock(); return 0; - - if (!rtnl_trylock()) { - /* Restore the original values before restarting */ - *p = old; - return restart_syscall(); } if (p == &net->ipv6.devconf_all->disable_ipv6) { - __s32 newf = net->ipv6.devconf_all->disable_ipv6; net->ipv6.devconf_dflt->disable_ipv6 = newf; addrconf_disable_change(net, newf); - } else if ((!*p) ^ (!old)) + } else if ((!newf) ^ (!old)) dev_disable_change((struct inet6_dev *)table->extra1); rtnl_unlock(); @@ -4333,9 +4348,17 @@ int addrconf_sysctl_disable(ctl_table *ctl, int write, int *valp = ctl->data; int val = *valp; loff_t pos = *ppos; + ctl_table lctl; int ret; - ret = proc_dointvec(ctl, write, buffer, lenp, ppos); + /* + * ctl->data points to idev->cnf.disable_ipv6, we should + * not modify it until we get the rtnl lock. + */ + lctl = *ctl; + lctl.data = &val; + + ret = proc_dointvec(&lctl, write, buffer, lenp, ppos); if (write) ret = addrconf_disable_ipv6(ctl, valp, val); diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index c7e95c8..5aa3981 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1926,8 +1926,10 @@ static int ip6mr_forward2(struct net *net, struct mr6_table *mrt, }; dst = ip6_route_output(net, NULL, &fl6); - if (!dst) + if (dst->error) { + dst_release(dst); goto out_free; + } skb_dst_drop(skb); skb_dst_set(skb, dst); diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index b853f06..16c33e3 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -257,7 +257,6 @@ static struct inet6_dev *ip6_mc_find_dev_rcu(struct net *net, if (rt) { dev = rt->dst.dev; - dev_hold(dev); dst_release(&rt->dst); } } else diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index d8f02ef..c964958 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1545,9 +1545,10 @@ void ndisc_send_redirect(struct sk_buff *skb, struct neighbour *neigh, &saddr_buf, &ipv6_hdr(skb)->saddr, dev->ifindex); dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL) + if (dst->error) { + dst_release(dst); return; - + } dst = xfrm_lookup(net, dst, flowi6_to_flowi(&fl6), NULL, 0); if (IS_ERR(dst)) return; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 8c2e3ab..22b7664 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1077,7 +1077,7 @@ struct dst_entry *icmp6_dst_alloc(struct net_device *dev, struct net *net = dev_net(dev); if (unlikely(!idev)) - return NULL; + return ERR_PTR(-ENODEV); rt = ip6_dst_alloc(&net->ipv6.ip6_dst_ops, dev, 0); if (unlikely(!rt)) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 906c7ca..3edd05a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1083,7 +1083,7 @@ static void tcp_v6_send_reset(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_TCP_MD5SIG if (sk) - key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->daddr); + key = tcp_v6_md5_do_lookup(sk, &ipv6_hdr(skb)->saddr); #endif if (th->ack) diff --git a/net/ipv6/xfrm6_mode_beet.c b/net/ipv6/xfrm6_mode_beet.c index a81ce94..9949a35 100644 --- a/net/ipv6/xfrm6_mode_beet.c +++ b/net/ipv6/xfrm6_mode_beet.c @@ -80,7 +80,6 @@ static int xfrm6_beet_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) { struct ipv6hdr *ip6h; - const unsigned char *old_mac; int size = sizeof(struct ipv6hdr); int err; @@ -90,10 +89,7 @@ static int xfrm6_beet_input(struct xfrm_state *x, struct sk_buff *skb) __skb_push(skb, size); skb_reset_network_header(skb); - - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); + skb_mac_header_rebuild(skb); xfrm6_beet_make_header(skb); diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c index 261e6e6..9f2095b 100644 --- a/net/ipv6/xfrm6_mode_tunnel.c +++ b/net/ipv6/xfrm6_mode_tunnel.c @@ -63,7 +63,6 @@ static int xfrm6_mode_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) { int err = -EINVAL; - const unsigned char *old_mac; if (XFRM_MODE_SKB_CB(skb)->protocol != IPPROTO_IPV6) goto out; @@ -80,10 +79,9 @@ static int xfrm6_mode_tunnel_input(struct xfrm_state *x, struct sk_buff *skb) if (!(x->props.flags & XFRM_STATE_NOECN)) ipip6_ecn_decapsulate(skb); - old_mac = skb_mac_header(skb); - skb_set_mac_header(skb, -skb->mac_len); - memmove(skb_mac_header(skb), old_mac, skb->mac_len); skb_reset_network_header(skb); + skb_mac_header_rebuild(skb); + err = 0; out: diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index d21e7eb..55670ec 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -393,11 +393,6 @@ static int l2tp_ip_backlog_recv(struct sock *sk, struct sk_buff *skb) { int rc; - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) - goto drop; - - nf_reset(skb); - /* Charge it to the socket, dropping if the queue is full. */ rc = sock_queue_rcv_skb(sk, skb); if (rc < 0) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index a18e6c3..b9bef2c 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -713,6 +713,7 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb = NULL; struct sock *sk = sock->sk; struct llc_sock *llc = llc_sk(sk); + unsigned long cpu_flags; size_t copied = 0; u32 peek_seq = 0; u32 *seq; @@ -838,7 +839,9 @@ static int llc_ui_recvmsg(struct kiocb *iocb, struct socket *sock, goto copy_uaddr; if (!(flags & MSG_PEEK)) { + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); sk_eat_skb(sk, skb, 0); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } @@ -859,7 +862,9 @@ copy_uaddr: llc_cmsg_rcv(msg, skb); if (!(flags & MSG_PEEK)) { + spin_lock_irqsave(&sk->sk_receive_queue.lock, cpu_flags); sk_eat_skb(sk, skb, 0); + spin_unlock_irqrestore(&sk->sk_receive_queue.lock, cpu_flags); *seq = 0; } diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index 38e6101..59edcd9 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -225,9 +225,9 @@ KEY_OPS(key); key, &key_##name##_ops); void ieee80211_debugfs_key_add(struct ieee80211_key *key) - { +{ static int keycount; - char buf[50]; + char buf[100]; struct sta_info *sta; if (!key->local->debugfs.keys) @@ -244,7 +244,8 @@ void ieee80211_debugfs_key_add(struct ieee80211_key *key) sta = key->sta; if (sta) { - sprintf(buf, "../../stations/%pM", sta->sta.addr); + sprintf(buf, "../../netdev:%s/stations/%pM", + sta->sdata->name, sta->sta.addr); key->debugfs.stalink = debugfs_create_symlink("station", key->debugfs.dir, buf); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 2406b3e..d86217d 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -63,14 +63,14 @@ static ssize_t sta_flags_read(struct file *file, char __user *userbuf, test_sta_flag(sta, WLAN_STA_##flg) ? #flg "\n" : "" int res = scnprintf(buf, sizeof(buf), - "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", + "%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s%s", TEST(AUTH), TEST(ASSOC), TEST(PS_STA), TEST(PS_DRIVER), TEST(AUTHORIZED), TEST(SHORT_PREAMBLE), TEST(WME), TEST(WDS), TEST(CLEAR_PS_FILT), TEST(MFP), TEST(BLOCK_BA), TEST(PSPOLL), TEST(UAPSD), TEST(SP), TEST(TDLS_PEER), - TEST(TDLS_PEER_AUTH)); + TEST(TDLS_PEER_AUTH), TEST(RATE_CONTROL)); #undef TEST return simple_read_from_buffer(userbuf, count, ppos, buf, res); } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index b3d76b7..a464396 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -106,6 +106,7 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, sdata->drop_unencrypted = capability & WLAN_CAPABILITY_PRIVACY ? 1 : 0; + local->oper_channel = chan; channel_type = ifibss->channel_type; if (channel_type > NL80211_CHAN_HT20 && !cfg80211_can_beacon_sec_chan(local->hw.wiphy, chan, channel_type)) diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e47768c..8e2137b 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1314,6 +1314,7 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local) continue; } /* count everything else */ + sdata->vif.bss_conf.idle = false; count++; } @@ -1331,6 +1332,9 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local) hw_roc = true; list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->vif.type == NL80211_IFTYPE_MONITOR || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; if (sdata->old_idle == sdata->vif.bss_conf.idle) continue; if (!ieee80211_sdata_running(sdata)) diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0a0d94a..b142bd4 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -910,6 +910,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", result); + ieee80211_led_init(local); + rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, @@ -931,8 +933,6 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) rtnl_unlock(); - ieee80211_led_init(local); - local->network_latency_notifier.notifier_call = ieee80211_max_network_latency; result = pm_qos_add_notifier(PM_QOS_NETWORK_LATENCY, diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c index 73abb75..54df1b2b 100644 --- a/net/mac80211/mesh_hwmp.c +++ b/net/mac80211/mesh_hwmp.c @@ -119,12 +119,12 @@ static int mesh_path_sel_frame_tx(enum mpath_frame_type action, u8 flags, int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.mesh_action) + sizeof(mgmt->u.action.u.mesh_action); - skb = dev_alloc_skb(local->hw.extra_tx_headroom + + skb = dev_alloc_skb(local->tx_headroom + hdr_len + 2 + 37); /* max HWMP IE */ if (!skb) return -1; - skb_reserve(skb, local->hw.extra_tx_headroom); + skb_reserve(skb, local->tx_headroom); mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len); memset(mgmt, 0, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | @@ -250,12 +250,12 @@ int mesh_path_error_tx(u8 ttl, u8 *target, __le32 target_sn, if (time_before(jiffies, ifmsh->next_perr)) return -EAGAIN; - skb = dev_alloc_skb(local->hw.extra_tx_headroom + + skb = dev_alloc_skb(local->tx_headroom + hdr_len + 2 + 15 /* PERR IE */); if (!skb) return -1; - skb_reserve(skb, local->tx_headroom + local->hw.extra_tx_headroom); + skb_reserve(skb, local->tx_headroom); mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len); memset(mgmt, 0, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index 41ef1b4..a172517 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -172,7 +172,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, int hdr_len = offsetof(struct ieee80211_mgmt, u.action.u.self_prot) + sizeof(mgmt->u.action.u.self_prot); - skb = dev_alloc_skb(local->hw.extra_tx_headroom + + skb = dev_alloc_skb(local->tx_headroom + hdr_len + 2 + /* capability info */ 2 + /* AID */ @@ -186,7 +186,7 @@ static int mesh_plink_frame_tx(struct ieee80211_sub_if_data *sdata, sdata->u.mesh.ie_len); if (!skb) return -1; - skb_reserve(skb, local->hw.extra_tx_headroom); + skb_reserve(skb, local->tx_headroom); mgmt = (struct ieee80211_mgmt *) skb_put(skb, hdr_len); memset(mgmt, 0, hdr_len); mgmt->frame_control = cpu_to_le16(IEEE80211_FTYPE_MGMT | diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ecb4c84..295be92 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2750,7 +2750,6 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; - struct ieee80211_work *wk; u8 bssid[ETH_ALEN]; bool assoc_bss = false; @@ -2763,30 +2762,47 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, assoc_bss = true; } else { bool not_auth_yet = false; + struct ieee80211_work *tmp, *wk = NULL; mutex_unlock(&ifmgd->mtx); mutex_lock(&local->mtx); - list_for_each_entry(wk, &local->work_list, list) { - if (wk->sdata != sdata) + list_for_each_entry(tmp, &local->work_list, list) { + if (tmp->sdata != sdata) continue; - if (wk->type != IEEE80211_WORK_DIRECT_PROBE && - wk->type != IEEE80211_WORK_AUTH && - wk->type != IEEE80211_WORK_ASSOC && - wk->type != IEEE80211_WORK_ASSOC_BEACON_WAIT) + if (tmp->type != IEEE80211_WORK_DIRECT_PROBE && + tmp->type != IEEE80211_WORK_AUTH && + tmp->type != IEEE80211_WORK_ASSOC && + tmp->type != IEEE80211_WORK_ASSOC_BEACON_WAIT) continue; - if (memcmp(req->bss->bssid, wk->filter_ta, ETH_ALEN)) + if (memcmp(req->bss->bssid, tmp->filter_ta, ETH_ALEN)) continue; - not_auth_yet = wk->type == IEEE80211_WORK_DIRECT_PROBE; - list_del_rcu(&wk->list); - free_work(wk); + not_auth_yet = tmp->type == IEEE80211_WORK_DIRECT_PROBE; + list_del_rcu(&tmp->list); + synchronize_rcu(); + wk = tmp; break; } mutex_unlock(&local->mtx); + if (wk && wk->type == IEEE80211_WORK_ASSOC) { + /* clean up dummy sta & TX sync */ + sta_info_destroy_addr(wk->sdata, wk->filter_ta); + if (wk->assoc.synced) + drv_finish_tx_sync(local, wk->sdata, + wk->filter_ta, + IEEE80211_TX_SYNC_ASSOC); + } else if (wk && wk->type == IEEE80211_WORK_AUTH) { + if (wk->probe_auth.synced) + drv_finish_tx_sync(local, wk->sdata, + wk->filter_ta, + IEEE80211_TX_SYNC_AUTH); + } + kfree(wk); + /* * If somebody requests authentication and we haven't * sent out an auth frame yet there's no need to send diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 5a5a776..f9b8e81 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -336,7 +336,7 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, int i; u32 mask; - if (sta) { + if (sta && test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) { ista = &sta->sta; priv_sta = sta->rate_ctrl_priv; } @@ -344,7 +344,7 @@ void rate_control_get_rate(struct ieee80211_sub_if_data *sdata, for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { info->control.rates[i].idx = -1; info->control.rates[i].flags = 0; - info->control.rates[i].count = 1; + info->control.rates[i].count = 0; } if (sdata->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL) diff --git a/net/mac80211/rate.h b/net/mac80211/rate.h index 168427b..80cfc00 100644 --- a/net/mac80211/rate.h +++ b/net/mac80211/rate.h @@ -41,7 +41,7 @@ static inline void rate_control_tx_status(struct ieee80211_local *local, struct ieee80211_sta *ista = &sta->sta; void *priv_sta = sta->rate_ctrl_priv; - if (!ref) + if (!ref || !test_sta_flag(sta, WLAN_STA_RATE_CONTROL)) return; ref->ops->tx_status(ref->priv, sband, ista, priv_sta, skb); @@ -62,6 +62,7 @@ static inline void rate_control_rate_init(struct sta_info *sta) sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; ref->ops->rate_init(ref->priv, sband, ista, priv_sta); + set_sta_flag(sta, WLAN_STA_RATE_CONTROL); } static inline void rate_control_rate_update(struct ieee80211_local *local, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 7514091..5a5e504 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -611,7 +611,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { + tid_agg_rx->stored_mpdu_num) { /* * No buffers ready to be released, but check whether any * frames in the reorder buffer have timed out. diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 6f77f12..bfed851 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -52,6 +52,7 @@ * @WLAN_STA_SP: Station is in a service period, so don't try to * reply to other uAPSD trigger frames or PS-Poll. * @WLAN_STA_4ADDR_EVENT: 4-addr event was already sent for this frame. + * @WLAN_STA_RATE_CONTROL: rate control was initialized for this station. */ enum ieee80211_sta_info_flags { WLAN_STA_AUTH, @@ -71,6 +72,7 @@ enum ieee80211_sta_info_flags { WLAN_STA_UAPSD, WLAN_STA_SP, WLAN_STA_4ADDR_EVENT, + WLAN_STA_RATE_CONTROL, }; enum ieee80211_sta_state { diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 611c335..2555816 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -232,6 +232,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, __be16 dport = 0; /* destination port to forward */ unsigned int flags; struct ip_vs_conn_param param; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; union nf_inet_addr snet; /* source network of the client, after masking */ @@ -267,7 +268,6 @@ ip_vs_sched_persist(struct ip_vs_service *svc, { int protocol = iph.protocol; const union nf_inet_addr *vaddr = &iph.daddr; - const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; __be16 vport = 0; if (dst_port == svc->port) { diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 76613f5..fa4b82c 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -404,19 +404,49 @@ static void __nf_conntrack_hash_insert(struct nf_conn *ct, &net->ct.hash[repl_hash]); } -void nf_conntrack_hash_insert(struct nf_conn *ct) +int +nf_conntrack_hash_check_insert(struct nf_conn *ct) { struct net *net = nf_ct_net(ct); unsigned int hash, repl_hash; + struct nf_conntrack_tuple_hash *h; + struct hlist_nulls_node *n; u16 zone; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + + spin_lock_bh(&nf_conntrack_lock); + + /* See if there's one in the list already, including reverse */ + hlist_nulls_for_each_entry(h, n, &net->ct.hash[hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + hlist_nulls_for_each_entry(h, n, &net->ct.hash[repl_hash], hnnode) + if (nf_ct_tuple_equal(&ct->tuplehash[IP_CT_DIR_REPLY].tuple, + &h->tuple) && + zone == nf_ct_zone(nf_ct_tuplehash_to_ctrack(h))) + goto out; + add_timer(&ct->timeout); + nf_conntrack_get(&ct->ct_general); __nf_conntrack_hash_insert(ct, hash, repl_hash); + NF_CT_STAT_INC(net, insert); + spin_unlock_bh(&nf_conntrack_lock); + + return 0; + +out: + NF_CT_STAT_INC(net, insert_failed); + spin_unlock_bh(&nf_conntrack_lock); + return -EEXIST; } -EXPORT_SYMBOL_GPL(nf_conntrack_hash_insert); +EXPORT_SYMBOL_GPL(nf_conntrack_hash_check_insert); /* Confirm a connection given skb; places it in hash table */ int @@ -605,8 +635,12 @@ static noinline int early_drop(struct net *net, unsigned int hash) if (del_timer(&ct->timeout)) { death_by_timeout((unsigned long)ct); - dropped = 1; - NF_CT_STAT_INC_ATOMIC(net, early_drop); + /* Check if we indeed killed this entry. Reliable event + delivery may have inserted it into the dying list. */ + if (test_bit(IPS_DYING_BIT, &ct->status)) { + dropped = 1; + NF_CT_STAT_INC_ATOMIC(net, early_drop); + } } nf_ct_put(ct); return dropped; diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 9307b03..b49da6c 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -943,20 +943,21 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb, } } - if (nf_conntrack_event_report(IPCT_DESTROY, ct, - NETLINK_CB(skb).pid, - nlmsg_report(nlh)) < 0) { + if (del_timer(&ct->timeout)) { + if (nf_conntrack_event_report(IPCT_DESTROY, ct, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)) < 0) { + nf_ct_delete_from_lists(ct); + /* we failed to report the event, try later */ + nf_ct_insert_dying_list(ct); + nf_ct_put(ct); + return 0; + } + /* death_by_timeout would report the event again */ + set_bit(IPS_DYING_BIT, &ct->status); nf_ct_delete_from_lists(ct); - /* we failed to report the event, try later */ - nf_ct_insert_dying_list(ct); nf_ct_put(ct); - return 0; } - - /* death_by_timeout would report the event again */ - set_bit(IPS_DYING_BIT, &ct->status); - - nf_ct_kill(ct); nf_ct_put(ct); return 0; @@ -1041,16 +1042,13 @@ ctnetlink_parse_nat_setup(struct nf_conn *ct, if (!parse_nat_setup) { #ifdef CONFIG_MODULES rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); nfnl_unlock(); if (request_module("nf-nat-ipv4") < 0) { nfnl_lock(); - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); return -EOPNOTSUPP; } nfnl_lock(); - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); if (nfnetlink_parse_nat_setup_hook) return -EAGAIN; @@ -1367,15 +1365,12 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, nf_ct_protonum(ct)); if (helper == NULL) { rcu_read_unlock(); - spin_unlock_bh(&nf_conntrack_lock); #ifdef CONFIG_MODULES if (request_module("nfct-helper-%s", helpname) < 0) { - spin_lock_bh(&nf_conntrack_lock); err = -EOPNOTSUPP; goto err1; } - spin_lock_bh(&nf_conntrack_lock); rcu_read_lock(); helper = __nf_conntrack_helper_find(helpname, nf_ct_l3num(ct), @@ -1468,8 +1463,10 @@ ctnetlink_create_conntrack(struct net *net, u16 zone, if (tstamp) tstamp->start = ktime_to_ns(ktime_get_real()); - add_timer(&ct->timeout); - nf_conntrack_hash_insert(ct); + err = nf_conntrack_hash_check_insert(ct); + if (err < 0) + goto err2; + rcu_read_unlock(); return ct; @@ -1490,6 +1487,7 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, struct nf_conntrack_tuple otuple, rtuple; struct nf_conntrack_tuple_hash *h = NULL; struct nfgenmsg *nfmsg = nlmsg_data(nlh); + struct nf_conn *ct; u_int8_t u3 = nfmsg->nfgen_family; u16 zone; int err; @@ -1510,27 +1508,22 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, return err; } - spin_lock_bh(&nf_conntrack_lock); if (cda[CTA_TUPLE_ORIG]) - h = __nf_conntrack_find(net, zone, &otuple); + h = nf_conntrack_find_get(net, zone, &otuple); else if (cda[CTA_TUPLE_REPLY]) - h = __nf_conntrack_find(net, zone, &rtuple); + h = nf_conntrack_find_get(net, zone, &rtuple); if (h == NULL) { err = -ENOENT; if (nlh->nlmsg_flags & NLM_F_CREATE) { - struct nf_conn *ct; enum ip_conntrack_events events; ct = ctnetlink_create_conntrack(net, zone, cda, &otuple, &rtuple, u3); - if (IS_ERR(ct)) { - err = PTR_ERR(ct); - goto out_unlock; - } + if (IS_ERR(ct)) + return PTR_ERR(ct); + err = 0; - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); if (test_bit(IPS_EXPECTED_BIT, &ct->status)) events = IPCT_RELATED; else @@ -1545,23 +1538,19 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); + } return err; } /* implicit 'else' */ - /* We manipulate the conntrack inside the global conntrack table lock, - * so there's no need to increase the refcount */ err = -EEXIST; + ct = nf_ct_tuplehash_to_ctrack(h); if (!(nlh->nlmsg_flags & NLM_F_EXCL)) { - struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h); - + spin_lock_bh(&nf_conntrack_lock); err = ctnetlink_change_conntrack(ct, cda); + spin_unlock_bh(&nf_conntrack_lock); if (err == 0) { - nf_conntrack_get(&ct->ct_general); - spin_unlock_bh(&nf_conntrack_lock); nf_conntrack_eventmask_report((1 << IPCT_REPLY) | (1 << IPCT_ASSURED) | (1 << IPCT_HELPER) | @@ -1570,15 +1559,10 @@ ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb, (1 << IPCT_MARK), ct, NETLINK_CB(skb).pid, nlmsg_report(nlh)); - nf_ct_put(ct); - } else - spin_unlock_bh(&nf_conntrack_lock); - - return err; + } } -out_unlock: - spin_unlock_bh(&nf_conntrack_lock); + nf_ct_put(ct); return err; } diff --git a/net/netfilter/nf_queue.c b/net/netfilter/nf_queue.c index b3a7db6..ce60cf0 100644 --- a/net/netfilter/nf_queue.c +++ b/net/netfilter/nf_queue.c @@ -203,6 +203,27 @@ err: return status; } +#ifdef CONFIG_BRIDGE_NETFILTER +/* When called from bridge netfilter, skb->data must point to MAC header + * before calling skb_gso_segment(). Else, original MAC header is lost + * and segmented skbs will be sent to wrong destination. + */ +static void nf_bridge_adjust_skb_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_push(skb, skb->network_header - skb->mac_header); +} + +static void nf_bridge_adjust_segmented_data(struct sk_buff *skb) +{ + if (skb->nf_bridge) + __skb_pull(skb, skb->network_header - skb->mac_header); +} +#else +#define nf_bridge_adjust_skb_data(s) do {} while (0) +#define nf_bridge_adjust_segmented_data(s) do {} while (0) +#endif + int nf_queue(struct sk_buff *skb, struct list_head *elem, u_int8_t pf, unsigned int hook, @@ -212,7 +233,7 @@ int nf_queue(struct sk_buff *skb, unsigned int queuenum) { struct sk_buff *segs; - int err; + int err = -EINVAL; unsigned int queued; if (!skb_is_gso(skb)) @@ -228,23 +249,25 @@ int nf_queue(struct sk_buff *skb, break; } + nf_bridge_adjust_skb_data(skb); segs = skb_gso_segment(skb, 0); /* Does not use PTR_ERR to limit the number of error codes that can be * returned by nf_queue. For instance, callers rely on -ECANCELED to mean * 'ignore this hook'. */ if (IS_ERR(segs)) - return -EINVAL; - + goto out_err; queued = 0; err = 0; do { struct sk_buff *nskb = segs->next; segs->next = NULL; - if (err == 0) + if (err == 0) { + nf_bridge_adjust_segmented_data(segs); err = __nf_queue(segs, elem, pf, hook, indev, outdev, okfn, queuenum); + } if (err == 0) queued++; else @@ -252,11 +275,12 @@ int nf_queue(struct sk_buff *skb, segs = nskb; } while (segs); - /* also free orig skb if only some segments were queued */ - if (unlikely(err && queued)) - err = 0; - if (err == 0) + if (queued) { kfree_skb(skb); + return 0; + } + out_err: + nf_bridge_adjust_segmented_data(skb); return err; } diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 3aae66f..4d50579 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -152,9 +152,10 @@ tee_tg_route6(struct sk_buff *skb, const struct xt_tee_tginfo *info) fl6.flowlabel = ((iph->flow_lbl[0] & 0xF) << 16) | (iph->flow_lbl[1] << 8) | iph->flow_lbl[2]; dst = ip6_route_output(net, NULL, &fl6); - if (dst == NULL) + if (dst->error) { + dst_release(dst); return false; - + } skb_dst_drop(skb); skb_dst_set(skb, dst); skb->dev = dst->dev; diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 2725d1b..48badff 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1,5 +1,5 @@ /* - * Copyright (c) 2007-2011 Nicira Networks. + * Copyright (c) 2007-2012 Nicira Networks. * * This program is free software; you can redistribute it and/or * modify it under the terms of version 2 of the GNU General Public @@ -145,9 +145,16 @@ static void set_ip_addr(struct sk_buff *skb, struct iphdr *nh, inet_proto_csum_replace4(&tcp_hdr(skb)->check, skb, *addr, new_addr, 1); } else if (nh->protocol == IPPROTO_UDP) { - if (likely(transport_len >= sizeof(struct udphdr))) - inet_proto_csum_replace4(&udp_hdr(skb)->check, skb, - *addr, new_addr, 1); + if (likely(transport_len >= sizeof(struct udphdr))) { + struct udphdr *uh = udp_hdr(skb); + + if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) { + inet_proto_csum_replace4(&uh->check, skb, + *addr, new_addr, 1); + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } + } } csum_replace4(&nh->check, *addr, new_addr); @@ -197,8 +204,22 @@ static void set_tp_port(struct sk_buff *skb, __be16 *port, skb->rxhash = 0; } -static int set_udp_port(struct sk_buff *skb, - const struct ovs_key_udp *udp_port_key) +static void set_udp_port(struct sk_buff *skb, __be16 *port, __be16 new_port) +{ + struct udphdr *uh = udp_hdr(skb); + + if (uh->check && skb->ip_summed != CHECKSUM_PARTIAL) { + set_tp_port(skb, port, new_port, &uh->check); + + if (!uh->check) + uh->check = CSUM_MANGLED_0; + } else { + *port = new_port; + skb->rxhash = 0; + } +} + +static int set_udp(struct sk_buff *skb, const struct ovs_key_udp *udp_port_key) { struct udphdr *uh; int err; @@ -210,16 +231,15 @@ static int set_udp_port(struct sk_buff *skb, uh = udp_hdr(skb); if (udp_port_key->udp_src != uh->source) - set_tp_port(skb, &uh->source, udp_port_key->udp_src, &uh->check); + set_udp_port(skb, &uh->source, udp_port_key->udp_src); if (udp_port_key->udp_dst != uh->dest) - set_tp_port(skb, &uh->dest, udp_port_key->udp_dst, &uh->check); + set_udp_port(skb, &uh->dest, udp_port_key->udp_dst); return 0; } -static int set_tcp_port(struct sk_buff *skb, - const struct ovs_key_tcp *tcp_port_key) +static int set_tcp(struct sk_buff *skb, const struct ovs_key_tcp *tcp_port_key) { struct tcphdr *th; int err; @@ -328,11 +348,11 @@ static int execute_set_action(struct sk_buff *skb, break; case OVS_KEY_ATTR_TCP: - err = set_tcp_port(skb, nla_data(nested_attr)); + err = set_tcp(skb, nla_data(nested_attr)); break; case OVS_KEY_ATTR_UDP: - err = set_udp_port(skb, nla_data(nested_attr)); + err = set_udp(skb, nla_data(nested_attr)); break; } diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index ce64c18..2c03050 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -1521,6 +1521,9 @@ static struct vport *lookup_vport(struct ovs_header *ovs_header, vport = ovs_vport_locate(nla_data(a[OVS_VPORT_ATTR_NAME])); if (!vport) return ERR_PTR(-ENODEV); + if (ovs_header->dp_ifindex && + ovs_header->dp_ifindex != get_dpifindex(vport->dp)) + return ERR_PTR(-ENODEV); return vport; } else if (a[OVS_VPORT_ATTR_PORT_NO]) { u32 port_no = nla_get_u32(a[OVS_VPORT_ATTR_PORT_NO]); diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index bb6ad81..424ff62 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -68,7 +68,6 @@ static int rds_release(struct socket *sock) { struct sock *sk = sock->sk; struct rds_sock *rs; - unsigned long flags; if (!sk) goto out; @@ -94,10 +93,10 @@ static int rds_release(struct socket *sock) rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_del_init(&rs->rs_item); rds_sock_count--; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); rds_trans_put(rs->rs_transport); @@ -409,7 +408,6 @@ static const struct proto_ops rds_proto_ops = { static int __rds_create(struct socket *sock, struct sock *sk, int protocol) { - unsigned long flags; struct rds_sock *rs; sock_init_data(sock, sk); @@ -426,10 +424,10 @@ static int __rds_create(struct socket *sock, struct sock *sk, int protocol) spin_lock_init(&rs->rs_rdma_lock); rs->rs_rdma_keys = RB_ROOT; - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_add_tail(&rs->rs_item, &rds_sock_list); rds_sock_count++; - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); return 0; } @@ -471,12 +469,11 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, { struct rds_sock *rs; struct rds_incoming *inc; - unsigned long flags; unsigned int total = 0; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); list_for_each_entry(rs, &rds_sock_list, rs_item) { read_lock(&rs->rs_recv_lock); @@ -492,7 +489,7 @@ static void rds_sock_inc_info(struct socket *sock, unsigned int len, read_unlock(&rs->rs_recv_lock); } - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -504,11 +501,10 @@ static void rds_sock_info(struct socket *sock, unsigned int len, { struct rds_info_socket sinfo; struct rds_sock *rs; - unsigned long flags; len /= sizeof(struct rds_info_socket); - spin_lock_irqsave(&rds_sock_lock, flags); + spin_lock_bh(&rds_sock_lock); if (len < rds_sock_count) goto out; @@ -529,7 +525,7 @@ out: lens->nr = rds_sock_count; lens->each = sizeof(struct rds_info_socket); - spin_unlock_irqrestore(&rds_sock_lock, flags); + spin_unlock_bh(&rds_sock_lock); } static void rds_exit(void) diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c index 4cba13e..ae3a035 100644 --- a/net/rxrpc/ar-key.c +++ b/net/rxrpc/ar-key.c @@ -232,7 +232,7 @@ static int rxrpc_krb5_decode_principal(struct krb5_principal *princ, if (toklen <= (n_parts + 1) * 4) return -EINVAL; - princ->name_parts = kcalloc(sizeof(char *), n_parts, GFP_KERNEL); + princ->name_parts = kcalloc(n_parts, sizeof(char *), GFP_KERNEL); if (!princ->name_parts) return -ENOMEM; @@ -355,7 +355,7 @@ static int rxrpc_krb5_decode_tagged_array(struct krb5_tagged_data **_td, _debug("n_elem %d", n_elem); - td = kcalloc(sizeof(struct krb5_tagged_data), n_elem, + td = kcalloc(n_elem, sizeof(struct krb5_tagged_data), GFP_KERNEL); if (!td) return -ENOMEM; diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c index e465064..7e267d7 100644 --- a/net/sched/sch_choke.c +++ b/net/sched/sch_choke.c @@ -148,8 +148,7 @@ struct choke_skb_cb { static inline struct choke_skb_cb *choke_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct choke_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct choke_skb_cb)); return (struct choke_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index e7e1d0b..5da548f 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -130,8 +130,7 @@ struct netem_skb_cb { static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct netem_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb)); return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data; } @@ -419,7 +418,7 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) cb = netem_skb_cb(skb); if (q->gap == 0 || /* not doing reordering */ - q->counter < q->gap || /* inside last reordering gap */ + q->counter < q->gap - 1 || /* inside last reordering gap */ q->reorder < get_crandom(&q->reorder_cor)) { psched_time_t now; psched_tdiff_t delay; @@ -502,9 +501,8 @@ tfifo_dequeue: /* if more time remaining? */ if (cb->time_to_send <= psched_get_time()) { - skb = qdisc_dequeue_tail(sch); - if (unlikely(!skb)) - goto qdisc_dequeue; + __skb_unlink(skb, &sch->q); + sch->qstats.backlog -= qdisc_pkt_len(skb); #ifdef CONFIG_NET_CLS_ACT /* @@ -540,7 +538,6 @@ deliver: qdisc_watchdog_schedule(&q->watchdog, cb->time_to_send); } -qdisc_dequeue: if (q->qdisc) { skb = q->qdisc->ops->dequeue(q->qdisc); if (skb) diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 96e42ca..d7eea99 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -94,8 +94,7 @@ struct sfb_skb_cb { static inline struct sfb_skb_cb *sfb_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfb_skb_cb)); + qdisc_cb_private_validate(skb, sizeof(struct sfb_skb_cb)); return (struct sfb_skb_cb *)qdisc_skb_cb(skb)->data; } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 67494ae..02a21ab 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -166,9 +166,8 @@ struct sfq_skb_cb { static inline struct sfq_skb_cb *sfq_skb_cb(const struct sk_buff *skb) { - BUILD_BUG_ON(sizeof(skb->cb) < - sizeof(struct qdisc_skb_cb) + sizeof(struct sfq_skb_cb)); - return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; + qdisc_cb_private_validate(skb, sizeof(struct sfq_skb_cb)); + return (struct sfq_skb_cb *)qdisc_skb_cb(skb)->data; } static unsigned int sfq_hash(const struct sfq_sched_data *q, @@ -470,11 +469,15 @@ enqueue: if (slot->qlen == 1) { /* The flow is new */ if (q->tail == NULL) { /* It is the first flow */ slot->next = x; - q->tail = slot; } else { slot->next = q->tail->next; q->tail->next = x; } + /* We put this flow at the end of our flow list. + * This might sound unfair for a new flow to wait after old ones, + * but we could endup servicing new flows only, and freeze old ones. + */ + q->tail = slot; /* We could use a bigger initial quantum for new flows */ slot->allot = q->scaled_quantum; } diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index 1426ec3..75762f3 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -92,6 +92,7 @@ generic_create_cred(struct rpc_auth *auth, struct auth_cred *acred, int flags) if (gcred->acred.group_info != NULL) get_group_info(gcred->acred.group_info); gcred->acred.machine_cred = acred->machine_cred; + gcred->acred.principal = acred->principal; dprintk("RPC: allocated %s cred %p for uid %d gid %d\n", gcred->acred.machine_cred ? "machine" : "generic", @@ -123,6 +124,17 @@ generic_destroy_cred(struct rpc_cred *cred) call_rcu(&cred->cr_rcu, generic_free_cred_callback); } +static int +machine_cred_match(struct auth_cred *acred, struct generic_cred *gcred, int flags) +{ + if (!gcred->acred.machine_cred || + gcred->acred.principal != acred->principal || + gcred->acred.uid != acred->uid || + gcred->acred.gid != acred->gid) + return 0; + return 1; +} + /* * Match credentials against current process creds. */ @@ -132,9 +144,12 @@ generic_match(struct auth_cred *acred, struct rpc_cred *cred, int flags) struct generic_cred *gcred = container_of(cred, struct generic_cred, gc_base); int i; + if (acred->machine_cred) + return machine_cred_match(acred, gcred, flags); + if (gcred->acred.uid != acred->uid || gcred->acred.gid != acred->gid || - gcred->acred.machine_cred != acred->machine_cred) + gcred->acred.machine_cred != 0) goto out_nomatch; /* Optimisation in the case where pointers are identical... */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index aad8fb6..85d3bb7 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1918,7 +1918,7 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; unix_state_lock(sk); - skb = skb_dequeue(&sk->sk_receive_queue); + skb = skb_peek(&sk->sk_receive_queue); if (skb == NULL) { unix_sk(sk)->recursion_level = 0; if (copied >= target) @@ -1958,11 +1958,8 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (check_creds) { /* Never glue messages from different writers */ if ((UNIXCB(skb).pid != siocb->scm->pid) || - (UNIXCB(skb).cred != siocb->scm->cred)) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + (UNIXCB(skb).cred != siocb->scm->cred)) break; - } } else { /* Copy credentials */ scm_set_cred(siocb->scm, UNIXCB(skb).pid, UNIXCB(skb).cred); @@ -1977,8 +1974,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, chunk = min_t(unsigned int, skb->len, size); if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); if (copied == 0) copied = -EFAULT; break; @@ -1993,13 +1988,10 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) unix_detach_fds(siocb->scm, skb); - /* put the skb back if we didn't use it up.. */ - if (skb->len) { - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); + if (skb->len) break; - } + skb_unlink(skb, &sk->sk_receive_queue); consume_skb(skb); if (siocb->scm->fp) @@ -2010,9 +2002,6 @@ static int unix_stream_recvmsg(struct kiocb *iocb, struct socket *sock, if (UNIXCB(skb).fp) siocb->scm->fp = scm_fp_dup(UNIXCB(skb).fp); - /* put message back and return */ - skb_queue_head(&sk->sk_receive_queue, skb); - sk->sk_data_ready(sk, skb->len); break; } } while (size); |