Diffstat (limited to 'net')
200 files changed, 6115 insertions, 3685 deletions
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 01ddb04..889f4ac 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -35,12 +35,12 @@ drop:
 }
 EXPORT_SYMBOL(__vlan_hwaccel_rx);
 
-int vlan_hwaccel_do_receive(struct sk_buff *skb)
+void vlan_hwaccel_do_receive(struct sk_buff *skb)
 {
 	struct net_device *dev = skb->dev;
 	struct vlan_rx_stats *rx_stats;
 
-	skb->dev = vlan_dev_info(dev)->real_dev;
+	skb->dev = vlan_dev_real_dev(dev);
 	netif_nit_deliver(skb);
 	skb->dev = dev;
@@ -69,7 +69,6 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb)
 		break;
 	}
 	u64_stats_update_end(&rx_stats->syncp);
-	return 0;
 }
 
 struct net_device *vlan_dev_real_dev(const struct net_device *dev)
@@ -106,9 +105,12 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp,
 		goto drop;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow =
-			p->dev == skb->dev && !compare_ether_header(
-				skb_mac_header(p), skb_gro_mac_header(skb));
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= compare_ether_header(skb_mac_header(p),
+					      skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow = !diffs;
 		NAPI_GRO_CB(p)->flush = 0;
 	}
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index c85109d..078eb16 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -222,7 +222,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err)
 	}
 }
 
-static unsigned int
+static int
 p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt)
 {
 	int ret, n;
diff --git a/net/atm/common.c b/net/atm/common.c
index 940404a..1b9c52a 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -792,7 +792,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname,
 	default:
 		if (level == SOL_SOCKET)
 			return -EINVAL;
-	break;
+		break;
 	}
 	if (!vcc->dev || !vcc->dev->ops->getsockopt)
 		return -EINVAL;
diff --git a/net/atm/lec.c b/net/atm/lec.c
index d98bde1..181d70c 100644
--- a/net/atm/lec.c
+++ b/net/atm/lec.c
@@ -220,7 +220,6 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc)
 static int lec_open(struct net_device *dev)
 {
 	netif_start_queue(dev);
-	memset(&dev->stats, 0, sizeof(struct net_device_stats));
 
 	return 0;
 }
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index cfdfd7e..26eaebf 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -1103,7 +1103,7 @@ done:
 out:
 	release_sock(sk);
 
-	return 0;
+	return err;
 }
 
 /*
diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c
index 7805945..a169084 100644
--- a/net/ax25/ax25_route.c
+++ b/net/ax25/ax25_route.c
@@ -412,7 +412,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 {
 	ax25_uid_assoc *user;
 	ax25_route *ax25_rt;
-	int err;
+	int err = 0;
 
 	if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL)
 		return -EHOSTUNREACH;
@@ -453,7 +453,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr)
 put:
 	ax25_put_route(ax25_rt);
 
-	return 0;
+	return err;
 }
 
 struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src,
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index 421c45b..ed0f22f 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -297,13 +297,12 @@ unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *w
 		mask |= POLLERR;
 
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	if (sk->sk_state == BT_CLOSED)
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index c03d2c3..89ad25a 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -61,30 +61,27 @@ static int port_cost(struct net_device *dev)
 }
 
 
-/*
- * Check for port carrier transistions.
- * Called from work queue to allow for calling functions that
- * might sleep (such as speed check), and to debounce.
- */
+/* Check for port carrier transistions. */
 void br_port_carrier_check(struct net_bridge_port *p)
 {
 	struct net_device *dev = p->dev;
 	struct net_bridge *br = p->br;
 
-	if (netif_carrier_ok(dev))
+	if (netif_running(dev) && netif_carrier_ok(dev))
 		p->path_cost = port_cost(dev);
 
-	if (netif_running(br->dev)) {
-		spin_lock_bh(&br->lock);
-		if (netif_carrier_ok(dev)) {
-			if (p->state == BR_STATE_DISABLED)
-				br_stp_enable_port(p);
-		} else {
-			if (p->state != BR_STATE_DISABLED)
-				br_stp_disable_port(p);
-		}
-		spin_unlock_bh(&br->lock);
+	if (!netif_running(br->dev))
+		return;
+
+	spin_lock_bh(&br->lock);
+	if (netif_running(dev) && netif_carrier_ok(dev)) {
+		if (p->state == BR_STATE_DISABLED)
+			br_stp_enable_port(p);
+	} else {
+		if (p->state != BR_STATE_DISABLED)
+			br_stp_disable_port(p);
 	}
+	spin_unlock_bh(&br->lock);
 }
 
 static void release_nbp(struct kobject *kobj)
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index 826cd52..6d04cfd 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -141,7 +141,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb)
 	const unsigned char *dest = eth_hdr(skb)->h_dest;
 	int (*rhook)(struct sk_buff *skb);
 
-	if (skb->pkt_type == PACKET_LOOPBACK)
+	if (unlikely(skb->pkt_type == PACKET_LOOPBACK))
 		return skb;
 
 	if (!is_valid_ether_addr(eth_hdr(skb)->h_source))
diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c
index 0b586e9..0fd01dd 100644
--- a/net/caif/caif_dev.c
+++ b/net/caif/caif_dev.c
@@ -9,6 +9,8 @@
  * and Sakari Ailus <sakari.ailus@nokia.com>
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/version.h>
 #include <linux/module.h>
 #include <linux/kernel.h>
@@ -214,7 +216,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 
 	switch (what) {
 	case NETDEV_REGISTER:
-		pr_info("CAIF: %s():register %s\n", __func__, dev->name);
+		netdev_info(dev, "register\n");
 		caifd = caif_device_alloc(dev);
 		if (caifd == NULL)
 			break;
@@ -225,14 +227,13 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		break;
 
 	case NETDEV_UP:
-		pr_info("CAIF: %s(): up %s\n", __func__, dev->name);
+		netdev_info(dev, "up\n");
 		caifd = caif_get(dev);
 		if (caifd == NULL)
 			break;
 		caifdev = netdev_priv(dev);
 		if (atomic_read(&caifd->state) == NETDEV_UP) {
-			pr_info("CAIF: %s():%s already up\n",
-				__func__, dev->name);
+			netdev_info(dev, "already up\n");
 			break;
 		}
 		atomic_set(&caifd->state, what);
@@ -273,7 +274,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		caifd = caif_get(dev);
 		if (caifd == NULL)
 			break;
-		pr_info("CAIF: %s():going down %s\n", __func__, dev->name);
+		netdev_info(dev, "going down\n");
 
 		if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN ||
 			atomic_read(&caifd->state) == NETDEV_DOWN)
@@ -295,11 +296,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 		caifd = caif_get(dev);
 		if (caifd == NULL)
 			break;
-		pr_info("CAIF: %s(): down %s\n", __func__, dev->name);
+		netdev_info(dev, "down\n");
 		if (atomic_read(&caifd->in_use))
-			pr_warning("CAIF: %s(): "
-				   "Unregistering an active CAIF device: %s\n",
-				   __func__, dev->name);
+			netdev_warn(dev,
+				    "Unregistering an active CAIF device\n");
 		cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer);
 		dev_put(dev);
 		atomic_set(&caifd->state, what);
@@ -307,7 +307,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what,
 
 	case NETDEV_UNREGISTER:
 		caifd = caif_get(dev);
-		pr_info("CAIF: %s(): unregister %s\n", __func__, dev->name);
+		netdev_info(dev, "unregister\n");
 		atomic_set(&caifd->state, what);
 		caif_device_destroy(dev);
 		break;
@@ -391,7 +391,7 @@ static int __init caif_device_init(void)
 	int result;
 	cfg = cfcnfg_create();
 	if (!cfg) {
-		pr_warning("CAIF: %s(): can't create cfcnfg.\n", __func__);
+		pr_warn("can't create cfcnfg\n");
 		goto err_cfcnfg_create_failed;
 	}
 	result = register_pernet_device(&caif_net_ops);
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index 8ce9047..fd1f5df 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
@@ -157,8 +159,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 
 	if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
 		(unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) {
-		trace_printk("CAIF: %s():"
-			" sending flow OFF (queue len = %d %d)\n",
+		trace_printk("CAIF: %s(): "
+			"sending flow OFF (queue len = %d %d)\n",
 			__func__,
 			atomic_read(&cf_sk->sk.sk_rmem_alloc),
 			sk_rcvbuf_lowwater(cf_sk));
@@ -172,8 +174,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
 		return err;
 	if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) {
 		set_rx_flow_off(cf_sk);
-		trace_printk("CAIF: %s():"
-			" sending flow OFF due to rmem_schedule\n",
+		trace_printk("CAIF: %s(): "
+			"sending flow OFF due to rmem_schedule\n",
 			__func__);
 		dbfs_atomic_inc(&cnt.num_rx_flow_off);
 		caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ);
@@ -275,8 +277,7 @@ static void caif_ctrl_cb(struct cflayer *layr,
 		break;
 
 	default:
-		pr_debug("CAIF: %s(): Unexpected flow command %d\n",
-				__func__, flow);
+		pr_debug("Unexpected flow command %d\n", flow);
 	}
 }
 
@@ -536,8 +537,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk,
 
 		/* Slight paranoia, probably not needed. */
 		if (unlikely(loopcnt++ > 1000)) {
-			pr_warning("CAIF: %s(): transmit retries failed,"
-				" error = %d\n", __func__, ret);
+			pr_warn("transmit retries failed, error = %d\n", ret);
 			break;
 		}
@@ -902,8 +902,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr,
 	cf_sk->maxframe = dev->mtu - (headroom + tailroom);
 	dev_put(dev);
 	if (cf_sk->maxframe < 1) {
-		pr_warning("CAIF: %s(): CAIF Interface MTU too small (%d)\n",
-			__func__, dev->mtu);
+		pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu);
 		err = -ENODEV;
 		goto out;
 	}
diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c
index 1c29189..ef93a13 100644
--- a/net/caif/cfcnfg.c
+++ b/net/caif/cfcnfg.c
@@ -3,6 +3,9 @@
  * Author: Sjur Brendeland/sjur.brandeland@stericsson.com
  * License terms: GNU General Public License (GPL) version 2
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/stddef.h>
 #include <linux/slab.h>
@@ -78,7 +81,7 @@ struct cfcnfg *cfcnfg_create(void)
 	/* Initiate this layer */
 	this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	this->mux = cfmuxl_create();
@@ -106,7 +109,7 @@ struct cfcnfg *cfcnfg_create(void)
 	layer_set_up(this->ctrl, this);
 	return this;
 out_of_mem:
-	pr_warning("CAIF: %s(): Out of memory\n", __func__);
+	pr_warn("Out of memory\n");
 	kfree(this->mux);
 	kfree(this->ctrl);
 	kfree(this);
@@ -194,7 +197,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
 	caif_assert(adap_layer != NULL);
 	channel_id = adap_layer->id;
 	if (adap_layer->dn == NULL || channel_id == 0) {
-		pr_err("CAIF: %s():adap_layer->id is 0\n", __func__);
+		pr_err("adap_layer->id is 0\n");
 		ret = -ENOTCONN;
 		goto end;
 	}
@@ -204,9 +207,8 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
 	layer_set_up(servl, NULL);
 	ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer);
 	if (servl == NULL) {
-		pr_err("CAIF: %s(): PROTOCOL ERROR "
-		       "- Error removing service_layer Channel_Id(%d)",
-			__func__, channel_id);
+		pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)",
+		       channel_id);
 		ret = -EINVAL;
 		goto end;
 	}
@@ -216,18 +218,14 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer)
 
 	phyinfo = cfcnfg_get_phyinfo(cnfg, phyid);
 	if (phyinfo == NULL) {
-		pr_warning("CAIF: %s(): "
-			"No interface to send disconnect to\n",
-			__func__);
+		pr_warn("No interface to send disconnect to\n");
 		ret = -ENODEV;
 		goto end;
 	}
 	if (phyinfo->id != phyid ||
 		phyinfo->phy_layer->id != phyid ||
 		phyinfo->frm_layer->id != phyid) {
-		pr_err("CAIF: %s(): "
-			"Inconsistency in phy registration\n",
-			__func__);
+		pr_err("Inconsistency in phy registration\n");
 		ret = -EINVAL;
 		goto end;
 	}
@@ -276,21 +274,20 @@ int cfcnfg_add_adaptation_layer(struct cfcnfg *cnfg,
 {
 	struct cflayer *frml;
 	if (adap_layer == NULL) {
-		pr_err("CAIF: %s(): adap_layer is zero", __func__);
+		pr_err("adap_layer is zero\n");
 		return -EINVAL;
 	}
 	if (adap_layer->receive == NULL) {
-		pr_err("CAIF: %s(): adap_layer->receive is NULL", __func__);
+		pr_err("adap_layer->receive is NULL\n");
 		return -EINVAL;
 	}
 	if (adap_layer->ctrlcmd == NULL) {
-		pr_err("CAIF: %s(): adap_layer->ctrlcmd == NULL", __func__);
+		pr_err("adap_layer->ctrlcmd == NULL\n");
 		return -EINVAL;
 	}
 	frml = cnfg->phy_layers[param->phyid].frm_layer;
 	if (frml == NULL) {
-		pr_err("CAIF: %s(): Specified PHY type does not exist!",
-			__func__);
+		pr_err("Specified PHY type does not exist!\n");
 		return -ENODEV;
 	}
 	caif_assert(param->phyid == cnfg->phy_layers[param->phyid].id);
@@ -330,9 +327,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 	struct net_device *netdev;
 
 	if (adapt_layer == NULL) {
-		pr_debug("CAIF: %s(): link setup response "
-				"but no client exist, send linkdown back\n",
-				__func__);
+		pr_debug("link setup response but no client exist, send linkdown back\n");
 		cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL);
 		return;
 	}
@@ -374,13 +369,11 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv,
 		servicel = cfdbgl_create(channel_id, &phyinfo->dev_info);
 		break;
 	default:
-		pr_err("CAIF: %s(): Protocol error. "
-			"Link setup response - unknown channel type\n",
-			__func__);
+		pr_err("Protocol error. Link setup response - unknown channel type\n");
 		return;
 	}
 	if (!servicel) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	layer_set_dn(servicel, cnfg->mux);
@@ -418,7 +411,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 		}
 	}
 	if (*phyid == 0) {
-		pr_err("CAIF: %s(): No Available PHY ID\n", __func__);
+		pr_err("No Available PHY ID\n");
 		return;
 	}
@@ -427,7 +420,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 		phy_driver =
 		    cfserl_create(CFPHYTYPE_FRAG, *phyid, stx);
 		if (!phy_driver) {
-			pr_warning("CAIF: %s(): Out of memory\n", __func__);
+			pr_warn("Out of memory\n");
 			return;
 		}
@@ -436,7 +429,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 		phy_driver = NULL;
 		break;
 	default:
-		pr_err("CAIF: %s(): %d", __func__, phy_type);
+		pr_err("%d\n", phy_type);
 		return;
 		break;
 	}
@@ -455,7 +448,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type,
 	phy_layer->type = phy_type;
 	frml = cffrml_create(*phyid, fcs);
 	if (!frml) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cnfg->phy_layers[*phyid].frm_layer = frml;
diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c
index 563145f..08f267a 100644
--- a/net/caif/cfctrl.c
+++ b/net/caif/cfctrl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -36,7 +38,7 @@ struct cflayer *cfctrl_create(void)
 	struct cfctrl *this =
 		kmalloc(sizeof(struct cfctrl), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -132,9 +134,7 @@ struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl,
 	list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
 		if (cfctrl_req_eq(req, p)) {
 			if (p != first)
-				pr_warning("CAIF: %s(): Requests are not "
-						"received in order\n",
-						__func__);
+				pr_warn("Requests are not received in order\n");
 
 			atomic_set(&ctrl->rsp_seq_no,
 					 p->sequence_no);
@@ -177,7 +177,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
 	int ret;
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	caif_assert(offsetof(struct cfctrl, serv.layer) == 0);
@@ -189,8 +189,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid)
 	ret =
 	    cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
 	if (ret < 0) {
-		pr_err("CAIF: %s(): Could not transmit enum message\n",
-			__func__);
+		pr_err("Could not transmit enum message\n");
 		cfpkt_destroy(pkt);
 	}
 }
@@ -208,7 +207,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
 	char utility_name[16];
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return -ENOMEM;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP);
@@ -253,13 +252,13 @@ int cfctrl_linkup_request(struct cflayer *layer,
 				param->u.utility.paramlen);
 		break;
 	default:
-		pr_warning("CAIF: %s():Request setup of bad link type = %d\n",
-			   __func__, param->linktype);
+		pr_warn("Request setup of bad link type = %d\n",
+			param->linktype);
 		return -EINVAL;
 	}
 	req = kzalloc(sizeof(*req), GFP_KERNEL);
 	if (!req) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return -ENOMEM;
 	}
 	req->client_layer = user_layer;
@@ -276,8 +275,7 @@ int cfctrl_linkup_request(struct cflayer *layer,
 	ret =
 	    cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
 	if (ret < 0) {
-		pr_err("CAIF: %s(): Could not transmit linksetup request\n",
-			__func__);
+		pr_err("Could not transmit linksetup request\n");
 		cfpkt_destroy(pkt);
 		return -ENODEV;
 	}
@@ -291,7 +289,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return -ENOMEM;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY);
@@ -300,8 +298,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid,
 	ret =
 	    cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt);
 	if (ret < 0) {
-		pr_err("CAIF: %s(): Could not transmit link-down request\n",
-			__func__);
+		pr_err("Could not transmit link-down request\n");
 		cfpkt_destroy(pkt);
 	}
 	return ret;
@@ -313,7 +310,7 @@ void cfctrl_sleep_req(struct cflayer *layer)
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_SLEEP);
@@ -330,7 +327,7 @@ void cfctrl_wake_req(struct cflayer *layer)
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_WAKE);
@@ -347,7 +344,7 @@ void cfctrl_getstartreason_req(struct cflayer *layer)
 	struct cfctrl *cfctrl = container_obj(layer);
 	struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN);
 	if (!pkt) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return;
 	}
 	cfpkt_addbdy(pkt, CFCTRL_CMD_START_REASON);
@@ -364,12 +361,11 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer)
 	struct cfctrl_request_info *p, *tmp;
 	struct cfctrl *ctrl = container_obj(layr);
 	spin_lock(&ctrl->info_list_lock);
-	pr_warning("CAIF: %s(): enter\n", __func__);
+	pr_warn("enter\n");
 
 	list_for_each_entry_safe(p, tmp, &ctrl->list, list) {
 		if (p->client_layer == adap_layer) {
-			pr_warning("CAIF: %s(): cancel req :%d\n", __func__,
-					p->sequence_no);
+			pr_warn("cancel req :%d\n", p->sequence_no);
 			list_del(&p->list);
 			kfree(p);
 		}
@@ -520,9 +516,8 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 			cfpkt_extr_head(pkt, &param, len);
 			break;
 		default:
-			pr_warning("CAIF: %s(): Request setup "
-				   "- invalid link type (%d)",
-				   __func__, serv);
+			pr_warn("Request setup - invalid link type (%d)\n",
+				serv);
 			goto error;
 		}
 
@@ -532,9 +527,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 
 		if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) ||
 			cfpkt_erroneous(pkt)) {
-			pr_err("CAIF: %s(): Invalid O/E bit or parse "
-			       "error on CAIF control channel",
-				__func__);
+			pr_err("Invalid O/E bit or parse error on CAIF control channel\n");
 			cfctrl->res.reject_rsp(cfctrl->serv.layer.up,
 					       0,
 					       req ? req->client_layer
@@ -556,8 +549,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 		cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid);
 		break;
 	case CFCTRL_CMD_LINK_ERR:
-		pr_err("CAIF: %s(): Frame Error Indication received\n",
-			__func__);
+		pr_err("Frame Error Indication received\n");
 		cfctrl->res.linkerror_ind();
 		break;
 	case CFCTRL_CMD_ENUM:
@@ -576,7 +568,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt)
 		cfctrl->res.radioset_rsp();
 		break;
 	default:
-		pr_err("CAIF: %s(): Unrecognized Control Frame\n", __func__);
+		pr_err("Unrecognized Control Frame\n");
 		goto error;
 		break;
 	}
@@ -595,8 +587,7 @@ static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 	case CAIF_CTRLCMD_FLOW_OFF_IND:
 		spin_lock(&this->info_list_lock);
 		if (!list_empty(&this->list)) {
-			pr_debug("CAIF: %s(): Received flow off in "
-				   "control layer", __func__);
+			pr_debug("Received flow off in control layer\n");
 		}
 		spin_unlock(&this->info_list_lock);
 		break;
@@ -620,7 +611,7 @@ static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt)
 		if (!ctrl->loop_linkused[linkid])
 			goto found;
 	spin_unlock(&ctrl->loop_linkid_lock);
-	pr_err("CAIF: %s(): Out of link-ids\n", __func__);
+	pr_err("Out of link-ids\n");
 	return -EINVAL;
 found:
 	if (!ctrl->loop_linkused[linkid])
diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c
index 676648c..496fda9 100644
--- a/net/caif/cfdbgl.c
+++ b/net/caif/cfdbgl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <net/caif/caif_layer.h>
@@ -17,7 +19,7 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!dbg) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c
index ed9d53a..d3ed264 100644
--- a/net/caif/cfdgml.c
+++ b/net/caif/cfdgml.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -26,7 +28,7 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!dgm) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -49,14 +51,14 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
 	caif_assert(layr->ctrlcmd != NULL);
 
 	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
 
 	if ((cmd & DGM_CMD_BIT) == 0) {
 		if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) {
-			pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+			pr_err("Packet is erroneous!\n");
 			cfpkt_destroy(pkt);
 			return -EPROTO;
 		}
@@ -75,8 +77,7 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt)
 		return 0;
 	default:
 		cfpkt_destroy(pkt);
-		pr_info("CAIF: %s(): Unknown datagram control %d (0x%x)\n",
-			__func__, cmd, cmd);
+		pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd);
 		return -EPROTO;
 	}
 }
diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c
index e86a4ca..a445043 100644
--- a/net/caif/cffrml.c
+++ b/net/caif/cffrml.c
@@ -6,6 +6,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -32,7 +34,7 @@ struct cflayer *cffrml_create(u16 phyid, bool use_fcs)
 {
 	struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cffrml, layer) == 0);
@@ -83,7 +85,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
 
 	if (cfpkt_setlen(pkt, len) < 0) {
 		++cffrml_rcv_error;
-		pr_err("CAIF: %s():Framing length error (%d)\n", __func__, len);
+		pr_err("Framing length error (%d)\n", len);
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -99,14 +101,14 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt)
 			cfpkt_add_trail(pkt, &tmp, 2);
 			++cffrml_rcv_error;
 			++cffrml_rcv_checsum_error;
-			pr_info("CAIF: %s(): Frame checksum error "
-				"(0x%x != 0x%x)\n", __func__, hdrchks, pktchks);
+			pr_info("Frame checksum error (0x%x != 0x%x)\n",
+				hdrchks, pktchks);
 			return -EILSEQ;
 		}
 	}
 	if (cfpkt_erroneous(pkt)) {
 		++cffrml_rcv_error;
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -132,7 +134,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	cfpkt_add_head(pkt, &tmp, 2);
 	cfpkt_info(pkt)->hdr_len += 2;
 	if (cfpkt_erroneous(pkt)) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		return -EPROTO;
 	}
 	ret = layr->dn->transmit(layr->dn, pkt);
diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c
index 80c8d33..46f34b2 100644
--- a/net/caif/cfmuxl.c
+++ b/net/caif/cfmuxl.c
@@ -3,6 +3,9 @@
  * Author: Sjur Brendeland/sjur.brandeland@stericsson.com
  * License terms: GNU General Public License (GPL) version 2
  */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -190,7 +193,7 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
 	u8 id;
 	struct cflayer *up;
 	if (cfpkt_extr_head(pkt, &id, 1) < 0) {
-		pr_err("CAIF: %s(): erroneous Caif Packet\n", __func__);
+		pr_err("erroneous Caif Packet\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -199,8 +202,8 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt)
 	up = get_up(muxl, id);
 	spin_unlock(&muxl->receive_lock);
 	if (up == NULL) {
-		pr_info("CAIF: %s():Received data on unknown link ID = %d "
-			"(0x%x) up == NULL", __func__, id, id);
+		pr_info("Received data on unknown link ID = %d (0x%x) up == NULL",
+			id, id);
 		cfpkt_destroy(pkt);
 		/*
 		 * Don't return ERROR, since modem misbehaves and sends out
@@ -223,9 +226,8 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	struct caif_payload_info *info = cfpkt_info(pkt);
 	dn = get_dn(muxl, cfpkt_info(pkt)->dev_info);
 	if (dn == NULL) {
-		pr_warning("CAIF: %s(): Send data on unknown phy "
-			   "ID = %d (0x%x)\n",
-			   __func__, info->dev_info->id, info->dev_info->id);
+		pr_warn("Send data on unknown phy ID = %d (0x%x)\n",
+			info->dev_info->id, info->dev_info->id);
 		return -ENOTCONN;
 	}
 	info->hdr_len += 1;
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index c49a669..d7e865e 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/string.h>
 #include <linux/skbuff.h>
 #include <linux/hardirq.h>
@@ -12,11 +14,12 @@
 #define PKT_PREFIX  48
 #define PKT_POSTFIX 2
 #define PKT_LEN_WHEN_EXTENDING 128
-#define PKT_ERROR(pkt, errmsg) do {	   \
-    cfpkt_priv(pkt)->erronous = true;	   \
-    skb_reset_tail_pointer(&pkt->skb);	   \
-    pr_warning("CAIF: " errmsg);\
-  } while (0)
+#define PKT_ERROR(pkt, errmsg)		   \
+do {					   \
+	cfpkt_priv(pkt)->erronous = true;  \
+	skb_reset_tail_pointer(&pkt->skb); \
+	pr_warn(errmsg);		   \
+} while (0)
 
 struct cfpktq {
 	struct sk_buff_head head;
@@ -130,13 +133,13 @@ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len)
 		return -EPROTO;
 
 	if (unlikely(len > skb->len)) {
-		PKT_ERROR(pkt, "cfpkt_extr_head read beyond end of packet\n");
+		PKT_ERROR(pkt, "read beyond end of packet\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(len > skb_headlen(skb))) {
 		if (unlikely(skb_linearize(skb) != 0)) {
-			PKT_ERROR(pkt, "cfpkt_extr_head linearize failed\n");
+			PKT_ERROR(pkt, "linearize failed\n");
 			return -EPROTO;
 		}
 	}
@@ -156,11 +159,11 @@ int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len)
 		return -EPROTO;
 
 	if (unlikely(skb_linearize(skb) != 0)) {
-		PKT_ERROR(pkt, "cfpkt_extr_trail linearize failed\n");
+		PKT_ERROR(pkt, "linearize failed\n");
 		return -EPROTO;
 	}
 	if (unlikely(skb->data + len > skb_tail_pointer(skb))) {
-		PKT_ERROR(pkt, "cfpkt_extr_trail read beyond end of packet\n");
+		PKT_ERROR(pkt, "read beyond end of packet\n");
 		return -EPROTO;
 	}
 	from = skb_tail_pointer(skb) - len;
@@ -202,7 +205,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
 
 	/* Make sure data is writable */
 	if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) {
-		PKT_ERROR(pkt, "cfpkt_add_body: cow failed\n");
+		PKT_ERROR(pkt, "cow failed\n");
 		return -EPROTO;
 	}
 	/*
@@ -211,8 +214,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len)
 	 * lengths of the top SKB.
 	 */
 	if (lastskb != skb) {
-		pr_warning("CAIF: %s(): Packet is non-linear\n",
-			   __func__);
+		pr_warn("Packet is non-linear\n");
 		skb->len += len;
 		skb->data_len += len;
 	}
@@ -242,14 +244,14 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len)
 	if (unlikely(is_erronous(pkt)))
 		return -EPROTO;
 	if (unlikely(skb_headroom(skb) < len)) {
-		PKT_ERROR(pkt, "cfpkt_add_head: no headroom\n");
+		PKT_ERROR(pkt, "no headroom\n");
 		return -EPROTO;
 	}
 
 	/* Make sure data is writable */
 	ret = skb_cow_data(skb, 0, &lastskb);
 	if (unlikely(ret < 0)) {
-		PKT_ERROR(pkt, "cfpkt_add_head: cow failed\n");
+		PKT_ERROR(pkt, "cow failed\n");
 		return ret;
 	}
 
@@ -283,7 +285,7 @@ inline u16 cfpkt_iterate(struct cfpkt *pkt,
 	if (unlikely(is_erronous(pkt)))
 		return -EPROTO;
 	if (unlikely(skb_linearize(&pkt->skb) != 0)) {
-		PKT_ERROR(pkt, "cfpkt_iterate: linearize failed\n");
+		PKT_ERROR(pkt, "linearize failed\n");
 		return -EPROTO;
 	}
 	return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt));
@@ -309,7 +311,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
 
 	/* Need to expand SKB */
 	if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len)))
-		PKT_ERROR(pkt, "cfpkt_setlen: skb_pad_trail failed\n");
+		PKT_ERROR(pkt, "skb_pad_trail failed\n");
 
 	return cfpkt_getlen(pkt);
 }
@@ -380,8 +382,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos)
 		return NULL;
 
 	if (skb->data + pos > skb_tail_pointer(skb)) {
-		PKT_ERROR(pkt,
-			  "cfpkt_split: trying to split beyond end of packet");
+		PKT_ERROR(pkt, "trying to split beyond end of packet\n");
 		return NULL;
 	}
 
@@ -455,17 +456,17 @@ int cfpkt_raw_append(struct cfpkt *pkt, void **buf, unsigned int buflen)
 		return -EPROTO;
 	/* Make sure SKB is writable */
 	if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) {
-		PKT_ERROR(pkt, "cfpkt_raw_append: skb_cow_data failed\n");
+		PKT_ERROR(pkt, "skb_cow_data failed\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(skb_linearize(skb) != 0)) {
-		PKT_ERROR(pkt, "cfpkt_raw_append: linearize failed\n");
+		PKT_ERROR(pkt, "linearize failed\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(skb_tailroom(skb) < buflen)) {
-		PKT_ERROR(pkt, "cfpkt_raw_append: buffer too short - failed\n");
+		PKT_ERROR(pkt, "buffer too short - failed\n");
 		return -EPROTO;
 	}
 
@@ -483,14 +484,13 @@ int cfpkt_raw_extract(struct cfpkt *pkt, void **buf, unsigned int buflen)
 		return -EPROTO;
 
 	if (unlikely(buflen > skb->len)) {
-		PKT_ERROR(pkt, "cfpkt_raw_extract: buflen too large "
-				"- failed\n");
+		PKT_ERROR(pkt, "buflen too large - failed\n");
 		return -EPROTO;
 	}
 
 	if (unlikely(buflen > skb_headlen(skb))) {
		if (unlikely(skb_linearize(skb) != 0)) {
-			PKT_ERROR(pkt, "cfpkt_raw_extract: linearize failed\n");
+			PKT_ERROR(pkt, "linearize failed\n");
 			return -EPROTO;
 		}
 	}
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index 9a69924..bde8481 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -48,7 +50,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info,
 		kzalloc(sizeof(struct cfrfml), GFP_ATOMIC);
 
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 
@@ -178,9 +180,7 @@ out:
 		cfpkt_destroy(rfml->incomplete_frm);
 		rfml->incomplete_frm = NULL;
 
-		pr_info("CAIF: %s(): "
-			"Connection error %d triggered on RFM link\n",
-			__func__, err);
+		pr_info("Connection error %d triggered on RFM link\n", err);
 
 		/* Trigger connection error upon failure.*/
 		layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
@@ -280,9 +280,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt)
 out:
 
 	if (err != 0) {
-		pr_info("CAIF: %s(): "
-			"Connection error %d triggered on RFM link\n",
-			__func__, err);
+		pr_info("Connection error %d triggered on RFM link\n", err);
 		/* Trigger connection error upon failure.*/
 
 		layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND,
diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c
index a11fbd6..9297f7d 100644
--- a/net/caif/cfserl.c
+++ b/net/caif/cfserl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/spinlock.h>
 #include <linux/slab.h>
@@ -34,7 +36,7 @@ struct cflayer *cfserl_create(int type, int instance, bool use_stx)
 {
 	struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC);
 	if (!this) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfserl, layer) == 0);
diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c
index f40939a..ab5e542 100644
--- a/net/caif/cfsrvl.c
+++ b/net/caif/cfsrvl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/errno.h>
@@ -79,8 +81,7 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl,
 		layr->up->ctrlcmd(layr->up, ctrl, phyid);
 		break;
 	default:
-		pr_warning("CAIF: %s(): "
-			   "Unexpected ctrl in cfsrvl (%d)\n", __func__, ctrl);
+		pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl);
 		/* We have both modem and phy flow on, send flow on */
 		layr->up->ctrlcmd(layr->up, ctrl, phyid);
 		service->phy_flow_on = true;
@@ -107,14 +108,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
 			u8 flow_on = SRVL_FLOW_ON;
 			pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
 			if (!pkt) {
-				pr_warning("CAIF: %s(): Out of memory\n",
-					__func__);
+				pr_warn("Out of memory\n");
 				return -ENOMEM;
 			}
 
 			if (cfpkt_add_head(pkt, &flow_on, 1) < 0) {
-				pr_err("CAIF: %s(): Packet is erroneous!\n",
-					__func__);
+				pr_err("Packet is erroneous!\n");
 				cfpkt_destroy(pkt);
 				return -EPROTO;
 			}
@@ -131,14 +130,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl)
 			u8 flow_off = SRVL_FLOW_OFF;
 			pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE);
 			if (!pkt) {
-				pr_warning("CAIF: %s(): Out of memory\n",
-					__func__);
+				pr_warn("Out of memory\n");
 				return -ENOMEM;
 			}
 
 			if (cfpkt_add_head(pkt, &flow_off, 1) < 0) {
-				pr_err("CAIF: %s(): Packet is erroneous!\n",
-					__func__);
+				pr_err("Packet is erroneous!\n");
 				cfpkt_destroy(pkt);
 				return -EPROTO;
 			}
diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c
index 02795af..efad410 100644
--- a/net/caif/cfutill.c
+++ b/net/caif/cfutill.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
@@ -26,7 +28,7 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!util) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -47,7 +49,7 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
 	caif_assert(layr->up->receive != NULL);
 	caif_assert(layr->up->ctrlcmd != NULL);
 	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -64,16 +66,14 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt)
 		cfpkt_destroy(pkt);
 		return 0;
 	case UTIL_REMOTE_SHUTDOWN:	/* Remote Shutdown Request */
-		pr_err("CAIF: %s(): REMOTE SHUTDOWN REQUEST RECEIVED\n",
-			__func__);
+		pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n");
 		layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0);
 		service->open = false;
 		cfpkt_destroy(pkt);
 		return 0;
 	default:
 		cfpkt_destroy(pkt);
-		pr_warning("CAIF: %s(): Unknown service control %d (0x%x)\n",
-			   __func__, cmd, cmd);
+		pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd);
 		return -EPROTO;
 	}
 }
diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c
index 77cc09f..3b425b1 100644
--- a/net/caif/cfveil.c
+++ b/net/caif/cfveil.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/stddef.h>
 #include <linux/slab.h>
 #include <net/caif/caif_layer.h>
@@ -25,7 +27,7 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!vei) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -47,7 +49,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
 
 
 	if (cfpkt_extr_head(pkt, &cmd, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -67,8 +69,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt)
 		cfpkt_destroy(pkt);
 		return 0;
 	default:		/* SET RS232 PIN */
-		pr_warning("CAIF: %s():Unknown VEI control packet %d (0x%x)!\n",
-			   __func__, cmd, cmd);
+		pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd);
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
@@ -86,7 +87,7 @@ static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt)
 	caif_assert(layr->dn->transmit != NULL);
 
 	if (cfpkt_add_head(pkt, &tmp, 1) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		return -EPROTO;
 	}
diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c
index ada6ee2..bf6fef2 100644
--- a/net/caif/cfvidl.c
+++ b/net/caif/cfvidl.c
@@ -4,6 +4,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/kernel.h>
 #include <linux/types.h>
 #include <linux/slab.h>
@@ -21,7 +23,7 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info)
 {
 	struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC);
 	if (!vid) {
-		pr_warning("CAIF: %s(): Out of memory\n", __func__);
+		pr_warn("Out of memory\n");
 		return NULL;
 	}
 	caif_assert(offsetof(struct cfsrvl, layer) == 0);
@@ -38,7 +40,7 @@ static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt)
 {
 	u32 videoheader;
 	if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) {
-		pr_err("CAIF: %s(): Packet is erroneous!\n", __func__);
+		pr_err("Packet is erroneous!\n");
 		cfpkt_destroy(pkt);
 		return -EPROTO;
 	}
diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c
index 4293e19..86aac24 100644
--- a/net/caif/chnl_net.c
+++ b/net/caif/chnl_net.c
@@ -5,6 +5,8 @@
  * License terms: GNU General Public License (GPL) version 2
  */
 
+#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__
+
 #include <linux/version.h>
 #include <linux/fs.h>
 #include <linux/init.h>
@@ -29,7 +31,7 @@
 #define CAIF_NET_DEFAULT_QUEUE_LEN 500
 
 #undef pr_debug
-#define pr_debug pr_warning
+#define pr_debug pr_warn
 
 /*This list is protected by the rtnl lock. */
 static LIST_HEAD(chnl_net_list);
@@ -142,8 +144,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow,
 			     int phyid)
 {
 	struct chnl_net *priv = container_of(layr, struct chnl_net, chnl);
-	pr_debug("CAIF: %s(): NET flowctrl func called flow: %s\n",
-		__func__,
+	pr_debug("NET flowctrl func called flow: %s\n",
 		flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" :
 		flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" :
 		flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" :
@@ -196,12 +197,12 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev)
 	priv = netdev_priv(dev);
 
 	if (skb->len > priv->netdev->mtu) {
-		pr_warning("CAIF: %s(): Size of skb exceeded MTU\n", __func__);
+		pr_warn("Size of skb exceeded MTU\n");
 		return -ENOSPC;
 	}
 
 	if (!priv->flowenabled) {
-		pr_debug("CAIF: %s(): dropping packets flow off\n", __func__);
+		pr_debug("dropping packets flow off\n");
 		return NETDEV_TX_BUSY;
 	}
 
@@ -237,7 +238,7 @@ static int chnl_net_open(struct net_device *dev)
 	ASSERT_RTNL();
 	priv = netdev_priv(dev);
 	if (!priv) {
-		pr_debug("CAIF: %s(): chnl_net_open: no priv\n", __func__);
+		pr_debug("chnl_net_open: no priv\n");
 		return -ENODEV;
 	}
 
@@ -246,18 +247,17 @@ static int chnl_net_open(struct net_device *dev)
 	result = caif_connect_client(&priv->conn_req, &priv->chnl,
 				&llifindex, &headroom, &tailroom);
 	if (result != 0) {
-		pr_debug("CAIF: %s(): err: "
-			"Unable to register and open device,"
-			" Err:%d\n",
-			__func__,
-			result);
+		pr_debug("err: "
+			 "Unable to register and open device,"
+			 " Err:%d\n",
+			 result);
 		goto error;
 	}
 
 	lldev = dev_get_by_index(dev_net(dev), llifindex);
 
 	if (lldev == NULL) {
-		pr_debug("CAIF: %s(): no interface?\n", __func__);
+		pr_debug("no interface?\n");
 		result = -ENODEV;
 		goto error;
 	}
@@ -279,9 +279,7 @@ static int chnl_net_open(struct net_device *dev)
 	dev_put(lldev);
 
 	if (mtu < 100) {
-		pr_warning("CAIF: %s(): "
-			"CAIF Interface MTU too small (%d)\n",
-			__func__, mtu);
+		pr_warn("CAIF Interface MTU too small (%d)\n", mtu);
 		result = -ENODEV;
 		goto error;
 	}
@@ -296,33 +294,32 @@ static int chnl_net_open(struct net_device *dev)
 	rtnl_lock();
 
 	if (result == -ERESTARTSYS) {
-		pr_debug("CAIF: %s(): wait_event_interruptible"
-			 " woken by a signal\n", __func__);
+		pr_debug("wait_event_interruptible woken by a signal\n");
 		result = -ERESTARTSYS;
 		goto error;
 	}
 
 	if (result == 0) {
-		pr_debug("CAIF: %s(): connect timeout\n", __func__);
+		pr_debug("connect timeout\n");
 		caif_disconnect_client(&priv->chnl);
 		priv->state = CAIF_DISCONNECTED;
-		pr_debug("CAIF: %s(): state disconnected\n", __func__);
+		pr_debug("state disconnected\n");
 		result = -ETIMEDOUT;
 		goto error;
 	}
 
 	if (priv->state != CAIF_CONNECTED) {
-		pr_debug("CAIF: %s(): connect failed\n", __func__);
+		pr_debug("connect failed\n");
 		result = -ECONNREFUSED;
 		goto error;
 	}
-	pr_debug("CAIF: %s(): CAIF Netdevice connected\n", __func__);
+	pr_debug("CAIF Netdevice connected\n");
 	return 0;
 
 error:
 	caif_disconnect_client(&priv->chnl);
 	priv->state = CAIF_DISCONNECTED;
-	pr_debug("CAIF: %s(): state disconnected\n", __func__);
+	pr_debug("state disconnected\n");
 	return result;
 }
@@ -413,7 +410,7 @@ static void caif_netlink_parms(struct nlattr *data[],
 			       struct caif_connect_request *conn_req)
 {
 	if (!data) {
-		pr_warning("CAIF: %s: no params data found\n", __func__);
+		pr_warn("no params data found\n");
 		return;
 	}
 	if (data[IFLA_CAIF_IPV4_CONNID])
@@ -442,8 +439,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev,
 
 	ret = register_netdevice(dev);
 	if (ret)
-		pr_warning("CAIF: %s(): device rtml registration failed\n",
-			   __func__);
+		pr_warn("device rtml registration failed\n");
 	return ret;
 }
diff --git a/net/can/raw.c b/net/can/raw.c
index a10e333..7d77e67 100644
--- a/net/can/raw.c
+++ b/net/can/raw.c
@@ -647,12 +647,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock,
 	err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size);
 	if (err < 0)
 		goto free_skb;
-	err = sock_tx_timestamp(msg, sk, skb_tx(skb));
+	err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
 	if (err < 0)
 		goto free_skb;
 
 	/* to be able to check the received tx sock reference in raw_rcv() */
-	skb_tx(skb)->prevent_sk_orphan = 1;
+	skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF;
 
 	skb->dev = dev;
 	skb->sk  = sk;
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 251997a..4df1b7a 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -746,13 +746,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
 	if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue))
 		mask |= POLLERR;
 	if (sk->sk_shutdown & RCV_SHUTDOWN)
-		mask |= POLLRDHUP;
+		mask |= POLLRDHUP | POLLIN | POLLRDNORM;
 	if (sk->sk_shutdown == SHUTDOWN_MASK)
 		mask |= POLLHUP;
 
 	/* readable? */
-	if (!skb_queue_empty(&sk->sk_receive_queue) ||
-	    (sk->sk_shutdown & RCV_SHUTDOWN))
+	if (!skb_queue_empty(&sk->sk_receive_queue))
 		mask |= POLLIN | POLLRDNORM;
 
 	/* Connection-based need to check for termination and startup */
diff --git a/net/core/dev.c b/net/core/dev.c
index b9b22a3..fc2dc93 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -371,6 +371,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
  * --ANK (980803)
  */
 
+static inline struct list_head *ptype_head(const struct packet_type *pt)
+{
+	if (pt->type == htons(ETH_P_ALL))
+		return &ptype_all;
+	else
+		return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+}
+
 /**
  *	dev_add_pack - add packet handler
  *	@pt: packet type declaration
@@ -386,16 +394,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 void dev_add_pack(struct packet_type *pt)
 {
-	int hash;
+	struct list_head *head = ptype_head(pt);
 
-	spin_lock_bh(&ptype_lock);
-	if (pt->type == htons(ETH_P_ALL))
-		list_add_rcu(&pt->list, &ptype_all);
-	else {
-		hash = ntohs(pt->type) & PTYPE_HASH_MASK;
-		list_add_rcu(&pt->list, &ptype_base[hash]);
-	}
-	spin_unlock_bh(&ptype_lock);
+	spin_lock(&ptype_lock);
+	list_add_rcu(&pt->list, head);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(dev_add_pack);
 
@@ -414,15 +417,10 @@ EXPORT_SYMBOL(dev_add_pack);
  */
 void __dev_remove_pack(struct packet_type *pt)
 {
-	struct list_head *head;
+	struct list_head *head = ptype_head(pt);
 	struct packet_type *pt1;
 
-	spin_lock_bh(&ptype_lock);
-
-	if (pt->type == htons(ETH_P_ALL))
-		head = &ptype_all;
-	else
-		head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
+	spin_lock(&ptype_lock);
 
 	list_for_each_entry(pt1, head, list) {
 		if (pt == pt1) {
@@ -433,7 +431,7 @@ void __dev_remove_pack(struct packet_type *pt)
 
 	printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt);
 out:
-	spin_unlock_bh(&ptype_lock);
+	spin_unlock(&ptype_lock);
 }
 EXPORT_SYMBOL(__dev_remove_pack);
@@ -1902,14 +1900,14 @@ static int dev_gso_segment(struct sk_buff *skb)
 
 /*
  * Try to orphan skb early, right before transmission by the device.
- * We cannot orphan skb if tx timestamp is requested, since
- * drivers need to call skb_tstamp_tx() to send the timestamp.
+ * We cannot orphan skb if tx timestamp is requested or the sk-reference
+ * is needed on driver level for other reasons, e.g. see net/can/raw.c
 */
 static inline void skb_orphan_try(struct sk_buff *skb)
 {
 	struct sock *sk = skb->sk;
 
-	if (sk && !skb_tx(skb)->flags) {
+	if (sk && !skb_shinfo(skb)->tx_flags) {
 		/* skb_tx_hash() wont be able to get sk.
 		 * We copy sk_hash into skb->rxhash
 		 */
@@ -1930,7 +1928,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
 				      struct net_device *dev)
 {
 	return skb_is_nonlinear(skb) &&
-	       ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
+	       ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) ||
 	        (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) ||
 					      illegal_highdma(dev, skb))));
 }
@@ -2259,69 +2257,44 @@ static inline void ____napi_schedule(struct softnet_data *sd,
 	__raise_softirq_irqoff(NET_RX_SOFTIRQ);
 }
 
-#ifdef CONFIG_RPS
-
-/* One global table that all flow-based protocols share. */
-struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
-EXPORT_SYMBOL(rps_sock_flow_table);
-
 /*
- * get_rps_cpu is called from netif_receive_skb and returns the target
- * CPU from the RPS map of the receiving queue for a given skb.
- * rcu_read_lock must be held on entry.
+ * __skb_get_rxhash: calculate a flow hash based on src/dst addresses
+ * and src/dst port numbers. Returns a non-zero hash number on success
+ * and 0 on failure.
 */
-static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
-		       struct rps_dev_flow **rflowp)
+__u32 __skb_get_rxhash(struct sk_buff *skb)
 {
+	int nhoff, hash = 0, poff;
 	struct ipv6hdr *ip6;
 	struct iphdr *ip;
-	struct netdev_rx_queue *rxqueue;
-	struct rps_map *map;
-	struct rps_dev_flow_table *flow_table;
-	struct rps_sock_flow_table *sock_flow_table;
-	int cpu = -1;
 	u8 ip_proto;
-	u16 tcpu;
 	u32 addr1, addr2, ihl;
 	union {
 		u32 v32;
 		u16 v16[2];
 	} ports;
 
-	if (skb_rx_queue_recorded(skb)) {
-		u16 index = skb_get_rx_queue(skb);
-		if (unlikely(index >= dev->num_rx_queues)) {
-			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
-				"on queue %u, but number of RX queues is %u\n",
-				dev->name, index, dev->num_rx_queues);
-			goto done;
-		}
-		rxqueue = dev->_rx + index;
-	} else
-		rxqueue = dev->_rx;
-
-	if (!rxqueue->rps_map && !rxqueue->rps_flow_table)
-		goto done;
-
-	if (skb->rxhash)
-		goto got_hash; /* Skip hash computation on packet header */
+	nhoff = skb_network_offset(skb);
 
 	switch (skb->protocol) {
 	case __constant_htons(ETH_P_IP):
-		if (!pskb_may_pull(skb, sizeof(*ip)))
+		if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
 			goto done;
 
-		ip = (struct iphdr *) skb->data;
-		ip_proto = ip->protocol;
+		ip = (struct iphdr *) (skb->data + nhoff);
+		if (ip->frag_off & htons(IP_MF | IP_OFFSET))
+			ip_proto = 0;
+		else
+			ip_proto = ip->protocol;
 		addr1 = (__force u32) ip->saddr;
 		addr2 = (__force u32) ip->daddr;
 		ihl = ip->ihl;
 		break;
 	case __constant_htons(ETH_P_IPV6):
-		if (!pskb_may_pull(skb, sizeof(*ip6)))
+		if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
 			goto done;
 
-		ip6 = (struct ipv6hdr *) skb->data;
+		ip6 = (struct ipv6hdr *) (skb->data + nhoff);
 		ip_proto = ip6->nexthdr;
 		addr1 = (__force u32) ip6->saddr.s6_addr32[3];
 		addr2 = (__force u32) ip6->daddr.s6_addr32[3];
@@ -2330,33 +2303,80 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 	default:
 		goto done;
 	}
-	switch (ip_proto) {
-	case IPPROTO_TCP:
-	case IPPROTO_UDP:
-	case IPPROTO_DCCP:
-	case IPPROTO_ESP:
-	case IPPROTO_AH:
-	case IPPROTO_SCTP:
-	case IPPROTO_UDPLITE:
-		if (pskb_may_pull(skb, (ihl * 4) + 4)) {
-			ports.v32 = * (__force u32 *) (skb->data + (ihl * 4));
+
+	ports.v32 = 0;
+	poff = proto_ports_offset(ip_proto);
+	if (poff >= 0) {
+		nhoff += ihl * 4 + poff;
+		if (pskb_may_pull(skb, nhoff + 4)) {
+			ports.v32 = * (__force u32 *) (skb->data + nhoff);
 			if (ports.v16[1] < ports.v16[0])
 				swap(ports.v16[0], ports.v16[1]);
-			break;
 		}
-	default:
-		ports.v32 = 0;
-		break;
 	}
 
 	/* get a consistent hash (same value on both flow directions) */
 	if (addr2 < addr1)
 		swap(addr1, addr2);
 
-	skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
-	if (!skb->rxhash)
-		skb->rxhash = 1;
-got_hash:
+	hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+	if (!hash)
+		hash = 1;
+
+done:
+	return hash;
+}
+EXPORT_SYMBOL(__skb_get_rxhash);
+
+#ifdef CONFIG_RPS
+
+/* One global table that all flow-based protocols share. */
+struct rps_sock_flow_table *rps_sock_flow_table __read_mostly;
+EXPORT_SYMBOL(rps_sock_flow_table);
+
+/*
+ * get_rps_cpu is called from netif_receive_skb and returns the target
+ * CPU from the RPS map of the receiving queue for a given skb.
+ * rcu_read_lock must be held on entry.
+ */
+static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
+		       struct rps_dev_flow **rflowp)
+{
+	struct netdev_rx_queue *rxqueue;
+	struct rps_map *map = NULL;
+	struct rps_dev_flow_table *flow_table;
+	struct rps_sock_flow_table *sock_flow_table;
+	int cpu = -1;
+	u16 tcpu;
+
+	if (skb_rx_queue_recorded(skb)) {
+		u16 index = skb_get_rx_queue(skb);
+		if (unlikely(index >= dev->num_rx_queues)) {
+			WARN_ONCE(dev->num_rx_queues > 1, "%s received packet "
+				"on queue %u, but number of RX queues is %u\n",
+				dev->name, index, dev->num_rx_queues);
+			goto done;
+		}
+		rxqueue = dev->_rx + index;
+	} else
+		rxqueue = dev->_rx;
+
+	if (rxqueue->rps_map) {
+		map = rcu_dereference(rxqueue->rps_map);
+		if (map && map->len == 1) {
+			tcpu = map->cpus[0];
+			if (cpu_online(tcpu))
+				cpu = tcpu;
+			goto done;
+		}
+	} else if (!rxqueue->rps_flow_table) {
+		goto done;
+	}
+
+	skb_reset_network_header(skb);
+	if (!skb_get_rxhash(skb))
+		goto done;
+
 	flow_table = rcu_dereference(rxqueue->rps_flow_table);
 	sock_flow_table = rcu_dereference(rps_sock_flow_table);
 	if (flow_table && sock_flow_table) {
@@ -2396,7 +2416,6 @@ got_hash:
 		}
 	}
 
-	map = rcu_dereference(rxqueue->rps_map);
 	if (map) {
 		tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32];
 
@@ -2828,8 +2847,8 @@ static int __netif_receive_skb(struct sk_buff *skb)
 	if (!netdev_tstamp_prequeue)
 		net_timestamp_check(skb);
 
-	if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb))
-		return NET_RX_SUCCESS;
+	if (vlan_tx_tag_present(skb))
+		vlan_hwaccel_do_receive(skb);
 
 	/* if we've gotten here through NAPI, check netpoll */
 	if (netpoll_receive_skb(skb))
@@ -3050,7 +3069,7 @@ out:
 	return netif_receive_skb(skb);
 }
 
-static void napi_gro_flush(struct napi_struct *napi)
+inline void napi_gro_flush(struct napi_struct *napi)
 {
 	struct sk_buff *skb, *next;
 
@@ -3063,6 +3082,7 @@ static void napi_gro_flush(struct napi_struct *napi)
 	napi->gro_count = 0;
 	napi->gro_list = NULL;
 }
+EXPORT_SYMBOL(napi_gro_flush);
 
 enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
@@ -3077,7 +3097,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 	if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb))
 		goto normal;
 
-	if (skb_is_gso(skb) || skb_has_frags(skb))
+	if (skb_is_gso(skb) || skb_has_frag_list(skb))
 		goto normal;
 
 	rcu_read_lock();
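[Editor's note: the __skb_get_rxhash() hunk above canonicalizes the address pair and the port pair before hashing, so both directions of a flow produce the same value. A minimal userspace sketch of that idea follows; mix3() is a hypothetical stand-in for the kernel's jhash_3words(), and all names here are illustrative, not kernel API.]

#include <stdint.h>
#include <stdio.h>

/* Hypothetical 3-word mixer standing in for jhash_3words(). */
static uint32_t mix3(uint32_t a, uint32_t b, uint32_t c, uint32_t seed)
{
	a += seed; b += seed; c += seed;
	a ^= b << 7;  c += a >> 3;  b ^= c << 11;
	a += c;       b += a;       c ^= b >> 5;
	return a ^ b ^ c;
}

/* Symmetric flow hash: order addresses and ports canonically first,
 * so A->B and B->A map to the same value, as the RPS code relies on. */
static uint32_t flow_hash(uint32_t saddr, uint32_t daddr,
			  uint16_t sport, uint16_t dport, uint32_t seed)
{
	uint32_t addr1 = saddr, addr2 = daddr, tmp, hash;
	uint16_t p0 = sport, p1 = dport, pt;

	if (p1 < p0) { pt = p0; p0 = p1; p1 = pt; }
	if (addr2 < addr1) { tmp = addr1; addr1 = addr2; addr2 = tmp; }

	hash = mix3(addr1, addr2, ((uint32_t)p0 << 16) | p1, seed);
	return hash ? hash : 1; /* 0 is reserved for "no hash computed" */
}

int main(void)
{
	uint32_t h1 = flow_hash(0x0a000001, 0x0a000002, 1234, 80, 0xdeadbeef);
	uint32_t h2 = flow_hash(0x0a000002, 0x0a000001, 80, 1234, 0xdeadbeef);
	printf("%08x %08x equal=%d\n", h1, h2, h1 == h2);
	return 0;
}

Reserving 0 as "no hash" is what lets the kernel cache the result in skb->rxhash and skip recomputation on later lookups.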
@@ -3156,16 +3176,18 @@ normal:
 }
 EXPORT_SYMBOL(dev_gro_receive);
 
-static gro_result_t
+static inline gro_result_t
 __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
 {
 	struct sk_buff *p;
 
 	for (p = napi->gro_list; p; p = p->next) {
-		NAPI_GRO_CB(p)->same_flow =
-			(p->dev == skb->dev) &&
-			!compare_ether_header(skb_mac_header(p),
+		unsigned long diffs;
+
+		diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
+		diffs |= compare_ether_header(skb_mac_header(p),
 					      skb_gro_mac_header(skb));
+		NAPI_GRO_CB(p)->same_flow = !diffs;
 		NAPI_GRO_CB(p)->flush = 0;
 	}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 7a85367..970eb98 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -205,18 +205,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
 	struct ethtool_drvinfo info;
 	const struct ethtool_ops *ops = dev->ethtool_ops;
 
-	if (!ops->get_drvinfo)
-		return -EOPNOTSUPP;
-
 	memset(&info, 0, sizeof(info));
 	info.cmd = ETHTOOL_GDRVINFO;
-	ops->get_drvinfo(dev, &info);
+	if (ops && ops->get_drvinfo) {
+		ops->get_drvinfo(dev, &info);
+	} else if (dev->dev.parent && dev->dev.parent->driver) {
+		strlcpy(info.bus_info, dev_name(dev->dev.parent),
+			sizeof(info.bus_info));
+		strlcpy(info.driver, dev->dev.parent->driver->name,
+			sizeof(info.driver));
+	} else {
+		return -EOPNOTSUPP;
+	}
 
 	/*
 	 * this method of obtaining string set info is deprecated;
 	 * Use ETHTOOL_GSSET_INFO instead.
 	 */
-	if (ops->get_sset_count) {
+	if (ops && ops->get_sset_count) {
 		int rc;
 
 		rc = ops->get_sset_count(dev, ETH_SS_TEST);
@@ -229,9 +235,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev,
 		if (rc >= 0)
 			info.n_priv_flags = rc;
 	}
-	if (ops->get_regs_len)
+	if (ops && ops->get_regs_len)
 		info.regdump_len = ops->get_regs_len(dev);
-	if (ops->get_eeprom_len)
+	if (ops && ops->get_eeprom_len)
 		info.eedump_len = ops->get_eeprom_len(dev);
 
 	if (copy_to_user(useraddr, &info, sizeof(info)))
@@ -1402,14 +1408,22 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
 	if (!dev || !netif_device_present(dev))
 		return -ENODEV;
 
-	if (!dev->ethtool_ops)
-		return -EOPNOTSUPP;
-
 	if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
 		return -EFAULT;
 
+	if (!dev->ethtool_ops) {
+		/* ETHTOOL_GDRVINFO does not require any driver support.
+		 * It is also unprivileged and does not change anything,
+		 * so we can take a shortcut to it.
+		 */
+		if (ethcmd == ETHTOOL_GDRVINFO)
+			return ethtool_get_drvinfo(dev, useraddr);
+		else
+			return -EOPNOTSUPP;
+	}
+
 	/* Allow some commands to be done by anyone */
 	switch (ethcmd) {
+	case ETHTOOL_GSET:
 	case ETHTOOL_GDRVINFO:
 	case ETHTOOL_GMSGLVL:
 	case ETHTOOL_GCOALESCE:
diff --git a/net/core/iovec.c b/net/core/iovec.c
index 1cd98df..f4657c2 100644
--- a/net/core/iovec.c
+++ b/net/core/iovec.c
@@ -41,7 +41,9 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
 
 	if (m->msg_namelen) {
 		if (mode == VERIFY_READ) {
-			err = move_addr_to_kernel(m->msg_name, m->msg_namelen,
+			void __user *namep;
+			namep = (void __user __force *) m->msg_name;
+			err = move_addr_to_kernel(namep, m->msg_namelen,
 						  address);
 			if (err < 0)
 				return err;
@@ -52,7 +54,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address,
 	}
 
 	size = m->msg_iovlen * sizeof(struct iovec);
-	if (copy_from_user(iov, m->msg_iov, size))
+	if (copy_from_user(iov, (void __user __force *) m->msg_iov, size))
 		return -EFAULT;
 
 	m->msg_iov = iov;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index af4dfba..76485a3 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -515,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
 	return attribute->store(queue, attribute, buf, count);
 }
 
-static struct sysfs_ops rx_queue_sysfs_ops = {
+static const struct sysfs_ops rx_queue_sysfs_ops = {
 	.show = rx_queue_attr_show,
 	.store = rx_queue_attr_store,
 };
@@ -789,12 +789,13 @@ static const void *net_netlink_ns(struct sock *sk)
 	return sock_net(sk);
 }
 
-static struct kobj_ns_type_operations net_ns_type_operations = {
+struct kobj_ns_type_operations net_ns_type_operations = {
 	.type = KOBJ_NS_TYPE_NET,
 	.current_ns = net_current_ns,
 	.netlink_ns = net_netlink_ns,
 	.initial_ns = net_initial_ns,
 };
+EXPORT_SYMBOL_GPL(net_ns_type_operations);
 
 static void net_kobj_ns_exit(struct net *net)
 {
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index 10a1ea7..386c228 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -3907,8 +3907,6 @@ static void __exit pg_cleanup(void)
 {
 	struct pktgen_thread *t;
 	struct list_head *q, *n;
-	wait_queue_head_t queue;
-	init_waitqueue_head(&queue);
 
 	/* Stop all interfaces & threads */
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index f78d821..b2a718d 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -612,36 +612,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
 
 static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
 {
-	struct rtnl_link_stats64 a;
-
-	a.rx_packets = b->rx_packets;
-	a.tx_packets = b->tx_packets;
-	a.rx_bytes = b->rx_bytes;
-	a.tx_bytes = b->tx_bytes;
-	a.rx_errors = b->rx_errors;
-	a.tx_errors = b->tx_errors;
-	a.rx_dropped = b->rx_dropped;
-	a.tx_dropped = b->tx_dropped;
-
-	a.multicast = b->multicast;
-	a.collisions = b->collisions;
-
-	a.rx_length_errors = b->rx_length_errors;
-	a.rx_over_errors = b->rx_over_errors;
-	a.rx_crc_errors = b->rx_crc_errors;
-	a.rx_frame_errors = b->rx_frame_errors;
-	a.rx_fifo_errors = b->rx_fifo_errors;
-	a.rx_missed_errors = b->rx_missed_errors;
-
-	a.tx_aborted_errors = b->tx_aborted_errors;
-	a.tx_carrier_errors = b->tx_carrier_errors;
-	a.tx_fifo_errors = b->tx_fifo_errors;
-	a.tx_heartbeat_errors = b->tx_heartbeat_errors;
-	a.tx_window_errors = b->tx_window_errors;
-
-	a.rx_compressed = b->rx_compressed;
-	a.tx_compressed = b->tx_compressed;
-	memcpy(v, &a, sizeof(a));
+	memcpy(v, b, sizeof(*b));
 }
 
 /* All VF info */
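[Editor's note: the __napi_gro_receive() hunk above, like the vlan_core.c hunk at the top of this diff, replaces a short-circuiting boolean with an accumulated "diffs" word. A hedged userspace sketch of the same technique follows; memcmp() stands in for compare_ether_header(), and struct pkt is a hypothetical simplification of sk_buff.]

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct pkt {
	const void *dev;	/* identity of the receiving device */
	uint8_t mac[14];	/* Ethernet header: dst, src, ethertype */
};

/* Branchless "same flow" test: XOR the device pointers and OR in the
 * header comparison, then test the accumulated difference word once. */
static int same_flow(const struct pkt *p, const struct pkt *q)
{
	unsigned long diffs;

	diffs  = (unsigned long)p->dev ^ (unsigned long)q->dev;
	diffs |= (unsigned long)!!memcmp(p->mac, q->mac, sizeof(p->mac));
	return !diffs;
}

int main(void)
{
	static int dev_a, dev_b;
	struct pkt p = { &dev_a, "\x01\x02" };
	struct pkt q = { &dev_a, "\x01\x02" };
	struct pkt r = { &dev_b, "\x01\x02" };

	printf("p~q=%d p~r=%d\n", same_flow(&p, &q), same_flow(&p, &r));
	return 0;
}

Accumulating differences instead of chaining && avoids data-dependent conditional branches in a loop that runs for every packet on the GRO list.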
diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c83b421..752c197 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -202,8 +202,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; - kmemcheck_annotate_bitfield(skb, flags1); - kmemcheck_annotate_bitfield(skb, flags2); #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->mac_header = ~0U; #endif @@ -340,7 +338,7 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); kfree(skb->head); @@ -685,16 +683,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb->data - skb->head; - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end + skb->data_len, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); -#endif + int headerlen = skb_headroom(skb); + unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) return NULL; @@ -726,20 +718,14 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head, gfp_mask); -#endif + unsigned int size = skb_end_pointer(skb) - skb->head; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) goto out; /* Set the data pointer */ - skb_reserve(n, skb->data - skb->head); + skb_reserve(n, skb_headroom(skb)); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ @@ -759,7 +745,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(n)->nr_frags = i; } - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } @@ -791,12 +777,9 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, { int i; u8 *data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - int size = nhead + skb->end + ntail; -#else - int size = nhead + (skb->end - skb->head) + ntail; -#endif + int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; long off; + bool fastpath; BUG_ON(nhead < 0); @@ -810,23 +793,36 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, goto nodata; /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - memcpy(data + nhead, skb->head, skb->tail); -#else - memcpy(data + nhead, skb->head, skb->tail - skb->head); -#endif - memcpy(data + size, skb_end_pointer(skb), + * optimized for the cases when header is void. + */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); + /* Check if we can avoid taking references on fragments if we own + * the last reference on skb->head. (see skb_release_data()) + */ + if (!skb->cloned) + fastpath = true; + else { + int delta = skb->nohdr ? 
(1 << SKB_DATAREF_SHIFT) + 1 : 1; - if (skb_has_frags(skb)) - skb_clone_fraglist(skb); + fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; + } - skb_release_data(skb); + if (fastpath) { + kfree(skb->head); + } else { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + skb_release_data(skb); + } off = (data + nhead) - skb->head; skb->head = data; @@ -1099,7 +1095,7 @@ drop_pages: for (; i < nfrags; i++) put_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } @@ -1194,7 +1190,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ - if (!skb_has_frags(skb)) + if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. */ @@ -2323,7 +2319,7 @@ next_skb: st->frag_data = NULL; } - if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; @@ -2893,7 +2889,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; /* Easy case. Most of packets will go this way. */ - if (!skb_has_frags(skb)) { + if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more @@ -2928,7 +2924,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1) || + skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } @@ -2937,7 +2933,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1)) { + skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ @@ -3020,7 +3016,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } else { /* * no hardware time stamps available, - * so keep the skb_shared_tx and only + * so keep the shared tx_flags and only * store software time stamp */ skb->tstamp = ktime_get_real(); diff --git a/net/core/sock.c b/net/core/sock.c index b05b9b6..f3a06c4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1557,6 +1557,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, EXPORT_SYMBOL(sock_alloc_send_skb); static void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); @@ -1573,6 +1575,8 @@ static void __lock_sock(struct sock *sk) } static void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { struct sk_buff *skb = sk->sk_backlog.head; diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 8408398..0581143 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -config IP_DCCP_CCID3_RTO - int "Use higher bound for nofeedback timer" - default 100 - depends on IP_DCCP_CCID3 && EXPERIMENTAL - ---help--- - Use higher lower bound for nofeedback timer expiration. 
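[Editor's sketch] Stepping back to the pskb_expand_head() rework in skbuff.c above: the fastpath skips re-referencing every page fragment when the caller holds the last reference on skb->head. The test encodes the kernel's dataref convention, where payload-only (nohdr) references are counted above SKB_DATAREF_SHIFT. A standalone illustration of that exclusivity check; the constant matches include/linux/skbuff.h, the helper name is made up:

    #include <stdbool.h>

    #define SKB_DATAREF_SHIFT 16   /* as in include/linux/skbuff.h */

    /* Mirror of the fastpath test: the head is exclusively ours if the
     * skb was never cloned, or if dataref shows exactly one remaining
     * reference -- one payload-only ref plus one full ref when
     * skb->nohdr is set, otherwise a single full ref. */
    static bool head_is_exclusive(bool cloned, bool nohdr, int dataref)
    {
            if (!cloned)
                    return true;
            return dataref == (nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1);
    }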
- - The TFRC nofeedback timer normally expires after the maximum of 4 - RTTs and twice the current send interval (RFC 3448, 4.3). On LANs - with a small RTT this can mean a high processing load and reduced - performance, since then the nofeedback timer is triggered very - frequently. - - This option enables to set a higher lower bound for the nofeedback - value. Values in units of milliseconds can be set here. - - A value of 0 disables this feature by enforcing the value specified - in RFC 3448. The following values have been suggested as bounds for - experimental use: - * 16-20ms to match the typical multimedia inter-frame interval - * 100ms as a reasonable compromise [default] - * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4) - - The default of 100ms is a compromise between a large value for - efficient DCCP implementations, and a small value to avoid disrupting - the network in times of congestion. - - The purpose of the nofeedback timer is to slow DCCP down when there - is serious network congestion: experimenting with larger values should - therefore not be performed on WANs. - config IP_DCCP_TFRC_LIB def_bool y if IP_DCCP_CCID3 diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9b3ae99..dc18172 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,59 +25,14 @@ */ #include <linux/slab.h> #include "../feat.h" -#include "../ccid.h" -#include "../dccp.h" #include "ccid2.h" #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) - -static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc) -{ - int len = 0; - int pipe = 0; - struct ccid2_seq *seqp = hc->tx_seqh; - - /* there is data in the chain */ - if (seqp != hc->tx_seqt) { - seqp = seqp->ccid2s_prev; - len++; - if (!seqp->ccid2s_acked) - pipe++; - - while (seqp != hc->tx_seqt) { - struct ccid2_seq *prev = seqp->ccid2s_prev; - - len++; - if (!prev->ccid2s_acked) - pipe++; - - /* packets are sent sequentially */ - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, - prev->ccid2s_seq ) >= 0); - BUG_ON(time_before(seqp->ccid2s_sent, - prev->ccid2s_sent)); - - seqp = prev; - } - } - - BUG_ON(pipe != hc->tx_pipe); - ccid2_pr_debug("len of chain=%d\n", len); - - do { - seqp = seqp->ccid2s_prev; - len++; - } while (seqp != hc->tx_seqh); - - ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN); -} #else #define ccid2_pr_debug(format, a...) 
-#define ccid2_hc_tx_check_sanity(hc) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) @@ -156,19 +111,10 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val) -{ - ccid2_pr_debug("change SRTT to %ld\n", val); - hc->tx_srtt = val; -} - -static void ccid2_start_rto_timer(struct sock *sk); - static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -178,23 +124,19 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); - ccid2_hc_tx_check_sanity(hc); - /* back-off timer */ hc->tx_rto <<= 1; + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; - s = hc->tx_rto / HZ; - if (s > 60) - hc->tx_rto = 60 * HZ; - - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; + hc->tx_cwnd = 1; + hc->tx_pipe = 0; /* clear state about stuff we sent */ hc->tx_seqt = hc->tx_seqh; @@ -204,22 +146,11 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - ccid2_hc_tx_check_sanity(hc); out: bh_unlock_sock(sk); sock_put(sk); } -static void ccid2_start_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto); - - BUG_ON(timer_pending(&hc->tx_rtotimer)); - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -} - static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); @@ -230,7 +161,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hc->tx_seqh->ccid2s_seq = dp->dccps_gss; hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = jiffies; + hc->tx_seqh->ccid2s_sent = ccid2_time_stamp; next = hc->tx_seqh->ccid2s_next; /* check if we need to alloc more space */ @@ -296,23 +227,20 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) } #endif - /* setup RTO timer */ - if (!timer_pending(&hc->tx_rtotimer)) - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { struct ccid2_seq *seqp = hc->tx_seqt; while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", + ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); seqp = seqp->ccid2s_next; } } while (0); ccid2_pr_debug("=========\n"); - ccid2_hc_tx_check_sanity(hc); #endif } @@ -378,17 +306,87 @@ out_invalid_option: return -1; } -static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. 
+ */ +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; - sk_stop_timer(sk, &hc->tx_rtotimer); - ccid2_pr_debug("deleted RTO timer\n"); + if (hc->tx_srtt == 0) { + /* First measurement m */ + hc->tx_srtt = m << 3; + hc->tx_mdev = m << 1; + + hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); + hc->tx_rttvar = hc->tx_mdev_max; + + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hc->tx_srtt >> 3); + hc->tx_srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hc->tx_mdev >> 2); + /* + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). + */ + if (m > 0) + m >>= 3; + } else { + m -= (hc->tx_mdev >> 2); + } + hc->tx_mdev += m; + + if (hc->tx_mdev > hc->tx_mdev_max) { + hc->tx_mdev_max = hc->tx_mdev; + if (hc->tx_mdev_max > hc->tx_rttvar) + hc->tx_rttvar = hc->tx_mdev_max; + } + + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe. + * AWL is probably too low here, as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { + if (hc->tx_mdev_max < hc->tx_rttvar) + hc->tx_rttvar -= (hc->tx_rttvar - + hc->tx_mdev_max) >> 2; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + hc->tx_mdev_max = tcp_rto_min(sk); + } + } + + /* + * Set RTO from SRTT and RTTVAR + * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. + * This agrees with RFC 4341, 5: + * "Because DCCP does not retransmit data, DCCP does not require + * TCP's recommended minimum timeout of one second". 
+ */ + hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; + + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; } -static inline void ccid2_new_ack(struct sock *sk, - struct ccid2_seq *seqp, - unsigned int *maxincr) +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); @@ -402,93 +400,27 @@ static inline void ccid2_new_ack(struct sock *sk, hc->tx_cwnd += 1; hc->tx_packets_acked = 0; } - - /* update RTO */ - if (hc->tx_srtt == -1 || - time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; - int s; - - /* first measurement */ - if (hc->tx_srtt == -1) { - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", - r, jiffies, - (unsigned long long)seqp->ccid2s_seq); - ccid2_change_srtt(hc, r); - hc->tx_rttvar = r >> 1; - } else { - /* RTTVAR */ - long tmp = hc->tx_srtt - r; - long srtt; - - if (tmp < 0) - tmp *= -1; - - tmp >>= 2; - hc->tx_rttvar *= 3; - hc->tx_rttvar >>= 2; - hc->tx_rttvar += tmp; - - /* SRTT */ - srtt = hc->tx_srtt; - srtt *= 7; - srtt >>= 3; - tmp = r >> 3; - srtt += tmp; - ccid2_change_srtt(hc, srtt); - } - s = hc->tx_rttvar << 2; - /* clock granularity is 1 when based on jiffies */ - if (!s) - s = 1; - hc->tx_rto = hc->tx_srtt + s; - - /* must be at least a second */ - s = hc->tx_rto / HZ; - /* DCCP doesn't require this [but I like it cuz my code sux] */ -#if 1 - if (s < 1) - hc->tx_rto = HZ; -#endif - /* max 60 seconds */ - if (s > 60) - hc->tx_rto = HZ * 60; - - hc->tx_lastrtt = jiffies; - - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hc->tx_srtt, hc->tx_rttvar, - hc->tx_rto, HZ, r); - } - - /* we got a new ack, so re-start RTO timer */ - ccid2_hc_tx_kill_rto_timer(sk); - ccid2_start_rto_timer(sk); -} - -static void ccid2_hc_tx_dec_pipe(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe == 0) - DCCP_BUG("pipe == 0"); - else - hc->tx_pipe--; - - if (hc->tx_pipe == 0) - ccid2_hc_tx_kill_rto_timer(sk); + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ + ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) { + if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; hc->tx_cwnd = hc->tx_cwnd / 2 ? 
: 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); @@ -510,7 +442,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) int done = 0; unsigned int maxincr = 0; - ccid2_hc_tx_check_sanity(hc); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -620,7 +551,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) { done = 1; @@ -677,7 +608,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) break; @@ -695,7 +626,11 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_seqt = hc->tx_seqt->ccid2s_next; } - ccid2_hc_tx_check_sanity(hc); + /* restart RTO timer if not all outstanding data has been acked */ + if (hc->tx_pipe == 0) + sk_stop_timer(sk, &hc->tx_rtotimer); + else + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -707,12 +642,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hc->tx_ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + /* Use larger initial windows (RFC 4341, section 5). */ + hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); @@ -723,15 +654,11 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hc)) return -ENOMEM; - hc->tx_rto = 3 * HZ; - ccid2_change_srtt(hc, -1); - hc->tx_rttvar = -1; + hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - - ccid2_hc_tx_check_sanity(hc); return 0; } @@ -740,7 +667,7 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); int i; - ccid2_hc_tx_kill_rto_timer(sk); + sk_stop_timer(sk, &hc->tx_rtotimer); for (i = 0; i < hc->tx_seqbufc; i++) kfree(hc->tx_seqbuf[i]); diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 1ec6a30..9731c2d 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -18,18 +18,23 @@ #ifndef _DCCP_CCID2_H_ #define _DCCP_CCID2_H_ -#include <linux/dccp.h> #include <linux/timer.h> #include <linux/types.h> #include "../ccid.h" +#include "../dccp.h" + +/* + * CCID-2 timestamping faces the same issues as TCP timestamping. + * Hence we reuse/share as much of the code as possible. + */ +#define ccid2_time_stamp tcp_time_stamp + /* NUMDUPACK parameter from RFC 4341, p. 
6 */ #define NUMDUPACK 3 -struct sock; - struct ccid2_seq { u64 ccid2s_seq; - unsigned long ccid2s_sent; + u32 ccid2s_sent; int ccid2s_acked; struct ccid2_seq *ccid2s_prev; struct ccid2_seq *ccid2s_next; @@ -42,7 +47,12 @@ struct ccid2_seq { * struct ccid2_hc_tx_sock - CCID2 TX half connection * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_lastrtt: time RTT was last measured + * @tx_srtt: smoothed RTT estimate, scaled by 2^3 + * @tx_mdev: smoothed RTT variation, scaled by 2^2 + * @tx_mdev_max: maximum of @mdev during one flight + * @tx_rttvar: moving average/maximum of @mdev_max + * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) + * @tx_rtt_seq: to decay RTTVAR at most once per flight * @tx_rpseq: last consecutive seqno * @tx_rpdupack: dupacks since rpseq */ @@ -55,14 +65,19 @@ struct ccid2_hc_tx_sock { int tx_seqbufc; struct ccid2_seq *tx_seqh; struct ccid2_seq *tx_seqt; - long tx_rto; - long tx_srtt; - long tx_rttvar; - unsigned long tx_lastrtt; + + /* RTT measurement: variables/principles are the same as in TCP */ + u32 tx_srtt, + tx_mdev, + tx_mdev_max, + tx_rttvar, + tx_rto; + u64 tx_rtt_seq:48; struct timer_list tx_rtotimer; + u64 tx_rpseq; int tx_rpdupack; - unsigned long tx_last_cong; + u32 tx_last_cong; u64 tx_high_ack; }; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 95f7529..278e170 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -218,9 +218,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + * RTO is 0 if and only if no feedback has been received yet. */ - if (hc->tx_t_rto == 0 || /* no feedback received yet */ - hc->tx_p == 0) { + if (hc->tx_t_rto == 0 || hc->tx_p == 0) { /* halve send rate directly */ hc->tx_x = max(hc->tx_x / 2, @@ -256,7 +256,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */ + if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); @@ -372,7 +372,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -386,7 +386,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_state != TFRC_SSTATE_NO_FBACK) return; - opt_recv = &hc->tx_options_received; now = ktime_get_real(); /* Estimate RTT from history if ACK number is valid */ @@ -461,13 +460,12 @@ done_computing_x: sk->sk_write_space(sk); /* - * Update timeout interval for the nofeedback timer. - * We use a configuration option to increase the lower bound. - * This can help avoid triggering the nofeedback timer too - * often ('spinning') on LANs with small RTTs. + * Update timeout interval for the nofeedback timer. In order to control + * rate halving on networks with very low RTTs (<= 1 ms), use per-route + * tunable RTAX_RTO_MIN value as the lower bound. 
*/ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, + USEC_PER_SEC/HZ * tcp_rto_min(sk)); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) @@ -489,11 +487,9 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, int rc = 0; const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; - opt_recv = &hc->tx_options_received; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { opt_recv->ccid3or_seqno = dp->dccps_gsr; opt_recv->ccid3or_loss_event_rate = ~0; @@ -567,34 +563,30 @@ static void ccid3_hc_tx_exit(struct sock *sk) static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - struct ccid3_hc_tx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_tx_sk(sk); - info->tcpi_rto = hc->tx_t_rto; - info->tcpi_rtt = hc->tx_rtt; + info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hc; + const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); + struct tfrc_tx_info tfrc; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(hc->tx_tfrc)) + if (len < sizeof(tfrc)) return -EINVAL; - len = sizeof(hc->tx_tfrc); - val = &hc->tx_tfrc; + tfrc.tfrctx_x = hc->tx_x; + tfrc.tfrctx_x_recv = hc->tx_x_recv; + tfrc.tfrctx_x_calc = hc->tx_x_calc; + tfrc.tfrctx_rtt = hc->tx_rtt; + tfrc.tfrctx_p = hc->tx_p; + tfrc.tfrctx_rto = hc->tx_t_rto; + tfrc.tfrctx_ipi = hc->tx_t_ipi; + len = sizeof(tfrc); + val = &tfrc; break; default: return -ENOPROTOOPT; @@ -701,14 +693,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; - hc = ccid3_hc_rx_sk(sk); - if (dccp_packet_without_ack(skb)) return 0; @@ -749,10 +739,11 @@ static u32 ccid3_first_li(struct sock *sk) x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); - if ((x_recv = hc->rx_x_recv) == 0) { + if (hc->rx_x_recv == 0) { DCCP_BUG("stored value of X_recv is zero"); return ~0U; } + x_recv = hc->rx_x_recv; } fval = scaled_div(hc->rx_s, hc->rx_rtt); @@ -870,30 +861,18 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { - const struct ccid3_hc_rx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_rx_sk(sk); - info->tcpi_ca_state = hc->rx_state; + info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hc->rx_rtt; + info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 
__user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct tfrc_rx_info rx_info; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 0326357..b7e569c 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -42,7 +42,7 @@ #include "lib/tfrc.h" #include "../ccid.h" -/* Two seconds as per RFC 3448 4.2 */ +/* Two seconds as per RFC 5348, 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ @@ -95,14 +95,13 @@ enum ccid3_hc_tx_states { * @tx_options_received: Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - struct tfrc_tx_info tx_tfrc; -#define tx_x tx_tfrc.tfrctx_x -#define tx_x_recv tx_tfrc.tfrctx_x_recv -#define tx_x_calc tx_tfrc.tfrctx_x_calc -#define tx_rtt tx_tfrc.tfrctx_rtt -#define tx_p tx_tfrc.tfrctx_p -#define tx_t_rto tx_tfrc.tfrctx_rto -#define tx_t_ipi tx_tfrc.tfrctx_ipi + u64 tx_x; + u64 tx_x_recv; + u32 tx_x_calc; + u32 tx_rtt; + u32 tx_p; + u32 tx_t_rto; + u32 tx_t_ipi; u16 tx_s; enum ccid3_hc_tx_states tx_state:8; u8 tx_last_win_count; @@ -131,16 +130,12 @@ enum ccid3_hc_rx_states { /** * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3) - * @rx_rtt: Receiver estimate of rtt (non-standard) - * @rx_p: Current loss event rate (RFC 3448 5.4) * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) * @rx_state: Receiver state, one of %ccid3_hc_rx_states * @rx_bytes_recv: Total sum of DCCP payload bytes * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 
4.3) * @rx_rtt: Receiver estimate of RTT * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_tstamp_last_ack: Time at which last feedback was sent * @rx_hist: Packet history (loss detection + RTT sampling) * @rx_li_hist: Loss Interval database * @rx_s: Received packet size in bytes diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index baeb1ea..2ef1152 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -693,22 +693,22 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) aux = scp->accessdata.acc_userl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); aux = scp->accessdata.acc_passl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); aux = scp->accessdata.acc_accl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux); + memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux); scp->persist = dn_nsp_persist(sk); scp->persist_fxn = dn_nsp_retrans_conninit; diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index dc54bd0..baa98fb 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -1009,7 +1009,6 @@ static int __init aun_udp_initialise(void) struct sockaddr_in sin; skb_queue_head_init(&aun_queue); - spin_lock_init(&aun_queue_lock); setup_timer(&ab_cleanup_timer, ab_cleanup, 0); ab_cleanup_timer.expires = jiffies + (HZ*2); add_timer(&ab_cleanup_timer); @@ -1167,7 +1166,6 @@ static int __init econet_proto_init(void) goto out; sock_register(&econet_family_ops); #ifdef CONFIG_ECONET_AUNUDP - spin_lock_init(&aun_queue_lock); aun_udp_initialise(); #endif #ifdef CONFIG_ECONET_NATIVE diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 215c839..85e7b45 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -367,7 +367,7 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count) EXPORT_SYMBOL(alloc_etherdev_mq); static size_t _format_mac_addr(char *buf, int buflen, - const unsigned char *addr, int len) + const unsigned char *addr, int len) { int i; char *cp = buf; @@ -376,7 +376,7 @@ static size_t _format_mac_addr(char *buf, int buflen, cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]); if (i == len - 1) break; - cp += strlcpy(cp, ":", buflen - (cp - buf)); + cp += scnprintf(cp, buflen - (cp - buf), ":"); } return cp - buf; } @@ -386,7 +386,7 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) size_t l; l = _format_mac_addr(buf, PAGE_SIZE, addr, len); - l += strlcpy(buf + l, "\n", PAGE_SIZE - l); + l += scnprintf(buf + l, PAGE_SIZE - l, "\n"); return ((ssize_t) l); } EXPORT_SYMBOL(sysfs_format_mac); diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 571f895..5462e2d 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -215,8 +215,15 @@ config NET_IPIP be inserted in and removed from the running kernel whenever you want). Most people won't need this and can say N. +config NET_IPGRE_DEMUX + tristate "IP: GRE demultiplexer" + help + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. 
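[Editor's sketch] Returning to ccid2_rtt_estimator() above: it follows RFC 2988 with TCP-style fixed-point scaling (SRTT stored shifted left by 3, the variation by 2). A simplified standalone version, dropping the per-flight mdev_max decay and the RTO-inflation guard for brevity; types and names are illustrative:

    #include <stdint.h>

    struct rtt_est {
            uint32_t srtt;    /* smoothed RTT, scaled by 8 */
            uint32_t rttvar;  /* RTT variation, scaled by 4 */
            uint32_t rto;
    };

    /* Feed one RTT sample m (in ticks); RFC 2988 gains are 1/8 and 1/4. */
    static void rtt_sample(struct rtt_est *s, long m)
    {
            if (m <= 0)
                    m = 1;
            if (s->srtt == 0) {                /* first measurement */
                    s->srtt = m << 3;
                    s->rttvar = m << 1;        /* RTTVAR = m/2, scaled */
            } else {
                    m -= (s->srtt >> 3);       /* m' = m - SRTT */
                    s->srtt += m;              /* SRTT += m'/8, scaled */
                    if (m < 0)
                            m = -m;
                    m -= (s->rttvar >> 2);
                    s->rttvar += m;            /* RTTVAR += (|m'| - RTTVAR)/4 */
            }
            s->rto = (s->srtt >> 3) + s->rttvar;  /* RTO = SRTT + 4*RTTVAR */
    }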
+ config NET_IPGRE tristate "IP: GRE tunnels over IP" + depends on NET_IPGRE_DEMUX help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87c..4978d22 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6a1100c..f581f77 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret); /* * inet_ehash_secret must be set exactly once - * Instead of using a dedicated spinlock, we (ab)use inetsw_lock */ void build_ehash_secret(void) { u32 rnd; + do { get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - spin_lock_bh(&inetsw_lock); - if (!inet_ehash_secret) - inet_ehash_secret = rnd; - spin_unlock_bh(&inetsw_lock); + + cmpxchg(&inet_ehash_secret, 0, rnd); } EXPORT_SYMBOL(build_ehash_secret); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 96c1955..dcfe7e9 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -55,7 +55,7 @@ * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. - * Alan Cox : Took the AP1000 nasty FDDI hack and + * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... @@ -120,7 +120,7 @@ EXPORT_SYMBOL(clip_tbl_hook); #endif #include <asm/system.h> -#include <asm/uaccess.h> +#include <linux/uaccess.h> #include <linux/netfilter_arp.h> @@ -173,32 +173,32 @@ const struct neigh_ops arp_broken_ops = { EXPORT_SYMBOL(arp_broken_ops); struct neigh_table arp_tbl = { - .family = AF_INET, - .entry_size = sizeof(struct neighbour) + 4, - .key_len = 4, - .hash = arp_hash, - .constructor = arp_constructor, - .proxy_redo = parp_redo, - .id = "arp_cache", - .parms = { - .tbl = &arp_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); @@ -233,7 +233,7 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32*)neigh->primary_key; + __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; 
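[Editor's sketch] The build_ehash_secret() change above replaces the borrowed inetsw_lock with a bare cmpxchg(); the same lockless once-only pattern reappears later in this series in inet_{add,del}_protocol() and rt_bind_peer(). A minimal userspace analogue using C11 atomics in place of the kernel's cmpxchg(); names are illustrative:

    #include <stdatomic.h>
    #include <stdint.h>

    static _Atomic uint32_t secret;

    void set_secret_once(uint32_t rnd)
    {
            uint32_t expected = 0;

            /* Succeeds for exactly one caller; every later caller sees a
             * non-zero stored value and leaves it untouched, so no lock
             * is needed for the one-time initialisation. */
            atomic_compare_exchange_strong(&secret, &expected, rnd);
    }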
struct in_device *in_dev; struct neigh_parms *parms; @@ -296,16 +296,19 @@ static int arp_constructor(struct neighbour *neigh) neigh->ops = &arp_broken_ops; neigh->output = neigh->ops->output; return 0; +#else + break; #endif - ;} + } #endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); - } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); - } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + } else if (neigh->type == RTN_BROADCAST || + (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } @@ -315,7 +318,7 @@ static int arp_constructor(struct neighbour *neigh) else neigh->ops = &arp_generic_ops; - if (neigh->nud_state&NUD_VALID) + if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; @@ -334,7 +337,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 saddr = 0; u8 *dst_ha = NULL; struct net_device *dev = neigh->dev; - __be32 target = *(__be32*)neigh->primary_key; + __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; @@ -347,7 +350,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(dev_net(dev), + ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ @@ -369,16 +373,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); - if ((probes -= neigh->parms->ucast_probes) < 0) { - if (!(neigh->nud_state&NUD_VALID)) - printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + probes -= neigh->parms->ucast_probes; + if (probes < 0) { + if (!(neigh->nud_state & NUD_VALID)) + printk(KERN_DEBUG + "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; read_lock_bh(&neigh->lock); - } else if ((probes -= neigh->parms->app_probes) < 0) { + } else { + probes -= neigh->parms->app_probes; + if (probes < 0) { #ifdef CONFIG_ARPD - neigh_app_ns(neigh); + neigh_app_ns(neigh); #endif - return; + return; + } } arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, @@ -451,7 +460,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) * is allowed to use this function, it is scheduled to be removed. 
--ANK */ -static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) +static int arp_set_predefined(int addr_hint, unsigned char *haddr, + __be32 paddr, struct net_device *dev) { switch (addr_hint) { case RTN_LOCAL: @@ -483,7 +493,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = skb_rtable(skb)->rt_gateway; - if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, + paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -515,13 +526,14 @@ int arp_bind_neighbour(struct dst_entry *dst) return -EINVAL; if (n == NULL) { __be32 nexthop = ((struct rtable *)dst)->rt_gateway; - if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) nexthop = 0; n = __neigh_lookup_errno( #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - dev->type == ARPHRD_ATM ? clip_tbl_hook : + dev->type == ARPHRD_ATM ? + clip_tbl_hook : #endif - &arp_tbl, &nexthop, dev); + &arp_tbl, &nexthop, dev); if (IS_ERR(n)) return PTR_ERR(n); dst->neighbour = n; @@ -543,8 +555,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, if (!IN_DEV_PROXY_ARP(in_dev)) return 0; - - if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) + imi = IN_DEV_MEDIUM_ID(in_dev); + if (imi == 0) return 1; if (imi == -1) return 0; @@ -685,7 +697,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp->ar_pln = 4; arp->ar_op = htons(type); - arp_ptr=(unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; @@ -735,9 +747,8 @@ void arp_send(int type, int ptype, __be32 dest_ip, skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); - if (skb == NULL) { + if (skb == NULL) return; - } arp_xmit(skb); } @@ -815,7 +826,7 @@ static int arp_process(struct sk_buff *skb) /* * Extract fields */ - arp_ptr= (unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); @@ -869,16 +880,17 @@ static int arp_process(struct sk_buff *skb) addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { - int dont_send = 0; + int dont_send; - if (!dont_send) - dont_send |= arp_ignore(in_dev,sip,tip); + dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) - dont_send |= arp_filter(sip,tip,dev); + dont_send |= arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); neigh_release(n); } } @@ -887,8 +899,7 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) - { + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -896,9 +907,12 @@ static int arp_process(struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); } else { - pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + pneigh_enqueue(&arp_tbl, + 
in_dev->arp_parms, skb); return 0; } goto out; @@ -939,7 +953,8 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; - neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_update(n, sha, state, + override ? NEIGH_UPDATE_F_OVERRIDE : 0); neigh_release(n); } @@ -975,7 +990,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, arp->ar_pln != 4) goto freeskb; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) goto out_of_mem; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); @@ -1019,7 +1035,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, - r->arp_ha.sa_data); + r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1033,7 +1049,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, } static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { __be32 ip; struct neighbour *neigh; @@ -1046,10 +1062,11 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; dev = rt->dst.dev; ip_rt_put(rt); @@ -1083,9 +1100,9 @@ static int arp_req_set(struct net *net, struct arpreq *r, unsigned state = NUD_STALE; if (r->arp_flags & ATF_PERM) state = NUD_PERMANENT; - err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? 
r->arp_ha.sa_data : NULL, state, - NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); } @@ -1094,12 +1111,12 @@ static int arp_req_set(struct net *net, struct arpreq *r, static unsigned arp_state_to_flags(struct neighbour *neigh) { - unsigned flags = 0; if (neigh->nud_state&NUD_PERMANENT) - flags = ATF_PERM|ATF_COM; + return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) - flags = ATF_COM; - return flags; + return ATF_COM; + else + return 0; } /* @@ -1142,7 +1159,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, } static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { int err; __be32 ip; @@ -1153,10 +1170,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; dev = rt->dst.dev; ip_rt_put(rt); @@ -1166,7 +1184,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, err = -ENXIO; neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { - if (neigh->nud_state&~NUD_NOARP) + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN); @@ -1186,24 +1204,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) struct net_device *dev = NULL; switch (cmd) { - case SIOCDARP: - case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - case SIOCGARP: - err = copy_from_user(&r, arg, sizeof(struct arpreq)); - if (err) - return -EFAULT; - break; - default: - return -EINVAL; + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && - (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = @@ -1211,7 +1229,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) + dev = __dev_get_by_name(net, r.arp_dev); + if (dev == NULL) goto out; /* Mmmm... It is wrong... 
ARPHRD_NETROM==0 */ @@ -1243,7 +1262,8 @@ out: return err; } -static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +static int arp_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) { struct net_device *dev = ptr; @@ -1311,12 +1331,13 @@ static char *ax2asc2(ax25_address *a, char *buf) for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; - if (c != ' ') *s++ = c; + if (c != ' ') + *s++ = c; } *s++ = '-'; - - if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + n = (a->ax25_call[6] >> 1) & 0x0F; + if (n > 9) { *s++ = '1'; n -= 10; } @@ -1325,10 +1346,9 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = '\0'; if (*buf == '\0' || *buf == '-') - return "*"; + return "*"; return buf; - } #endif /* CONFIG_AX25 */ @@ -1408,10 +1428,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) /* ------------------------------------------------------------------------ */ static const struct seq_operations arp_seq_ops = { - .start = arp_seq_start, - .next = neigh_seq_next, - .stop = neigh_seq_stop, - .show = arp_seq_show, + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, }; static int arp_seq_open(struct inode *inode, struct file *file) diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 0000000..b546736d --- /dev/null +++ b/net/ipv4/gre.c @@ -0,0 +1,151 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/kmod.h> +#include <linux/skbuff.h> +#include <linux/in.h> +#include <linux/netdevice.h> +#include <linux/version.h> +#include <linux/spinlock.h> +#include <net/protocol.h> +#include <net/gre.h> + + +const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static DEFINE_SPINLOCK(gre_proto_lock); + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version]) + goto err_out_unlock; + + rcu_assign_pointer(gre_proto[version], proto); + spin_unlock(&gre_proto_lock); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version] != proto) + goto err_out_unlock; + rcu_assign_pointer(gre_proto[version], NULL); + spin_unlock(&gre_proto_lock); + synchronize_rcu(); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +static int gre_rcv(struct sk_buff *skb) +{ + const struct gre_protocol *proto; + u8 ver; + int ret; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->handler) + goto drop_unlock; + ret = proto->handler(skb); + rcu_read_unlock(); + return ret; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + const struct gre_protocol *proto; + u8 ver; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->err_handler) + goto drop_unlock; + proto->err_handler(skb, info); + rcu_read_unlock(); + return; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); +} + +static const struct net_protocol net_gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .netns_ok = 1, +}; + +static int __init gre_init(void) +{ + pr_info("GRE over IPv4 demultiplexor driver"); + + if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { + pr_err("gre: can't add protocol\n"); + return -EAGAIN; + } + + return 0; +} + +static void __exit gre_exit(void) +{ + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. 
Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); + diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a0d847c7..96bc7f9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; { struct flowi fl = { diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b7c4165..f4dc879 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 945b20a..8517689 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,6 +44,7 @@ #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/rtnetlink.h> +#include <net/gre.h> #ifdef CONFIG_IPV6 #include <net/ipv6.h> @@ -1278,10 +1279,9 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) } -static const struct net_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .netns_ok = 1, +static const struct gre_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, }; static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) @@ -1663,7 +1663,7 @@ static int __init ipgre_init(void) if (err < 0) return err; - err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); goto add_proto_failed; @@ -1683,7 +1683,7 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_net_ops); goto out; @@ -1693,7 +1693,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); unregister_pernet_device(&ipgre_net_ops); } diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 04b6989..e427620 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. 
*/ - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { struct sk_buff *frag; int first_len = skb_pagelen(skb); int truesizes = 0; @@ -837,10 +837,9 @@ int ip_append_data(struct sock *sk, inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } + exthdrlen = rt->dst.header_len; + length += exthdrlen; + transhdrlen += exthdrlen; } else { rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) @@ -953,7 +952,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->shtx.flags = 0; + ipc->tx_flags = 0; } if (skb == NULL) goto error; @@ -964,7 +963,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - *skb_tx(skb) = ipc->shtx; + skb_shinfo(skb)->tx_flags = ipc->tx_flags; /* * Find where to start putting bytes. @@ -1384,7 +1383,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec03673..3c6f8f3 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -744,7 +744,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) ipn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel ipip_handler = { +static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 3a43cf3..1e26a48 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,6 +29,7 @@ #include <net/netfilter/nf_conntrack.h> #include <net/net_namespace.h> #include <net/checksum.h> +#include <net/ip.h> #define CLUSTERIP_VERSION "0.8" @@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; - u_int16_t sport, dport; - const u_int16_t *ports; - - switch (iph->protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ICMP: - ports = (const void *)iph+iph->ihl*4; - sport = ports[0]; - dport = ports[1]; - break; - default: + u_int16_t sport = 0, dport = 0; + int poff; + + poff = proto_ports_offset(iph->protocol); + if (poff >= 0) { + const u_int16_t *ports; + u16 _ports[2]; + + ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); + if (ports) { + sport = ports[0]; + dport = ports[1]; + } + } else { if (net_ratelimit()) pr_info("unknown protocol %u\n", iph->protocol); - sport = dport = 0; } switch (config->hash_mode) { diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index f2d2973..65699c2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,8 +28,7 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; -static DEFINE_SPINLOCK(inet_proto_lock); +const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; + int hash = protocol & (MAX_INET_PROTOS - 1); - hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash]) { - 
ret = -1; - } else { - inet_protos[hash] = prot; - ret = 0; - } - spin_unlock_bh(&inet_proto_lock); - - return ret; + return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol); int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; - - hash = protocol & (MAX_INET_PROTOS - 1); + int ret, hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash] == prot) { - inet_protos[hash] = NULL; - ret = 0; - } else { - ret = -1; - } - spin_unlock_bh(&inet_proto_lock); + ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 009a7b2..1f85ef2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 6298f75..e24d48d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1268,18 +1268,11 @@ skip_hashing: void rt_bind_peer(struct rtable *rt, int create) { - static DEFINE_SPINLOCK(rt_peer_lock); struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) + if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); } diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 3fb1428..3e8a4db 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2389,7 +2389,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tp->af_specific->md5_parse(sk, optval, optlen); break; #endif - + case TCP_USER_TIMEOUT: + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. + */ + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; default: err = -ENOPROTOOPT; break; @@ -2608,6 +2613,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: val = tp->thin_dupack; break; + + case TCP_USER_TIMEOUT: + val = jiffies_to_msecs(icsk->icsk_user_timeout); + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e663b78..1bc87a0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 
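/*
 * (The replacement just below folds this open-coded ladder into
 * rfc3390_bytes_to_packets(), i.e. RFC 3390's initial window of
 * min(4*MSS, max(2*MSS, 4380 bytes)) expressed in packets; a sketch,
 * assuming the net/tcp.h helper of this era:
 *
 *	static inline u32 rfc3390_bytes_to_packets(const u32 bytes)
 *	{
 *		return bytes <= 1095 ? 4 : (bytes > 2190 ? 2 : 3);
 *	}
 *
 * e.g. a 1460-byte MSS yields 3 segments, 3 * 1460 = 4380 bytes.)
 */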
3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0207662..a0232f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2571,7 +2571,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) return tcp_gro_receive(head, skb); } -EXPORT_SYMBOL(tcp4_gro_receive); int tcp4_gro_complete(struct sk_buff *skb) { @@ -2584,7 +2583,6 @@ int tcp4_gro_complete(struct sk_buff *skb) return tcp_gro_complete(skb); } -EXPORT_SYMBOL(tcp4_gro_complete); struct proto tcp_prot = { .name = "TCP", diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index de3bd84..ea09d2f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to value enough for senders, - * following RFC2414. Senders, not following this RFC, - * will be satisfied with 2. - */ + /* Set initial window to value enough for senders, following RFC5681. */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = 4; - if (mss > 1460 * 3) - init_cwnd = 2; - else if (mss > 1460) - init_cwnd = 3; + int init_cwnd = rfc3390_bytes_to_packets(mss); + /* when initializing use the value from init_rcv_wnd * rather than the default from above */ @@ -2429,6 +2423,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -2555,6 +2555,11 @@ static void tcp_connect_init(struct sock *sk) tcp_initialize_rcv_mss(sk); + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index c35b469..baea4a1 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -138,10 +138,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * retransmissions with an initial RTO of TCP_RTO_MIN. 
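 *
 * Worked example of the backoff math below, assuming the stock
 * TCP_RTO_MIN = HZ/5 (200ms) and TCP_RTO_MAX = 120s: while the RTO is
 * still doubling, the cumulative wait after `boundary' retransmissions
 * is
 *
 *	200ms * (1 + 2 + ... + 2^boundary) = ((2 << boundary) - 1) * 200ms
 *
 * so boundary = 3 gives 15 * 200ms = 3s. Past linear_backoff_thresh =
 * ilog2(120000 / 200) = ilog2(600) = 9, the doubled RTO would exceed
 * TCP_RTO_MAX, so each further retransmission adds a flat TCP_RTO_MAX.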
*/ static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary) + unsigned int boundary, + unsigned int timeout) { - unsigned int timeout, linear_backoff_thresh; - unsigned int start_ts; + unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -151,14 +151,15 @@ static bool retransmits_timed_out(struct sock *sk, else start_ts = tcp_sk(sk)->retrans_stamp; - linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); - - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; - else - timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; + else + timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } return (tcp_time_stamp - start_ts) >= timeout; } @@ -174,7 +175,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -187,14 +188,16 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (retransmits_timed_out(sk, retry_until)) { + if (retransmits_timed_out(sk, retry_until, + (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 : + icsk->icsk_user_timeout)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -436,7 +439,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; @@ -556,7 +559,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { - if (icsk->icsk_probes_out >= keepalive_probes(tp)) { + /* If the TCP_USER_TIMEOUT option is enabled, use that + * to determine when to timeout instead. 
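 *
 * From user space the new option takes milliseconds; a minimal usage
 * sketch (fd is a hypothetical connected TCP socket, the constant
 * comes from <netinet/tcp.h> or the kernel headers):
 *
 *	unsigned int tmo_ms = 30000;	// give up after 30s w/o progress
 *
 *	setsockopt(fd, IPPROTO_TCP, TCP_USER_TIMEOUT,
 *		   &tmo_ms, sizeof(tmo_ms));
 *
 * A value of 0 (the default) keeps the classic behaviour driven by
 * tcp_retries2 and, for keepalive, keepalive_probes.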
+ */ + if ((icsk->icsk_user_timeout != 0 && + elapsed >= icsk->icsk_user_timeout && + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 59186ca..9a17bd2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,8 +14,8 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm_tunnel *tunnel4_handlers; -static struct xfrm_tunnel *tunnel64_handlers; +static struct xfrm_tunnel *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel **fam_handlers(unsigned short family) @@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; @@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) } EXPORT_SYMBOL(xfrm4_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; @@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; @@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; @@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) break; } @@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) break; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index fb23c2e..b3f7e8c 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EOPNOTSUPP; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (up->pending) { /* @@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(msg, sk, &ipc.shtx); + err = sock_tx_timestamp(sk, &ipc.tx_flags); if (err) return err; if (msg->msg_controllen) { diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 41f5982..8280645 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) return -ENOENT; } -static struct xfrm_tunnel xfrm_tunnel_handler = { +static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, }; #if defined(CONFIG_IPV6) || 
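/*
 * (The tunnel4 changes above convert the handler list to RCU: writers
 * still serialize on tunnel4_mutex and publish with rcu_assign_pointer(),
 * while readers walk the chain through for_each_tunnel_rcu() under the
 * RX path's rcu_read_lock(). Reader-side sketch, my_handlers being a
 * hypothetical list head:
 *
 *	for_each_tunnel_rcu(my_handlers, handler)
 *		if (!handler->handler(skb))
 *			return 0;	// handler consumed the packet
 *
 * Deregistration must wait out a grace period, hence the existing
 * synchronize_net(), before a handler may be freed or reused.)
 */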
defined(CONFIG_IPV6_MODULE) -static struct xfrm_tunnel xfrm64_tunnel_handler = { +static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ab70a3f..5bc893e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2964,7 +2964,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) start sending router solicitations. */ - if (ifp->idev->cnf.forwarding == 0 && + if ((ifp->idev->cnf.forwarding == 0 || + ifp->idev->cnf.forwarding == 2) && ifp->idev->cnf.rtr_solicits > 0 && (dev->flags&IFF_LOOPBACK) == 0 && (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d40b330..1838927 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -637,7 +637,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } mtu -= hlen + sizeof(struct frag_hdr); - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); int truesizes = 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 0fd027f..29f99dd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1372,13 +1372,13 @@ static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev) ip6n->tnls_wc[0] = t; } -static struct xfrm6_tunnel ip4ip6_handler = { +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; -static struct xfrm6_tunnel ip6ip6_handler = { +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 58841c4..69a0051 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1105,6 +1105,18 @@ errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } +static inline int accept_ra(struct inet6_dev *in6_dev) +{ + /* + * If forwarding is enabled, RA are not accepted unless the special + * hybrid mode (accept_ra=2) is enabled. + */ + if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2) + return 0; + + return in6_dev->cnf.accept_ra; +} + static void ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); @@ -1158,8 +1170,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - /* skip route and link configuration on routers */ - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + if (!accept_ra(in6_dev)) goto skip_linkparms; #ifdef CONFIG_IPV6_NDISC_NODETYPE @@ -1309,8 +1320,7 @@ skip_linkparms: NEIGH_UPDATE_F_ISROUTER); } - /* skip route and link configuration on routers */ - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + if (!accept_ra(in6_dev)) goto out; #ifdef CONFIG_IPV6_ROUTE_INFO diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 578f3c1..138a8b3 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -363,7 +363,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. 
*/ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index 1fa3468..9bb936a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -25,28 +25,14 @@ #include <linux/spinlock.h> #include <net/protocol.h> -const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; -static DEFINE_SPINLOCK(inet6_proto_lock); - +const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] __read_mostly; int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet6_proto_lock); - - if (inet6_protos[hash]) { - ret = -1; - } else { - inet6_protos[hash] = prot; - ret = 0; - } - - spin_unlock_bh(&inet6_proto_lock); + int hash = protocol & (MAX_INET_PROTOS - 1); - return ret; + return !cmpxchg(&inet6_protos[hash], NULL, prot) ? 0 : -1; } - EXPORT_SYMBOL(inet6_add_protocol); /* @@ -57,20 +43,10 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet6_proto_lock); - - if (inet6_protos[hash] != prot) { - ret = -1; - } else { - inet6_protos[hash] = NULL; - ret = 0; - } - - spin_unlock_bh(&inet6_proto_lock); + ret = (cmpxchg(&inet6_protos[hash], prot, NULL) == prot) ? 0 : -1; synchronize_net(); return ret; } - EXPORT_SYMBOL(inet6_del_protocol); diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 64cfef1..c7ba314 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -458,7 +458,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. 
*/ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4699cd3..86618eb 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1132,7 +1132,7 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) sitn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel sit_handler = { +static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index fc3c86a..d986472 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -30,8 +30,8 @@ #include <net/protocol.h> #include <net/xfrm.h> -static struct xfrm6_tunnel *tunnel6_handlers; -static struct xfrm6_tunnel *tunnel46_handlers; +static struct xfrm6_tunnel *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) @@ -51,7 +51,7 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; @@ -88,6 +88,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) EXPORT_SYMBOL(xfrm6_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -95,7 +100,7 @@ static int tunnel6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; @@ -113,7 +118,7 @@ static int tunnel46_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel46_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; @@ -129,7 +134,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { struct xfrm6_tunnel *handler; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) break; } diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 2ce3a82..ac7584b 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -317,13 +317,13 @@ static const struct xfrm_type xfrm6_tunnel_type = { .output = xfrm6_tunnel_output, }; -static struct xfrm6_tunnel xfrm6_tunnel_handler = { +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; -static struct xfrm6_tunnel xfrm46_tunnel_handler = { +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 5bb8353..8ee1ff6 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -45,13 +45,11 @@ static int irlan_eth_close(struct net_device *dev); static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); static void irlan_eth_set_multicast_list( struct net_device *dev); -static struct net_device_stats 
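/*
 * (The irlan hunks around here move all counters into the generic
 * dev->stats, which the core already falls back to when a driver sets
 * no .ndo_get_stats hook; roughly, simplified from the dev_get_stats()
 * of this era:
 *
 *	if (dev->netdev_ops->ndo_get_stats)
 *		return dev->netdev_ops->ndo_get_stats(dev);
 *	return &dev->stats;
 *
 * which is why irlan_eth_get_stats() and its .ndo_get_stats initializer
 * can simply be deleted.)
 */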
*irlan_eth_get_stats(struct net_device *dev); static const struct net_device_ops irlan_eth_netdev_ops = { .ndo_open = irlan_eth_open, .ndo_stop = irlan_eth_close, .ndo_start_xmit = irlan_eth_xmit, - .ndo_get_stats = irlan_eth_get_stats, .ndo_set_multicast_list = irlan_eth_set_multicast_list, .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, @@ -208,10 +206,10 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, * tried :-) DB */ /* irttp_data_request already free the packet */ - self->stats.tx_dropped++; + dev->stats.tx_dropped++; } else { - self->stats.tx_packets++; - self->stats.tx_bytes += len; + dev->stats.tx_packets++; + dev->stats.tx_bytes += len; } return NETDEV_TX_OK; @@ -226,15 +224,16 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) { struct irlan_cb *self = instance; + struct net_device *dev = self->dev; if (skb == NULL) { - ++self->stats.rx_dropped; + dev->stats.rx_dropped++; return 0; } if (skb->len < ETH_HLEN) { IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n", __func__, skb->len); - ++self->stats.rx_dropped; + dev->stats.rx_dropped++; dev_kfree_skb(skb); return 0; } @@ -244,10 +243,10 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) * might have been previously set by the low level IrDA network * device driver */ - skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */ + skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */ - self->stats.rx_packets++; - self->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; netif_rx(skb); /* Eat it! */ @@ -348,16 +347,3 @@ static void irlan_eth_set_multicast_list(struct net_device *dev) else irlan_set_broadcast_filter(self, FALSE); } - -/* - * Function irlan_get_stats (dev) - * - * Get the current statistics for this device - * - */ -static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev) -{ - struct irlan_cb *self = netdev_priv(dev); - - return &self->stats; -} diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 1ae6976..8d9ce0a 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -144,7 +144,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, nf_reset(skb); if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { - dev->last_rx = jiffies; dev->stats.rx_packets++; dev->stats.rx_bytes += data_len; } else diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c index a87cb3b..d2b03e0 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aes_ccm.c @@ -138,10 +138,8 @@ struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[]) struct crypto_cipher *tfm; tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return NULL; - - crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN); + if (!IS_ERR(tfm)) + crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN); return tfm; } diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 3d097b3..b4d66cc 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -119,10 +119,8 @@ struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]) struct crypto_cipher *tfm; tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return NULL; - - crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + if (!IS_ERR(tfm)) + crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); return tfm; } diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 965b272..58eab9e 100644 --- 
a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -86,6 +86,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, tid, 0, reason); del_timer_sync(&tid_rx->session_timer); + del_timer_sync(&tid_rx->reorder_timer); call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); } @@ -120,6 +121,20 @@ static void sta_rx_agg_session_timer_expired(unsigned long data) ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); } +static void sta_rx_agg_reorder_timer_expired(unsigned long data) +{ + u8 *ptid = (u8 *)data; + u8 *timer_to_id = ptid - *ptid; + struct sta_info *sta = container_of(timer_to_id, struct sta_info, + timer_to_tid[0]); + + rcu_read_lock(); + spin_lock(&sta->lock); + ieee80211_release_reorder_timeout(sta, *ptid); + spin_unlock(&sta->lock); + rcu_read_unlock(); +} + static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, u16 buf_size, u16 timeout) @@ -251,11 +266,18 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, goto end; } + spin_lock_init(&tid_agg_rx->reorder_lock); + /* rx timer */ tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired; tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid]; init_timer(&tid_agg_rx->session_timer); + /* rx reorder timer */ + tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired; + tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&tid_agg_rx->reorder_timer); + /* prepare reordering buffer */ tid_agg_rx->reorder_buf = kcalloc(buf_size, sizeof(struct sk_buff *), GFP_ATOMIC); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 29ac8e1..5de1ca3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -19,33 +19,6 @@ #include "rate.h" #include "mesh.h" -static bool nl80211_type_check(enum nl80211_iftype type) -{ - switch (type) { - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_MONITOR: -#ifdef CONFIG_MAC80211_MESH - case NL80211_IFTYPE_MESH_POINT: -#endif - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: - return true; - default: - return false; - } -} - -static bool nl80211_params_check(enum nl80211_iftype type, - struct vif_params *params) -{ - if (!nl80211_type_check(type)) - return false; - - return true; -} - static int ieee80211_add_iface(struct wiphy *wiphy, char *name, enum nl80211_iftype type, u32 *flags, struct vif_params *params) @@ -55,9 +28,6 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name, struct ieee80211_sub_if_data *sdata; int err; - if (!nl80211_params_check(type, params)) - return -EINVAL; - err = ieee80211_if_add(local, name, &dev, type, params); if (err || type != NL80211_IFTYPE_MONITOR || !flags) return err; @@ -82,12 +52,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int ret; - if (ieee80211_sdata_running(sdata)) - return -EBUSY; - - if (!nl80211_params_check(type, params)) - return -EINVAL; - ret = ieee80211_if_change_type(sdata, type); if (ret) return ret; @@ -114,44 +78,30 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, u8 key_idx, const u8 *mac_addr, struct key_params *params) { - struct ieee80211_sub_if_data *sdata; + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta = NULL; - enum ieee80211_key_alg alg; struct ieee80211_key *key; int err; - if (!netif_running(dev)) + if 
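/*
 * (A note on the timer_to_tid trick used by the new reorder timer
 * above: each per-TID timer's data points at sta->timer_to_tid[tid],
 * an array whose element i stores the value i. Subtracting *ptid from
 * the pointer therefore lands on element 0, and container_of() then
 * recovers the owning sta_info; one small array doubles as tid storage
 * and back-pointer:
 *
 *	u8 *ptid = (u8 *)data;		// &sta->timer_to_tid[tid]
 *	u8 *base = ptid - *ptid;	// &sta->timer_to_tid[0]
 *	struct sta_info *sta =
 *		container_of(base, struct sta_info, timer_to_tid[0]);
 * )
 */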
(!ieee80211_sdata_running(sdata)) return -ENETDOWN; - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - + /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: - case WLAN_CIPHER_SUITE_WEP104: - alg = ALG_WEP; - break; case WLAN_CIPHER_SUITE_TKIP: - alg = ALG_TKIP; - break; - case WLAN_CIPHER_SUITE_CCMP: - alg = ALG_CCMP; - break; - case WLAN_CIPHER_SUITE_AES_CMAC: - alg = ALG_AES_CMAC; + case WLAN_CIPHER_SUITE_WEP104: + if (IS_ERR(sdata->local->wep_tx_tfm)) + return -EINVAL; break; default: - return -EINVAL; + break; } - /* reject WEP and TKIP keys if WEP failed to initialize */ - if ((alg == ALG_WEP || alg == ALG_TKIP) && - IS_ERR(sdata->local->wep_tx_tfm)) - return -EINVAL; - - key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key, - params->seq_len, params->seq); - if (!key) - return -ENOMEM; + key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len, + params->key, params->seq_len, params->seq); + if (IS_ERR(key)) + return PTR_ERR(key); mutex_lock(&sdata->local->sta_mtx); @@ -164,9 +114,10 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } } - ieee80211_key_link(key, sdata, sta); + err = ieee80211_key_link(key, sdata, sta); + if (err) + ieee80211_key_free(sdata->local, key); - err = 0; out_unlock: mutex_unlock(&sdata->local->sta_mtx); @@ -247,10 +198,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, memset(¶ms, 0, sizeof(params)); - switch (key->conf.alg) { - case ALG_TKIP: - params.cipher = WLAN_CIPHER_SUITE_TKIP; + params.cipher = key->conf.cipher; + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_TKIP: iv32 = key->u.tkip.tx.iv32; iv16 = key->u.tkip.tx.iv16; @@ -268,8 +219,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq = seq; params.seq_len = 6; break; - case ALG_CCMP: - params.cipher = WLAN_CIPHER_SUITE_CCMP; + case WLAN_CIPHER_SUITE_CCMP: seq[0] = key->u.ccmp.tx_pn[5]; seq[1] = key->u.ccmp.tx_pn[4]; seq[2] = key->u.ccmp.tx_pn[3]; @@ -279,14 +229,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq = seq; params.seq_len = 6; break; - case ALG_WEP: - if (key->conf.keylen == 5) - params.cipher = WLAN_CIPHER_SUITE_WEP40; - else - params.cipher = WLAN_CIPHER_SUITE_WEP104; - break; - case ALG_AES_CMAC: - params.cipher = WLAN_CIPHER_SUITE_AES_CMAC; + case WLAN_CIPHER_SUITE_AES_CMAC: seq[0] = key->u.aes_cmac.tx_pn[5]; seq[1] = key->u.aes_cmac.tx_pn[4]; seq[2] = key->u.aes_cmac.tx_pn[3]; @@ -1143,9 +1086,9 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy, p.uapsd = false; if (drv_conf_tx(local, params->queue, &p)) { - printk(KERN_DEBUG "%s: failed to set TX queue " - "parameters for queue %d\n", - wiphy_name(local->hw.wiphy), params->queue); + wiphy_debug(local->hw.wiphy, + "failed to set TX queue parameters for queue %d\n", + params->queue); return -EINVAL; } @@ -1541,11 +1484,11 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, return ieee80211_wk_cancel_remain_on_channel(sdata, cookie); } -static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_channel_type channel_type, - bool channel_type_valid, - const u8 *buf, size_t len, u64 *cookie) +static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, + const u8 *buf, size_t len, u64 *cookie) { struct 
ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -1575,8 +1518,6 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev, return -ENOLINK; break; case NL80211_IFTYPE_STATION: - if (!(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) - flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; break; default: return -EOPNOTSUPP; @@ -1647,6 +1588,6 @@ struct cfg80211_ops mac80211_config_ops = { .set_bitrate_mask = ieee80211_set_bitrate_mask, .remain_on_channel = ieee80211_remain_on_channel, .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, - .action = ieee80211_action, + .mgmt_tx = ieee80211_mgmt_tx, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, }; diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index a694c59..e81ef4e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -85,13 +85,15 @@ static ssize_t tsf_write(struct file *file, if (strncmp(buf, "reset", 5) == 0) { if (local->ops->reset_tsf) { drv_reset_tsf(local); - printk(KERN_INFO "%s: debugfs reset TSF\n", wiphy_name(local->hw.wiphy)); + wiphy_info(local->hw.wiphy, "debugfs reset TSF\n"); } } else { tsf = simple_strtoul(buf, NULL, 0); if (local->ops->set_tsf) { drv_set_tsf(local, tsf); - printk(KERN_INFO "%s: debugfs set TSF to %#018llx\n", wiphy_name(local->hw.wiphy), tsf); + wiphy_info(local->hw.wiphy, + "debugfs set TSF to %#018llx\n", tsf); + } } diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index fa5e76e..1647f8d 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -64,26 +64,13 @@ static ssize_t key_algorithm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char *alg; + char buf[15]; struct ieee80211_key *key = file->private_data; + u32 c = key->conf.cipher; - switch (key->conf.alg) { - case ALG_WEP: - alg = "WEP\n"; - break; - case ALG_TKIP: - alg = "TKIP\n"; - break; - case ALG_CCMP: - alg = "CCMP\n"; - break; - case ALG_AES_CMAC: - alg = "AES-128-CMAC\n"; - break; - default: - return 0; - } - return simple_read_from_buffer(userbuf, count, ppos, alg, strlen(alg)); + sprintf(buf, "%.2x-%.2x-%.2x:%d\n", + c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); + return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } KEY_OPS(algorithm); @@ -95,21 +82,22 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, int len; struct ieee80211_key *key = file->private_data; - switch (key->conf.alg) { - case ALG_WEP: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: len = scnprintf(buf, sizeof(buf), "%08x %04x\n", key->u.tkip.tx.iv32, key->u.tkip.tx.iv16); break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: tpn = key->u.ccmp.tx_pn; len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: tpn = key->u.aes_cmac.tx_pn; len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], @@ -130,11 +118,12 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, int i, len; const u8 *rpn; - switch (key->conf.alg) { - case ALG_WEP: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; - case ALG_TKIP: + case 
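/*
 * (key_algorithm above now prints the raw IEEE 802.11 cipher suite
 * selector instead of a symbolic name: the u32 packs OUI << 8 | type,
 * so WLAN_CIPHER_SUITE_CCMP == 0x000fac04 renders as "00-0f-ac:4".
 * Quick check of the format used:
 *
 *	u32 c = 0x000fac04;	// illustrative value
 *	printf("%.2x-%.2x-%.2x:%d\n",
 *	       c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff);
 * )
 */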
WLAN_CIPHER_SUITE_TKIP: for (i = 0; i < NUM_RX_DATA_QUEUES; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%08x %04x\n", @@ -142,7 +131,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, key->u.tkip.rx[i].iv16); len = p - buf; break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) { rpn = key->u.ccmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, @@ -152,7 +141,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, } len = p - buf; break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", @@ -174,11 +163,11 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf, char buf[20]; int len; - switch (key->conf.alg) { - case ALG_CCMP: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; @@ -196,8 +185,8 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, char buf[20]; int len; - switch (key->conf.alg) { - case ALG_AES_CMAC: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_AES_CMAC: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 14123dc..6064b7b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -54,6 +54,20 @@ static inline int drv_add_interface(struct ieee80211_local *local, return ret; } +static inline int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + int ret; + + might_sleep(); + + trace_drv_change_interface(local, sdata, type); + ret = local->ops->change_interface(&local->hw, &sdata->vif, type); + trace_drv_return_int(local, ret); + return ret; +} + static inline void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_vif *vif) { diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index 5d5d2a9..f6f3d89 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -136,6 +136,31 @@ TRACE_EVENT(drv_add_interface, ) ); +TRACE_EVENT(drv_change_interface, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type), + + TP_ARGS(local, sdata, type), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u32, new_type) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->new_type = type; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " new type:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type + ) +); + TRACE_EVENT(drv_remove_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), @@ -336,7 +361,7 @@ TRACE_EVENT(drv_set_key, LOCAL_ENTRY VIF_ENTRY STA_ENTRY - __field(enum ieee80211_key_alg, alg) + __field(u32, cipher) __field(u8, hw_key_idx) __field(u8, flags) __field(s8, keyidx) @@ -346,7 +371,7 @@ TRACE_EVENT(drv_set_key, LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; - __entry->alg = key->alg; + __entry->cipher = key->cipher; __entry->flags = key->flags; __entry->keyidx = key->keyidx; __entry->hw_key_idx = key->hw_key_idx; diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 9d101fb..11f74f5 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -265,3 +265,31 @@ int ieee80211_send_smps_action(struct 
ieee80211_sub_if_data *sdata, return 0; } + +void ieee80211_request_smps_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.request_smps_work); + + mutex_lock(&sdata->u.mgd.mtx); + __ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode); + mutex_unlock(&sdata->u.mgd.mtx); +} + +void ieee80211_request_smps(struct ieee80211_vif *vif, + enum ieee80211_smps_mode smps_mode) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return; + + if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF)) + smps_mode = IEEE80211_SMPS_AUTOMATIC; + + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); +} +/* this might change ... don't want non-open drivers using it */ +EXPORT_SYMBOL_GPL(ieee80211_request_smps); diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index c691780..1a3aae5 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -427,8 +427,8 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, return NULL; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", - wiphy_name(local->hw.wiphy), addr, sdata->name); + wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n", + addr, sdata->name); #endif sta = sta_info_alloc(sdata, addr, gfp); @@ -920,12 +920,14 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN); sdata->u.ibss.ssid_len = params->ssid_len; + mutex_unlock(&sdata->u.ibss.mtx); + + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); ieee80211_queue_work(&sdata->local->hw, &sdata->work); - mutex_unlock(&sdata->u.ibss.mtx); - return 0; } @@ -980,7 +982,9 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->u.ibss.mtx); + mutex_lock(&local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&local->mtx); return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 65e0ed6..4e635e2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -50,12 +50,6 @@ struct ieee80211_local; * increased memory use (about 2 kB of RAM per entry). */ #define IEEE80211_FRAGMENT_MAX 4 -/* - * Time after which we ignore scan results and no longer report/use - * them in any way. - */ -#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ) - #define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) #define IEEE80211_DEFAULT_UAPSD_QUEUES \ @@ -170,6 +164,7 @@ typedef unsigned __bitwise__ ieee80211_rx_result; #define IEEE80211_RX_RA_MATCH BIT(1) #define IEEE80211_RX_AMSDU BIT(2) #define IEEE80211_RX_FRAGMENTED BIT(3) +#define IEEE80211_MALFORMED_ACTION_FRM BIT(4) /* only add flags here that do not change with subframes of an aMPDU */ struct ieee80211_rx_data { @@ -343,7 +338,10 @@ struct ieee80211_if_managed { unsigned long timers_running; /* used for quiesce/restart */ bool powersave; /* powersave requested for this iface */ enum ieee80211_smps_mode req_smps, /* requested smps mode */ - ap_smps; /* smps mode AP thinks we're in */ + ap_smps, /* smps mode AP thinks we're in */ + driver_smps_mode; /* smps mode request */ + + struct work_struct request_smps_work; unsigned int flags; @@ -371,6 +369,13 @@ struct ieee80211_if_managed { int ave_beacon_signal; /* + * Number of Beacon frames used in ave_beacon_signal. 
This can be used + * to avoid generating less reliable cqm events that would be based + * only on couple of received frames. + */ + unsigned int count_beacon_signal; + + /* * Last Beacon frame signal strength average (ave_beacon_signal / 16) * that triggered a cqm event. 0 indicates that no event has been * generated for the current association. @@ -474,6 +479,19 @@ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), }; +/** + * enum ieee80211_sdata_state_bits - virtual interface state bits + * @SDATA_STATE_RUNNING: virtual interface is up & running; this + * mirrors netif_running() but is separate for interface type + * change handling while the interface is up + * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel + * mode, so queues are stopped + */ +enum ieee80211_sdata_state_bits { + SDATA_STATE_RUNNING, + SDATA_STATE_OFFCHANNEL, +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -487,6 +505,8 @@ struct ieee80211_sub_if_data { unsigned int flags; + unsigned long state; + int drop_unencrypted; char name[IFNAMSIZ]; @@ -497,6 +517,9 @@ struct ieee80211_sub_if_data { */ bool ht_opmode_valid; + /* to detect idle changes */ + bool old_idle; + /* Fragment table for host-based reassembly */ struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; unsigned int fragment_next; @@ -508,6 +531,8 @@ struct ieee80211_sub_if_data { struct ieee80211_key *default_mgmt_key; u16 sequence_number; + __be16 control_port_protocol; + bool control_port_no_encrypt; struct work_struct work; struct sk_buff_head skb_queue; @@ -595,11 +620,17 @@ enum queue_stop_reason { * determine if we are on the operating channel or not * @SCAN_OFF_CHANNEL: We're off our operating channel for scanning, * gets only set in conjunction with SCAN_SW_SCANNING + * @SCAN_COMPLETED: Set for our scan work function when the driver reported + * that the scan completed. + * @SCAN_ABORTED: Set for our scan work function when the driver reported + * a scan complete for an aborted scan. 
*/ enum { SCAN_SW_SCANNING, SCAN_HW_SCANNING, SCAN_OFF_CHANNEL, + SCAN_COMPLETED, + SCAN_ABORTED, }; /** @@ -634,7 +665,6 @@ struct ieee80211_local { /* * work stuff, potentially off-channel (in the future) */ - struct mutex work_mtx; struct list_head work_list; struct timer_list work_timer; struct work_struct work_work; @@ -656,6 +686,8 @@ struct ieee80211_local { int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll; unsigned int filter_flags; /* FIF_* */ + bool wiphy_ciphers_allocated; + /* protects the aggregated multicast list and filter calls */ spinlock_t filter_lock; @@ -746,9 +778,10 @@ struct ieee80211_local { */ struct mutex key_mtx; + /* mutex for scan and work locking */ + struct mutex mtx; /* Scanning and BSS list */ - struct mutex scan_mtx; unsigned long scanning; struct cfg80211_ssid scan_ssid; struct cfg80211_scan_request *int_scan_req; @@ -870,6 +903,11 @@ struct ieee80211_local { struct dentry *keys; } debugfs; #endif + + /* dummy netdev for use w/ NAPI */ + struct net_device napi_dev; + + struct napi_struct napi; }; static inline struct ieee80211_sub_if_data * @@ -1071,7 +1109,7 @@ void ieee80211_recalc_idle(struct ieee80211_local *local); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { - return netif_running(sdata->dev); + return test_bit(SDATA_STATE_RUNNING, &sdata->state); } /* tx handling */ @@ -1105,6 +1143,7 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid); +void ieee80211_request_smps_work(struct work_struct *work); void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason); @@ -1131,6 +1170,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); void ieee80211_ba_session_work(struct work_struct *work); void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, @@ -1146,6 +1186,12 @@ int __ieee80211_suspend(struct ieee80211_hw *hw); static inline int __ieee80211_resume(struct ieee80211_hw *hw) { + struct ieee80211_local *local = hw_to_local(hw); + + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s: resume with hardware scan still in progress\n", + wiphy_name(hw->wiphy)); + return ieee80211_reconfig(hw_to_local(hw)); } #else diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index ebbe264..c1cc200 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -94,21 +94,14 @@ static inline int identical_mac_addr_allowed(int type1, int type2) type2 == NL80211_IFTYPE_AP_VLAN)); } -static int ieee80211_open(struct net_device *dev) +static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype iftype) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct ieee80211_sub_if_data *nsdata; struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - u32 changed = 0; - int res; - u32 hw_reconf_flags = 0; - u8 null_addr[ETH_ALEN] = {0}; + struct ieee80211_sub_if_data *nsdata; + struct net_device *dev = sdata->dev; - /* fail early if user set an invalid address */ - if (compare_ether_addr(dev->dev_addr, null_addr) && - !is_valid_ether_addr(dev->dev_addr)) - return 
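/*
 * (The address validation below now uses is_zero_ether_addr() rather
 * than comparing against a stack null_addr; assuming the usual
 * linux/etherdevice.h definition:
 *
 *	static inline bool is_zero_ether_addr(const u8 *addr)
 *	{
 *		return !(addr[0] | addr[1] | addr[2] |
 *			 addr[3] | addr[4] | addr[5]);
 *	}
 * )
 */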
-EADDRNOTAVAIL; + ASSERT_RTNL(); /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { @@ -125,7 +118,7 @@ static int ieee80211_open(struct net_device *dev) * belonging to the same hardware. Then, however, we're * faced with having to adopt two different TSF timers... */ - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && + if (iftype == NL80211_IFTYPE_ADHOC && nsdata->vif.type == NL80211_IFTYPE_ADHOC) return -EBUSY; @@ -139,19 +132,36 @@ static int ieee80211_open(struct net_device *dev) /* * check whether it may have the same address */ - if (!identical_mac_addr_allowed(sdata->vif.type, + if (!identical_mac_addr_allowed(iftype, nsdata->vif.type)) return -ENOTUNIQ; /* * can only add VLANs to enabled APs */ - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + if (iftype == NL80211_IFTYPE_AP_VLAN && nsdata->vif.type == NL80211_IFTYPE_AP) sdata->bss = &nsdata->u.ap; } } + return 0; +} + +/* + * NOTE: Be very careful when changing this function, it must NOT return + * an error on interface type changes that have been pre-checked, so most + * checks should be in ieee80211_check_concurrent_iface. + */ +static int ieee80211_do_open(struct net_device *dev, bool coming_up) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + u32 changed = 0; + int res; + u32 hw_reconf_flags = 0; + switch (sdata->vif.type) { case NL80211_IFTYPE_WDS: if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) @@ -177,7 +187,7 @@ static int ieee80211_open(struct net_device *dev) /* no special treatment */ break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: /* cannot happen */ WARN_ON(1); break; @@ -187,39 +197,30 @@ static int ieee80211_open(struct net_device *dev) res = drv_start(local); if (res) goto err_del_bss; + if (local->ops->napi_poll) + napi_enable(&local->napi); /* we're brought up, everything changes */ hw_reconf_flags = ~0; ieee80211_led_radio(local, true); } /* - * Check all interfaces and copy the hopefully now-present - * MAC address to those that have the special null one. + * Copy the hopefully now-present MAC address to + * this interface, if it has the special null one. */ - list_for_each_entry(nsdata, &local->interfaces, list) { - struct net_device *ndev = nsdata->dev; - - /* - * No need to check running since we do not allow - * it to start up with this invalid address. - */ - if (compare_ether_addr(null_addr, ndev->dev_addr) == 0) { - memcpy(ndev->dev_addr, - local->hw.wiphy->perm_addr, - ETH_ALEN); - memcpy(ndev->perm_addr, ndev->dev_addr, ETH_ALEN); + if (is_zero_ether_addr(dev->dev_addr)) { + memcpy(dev->dev_addr, + local->hw.wiphy->perm_addr, + ETH_ALEN); + memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); + + if (!is_valid_ether_addr(dev->dev_addr)) { + if (!local->open_count) + drv_stop(local); + return -EADDRNOTAVAIL; } } - /* - * Validate the MAC address for this device. 
- */ - if (!is_valid_ether_addr(dev->dev_addr)) { - if (!local->open_count) - drv_stop(local); - return -EADDRNOTAVAIL; - } - switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: /* no need to tell driver */ @@ -253,9 +254,11 @@ static int ieee80211_open(struct net_device *dev) netif_carrier_on(dev); break; default: - res = drv_add_interface(local, &sdata->vif); - if (res) - goto err_stop; + if (coming_up) { + res = drv_add_interface(local, &sdata->vif); + if (res) + goto err_stop; + } if (ieee80211_vif_is_mesh(&sdata->vif)) { local->fif_other_bss++; @@ -307,9 +310,13 @@ static int ieee80211_open(struct net_device *dev) if (sdata->flags & IEEE80211_SDATA_PROMISC) atomic_inc(&local->iff_promiscs); + mutex_lock(&local->mtx); hw_reconf_flags |= __ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + + if (coming_up) + local->open_count++; - local->open_count++; if (hw_reconf_flags) { ieee80211_hw_config(local, hw_reconf_flags); /* @@ -324,6 +331,8 @@ static int ieee80211_open(struct net_device *dev) netif_tx_start_all_queues(dev); + set_bit(SDATA_STATE_RUNNING, &sdata->state); + return 0; err_del_interface: drv_remove_interface(local, &sdata->vif); @@ -337,19 +346,38 @@ static int ieee80211_open(struct net_device *dev) return res; } -static int ieee80211_stop(struct net_device *dev) +static int ieee80211_open(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int err; + + /* fail early if user set an invalid address */ + if (!is_zero_ether_addr(dev->dev_addr) && + !is_valid_ether_addr(dev->dev_addr)) + return -EADDRNOTAVAIL; + + err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); + if (err) + return err; + + return ieee80211_do_open(dev, true); +} + +static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, + bool going_down) +{ struct ieee80211_local *local = sdata->local; unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; int i; + clear_bit(SDATA_STATE_RUNNING, &sdata->state); + /* * Stop TX on this interface first. */ - netif_tx_stop_all_queues(dev); + netif_tx_stop_all_queues(sdata->dev); /* * Purge work for this interface. @@ -366,12 +394,9 @@ static int ieee80211_stop(struct net_device *dev) * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * We could relax this and only unlink the stations from the - * hash table and list but keep them on a per-sdata list that - * will be inserted back again when the interface is brought - * up again, but I don't currently see a use case for that, - * except with WDS which gets a STA entry created when it is - * brought up. + * This is relevant only in AP, WDS and mesh modes, since in + * all other modes we've already removed all stations when + * disconnecting etc. 
*/ sta_info_flush(local, sdata); @@ -390,11 +415,12 @@ static int ieee80211_stop(struct net_device *dev) if (sdata->vif.type == NL80211_IFTYPE_AP) local->fif_pspoll--; - netif_addr_lock_bh(dev); + netif_addr_lock_bh(sdata->dev); spin_lock_bh(&local->filter_lock); - __hw_addr_unsync(&local->mc_list, &dev->mc, dev->addr_len); + __hw_addr_unsync(&local->mc_list, &sdata->dev->mc, + sdata->dev->addr_len); spin_unlock_bh(&local->filter_lock); - netif_addr_unlock_bh(dev); + netif_addr_unlock_bh(sdata->dev); ieee80211_configure_filter(local); @@ -406,11 +432,21 @@ static int ieee80211_stop(struct net_device *dev) struct ieee80211_sub_if_data *vlan, *tmpsdata; struct beacon_data *old_beacon = sdata->u.ap.beacon; + /* sdata_running will return false, so this will disable */ + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_BEACON_ENABLED); + /* remove beacon */ rcu_assign_pointer(sdata->u.ap.beacon, NULL); synchronize_rcu(); kfree(old_beacon); + /* free all potentially still buffered bcast frames */ + while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) { + local->total_ps_buffered--; + dev_kfree_skb(skb); + } + /* down all dependent devices, that is VLANs */ list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) @@ -418,7 +454,8 @@ static int ieee80211_stop(struct net_device *dev) WARN_ON(!list_empty(&sdata->u.ap.vlans)); } - local->open_count--; + if (going_down) + local->open_count--; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: @@ -450,27 +487,6 @@ static int ieee80211_stop(struct net_device *dev) ieee80211_configure_filter(local); break; - case NL80211_IFTYPE_STATION: - del_timer_sync(&sdata->u.mgd.chswitch_timer); - del_timer_sync(&sdata->u.mgd.timer); - del_timer_sync(&sdata->u.mgd.conn_mon_timer); - del_timer_sync(&sdata->u.mgd.bcn_mon_timer); - /* - * If any of the timers fired while we waited for it, it will - * have queued its work. Now the work will be running again - * but will not rearm the timer again because it checks - * whether the interface is running, which, at this point, - * it no longer is. - */ - cancel_work_sync(&sdata->u.mgd.chswitch_work); - cancel_work_sync(&sdata->u.mgd.monitor_work); - cancel_work_sync(&sdata->u.mgd.beacon_connection_loss_work); - - /* fall through */ - case NL80211_IFTYPE_ADHOC: - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - del_timer_sync(&sdata->u.ibss.timer); - /* fall through */ case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) { /* other_bss and allmulti are always set on mesh @@ -498,27 +514,34 @@ static int ieee80211_stop(struct net_device *dev) ieee80211_scan_cancel(local); /* - * Disable beaconing for AP and mesh, IBSS can't - * still be joined to a network at this point. + * Disable beaconing here for mesh only, AP and IBSS + * are already taken care of. */ - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { + if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); - } - /* free all remaining keys, there shouldn't be any */ + /* + * Free all remaining keys, there shouldn't be any, + * except maybe group keys in AP more or WDS? 
+ */ ieee80211_free_keys(sdata); - drv_remove_interface(local, &sdata->vif); + + if (going_down) + drv_remove_interface(local, &sdata->vif); } sdata->bss = NULL; + mutex_lock(&local->mtx); hw_reconf_flags |= __ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); ieee80211_recalc_ps(local, -1); if (local->open_count == 0) { + if (local->ops->napi_poll) + napi_disable(&local->napi); ieee80211_clear_tx_pending(local); ieee80211_stop_device(local); @@ -541,6 +564,13 @@ static int ieee80211_stop(struct net_device *dev) } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +static int ieee80211_stop(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + ieee80211_do_stop(sdata, true); return 0; } @@ -585,8 +615,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - struct beacon_data *beacon; - struct sk_buff *skb; int flushed; int i; @@ -599,37 +627,8 @@ static void ieee80211_teardown_sdata(struct net_device *dev) __skb_queue_purge(&sdata->fragments[i].skb_list); sdata->fragment_next = 0; - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP: - beacon = sdata->u.ap.beacon; - rcu_assign_pointer(sdata->u.ap.beacon, NULL); - synchronize_rcu(); - kfree(beacon); - - while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) { - local->total_ps_buffered--; - dev_kfree_skb(skb); - } - - break; - case NL80211_IFTYPE_MESH_POINT: - if (ieee80211_vif_is_mesh(&sdata->vif)) - mesh_rmc_free(sdata); - break; - case NL80211_IFTYPE_ADHOC: - if (WARN_ON(sdata->u.ibss.presp)) - kfree_skb(sdata->u.ibss.presp); - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_WDS: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_MONITOR: - break; - case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: - BUG(); - break; - } + if (ieee80211_vif_is_mesh(&sdata->vif)) + mesh_rmc_free(sdata); flushed = sta_info_flush(local, sdata); WARN_ON(flushed); @@ -847,6 +846,9 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->dev->netdev_ops = &ieee80211_dataif_ops; sdata->wdev.iftype = type; + sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE); + sdata->control_port_no_encrypt = false; + /* only monitor differs */ sdata->dev->type = ARPHRD_ETHER; @@ -878,7 +880,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP_VLAN: break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: BUG(); break; } @@ -886,9 +888,72 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, ieee80211_debugfs_add_netdev(sdata); } +static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + struct ieee80211_local *local = sdata->local; + int ret, err; + + ASSERT_RTNL(); + + if (!local->ops->change_interface) + return -EBUSY; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + /* + * Could maybe also all others here? + * Just not sure how that interacts + * with the RX/config path e.g. for + * mesh. 
+ */ + break; + default: + return -EBUSY; + } + + switch (type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + /* + * Could probably support everything + * but WDS here (WDS do_open can fail + * under memory pressure, which this + * code isn't prepared to handle). + */ + break; + default: + return -EBUSY; + } + + ret = ieee80211_check_concurrent_iface(sdata, type); + if (ret) + return ret; + + ieee80211_do_stop(sdata, false); + + ieee80211_teardown_sdata(sdata->dev); + + ret = drv_change_interface(local, sdata, type); + if (ret) + type = sdata->vif.type; + + ieee80211_setup_sdata(sdata, type); + + err = ieee80211_do_open(sdata->dev, false); + WARN(err, "type change: do_open returned %d", err); + + return ret; +} + int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type) { + int ret; + ASSERT_RTNL(); if (type == sdata->vif.type) @@ -899,18 +964,15 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, type == NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; - /* - * We could, here, on changes between IBSS/STA/MESH modes, - * invoke an MLME function instead that disassociates etc. - * and goes into the requested mode. - */ - - if (ieee80211_sdata_running(sdata)) - return -EBUSY; - - /* Purge and reset type-dependent state. */ - ieee80211_teardown_sdata(sdata->dev); - ieee80211_setup_sdata(sdata, type); + if (ieee80211_sdata_running(sdata)) { + ret = ieee80211_runtime_change_iftype(sdata, type); + if (ret) + return ret; + } else { + /* Purge and reset type-dependent state. */ + ieee80211_teardown_sdata(sdata->dev); + ieee80211_setup_sdata(sdata, type); + } /* reset some values that shouldn't be kept across type changes */ sdata->vif.bss_conf.basic_rates = @@ -1167,8 +1229,7 @@ static u32 ieee80211_idle_off(struct ieee80211_local *local, return 0; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: device no longer idle - %s\n", - wiphy_name(local->hw.wiphy), reason); + wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason); #endif local->hw.conf.flags &= ~IEEE80211_CONF_IDLE; @@ -1181,8 +1242,7 @@ static u32 ieee80211_idle_on(struct ieee80211_local *local) return 0; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: device now idle\n", - wiphy_name(local->hw.wiphy)); + wiphy_debug(local->hw.wiphy, "device now idle\n"); #endif drv_flush(local, false); @@ -1195,28 +1255,61 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; int count = 0; + bool working = false, scanning = false; + struct ieee80211_work *wk; - if (!list_empty(&local->work_list)) - return ieee80211_idle_off(local, "working"); - - if (local->scanning) - return ieee80211_idle_off(local, "scanning"); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON(debug_locks && !lockdep_rtnl_is_held() && + !lockdep_is_held(&local->iflist_mtx)); +#endif + lockdep_assert_held(&local->mtx); list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) + if (!ieee80211_sdata_running(sdata)) { + sdata->vif.bss_conf.idle = true; continue; + } + + sdata->old_idle = sdata->vif.bss_conf.idle; + /* do not count disabled managed interfaces */ if (sdata->vif.type == NL80211_IFTYPE_STATION && - !sdata->u.mgd.associated) + !sdata->u.mgd.associated) { + sdata->vif.bss_conf.idle = true; continue; + } /* do not count unused IBSS interfaces */ if (sdata->vif.type == NL80211_IFTYPE_ADHOC && - !sdata->u.ibss.ssid_len) + !sdata->u.ibss.ssid_len) { + sdata->vif.bss_conf.idle = 
true; continue; + } /* count everything else */ count++; } + list_for_each_entry(wk, &local->work_list, list) { + working = true; + wk->sdata->vif.bss_conf.idle = false; + } + + if (local->scan_sdata) { + scanning = true; + local->scan_sdata->vif.bss_conf.idle = false; + } + + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->old_idle == sdata->vif.bss_conf.idle) + continue; + if (!ieee80211_sdata_running(sdata)) + continue; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + } + + if (working) + return ieee80211_idle_off(local, "working"); + if (scanning) + return ieee80211_idle_off(local, "scanning"); if (!count) return ieee80211_idle_on(local); else diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 1b9d87e..3570f8c 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -60,7 +60,7 @@ static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key) return NULL; } -static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) +static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; struct ieee80211_sta *sta; @@ -68,8 +68,10 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) might_sleep(); - if (!key->local->ops->set_key) - return; + if (!key->local->ops->set_key) { + ret = -EOPNOTSUPP; + goto out_unsupported; + } assert_key_lock(key->local); @@ -87,10 +89,27 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; if (ret && ret != -ENOSPC && ret != -EOPNOTSUPP) - printk(KERN_ERR "mac80211-%s: failed to set key " - "(%d, %pM) to hardware (%d)\n", - wiphy_name(key->local->hw.wiphy), - key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + wiphy_err(key->local->hw.wiphy, + "failed to set key (%d, %pM) to hardware (%d)\n", + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + +out_unsupported: + if (ret) { + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + case WLAN_CIPHER_SUITE_TKIP: + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_AES_CMAC: + /* all of these we can do in software */ + ret = 0; + break; + default: + ret = -EINVAL; + } + } + + return ret; } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) @@ -121,10 +140,9 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta, &key->conf); if (ret) - printk(KERN_ERR "mac80211-%s: failed to remove key " - "(%d, %pM) from hardware (%d)\n", - wiphy_name(key->local->hw.wiphy), - key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + wiphy_err(key->local->hw.wiphy, + "failed to remove key (%d, %pM) from hardware (%d)\n", + key->conf.keyidx, sta ? 
sta->addr : bcast_addr, ret); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; } @@ -227,20 +245,18 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, } } -struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, - int idx, - size_t key_len, +struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, const u8 *key_data, size_t seq_len, const u8 *seq) { struct ieee80211_key *key; - int i, j; + int i, j, err; BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) - return NULL; + return ERR_PTR(-ENOMEM); /* * Default to software encryption; we'll later upload the @@ -249,15 +265,16 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, key->conf.flags = 0; key->flags = 0; - key->conf.alg = alg; + key->conf.cipher = cipher; key->conf.keyidx = idx; key->conf.keylen = key_len; - switch (alg) { - case ALG_WEP: + switch (cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: key->conf.iv_len = WEP_IV_LEN; key->conf.icv_len = WEP_ICV_LEN; break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: key->conf.iv_len = TKIP_IV_LEN; key->conf.icv_len = TKIP_ICV_LEN; if (seq) { @@ -269,7 +286,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, } } break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: key->conf.iv_len = CCMP_HDR_LEN; key->conf.icv_len = CCMP_MIC_LEN; if (seq) { @@ -278,42 +295,38 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, key->u.ccmp.rx_pn[i][j] = seq[CCMP_PN_LEN - j - 1]; } - break; - case ALG_AES_CMAC: - key->conf.iv_len = 0; - key->conf.icv_len = sizeof(struct ieee80211_mmie); - if (seq) - for (j = 0; j < 6; j++) - key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1]; - break; - } - memcpy(key->conf.key, key_data, key_len); - INIT_LIST_HEAD(&key->list); - - if (alg == ALG_CCMP) { /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. */ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data); - if (!key->u.ccmp.tfm) { + if (IS_ERR(key->u.ccmp.tfm)) { + err = PTR_ERR(key->u.ccmp.tfm); kfree(key); - return NULL; + key = ERR_PTR(err); } - } - - if (alg == ALG_AES_CMAC) { + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + key->conf.iv_len = 0; + key->conf.icv_len = sizeof(struct ieee80211_mmie); + if (seq) + for (j = 0; j < 6; j++) + key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1]; /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. 
*/ key->u.aes_cmac.tfm = ieee80211_aes_cmac_key_setup(key_data); - if (!key->u.aes_cmac.tfm) { + if (IS_ERR(key->u.aes_cmac.tfm)) { + err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); - return NULL; + key = ERR_PTR(err); } + break; } + memcpy(key->conf.key, key_data, key_len); + INIT_LIST_HEAD(&key->list); return key; } @@ -326,9 +339,9 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) if (key->local) ieee80211_key_disable_hw_accel(key); - if (key->conf.alg == ALG_CCMP) + if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP) ieee80211_aes_key_free(key->u.ccmp.tfm); - if (key->conf.alg == ALG_AES_CMAC) + if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); if (key->local) ieee80211_debugfs_key_remove(key); @@ -336,12 +349,12 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) kfree(key); } -void ieee80211_key_link(struct ieee80211_key *key, - struct ieee80211_sub_if_data *sdata, - struct sta_info *sta) +int ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) { struct ieee80211_key *old_key; - int idx; + int idx, ret; BUG_ON(!sdata); BUG_ON(!key); @@ -396,9 +409,11 @@ void ieee80211_key_link(struct ieee80211_key *key, ieee80211_debugfs_key_add(key); - ieee80211_key_enable_hw_accel(key); + ret = ieee80211_key_enable_hw_accel(key); mutex_unlock(&sdata->local->key_mtx); + + return ret; } static void __ieee80211_key_free(struct ieee80211_key *key) diff --git a/net/mac80211/key.h b/net/mac80211/key.h index b665bbb..cb9a4a6 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -123,18 +123,16 @@ struct ieee80211_key { struct ieee80211_key_conf conf; }; -struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, - int idx, - size_t key_len, +struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, const u8 *key_data, size_t seq_len, const u8 *seq); /* * Insert a key into data structures (sdata, sta if necessary) * to make it used, free old key. 
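+ * Returns 0 on success or the error from enabling hardware acceleration; the result is now __must_check.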
*/ -void ieee80211_key_link(struct ieee80211_key *key, - struct ieee80211_sub_if_data *sdata, - struct sta_info *sta); +int __must_check ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta); void ieee80211_key_free(struct ieee80211_local *local, struct ieee80211_key *key); void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index ded5c38..4935b84 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -99,11 +99,13 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) int ret = 0; int power; enum nl80211_channel_type channel_type; + u32 offchannel_flag; might_sleep(); scan_chan = local->scan_channel; + offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; if (scan_chan) { chan = scan_chan; channel_type = NL80211_CHAN_NO_HT; @@ -117,8 +119,9 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) channel_type = local->_oper_channel_type; local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL; } + offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; - if (chan != local->hw.conf.channel || + if (offchannel_flag || chan != local->hw.conf.channel || channel_type != local->hw.conf.channel_type) { local->hw.conf.channel = chan; local->hw.conf.channel_type = channel_type; @@ -302,7 +305,13 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) trace_api_restart_hw(local); - /* use this reason, __ieee80211_resume will unblock it */ + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s called with hardware scan in progress\n", __func__); + + if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) + ieee80211_scan_cancel(local); + + /* use this reason, ieee80211_reconfig will unblock it */ ieee80211_stop_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_SUSPEND); @@ -336,9 +345,6 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, struct ieee80211_if_managed *ifmgd; int c = 0; - if (!netif_running(ndev)) - return NOTIFY_DONE; - /* Make sure it's our interface that got changed */ if (!wdev) return NOTIFY_DONE; @@ -349,6 +355,9 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata = IEEE80211_DEV_TO_SUB_IF(ndev); bss_conf = &sdata->vif.bss_conf; + if (!ieee80211_sdata_running(sdata)) + return NOTIFY_DONE; + /* ARP filtering is only supported in managed mode */ if (sdata->vif.type != NL80211_IFTYPE_STATION) return NOTIFY_DONE; @@ -390,6 +399,65 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, } #endif +static int ieee80211_napi_poll(struct napi_struct *napi, int budget) +{ + struct ieee80211_local *local = + container_of(napi, struct ieee80211_local, napi); + + return local->ops->napi_poll(&local->hw, budget); +} + +void ieee80211_napi_schedule(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + napi_schedule(&local->napi); +} +EXPORT_SYMBOL(ieee80211_napi_schedule); + +void ieee80211_napi_complete(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + napi_complete(&local->napi); +} +EXPORT_SYMBOL(ieee80211_napi_complete); + +/* There isn't a lot of sense in it, but you can transmit anything you like */ +static const struct ieee80211_txrx_stypes +ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { + [NL80211_IFTYPE_ADHOC] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_STATION] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + 
BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + }, + [NL80211_IFTYPE_AP] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_AP_VLAN] = { + /* copy AP */ + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, +}; + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -419,6 +487,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, if (!wiphy) return NULL; + wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes; + wiphy->flags |= WIPHY_FLAG_NETNS_OK | WIPHY_FLAG_4ADDR_AP | WIPHY_FLAG_4ADDR_STATION; @@ -455,7 +525,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, __hw_addr_init(&local->mc_list); mutex_init(&local->iflist_mtx); - mutex_init(&local->scan_mtx); + mutex_init(&local->mtx); mutex_init(&local->key_mtx); spin_lock_init(&local->filter_lock); @@ -494,6 +564,9 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); + /* init dummy netdev for use w/ NAPI */ + init_dummy_netdev(&local->napi_dev); + return local_to_hw(local); } EXPORT_SYMBOL(ieee80211_alloc_hw); @@ -506,6 +579,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int channels, max_bitrates; bool supp_ht; static const u32 cipher_suites[] = { + /* keep WEP first, it may be removed below */ WLAN_CIPHER_SUITE_WEP40, WLAN_CIPHER_SUITE_WEP104, WLAN_CIPHER_SUITE_TKIP, @@ -554,6 +628,14 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 always supports monitor */ local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); +#ifndef CONFIG_MAC80211_MESH + /* mesh depends on Kconfig, but drivers should set it if they want */ + local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT); +#endif + + /* mac80211 supports control port protocol changing */ + local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) @@ -589,10 +671,41 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (local->hw.wiphy->max_scan_ie_len) local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; - local->hw.wiphy->cipher_suites = cipher_suites; - local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); - if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) - local->hw.wiphy->n_cipher_suites--; + /* Set up cipher suites unless driver already did */ + if (!local->hw.wiphy->cipher_suites) { + local->hw.wiphy->cipher_suites = cipher_suites; + local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); + if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) + local->hw.wiphy->n_cipher_suites--; + } + if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) { + if (local->hw.wiphy->cipher_suites == cipher_suites) { + local->hw.wiphy->cipher_suites += 2; + local->hw.wiphy->n_cipher_suites -= 2; + } else { + u32 *suites; + int r, w = 0; + + /* Filter out WEP */ + + suites = kmemdup( + 
local->hw.wiphy->cipher_suites, + sizeof(u32) * local->hw.wiphy->n_cipher_suites, + GFP_KERNEL); + if (!suites) + return -ENOMEM; + for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) { + u32 suite = local->hw.wiphy->cipher_suites[r]; + if (suite == WLAN_CIPHER_SUITE_WEP40 || + suite == WLAN_CIPHER_SUITE_WEP104) + continue; + suites[w++] = suite; + } + local->hw.wiphy->cipher_suites = suites; + local->hw.wiphy->n_cipher_suites = w; + local->wiphy_ciphers_allocated = true; + } + } result = wiphy_register(local->hw.wiphy); if (result < 0) @@ -641,16 +754,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) result = ieee80211_wep_init(local); if (result < 0) - printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", - wiphy_name(local->hw.wiphy), result); + wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", + result); rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, hw->rate_control_algorithm); if (result < 0) { - printk(KERN_DEBUG "%s: Failed to initialize rate control " - "algorithm\n", wiphy_name(local->hw.wiphy)); + wiphy_debug(local->hw.wiphy, + "Failed to initialize rate control algorithm\n"); goto fail_rate; } @@ -659,8 +772,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) result = ieee80211_if_add(local, "wlan%d", NULL, NL80211_IFTYPE_STATION, NULL); if (result) - printk(KERN_WARNING "%s: Failed to add default virtual iface\n", - wiphy_name(local->hw.wiphy)); + wiphy_warn(local->hw.wiphy, + "Failed to add default virtual iface\n"); } rtnl_unlock(); @@ -683,6 +796,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_ifa; #endif + netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll, + local->hw.napi_weight); + return 0; #ifdef CONFIG_INET @@ -703,6 +819,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) fail_workqueue: wiphy_unregister(local->hw.wiphy); fail_wiphy_register: + if (local->wiphy_ciphers_allocated) + kfree(local->hw.wiphy->cipher_suites); kfree(local->int_scan_req); return result; } @@ -738,6 +856,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) */ del_timer_sync(&local->work_timer); + cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); ieee80211_clear_tx_pending(local); @@ -746,8 +865,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) if (skb_queue_len(&local->skb_queue) || skb_queue_len(&local->skb_queue_unreliable)) - printk(KERN_WARNING "%s: skb_queue not empty\n", - wiphy_name(local->hw.wiphy)); + wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); @@ -764,7 +882,10 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) struct ieee80211_local *local = hw_to_local(hw); mutex_destroy(&local->iflist_mtx); - mutex_destroy(&local->scan_mtx); + mutex_destroy(&local->mtx); + + if (local->wiphy_ciphers_allocated) + kfree(local->hw.wiphy->cipher_suites); wiphy_free(local->hw.wiphy); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b6c163a..0cb822c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -54,6 +54,12 @@ */ #define IEEE80211_SIGNAL_AVE_WEIGHT 3 +/* + * How many Beacon frames need to have been used in average signal strength + * before starting to indicate signal change events. 
+ */ +#define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 + #define TMR_RUNNING_TIMER 0 #define TMR_RUNNING_CHANSW 1 @@ -778,16 +784,17 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, params.uapsd = uapsd; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d " - "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", - wiphy_name(local->hw.wiphy), queue, aci, acm, - params.aifs, params.cw_min, params.cw_max, params.txop, - params.uapsd); + wiphy_debug(local->hw.wiphy, + "WMM queue=%d aci=%d acm=%d aifs=%d " + "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", + queue, aci, acm, + params.aifs, params.cw_min, params.cw_max, + params.txop, params.uapsd); #endif if (drv_conf_tx(local, queue, ¶ms)) - printk(KERN_DEBUG "%s: failed to set TX queue " - "parameters for queue %d\n", - wiphy_name(local->hw.wiphy), queue); + wiphy_debug(local->hw.wiphy, + "failed to set TX queue parameters for queue %d\n", + queue); } /* enable WMM or activate new settings */ @@ -990,6 +997,11 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, if (remove_sta) sta_info_destroy_addr(sdata, bssid); + + del_timer_sync(&sdata->u.mgd.conn_mon_timer); + del_timer_sync(&sdata->u.mgd.bcn_mon_timer); + del_timer_sync(&sdata->u.mgd.timer); + del_timer_sync(&sdata->u.mgd.chswitch_timer); } void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, @@ -1103,8 +1115,11 @@ static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid); ieee80211_set_disassoc(sdata, true); - ieee80211_recalc_idle(local); mutex_unlock(&ifmgd->mtx); + + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); /* * must be outside lock due to cfg80211, * but that's not a problem. 
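For reference, the average gated by IEEE80211_SIGNAL_AVE_MIN_COUNT above is a fixed-point EWMA kept in units of 1/16 dBm, updated in ieee80211_rx_mgmt_beacon() below. A minimal standalone sketch of the same arithmetic -- the signal values and the simplified names here are illustrative, not taken from the patch:

#include <stdio.h>

#define SIGNAL_AVE_WEIGHT	3	/* mirrors IEEE80211_SIGNAL_AVE_WEIGHT */
#define SIGNAL_AVE_MIN_COUNT	4	/* mirrors IEEE80211_SIGNAL_AVE_MIN_COUNT */

int main(void)
{
	/* hypothetical per-beacon signal readings in dBm */
	int signal[] = { -55, -57, -70, -72, -71, -70 };
	int ave = 0, count = 0;
	unsigned int i;

	for (i = 0; i < sizeof(signal) / sizeof(signal[0]); i++) {
		if (count == 0)
			ave = signal[i] * 16;	/* RESET_SIGNAL_AVE path: seed */
		else
			ave = (SIGNAL_AVE_WEIGHT * signal[i] * 16 +
			       (16 - SIGNAL_AVE_WEIGHT) * ave) / 16;
		count++;
		/* CQM events are only considered once enough beacons counted */
		if (count >= SIGNAL_AVE_MIN_COUNT)
			printf("beacon %u: average %d dBm\n", i, ave / 16);
	}
	return 0;
}

With a weight of 3/16, each new beacon moves the average roughly a fifth of the way toward the new reading, so a sudden drop takes several beacons to cross a CQM threshold; the four-beacon warm-up likewise avoids spurious events right after the average is reset.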
@@ -1173,7 +1188,9 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, sdata->name, bssid, reason_code); ieee80211_set_disassoc(sdata, true); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return RX_MGMT_CFG80211_DEAUTH; } @@ -1203,7 +1220,9 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, sdata->name, mgmt->sa, reason_code); ieee80211_set_disassoc(sdata, true); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return RX_MGMT_CFG80211_DISASSOC; } @@ -1540,15 +1559,18 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->last_beacon_signal = rx_status->signal; if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; - ifmgd->ave_beacon_signal = rx_status->signal; + ifmgd->ave_beacon_signal = rx_status->signal * 16; ifmgd->last_cqm_event_signal = 0; + ifmgd->count_beacon_signal = 1; } else { ifmgd->ave_beacon_signal = (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 + (16 - IEEE80211_SIGNAL_AVE_WEIGHT) * ifmgd->ave_beacon_signal) / 16; + ifmgd->count_beacon_signal++; } if (bss_conf->cqm_rssi_thold && + ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) { int sig = ifmgd->ave_beacon_signal / 16; int last_event = ifmgd->last_cqm_event_signal; @@ -1751,7 +1773,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_work *wk; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; @@ -1783,7 +1805,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, free_work(wk); break; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); } @@ -1840,8 +1862,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) " after %dms, disconnecting.\n", bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ); ieee80211_set_disassoc(sdata, true); - ieee80211_recalc_idle(local); mutex_unlock(&ifmgd->mtx); + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); /* * must be outside lock due to cfg80211, * but that's not a problem. @@ -1917,6 +1941,8 @@ void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata) * time -- the code here is properly synchronised. 
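+ * The SMPS request and beacon-loss works are cancelled below for the same reason.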
*/ + cancel_work_sync(&ifmgd->request_smps_work); + cancel_work_sync(&ifmgd->beacon_connection_loss_work); if (del_timer_sync(&ifmgd->timer)) set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); @@ -1952,6 +1978,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work); INIT_WORK(&ifmgd->beacon_connection_loss_work, ieee80211_beacon_connection_loss_work); + INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_work); setup_timer(&ifmgd->timer, ieee80211_sta_timer, (unsigned long) sdata); setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, @@ -2249,6 +2276,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, else ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT; + sdata->control_port_protocol = req->crypto.control_port_ethertype; + sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; + ieee80211_add_work(wk); return 0; } @@ -2275,7 +2305,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, mutex_unlock(&ifmgd->mtx); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; @@ -2294,7 +2324,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, free_work(wk); break; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); /* * If somebody requests authentication and we haven't @@ -2319,7 +2349,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, if (assoc_bss) sta_info_destroy_addr(sdata, bssid); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return 0; } @@ -2357,7 +2389,9 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, cookie, !req->local_state_change); sta_info_destroy_addr(sdata, bssid); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return 0; } diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index c36b191..eeacaa5 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -112,8 +112,10 @@ void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local) * used from user space controlled off-channel operations. */ if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_MONITOR) + sdata->vif.type != NL80211_IFTYPE_MONITOR) { + set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); netif_tx_stop_all_queues(sdata->dev); + } } mutex_unlock(&local->iflist_mtx); } @@ -131,6 +133,7 @@ void ieee80211_offchannel_stop_station(struct ieee80211_local *local) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { + set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); netif_tx_stop_all_queues(sdata->dev); if (sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata); @@ -155,8 +158,20 @@ void ieee80211_offchannel_return(struct ieee80211_local *local, ieee80211_offchannel_ps_disable(sdata); } - if (sdata->vif.type != NL80211_IFTYPE_MONITOR) + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { + clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); + /* + * This may wake up queues even though the driver + * currently has them stopped. This is not very + * likely, since the driver won't have gotten any + * (or hardly any) new packets while we weren't + * on the right channel, and even if it happens + * it will at most lead to queueing up one more + * packet per queue in mac80211 rather than on + * the interface qdisc. 
+ */ netif_tx_wake_all_queues(sdata->dev); + } /* re-enable beaconing */ if (enable_beaconing && diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index d287fde..ce671df 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -12,7 +12,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw) struct ieee80211_sub_if_data *sdata; struct sta_info *sta; - ieee80211_scan_cancel(local); + if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) + ieee80211_scan_cancel(local); ieee80211_stop_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_SUSPEND); diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index be04d461..4f772de 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -368,8 +368,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, ref = rate_control_alloc(name, local); if (!ref) { - printk(KERN_WARNING "%s: Failed to select rate control " - "algorithm\n", wiphy_name(local->hw.wiphy)); + wiphy_warn(local->hw.wiphy, + "Failed to select rate control algorithm\n"); return -ENOENT; } @@ -380,9 +380,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, sta_info_flush(local, NULL); } - printk(KERN_DEBUG "%s: Selected rate control " - "algorithm '%s'\n", wiphy_name(local->hw.wiphy), - ref->ops->name); + wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n", + ref->ops->name); return 0; } diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c index 47438b4..135f36f 100644 --- a/net/mac80211/rc80211_pid_debugfs.c +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -162,7 +162,7 @@ static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf, file_info->next_entry = (file_info->next_entry + 1) % RC_PID_EVENT_RING_SIZE; - /* Print information about the event. Note that userpace needs to + /* Print information about the event. Note that userspace needs to * provide large enough buffers. */ length = length < RC_PID_PRINT_BUF_SIZE ? length : RC_PID_PRINT_BUF_SIZE; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index fa0f37e..ac205a3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -538,20 +538,12 @@ static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw, int index, struct sk_buff_head *frames) { - struct ieee80211_supported_band *sband; - struct ieee80211_rate *rate = NULL; struct sk_buff *skb = tid_agg_rx->reorder_buf[index]; - struct ieee80211_rx_status *status; if (!skb) goto no_frame; - status = IEEE80211_SKB_RXCB(skb); - - /* release the reordered frames to stack */ - sband = hw->wiphy->bands[status->band]; - if (!(status->flag & RX_FLAG_HT)) - rate = &sband->bitrates[status->rate_idx]; + /* release the frame from the reorder ring buffer */ tid_agg_rx->stored_mpdu_num--; tid_agg_rx->reorder_buf[index] = NULL; __skb_queue_tail(frames, skb); @@ -580,9 +572,78 @@ static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw, * frames that have not yet been received are assumed to be lost and the skb * can be released for processing. This may also release other skb's from the * reorder buffer if there are no additional gaps between the frames. + * + * Callers must hold tid_agg_rx->reorder_lock. 
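+ * The ring slot for a sequence number is seq_sub(seq, ssn) % buf_size, and head_seq_num is advanced past every released or timed-out slot.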
*/ #define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10) +static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + struct sk_buff_head *frames) +{ + int index, j; + + /* release the buffer until next missing frame */ + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + if (!tid_agg_rx->reorder_buf[index] && + tid_agg_rx->stored_mpdu_num > 1) { + /* + * No buffers ready to be released, but check whether any + * frames in the reorder buffer have timed out. + */ + int skipped = 1; + for (j = (index + 1) % tid_agg_rx->buf_size; j != index; + j = (j + 1) % tid_agg_rx->buf_size) { + if (!tid_agg_rx->reorder_buf[j]) { + skipped++; + continue; + } + if (!time_after(jiffies, tid_agg_rx->reorder_time[j] + + HT_RX_REORDER_BUF_TIMEOUT)) + goto set_release_timer; + +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + wiphy_debug(hw->wiphy, + "release an RX reorder frame due to timeout on earlier frames\n"); +#endif + ieee80211_release_reorder_frame(hw, tid_agg_rx, + j, frames); + + /* + * Increment the head seq# also for the skipped slots. + */ + tid_agg_rx->head_seq_num = + (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; + skipped = 0; + } + } else while (tid_agg_rx->reorder_buf[index]) { + ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames); + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + } + + if (tid_agg_rx->stored_mpdu_num) { + j = index = seq_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; + + for (; j != (index - 1) % tid_agg_rx->buf_size; + j = (j + 1) % tid_agg_rx->buf_size) { + if (tid_agg_rx->reorder_buf[j]) + break; + } + + set_release_timer: + + mod_timer(&tid_agg_rx->reorder_timer, + tid_agg_rx->reorder_time[j] + + HT_RX_REORDER_BUF_TIMEOUT); + } else { + del_timer(&tid_agg_rx->reorder_timer); + } +} + /* * As this function belongs to the RX path it must be under * rcu_read_lock protection. It returns false if the frame @@ -598,14 +659,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; u16 head_seq_num, buf_size; int index; + bool ret = true; buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; + spin_lock(&tid_agg_rx->reorder_lock); /* frame with out of date sequence number */ if (seq_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); - return true; + goto out; } /* @@ -626,7 +689,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, /* check if we already stored this frame */ if (tid_agg_rx->reorder_buf[index]) { dev_kfree_skb(skb); - return true; + goto out; } /* @@ -636,58 +699,19 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, if (mpdu_seq_num == tid_agg_rx->head_seq_num && tid_agg_rx->stored_mpdu_num == 0) { tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); - return false; + ret = false; + goto out; } /* put the frame in the reordering buffer */ tid_agg_rx->reorder_buf[index] = skb; tid_agg_rx->reorder_time[index] = jiffies; tid_agg_rx->stored_mpdu_num++; - /* release the buffer until next missing frame */ - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % - tid_agg_rx->buf_size; - if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { - /* - * No buffers ready to be released, but check whether any - * frames in the reorder buffer have timed out. 
- */ - int j; - int skipped = 1; - for (j = (index + 1) % tid_agg_rx->buf_size; j != index; - j = (j + 1) % tid_agg_rx->buf_size) { - if (!tid_agg_rx->reorder_buf[j]) { - skipped++; - continue; - } - if (!time_after(jiffies, tid_agg_rx->reorder_time[j] + - HT_RX_REORDER_BUF_TIMEOUT)) - break; + ieee80211_sta_reorder_release(hw, tid_agg_rx, frames); -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_DEBUG "%s: release an RX reorder " - "frame due to timeout on earlier " - "frames\n", - wiphy_name(hw->wiphy)); -#endif - ieee80211_release_reorder_frame(hw, tid_agg_rx, - j, frames); - - /* - * Increment the head seq# also for the skipped slots. - */ - tid_agg_rx->head_seq_num = - (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; - skipped = 0; - } - } else while (tid_agg_rx->reorder_buf[index]) { - ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames); - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % - tid_agg_rx->buf_size; - } - - return true; + out: + spin_unlock(&tid_agg_rx->reorder_lock); + return ret; } /* @@ -873,6 +897,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (!is_multicast_ether_addr(hdr->addr1) && stakey) { rx->key = stakey; + if ((status->flag & RX_FLAG_DECRYPTED) && + (status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; /* Skip decryption if the frame is not protected. */ if (!ieee80211_has_protected(fc)) return RX_CONTINUE; @@ -935,7 +962,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * pairwise or station-to-station keys, but for WEP we allow * using a key index as well. */ - if (rx->key && rx->key->conf.alg != ALG_WEP && + if (rx->key && rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 && !is_multicast_ether_addr(hdr->addr1)) rx->key = NULL; } @@ -951,8 +979,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; /* the hdr variable is invalid now! */ - switch (rx->key->conf.alg) { - case ALG_WEP: + switch (rx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: /* Check for weak IVs if possible */ if (rx->sta && ieee80211_is_data(fc) && (!(status->flag & RX_FLAG_IV_STRIPPED) || @@ -962,15 +991,21 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) result = ieee80211_crypto_wep_decrypt(rx); break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: result = ieee80211_crypto_tkip_decrypt(rx); break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: result = ieee80211_crypto_ccmp_decrypt(rx); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: result = ieee80211_crypto_aes_cmac_decrypt(rx); break; + default: + /* + * We can reach here only with HW-only algorithms + * but why didn't it decrypt the frame?! + */ + return RX_DROP_UNUSABLE; } /* either the frame has been decrypted or will be dropped */ @@ -1265,7 +1300,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) /* This is the first fragment of a new frame. */ entry = ieee80211_reassemble_add(rx->sdata, frag, seq, rx->queue, &(rx->skb)); - if (rx->key && rx->key->conf.alg == ALG_CCMP && + if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP && ieee80211_has_protected(fc)) { int queue = ieee80211_is_mgmt(fc) ? 
NUM_RX_DATA_QUEUES : rx->queue; @@ -1294,7 +1329,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) int i; u8 pn[CCMP_PN_LEN], *rpn; int queue; - if (!rx->key || rx->key->conf.alg != ALG_CCMP) + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP) return RX_DROP_UNUSABLE; memcpy(pn, entry->last_pn, CCMP_PN_LEN); for (i = CCMP_PN_LEN - 1; i >= 0; i--) { @@ -1492,7 +1527,7 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc) * Allow EAPOL frames to us/the PAE group address regardless * of whether the frame was encrypted or not. */ - if (ehdr->h_proto == htons(ETH_P_PAE) && + if (ehdr->h_proto == rx->sdata->control_port_protocol && (compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 || compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0)) return true; @@ -1909,13 +1944,36 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, } static ieee80211_rx_result debug_noinline +ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + + /* + * From here on, look only at management frames. + * Data and control frames are already handled, + * and unknown (reserved) frames are useless. + */ + if (rx->skb->len < 24) + return RX_DROP_MONITOR; + + if (!ieee80211_is_mgmt(mgmt->frame_control)) + return RX_DROP_MONITOR; + + if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + return RX_DROP_MONITOR; + + if (ieee80211_drop_unencrypted_mgmt(rx)) + return RX_DROP_UNUSABLE; + + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; - struct sk_buff *nskb; - struct ieee80211_rx_status *status; int len = rx->skb->len; if (!ieee80211_is_action(mgmt->frame_control)) @@ -1931,9 +1989,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (!(rx->flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_UNUSABLE; - if (ieee80211_drop_unencrypted_mgmt(rx)) - return RX_DROP_UNUSABLE; - switch (mgmt->u.action.category) { case WLAN_CATEGORY_BACK: /* @@ -2024,17 +2079,36 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto queue; } + return RX_CONTINUE; + invalid: - /* - * For AP mode, hostapd is responsible for handling any action - * frames that we didn't handle, including returning unknown - * ones. For all other modes we will return them to the sender, - * setting the 0x80 bit in the action category, as required by - * 802.11-2007 7.3.1.11. 
- */ - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - return RX_DROP_MONITOR; + rx->flags |= IEEE80211_MALFORMED_ACTION_FRM; + /* will return in the next handlers */ + return RX_CONTINUE; + + handled: + if (rx->sta) + rx->sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + + queue: + rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&sdata->skb_queue, rx->skb); + ieee80211_queue_work(&local->hw, &sdata->work); + if (rx->sta) + rx->sta->rx_packets++; + return RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_rx_status *status; + + /* skip known-bad action frames and return them in the next handler */ + if (rx->flags & IEEE80211_MALFORMED_ACTION_FRM) + return RX_CONTINUE; /* * Getting here means the kernel doesn't know how to handle @@ -2044,10 +2118,44 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) */ status = IEEE80211_SKB_RXCB(rx->skb); - if (cfg80211_rx_action(rx->sdata->dev, status->freq, - rx->skb->data, rx->skb->len, - GFP_ATOMIC)) - goto handled; + if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq, + rx->skb->data, rx->skb->len, + GFP_ATOMIC)) { + if (rx->sta) + rx->sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + } + + + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct sk_buff *nskb; + struct ieee80211_sub_if_data *sdata = rx->sdata; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + /* + * For AP mode, hostapd is responsible for handling any action + * frames that we didn't handle, including returning unknown + * ones. For all other modes we will return them to the sender, + * setting the 0x80 bit in the action category, as required by + * 802.11-2007 7.3.1.11. + * Newer versions of hostapd shall also use the management frame + * registration mechanisms, but older ones still use cooked + * monitor interfaces so push all frames there. 
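+ * Frames already carrying the 0x80 'rejected' category bit are never bounced back again, so two peers cannot ping-pong the same rejection forever.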
+ */ + if (!(rx->flags & IEEE80211_MALFORMED_ACTION_FRM) && + (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) + return RX_DROP_MONITOR; /* do not return rejected action frames */ if (mgmt->u.action.category & 0x80) @@ -2066,20 +2174,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) ieee80211_tx_skb(rx->sdata, nskb); } - - handled: - if (rx->sta) - rx->sta->rx_packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; - - queue: - rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_packets++; - return RX_QUEUED; } static ieee80211_rx_result debug_noinline @@ -2090,15 +2186,6 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt = (void *)rx->skb->data; __le16 stype; - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_MONITOR; - - if (rx->skb->len < 24) - return RX_DROP_MONITOR; - - if (ieee80211_drop_unencrypted_mgmt(rx)) - return RX_DROP_UNUSABLE; - rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb); if (rxs != RX_CONTINUE) return rxs; @@ -2267,19 +2354,46 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, dev_kfree_skb(skb); } +static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, + ieee80211_rx_result res) +{ + switch (res) { + case RX_DROP_MONITOR: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); + if (rx->sta) + rx->sta->rx_dropped++; + /* fall through */ + case RX_CONTINUE: { + struct ieee80211_rate *rate = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_rx_status *status; + + status = IEEE80211_SKB_RXCB((rx->skb)); + + sband = rx->local->hw.wiphy->bands[status->band]; + if (!(status->flag & RX_FLAG_HT)) + rate = &sband->bitrates[status->rate_idx]; + + ieee80211_rx_cooked_monitor(rx, rate); + break; + } + case RX_DROP_UNUSABLE: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); + if (rx->sta) + rx->sta->rx_dropped++; + dev_kfree_skb(rx->skb); + break; + case RX_QUEUED: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued); + break; + } +} -static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, - struct ieee80211_rx_data *rx, - struct sk_buff *skb, - struct ieee80211_rate *rate) +static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, + struct sk_buff_head *frames) { - struct sk_buff_head reorder_release; ieee80211_rx_result res = RX_DROP_MONITOR; - - __skb_queue_head_init(&reorder_release); - - rx->skb = skb; - rx->sdata = sdata; + struct sk_buff *skb; #define CALL_RXH(rxh) \ do { \ @@ -2288,17 +2402,7 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, goto rxh_next; \ } while (0); - /* - * NB: the rxh_next label works even if we jump - * to it from here because then the list will - * be empty, which is a trivial check - */ - CALL_RXH(ieee80211_rx_h_passive_scan) - CALL_RXH(ieee80211_rx_h_check) - - ieee80211_rx_reorder_ampdu(rx, &reorder_release); - - while ((skb = __skb_dequeue(&reorder_release))) { + while ((skb = __skb_dequeue(frames))) { /* * all the other fields are valid across frames * that belong to an aMPDU since they are on the @@ -2316,42 +2420,95 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, CALL_RXH(ieee80211_rx_h_remove_qos_control) CALL_RXH(ieee80211_rx_h_amsdu) #ifdef CONFIG_MAC80211_MESH - if (ieee80211_vif_is_mesh(&sdata->vif)) + if (ieee80211_vif_is_mesh(&rx->sdata->vif)) 
CALL_RXH(ieee80211_rx_h_mesh_fwding); #endif CALL_RXH(ieee80211_rx_h_data) /* special treatment -- needs the queue */ - res = ieee80211_rx_h_ctrl(rx, &reorder_release); + res = ieee80211_rx_h_ctrl(rx, frames); if (res != RX_CONTINUE) goto rxh_next; + CALL_RXH(ieee80211_rx_h_mgmt_check) CALL_RXH(ieee80211_rx_h_action) + CALL_RXH(ieee80211_rx_h_userspace_mgmt) + CALL_RXH(ieee80211_rx_h_action_return) CALL_RXH(ieee80211_rx_h_mgmt) + rxh_next: + ieee80211_rx_handlers_result(rx, res); + #undef CALL_RXH + } +} + +static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, + struct ieee80211_rx_data *rx, + struct sk_buff *skb) +{ + struct sk_buff_head reorder_release; + ieee80211_rx_result res = RX_DROP_MONITOR; + + __skb_queue_head_init(&reorder_release); + + rx->skb = skb; + rx->sdata = sdata; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(rx); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0); + + CALL_RXH(ieee80211_rx_h_passive_scan) + CALL_RXH(ieee80211_rx_h_check) + + ieee80211_rx_reorder_ampdu(rx, &reorder_release); + + ieee80211_rx_handlers(rx, &reorder_release); + return; rxh_next: - switch (res) { - case RX_DROP_MONITOR: - I802_DEBUG_INC(sdata->local->rx_handlers_drop); - if (rx->sta) - rx->sta->rx_dropped++; - /* fall through */ - case RX_CONTINUE: - ieee80211_rx_cooked_monitor(rx, rate); - break; - case RX_DROP_UNUSABLE: - I802_DEBUG_INC(sdata->local->rx_handlers_drop); - if (rx->sta) - rx->sta->rx_dropped++; - dev_kfree_skb(rx->skb); - break; - case RX_QUEUED: - I802_DEBUG_INC(sdata->local->rx_handlers_queued); - break; - } - } + ieee80211_rx_handlers_result(rx, res); + +#undef CALL_RXH +} + +/* + * This function makes calls into the RX path. Therefore the + * caller must hold the sta_info->lock and everything has to + * be under rcu_read_lock protection as well. + */ +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) +{ + struct sk_buff_head frames; + struct ieee80211_rx_data rx = { }; + struct tid_ampdu_rx *tid_agg_rx; + + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + return; + + __skb_queue_head_init(&frames); + + /* construct rx struct */ + rx.sta = sta; + rx.sdata = sta->sdata; + rx.local = sta->local; + rx.queue = tid; + rx.flags |= IEEE80211_RX_RA_MATCH; + + if (unlikely(test_bit(SCAN_HW_SCANNING, &sta->local->scanning) || + test_bit(SCAN_OFF_CHANNEL, &sta->local->scanning))) + rx.flags |= IEEE80211_RX_IN_SCAN; + + spin_lock(&tid_agg_rx->reorder_lock); + ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames); + spin_unlock(&tid_agg_rx->reorder_lock); + + ieee80211_rx_handlers(&rx, &frames); } /* main receive path */ @@ -2433,7 +2590,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: /* should never get here */ WARN_ON(1); break; @@ -2447,8 +2604,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, * be called with rcu_read_lock protection. 
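+ * The cooked-monitor bitrate lookup moved into ieee80211_rx_handlers_result(), so the rate argument is gone from this path.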
*/ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, - struct sk_buff *skb, - struct ieee80211_rate *rate) + struct sk_buff *skb) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_local *local = hw_to_local(hw); @@ -2550,13 +2706,12 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, skb_new = skb_copy(skb, GFP_ATOMIC); if (!skb_new) { if (net_ratelimit()) - printk(KERN_DEBUG "%s: failed to copy " - "multicast frame for %s\n", - wiphy_name(local->hw.wiphy), - prev->name); + wiphy_debug(local->hw.wiphy, + "failed to copy multicast frame for %s\n", + prev->name); goto next; } - ieee80211_invoke_rx_handlers(prev, &rx, skb_new, rate); + ieee80211_invoke_rx_handlers(prev, &rx, skb_new); next: prev = sdata; } @@ -2572,7 +2727,7 @@ next: } } if (prev) - ieee80211_invoke_rx_handlers(prev, &rx, skb, rate); + ieee80211_invoke_rx_handlers(prev, &rx, skb); else dev_kfree_skb(skb); } @@ -2615,28 +2770,37 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) if (WARN_ON(!local->started)) goto drop; - if (status->flag & RX_FLAG_HT) { + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) { /* - * rate_idx is MCS index, which can be [0-76] as documented on: - * - * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n - * - * Anything else would be some sort of driver or hardware error. - * The driver should catch hardware errors. + * Validate the rate, unless a PLCP error means that + * we probably can't have a valid rate here anyway. */ - if (WARN((status->rate_idx < 0 || - status->rate_idx > 76), - "Rate marked as an HT rate but passed " - "status->rate_idx is not " - "an MCS index [0-76]: %d (0x%02x)\n", - status->rate_idx, - status->rate_idx)) - goto drop; - } else { - if (WARN_ON(status->rate_idx < 0 || - status->rate_idx >= sband->n_bitrates)) - goto drop; - rate = &sband->bitrates[status->rate_idx]; + + if (status->flag & RX_FLAG_HT) { + /* + * rate_idx is MCS index, which can be [0-76] + * as documented on: + * + * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n + * + * Anything else would be some sort of driver or + * hardware error. The driver should catch hardware + * errors. 
+ */ + if (WARN((status->rate_idx < 0 || + status->rate_idx > 76), + "Rate marked as an HT rate but passed " + "status->rate_idx is not " + "an MCS index [0-76]: %d (0x%02x)\n", + status->rate_idx, + status->rate_idx)) + goto drop; + } else { + if (WARN_ON(status->rate_idx < 0 || + status->rate_idx >= sband->n_bitrates)) + goto drop; + rate = &sband->bitrates[status->rate_idx]; + } } /* @@ -2658,7 +2822,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) return; } - __ieee80211_rx_handle_packet(hw, skb, rate); + __ieee80211_rx_handle_packet(hw, skb); rcu_read_unlock(); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 872d7b6..d60389b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -248,14 +248,12 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) return true; } -void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool was_hw_scan; - trace_api_scan_completed(local, aborted); - - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); /* * It's ok to abort a not-yet-running scan (that @@ -267,7 +265,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) aborted = true; if (WARN_ON(!local->scan_req)) { - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return; } @@ -275,7 +273,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return; } @@ -291,7 +289,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) local->scan_channel = NULL; /* we only have to protect scan_req and hw/sw scan */ - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); if (was_hw_scan) @@ -304,12 +302,26 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_offchannel_return(local, true); done: + mutex_lock(&local->mtx); ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); ieee80211_mesh_notify_scan_completed(local); ieee80211_queue_work(&local->hw, &local->work_work); } + +void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_scan_completed(local, aborted); + + set_bit(SCAN_COMPLETED, &local->scanning); + if (aborted) + set_bit(SCAN_ABORTED, &local->scanning); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); +} EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local) @@ -447,7 +459,7 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local, /* if no more bands/channels left, complete scan and advance to the idle state */ if (local->scan_channel_idx >= local->scan_req->n_channels) { - ieee80211_scan_completed(&local->hw, false); + __ieee80211_scan_completed(&local->hw, false); return 1; } @@ -639,17 +651,25 @@ void ieee80211_scan_work(struct work_struct *work) struct ieee80211_sub_if_data *sdata = local->scan_sdata; unsigned long next_delay = 0; - mutex_lock(&local->scan_mtx); + if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { + bool aborted; + + aborted = test_and_clear_bit(SCAN_ABORTED, 
&local->scanning); + __ieee80211_scan_completed(&local->hw, aborted); + return; + } + + mutex_lock(&local->mtx); if (!sdata || !local->scan_req) { - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return; } if (local->hw_scan_req) { int rc = drv_hw_scan(local, sdata, local->hw_scan_req); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); if (rc) - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); return; } @@ -661,20 +681,20 @@ void ieee80211_scan_work(struct work_struct *work) local->scan_sdata = NULL; rc = __ieee80211_start_scan(sdata, req); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); if (rc) - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); return; } - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); /* * Avoid re-scheduling when the sdata is going away. */ if (!ieee80211_sdata_running(sdata)) { - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); return; } @@ -711,9 +731,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, { int res; - mutex_lock(&sdata->local->scan_mtx); + mutex_lock(&sdata->local->mtx); res = __ieee80211_start_scan(sdata, req); - mutex_unlock(&sdata->local->scan_mtx); + mutex_unlock(&sdata->local->mtx); return res; } @@ -726,7 +746,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, int ret = -EBUSY; enum ieee80211_band band; - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); /* busy scanning */ if (local->scan_req) @@ -761,7 +781,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); unlock: - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return ret; } @@ -775,11 +795,11 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) * Only call this function when a scan can't be * queued -- mostly at suspend under RTNL. 
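+ * Normal completions are now deferred to the scan work via the SCAN_COMPLETED bit; this path invokes __ieee80211_scan_completed() directly instead.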
*/ - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) || (!local->scanning && local->scan_req); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); if (abortscan) - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); } diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6d86f0c..687077e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -174,8 +174,7 @@ static void __sta_info_free(struct ieee80211_local *local, } #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Destroyed STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ kfree(sta); @@ -262,8 +261,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Allocated STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ #ifdef CONFIG_MAC80211_MESH @@ -300,8 +298,9 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async) sta->uploaded = true; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (async) - printk(KERN_DEBUG "%s: Finished adding IBSS STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, + "Finished adding IBSS STA %pM\n", + sta->sta.addr); #endif } @@ -411,8 +410,8 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) spin_unlock_irqrestore(&local->sta_lock, flags); #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Added IBSS STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", + sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ ieee80211_queue_work(&local->hw, &local->sta_finish_work); @@ -459,8 +458,7 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) } #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Inserted STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ /* move reference to rcu-protected */ @@ -690,8 +688,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta) #endif #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Removed STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ cancel_work_sync(&sta->drv_unblock_wk); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 54262e7..810c5ce 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -103,6 +103,7 @@ struct tid_ampdu_tx { * @reorder_buf: buffer to reorder incoming aggregated MPDUs * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) + * @reorder_timer: releases expired frames from the reorder buffer. * @head_seq_num: head sequence number in reordering buffer. * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. @@ -110,20 +111,25 @@ struct tid_ampdu_tx { * @timeout: reset timer value (in TUs). 
* @dialog_token: dialog token for aggregation session * @rcu_head: RCU head used for freeing this struct + * @reorder_lock: serializes access to reorder buffer, see below. * * This structure is protected by RCU and the per-station * spinlock. Assignments to the array holding it must hold - * the spinlock, only the RX path can access it under RCU - * lock-free. The RX path, since it is single-threaded, - * can even modify the structure without locking since the - * only other modifications to it are done when the struct - * can not yet or no longer be found by the RX path. + * the spinlock. + * + * The @reorder_lock is used to protect the variables and + * arrays such as @reorder_buf, @reorder_time, @head_seq_num + * and @stored_mpdu_num from being corrupted by + * concurrent access of the RX path and the expired frame + * release timer. */ struct tid_ampdu_rx { struct rcu_head rcu_head; + spinlock_t reorder_lock; struct sk_buff **reorder_buf; unsigned long *reorder_time; struct timer_list session_timer; + struct timer_list reorder_timer; u16 head_seq_num; u16 stored_mpdu_num; u16 ssn; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 10caec5..571b32b 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -114,11 +114,10 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (net_ratelimit()) - printk(KERN_DEBUG "%s: dropped TX filtered frame, " - "queue_len=%d PS=%d @%lu\n", - wiphy_name(local->hw.wiphy), - skb_queue_len(&sta->tx_filtered), - !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies); + wiphy_debug(local->hw.wiphy, + "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", + skb_queue_len(&sta->tx_filtered), + !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies); #endif dev_kfree_skb(skb); } @@ -296,7 +295,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) } if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) - cfg80211_action_tx_status( + cfg80211_mgmt_tx_status( skb->dev, (unsigned long) skb, skb->data, skb->len, !!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC); diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index c54db96..ccf3737 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -351,8 +351,8 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) local->total_ps_buffered = total; #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG - printk(KERN_DEBUG "%s: PS buffers full - purged %d frames\n", - wiphy_name(local->hw.wiphy), purged); + wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n", + purged); #endif } @@ -509,6 +509,18 @@ ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx) } static ieee80211_tx_result debug_noinline +ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + + if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol && + tx->sdata->control_port_no_encrypt)) + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) { struct ieee80211_key *key = NULL; @@ -527,7 +539,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) else if ((key = rcu_dereference(tx->sdata->default_key))) tx->key = key; else if (tx->sdata->drop_unencrypted && - (tx->skb->protocol != cpu_to_be16(ETH_P_PAE)) && + (tx->skb->protocol != tx->sdata->control_port_protocol) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && 
(!ieee80211_is_robust_mgmt_frame(hdr) || (ieee80211_is_action(hdr->frame_control) && @@ -543,15 +555,16 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->key->tx_rx_count++; /* TODO: add threshold stuff again */ - switch (tx->key->conf.alg) { - case ALG_WEP: + switch (tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: if (ieee80211_is_auth(hdr->frame_control)) break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: if (!ieee80211_is_data_present(hdr->frame_control)) tx->key = NULL; break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, tx->skb)) @@ -561,7 +574,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) IEEE80211_KEY_FLAG_SW_MGMT) && ieee80211_is_mgmt(hdr->frame_control); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: if (!ieee80211_is_mgmt(hdr->frame_control)) tx->key = NULL; break; @@ -946,22 +959,31 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) static ieee80211_tx_result debug_noinline ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + if (!tx->key) return TX_CONTINUE; - switch (tx->key->conf.alg) { - case ALG_WEP: + switch (tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: return ieee80211_crypto_wep_encrypt(tx); - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: return ieee80211_crypto_tkip_encrypt(tx); - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: return ieee80211_crypto_ccmp_encrypt(tx); - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); + default: + /* handle hw-only algorithm */ + if (info->control.hw_key) { + ieee80211_tx_set_protected(tx); + return TX_CONTINUE; + } + break; + } - /* not reached */ - WARN_ON(1); return TX_DROP; } @@ -1339,6 +1361,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_dynamic_ps); CALL_TXH(ieee80211_tx_h_check_assoc); CALL_TXH(ieee80211_tx_h_ps_buf); + CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); @@ -1511,8 +1534,8 @@ static int ieee80211_skb_resize(struct ieee80211_local *local, I802_DEBUG_INC(local->tx_expand_skb_head); if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) { - printk(KERN_DEBUG "%s: failed to reallocate TX buffer\n", - wiphy_name(local->hw.wiphy)); + wiphy_debug(local->hw.wiphy, + "failed to reallocate TX buffer\n"); return -ENOMEM; } @@ -1699,7 +1722,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, u16 ethertype, hdrlen, meshhdrlen = 0; __le16 fc; struct ieee80211_hdr hdr; - struct ieee80211s_hdr mesh_hdr; + struct ieee80211s_hdr mesh_hdr __maybe_unused; const u8 *encaps_data; int encaps_len, skip_header_bytes; int nh_pos, h_pos; @@ -1816,7 +1839,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, #endif case NL80211_IFTYPE_STATION: memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); - if (sdata->u.mgd.use_4addr && ethertype != ETH_P_PAE) { + if (sdata->u.mgd.use_4addr && + cpu_to_be16(ethertype) != sdata->control_port_protocol) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); @@ -1869,7 +1893,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (!ieee80211_vif_is_mesh(&sdata->vif) && 
unlikely(!is_multicast_ether_addr(hdr.addr1) && !(sta_flags & WLAN_STA_AUTHORIZED) && - !(ethertype == ETH_P_PAE && + !(cpu_to_be16(ethertype) == sdata->control_port_protocol && compare_ether_addr(sdata->vif.addr, skb->data + ETH_ALEN) == 0))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG @@ -2068,8 +2092,7 @@ void ieee80211_tx_pending(unsigned long data) if (skb_queue_empty(&local->pending[i])) list_for_each_entry_rcu(sdata, &local->interfaces, list) - netif_tx_wake_queue( - netdev_get_tx_queue(sdata->dev, i)); + netif_wake_subqueue(sdata->dev, i); } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 748387d..bd40b11 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -283,8 +283,11 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, if (skb_queue_empty(&local->pending[queue])) { rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) - netif_tx_wake_queue(netdev_get_tx_queue(sdata->dev, queue)); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) + continue; + netif_wake_subqueue(sdata->dev, queue); + } rcu_read_unlock(); } else tasklet_schedule(&local->tx_pending_tasklet); @@ -323,7 +326,7 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) - netif_tx_stop_queue(netdev_get_tx_queue(sdata->dev, queue)); + netif_stop_subqueue(sdata->dev, queue); rcu_read_unlock(); } @@ -471,7 +474,7 @@ void ieee80211_iterate_active_interfaces( list_for_each_entry(sdata, &local->interfaces, list) { switch (sdata->vif.type) { - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: @@ -505,7 +508,7 @@ void ieee80211_iterate_active_interfaces_atomic( list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: @@ -1189,7 +1192,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* ignore virtual */ break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: WARN_ON(1); break; } @@ -1308,7 +1311,7 @@ void ieee80211_recalc_smps(struct ieee80211_local *local, */ list_for_each_entry(sdata, &local->interfaces, list) { - if (!netif_running(sdata->dev)) + if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type != NL80211_IFTYPE_STATION) goto set; diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index 9ebc8d8..f27484c 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -240,7 +240,7 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local, keyidx = skb->data[hdrlen + 3] >> 6; - if (!key || keyidx != key->conf.keyidx || key->conf.alg != ALG_WEP) + if (!key || keyidx != key->conf.keyidx) return -1; klen = 3 + key->conf.keylen; diff --git a/net/mac80211/work.c b/net/mac80211/work.c index 81d4ad6..ae344d1 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -43,7 +43,7 @@ enum work_action { /* utils */ static inline void ASSERT_WORK_MTX(struct ieee80211_local *local) { - WARN_ON(!mutex_is_locked(&local->work_mtx)); + lockdep_assert_held(&local->mtx); } /* @@ -757,7 +757,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, mgmt = (struct ieee80211_mgmt *) 
skb->data; fc = le16_to_cpu(mgmt->frame_control); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { const u8 *bssid = NULL; @@ -833,7 +833,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, WARN(1, "unexpected: %d", rma); } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); if (rma != WORK_ACT_DONE) goto out; @@ -845,9 +845,9 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, case WORK_DONE_REQUEUE: synchronize_rcu(); wk->started = false; /* restart */ - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_add_tail(&wk->list, &local->work_list); - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); } out: @@ -888,9 +888,9 @@ static void ieee80211_work_work(struct work_struct *work) while ((skb = skb_dequeue(&local->work_skb_queue))) ieee80211_work_rx_queued_mgmt(local, skb); - ieee80211_recalc_idle(local); + mutex_lock(&local->mtx); - mutex_lock(&local->work_mtx); + ieee80211_recalc_idle(local); list_for_each_entry_safe(wk, tmp, &local->work_list, list) { bool started = wk->started; @@ -995,20 +995,16 @@ static void ieee80211_work_work(struct work_struct *work) run_again(local, jiffies + HZ/2); } - mutex_lock(&local->scan_mtx); - if (list_empty(&local->work_list) && local->scan_req && !local->scanning) ieee80211_queue_delayed_work(&local->hw, &local->scan_work, round_jiffies_relative(0)); - mutex_unlock(&local->scan_mtx); - - mutex_unlock(&local->work_mtx); - ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + list_for_each_entry_safe(wk, tmp, &free_work, list) { wk->done(wk, NULL); list_del(&wk->list); @@ -1035,16 +1031,15 @@ void ieee80211_add_work(struct ieee80211_work *wk) wk->started = false; local = wk->sdata->local; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_add_tail(&wk->list, &local->work_list); - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); ieee80211_queue_work(&local->hw, &local->work_work); } void ieee80211_work_init(struct ieee80211_local *local) { - mutex_init(&local->work_mtx); INIT_LIST_HEAD(&local->work_list); setup_timer(&local->work_timer, ieee80211_work_timer, (unsigned long)local); @@ -1057,7 +1052,7 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_work *wk; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; @@ -1065,19 +1060,19 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata) wk->started = true; wk->timeout = jiffies; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); /* run cleanups etc. 
*/ ieee80211_work_work(&local->work_work); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; WARN_ON(1); break; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); } ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata, @@ -1163,7 +1158,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_work *wk, *tmp; bool found = false; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry_safe(wk, tmp, &local->work_list, list) { if ((unsigned long) wk == cookie) { wk->timeout = jiffies; @@ -1171,7 +1166,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata, break; } } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); if (!found) return -ENOENT; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 8d59d27..43882b3 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -36,8 +36,8 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) int tail; hdr = (struct ieee80211_hdr *)skb->data; - if (!tx->key || tx->key->conf.alg != ALG_TKIP || skb->len < 24 || - !ieee80211_is_data_present(hdr->frame_control)) + if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || + skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control)) return TX_CONTINUE; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -94,7 +94,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) if (status->flag & RX_FLAG_MMIC_STRIPPED) return RX_CONTINUE; - if (!rx->key || rx->key->conf.alg != ALG_TKIP || + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || !ieee80211_has_protected(hdr->frame_control) || !ieee80211_is_data_present(hdr->frame_control)) return RX_CONTINUE; @@ -221,19 +221,13 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) if (!rx->sta || skb->len - hdrlen < 12) return RX_DROP_UNUSABLE; - if (status->flag & RX_FLAG_DECRYPTED) { - if (status->flag & RX_FLAG_IV_STRIPPED) { - /* - * Hardware took care of all processing, including - * replay protection, and stripped the ICV/IV so - * we cannot do any checks here. - */ - return RX_CONTINUE; - } - - /* let TKIP code verify IV, but skip decryption */ + /* + * Let TKIP code verify IV, but skip decryption. + * In the case where hardware checks the IV as well, + * we don't even get here, see ieee80211_rx_h_decrypt() + */ + if (status->flag & RX_FLAG_DECRYPTED) hwaccel = 1; - } res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm, key, skb->data + hdrlen, @@ -447,10 +441,6 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; - if ((status->flag & RX_FLAG_DECRYPTED) && - (status->flag & RX_FLAG_IV_STRIPPED)) - return RX_CONTINUE; - ccmp_hdr2pn(pn, skb->data + hdrlen); queue = ieee80211_is_mgmt(hdr->frame_control) ? 
@@ -564,10 +554,6 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; - if ((status->flag & RX_FLAG_DECRYPTED) && - (status->flag & RX_FLAG_IV_STRIPPED)) - return RX_CONTINUE; - if (skb->len < 24 + sizeof(*mmie)) return RX_DROP_UNUSABLE; diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4c2f89d..0c043b6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -40,6 +40,7 @@ #include <net/udp.h> #include <net/icmp.h> /* for icmp_send */ #include <net/route.h> +#include <net/ip6_checksum.h> #include <linux/netfilter.h> #include <linux/netfilter_ipv4.h> @@ -637,10 +638,12 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; if (inout) IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, @@ -1381,8 +1384,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (atomic_read(&cp->in_pkts) % - sysctl_ip_vs_sync_threshold[1] + (pkts % sysctl_ip_vs_sync_threshold[1] == sysctl_ip_vs_sync_threshold[0])) || (cp->old_state != cp->state && ((cp->state == IP_VS_SCTP_S_CLOSED) || @@ -1393,7 +1395,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, } } - if (af == AF_INET && + /* Keep this block last: TCP and others with pp->num_states <= 1 */ + else if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0f0c079..ca8ec8c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock); static DEFINE_RWLOCK(__ip_vs_rs_lock); /* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); +static DEFINE_SPINLOCK(ip_vs_securetcp_lock); /* lock for drop entry handling */ static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); @@ -204,7 +204,7 @@ static void update_defense_level(void) spin_unlock(&__ip_vs_droppacket_lock); /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); + spin_lock(&ip_vs_securetcp_lock); switch (sysctl_ip_vs_secure_tcp) { case 0: if (old_secure_tcp >= 2) @@ -238,7 +238,7 @@ static void update_defense_level(void) old_secure_tcp = sysctl_ip_vs_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); + spin_unlock(&ip_vs_securetcp_lock); local_bh_enable(); } @@ -843,7 +843,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, return -EINVAL; } - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); if (dest == NULL) { pr_err("%s(): no memory.\n", __func__); return -ENOMEM; @@ -1177,7 +1177,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } #endif - svc = kzalloc(sizeof(struct ip_vs_service), 
GFP_ATOMIC); + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; @@ -2155,7 +2155,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; - goto out_unlock; + goto out_drop_service; } switch (cmd) { @@ -2189,6 +2189,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; } +out_drop_service: if (svc) ip_vs_service_put(svc); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index bbc1ac7..727e45b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,7 +35,7 @@ static LIST_HEAD(ip_vs_schedulers); /* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); +static DEFINE_SPINLOCK(ip_vs_sched_lock); /* @@ -108,7 +108,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - read_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -122,14 +122,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); return sched; } if (sched->module) module_put(sched->module); } - read_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); return NULL; } @@ -184,10 +184,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - write_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -200,7 +200,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -211,7 +211,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -229,9 +229,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - write_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); pr_err("%s(): [%s] scheduler is not in the list. 
failed\n", __func__, scheduler->name); return -EINVAL; @@ -241,7 +241,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); /* decrease the module use count */ ip_vs_use_count_dec(); diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index b46a839..9228ee0d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, { __be16 _ports[2], *ports; u8 nexthdr; + int poff; memset(dst, 0, sizeof(*dst)); @@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; } - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - ports = skb_header_pointer(skb, protoff, sizeof(_ports), + poff = proto_ports_offset(nexthdr); + if (poff >= 0) { + ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), &_ports); - break; - default: + } else { _ports[0] = _ports[1] = 0; ports = _ports; - break; } if (!ports) return -1; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9a17f28..3616f27b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -488,7 +488,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto out_unlock; @@ -1209,7 +1209,7 @@ static int packet_snd(struct socket *sock, err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto out_free; diff --git a/net/phonet/pep.c b/net/phonet/pep.c index b2a3ae6..04e3419 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -834,6 +834,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; + int err; if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { @@ -852,7 +853,10 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; - return pn_skb_send(sk, skb, &pipe_srv); + err = pn_skb_send(sk, skb, &pipe_srv); + if (err && pn_flow_safe(pn->tx_fc)) + atomic_inc(&pn->tx_credits); + return err; } static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, @@ -872,7 +876,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, flags & MSG_DONTWAIT, &err); if (!skb) - return -ENOBUFS; + return err; skb_reserve(skb, MAX_PHONET_HEADER + 3); err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index b18e48f..d0a4294 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -292,8 +292,7 @@ static void phonet_route_autodel(struct net_device *dev) if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ synchronize_rcu(); - for (i = find_first_bit(deleted, 64); i < 64; - i = find_next_bit(deleted, 64, i + 1)) { + for_each_set_bit(i, deleted, 64) { rtm_phonet_notify(RTM_DELROUTE, dev, i); dev_put(dev); } diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 6e9848bf..7c91f73 100644 --- a/net/phonet/socket.c +++ 
b/net/phonet/socket.c @@ -281,7 +281,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock, if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return POLLHUP; - if (sk->sk_state == TCP_ESTABLISHED && atomic_read(&pn->tx_credits)) + if (sk->sk_state == TCP_ESTABLISHED && + atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf && + atomic_read(&pn->tx_credits)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index aebfecb..bb6ad81 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -39,7 +39,15 @@ #include <net/sock.h> #include "rds.h" -#include "rdma.h" + +char *rds_str_array(char **array, size_t elements, size_t index) +{ + if ((index < elements) && array[index]) + return array[index]; + else + return "unknown"; +} +EXPORT_SYMBOL(rds_str_array); /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); @@ -62,7 +70,7 @@ static int rds_release(struct socket *sock) struct rds_sock *rs; unsigned long flags; - if (sk == NULL) + if (!sk) goto out; rs = rds_sk_to_rs(sk); @@ -73,7 +81,15 @@ static int rds_release(struct socket *sock) * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we synchronize_rcu() before we free our + * entry + */ rds_remove_bound(rs); + synchronize_rcu(); + rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); @@ -83,6 +99,8 @@ static int rds_release(struct socket *sock) rds_sock_count--; spin_unlock_irqrestore(&rds_sock_lock, flags); + rds_trans_put(rs->rs_transport); + sock->sk = NULL; sock_put(sk); out: @@ -514,7 +532,7 @@ out: spin_unlock_irqrestore(&rds_sock_lock, flags); } -static void __exit rds_exit(void) +static void rds_exit(void) { sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); @@ -529,7 +547,7 @@ static void __exit rds_exit(void) } module_exit(rds_exit); -static int __init rds_init(void) +static int rds_init(void) { int ret; diff --git a/net/rds/bind.c b/net/rds/bind.c index 5d95fc0..2f6b3fc 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -34,45 +34,52 @@ #include <net/sock.h> #include <linux/in.h> #include <linux/if_arp.h> +#include <linux/jhash.h> #include "rds.h" -/* - * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't - * particularly zippy. - * - * This is now called for every incoming frame so we arguably care much more - * about it than we used to. 
- */ +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; static DEFINE_SPINLOCK(rds_bind_lock); -static struct rb_root rds_bind_tree = RB_ROOT; -static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, - struct rds_sock *insert) +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} + +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) { - struct rb_node **p = &rds_bind_tree.rb_node; - struct rb_node *parent = NULL; struct rds_sock *rs; + struct hlist_node *node; + struct hlist_head *head = hash_to_bucket(addr, port); u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - while (*p) { - parent = *p; - rs = rb_entry(parent, struct rds_sock, rs_bound_node); - + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (needle < cmp) - p = &(*p)->rb_left; - else if (needle > cmp) - p = &(*p)->rb_right; - else + if (cmp == needle) { + rcu_read_unlock(); return rs; + } } + rcu_read_unlock(); if (insert) { - rb_link_node(&insert->rs_bound_node, parent, p); - rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); } return NULL; } @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; - unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, flags); - rs = rds_bind_tree_walk(addr, port, NULL); + rs = rds_bind_lookup(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - spin_unlock_irqrestore(&rds_bind_lock, flags); rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -121,22 +126,15 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) do { if (rover == 0) rover++; - if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { - *port = cpu_to_be16(rover); + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); break; } } while (rover++ != last); - if (ret == 0) { - rs->rs_bound_addr = addr; - rs->rs_bound_port = *port; - rds_sock_addref(rs); - - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); - } - spin_unlock_irqrestore(&rds_bind_lock, flags); return ret; @@ -153,7 +151,7 @@ void rds_remove_bound(struct rds_sock *rs) rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - rb_erase(&rs->rs_bound_node, &rds_bind_tree); + hlist_del_init_rcu(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } @@ -184,7 +182,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; trans = rds_trans_get_preferred(sin->sin_addr.s_addr); - if (trans == NULL) { + if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); if (printk_ratelimit()) @@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); + + /* we might have called rds_remove_bound on 
error */ + if (ret) + synchronize_rcu(); return ret; } diff --git a/net/rds/cong.c b/net/rds/cong.c index 0871a29f..75ea686 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); - if (map == NULL) + if (!map) return NULL; map->m_addr = addr; @@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); - if (ret == NULL) { + if (!ret) { ret = map; map = NULL; } @@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) conn->c_lcong = rds_cong_from_addr(conn->c_laddr); conn->c_fcong = rds_cong_from_addr(conn->c_faddr); - if (conn->c_lcong == NULL || conn->c_fcong == NULL) + if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; @@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) list_for_each_entry(conn, &map->m_conn_list, c_map_item) { if (!test_and_set_bit(0, &conn->c_map_queued)) { rds_stats_inc(s_cong_update_queued); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_send_xmit(conn); } } diff --git a/net/rds/connection.c b/net/rds/connection.c index 7619b67..870992e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -37,7 +37,6 @@ #include "rds.h" #include "loop.h" -#include "rdma.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) @@ -63,18 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) -static inline int rds_conn_is_sending(struct rds_connection *conn) -{ - int ret = 0; - - if (!mutex_trylock(&conn->c_send_lock)) - ret = 1; - else - mutex_unlock(&conn->c_send_lock); - - return ret; -} - +/* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) @@ -82,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, struct rds_connection *conn, *ret = NULL; struct hlist_node *pos; - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && conn->c_trans == trans) { ret = conn; @@ -129,10 +117,11 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; unsigned long flags; int ret; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && !is_outgoing) { @@ -143,12 +132,12 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, parent = conn; conn = parent->c_passive; } - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); if (conn) goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); - if (conn == NULL) { + if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } @@ -159,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; - mutex_init(&conn->c_send_lock); + init_waitqueue_head(&conn->c_waitq); INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); @@ -175,7 
+164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - if (rds_trans_get_preferred(faddr)) { + loop_trans = rds_trans_get_preferred(faddr); + if (loop_trans) { + rds_trans_put(loop_trans); conn->c_loopback = 1; if (is_outgoing && trans->t_prefer_loopback) { /* "outgoing" connection - and the transport @@ -238,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head(&conn->c_hash_node, head); + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } @@ -263,21 +254,91 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); +void rds_conn_shutdown(struct rds_connection *conn) +{ + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); + + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproducible with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&conn->c_conn_w); + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); + rds_queue_reconnect(conn); + } else { + rcu_read_unlock(); + } +} + +/* + * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. 
+ */ void rds_conn_destroy(struct rds_connection *conn) { struct rds_message *rm, *rtmp; + unsigned long flags; rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); - hlist_del_init(&conn->c_hash_node); + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); + hlist_del_init_rcu(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); + synchronize_rcu(); - /* wait for the rds thread to shut it down */ - atomic_set(&conn->c_state, RDS_CONN_ERROR); - cancel_delayed_work(&conn->c_conn_w); - queue_work(rds_wq, &conn->c_down_w); - flush_workqueue(rds_wq); + /* shut the connection down */ + rds_conn_drop(conn); + flush_work(&conn->c_down_w); + + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&conn->c_send_w); + cancel_delayed_work_sync(&conn->c_recv_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, @@ -302,7 +363,9 @@ void rds_conn_destroy(struct rds_connection *conn) BUG_ON(!list_empty(&conn->c_retrans)); kmem_cache_free(rds_conn_slab, conn); + spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); @@ -316,23 +379,23 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, struct list_head *list; struct rds_connection *conn; struct rds_message *rm; - unsigned long flags; unsigned int total = 0; + unsigned long flags; size_t i; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { if (want_send) list = &conn->c_send_queue; else list = &conn->c_retrans; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&conn->c_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { @@ -343,11 +406,10 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, conn->c_faddr, 0); } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&conn->c_lock, flags); } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -377,19 +439,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; struct hlist_node *pos; - struct hlist_node *tmp; struct rds_connection *conn; - unsigned long flags; size_t i; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { /* XXX no c_lock usage.. 
*/ if (!visitor(conn, buffer)) @@ -405,8 +465,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, lens->nr++; } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); @@ -423,8 +482,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, sizeof(cinfo->transport)); cinfo->flags = 0; - rds_conn_info_set(cinfo->flags, - rds_conn_is_sending(conn), SENDING); + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, @@ -444,12 +503,12 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } -int __init rds_conn_init(void) +int rds_conn_init(void) { rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), 0, 0, NULL); - if (rds_conn_slab == NULL) + if (!rds_conn_slab) return -ENOMEM; rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); @@ -487,6 +546,18 @@ void rds_conn_drop(struct rds_connection *conn) EXPORT_SYMBOL_GPL(rds_conn_drop); /* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. + */ +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + if (rds_conn_state(conn) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); + +/* * An error occurred on the connection */ void diff --git a/net/rds/ib.c b/net/rds/ib.c index 8f2d6dd..b12a395 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -53,12 +53,71 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. + */ +DECLARE_RWSEM(rds_ib_devices_lock); struct list_head rds_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); +void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freeing off into a work struct in krdsd. 
+ */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); + if (atomic_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; @@ -77,11 +136,14 @@ void rds_ib_add_one(struct ib_device *device) goto free_attr; } - rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); if (!rds_ibdev) goto free_attr; spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); @@ -91,68 +153,107 @@ void rds_ib_add_one(struct ib_device *device) min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : fmr_pool_size; + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_ibdev->pd)) - goto free_dev; + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) - goto err_pd; + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); if (IS_ERR(rds_ibdev->mr_pool)) { rds_ibdev->mr_pool = NULL; - goto err_mr; + goto put_dev; } INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); - list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); + atomic_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); - goto free_attr; + rds_ib_nodev_connect(); -err_mr: - ib_dereg_mr(rds_ibdev->mr); -err_pd: - ib_dealloc_pd(rds_ibdev->pd); -free_dev: - kfree(rds_ibdev); +put_dev: + rds_ib_dev_put(rds_ibdev); free_attr: kfree(dev_attr); } +/* + * New connections use this to find the device to associate with the + * connection. It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. 
+ * + * A new connection can get NULL from this if it's arriving on a + * device that is in the process of being removed. + */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rcu_read_lock(); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (rds_ibdev) + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} + +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ void rds_ib_remove_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct rds_ib_ipaddr *i_ipaddr, *i_next; rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (!rds_ibdev) return; - list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); - } + rds_ib_dev_shutdown(rds_ibdev); - rds_ib_destroy_conns(rds_ibdev); + /* stop connection attempts from getting a reference to this device. */ + ib_set_client_data(device, &rds_ib_client, NULL); - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); - - ib_dereg_mr(rds_ibdev->mr); - - while (ib_dealloc_pd(rds_ibdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); - msleep(1); - } + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); - list_del(&rds_ibdev->list); - kfree(rds_ibdev); + /* + * This synchronize_rcu() is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. + */ + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); + rds_ib_dev_put(rds_ibdev); } struct ib_client rds_ib_client = { @@ -186,7 +287,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; @@ -248,29 +349,36 @@ static int rds_ib_laddr_check(__be32 addr) return ret; } +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + void rds_ib_exit(void) { rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); + rds_ib_unregister_client(); rds_ib_destroy_nodev_conns(); - ib_unregister_client(&rds_ib_client); rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); } struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, .xmit_complete = rds_ib_xmit_complete, .xmit = rds_ib_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, .recv = rds_ib_recv, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, .conn_connect = rds_ib_conn_connect, .conn_shutdown = rds_ib_conn_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, - .inc_purge = rds_ib_inc_purge, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_handle_connect = rds_ib_cm_handle_connect, @@ -286,16 +394,20 @@ struct rds_transport rds_ib_transport = { .t_type = RDS_TRANS_IB }; 
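The rds/ib.c hunks above all lean on one idiom: lookups take a reference under rcu_read_lock(), removal first unpublishes the device (ib_set_client_data(..., NULL) plus list_del_rcu()), waits out a grace period with synchronize_rcu(), and only then drops its references, with the final rds_ib_dev_put() pushing the actual free into a work item so it can run in process context. A minimal sketch of that idiom, using hypothetical names (my_dev, lookup_slot, my_dev_publish, my_dev_free) rather than the RDS code itself:

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/workqueue.h>

struct my_dev {
	atomic_t refcount;
	struct work_struct free_work;
};

/* one published lookup slot; the RDS code publishes via ib client_data and a global list */
static struct my_dev __rcu *lookup_slot;

static void my_dev_free(struct work_struct *work)
{
	/* runs in process context via the workqueue, so it is allowed to block */
	kfree(container_of(work, struct my_dev, free_work));
}

static void my_dev_publish(struct my_dev *d)
{
	atomic_set(&d->refcount, 1);		/* the publication's reference */
	INIT_WORK(&d->free_work, my_dev_free);
	rcu_assign_pointer(lookup_slot, d);	/* readers may see d from here on */
}

static struct my_dev *my_dev_get(void)
{
	struct my_dev *d;

	rcu_read_lock();
	d = rcu_dereference(lookup_slot);
	if (d)
		atomic_inc(&d->refcount);	/* ref taken while RCU keeps d alive */
	rcu_read_unlock();
	return d;
}

static void my_dev_put(struct my_dev *d)
{
	/* the final put defers the free instead of calling kfree() here */
	if (atomic_dec_and_test(&d->refcount))
		schedule_work(&d->free_work);
}

static void my_dev_remove(struct my_dev *d)
{
	RCU_INIT_POINTER(lookup_slot, NULL);	/* unpublish: stop new lookups */
	synchronize_rcu();			/* wait out readers inside my_dev_get() */
	my_dev_put(d);				/* drop the publication's reference */
}

In these terms, rds_ib_get_client_data() above plays the role of my_dev_get(), and rds_ib_remove_one() plays my_dev_remove(), dropping two references at the end because the device was published both as IB client data and on the rds_ib_devices list.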
-int __init rds_ib_init(void) +int rds_ib_init(void) { int ret; INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -317,7 +429,9 @@ out_recv: out_sysctl: rds_ib_sysctl_exit(); out_ibreg: - ib_unregister_client(&rds_ib_client); + rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 64df4e7..7ad3d57 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -3,11 +3,13 @@ #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <linux/pci.h> +#include <linux/slab.h> #include "rds.h" #include "rdma_transport.h" #define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 4096 +#define RDS_FMR_POOL_SIZE 8192 #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 @@ -19,6 +21,9 @@ #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + +extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; /* @@ -26,20 +31,29 @@ extern struct list_head rds_ib_devices; * try and minimize the amount of memory tied up both the device and * socket receive queues. */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) struct rds_page_frag { struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; + struct list_head f_cache_entry; + struct scatterlist f_sg; }; struct rds_ib_incoming { struct list_head ii_frags; + struct list_head ii_cache_entry; struct rds_incoming ii_inc; }; +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. 
 	 */
 	__be32			dp_saddr;
@@ -53,8 +67,7 @@ struct rds_ib_connect_private {
 };
 
 struct rds_ib_send_work {
-	struct rds_message	*s_rm;
-	struct rds_rdma_op	*s_op;
+	void			*s_op;
 	struct ib_send_wr	s_wr;
 	struct ib_sge		s_sge[RDS_IB_MAX_SGE];
 	unsigned long		s_queued;
@@ -92,10 +105,11 @@ struct rds_ib_connection {
 
 	/* tx */
 	struct rds_ib_work_ring	i_send_ring;
-	struct rds_message	*i_rm;
+	struct rm_data_op	*i_data_op;
 	struct rds_header	*i_send_hdrs;
 	u64			i_send_hdrs_dma;
 	struct rds_ib_send_work *i_sends;
+	atomic_t		i_signaled_sends;
 
 	/* rx */
 	struct tasklet_struct	i_recv_tasklet;
@@ -106,8 +120,9 @@ struct rds_ib_connection {
 	struct rds_header	*i_recv_hdrs;
 	u64			i_recv_hdrs_dma;
 	struct rds_ib_recv_work *i_recvs;
-	struct rds_page_frag	i_frag;
 	u64			i_ack_recv;	/* last ACK received */
+	struct rds_ib_refill_cache i_cache_incs;
+	struct rds_ib_refill_cache i_cache_frags;
 
 	/* sending acks */
 	unsigned long		i_ack_flags;
@@ -138,7 +153,6 @@ struct rds_ib_connection {
 
 	/* Batched completions */
 	unsigned int		i_unsignaled_wrs;
-	long			i_unsignaled_bytes;
 };
 
 /* This assumes that atomic_t is at least 32 bits */
@@ -164,9 +178,17 @@ struct rds_ib_device {
 	unsigned int		max_fmrs;
 	int			max_sge;
 	unsigned int		max_wrs;
+	unsigned int		max_initiator_depth;
+	unsigned int		max_responder_resources;
 	spinlock_t		spinlock;	/* protect the above */
+	atomic_t		refcount;
+	struct work_struct	free_work;
 };
 
+#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus)
+#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device))
+#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev)
+
 /* bits for i_ack_flags */
 #define IB_ACK_IN_FLIGHT	0
 #define IB_ACK_REQUESTED	1
@@ -202,6 +224,8 @@ struct rds_ib_statistics {
 	uint64_t	s_ib_rdma_mr_pool_flush;
 	uint64_t	s_ib_rdma_mr_pool_wait;
 	uint64_t	s_ib_rdma_mr_pool_depleted;
+	uint64_t	s_ib_atomic_cswp;
+	uint64_t	s_ib_atomic_fadd;
 };
 
 extern struct workqueue_struct *rds_ib_wq;
@@ -243,6 +267,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev,
 extern struct rds_transport rds_ib_transport;
 extern void rds_ib_add_one(struct ib_device *device);
 extern void rds_ib_remove_one(struct ib_device *device);
+struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
+void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
 extern struct ib_client rds_ib_client;
 
 extern unsigned int fmr_pool_size;
@@ -258,7 +284,7 @@ void rds_ib_conn_free(void *arg);
 int rds_ib_conn_connect(struct rds_connection *conn);
 void rds_ib_conn_shutdown(struct rds_connection *conn);
 void rds_ib_state_change(struct sock *sk);
-int __init rds_ib_listen_init(void);
+int rds_ib_listen_init(void);
 void rds_ib_listen_stop(void);
 void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...);
 int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
@@ -275,15 +301,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn,
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
 void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_ib_destroy_nodev_conns(void)
-{
-	__rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock);
-}
-static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev)
-{
-	__rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock);
-}
+void rds_ib_destroy_nodev_conns(void);
 struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *);
 void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
@@ -292,14 +310,16 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 void rds_ib_sync_mr(void *trans_private, int dir);
 void rds_ib_free_mr(void *trans_private, int invalidate);
 void rds_ib_flush_mrs(void);
+int rds_ib_fmr_init(void);
+void rds_ib_fmr_exit(void);
 
 /* ib_recv.c */
-int __init rds_ib_recv_init(void);
+int rds_ib_recv_init(void);
 void rds_ib_recv_exit(void);
 int rds_ib_recv(struct rds_connection *conn);
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill);
-void rds_ib_inc_purge(struct rds_incoming *inc);
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic);
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill);
 void rds_ib_inc_free(struct rds_incoming *inc);
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov,
 			    size_t size);
@@ -325,17 +345,19 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest);
 extern wait_queue_head_t rds_ib_ring_empty_wait;
 
 /* ib_send.c */
+char *rds_ib_wc_status_str(enum ib_wc_status status);
 void rds_ib_xmit_complete(struct rds_connection *conn);
 int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm,
 		unsigned int hdr_off, unsigned int sg, unsigned int off);
 void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context);
 void rds_ib_send_init_ring(struct rds_ib_connection *ic);
 void rds_ib_send_clear_ring(struct rds_ib_connection *ic);
-int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op);
+int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
 void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits);
 void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted);
 int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted,
 			     u32 *adv_credits, int need_posted, int max_posted);
+int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op);
 
 /* ib_stats.c */
 DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats);
@@ -344,7 +366,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter,
 				    unsigned int avail);
 
 /* ib_sysctl.c */
-int __init rds_ib_sysctl_init(void);
+int rds_ib_sysctl_init(void);
 void rds_ib_sysctl_exit(void);
 extern unsigned long rds_ib_sysctl_max_send_wr;
 extern unsigned long rds_ib_sysctl_max_recv_wr;
@@ -354,28 +376,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation;
 extern unsigned int rds_ib_sysctl_flow_control;
 extern ctl_table rds_ib_sysctl_table[];
 
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- *
- * From version 3.1 onwards, header is in front of data in the sge.
- */
-static inline struct ib_sge *
-rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[0];
-	else
-		return &sge[1];
-}
-
-static inline struct ib_sge *
-rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge)
-{
-	if (ic->conn->c_version > RDS_PROTOCOL_3_0)
-		return &sge[1];
-	else
-		return &sge[0];
-}
-
 #endif
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index f688327..bc3dbc1 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -38,6 +38,36 @@
 #include "rds.h"
 #include "ib.h"
 
+static char *rds_ib_event_type_strings[] = {
+#define RDS_IB_EVENT_STRING(foo) \
+		[IB_EVENT_##foo] = __stringify(IB_EVENT_##foo)
+	RDS_IB_EVENT_STRING(CQ_ERR),
+	RDS_IB_EVENT_STRING(QP_FATAL),
+	RDS_IB_EVENT_STRING(QP_REQ_ERR),
+	RDS_IB_EVENT_STRING(QP_ACCESS_ERR),
+	RDS_IB_EVENT_STRING(COMM_EST),
+	RDS_IB_EVENT_STRING(SQ_DRAINED),
+	RDS_IB_EVENT_STRING(PATH_MIG),
+	RDS_IB_EVENT_STRING(PATH_MIG_ERR),
+	RDS_IB_EVENT_STRING(DEVICE_FATAL),
+	RDS_IB_EVENT_STRING(PORT_ACTIVE),
+	RDS_IB_EVENT_STRING(PORT_ERR),
+	RDS_IB_EVENT_STRING(LID_CHANGE),
+	RDS_IB_EVENT_STRING(PKEY_CHANGE),
+	RDS_IB_EVENT_STRING(SM_CHANGE),
+	RDS_IB_EVENT_STRING(SRQ_ERR),
+	RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED),
+	RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED),
+	RDS_IB_EVENT_STRING(CLIENT_REREGISTER),
+#undef RDS_IB_EVENT_STRING
+};
+
+static char *rds_ib_event_str(enum ib_event_type type)
+{
+	return rds_str_array(rds_ib_event_type_strings,
+			     ARRAY_SIZE(rds_ib_event_type_strings), type);
+}
+
 /*
  * Set the selected protocol version
  */
@@ -95,7 +125,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 {
 	const struct rds_ib_connect_private *dp = NULL;
 	struct rds_ib_connection *ic = conn->c_transport_data;
-	struct rds_ib_device *rds_ibdev;
 	struct ib_qp_attr qp_attr;
 	int err;
 
@@ -111,11 +140,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 		}
 	}
 
-	printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
-			&conn->c_faddr,
-			RDS_PROTOCOL_MAJOR(conn->c_version),
-			RDS_PROTOCOL_MINOR(conn->c_version),
-			ic->i_flowctl ? ", flow control" : "");
+	if (conn->c_version < RDS_PROTOCOL(3,1)) {
+		printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed,"
+		       " no longer supported\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version));
+		rds_conn_destroy(conn);
+		return;
+	} else {
+		printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n",
+		       &conn->c_faddr,
+		       RDS_PROTOCOL_MAJOR(conn->c_version),
+		       RDS_PROTOCOL_MINOR(conn->c_version),
+		       ic->i_flowctl ? ", flow control" : "");
+	}
 
 	/*
 	 * Init rings and fill recv. this needs to wait until protocol negotiation
@@ -125,7 +164,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	rds_ib_recv_init_ring(ic);
 	/* Post receive buffers - as a side effect, this will update
 	 * the posted credit count.
 	 */
-	rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
+	rds_ib_recv_refill(conn, 1);
 
 	/* Tune RNR behavior */
 	rds_ib_tune_rnr(ic, &qp_attr);
@@ -135,12 +174,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 	if (err)
 		printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err);
 
-	/* update ib_device with this local ipaddr & conn */
-	rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client);
-	err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr);
+	/* update ib_device with this local ipaddr */
+	err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr);
 	if (err)
-		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err);
-	rds_ib_add_conn(rds_ibdev, conn);
+		printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n",
+		       err);
 
 	/* If the peer gave us the last packet it saw, process this as if
 	 * we had received a regular ACK. */
@@ -153,18 +191,23 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even
 static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 			struct rdma_conn_param *conn_param,
 			struct rds_ib_connect_private *dp,
-			u32 protocol_version)
+			u32 protocol_version,
+			u32 max_responder_resources,
+			u32 max_initiator_depth)
 {
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct rds_ib_device *rds_ibdev = ic->rds_ibdev;
+
 	memset(conn_param, 0, sizeof(struct rdma_conn_param));
-	/* XXX tune these? */
-	conn_param->responder_resources = 1;
-	conn_param->initiator_depth = 1;
+
+	conn_param->responder_resources =
+		min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources);
+	conn_param->initiator_depth =
+		min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth);
 	conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7);
 	conn_param->rnr_retry_count = 7;
 
 	if (dp) {
-		struct rds_ib_connection *ic = conn->c_transport_data;
-
 		memset(dp, 0, sizeof(*dp));
 		dp->dp_saddr = conn->c_laddr;
 		dp->dp_daddr = conn->c_faddr;
@@ -189,7 +232,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn,
 
 static void rds_ib_cq_event_handler(struct ib_event *event, void *data)
 {
-	rdsdebug("event %u data %p\n", event->event, data);
+	rdsdebug("event %u (%s) data %p\n",
+		 event->event, rds_ib_event_str(event->event), data);
 }
 
 static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
@@ -197,16 +241,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data)
 	struct rds_connection *conn = data;
 	struct rds_ib_connection *ic = conn->c_transport_data;
 
-	rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
+	rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event,
+		 rds_ib_event_str(event->event));
 
 	switch (event->event) {
 	case IB_EVENT_COMM_EST:
 		rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
 		break;
 	default:
-		rdsdebug("Fatal QP Event %u "
+		rdsdebug("Fatal QP Event %u (%s) "
 			"- connection %pI4->%pI4, reconnecting\n",
-			event->event, &conn->c_laddr, &conn->c_faddr);
+			event->event, rds_ib_event_str(event->event),
+			&conn->c_laddr, &conn->c_faddr);
 		rds_conn_drop(conn);
 		break;
 	}
@@ -224,18 +270,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 	struct rds_ib_device *rds_ibdev;
 	int ret;
 
-	/* rds_ib_add_one creates a rds_ib_device object per IB device,
-	 * and allocates a protection domain, memory range and FMR pool
-	 * for each. If that fails for any reason, it will not register
-	 * the rds_ibdev at all.
+	/*
+	 * It's normal to see a null device if an incoming connection races
+	 * with device removal, so we don't print a warning.
 	 */
-	rds_ibdev = ib_get_client_data(dev, &rds_ib_client);
-	if (rds_ibdev == NULL) {
-		if (printk_ratelimit())
-			printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n",
-					dev->name);
+	rds_ibdev = rds_ib_get_client_data(dev);
+	if (!rds_ibdev)
 		return -EOPNOTSUPP;
-	}
+
+	/* add the conn now so that connection establishment has the dev */
+	rds_ib_add_conn(rds_ibdev, conn);
 
 	if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1)
 		rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1);
@@ -306,7 +350,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 					   ic->i_send_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_send_hdrs_dma, GFP_KERNEL);
-	if (ic->i_send_hdrs == NULL) {
+	if (!ic->i_send_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent send failed\n");
 		goto out;
@@ -316,7 +360,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 					   ic->i_recv_ring.w_nr *
 						sizeof(struct rds_header),
 					   &ic->i_recv_hdrs_dma, GFP_KERNEL);
-	if (ic->i_recv_hdrs == NULL) {
+	if (!ic->i_recv_hdrs) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent recv failed\n");
 		goto out;
@@ -324,22 +368,24 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 
 	ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
 				       &ic->i_ack_dma, GFP_KERNEL);
-	if (ic->i_ack == NULL) {
+	if (!ic->i_ack) {
 		ret = -ENOMEM;
 		rdsdebug("ib_dma_alloc_coherent ack failed\n");
 		goto out;
 	}
 
-	ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
-	if (ic->i_sends == NULL) {
+	ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_sends) {
 		ret = -ENOMEM;
 		rdsdebug("send allocation failed\n");
 		goto out;
 	}
 	memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work));
 
-	ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work));
-	if (ic->i_recvs == NULL) {
+	ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work),
+				   ibdev_to_node(dev));
+	if (!ic->i_recvs) {
 		ret = -ENOMEM;
 		rdsdebug("recv allocation failed\n");
 		goto out;
@@ -352,6 +398,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
 		 ic->i_send_cq, ic->i_recv_cq);
 
 out:
+	rds_ib_dev_put(rds_ibdev);
 	return ret;
 }
 
@@ -409,7 +456,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 	struct rds_ib_connection *ic = NULL;
 	struct rdma_conn_param conn_param;
 	u32 version;
-	int err, destroy = 1;
+	int err = 1, destroy = 1;
 
 	/* Check whether the remote protocol version matches ours.
 	 */
 	version = rds_ib_protocol_compatible(event);
@@ -448,7 +495,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 			/* Wait and see - our connect may still be succeeding */
 			rds_ib_stats_inc(s_ib_connect_raced);
 		}
-		mutex_unlock(&conn->c_cm_lock);
 		goto out;
 	}
 
@@ -479,20 +525,20 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 		goto out;
 	}
 
-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version,
+		event->param.conn.responder_resources,
+		event->param.conn.initiator_depth);
 
 	/* rdma_accept() calls rdma_reject() internally if it fails */
 	err = rdma_accept(cm_id, &conn_param);
-	mutex_unlock(&conn->c_cm_lock);
-	if (err) {
+	if (err)
 		rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err);
-		goto out;
-	}
-
-	return 0;
 
 out:
-	rdma_reject(cm_id, NULL, 0);
+	if (conn)
+		mutex_unlock(&conn->c_cm_lock);
+	if (err)
+		rdma_reject(cm_id, NULL, 0);
 	return destroy;
 }
 
@@ -516,8 +562,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id)
 		goto out;
 	}
 
-	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
+	rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION,
+		UINT_MAX, UINT_MAX);
 	ret = rdma_connect(cm_id, &conn_param);
 	if (ret)
 		rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret);
@@ -601,9 +647,19 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 				 ic->i_cm_id, err);
 		}
 
+		/*
+		 * We want to wait for tx and rx completion to finish
+		 * before we tear down the connection, but we have to be
+		 * careful not to get stuck waiting on a send ring that
+		 * only has unsignaled sends in it. We've shut down new
+		 * sends before getting here so by waiting for signaled
+		 * sends to complete we're assured that there will be no
+		 * more tx processing.
+		 */
 		wait_event(rds_ib_ring_empty_wait,
-			   rds_ib_ring_empty(&ic->i_send_ring) &&
-			   rds_ib_ring_empty(&ic->i_recv_ring));
+			   rds_ib_ring_empty(&ic->i_recv_ring) &&
+			   (atomic_read(&ic->i_signaled_sends) == 0));
+		tasklet_kill(&ic->i_recv_tasklet);
 
 		if (ic->i_send_hdrs)
 			ib_dma_free_coherent(dev,
@@ -654,9 +710,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
 	BUG_ON(ic->rds_ibdev);
 
 	/* Clear pending transmit */
-	if (ic->i_rm) {
-		rds_message_put(ic->i_rm);
-		ic->i_rm = NULL;
+	if (ic->i_data_op) {
+		struct rds_message *rm;
+
+		rm = container_of(ic->i_data_op, struct rds_message, data);
+		rds_message_put(rm);
+		ic->i_data_op = NULL;
 	}
 
 	/* Clear the ACK state */
@@ -690,12 +749,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 {
 	struct rds_ib_connection *ic;
 	unsigned long flags;
+	int ret;
 
 	/* XXX too lazy?
 	 */
 	ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL);
-	if (ic == NULL)
+	if (!ic)
 		return -ENOMEM;
 
+	ret = rds_ib_recv_alloc_caches(ic);
+	if (ret) {
+		kfree(ic);
+		return ret;
+	}
+
 	INIT_LIST_HEAD(&ic->ib_node);
 	tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn,
 		     (unsigned long) ic);
@@ -703,6 +769,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp)
 #ifndef KERNEL_HAS_ATOMIC64
 	spin_lock_init(&ic->i_ack_lock);
 #endif
+	atomic_set(&ic->i_signaled_sends, 0);
 
 	/*
 	 * rds_ib_conn_shutdown() waits for these to be emptied so they
@@ -744,6 +811,8 @@ void rds_ib_conn_free(void *arg)
 	list_del(&ic->ib_node);
 	spin_unlock_irq(lock_ptr);
 
+	rds_ib_recv_free_caches(ic);
+
 	kfree(ic);
 }
 
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a54cd63..8f6e221 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -32,11 +32,16 @@
  */
 #include <linux/kernel.h>
 #include <linux/slab.h>
+#include <linux/rculist.h>
 
 #include "rds.h"
-#include "rdma.h"
 #include "ib.h"
+#include "xlist.h"
 
+struct workqueue_struct *rds_ib_fmr_wq;
+
+static DEFINE_PER_CPU(unsigned long, clean_list_grace);
+#define CLEAN_LIST_BUSY_BIT 0
 
 /*
  * This is stored as mr->r_trans_private.
@@ -45,7 +50,11 @@ struct rds_ib_mr {
 	struct rds_ib_device	*device;
 	struct rds_ib_mr_pool	*pool;
 	struct ib_fmr		*fmr;
-	struct list_head	list;
+
+	struct xlist_head	xlist;
+
+	/* unmap_list is for freeing */
+	struct list_head	unmap_list;
 	unsigned int		remap_count;
 
 	struct scatterlist	*sg;
@@ -59,14 +68,16 @@ struct rds_ib_mr {
  */
 struct rds_ib_mr_pool {
 	struct mutex		flush_lock;	/* serialize fmr invalidate */
-	struct work_struct	flush_worker;	/* flush worker */
+	struct delayed_work	flush_worker;	/* flush worker */
 
-	spinlock_t		list_lock;	/* protect variables below */
 	atomic_t		item_count;	/* total # of MRs */
 	atomic_t		dirty_count;	/* # dirty of MRs */
-	struct list_head	drop_list;	/* MRs that have reached their max_maps limit */
-	struct list_head	free_list;	/* unused MRs */
-	struct list_head	clean_list;	/* unused & unamapped MRs */
+
+	struct xlist_head	drop_list;	/* MRs that have reached their max_maps limit */
+	struct xlist_head	free_list;	/* unused MRs */
+	struct xlist_head	clean_list;	/* global unused & unmapped MRs */
+	wait_queue_head_t	flush_wait;
+
 	atomic_t		free_pinned;	/* memory pinned by free MRs */
 	unsigned long		max_items;
 	unsigned long		max_items_soft;
@@ -74,7 +85,7 @@ struct rds_ib_mr_pool {
 	struct ib_fmr_attr	fmr_attr;
 };
 
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all);
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
 static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
 
@@ -83,16 +94,17 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
 	struct rds_ib_device *rds_ibdev;
 	struct rds_ib_ipaddr *i_ipaddr;
 
-	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
-		spin_lock_irq(&rds_ibdev->spinlock);
-		list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
+	rcu_read_lock();
+	list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) {
+		list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 			if (i_ipaddr->ipaddr == ipaddr) {
-				spin_unlock_irq(&rds_ibdev->spinlock);
+				atomic_inc(&rds_ibdev->refcount);
+				rcu_read_unlock();
 				return rds_ibdev;
 			}
 		}
-		spin_unlock_irq(&rds_ibdev->spinlock);
 	}
+	rcu_read_unlock();
 
 	return NULL;
 }
@@ -108,7 +120,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	i_ipaddr->ipaddr = ipaddr;
 
 	spin_lock_irq(&rds_ibdev->spinlock);
-	list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
+	list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list);
 	spin_unlock_irq(&rds_ibdev->spinlock);
 
 	return 0;
@@ -116,17 +128,24 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 
 static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 {
-	struct rds_ib_ipaddr *i_ipaddr, *next;
+	struct rds_ib_ipaddr *i_ipaddr;
+	struct rds_ib_ipaddr *to_free = NULL;
+
 
 	spin_lock_irq(&rds_ibdev->spinlock);
-	list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) {
+	list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) {
 		if (i_ipaddr->ipaddr == ipaddr) {
-			list_del(&i_ipaddr->list);
-			kfree(i_ipaddr);
+			list_del_rcu(&i_ipaddr->list);
+			to_free = i_ipaddr;
 			break;
 		}
 	}
 	spin_unlock_irq(&rds_ibdev->spinlock);
+
+	if (to_free) {
+		synchronize_rcu();
+		kfree(to_free);
+	}
 }
 
 int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
@@ -134,8 +153,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr)
 	struct rds_ib_device *rds_ibdev_old;
 
 	rds_ibdev_old = rds_ib_get_device(ipaddr);
-	if (rds_ibdev_old)
+	if (rds_ibdev_old) {
 		rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr);
+		rds_ib_dev_put(rds_ibdev_old);
+	}
 
 	return rds_ib_add_ipaddr(rds_ibdev, ipaddr);
 }
@@ -156,6 +177,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con
 	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = rds_ibdev;
+	atomic_inc(&rds_ibdev->refcount);
 }
 
 void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn)
@@ -175,18 +197,18 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *
 	spin_unlock(&ib_nodev_conns_lock);
 
 	ic->rds_ibdev = NULL;
+	rds_ib_dev_put(rds_ibdev);
 }
 
-void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock)
+void rds_ib_destroy_nodev_conns(void)
 {
 	struct rds_ib_connection *ic, *_ic;
 	LIST_HEAD(tmp_list);
 
 	/* avoid calling conn_destroy with irqs off */
-	spin_lock_irq(list_lock);
-	list_splice(list, &tmp_list);
-	INIT_LIST_HEAD(list);
-	spin_unlock_irq(list_lock);
+	spin_lock_irq(&ib_nodev_conns_lock);
+	list_splice(&ib_nodev_conns, &tmp_list);
+	spin_unlock_irq(&ib_nodev_conns_lock);
 
 	list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node)
 		rds_conn_destroy(ic->conn);
@@ -200,12 +222,12 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev)
 	if (!pool)
 		return ERR_PTR(-ENOMEM);
 
-	INIT_LIST_HEAD(&pool->free_list);
-	INIT_LIST_HEAD(&pool->drop_list);
-	INIT_LIST_HEAD(&pool->clean_list);
+	INIT_XLIST_HEAD(&pool->free_list);
+	INIT_XLIST_HEAD(&pool->drop_list);
+	INIT_XLIST_HEAD(&pool->clean_list);
 	mutex_init(&pool->flush_lock);
-	spin_lock_init(&pool->list_lock);
-	INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+	init_waitqueue_head(&pool->flush_wait);
+	INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
 
 	pool->fmr_attr.max_pages = fmr_message_size;
 	pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
@@ -233,34 +255,60 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
 
 void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
 {
-	flush_workqueue(rds_wq);
-	rds_ib_flush_mr_pool(pool, 1);
+	cancel_delayed_work_sync(&pool->flush_worker);
+	rds_ib_flush_mr_pool(pool, 1, NULL);
 	WARN_ON(atomic_read(&pool->item_count));
 	WARN_ON(atomic_read(&pool->free_pinned));
 	kfree(pool);
 }
 
+static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl,
+			 struct rds_ib_mr **ibmr_ret)
+{
+	struct xlist_head *ibmr_xl;
+	ibmr_xl = xlist_del_head_fast(xl);
+	*ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist);
+}
+
 static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
 {
 	struct rds_ib_mr *ibmr = NULL;
-	unsigned long flags;
+	struct xlist_head *ret;
+	unsigned long *flag;
 
-	spin_lock_irqsave(&pool->list_lock, flags);
-	if (!list_empty(&pool->clean_list)) {
-		ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list);
-		list_del_init(&ibmr->list);
-	}
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+	preempt_disable();
+	flag = &__get_cpu_var(clean_list_grace);
+	set_bit(CLEAN_LIST_BUSY_BIT, flag);
+	ret = xlist_del_head(&pool->clean_list);
+	if (ret)
+		ibmr = list_entry(ret, struct rds_ib_mr, xlist);
+	clear_bit(CLEAN_LIST_BUSY_BIT, flag);
+	preempt_enable();
 	return ibmr;
 }
 
+static inline void wait_clean_list_grace(void)
+{
+	int cpu;
+	unsigned long *flag;
+
+	for_each_online_cpu(cpu) {
+		flag = &per_cpu(clean_list_grace, cpu);
+		while (test_bit(CLEAN_LIST_BUSY_BIT, flag))
+			cpu_relax();
+	}
+}
+
 static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 {
 	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 	struct rds_ib_mr *ibmr = NULL;
 	int err = 0, iter = 0;
 
+	if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
+		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+
 	while (1) {
 		ibmr = rds_ib_reuse_fmr(pool);
 		if (ibmr)
			break;
@@ -287,19 +335,24 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev)
 
 		/* We do have some empty MRs. Flush them out. */
 		rds_ib_stats_inc(s_ib_rdma_mr_pool_wait);
-		rds_ib_flush_mr_pool(pool, 0);
+		rds_ib_flush_mr_pool(pool, 0, &ibmr);
+		if (ibmr)
+			return ibmr;
 	}
 
-	ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
+	ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
 	if (!ibmr) {
 		err = -ENOMEM;
 		goto out_no_cigar;
 	}
 
+	memset(ibmr, 0, sizeof(*ibmr));
+
 	ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
 			(IB_ACCESS_LOCAL_WRITE |
 			 IB_ACCESS_REMOTE_READ |
-			 IB_ACCESS_REMOTE_WRITE),
+			 IB_ACCESS_REMOTE_WRITE|
+			 IB_ACCESS_REMOTE_ATOMIC),
 			&pool->fmr_attr);
 	if (IS_ERR(ibmr->fmr)) {
 		err = PTR_ERR(ibmr->fmr);
@@ -367,7 +420,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm
 	if (page_cnt > fmr_message_size)
 		return -EINVAL;
 
-	dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC);
+	dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+				 rdsibdev_to_node(rds_ibdev));
 	if (!dma_pages)
 		return -ENOMEM;
 
@@ -441,7 +495,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
 
 			/* FIXME we need a way to tell a r/w MR
 			 * from a r/o MR */
-			BUG_ON(in_interrupt());
+			BUG_ON(irqs_disabled());
 			set_page_dirty(page);
 			put_page(page);
 		}
@@ -477,33 +531,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr
 }
 
 /*
+ * given an xlist of mrs, put them all into the list_head for more processing
+ */
+static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head splice;
+	struct xlist_head *cur;
+	struct xlist_head *next;
+
+	splice.next = NULL;
+	xlist_splice(xlist, &splice);
+	cur = splice.next;
+	while (cur) {
+		next = cur->next;
+		ibmr = list_entry(cur, struct rds_ib_mr, xlist);
+		list_add_tail(&ibmr->unmap_list, list);
+		cur = next;
+	}
+}
+
+/*
+ * this takes a list head of mrs and turns it into an xlist of clusters.
+ * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for
+ * reuse.
+ */
+static void list_append_to_xlist(struct rds_ib_mr_pool *pool,
+				struct list_head *list, struct xlist_head *xlist,
+				struct xlist_head **tail_ret)
+{
+	struct rds_ib_mr *ibmr;
+	struct xlist_head *cur_mr = xlist;
+	struct xlist_head *tail_mr = NULL;
+
+	list_for_each_entry(ibmr, list, unmap_list) {
+		tail_mr = &ibmr->xlist;
+		tail_mr->next = NULL;
+		cur_mr->next = tail_mr;
+		cur_mr = tail_mr;
+	}
+	*tail_ret = tail_mr;
+}
+
+/*
  * Flush our pool of MRs.
  * At a minimum, all currently unused MRs are unmapped.
  * If the number of MRs allocated exceeds the limit, we also try
  * to free as many MRs as needed to get back to this limit.
  */
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
+static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+				int free_all, struct rds_ib_mr **ibmr_ret)
 {
 	struct rds_ib_mr *ibmr, *next;
+	struct xlist_head clean_xlist;
+	struct xlist_head *clean_tail;
 	LIST_HEAD(unmap_list);
 	LIST_HEAD(fmr_list);
 	unsigned long unpinned = 0;
-	unsigned long flags;
 	unsigned int nfreed = 0, ncleaned = 0, free_goal;
 	int ret = 0;
 
 	rds_ib_stats_inc(s_ib_rdma_mr_pool_flush);
 
-	mutex_lock(&pool->flush_lock);
+	if (ibmr_ret) {
+		DEFINE_WAIT(wait);
+		while (!mutex_trylock(&pool->flush_lock)) {
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+
+			prepare_to_wait(&pool->flush_wait, &wait,
+					TASK_UNINTERRUPTIBLE);
+			if (xlist_empty(&pool->clean_list))
+				schedule();
+
+			ibmr = rds_ib_reuse_fmr(pool);
+			if (ibmr) {
+				*ibmr_ret = ibmr;
+				finish_wait(&pool->flush_wait, &wait);
+				goto out_nolock;
+			}
+		}
+		finish_wait(&pool->flush_wait, &wait);
+	} else
+		mutex_lock(&pool->flush_lock);
+
+	if (ibmr_ret) {
+		ibmr = rds_ib_reuse_fmr(pool);
+		if (ibmr) {
+			*ibmr_ret = ibmr;
+			goto out;
		}
+	}
 
-	spin_lock_irqsave(&pool->list_lock, flags);
 	/* Get the list of all MRs to be dropped. Ordering matters -
-	 * we want to put drop_list ahead of free_list.
+	 * we want to put drop_list ahead of free_list.
+	 */
+	xlist_append_to_list(&pool->drop_list, &unmap_list);
+	xlist_append_to_list(&pool->free_list, &unmap_list);
 	if (free_all)
-		list_splice_init(&pool->clean_list, &unmap_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+		xlist_append_to_list(&pool->clean_list, &unmap_list);
 
 	free_goal = rds_ib_flush_goal(pool, free_all);
@@ -511,19 +641,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all)
 		goto out;
 
 	/* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
-	list_for_each_entry(ibmr, &unmap_list, list)
+	list_for_each_entry(ibmr, &unmap_list, unmap_list)
 		list_add(&ibmr->fmr->list, &fmr_list);
+
 	ret = ib_unmap_fmr(&fmr_list);
 	if (ret)
 		printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
 
 	/* Now we can destroy the DMA mapping and unpin any pages */
-	list_for_each_entry_safe(ibmr, next, &unmap_list, list) {
+	list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
 		unpinned += ibmr->sg_len;
 		__rds_ib_teardown_mr(ibmr);
 		if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) {
 			rds_ib_stats_inc(s_ib_rdma_mr_free);
-			list_del(&ibmr->list);
+			list_del(&ibmr->unmap_list);
 			ib_dealloc_fmr(ibmr->fmr);
 			kfree(ibmr);
 			nfreed++;
 		}
 		ncleaned++;
 	}
 
-	spin_lock_irqsave(&pool->list_lock, flags);
-	list_splice(&unmap_list, &pool->clean_list);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
+	if (!list_empty(&unmap_list)) {
+		/* we have to make sure that none of the things we're about
+		 * to put on the clean list would race with other cpus trying
+		 * to pull items off. The xlist would explode if we managed to
+		 * remove something from the clean list and then add it back again
+		 * while another CPU was spinning on that same item in xlist_del_head.
+		 *
+		 * This is pretty unlikely, but just in case wait for an xlist grace period
+		 * here before adding anything back into the clean list.
+		 */
+		wait_clean_list_grace();
+
+		list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail);
+		if (ibmr_ret)
+			refill_local(pool, &clean_xlist, ibmr_ret);
+
+		/* refill_local may have emptied our list */
+		if (!xlist_empty(&clean_xlist))
+			xlist_add(clean_xlist.next, clean_tail, &pool->clean_list);
+
+	}
 
 	atomic_sub(unpinned, &pool->free_pinned);
 	atomic_sub(ncleaned, &pool->dirty_count);
@@ -541,14 +690,35 @@ out:
 	mutex_unlock(&pool->flush_lock);
+	if (waitqueue_active(&pool->flush_wait))
+		wake_up(&pool->flush_wait);
+out_nolock:
 	return ret;
 }
 
+int rds_ib_fmr_init(void)
+{
+	rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
+	if (!rds_ib_fmr_wq)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * By the time this is called all the IB devices should have been torn down and
+ * had their pools freed. As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
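+ *
+ * (That ordering is visible in rds_ib_exit() earlier in this patch:
+ * rds_ib_unregister_client() flushes rds_wq, which runs the per-device
+ * frees and thus rds_ib_destroy_mr_pool()'s cancel_delayed_work_sync(),
+ * before rds_ib_fmr_exit() destroys this workqueue.)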
+ */
+void rds_ib_fmr_exit(void)
+{
+	destroy_workqueue(rds_ib_fmr_wq);
+}
+
 static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
 {
-	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker);
+	struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work);
 
-	rds_ib_flush_mr_pool(pool, 0);
+	rds_ib_flush_mr_pool(pool, 0, NULL);
 }
 
 void rds_ib_free_mr(void *trans_private, int invalidate)
@@ -556,47 +726,49 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
 	struct rds_ib_mr *ibmr = trans_private;
 	struct rds_ib_device *rds_ibdev = ibmr->device;
 	struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
-	unsigned long flags;
 
 	rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
 
 	/* Return it to the pool's free list */
-	spin_lock_irqsave(&pool->list_lock, flags);
 	if (ibmr->remap_count >= pool->fmr_attr.max_maps)
-		list_add(&ibmr->list, &pool->drop_list);
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list);
 	else
-		list_add(&ibmr->list, &pool->free_list);
+		xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list);
 
 	atomic_add(ibmr->sg_len, &pool->free_pinned);
 	atomic_inc(&pool->dirty_count);
-	spin_unlock_irqrestore(&pool->list_lock, flags);
 
 	/* If we've pinned too many pages, request a flush */
 	if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
 	    atomic_read(&pool->dirty_count) >= pool->max_items / 10)
-		queue_work(rds_wq, &pool->flush_worker);
+		queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
 
 	if (invalidate) {
 		if (likely(!in_interrupt())) {
-			rds_ib_flush_mr_pool(pool, 0);
+			rds_ib_flush_mr_pool(pool, 0, NULL);
 		} else {
 			/* We get here if the user created a MR marked
 			 * as use_once and invalidate at the same time. */
-			queue_work(rds_wq, &pool->flush_worker);
+			queue_delayed_work(rds_ib_fmr_wq,
+					   &pool->flush_worker, 10);
 		}
 	}
+
+	rds_ib_dev_put(rds_ibdev);
 }
 
 void rds_ib_flush_mrs(void)
 {
 	struct rds_ib_device *rds_ibdev;
 
+	down_read(&rds_ib_devices_lock);
 	list_for_each_entry(rds_ibdev, &rds_ib_devices, list) {
 		struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool;
 
 		if (pool)
-			rds_ib_flush_mr_pool(pool, 0);
+			rds_ib_flush_mr_pool(pool, 0, NULL);
 	}
+	up_read(&rds_ib_devices_lock);
 }
 
 void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
@@ -628,6 +800,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 		printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
 
 	ibmr->device = rds_ibdev;
+	rds_ibdev = NULL;
 
  out:
 	if (ret) {
@@ -635,5 +808,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
 			rds_ib_free_mr(ibmr, 0);
 		ibmr = ERR_PTR(ret);
 	}
+	if (rds_ibdev)
+		rds_ib_dev_put(rds_ibdev);
 	return ibmr;
 }
+
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index c74e990..e29e0ca 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -43,42 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab;
 static struct kmem_cache *rds_ib_frag_slab;
 static atomic_t rds_ib_allocation = ATOMIC_INIT(0);
 
-static void rds_ib_frag_drop_page(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	__free_page(frag->f_page);
-	frag->f_page = NULL;
-}
-
-static void rds_ib_frag_free(struct rds_page_frag *frag)
-{
-	rdsdebug("frag %p page %p\n", frag, frag->f_page);
-	BUG_ON(frag->f_page != NULL);
-	kmem_cache_free(rds_ib_frag_slab, frag);
-}
-
-/*
- * We map a page at a time. Its fragments are posted in order. This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic,
-				   struct rds_ib_recv_work *recv)
-{
-	struct rds_page_frag *frag = recv->r_frag;
-
-	rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
-	if (frag->f_mapped)
-		ib_dma_unmap_page(ic->i_cm_id->device,
-			       frag->f_mapped,
-			       RDS_FRAG_SIZE, DMA_FROM_DEVICE);
-	frag->f_mapped = 0;
-}
-
 void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 {
 	struct rds_ib_recv_work *recv;
@@ -95,16 +59,161 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic)
 		recv->r_wr.sg_list = recv->r_sge;
 		recv->r_wr.num_sge = RDS_IB_RECV_SGE;
 
-		sge = rds_ib_data_sge(ic, recv->r_sge);
+		sge = &recv->r_sge[0];
+		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
+		sge->length = sizeof(struct rds_header);
+		sge->lkey = ic->i_mr->lkey;
+
+		sge = &recv->r_sge[1];
 		sge->addr = 0;
 		sge->length = RDS_FRAG_SIZE;
 		sge->lkey = ic->i_mr->lkey;
+	}
+}
 
-		sge = rds_ib_header_sge(ic, recv->r_sge);
-		sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
-		sge->length = sizeof(struct rds_header);
-		sge->lkey = ic->i_mr->lkey;
+/*
+ * The entire 'from' list, including the from element itself, is put on
+ * to the tail of the 'to' list.
+ */
+static void list_splice_entire_tail(struct list_head *from,
+				    struct list_head *to)
+{
+	struct list_head *from_last = from->prev;
+
+	list_splice_tail(from_last, to);
+	list_add_tail(from_last, to);
+}
+
+static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache)
+{
+	struct list_head *tmp;
+
+	tmp = xchg(&cache->xfer, NULL);
+	if (tmp) {
+		if (cache->ready)
+			list_splice_entire_tail(tmp, cache->ready);
+		else
+			cache->ready = tmp;
+	}
+}
+
+static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	cache->percpu = alloc_percpu(struct rds_ib_cache_head);
+	if (!cache->percpu)
+		return -ENOMEM;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		head->first = NULL;
+		head->count = 0;
+	}
+	cache->xfer = NULL;
+	cache->ready = NULL;
+
+	return 0;
+}
+
+int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic)
+{
+	int ret;
+
+	ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs);
+	if (!ret) {
+		ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags);
+		if (ret)
+			free_percpu(ic->i_cache_incs.percpu);
 	}
+
+	return ret;
+}
+
+static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache,
+					  struct list_head *caller_list)
+{
+	struct rds_ib_cache_head *head;
+	int cpu;
+
+	for_each_possible_cpu(cpu) {
+		head = per_cpu_ptr(cache->percpu, cpu);
+		if (head->first) {
+			list_splice_entire_tail(head->first, caller_list);
+			head->first = NULL;
+		}
+	}
+
+	if (cache->ready) {
+		list_splice_entire_tail(cache->ready, caller_list);
+		cache->ready = NULL;
+	}
+}
+
+void rds_ib_recv_free_caches(struct rds_ib_connection *ic)
+{
+	struct rds_ib_incoming *inc;
+	struct rds_ib_incoming *inc_tmp;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *frag_tmp;
+	LIST_HEAD(list);
+
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list);
+	free_percpu(ic->i_cache_incs.percpu);
+
+	list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) {
+		list_del(&inc->ii_cache_entry);
+		WARN_ON(!list_empty(&inc->ii_frags));
+		kmem_cache_free(rds_ib_incoming_slab, inc);
+	}
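+
+	/* now drain and free the frag cache in the same way */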
+	rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
+	rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list);
+	free_percpu(ic->i_cache_frags.percpu);
+
+	list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) {
+		list_del(&frag->f_cache_entry);
+		WARN_ON(!list_empty(&frag->f_item));
+		kmem_cache_free(rds_ib_frag_slab, frag);
+	}
+}
+
+/* fwd decl */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache);
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache);
+
+
+/* Recycle frag and attached recv buffer f_sg */
+static void rds_ib_frag_free(struct rds_ib_connection *ic,
+			     struct rds_page_frag *frag)
+{
+	rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg));
+
+	rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags);
+}
+
+/* Recycle inc after freeing attached frags */
+void rds_ib_inc_free(struct rds_incoming *inc)
+{
+	struct rds_ib_incoming *ibinc;
+	struct rds_page_frag *frag;
+	struct rds_page_frag *pos;
+	struct rds_ib_connection *ic = inc->i_conn->c_transport_data;
+
+	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+
+	/* Free attached frags */
+	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
+		list_del_init(&frag->f_item);
+		rds_ib_frag_free(ic, frag);
+	}
+	BUG_ON(!list_empty(&ibinc->ii_frags));
+
+	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
+	rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs);
 }
 
 static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
@@ -115,10 +224,8 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic,
 		recv->r_ibinc = NULL;
 	}
 	if (recv->r_frag) {
-		rds_ib_recv_unmap_page(ic, recv);
-		if (recv->r_frag->f_page)
-			rds_ib_frag_drop_page(recv->r_frag);
-		rds_ib_frag_free(recv->r_frag);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
+		rds_ib_frag_free(ic, recv->r_frag);
 		recv->r_frag = NULL;
 	}
 }
@@ -129,84 +236,111 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic)
 
 	for (i = 0; i < ic->i_recv_ring.w_nr; i++)
 		rds_ib_recv_clear_one(ic, &ic->i_recvs[i]);
-
-	if (ic->i_frag.f_page)
-		rds_ib_frag_drop_page(&ic->i_frag);
 }
 
-static int rds_ib_recv_refill_one(struct rds_connection *conn,
-				  struct rds_ib_recv_work *recv,
-				  gfp_t kptr_gfp, gfp_t page_gfp)
+static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic,
+						     gfp_t slab_mask)
 {
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	dma_addr_t dma_addr;
-	struct ib_sge *sge;
-	int ret = -ENOMEM;
+	struct rds_ib_incoming *ibinc;
+	struct list_head *cache_item;
+	int avail_allocs;
 
-	if (recv->r_ibinc == NULL) {
-		if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) {
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs);
+	if (cache_item) {
+		ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry);
+	} else {
+		avail_allocs = atomic_add_unless(&rds_ib_allocation,
+						 1, rds_ib_sysctl_max_recv_allocation);
+		if (!avail_allocs) {
 			rds_ib_stats_inc(s_ib_rx_alloc_limit);
-			goto out;
+			return NULL;
 		}
-		recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab,
-						 kptr_gfp);
-		if (recv->r_ibinc == NULL) {
+		ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask);
+		if (!ibinc) {
 			atomic_dec(&rds_ib_allocation);
-			goto out;
+			return NULL;
		}
-		INIT_LIST_HEAD(&recv->r_ibinc->ii_frags);
-		rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr);
 	}
+	INIT_LIST_HEAD(&ibinc->ii_frags);
+	rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr);
 
-	if (recv->r_frag == NULL) {
-		recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp);
-		if (recv->r_frag == NULL)
-			goto out;
-		INIT_LIST_HEAD(&recv->r_frag->f_item);
-		recv->r_frag->f_page = NULL;
+	return ibinc;
+}
+
+static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic,
+						    gfp_t slab_mask, gfp_t page_mask)
+{
+	struct rds_page_frag *frag;
+	struct list_head *cache_item;
+	int ret;
+
+	cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags);
+	if (cache_item) {
+		frag = container_of(cache_item, struct rds_page_frag, f_cache_entry);
+	} else {
+		frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask);
+		if (!frag)
+			return NULL;
+
+		sg_init_table(&frag->f_sg, 1);
+		ret = rds_page_remainder_alloc(&frag->f_sg,
+					       RDS_FRAG_SIZE, page_mask);
+		if (ret) {
+			kmem_cache_free(rds_ib_frag_slab, frag);
+			return NULL;
+		}
 	}
 
-	if (ic->i_frag.f_page == NULL) {
-		ic->i_frag.f_page = alloc_page(page_gfp);
-		if (ic->i_frag.f_page == NULL)
-			goto out;
-		ic->i_frag.f_offset = 0;
+	INIT_LIST_HEAD(&frag->f_item);
+
+	return frag;
+}
+
+static int rds_ib_recv_refill_one(struct rds_connection *conn,
+				  struct rds_ib_recv_work *recv, int prefill)
+{
+	struct rds_ib_connection *ic = conn->c_transport_data;
+	struct ib_sge *sge;
+	int ret = -ENOMEM;
+	gfp_t slab_mask = GFP_NOWAIT;
+	gfp_t page_mask = GFP_NOWAIT;
+
+	if (prefill) {
+		slab_mask = GFP_KERNEL;
+		page_mask = GFP_HIGHUSER;
 	}
 
-	dma_addr = ib_dma_map_page(ic->i_cm_id->device,
-				   ic->i_frag.f_page,
-				   ic->i_frag.f_offset,
-				   RDS_FRAG_SIZE,
-				   DMA_FROM_DEVICE);
-	if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
-		goto out;
+	if (!ic->i_cache_incs.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_incs);
+	if (!ic->i_cache_frags.ready)
+		rds_ib_cache_xfer_to_ready(&ic->i_cache_frags);
 
 	/*
-	 * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap()
-	 * must be called on this recv. This happens as completions hit
-	 * in order or on connection shutdown.
+	 * ibinc was taken from recv if recv contained the start of a message.
+	 * recvs that were continuations will still have this allocated.
 	 */
-	recv->r_frag->f_page = ic->i_frag.f_page;
-	recv->r_frag->f_offset = ic->i_frag.f_offset;
-	recv->r_frag->f_mapped = dma_addr;
+	if (!recv->r_ibinc) {
+		recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask);
+		if (!recv->r_ibinc)
+			goto out;
+	}
 
-	sge = rds_ib_data_sge(ic, recv->r_sge);
-	sge->addr = dma_addr;
-	sge->length = RDS_FRAG_SIZE;
+	WARN_ON(recv->r_frag); /* leak! */
+	recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask);
+	if (!recv->r_frag)
+		goto out;
+
+	ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg,
+			    1, DMA_FROM_DEVICE);
+	WARN_ON(ret != 1);
 
-	sge = rds_ib_header_sge(ic, recv->r_sge);
+	sge = &recv->r_sge[0];
 	sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) *
 		sizeof(struct rds_header);
 	sge->length = sizeof(struct rds_header);
 
-	get_page(recv->r_frag->f_page);
-
-	if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
-		ic->i_frag.f_offset += RDS_FRAG_SIZE;
-	} else {
-		put_page(ic->i_frag.f_page);
-		ic->i_frag.f_page = NULL;
-		ic->i_frag.f_offset = 0;
-	}
+	sge = &recv->r_sge[1];
+	sge->addr = sg_dma_address(&recv->r_frag->f_sg);
+	sge->length = sg_dma_len(&recv->r_frag->f_sg);
 
 	ret = 0;
 out:
@@ -216,13 +350,11 @@ out:
 /*
  * This tries to allocate and post unused work requests after making sure that
  * they have all the allocations they need to queue received fragments into
- * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
+ * sockets.
 *
 * -1 is returned if posting fails due to temporary resource exhaustion.
 */
-int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
-		       gfp_t page_gfp, int prefill)
+void rds_ib_recv_refill(struct rds_connection *conn, int prefill)
 {
 	struct rds_ib_connection *ic = conn->c_transport_data;
 	struct rds_ib_recv_work *recv;
@@ -236,28 +368,25 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 
 		if (pos >= ic->i_recv_ring.w_nr) {
 			printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", pos);
-			ret = -EINVAL;
 			break;
 		}
 
 		recv = &ic->i_recvs[pos];
-		ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
+		ret = rds_ib_recv_refill_one(conn, recv, prefill);
 		if (ret) {
-			ret = -1;
 			break;
 		}
 
 		/* XXX when can this fail? */
 		ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
 		rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv,
-			 recv->r_ibinc, recv->r_frag->f_page,
-			 (long) recv->r_frag->f_mapped, ret);
+			 recv->r_ibinc, sg_page(&recv->r_frag->f_sg),
+			 (long) sg_dma_address(&recv->r_frag->f_sg), ret);
 		if (ret) {
 			rds_ib_conn_error(conn, "recv post on "
 			       "%pI4 returned %d, disconnecting and "
 			       "reconnecting\n", &conn->c_faddr,
 			       ret);
-			ret = -1;
 			break;
 		}
 
@@ -270,37 +399,73 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
 
 	if (ret)
 		rds_ib_ring_unalloc(&ic->i_recv_ring, 1);
-	return ret;
 }
 
-void rds_ib_inc_purge(struct rds_incoming *inc)
+/*
+ * We want to recycle several types of recv allocations, like incs and frags.
+ * To use this, the *_free() function passes in the ptr to a list_head within
+ * the recyclee, as well as the cache to put it on.
+ *
+ * First, we put the memory on a percpu list. When this reaches a certain size,
+ * we move it to an intermediate non-percpu list in a lockless manner, with some
+ * xchg/cmpxchg wizardry.
+ *
+ * N.B. Instead of a list_head as the anchor, we use a single pointer, which can
+ * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and
+ * list_empty() will return true even when one element is actually present.
+ */
+static void rds_ib_recv_cache_put(struct list_head *new_item,
+				  struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-	struct rds_page_frag *frag;
-	struct rds_page_frag *pos;
+	unsigned long flags;
+	struct rds_ib_cache_head *chp;
+	struct list_head *old;
 
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
-	rdsdebug("purging ibinc %p inc %p\n", ibinc, inc);
+	local_irq_save(flags);
 
-	list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) {
-		list_del_init(&frag->f_item);
-		rds_ib_frag_drop_page(frag);
-		rds_ib_frag_free(frag);
-	}
+	chp = per_cpu_ptr(cache->percpu, smp_processor_id());
+	if (!chp->first)
+		INIT_LIST_HEAD(new_item);
+	else /* put on front */
+		list_add_tail(new_item, chp->first);
+	chp->first = new_item;
+	chp->count++;
+
+	if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT)
+		goto end;
+
+	/*
+	 * Return our per-cpu first list to the cache's xfer by atomically
+	 * grabbing the current xfer list, appending it to our per-cpu list,
+	 * and then atomically returning that entire list back to the
+	 * cache's xfer list as long as it's still empty.
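+	 *
+	 * (If another CPU installs entries in cache->xfer between our xchg
+	 * and the cmpxchg below, the cmpxchg fails with a non-NULL old
+	 * value; we then loop, absorb that list into ours, and try again.)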
+	 */
+	do {
+		old = xchg(&cache->xfer, NULL);
+		if (old)
+			list_splice_entire_tail(old, chp->first);
+		old = cmpxchg(&cache->xfer, NULL, chp->first);
+	} while (old);
+
+	chp->first = NULL;
+	chp->count = 0;
+end:
+	local_irq_restore(flags);
 }
 
-void rds_ib_inc_free(struct rds_incoming *inc)
+static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache)
 {
-	struct rds_ib_incoming *ibinc;
-
-	ibinc = container_of(inc, struct rds_ib_incoming, ii_inc);
+	struct list_head *head = cache->ready;
+
+	if (head) {
+		if (!list_empty(head)) {
+			cache->ready = head->next;
+			list_del_init(head);
+		} else
+			cache->ready = NULL;
+	}
 
-	rds_ib_inc_purge(inc);
-	rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc);
-	BUG_ON(!list_empty(&ibinc->ii_frags));
-	kmem_cache_free(rds_ib_incoming_slab, ibinc);
-	atomic_dec(&rds_ib_allocation);
-	BUG_ON(atomic_read(&rds_ib_allocation) < 0);
+	return head;
 }
 
 int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
@@ -336,13 +501,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov,
 		to_copy = min_t(unsigned long, to_copy, len - copied);
 
 		rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag "
-			 "[%p, %lu] + %lu\n",
+			 "[%p, %u] + %lu\n",
 			 to_copy, iov->iov_base, iov->iov_len, iov_off,
-			 frag->f_page, frag->f_offset, frag_off);
+			 sg_page(&frag->f_sg), frag->f_sg.offset, frag_off);
 
 		/* XXX needs + offset for multiple recvs per page */
-		ret = rds_page_copy_to_user(frag->f_page,
-					    frag->f_offset + frag_off,
+		ret = rds_page_copy_to_user(sg_page(&frag->f_sg),
					    frag->f_sg.offset + frag_off,
 					    iov->iov_base + iov_off,
 					    to_copy);
 		if (ret) {
@@ -557,47 +722,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic)
 	return rds_ib_get_ack(ic);
 }
 
-static struct rds_header *rds_ib_get_header(struct rds_connection *conn,
-					    struct rds_ib_recv_work *recv,
-					    u32 data_len)
-{
-	struct rds_ib_connection *ic = conn->c_transport_data;
-	void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs];
-	void *addr;
-	u32 misplaced_hdr_bytes;
-
-	/*
-	 * Support header at the front (RDS 3.1+) as well as header-at-end.
-	 *
-	 * Cases:
-	 * 1) header all in header buff (great!)
-	 * 2) header all in data page (copy all to header buff)
-	 * 3) header split across hdr buf + data page
-	 *    (move bit in hdr buff to end before copying other bit from data page)
-	 */
-	if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE)
-		return hdr_buff;
-
-	if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) {
-		addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-		memcpy(hdr_buff,
-		       addr + recv->r_frag->f_offset + data_len,
-		       sizeof(struct rds_header));
-		kunmap_atomic(addr, KM_SOFTIRQ0);
-		return hdr_buff;
-	}
-
-	misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len));
-
-	memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes);
-
-	addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0);
-	memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len,
-	       sizeof(struct rds_header) - misplaced_hdr_bytes);
-	kunmap_atomic(addr, KM_SOFTIRQ0);
-	return hdr_buff;
-}
-
 /*
  * It's kind of lame that we're copying from the posted receive pages into
  * long-lived bitmaps. We could have posted the bitmaps and rdma written into
@@ -639,7 +763,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
 		to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
 		BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
 
-		addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0);
+		addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0);
 		src = addr + frag_off;
 		dst = (void *)map->m_page_addrs[map_page] + map_off;
@@ -710,7 +834,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	}
 	data_len -= sizeof(struct rds_header);
 
-	ihdr = rds_ib_get_header(conn, recv, data_len);
+	ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
 
 	/* Validate the checksum. */
 	if (!rds_message_verify_checksum(ihdr)) {
@@ -742,12 +866,12 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	 * the inc is freed. We don't go that route, so we have to drop the
 	 * page ref ourselves. We can't just leave the page on the recv
 	 * because that confuses the dma mapping of pages and each recv's use
-	 * of a partial page. We can leave the frag, though, it will be
-	 * reused.
+	 * of a partial page.
 	 *
 	 * FIXME: Fold this into the code path below.
 	 */
-	rds_ib_frag_drop_page(recv->r_frag);
+	rds_ib_frag_free(ic, recv->r_frag);
+	recv->r_frag = NULL;
 	return;
 	}
 
@@ -757,7 +881,7 @@ static void rds_ib_process_recv(struct rds_connection *conn,
 	 * into the inc and save the inc so we can hang upcoming fragments
 	 * off its list.
 	 */
-	if (ibinc == NULL) {
+	if (!ibinc) {
 		ibinc = recv->r_ibinc;
 		recv->r_ibinc = NULL;
 		ic->i_ibinc = ibinc;
@@ -842,32 +966,38 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic,
 	struct rds_ib_recv_work *recv;
 
 	while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
-		rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
-			 (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
+		rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n",
+			 (unsigned long long)wc.wr_id, wc.status,
+			 rds_ib_wc_status_str(wc.status), wc.byte_len,
			 be32_to_cpu(wc.ex.imm_data));
 		rds_ib_stats_inc(s_ib_rx_cq_event);
 
 		recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)];
 
-		rds_ib_recv_unmap_page(ic, recv);
+		ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE);
 
 		/*
 		 * Also process recvs in connecting state because it is possible
 		 * to get a recv completion _before_ the rdmacm ESTABLISHED
 		 * event is processed.
 		 */
-		if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
+		if (wc.status == IB_WC_SUCCESS) {
+			rds_ib_process_recv(conn, recv, wc.byte_len, state);
+		} else {
 			/* We expect errors as the qp is drained during shutdown */
-			if (wc.status == IB_WC_SUCCESS) {
-				rds_ib_process_recv(conn, recv, wc.byte_len, state);
-			} else {
-				rds_ib_conn_error(conn, "recv completion on "
-				       "%pI4 had status %u, disconnecting and "
-				       "reconnecting\n", &conn->c_faddr,
-				       wc.status);
-			}
+			if (rds_conn_up(conn) || rds_conn_connecting(conn))
+				rds_ib_conn_error(conn, "recv completion on %pI4 had "
+						  "status %u (%s), disconnecting and "
+						  "reconnecting\n", &conn->c_faddr,
+						  wc.status,
+						  rds_ib_wc_status_str(wc.status));
 		}
 
+		/*
+		 * It's very important that we only free this ring entry if we've truly
+		 * freed the resources allocated to the entry. The refilling path can
+		 * leak if we don't.
+		 */
 		rds_ib_ring_free(&ic->i_recv_ring, 1);
 	}
 }
@@ -897,11 +1027,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data)
 	if (rds_ib_ring_empty(&ic->i_recv_ring))
 		rds_ib_stats_inc(s_ib_rx_ring_empty);
 
-	/*
-	 * If the ring is running low, then schedule the thread to refill.
- */ if (rds_ib_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + rds_ib_recv_refill(conn, 0); } int rds_ib_recv(struct rds_connection *conn) @@ -910,25 +1037,13 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. - */ - mutex_lock(&ic->i_recv_mutex); - if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) - ret = -ENOMEM; - else - rds_ib_stats_inc(s_ib_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - if (rds_conn_up(conn)) rds_ib_attempt_ack(ic); return ret; } -int __init rds_ib_recv_init(void) +int rds_ib_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; @@ -939,14 +1054,14 @@ int __init rds_ib_recv_init(void) rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", sizeof(struct rds_ib_incoming), - 0, 0, NULL); - if (rds_ib_incoming_slab == NULL) + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_incoming_slab) goto out; rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), - 0, 0, NULL); - if (rds_ib_frag_slab == NULL) + 0, SLAB_HWCACHE_ALIGN, NULL); + if (!rds_ib_frag_slab) kmem_cache_destroy(rds_ib_incoming_slab); else ret = 0; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 17fa808..71f373c 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -36,11 +36,49 @@ #include <linux/dmapool.h> #include "rds.h" -#include "rdma.h" #include "ib.h" -static void rds_ib_send_rdma_complete(struct rds_message *rm, - int wc_status) +static char *rds_ib_wc_status_strings[] = { +#define RDS_IB_WC_STATUS_STR(foo) \ + [IB_WC_##foo] = __stringify(IB_WC_##foo) + RDS_IB_WC_STATUS_STR(SUCCESS), + RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), + RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), + RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), + RDS_IB_WC_STATUS_STR(MW_BIND_ERR), + RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), + RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_OP_ERR), + RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), + RDS_IB_WC_STATUS_STR(INV_EECN_ERR), + RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), + RDS_IB_WC_STATUS_STR(FATAL_ERR), + RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), + RDS_IB_WC_STATUS_STR(GENERAL_ERR), +#undef RDS_IB_WC_STATUS_STR +}; + +char *rds_ib_wc_status_str(enum ib_wc_status status) +{ + return rds_str_array(rds_ib_wc_status_strings, + ARRAY_SIZE(rds_ib_wc_status_strings), status); +} + +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. 
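+ * IB_WC_SUCCESS maps to RDS_RDMA_SUCCESS; any status we do not
+ * specifically handle falls through to RDS_RDMA_OTHER_ERROR.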
+ */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) { int notify_status; @@ -60,69 +98,125 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, notify_status = RDS_RDMA_OTHER_ERROR; break; } - rds_rdma_send_complete(rm, notify_status); + complete(rm, notify_status); +} + +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); } static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op, + int wc_status) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } + + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); + + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); } -static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, - int wc_status) +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) { - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, - DMA_TO_DEVICE); - - if (rm->m_rdma_op != NULL) { - rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); - - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. 
- * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. - */ - rds_ib_send_rdma_complete(rm, wc_status); + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_ib_stats_inc(s_ib_atomic_cswp); + else + rds_ib_stats_inc(s_ib_atomic_fadd); +} + +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. + */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; } - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); + send->s_wr.opcode = 0xdead; - rds_message_put(rm); - send->s_rm = NULL; + return rm; } void rds_ib_send_init_ring(struct rds_ib_connection *ic) @@ -133,23 +227,18 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { struct ib_sge *sge; - send->s_rm = NULL; send->s_op = NULL; send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; - send->s_wr.num_sge = 1; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.send_flags = 0; send->s_wr.ex.imm_data = 0; - sge = rds_ib_data_sge(ic, send->s_sge); - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); + sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; + + send->s_sge[1].lkey = ic->i_mr->lkey; } } @@ -159,16 +248,24 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) u32 i; for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - if (send->s_wr.opcode == 0xdead) - continue; - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_ib_send_unmap_rdma(ic, send->s_op); + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); } } /* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before 
performing the atomic sub. + */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* * The _oldest/_free ring operations here race cleanly with the alloc/unalloc * operations performed in the send path. As the sender allocs and potentially * unallocs the next free entry in the ring it doesn't alter which is @@ -178,12 +275,14 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) { struct rds_connection *conn = context; struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_message *rm = NULL; struct ib_wc wc; struct rds_ib_send_work *send; u32 completed; u32 oldest; u32 i = 0; int ret; + int nr_sig = 0; rdsdebug("cq %p conn %p\n", cq, conn); rds_ib_stats_inc(s_ib_tx_cq_call); @@ -192,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rdsdebug("ib_req_notify_cq send failed: %d\n", ret); while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_tx_cq_event); @@ -210,51 +310,30 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_wr.opcode) { - case IB_WR_SEND: - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, wc.status); - break; - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ - break; - default: - if (printk_ratelimit()) - printk(KERN_NOTICE - "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_wr.opcode); - break; - } + rm = rds_ib_send_unmap_op(ic, send, wc.status); - send->s_wr.opcode = 0xdead; - send->s_wr.num_sge = 1; if (send->s_queued + HZ/2 < jiffies) rds_ib_stats_inc(s_ib_tx_stalled); - /* If a RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learn that the RDMA failed. 
*/ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) { - if (rm->m_rdma_op) - rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); - rds_ib_send_rdma_complete(rm, wc.status); - rds_message_put(rm); + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); } + rds_message_put(rm); + send->s_op = NULL; } oldest = (oldest + 1) % ic->i_send_ring.w_nr; } rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued)) @@ -262,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) /* We expect errors as the qp is drained during shutdown */ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); + rds_ib_conn_error(conn, "send completion on %pI4 had status " + "%u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc.status, + rds_ib_wc_status_str(wc.status)); } } } @@ -294,7 +373,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * credits (see rds_ib_send_add_credits below). * * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. * However, the ACK sending code is independent and can race with * message SENDs. * @@ -413,40 +492,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } -static inline void -rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) { - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_wr.send_flags = send_flags; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.num_sge = 2; - send->s_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - if (length != 0) { - sge = rds_ib_data_sge(ic, send->s_sge); - sge->addr = buffer; - sge->length = length; - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_wr.num_sge = 1; - sge = &send->s_sge[0]; + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. 
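+	 * With the default rds_ib_sysctl_max_unsig_wrs of 16 this signals
+	 * roughly one WR in sixteen, plus any send the caller explicitly
+	 * asks to be notified about.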
+ */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; + return 0; } /* @@ -475,13 +535,14 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, u32 pos; u32 i; u32 work_alloc; - u32 credit_alloc; + u32 credit_alloc = 0; u32 posted; u32 adv_credits = 0; int send_flags = 0; - int sent; + int bytes_sent = 0; int ret; int flow_controlled = 0; + int nr_sig = 0; BUG_ON(off % RDS_FRAG_SIZE); BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); @@ -507,14 +568,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - credit_alloc = work_alloc; if (ic->i_flowctl) { credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); adv_credits += posted; if (credit_alloc < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); work_alloc = credit_alloc; - flow_controlled++; + flow_controlled = 1; } if (work_alloc == 0) { set_bit(RDS_LL_SEND_FULL, &conn->c_flags); @@ -525,31 +585,25 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { - /* - printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (!ic->i_data_op) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.op_count = 0; } - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; rds_message_addref(rm); - ic->i_rm = rm; + ic->i_data_op = &rm->data; /* Finalize the header */ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) @@ -559,10 +613,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -582,99 +636,77 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Update adv_credits since we reset the ACK_REQUIRED bit. 
*/ - rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); - adv_credits += posted; - BUG_ON(adv_credits > 255); + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } } - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->m_sg[sg]; - sent = 0; - i = 0; - /* Sometimes you want to put a fence between an RDMA * READ and the following SEND. * We could either do this all the time * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ + /* Each frag gets a header. Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &ic->i_data_op->op_sg[sg]; + i = 0; + do { + unsigned int len = 0; - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { - unsigned int len; + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); - send = &ic->i_sends[pos]; + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); - rds_ib_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + off, len, - send_flags); + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]) { + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + send->s_wr.num_sge = 2; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } + send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].length = len; - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + bytes_sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } } + rds_ib_set_wr_signal_state(ic, send, 0); + /* * Always signal the last one if we're stopping due to flow control. 
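 * Otherwise an unsignaled tail WR could sit in the ring with no
 * completion to reclaim it while we wait for credits.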
*/ - if (flow_controlled && i == (work_alloc-1)) + if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + rdsdebug("send %p wr %p num_sge %u next %p\n", send, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); - sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { - scat++; - off = 0; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. */ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } - if (adv_credits) { + if (ic->i_flowctl && adv_credits) { struct rds_header *hdr = &ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ @@ -689,20 +721,25 @@ add_header: prev = send; pos = (pos + 1) % ic->i_send_ring.w_nr; - } + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.op_sg[rm->data.op_count]); /* Account the RDS header in the number of bytes we sent, but just once. * The caller has no concept of fragmentation. */ if (hdr_off == 0) - sent += sizeof(struct rds_header); + bytes_sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { - prev->s_rm = ic->i_rm; - prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - ic->i_rm = NULL; + if (scat == &rm->data.op_sg[rm->data.op_count]) { + prev->s_op = ic->i_data_op; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; + ic->i_data_op = NULL; } + /* Put back wrs & credits we didn't use */ if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; @@ -710,6 +747,9 @@ add_header: if (ic->i_flowctl && i < credit_alloc) rds_ib_send_add_credits(conn, credit_alloc - i); + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + /* XXX need to worry about failed_wr and partial sends. */ failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); @@ -720,32 +760,127 @@ add_header: printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; + rds_ib_sub_signaled(ic, nr_sig); + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; } rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); goto out; } - ret = sent; + ret = bytes_sent; out: BUG_ON(adv_credits); return ret; } -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. 
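+ * Both the masked CSWP and FADD cases below post a single work
+ * request; the mapped sg receives the value the remote address
+ * held before the operation.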
+ */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + int nr_sig = 0; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; + send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; + } else { /* FADD */ + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; + send->s_wr.wr.atomic.swap = 0; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_wr.wr.atomic.swap_mask = 0; + } + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; + send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? 
*/ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_mr->lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + + failed_wr = &send->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); + goto out; + } + + if (unlikely(failed_wr != &send->s_wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_wr); + } + +out: + return ret; +} + +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_send_work *send = NULL; struct rds_ib_send_work *first; struct rds_ib_send_work *prev; struct ib_send_wr *failed_wr; - struct rds_ib_device *rds_ibdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; u32 pos; u32 work_alloc; u32 i; @@ -753,29 +888,28 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) int sent; int ret; int num_sge; - - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - - /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + int nr_sig = 0; + + /* map the op the first time we see it */ + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } /* * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->r_count, rds_ibdev->max_sge); + i = ceil(op->op_count, max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -788,30 +922,24 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. 
- */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags = IB_SEND_SIGNALED; - } + send->s_op = NULL; + + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); - send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; - send->s_op = op; + send->s_wr.wr.rdma.rkey = op->op_rkey; - if (num_sge > rds_ibdev->max_sge) { - send->s_wr.num_sge = rds_ibdev->max_sge; - num_sge -= rds_ibdev->max_sge; + if (num_sge > max_sge) { + send->s_wr.num_sge = max_sge; + num_sge -= max_sge; } else { send->s_wr.num_sge = num_sge; } @@ -821,7 +949,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); @@ -843,15 +971,20 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = ic->i_sends; } - /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) - prev->s_wr.send_flags = IB_SEND_SIGNALED; + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; } + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, @@ -861,6 +994,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); goto out; } diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d2c904d..2d5965d 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rdma_mr_pool_flush", "ib_rdma_mr_pool_wait", "ib_rdma_mr_pool_depleted", + "ib_atomic_cswp", + "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 03f01cb..fc3da37 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; -unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; - /* * This sysctl does nothing. 
* @@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = { .extra2 = &rds_ib_sysctl_max_unsig_wr_max, }, { - .procname = "max_unsignaled_bytes", - .data = &rds_ib_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, - .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, - }, - { .procname = "max_recv_allocation", .data = &rds_ib_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), @@ -132,10 +119,10 @@ void rds_ib_sysctl_exit(void) unregister_sysctl_table(rds_ib_sysctl_hdr); } -int __init rds_ib_sysctl_init(void) +int rds_ib_sysctl_init(void) { rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); - if (rds_ib_sysctl_hdr == NULL) + if (!rds_ib_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/info.c b/net/rds/info.c index c45c417..4fdf1b6 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func) BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); spin_lock(&rds_info_lock); - BUG_ON(rds_info_funcs[offset] != NULL); + BUG_ON(rds_info_funcs[offset]); rds_info_funcs[offset] = func; spin_unlock(&rds_info_lock); } @@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func); */ void rds_info_iter_unmap(struct rds_info_iterator *iter) { - if (iter->addr != NULL) { + if (iter->addr) { kunmap_atomic(iter->addr, KM_USER0); iter->addr = NULL; } @@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, unsigned long this; while (bytes) { - if (iter->addr == NULL) + if (!iter->addr) iter->addr = kmap_atomic(*iter->pages, KM_USER0); this = min(bytes, PAGE_SIZE - iter->offset); @@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, >> PAGE_SHIFT; pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } @@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, call_func: func = rds_info_funcs[optname - RDS_INFO_FIRST]; - if (func == NULL) { + if (!func) { ret = -ENOPROTOOPT; goto out; } @@ -234,7 +234,7 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages != NULL && i < nr_pages; i++) + for (i = 0; pages && i < nr_pages; i++) put_page(pages[i]); kfree(pages); diff --git a/net/rds/iw.c b/net/rds/iw.c index c8f3d35..56808ca 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = { .laddr_check = rds_iw_laddr_check, .xmit_complete = rds_iw_xmit_complete, .xmit = rds_iw_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_iw_xmit_rdma, .recv = rds_iw_recv, .conn_alloc = rds_iw_conn_alloc, @@ -272,7 +271,6 @@ struct rds_transport rds_iw_transport = { .conn_connect = rds_iw_conn_connect, .conn_shutdown = rds_iw_conn_shutdown, .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_purge = rds_iw_inc_purge, .inc_free = rds_iw_inc_free, .cm_initiate_connect = rds_iw_cm_initiate_connect, .cm_handle_connect = rds_iw_cm_handle_connect, @@ -289,7 +287,7 @@ struct rds_transport rds_iw_transport = { .t_prefer_loopback = 1, }; -int __init rds_iw_init(void) +int rds_iw_init(void) { int ret; diff --git a/net/rds/iw.h b/net/rds/iw.h index eef2f0c..543e665 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -70,7 +70,7 @@ struct rds_iw_send_work { struct rds_message *s_rm; /* We should really put these into a union: */ - struct rds_rdma_op *s_op; + struct rm_rdma_op *s_op; struct 
rds_iw_mapping *s_mapping; struct ib_mr *s_mr; struct ib_fast_reg_page_list *s_page_list; @@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg); int rds_iw_conn_connect(struct rds_connection *conn); void rds_iw_conn_shutdown(struct rds_connection *conn); void rds_iw_state_change(struct sock *sk); -int __init rds_iw_listen_init(void); +int rds_iw_listen_init(void); void rds_iw_listen_stop(void); void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -321,12 +321,11 @@ void rds_iw_flush_mrs(void); void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); /* ib_recv.c */ -int __init rds_iw_recv_init(void); +int rds_iw_recv_init(void); void rds_iw_recv_exit(void); int rds_iw_recv(struct rds_connection *conn); int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, gfp_t page_gfp, int prefill); -void rds_iw_inc_purge(struct rds_incoming *inc); void rds_iw_inc_free(struct rds_incoming *inc); int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); @@ -358,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_iw_send_init_ring(struct rds_iw_connection *ic); void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, @@ -371,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_iw_sysctl_init(void); +int rds_iw_sysctl_init(void); void rds_iw_sysctl_exit(void); extern unsigned long rds_iw_sysctl_max_send_wr; extern unsigned long rds_iw_sysctl_max_recv_wr; diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index b5dd6ac..712cf2d 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) * the rds_iwdev at all. 
*/ rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (rds_iwdev == NULL) { + if (!rds_iwdev) { if (printk_ratelimit()) printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", dev->name); @@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); - if (ic->i_sends == NULL) { + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; @@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) rds_iw_send_init_ring(ic); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); - if (ic->i_recvs == NULL) { + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; @@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) /* XXX too lazy? */ ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); - if (ic == NULL) + if (!ic) return -ENOMEM; INIT_LIST_HEAD(&ic->iw_node); diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 13dc186..4e152e2 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -34,7 +34,6 @@ #include <linux/slab.h> #include "rds.h" -#include "rdma.h" #include "iw.h" diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 3d47906..5e57347 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) static void rds_iw_frag_free(struct rds_page_frag *frag) { rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); + BUG_ON(frag->f_page); kmem_cache_free(rds_iw_frag_slab, frag); } @@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; - if (recv->r_iwinc == NULL) { + if (!recv->r_iwinc) { if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { rds_iw_stats_inc(s_iw_rx_alloc_limit); goto out; } recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, kptr_gfp); - if (recv->r_iwinc == NULL) { + if (!recv->r_iwinc) { atomic_dec(&rds_iw_allocation); goto out; } @@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); } - if (recv->r_frag == NULL) { + if (!recv->r_frag) { recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) + if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); recv->r_frag->f_page = NULL; } - if (ic->i_frag.f_page == NULL) { + if (!ic->i_frag.f_page) { ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) + if (!ic->i_frag.f_page) goto out; ic->i_frag.f_offset = 0; } @@ -273,7 +273,7 @@ int 
rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, return ret; } -void rds_iw_inc_purge(struct rds_incoming *inc) +static void rds_iw_inc_purge(struct rds_incoming *inc) { struct rds_iw_incoming *iwinc; struct rds_page_frag *frag; @@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (iwinc == NULL) { + if (!iwinc) { iwinc = recv->r_iwinc; recv->r_iwinc = NULL; ic->i_iwinc = iwinc; @@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn) return ret; } -int __init rds_iw_recv_init(void) +int rds_iw_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; @@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void) rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", sizeof(struct rds_iw_incoming), 0, 0, NULL); - if (rds_iw_incoming_slab == NULL) + if (!rds_iw_incoming_slab) goto out; rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", sizeof(struct rds_page_frag), 0, 0, NULL); - if (rds_iw_frag_slab == NULL) + if (!rds_iw_frag_slab) kmem_cache_destroy(rds_iw_incoming_slab); else ret = 0; diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 52182ff..6280ea0 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -36,7 +36,6 @@ #include <linux/dmapool.h> #include "rds.h" -#include "rdma.h" #include "iw.h" static void rds_iw_send_rdma_complete(struct rds_message *rm, @@ -64,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, } static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? 
DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } } @@ -83,11 +82,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, + rm->data.op_sg, rm->data.op_nents, DMA_TO_DEVICE); - if (rm->m_rdma_op != NULL) { - rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); + if (rm->rdma.op_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -111,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -556,25 +555,27 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { + if (!ic->i_rm) { /* printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", be16_to_cpu(rm->m_inc.i_hdr.h_dport), rm->m_inc.i_hdr.h_flags, be32_to_cpu(rm->m_inc.i_hdr.h_len)); */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.op_count = 0; } ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; @@ -590,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -621,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->m_sg[sg]; + scat = &rm->data.op_sg[sg]; sent = 0; i = 0; @@ -631,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. 
*/ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; /* @@ -650,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { unsigned int len; send = &ic->i_sends[pos]; @@ -728,7 +729,7 @@ add_header: sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { + if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; @@ -784,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); } -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_iw_connection *ic = conn->c_transport_data; struct rds_iw_send_work *send = NULL; @@ -794,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) struct rds_iw_device *rds_iwdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; u32 pos, fr_pos; u32 work_alloc; u32 i; @@ -806,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } - if (!op->r_write) { + if (!op->op_write) { /* Alloc space on the send queue for the fastreg */ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); if (work_alloc != 1) { @@ -835,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. 
*/ - i = ceil(op->r_count, rds_iwdev->max_sge); + i = ceil(op->op_count, rds_iwdev->max_sge); work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -846,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } send = &ic->i_sends[pos]; - if (!op->r_write) { + if (!op->op_write) { first = prev = &ic->i_sends[fr_pos]; } else { first = send; prev = NULL; } - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; @@ -873,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * for local access after RDS is finished with it, using * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. */ - if (op->r_write) + if (op->op_write) send->s_wr.opcode = IB_WR_RDMA_WRITE; else send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; + send->s_wr.wr.rdma.rkey = op->op_rkey; send->s_op = op; if (num_sge > rds_iwdev->max_sge) { @@ -893,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) @@ -927,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) + if (scat == &op->op_sg[op->op_count]) first->s_wr.send_flags = IB_SEND_SIGNALED; if (i < work_alloc) { @@ -941,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * adapters do not allow using the lkey for this at all. To bypass this use a * fastreg_mr (or possibly a dma_mr) */ - if (!op->r_write) { + if (!op->op_write) { rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], - op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); work_alloc++; } diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 1c4428a..23e3a9a 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -122,10 +122,10 @@ void rds_iw_sysctl_exit(void) unregister_sysctl_table(rds_iw_sysctl_hdr); } -int __init rds_iw_sysctl_init(void) +int rds_iw_sysctl_init(void) { rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); - if (rds_iw_sysctl_hdr == NULL) + if (!rds_iw_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/loop.c b/net/rds/loop.c index dd98793..c390156 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -61,10 +61,17 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + } + BUG_ON(hdr_off || sg || off); rds_inc_init(&rm->m_inc, conn, conn->c_laddr); - rds_message_addref(rm); /* for the inc */ + /* For the embedded inc. 
Matching put is in loop_inc_free() */ + rds_message_addref(rm); rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, GFP_KERNEL, KM_USER0); @@ -77,16 +84,14 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); } -static int rds_loop_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, - unsigned long offset) +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. + */ +static void rds_loop_inc_free(struct rds_incoming *inc) { - BUG_ON(offset); - BUG_ON(map != conn->c_lcong); - - rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); } /* we need to at least give the thread something to succeed */ @@ -112,7 +117,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) unsigned long flags; lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); - if (lc == NULL) + if (!lc) return -ENOMEM; INIT_LIST_HEAD(&lc->loop_node); @@ -169,14 +174,12 @@ void rds_loop_exit(void) */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, - .xmit_cong_map = rds_loop_xmit_cong_map, .recv = rds_loop_recv, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, .conn_connect = rds_loop_conn_connect, .conn_shutdown = rds_loop_conn_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, - .inc_purge = rds_message_inc_purge, - .inc_free = rds_message_inc_free, + .inc_free = rds_loop_inc_free, .t_name = "loopback", }; diff --git a/net/rds/message.c b/net/rds/message.c index 9a1d67e..84f937f 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -34,9 +34,6 @@ #include <linux/slab.h> #include "rds.h" -#include "rdma.h" - -static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_NONE] = 0, @@ -63,29 +60,31 @@ static void rds_message_purge(struct rds_message *rm) if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->m_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); + for (i = 0; i < rm->data.op_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->m_sg[i])); + __free_page(sg_page(&rm->data.op_sg[i])); } - rm->m_nents = 0; + rm->data.op_nents = 0; - if (rm->m_rdma_op) - rds_rdma_free_op(rm->m_rdma_op); - if (rm->m_rdma_mr) - rds_mr_put(rm->m_rdma_mr); -} + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); -void rds_message_inc_purge(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_purge(rm); + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); } void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - + if (atomic_read(&rm->m_refcount) == 0) { +printk(KERN_CRIT "danger refcount zero on %p\n", rm); +WARN_ON(1); + } if (atomic_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); @@ -96,12 +95,6 @@ void rds_message_put(struct rds_message *rm) } 
EXPORT_SYMBOL_GPL(rds_message_put); -void rds_message_inc_free(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_put(rm); -} - void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq) { @@ -214,41 +207,68 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o } EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); -struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. + */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) { struct rds_message *rm; - rm = kzalloc(sizeof(struct rds_message) + - (nents * sizeof(struct scatterlist)), gfp); + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); if (!rm) goto out; - if (nents) - sg_init_table(rm->m_sg, nents); + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + atomic_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); out: return rm; } +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); + + sg_ret = &sg_first[rm->m_used_sgs]; + sg_init_table(sg_ret, nents); + rm->m_used_sgs += nents; + + return sg_ret; +} + struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) { struct rds_message *rm; unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); + if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->m_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); - for (i = 0; i < rm->m_nents; ++i) { - sg_set_page(&rm->m_sg[i], + for (i = 0; i < rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], virt_to_page(page_addrs[i]), PAGE_SIZE, 0); } @@ -256,40 +276,33 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len) { unsigned long to_copy; unsigned long iov_off; unsigned long sg_off; - struct rds_message *rm; struct iovec *iov; struct scatterlist *sg; - int ret; - - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) { - ret = -ENOMEM; - goto out; - } + int ret = 0; rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); /* * now allocate and copy in the data payload. */ - sg = rm->m_sg; + sg = rm->data.op_sg; iov = first_iov; iov_off = 0; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. 
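The reworked rds_message_alloc()/rds_message_alloc_sgs() pair makes one allocation serve the message struct and every op's scatterlist. A simplified userspace sketch of carving sub-arrays out of the trailing space (plain calloc and a toy sg type stand in for the kernel primitives):

    #include <stdio.h>
    #include <stdlib.h>

    struct sg { void *page; unsigned int len; };

    struct message {
            unsigned int used_sgs, total_sgs;
            /* sg entries live in the extra space after the struct */
    };

    static struct message *message_alloc(unsigned int extra_len)
    {
            struct message *rm = calloc(1, sizeof(*rm) + extra_len);
            if (rm)
                    rm->total_sgs = extra_len / sizeof(struct sg);
            return rm;
    }

    /* Carve nents entries from the trailing pool, like rds_message_alloc_sgs(). */
    static struct sg *message_alloc_sgs(struct message *rm, unsigned int nents)
    {
            struct sg *first = (struct sg *)&rm[1];
            struct sg *ret = &first[rm->used_sgs];

            if (rm->used_sgs + nents > rm->total_sgs)
                    return NULL;    /* the kernel WARN_ONs instead */
            rm->used_sgs += nents;
            return ret;
    }

    int main(void)
    {
            struct message *rm = message_alloc(4 * sizeof(struct sg));
            struct sg *data = message_alloc_sgs(rm, 3); /* payload pages */
            struct sg *atom = message_alloc_sgs(rm, 1); /* atomic op */

            printf("pool=%u used=%u data=%p atomic=%p\n",
                   rm->total_sgs, rm->used_sgs, (void *)data, (void *)atom);
            free(rm);
            return 0;
    }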
*/ while (total_len) { - if (sg_page(sg) == NULL) { + if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, total_len, GFP_HIGHUSER); if (ret) goto out; - rm->m_nents++; + rm->data.op_nents++; sg_off = 0; } @@ -320,14 +333,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, sg++; } - ret = 0; out: - if (ret) { - if (rm) - rds_message_put(rm); - rm = ERR_PTR(ret); - } - return rm; + return ret; } int rds_message_inc_copy_to_user(struct rds_incoming *inc, @@ -348,7 +355,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, iov = first_iov; iov_off = 0; - sg = rm->m_sg; + sg = rm->data.op_sg; vec_off = 0; copied = 0; @@ -394,15 +401,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, */ void rds_message_wait(struct rds_message *rm) { - wait_event(rds_message_flush_waitq, + wait_event_interruptible(rm->m_flush_wait, !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); } void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); - if (waitqueue_active(&rds_message_flush_waitq)) - wake_up(&rds_message_flush_waitq); + wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped); diff --git a/net/rds/page.c b/net/rds/page.c index 595a952..5e44f5ae 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); - if (page == NULL) { + if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); @@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem = &per_cpu(rds_page_remainders, get_cpu()); local_irq_save(flags); - if (page == NULL) { + if (!page) { ret = -ENOMEM; break; } @@ -186,6 +186,7 @@ out: ret ? 0 : scat->length); return ret; } +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); static int rds_page_remainder_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 75fd13b..4806467 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -35,7 +35,7 @@ #include <linux/rbtree.h> #include <linux/dma-mapping.h> /* for DMA_*_DEVICE */ -#include "rdma.h" +#include "rds.h" /* * XXX @@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; + unsigned long flags; /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = container_of(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); @@ -181,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } - if (rs->rs_transport->get_mr == NULL) { + if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } @@ -197,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, /* XXX clamp nr_pages to limit the size of this alloc? 
*/ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); - if (mr == NULL) { + if (!mr) { ret = -ENOMEM; goto out; } @@ -230,13 +238,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ - ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret < 0) goto out; nents = ret; sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (sg == NULL) { + if (!sg) { ret = -ENOMEM; goto out; } @@ -406,68 +414,127 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr && (mr->r_use_once || force)) { + if (!mr) { + printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; - } else if (mr) - atomic_inc(&mr->r_refcount); + } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ - if (mr != NULL) { - if (mr->r_trans->sync_mr) - mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); - - /* If the MR was marked as invalidate, this will - * trigger an async flush. */ - if (zot_me) - rds_destroy_mr(mr); - rds_mr_put(mr); - } + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); } -void rds_rdma_free_op(struct rds_rdma_op *ro) +void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->r_nents; i++) { - struct page *page = sg_page(&ro->r_sg[i]); + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ - if (!ro->r_write) { - BUG_ON(in_interrupt()); + if (!ro->op_write) { + BUG_ON(irqs_disabled()); set_page_dirty(page); } put_page(page); } - kfree(ro->r_notifier); - kfree(ro); + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; +} + +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; +} + + +/* + * Count the number of pages needed to describe an incoming iovec. 
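The new locking in rds_rdma_drop_keys() is the classic unlink-then-drop-the-lock teardown: each MR is erased from the tree while rs_rdma_lock is held, but the potentially blocking destroy runs with the lock released, and the lock is retaken before the next node is fetched. A userspace approximation, with a pthread mutex and a plain list standing in for the spinlock and rb-tree:

    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct mr { struct mr *next; int key; };

    static pthread_mutex_t keys_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct mr *keys;

    static void destroy_mr(struct mr *mr)  /* may block; lock must be dropped */
    {
            printf("destroying r_key %d\n", mr->key);
            free(mr);
    }

    static void drop_keys(void)
    {
            struct mr *mr;

            pthread_mutex_lock(&keys_lock);
            while ((mr = keys) != NULL) {
                    keys = mr->next;              /* erase while locked */
                    pthread_mutex_unlock(&keys_lock);
                    destroy_mr(mr);               /* blocking work, unlocked */
                    pthread_mutex_lock(&keys_lock);
            }
            pthread_mutex_unlock(&keys_lock);
    }

    int main(void)
    {
            for (int i = 0; i < 3; i++) {
                    struct mr *mr = malloc(sizeof(*mr));
                    mr->key = i;
                    mr->next = keys;
                    keys = mr;
            }
            drop_keys();
            return 0;
    }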
+ */ +static int rds_rdma_pages(struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_iovec __user *local_vec; + unsigned int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + return rds_rdma_pages(args) * sizeof(struct scatterlist); } /* - * args is a pointer to an in-kernel copy in the sendmsg cmsg. + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op */ -static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, - struct rds_rdma_args *args) +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) { + struct rds_rdma_args *args; struct rds_iovec vec; - struct rds_rdma_op *op = NULL; + struct rm_rdma_op *op = &rm->rdma; unsigned int nr_pages; - unsigned int max_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec __user *local_vec; - struct scatterlist *sg; unsigned int nr; unsigned int i, j; - int ret; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.op_active) + return -EINVAL; + args = CMSG_DATA(cmsg); if (rs->rs_bound_addr == 0) { ret = -ENOTCONN; /* XXX not a great errno */ @@ -479,61 +546,38 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, goto out; } - nr_pages = 0; - max_pages = 0; - - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - - /* figure out the number of pages in the vector */ - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } - - max_pages = max(nr, max_pages); - nr_pages += nr; - } - - pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { - ret = -ENOMEM; + nr_pages = rds_rdma_pages(args); + if (nr_pages < 0) goto out; - } - op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); - if (op == NULL) { + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { ret = -ENOMEM; goto out; } - op->r_write = !!(args->flags & RDS_RDMA_READWRITE); - op->r_fence = !!(args->flags & RDS_RDMA_FENCE); - op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); - op->r_recverr = rs->rs_recverr; + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); - sg_init_table(op->r_sg, nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); - if (op->r_notify || op->r_recverr) { + if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. 
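rds_rdma_pages() sums rds_pages_in_vec() over the user's vector. The per-iovec page count is ordinary round-down/round-up page arithmetic; a compilable sketch of the equivalent computation (assuming bytes > 0, with a local iovec type rather than the uapi one):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    #define PAGE_MASK (~(PAGE_SIZE - 1))

    struct uvec { uint64_t addr; uint64_t bytes; };

    /* Pages touched by [addr, addr + bytes): first page through last page. */
    static unsigned int pages_in_vec(const struct uvec *vec)
    {
            uint64_t first = vec->addr & PAGE_MASK;
            uint64_t last = (vec->addr + vec->bytes - 1) & PAGE_MASK;

            return (unsigned int)((last - first) / PAGE_SIZE) + 1;
    }

    int main(void)
    {
            struct uvec v = { .addr = 4095, .bytes = 2 }; /* straddles a page */
            printf("%u pages\n", pages_in_vec(&v));       /* prints 2 */
            return 0;
    }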
*/ - op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); - if (!op->r_notifier) { + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { ret = -ENOMEM; goto out; } - op->r_notifier->n_user_token = args->user_token; - op->r_notifier->n_status = RDS_RDMA_SUCCESS; + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and @@ -543,15 +587,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ - op->r_key = rds_rdma_cookie_key(args->cookie); - op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, - op->r_key); + op->op_rkey); + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; for (i = 0; i < args->nr_local; i++) { if (copy_from_user(&vec, &local_vec[i], @@ -569,15 +615,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, rs->rs_user_addr = vec.addr; rs->rs_user_bytes = vec.bytes; - /* did the user change the vec under us? */ - if (nr > max_pages || op->r_nents + nr > nr_pages) { - ret = -EINVAL; - goto out; - } /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); + ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); if (ret < 0) goto out; @@ -588,8 +629,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, for (j = 0; j < nr; j++) { unsigned int offset = vec.addr & ~PAGE_MASK; + struct scatterlist *sg; - sg = &op->r_sg[op->r_nents + j]; + sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), offset); @@ -601,10 +643,9 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, vec.bytes -= sg->length; } - op->r_nents += nr; + op->op_nents += nr; } - if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, @@ -612,38 +653,17 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ret = -EINVAL; goto out; } - op->r_bytes = nr_bytes; + op->op_bytes = nr_bytes; ret = 0; out: kfree(pages); - if (ret) { - if (op) - rds_rdma_free_op(op); - op = ERR_PTR(ret); - } - return op; -} - -/* - * The application asks for a RDMA transfer. 
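Allocating the notifier up front, while sendmsg can still sleep, is what lets the completion side avoid GFP_ATOMIC entirely: by the time the op completes, the handler only fills the notifier in and queues it. A minimal model of that prepare/complete split (all names here are hypothetical):

    #include <errno.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct notifier { unsigned long user_token; int status; };

    struct op { struct notifier *notifier; };

    /* Process context: allocation may sleep, failure is easy to report. */
    static int op_prepare(struct op *op, unsigned long token)
    {
            op->notifier = malloc(sizeof(*op->notifier));
            if (!op->notifier)
                    return -ENOMEM;
            op->notifier->user_token = token;
            op->notifier->status = 0;      /* RDS_RDMA_SUCCESS */
            return 0;
    }

    /* Completion context: no allocation allowed, just hand it off. */
    static struct notifier *op_complete(struct op *op, int status)
    {
            struct notifier *n = op->notifier;

            op->notifier = NULL;
            n->status = status;
            return n;                      /* caller queues it for the app */
    }

    int main(void)
    {
            struct op op;

            if (op_prepare(&op, 0xabc) == 0) {
                    struct notifier *n = op_complete(&op, 0);
                    printf("token %#lx status %d\n", n->user_token, n->status);
                    free(n);
            }
            return 0;
    }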
- * Extract all arguments and set up the rdma_op - */ -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg) -{ - struct rds_rdma_op *op; - - if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->m_rdma_op != NULL) - return -EINVAL; + if (ret) + rds_rdma_free_op(op); - op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); - if (IS_ERR(op)) - return PTR_ERR(op); rds_stats_inc(s_send_rdma); - rm->m_rdma_op = op; - return 0; + + return ret; } /* @@ -673,7 +693,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr == NULL) + if (!mr) err = -EINVAL; /* invalid r_key */ else atomic_inc(&mr->r_refcount); @@ -681,7 +701,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); - rm->m_rdma_mr = mr; + rm->rdma.op_rdma_mr = mr; } return err; } @@ -699,5 +719,98 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, rm->m_rdma_cookie != 0) return -EINVAL; - return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); +} + +/* + * Fill in rds_message for an atomic request. + */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ + } + + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); + rm->atomic.op_active = 1; + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if (rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. 
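The atomic cmsg handler above folds all four cmsg types into masked hardware ops: a plain CSWP becomes a masked CSWP whose compare/swap masks are all ones, and a plain FADD gets a zero nocarry mask. A userspace sketch of the masked compare-and-swap semantics this encodes (apply_cswp() is an illustrative model of the operation, not kernel code):

    #include <stdint.h>
    #include <stdio.h>

    struct cswp { uint64_t compare, swap, compare_mask, swap_mask; };

    /* Compare only the masked bits; on a match, replace only the bits
     * selected by swap_mask. With masks of ~0 this degenerates to a
     * plain compare-and-swap. */
    static uint64_t apply_cswp(uint64_t mem, const struct cswp *op)
    {
            if ((mem & op->compare_mask) == (op->compare & op->compare_mask))
                    mem = (mem & ~op->swap_mask) | (op->swap & op->swap_mask);
            return mem;
    }

    int main(void)
    {
            struct cswp plain = { .compare = 5, .swap = 9,
                                  .compare_mask = ~0ULL, .swap_mask = ~0ULL };
            struct cswp masked = { .compare = 5, .swap = 9,
                                   .compare_mask = 0xff, .swap_mask = 0xff };

            printf("plain: %#llx masked: %#llx\n",
                   (unsigned long long)apply_cswp(5, &plain),      /* 0x9 */
                   (unsigned long long)apply_cswp(0x105, &masked)); /* 0x109 */
            return 0;
    }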
+ */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + return ret; +err: + if (page) + put_page(page); + kfree(rm->atomic.op_notifier); + + return ret; } diff --git a/net/rds/rdma.h b/net/rds/rdma.h deleted file mode 100644 index 909c398..0000000 --- a/net/rds/rdma.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _RDS_RDMA_H -#define _RDS_RDMA_H - -#include <linux/rbtree.h> -#include <linux/spinlock.h> -#include <linux/scatterlist.h> - -#include "rds.h" - -struct rds_mr { - struct rb_node r_rb_node; - atomic_t r_refcount; - u32 r_key; - - /* A copy of the creation flags */ - unsigned int r_use_once:1; - unsigned int r_invalidate:1; - unsigned int r_write:1; - - /* This is for RDS_MR_DEAD. - * It would be nice & consistent to make this part of the above - * bit field here, but we need to use test_and_set_bit. - */ - unsigned long r_state; - struct rds_sock *r_sock; /* back pointer to the socket that owns us */ - struct rds_transport *r_trans; - void *r_trans_private; -}; - -/* Flags for mr->r_state */ -#define RDS_MR_DEAD 0 - -struct rds_rdma_op { - u32 r_key; - u64 r_remote_addr; - unsigned int r_write:1; - unsigned int r_fence:1; - unsigned int r_notify:1; - unsigned int r_recverr:1; - unsigned int r_mapped:1; - struct rds_notifier *r_notifier; - unsigned int r_bytes; - unsigned int r_nents; - unsigned int r_count; - struct scatterlist r_sg[0]; -}; - -static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) -{ - return r_key | (((u64) offset) << 32); -} - -static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) -{ - return cookie; -} - -static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) -{ - return cookie >> 32; -} - -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); -void rds_rdma_drop_keys(struct rds_sock *rs); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -void rds_rdma_free_op(struct rds_rdma_op *ro); -void rds_rdma_send_complete(struct rds_message *rm, int); - -extern void __rds_put_mr_final(struct rds_mr *mr); -static inline void rds_mr_put(struct rds_mr *mr) -{ - if (atomic_dec_and_test(&mr->r_refcount)) - __rds_put_mr_final(mr); -} - -#endif diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index e599ba2..e6ed10a 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -36,6 +36,34 @@ static struct rdma_cm_id *rds_rdma_listen_id; +static char *rds_cm_event_strings[] = { +#define RDS_CM_EVENT_STRING(foo) \ + [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) + RDS_CM_EVENT_STRING(ADDR_RESOLVED), + RDS_CM_EVENT_STRING(ADDR_ERROR), + RDS_CM_EVENT_STRING(ROUTE_RESOLVED), + RDS_CM_EVENT_STRING(ROUTE_ERROR), + RDS_CM_EVENT_STRING(CONNECT_REQUEST), + 
RDS_CM_EVENT_STRING(CONNECT_RESPONSE), + RDS_CM_EVENT_STRING(CONNECT_ERROR), + RDS_CM_EVENT_STRING(UNREACHABLE), + RDS_CM_EVENT_STRING(REJECTED), + RDS_CM_EVENT_STRING(ESTABLISHED), + RDS_CM_EVENT_STRING(DISCONNECTED), + RDS_CM_EVENT_STRING(DEVICE_REMOVAL), + RDS_CM_EVENT_STRING(MULTICAST_JOIN), + RDS_CM_EVENT_STRING(MULTICAST_ERROR), + RDS_CM_EVENT_STRING(ADDR_CHANGE), + RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), +#undef RDS_CM_EVENT_STRING +}; + +static char *rds_cm_event_str(enum rdma_cm_event_type type) +{ + return rds_str_array(rds_cm_event_strings, + ARRAY_SIZE(rds_cm_event_strings), type); +}; + int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rds_transport *trans; int ret = 0; - rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, - event->event); + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rds_cm_event_str(event->event)); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? */ - printk(KERN_ERR "RDS: unknown event %u!\n", event->event); + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rds_cm_event_str(event->event)); break; } @@ -117,12 +146,13 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); - rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rds_cm_event_str(event->event), ret); return ret; } -static int __init rds_rdma_listen_init(void) +static int rds_rdma_listen_init(void) { struct sockaddr_in sin; struct rdma_cm_id *cm_id; @@ -177,7 +207,7 @@ static void rds_rdma_listen_stop(void) } } -int __init rds_rdma_init(void) +int rds_rdma_init(void) { int ret; diff --git a/net/rds/rds.h b/net/rds/rds.h index c224b5b..8103dcf 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -80,6 +80,7 @@ enum { /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 struct rds_connection { struct hlist_node c_hash_node; @@ -91,12 +92,13 @@ struct rds_connection { struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - struct mutex c_send_lock; /* protect send ring */ struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; unsigned int c_xmit_data_off; + unsigned int c_xmit_atomic_sent; unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_data_sent; spinlock_t c_lock; /* protect msg queues */ u64 c_next_tx_seq; @@ -116,11 +118,10 @@ struct rds_connection { struct delayed_work c_conn_w; struct work_struct c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ + wait_queue_head_t c_waitq; struct list_head c_map_item; unsigned long c_map_queued; - unsigned long c_map_offset; - unsigned long c_map_bytes; unsigned int c_unacked_packets; unsigned int c_unacked_bytes; @@ -206,6 +207,48 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; }; +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. 
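rds_cm_event_strings[] uses the designated-initializer-plus-__stringify idiom so the table cannot drift out of order with the enum, and rds_str_array() (declared in rds.h below) does a bounds- and NULL-checked lookup. A compilable miniature of the same idiom; single-level __stringify suffices here since the arguments are not themselves macros:

    #include <stdio.h>

    #define __stringify(x) #x

    enum ev { EV_ADDR_RESOLVED, EV_ADDR_ERROR, EV_ESTABLISHED };

    static const char *ev_strings[] = {
    #define EV_STRING(foo) [EV_##foo] = __stringify(EV_##foo)
            EV_STRING(ADDR_RESOLVED),
            EV_STRING(ADDR_ERROR),
            EV_STRING(ESTABLISHED),
    #undef EV_STRING
    };

    /* Bounds- and NULL-checked lookup, in the spirit of rds_str_array(). */
    static const char *ev_str(unsigned int type)
    {
            if (type < sizeof(ev_strings) / sizeof(ev_strings[0]) &&
                ev_strings[type])
                    return ev_strings[type];
            return "unknown";
    }

    int main(void)
    {
            printf("%s %s\n", ev_str(EV_ESTABLISHED), ev_str(42));
            return 0;
    }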
+ */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty @@ -258,13 +301,71 @@ struct rds_message { * -> rs->rs_lock */ spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + struct rds_sock *m_rs; - struct rds_rdma_op *m_rdma_op; + + /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; - struct rds_mr *m_rdma_mr; - unsigned int m_nents; - unsigned int m_count; - struct scatterlist m_sg[0]; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + + void *m_final_op; + + struct { + struct rm_atomic_op { + int op_type; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_silent:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; + } rdma; + struct rm_data_op { + unsigned int op_active:1; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + } data; + }; }; /* @@ -305,10 +406,6 @@ struct rds_notifier { * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. - * - * @xmit_cong_map: This asks the transport to send the local bitmap down the - * given connection. XXX get a better story about the bitmap - * flag and header. 
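The cookie helpers moved into rds.h pack an R_Key and an MR offset into one u64: key in the low 32 bits, offset in the high 32. A round-trip example (rdma_cookie_t and the function names below are local stand-ins for the kernel's):

    #include <assert.h>
    #include <stdint.h>
    #include <stdio.h>

    typedef uint64_t rdma_cookie_t;

    static rdma_cookie_t make_cookie(uint32_t r_key, uint32_t offset)
    {
            return r_key | ((uint64_t)offset << 32);
    }

    static uint32_t cookie_key(rdma_cookie_t c)    { return (uint32_t)c; }
    static uint32_t cookie_offset(rdma_cookie_t c) { return c >> 32; }

    int main(void)
    {
            rdma_cookie_t c = make_cookie(0xdeadbeef, 128);

            assert(cookie_key(c) == 0xdeadbeef && cookie_offset(c) == 128);
            printf("cookie %#llx -> key %#x off %u\n",
                   (unsigned long long)c, cookie_key(c), cookie_offset(c));
            return 0;
    }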
*/ #define RDS_TRANS_IB 0 @@ -332,13 +429,11 @@ struct rds_transport { void (*xmit_complete)(struct rds_connection *conn); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); - int (*xmit_cong_map)(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); - int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); - void (*inc_purge)(struct rds_incoming *inc); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, @@ -367,17 +462,11 @@ struct rds_sock { * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ - struct rb_node rs_bound_node; + struct hlist_node rs_bound_node; __be32 rs_bound_addr; __be32 rs_conn_addr; __be16 rs_bound_port; __be16 rs_conn_port; - - /* - * This is only used to communicate the transport between bind and - * initiating connections. All other trans use is referenced through - * the connection. - */ struct rds_transport *rs_transport; /* @@ -466,8 +555,8 @@ struct rds_statistics { uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; - uint64_t s_send_sem_contention; - uint64_t s_send_sem_queue_raced; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; @@ -487,6 +576,7 @@ struct rds_statistics { }; /* af_rds.c */ +char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); @@ -521,15 +611,17 @@ void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ -int __init rds_conn_init(void); +int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); +void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, @@ -566,7 +658,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, @@ -580,7 +673,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int 
rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, size_t size); -void rds_message_inc_purge(struct rds_incoming *inc); void rds_message_inc_free(struct rds_incoming *inc); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); @@ -636,14 +728,39 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); -int rds_send_acked_before(struct rds_connection *conn, u64 seq); void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, - struct rds_rdma_op *); + struct rm_rdma_op *); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rm_rdma_op *ro); +void rds_atomic_free_op(struct rm_atomic_op *ao); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); + +extern void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); @@ -657,14 +774,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) -int __init rds_stats_init(void); +int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ -int __init rds_sysctl_init(void); +int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; @@ -678,9 +795,10 @@ extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ -int __init rds_threads_init(void); +int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_connection *conn); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); @@ -691,9 +809,10 @@ void rds_connect_complete(struct rds_connection *conn); int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(__be32 addr); +void rds_trans_put(struct 
rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); -int __init rds_trans_init(void); +int rds_trans_init(void); void rds_trans_exit(void); #endif diff --git a/net/rds/recv.c b/net/rds/recv.c index c93588c..68800f0 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -36,7 +36,6 @@ #include <linux/in.h> #include "rds.h" -#include "rdma.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) @@ -210,7 +209,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, } rs = rds_find_bound(daddr, inc->i_hdr.h_dport); - if (rs == NULL) { + if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; } @@ -251,7 +250,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) { unsigned long flags; - if (*inc == NULL) { + if (!*inc) { read_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&rs->rs_recv_queue)) { *inc = list_entry(rs->rs_recv_queue.next, @@ -334,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) if (msghdr) { cmsg.user_token = notifier->n_user_token; - cmsg.status = notifier->n_status; + cmsg.status = notifier->n_status; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, - sizeof(cmsg), &cmsg); + sizeof(cmsg), &cmsg); if (err) break; } diff --git a/net/rds/send.c b/net/rds/send.c index 9c1c6bc..9b951a0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -37,7 +37,6 @@ #include <linux/list.h> #include "rds.h" -#include "rdma.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog @@ -54,7 +53,8 @@ module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); /* - * Reset the send state. Caller must hold c_send_lock when calling here. + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). */ void rds_send_reset(struct rds_connection *conn) { @@ -62,18 +62,22 @@ void rds_send_reset(struct rds_connection *conn) unsigned long flags; if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + conn->c_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ - rds_message_unmapped(conn->c_xmit_rm); - rds_message_put(conn->c_xmit_rm); - conn->c_xmit_rm = NULL; + rds_message_unmapped(rm); + rds_message_put(rm); } + conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; + conn->c_xmit_atomic_sent = 0; conn->c_xmit_rdma_sent = 0; + conn->c_xmit_data_sent = 0; conn->c_map_queued = 0; @@ -90,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); } +static int acquire_in_xmit(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; +} + +static void release_in_xmit(struct rds_connection *conn) +{ + clear_bit(RDS_IN_XMIT, &conn->c_flags); + smp_mb__after_clear_bit(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. 
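acquire_in_xmit()/release_in_xmit() replace c_send_lock with a single connection flag: test_and_set_bit() is the trylock, and the wakeup on release is guarded by waitqueue_active() because waiters (shutdown) are rare on this hot path. A userspace model with C11 atomics; the fence stands in for smp_mb__after_clear_bit() and the puts() for wake_up_all():

    #include <stdatomic.h>
    #include <stdio.h>

    static atomic_flag in_xmit = ATOMIC_FLAG_INIT;
    static atomic_int waiters;

    /* Only one task may feed the connection; everyone else backs off. */
    static int acquire_in_xmit(void)
    {
            return !atomic_flag_test_and_set(&in_xmit);
    }

    static void release_in_xmit(void)
    {
            atomic_flag_clear(&in_xmit);
            /* Order the clear before the waiter check below. */
            atomic_thread_fence(memory_order_seq_cst);
            /* Cheap check first: only pay for a wakeup when someone
             * is actually waiting. */
            if (atomic_load(&waiters))
                    puts("wake_up_all(&conn->c_waitq)");
    }

    int main(void)
    {
            if (acquire_in_xmit()) {
                    printf("contender gets lock? %d\n", acquire_in_xmit());
                    atomic_store(&waiters, 1); /* pretend shutdown waits */
                    release_in_xmit();
            }
            return 0;
    }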
+ */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* * We're making the concious trade-off here to only send one message * down the connection at a time. @@ -109,102 +132,69 @@ int rds_send_xmit(struct rds_connection *conn) struct rds_message *rm; unsigned long flags; unsigned int tmp; - unsigned int send_quota = send_batch_count; struct scatterlist *sg; int ret = 0; - int was_empty = 0; LIST_HEAD(to_be_dropped); +restart: + /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. - * - * The sem holder will issue a retry if they notice that someone queued - * a message after they stopped walking the send queue but before they - * dropped the sem. */ - if (!mutex_trylock(&conn->c_send_lock)) { - rds_stats_inc(s_send_sem_contention); + if (!acquire_in_xmit(conn)) { + rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. + */ + if (!rds_conn_up(conn)) { + release_in_xmit(conn); + ret = 0; + goto out; + } + if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); /* * spin trying to push headers and data down the connection until - * the connection doens't make forward progress. + * the connection doesn't make forward progress. */ - while (--send_quota) { - /* - * See if need to send a congestion map update if we're - * between sending messages. The send_sem protects our sole - * use of c_map_offset and _bytes. - * Note this is used only by transports that define a special - * xmit_cong_map function. For all others, we create allocate - * a cong_map message and treat it just like any other send. - */ - if (conn->c_map_bytes) { - ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, - conn->c_map_offset); - if (ret <= 0) - break; + while (1) { - conn->c_map_offset += ret; - conn->c_map_bytes -= ret; - if (conn->c_map_bytes) - continue; - } - - /* If we're done sending the current message, clear the - * offset and S/G temporaries. - */ rm = conn->c_xmit_rm; - if (rm != NULL && - conn->c_xmit_hdr_off == sizeof(struct rds_header) && - conn->c_xmit_sg == rm->m_nents) { - conn->c_xmit_rm = NULL; - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; - /* Release the reference to the previous message. */ - rds_message_put(rm); - rm = NULL; - } - - /* If we're asked to send a cong map update, do so. + /* + * If between sending messages, we can send a pending congestion + * map update. */ - if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { - if (conn->c_trans->xmit_cong_map != NULL) { - conn->c_map_offset = 0; - conn->c_map_bytes = sizeof(struct rds_header) + - RDS_CONG_MAP_BYTES; - continue; - } - + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { rm = rds_cong_update_alloc(conn); if (IS_ERR(rm)) { ret = PTR_ERR(rm); break; } + rm->data.op_active = 1; conn->c_xmit_rm = rm; } /* - * Grab the next message from the send queue, if there is one. + * If not already working on one, grab the next message. * * c_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. 
*/ - if (rm == NULL) { + if (!rm) { unsigned int len; spin_lock_irqsave(&conn->c_lock, flags); @@ -224,10 +214,8 @@ int rds_send_xmit(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); - if (rm == NULL) { - was_empty = 1; + if (!rm) break; - } /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire @@ -236,13 +224,12 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->m_rdma_op && + if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); spin_unlock_irqrestore(&conn->c_lock, flags); - rds_message_put(rm); continue; } @@ -263,23 +250,55 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_rm = rm; } - /* - * Try and send an rdma message. Let's see if we can - * keep this simple and require that the transport either - * send the whole rdma or none of it. - */ - if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) break; conn->c_xmit_rdma_sent = 1; + /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); } - if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->m_nents) { + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + rm->m_final_op = &rm->atomic; + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) + break; + conn->c_xmit_atomic_sent = 1; + + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation. + */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + + if (rm->data.op_active && !conn->c_xmit_data_sent) { + rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, @@ -295,7 +314,7 @@ int rds_send_xmit(struct rds_connection *conn) ret -= tmp; } - sg = &rm->m_sg[conn->c_xmit_sg]; + sg = &rm->data.op_sg[conn->c_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - conn->c_xmit_data_off); @@ -306,49 +325,63 @@ int rds_send_xmit(struct rds_connection *conn) sg++; conn->c_xmit_sg++; BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->m_nents); + conn->c_xmit_sg == rm->data.op_nents); } } + + if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && + (conn->c_xmit_sg == rm->data.op_nents)) + conn->c_xmit_data_sent = 1; } - } - /* Nuke any messages we decided not to retransmit. 
*/ - if (!list_empty(&to_be_dropped)) - rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || conn->c_xmit_data_sent) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_data_sent = 0; + + rds_message_put(rm); + } + } if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - /* - * We might be racing with another sender who queued a message but - * backed off on noticing that we held the c_send_lock. If we check - * for queued messages after dropping the sem then either we'll - * see the queued message or the queuer will get the sem. If we - * notice the queued message then we trigger an immediate retry. - * - * We need to be careful only to do this when we stopped processing - * the send queue because it was empty. It's the only way we - * stop processing the loop when the transport hasn't taken - * responsibility for forward progress. - */ - mutex_unlock(&conn->c_send_lock); + release_in_xmit(conn); - if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { - /* We exhausted the send quota, but there's work left to - * do. Return and (re-)schedule the send worker. - */ - ret = -EAGAIN; + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } - if (ret == 0 && was_empty) { - /* A simple bit test would be way faster than taking the - * spin lock */ - spin_lock_irqsave(&conn->c_lock, flags); + /* + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. + */ + if (ret == 0) { + smp_mb(); if (!list_empty(&conn->c_send_queue)) { - rds_stats_inc(s_send_sem_queue_raced); - ret = -EAGAIN; + rds_stats_inc(s_send_lock_queue_raced); + goto restart; } - spin_unlock_irqrestore(&conn->c_lock, flags); } out: return ret; @@ -376,52 +409,60 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, } /* - * Returns true if there are no messages on the send and retransmit queues - * which have a sequence number greater than or equal to the given sequence - * number. + * This is pretty similar to what happens below in the ACK + * handling code - except that we call here as soon as we get + * the IB send completion on the RDMA op and the accompanying + * message. 
*/ -int rds_send_acked_before(struct rds_connection *conn, u64 seq) +void rds_rdma_send_complete(struct rds_message *rm, int status) { - struct rds_message *rm, *tmp; - int ret = 1; + struct rds_sock *rs = NULL; + struct rm_rdma_op *ro; + struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; - } + ro = &rm->rdma; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && + ro->op_active && ro->op_notify && ro->op_notifier) { + notifier = ro->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); - list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ro->op_notifier = NULL; } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); - return ret; + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } } +EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* - * This is pretty similar to what happens below in the ACK - * handling code - except that we call here as soon as we get - * the IB send completion on the RDMA op and the accompanying - * message. + * Just like above, except looks at atomic op */ -void rds_rdma_send_complete(struct rds_message *rm, int status) +void rds_atomic_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; - struct rds_rdma_op *ro; + struct rm_atomic_op *ao; struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = rm->m_rdma_op; - if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro && ro->r_notify && ro->r_notifier) { - notifier = ro->r_notifier; + ao = &rm->atomic; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -430,17 +471,17 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); - ro->r_notifier = NULL; + ao->op_notifier = NULL; } - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); sock_put(rds_rs_to_sk(rs)); } } -EXPORT_SYMBOL_GPL(rds_rdma_send_complete); +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); /* * This is the same as rds_rdma_send_complete except we @@ -448,15 +489,23 @@ EXPORT_SYMBOL_GPL(rds_rdma_send_complete); * socket, socket lock) and can just move the notifier. 
*/ static inline void -__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { - struct rds_rdma_op *ro; + struct rm_rdma_op *ro; + struct rm_atomic_op *ao; + + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; + } - ro = rm->m_rdma_op; - if (ro && ro->r_notify && ro->r_notifier) { - ro->r_notifier->n_status = status; - list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); - ro->r_notifier = NULL; + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; } /* No need to wake the app - caller does this */ @@ -468,7 +517,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status * So speed is not an issue here. */ struct rds_message *rds_send_get_message(struct rds_connection *conn, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { struct rds_message *rm, *tmp, *found = NULL; unsigned long flags; @@ -476,7 +525,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (rm->m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -484,7 +533,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (rm->m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -544,19 +593,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = rm->m_rdma_op; + struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro && ro->r_notifier && (status || ro->r_notify)) { - notifier = ro->r_notifier; + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->m_rdma_op->r_notifier = NULL; + rm->rdma.op_notifier = NULL; } was_on_sock = 1; rm->m_rs = NULL; @@ -619,9 +669,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; - unsigned long flags, flags2; + unsigned long flags; LIST_HEAD(list); - int wake = 0; /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); @@ -631,59 +680,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) dest->sin_port != rm->m_inc.i_hdr.h_dport)) continue; - wake = 1; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); } /* order flag updates with the rs lock */ - if (wake) - smp_mb__after_clear_bit(); + smp_mb__after_clear_bit(); spin_unlock_irqrestore(&rs->rs_lock, flags); - conn = NULL; + if (list_empty(&list)) + return; - /* now remove the messages from the conn list as needed */ + /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { - 
/* We do this here rather than in the loop above, so that - * we don't have to nest m_rs_lock under rs->rs_lock */ - spin_lock_irqsave(&rm->m_rs_lock, flags2); - /* If this is a RDMA operation, notify the app. */ - spin_lock(&rs->rs_lock); - __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); - spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags2); + conn = rm->m_inc.i_conn; + + spin_lock_irqsave(&conn->c_lock, flags); /* - * If we see this flag cleared then we're *sure* that someone - * else beat us to removing it from the conn. If we race - * with their flag update we'll get the lock and then really - * see that the flag has been cleared. + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. */ - if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&conn->c_lock, flags); continue; - - if (conn != rm->m_inc.i_conn) { - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); - conn = rm->m_inc.i_conn; - spin_lock_irqsave(&conn->c_lock, flags); } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&conn->c_lock, flags); - if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { - list_del_init(&rm->m_conn_item); - rds_message_put(rm); - } - } + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock_irqsave(&rm->m_rs_lock, flags); - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_lock(&rs->rs_lock); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); - if (wake) - rds_wake_sk_sleep(rs); + rm->m_rs = NULL; + spin_unlock_irqrestore(&rm->m_rs_lock, flags); + + rds_message_put(rm); + } + + rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); @@ -763,6 +807,63 @@ out: return *queued; } +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. + */ +static int rds_rm_size(struct msghdr *msg, int data_len) +{ + struct cmsghdr *cmsg; + int size = 0; + int cmsg_groups = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; + /* these are valid but do no add any size */ + break; + + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + cmsg_groups |= 1; + size += sizeof(struct scatterlist); + break; + + default: + return -EINVAL; + } + + } + + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + + return size; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -777,7 +878,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set - * rm->m_rdma_cookie and rm->m_rdma_mr. 
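rds_rm_size() above also enforces that the (RDMA_DEST, RDMA_MAP) cmsgs never mix with (RDMA_ARGS, atomics) by OR-ing each cmsg into one of two group bits and rejecting the combined value 3. A compilable sketch of that check, with local enum values standing in for the RDS_CMSG_* constants:

    #include <errno.h>
    #include <stdio.h>

    enum { CMSG_RDMA_ARGS, CMSG_RDMA_DEST, CMSG_RDMA_MAP, CMSG_ATOMIC_FADD };

    /* Mirrors the cmsg_groups logic: group 1 builds an op in this rm,
     * group 2 only attaches a cookie/MR; the two are mutually exclusive. */
    static int check_cmsg_groups(const int *types, int n)
    {
            int groups = 0;

            for (int i = 0; i < n; i++) {
                    switch (types[i]) {
                    case CMSG_RDMA_ARGS:
                    case CMSG_ATOMIC_FADD:
                            groups |= 1;
                            break;
                    case CMSG_RDMA_DEST:
                    case CMSG_RDMA_MAP:
                            groups |= 2;
                            break;
                    default:
                            return -EINVAL;
                    }
            }
            return groups == 3 ? -EINVAL : 0;
    }

    int main(void)
    {
            int bad[] = { CMSG_RDMA_ARGS, CMSG_RDMA_MAP };
            int ok[]  = { CMSG_RDMA_ARGS, CMSG_ATOMIC_FADD };

            printf("mixed: %d, compatible: %d\n",
                   check_cmsg_groups(bad, 2), check_cmsg_groups(ok, 2));
            return 0;
    }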
+ * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: @@ -793,6 +894,12 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, if (!ret) *allocated_mr = 1; break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; default: return -EINVAL; @@ -850,13 +957,26 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - rm = rds_message_copy_from_user(msg->msg_iov, payload_len); - if (IS_ERR(rm)) { - ret = PTR_ERR(rm); - rm = NULL; + /* size of rm including all sgs */ + ret = rds_rm_size(msg, payload_len); + if (ret < 0) + goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; goto out; } + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; + } + rm->data.op_active = 1; + rm->m_daddr = daddr; /* rds_conn_create has a spinlock that runs with IRQ off. @@ -879,22 +999,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->m_rdma_op) && - conn->c_trans->xmit_rdma == NULL) { + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - rm->m_rdma_op, conn->c_trans->xmit_rdma); + &rm->rdma, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + if (printk_ratelimit()) + printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); + ret = -EOPNOTSUPP; + goto out; + } + + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { @@ -938,7 +1059,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_send_queued); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_worker(&conn->c_send_w.work); + rds_send_xmit(conn); rds_message_put(rm); return payload_len; @@ -966,20 +1087,15 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) int ret = 0; rm = rds_message_alloc(0, GFP_ATOMIC); - if (rm == NULL) { + if (!rm) { ret = -ENOMEM; goto out; } rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. 
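Both send paths in this hunk now call rds_conn_connect_if_down(), whose definition is not part of this hunk. Judging from the open-coded test it replaces (removed above), it presumably factors out something like the following sketch:

/* Hedged reconstruction of rds_conn_connect_if_down() from the
 * open-coded logic it replaces; the real helper is defined elsewhere
 * in this series and may differ. */
static void rds_conn_connect_if_down_sketch(struct rds_connection *conn)
{
	/* If the connection is down, trigger a connect, unless a
	 * delayed reconnect is already scheduled. */
	if (rds_conn_state(conn) == RDS_CONN_DOWN &&
	    !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags))
		queue_delayed_work(rds_wq, &conn->c_conn_w, 0);
}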
- */ - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); if (ret) @@ -999,7 +1115,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + rds_send_xmit(conn); + rds_message_put(rm); return 0; diff --git a/net/rds/stats.c b/net/rds/stats.c index 7598eb0..10c759c 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = { "recv_ping", "send_queue_empty", "send_queue_full", - "send_sem_contention", - "send_sem_queue_raced", + "send_lock_contention", + "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", @@ -143,7 +143,7 @@ void rds_stats_exit(void) rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } -int __init rds_stats_init(void) +int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 7829a20..25ad0c7 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -105,13 +105,13 @@ void rds_sysctl_exit(void) unregister_sysctl_table(rds_sysctl_reg_table); } -int __init rds_sysctl_init(void) +int rds_sysctl_init(void) { rds_sysctl_reconnect_min = msecs_to_jiffies(1); rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); - if (rds_sysctl_reg_table == NULL) + if (!rds_sysctl_reg_table) return -ENOMEM; return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index babf457..eeb08e6 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_tcp_connection *tc; tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (tc == NULL) + if (!tc) return -ENOMEM; tc->t_sock = NULL; @@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, .xmit_prepare = rds_tcp_xmit_prepare, .xmit_complete = rds_tcp_xmit_complete, - .xmit_cong_map = rds_tcp_xmit_cong_map, .xmit = rds_tcp_xmit, .recv = rds_tcp_recv, .conn_alloc = rds_tcp_conn_alloc, @@ -266,7 +265,6 @@ struct rds_transport rds_tcp_transport = { .conn_connect = rds_tcp_conn_connect, .conn_shutdown = rds_tcp_conn_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, - .inc_purge = rds_tcp_inc_purge, .inc_free = rds_tcp_inc_free, .stats_info_copy = rds_tcp_stats_info_copy, .exit = rds_tcp_exit, @@ -276,14 +274,14 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; -int __init rds_tcp_init(void) +int rds_tcp_init(void) { int ret; rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", sizeof(struct rds_tcp_connection), 0, 0, NULL); - if (rds_tcp_conn_slab == NULL) { + if (!rds_tcp_conn_slab) { ret = -ENOMEM; goto out; } diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 844fa6b..f5e6f7b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -43,7 +43,7 @@ struct rds_tcp_statistics { }; /* tcp.c */ -int __init rds_tcp_init(void); +int rds_tcp_init(void); void rds_tcp_exit(void); void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); @@ -61,16 +61,15 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); void rds_tcp_state_change(struct sock *sk); /* 
tcp_listen.c */ -int __init rds_tcp_listen_init(void); +int rds_tcp_listen_init(void); void rds_tcp_listen_stop(void); void rds_tcp_listen_data_ready(struct sock *sk, int bytes); /* tcp_recv.c */ -int __init rds_tcp_recv_init(void); +int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); void rds_tcp_data_ready(struct sock *sk, int bytes); int rds_tcp_recv(struct rds_connection *conn); -void rds_tcp_inc_purge(struct rds_incoming *inc); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); @@ -81,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_tcp_write_space(struct sock *sk); -int rds_tcp_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); /* tcp_stats.c */ DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index c397524..a65ee78 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { + if (!conn) { state_change = sk->sk_state_change; goto out; } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 975183f..ae27869 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; - if (ready == NULL) { /* check for teardown race */ + if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } @@ -135,7 +135,7 @@ out: ready(sk, bytes); } -int __init rds_tcp_listen_init(void) +int rds_tcp_listen_init(void) { struct sockaddr_in sin; struct socket *sock = NULL; @@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void) struct socket *sock = rds_tcp_listen_sock; struct sock *sk; - if (sock == NULL) + if (!sock) return; sk = sock->sk; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 1aba687..7017f3af 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -39,7 +39,7 @@ static struct kmem_cache *rds_tcp_incoming_slab; -void rds_tcp_inc_purge(struct rds_incoming *inc) +static void rds_tcp_inc_purge(struct rds_incoming *inc) { struct rds_tcp_incoming *tinc; tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); @@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, * processing. 
*/ while (left) { - if (tinc == NULL) { + if (!tinc) { tinc = kmem_cache_alloc(rds_tcp_incoming_slab, arg->gfp); - if (tinc == NULL) { + if (!tinc) { desc->error = -ENOMEM; goto out; } @@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, if (left && tc->t_tinc_data_rem) { clone = skb_clone(skb, arg->gfp); - if (clone == NULL) { + if (!clone) { desc->error = -ENOMEM; goto out; } @@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { /* check for teardown race */ + if (!conn) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } @@ -342,12 +342,12 @@ out: ready(sk, bytes); } -int __init rds_tcp_recv_init(void) +int rds_tcp_recv_init(void) { rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", sizeof(struct rds_tcp_incoming), 0, 0, NULL); - if (rds_tcp_incoming_slab == NULL) + if (!rds_tcp_incoming_slab) return -ENOMEM; return 0; } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index a28b895..2979fb4 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) } /* the core send_sem serializes this with other xmit and shutdown */ -int rds_tcp_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset) -{ - static struct rds_header rds_tcp_map_header = { - .h_flags = RDS_FLAG_CONG_BITMAP, - }; - struct rds_tcp_connection *tc = conn->c_transport_data; - unsigned long i; - int ret; - int copied = 0; - - /* Some problem claims cpu_to_be32(constant) isn't a constant. */ - rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); - - if (offset < sizeof(struct rds_header)) { - ret = rds_tcp_sendmsg(tc->t_sock, - (void *)&rds_tcp_map_header + offset, - sizeof(struct rds_header) - offset); - if (ret <= 0) - return ret; - offset += ret; - copied = ret; - if (offset < sizeof(struct rds_header)) - return ret; - } - - offset -= sizeof(struct rds_header); - i = offset / PAGE_SIZE; - offset = offset % PAGE_SIZE; - BUG_ON(i >= RDS_CONG_MAP_PAGES); - - do { - ret = tc->t_sock->ops->sendpage(tc->t_sock, - virt_to_page(map->m_page_addrs[i]), - offset, PAGE_SIZE - offset, - MSG_DONTWAIT); - if (ret <= 0) - break; - copied += ret; - offset += ret; - if (offset == PAGE_SIZE) { - offset = 0; - i++; - } - } while (i < RDS_CONG_MAP_PAGES); - - return copied ? 
copied : ret; -} - -/* the core send_sem serializes this with other xmit and shutdown */ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { @@ -166,21 +116,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - while (sg < rm->m_nents) { + while (sg < rm->data.op_nents) { ret = tc->t_sock->ops->sendpage(tc->t_sock, - sg_page(&rm->m_sg[sg]), - rm->m_sg[sg].offset + off, - rm->m_sg[sg].length - off, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, MSG_DONTWAIT|MSG_NOSIGNAL); - rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), - rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, ret); if (ret <= 0) break; off += ret; done += ret; - if (off == rm->m_sg[sg].length) { + if (off == rm->data.op_sg[sg].length) { off = 0; sg++; } @@ -226,7 +176,7 @@ void rds_tcp_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { + if (!conn) { write_space = sk->sk_write_space; goto out; } diff --git a/net/rds/threads.c b/net/rds/threads.c index 786c20e..0fd90f8 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -61,7 +61,7 @@ * * Transition to state DISCONNECTING/DOWN: * - Inside the shutdown worker; synchronizes with xmit path - * through c_send_lock, and with connection management callbacks + * through RDS_IN_XMIT, and with connection management callbacks * via c_cm_lock. * * For receive callbacks, we rely on the underlying transport @@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. */ -static void rds_queue_reconnect(struct rds_connection *conn) +void rds_queue_reconnect(struct rds_connection *conn) { unsigned long rand; @@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work) } } -void rds_shutdown_worker(struct work_struct *work) -{ - struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); - - /* shut it down unless it's down already */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { - /* - * Quiesce the connection mgmt handlers before we start tearing - * things down. We don't hold the mutex for the entire - * duration of the shutdown operation, else we may be - * deadlocking with the CM handler. Instead, the CM event - * handler is supposed to check for state DISCONNECTING - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && - !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { - rds_conn_error(conn, "shutdown called in state %d\n", - atomic_read(&conn->c_state)); - mutex_unlock(&conn->c_cm_lock); - return; - } - mutex_unlock(&conn->c_cm_lock); - - mutex_lock(&conn->c_send_lock); - conn->c_trans->conn_shutdown(conn); - rds_conn_reset(conn); - mutex_unlock(&conn->c_send_lock); - - if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { - /* This can happen - eg when we're in the middle of tearing - * down the connection, and someone unloads the rds module. - * Quite reproduceable with loopback connections. - * Mostly harmless. 
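The rds_tcp_xmit() loop above is the usual resumable scatterlist write: it tracks (sg, off) across partial sendpage() results so a later call can pick up mid-message. A condensed sketch of just that accounting, with simplified types; this is illustrative, not the kernel function:

/* Advance (sg, off) past whatever ->sendpage() accepted; on a short
 * or failed write the caller retries later from the same position. */
static int send_sg_from(struct socket *sock, struct scatterlist *sgl,
			unsigned int nents, unsigned int sg,
			unsigned int off)
{
	int done = 0, ret = 0;

	while (sg < nents) {
		ret = sock->ops->sendpage(sock, sg_page(&sgl[sg]),
					  sgl[sg].offset + off,
					  sgl[sg].length - off,
					  MSG_DONTWAIT | MSG_NOSIGNAL);
		if (ret <= 0)
			break;
		off += ret;
		done += ret;
		if (off == sgl[sg].length) {	/* entry fully sent */
			off = 0;
			sg++;
		}
	}
	return done ? done : ret;
}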
- */ - rds_conn_error(conn, - "%s: failed to transition to state DOWN, " - "current state is %d\n", - __func__, - atomic_read(&conn->c_state)); - return; - } - } - - /* Then reconnect if it's still live. - * The passive side of an IB loopback connection is never added - * to the conn hash, so we never trigger a reconnect on this - * conn - the reconnect is always triggered by the active peer. */ - cancel_delayed_work(&conn->c_conn_w); - if (!hlist_unhashed(&conn->c_hash_node)) - rds_queue_reconnect(conn); -} - void rds_send_worker(struct work_struct *work) { struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); @@ -252,15 +200,22 @@ void rds_recv_worker(struct work_struct *work) } } +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + rds_conn_shutdown(conn); +} + void rds_threads_exit(void) { destroy_workqueue(rds_wq); } -int __init rds_threads_init(void) +int rds_threads_init(void) { - rds_wq = create_workqueue("krdsd"); - if (rds_wq == NULL) + rds_wq = create_singlethread_workqueue("krdsd"); + if (!rds_wq) return -ENOMEM; return 0; diff --git a/net/rds/transport.c b/net/rds/transport.c index 7e10679..7f2ac4f 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans) } EXPORT_SYMBOL_GPL(rds_trans_unregister); +void rds_trans_put(struct rds_transport *trans) +{ + if (trans && trans->t_owner) + module_put(trans->t_owner); +} + struct rds_transport *rds_trans_get_preferred(__be32 addr) { struct rds_transport *ret = NULL; - int i; + struct rds_transport *trans; + unsigned int i; if (IN_LOOPBACK(ntohl(addr))) return &rds_loop_transport; down_read(&rds_trans_sem); - for (i = 0; i < RDS_TRANS_COUNT; i++) - { - if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { - ret = transports[i]; + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; break; } } diff --git a/net/rds/xlist.h b/net/rds/xlist.h new file mode 100644 index 0000000..e6b5190 --- /dev/null +++ b/net/rds/xlist.h @@ -0,0 +1,80 @@ +#ifndef _LINUX_XLIST_H +#define _LINUX_XLIST_H + +#include <linux/stddef.h> +#include <linux/poison.h> +#include <linux/prefetch.h> +#include <asm/system.h> + +struct xlist_head { + struct xlist_head *next; +}; + +static inline void INIT_XLIST_HEAD(struct xlist_head *list) +{ + list->next = NULL; +} + +static inline int xlist_empty(struct xlist_head *head) +{ + return head->next == NULL; +} + +static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, + struct xlist_head *head) +{ + struct xlist_head *cur; + struct xlist_head *check; + + while (1) { + cur = head->next; + tail->next = cur; + check = cmpxchg(&head->next, cur, new); + if (check == cur) + break; + } +} + +static inline struct xlist_head *xlist_del_head(struct xlist_head *head) +{ + struct xlist_head *cur; + struct xlist_head *check; + struct xlist_head *next; + + while (1) { + cur = head->next; + if (!cur) + goto out; + + next = cur->next; + check = cmpxchg(&head->next, cur, next); + if (check == cur) + goto out; + } +out: + return cur; +} + +static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head) +{ + struct xlist_head *cur; + + cur = head->next; + if (!cur) + return NULL; + + head->next = cur->next; + return cur; +} + +static inline void xlist_splice(struct 
xlist_head *list, + struct xlist_head *head) +{ + struct xlist_head *cur; + + WARN_ON(head->next); + cur = xchg(&list->next, NULL); + head->next = cur; +} + +#endif diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2f691fb..a36270a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -518,6 +518,16 @@ config NET_ACT_SKBEDIT To compile this code as a module, choose M here: the module will be called act_skbedit. +config NET_ACT_CSUM + tristate "Checksum Updating" + depends on NET_CLS_ACT && INET + ---help--- + Say Y here to update some common checksum after some direct + packet alterations. + + To compile this code as a module, choose M here: the + module will be called act_csum. + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index f14e71b..960f5db 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o +obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c new file mode 100644 index 0000000..67dc7ce --- /dev/null +++ b/net/sched/act_csum.c @@ -0,0 +1,595 @@ +/* + * Checksum updating actions + * + * Copyright (c) 2010 Gregoire Baron <baronchon@n7mm.org> + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. 
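The new xlist introduced above is a cmpxchg-based lock-free stack. A usage sketch with a hypothetical item type; note that xlist_add() takes a (new, tail) pair so a pre-linked chain can be pushed in one step, and for a single node new == tail:

/* Hypothetical user of the xlist helpers above. xlist_del_head_fast()
 * skips the cmpxchg and is only safe when the caller already excludes
 * concurrent access. */
struct my_item {
	struct xlist_head node;
	int payload;
};

static void my_push(struct xlist_head *stack, struct my_item *it)
{
	it->node.next = NULL;
	xlist_add(&it->node, &it->node, stack);	/* single-node chain */
}

static struct my_item *my_pop(struct xlist_head *stack)
{
	struct xlist_head *x = xlist_del_head(stack);

	return x ? container_of(x, struct my_item, node) : NULL;
}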
+ * + */ + +#include <linux/types.h> +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/spinlock.h> + +#include <linux/netlink.h> +#include <net/netlink.h> +#include <linux/rtnetlink.h> + +#include <linux/skbuff.h> + +#include <net/ip.h> +#include <net/ipv6.h> +#include <net/icmp.h> +#include <linux/icmpv6.h> +#include <linux/igmp.h> +#include <net/tcp.h> +#include <net/udp.h> +#include <net/ip6_checksum.h> + +#include <net/act_api.h> + +#include <linux/tc_act/tc_csum.h> +#include <net/tc_act/tc_csum.h> + +#define CSUM_TAB_MASK 15 +static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; +static u32 csum_idx_gen; +static DEFINE_RWLOCK(csum_lock); + +static struct tcf_hashinfo csum_hash_info = { + .htab = tcf_csum_ht, + .hmask = CSUM_TAB_MASK, + .lock = &csum_lock, +}; + +static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { + [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, +}; + +static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tc_csum *parm; + struct tcf_common *pc; + struct tcf_csum *p; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy); + if (err < 0) + return err; + + if (tb[TCA_CSUM_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CSUM_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &csum_idx_gen, &csum_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + p = to_tcf_csum(pc); + ret = ACT_P_CREATED; + } else { + p = to_tcf_csum(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &csum_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&p->tcf_lock); + p->tcf_action = parm->action; + p->update_flags = parm->update_flags; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &csum_hash_info); + + return ret; +} + +static int tcf_csum_cleanup(struct tc_action *a, int bind) +{ + struct tcf_csum *p = a->priv; + return tcf_hash_release(&p->common, bind, &csum_hash_info); +} + +/** + * tcf_csum_skb_nextlayer - Get next layer pointer + * @skb: sk_buff to use + * @ihl: previous summed headers length + * @ipl: complete packet length + * @jhl: next header length + * + * Check the expected next layer availability in the specified sk_buff. + * Return the next layer pointer if pass, NULL otherwise. 
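All of the helpers that follow recompute a checksum the same way: zero the field, csum_partial() over the segment, then fold the result. For reference, the ones'-complement sum they compute, as a small self-contained userspace function (RFC 1071 style, illustration only, not kernel code):

#include <stdint.h>
#include <stddef.h>

/* Plain RFC 1071 Internet checksum: sum 16-bit words with end-around
 * carry, then complement; csum_partial()/csum_fold() compute the same
 * thing incrementally in the kernel helpers below. */
static uint16_t inet_csum(const void *data, size_t len)
{
	const uint8_t *p = data;
	uint32_t sum = 0;

	while (len > 1) {
		sum += ((uint32_t)p[0] << 8) | p[1];
		p += 2;
		len -= 2;
	}
	if (len)			/* odd trailing byte */
		sum += (uint32_t)p[0] << 8;
	while (sum >> 16)		/* fold the carries back in */
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}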
+ */ +static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, + unsigned int jhl) +{ + int ntkoff = skb_network_offset(skb); + int hl = ihl + jhl; + + if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || + (skb_cloned(skb) && + !skb_clone_writable(skb, hl + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return NULL; + else + return (void *)(skb_network_header(skb) + ihl); +} + +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmphdr *icmph; + + icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); + if (icmph == NULL) + return 0; + + icmph->checksum = 0; + skb->csum = csum_partial(icmph, ipl - ihl, 0); + icmph->checksum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_igmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct igmphdr *igmph; + + igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); + if (igmph == NULL) + return 0; + + igmph->csum = 0; + skb->csum = csum_partial(igmph, ipl - ihl, 0); + igmph->csum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct icmp6hdr *icmp6h; + + icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); + if (icmp6h == NULL) + return 0; + + icmp6h->icmp6_cksum = 0; + skb->csum = csum_partial(icmp6h, ipl - ihl, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_ICMPV6, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = tcp_v4_check(ipl - ihl, + iph->saddr, iph->daddr, skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_TCP, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms. Don't use + * udph->len to get the real length without any protocol check; + * UDPLITE uses udph->len for another thing. + * Use iph->tot_len, or just ipl.
+ */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + if (udplite || udph->check) { + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + ul, iph->protocol, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* + * Support both UDP and UDPLITE checksum algorithms. Don't use + * udph->len to get the real length without any protocol check; + * UDPLITE uses udph->len for another thing. + * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, + udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) +{ + struct iphdr *iph; + int ntkoff; + + ntkoff = skb_network_offset(skb); + + if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) + goto fail; + + iph = ip_hdr(skb); + + switch (iph->frag_off & htons(IP_OFFSET) ?
0 : iph->protocol) { + case IPPROTO_ICMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_IGMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) + if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 0)) + goto fail; + break; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 1)) + goto fail; + break; + } + + if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto fail; + + ip_send_check(iph); + } + + return 1; + +fail: + return 0; +} + +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, + unsigned int ixhl, unsigned int *pl) +{ + int off, len, optlen; + unsigned char *xh = (void *)ip6xh; + + off = sizeof(*ip6xh); + len = ixhl - off; + + while (len > 1) { + switch (xh[off]) { + case IPV6_TLV_PAD0: + optlen = 1; + break; + case IPV6_TLV_JUMBO: + optlen = xh[off + 1] + 2; + if (optlen != 6 || len < 6 || (off & 3) != 2) + /* wrong jumbo option length/alignment */ + return 0; + *pl = ntohl(*(__be32 *)(xh + off + 2)); + goto done; + default: + optlen = xh[off + 1] + 2; + if (optlen > len) + /* ignore obscure options */ + goto done; + break; + } + off += optlen; + len -= optlen; + } + +done: + return 1; +} + +static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) +{ + struct ipv6hdr *ip6h; + struct ipv6_opt_hdr *ip6xh; + unsigned int hl, ixhl; + unsigned int pl; + int ntkoff; + u8 nexthdr; + + ntkoff = skb_network_offset(skb); + + hl = sizeof(*ip6h); + + if (!pskb_may_pull(skb, hl + ntkoff)) + goto fail; + + ip6h = ipv6_hdr(skb); + + pl = ntohs(ip6h->payload_len); + nexthdr = ip6h->nexthdr; + + do { + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + goto ignore_skb; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + ixhl = ipv6_optlen(ip6xh); + if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) + goto fail; + if ((nexthdr == NEXTHDR_HOP) && + !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) + goto fail; + nexthdr = ip6xh->nexthdr; + hl += ixhl; + break; + case IPPROTO_ICMPV6: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv6_icmp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv6_tcp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 0)) + goto fail; + goto done; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 1)) + goto fail; + goto done; + default: + goto ignore_skb; + } + } while (pskb_may_pull(skb, hl + 1 + ntkoff)); + +done: +ignore_skb: + return 1; + +fail: + return 0; +} + +static int tcf_csum(struct sk_buff *skb, + struct tc_action *a, struct tcf_result 
*res) +{ + struct tcf_csum *p = a->priv; + int action; + u32 update_flags; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + p->tcf_bstats.bytes += qdisc_pkt_len(skb); + p->tcf_bstats.packets++; + action = p->tcf_action; + update_flags = p->update_flags; + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (!tcf_csum_ipv4(skb, update_flags)) + goto drop; + break; + case cpu_to_be16(ETH_P_IPV6): + if (!tcf_csum_ipv6(skb, update_flags)) + goto drop; + break; + } + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_csum_dump(struct sk_buff *skb, + struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_csum *p = a->priv; + struct tc_csum opt = { + .update_flags = p->update_flags, + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_csum_ops = { + .kind = "csum", + .hinfo = &csum_hash_info, + .type = TCA_ACT_CSUM, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .cleanup = tcf_csum_cleanup, + .lookup = tcf_hash_search, + .init = tcf_csum_init, + .walk = tcf_generic_walker +}; + +MODULE_DESCRIPTION("Checksum updating actions"); +MODULE_LICENSE("GPL"); + +static int __init csum_init_module(void) +{ + return tcf_register_action(&act_csum_ops); +} + +static void __exit csum_cleanup_module(void) +{ + tcf_unregister_action(&act_csum_ops); +} + +module_init(csum_init_module); +module_exit(csum_cleanup_module); diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index e17096e..5b271a1 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -111,44 +111,41 @@ static u32 flow_get_proto(struct sk_buff *skb) } } -static int has_ports(u8 protocol) -{ - switch (protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - return 1; - default: - return 0; - } -} - static u32 flow_get_proto_src(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - has_ports(iph->protocol) && - pskb_network_may_pull(skb, iph->ihl * 4 + 2)) - return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4)); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + poff)); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; - if (!pskb_network_may_pull(skb, sizeof(*iph) + 2)) + if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); - if (has_ports(iph->nexthdr)) - return ntohs(*(__be16 *)&iph[1]); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, 
sizeof(*iph) + poff + 2)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff)); + } break; } } @@ -161,24 +158,36 @@ static u32 flow_get_proto_dst(struct sk_buff *skb) switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - has_ports(iph->protocol) && - pskb_network_may_pull(skb, iph->ihl * 4 + 4)) - return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2)); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + 2 + poff)); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; - if (!pskb_network_may_pull(skb, sizeof(*iph) + 4)) + if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); - if (has_ports(iph->nexthdr)) - return ntohs(*(__be16 *)((void *)&iph[1] + 2)); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff + 2)); + } break; } } @@ -297,6 +306,11 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb) return tag & VLAN_VID_MASK; } +static u32 flow_get_rxhash(struct sk_buff *skb) +{ + return skb_get_rxhash(skb); +} + static u32 flow_key_get(struct sk_buff *skb, int key) { switch (key) { @@ -334,6 +348,8 @@ static u32 flow_key_get(struct sk_buff *skb, int key) return flow_get_skgid(skb); case FLOW_KEY_VLAN_TAG: return flow_get_vlan_tag(skb); + case FLOW_KEY_RXHASH: + return flow_get_rxhash(skb); default: WARN_ON(1); return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 3bcac8a..34da5e2 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -223,6 +223,11 @@ META_COLLECTOR(int_maclen) dst->value = skb->mac_len; } +META_COLLECTOR(int_rxhash) +{ + dst->value = skb_get_rxhash(skb); +} + /************************************************************************** * Netfilter **************************************************************************/ @@ -541,6 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), + [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 408eea7..6fb3d41 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -360,7 +360,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); } - if (!s || tsize != s->tsize || (!tab && tsize > 0)) + if (tsize != s->tsize || (!tab && tsize > 0)) return ERR_PTR(-EINVAL); spin_lock(&qdisc_stab_lock); diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 201cbac..3cf478d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -123,40 +123,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) case htons(ETH_P_IP): { const struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) goto err; iph = ip_hdr(skb); h = (__force u32)iph->daddr; h2 = (__force u32)iph->saddr ^ iph->protocol; - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - (iph->protocol == IPPROTO_TCP || - 
iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_UDPLITE || - iph->protocol == IPPROTO_SCTP || - iph->protocol == IPPROTO_DCCP || - iph->protocol == IPPROTO_ESP) && - pskb_network_may_pull(skb, iph->ihl * 4 + 4)) - h2 ^= *(((u32*)iph) + iph->ihl); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) goto err; iph = ipv6_hdr(skb); h = (__force u32)iph->daddr.s6_addr32[3]; h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; - if ((iph->nexthdr == IPPROTO_TCP || - iph->nexthdr == IPPROTO_UDP || - iph->nexthdr == IPPROTO_UDPLITE || - iph->nexthdr == IPPROTO_SCTP || - iph->nexthdr == IPPROTO_DCCP || - iph->nexthdr == IPPROTO_ESP) && - pskb_network_may_pull(skb, sizeof(*iph) + 4)) - h2 ^= *(u32*)&iph[1]; + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { + iph = ipv6_hdr(skb); + h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff); + } break; } default: diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 0b85e52..5f1fb8b 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -48,6 +48,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/fcntl.h> #include <linux/poll.h> diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 476caaf..6c85564 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -37,6 +37,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/net.h> diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index ccb6dc4..397296f 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -43,6 +43,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <net/sctp/sctp.h> #include <net/sctp/sm.h> #include <linux/interrupt.h> diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7326891..95e0c8e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -47,6 +47,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/errno.h> #include <linux/types.h> @@ -336,7 +338,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk, memcpy(saddr, baddr, sizeof(union sctp_addr)); SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr); } else { - printk(KERN_ERR "%s: asoc:%p Could not find a valid source " + pr_err("%s: asoc:%p Could not find a valid source " "address for the dest:%pI6\n", __func__, asoc, &daddr->v6.sin6_addr); } diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index f73ec0e..8ef8e7d 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -38,6 +38,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <net/sctp/sctp.h> @@ -134,8 +136,7 @@ void sctp_dbg_objcnt_init(void) ent = proc_create("sctp_dbg_objcnt", 0, proc_net_sctp, &sctp_objcnt_ops); if (!ent) - printk(KERN_WARNING - "sctp_dbg_objcnt: Unable to create /proc entry.\n"); + pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n"); } /* Cleanup the objcount entry in the proc filesystem. 
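The cls_flow and sch_sfq hunks above both replace a hard-coded protocol list with proto_ports_offset(), which returns the offset of the 16-bit port pair inside the transport header, or a negative value for port-less protocols. The shared pattern, condensed from those hunks as an illustration:

/* Condensed from the cls_flow/sch_sfq changes above. The header
 * pointer must be re-fetched after pskb_network_may_pull(), since the
 * skb data may have been reallocated by the pull. */
static u32 get_dst_port_ipv4(struct sk_buff *skb)
{
	const struct iphdr *iph = ip_hdr(skb);
	int poff = proto_ports_offset(iph->protocol);

	if (poff < 0 || (iph->frag_off & htons(IP_MF | IP_OFFSET)))
		return 0;	/* no ports, or a non-first fragment */
	if (!pskb_network_may_pull(skb, iph->ihl * 4 + poff + 4))
		return 0;
	iph = ip_hdr(skb);
	return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + poff + 2));
}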
*/ diff --git a/net/sctp/output.c b/net/sctp/output.c index a646681..901764b 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -41,6 +41,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index c04b2eb..8c6d379 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/list.h> /* For struct list_head */ #include <linux/socket.h> @@ -1463,23 +1465,23 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* Display the end of the * current range. */ - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); } /* Start a new range. */ - SCTP_DEBUG_PRINTK(",%08x", tsn); + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); dbg_ack_tsn = tsn; break; case 1: /* The last TSN was NOT ACKed. */ if (dbg_last_kept_tsn != dbg_kept_tsn) { /* Display the end of current range. */ - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); } - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); /* FALL THROUGH... */ default: @@ -1526,18 +1528,18 @@ static void sctp_check_transmitted(struct sctp_outq *q, break; if (dbg_last_kept_tsn != dbg_kept_tsn) - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); - SCTP_DEBUG_PRINTK(",%08x", tsn); + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); dbg_kept_tsn = tsn; break; case 0: if (dbg_last_ack_tsn != dbg_ack_tsn) - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_ack_tsn); - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("\n"); /* FALL THROUGH... */ default: @@ -1556,17 +1558,17 @@ static void sctp_check_transmitted(struct sctp_outq *q, switch (dbg_prt_state) { case 0: if (dbg_last_ack_tsn != dbg_ack_tsn) { - SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn); } else { - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); } break; case 1: if (dbg_last_kept_tsn != dbg_kept_tsn) { - SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn); } else { - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); } } #endif /* SCTP_DEBUG */ diff --git a/net/sctp/probe.c b/net/sctp/probe.c index db3a42b..2e63e9d 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -22,6 +22,8 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/kernel.h> #include <linux/kprobes.h> #include <linux/socket.h> @@ -192,7 +194,7 @@ static __init int sctpprobe_init(void) if (ret) goto remove_proc; - pr_info("SCTP probe registered (port=%d)\n", port); + pr_info("probe registered (port=%d)\n", port); return 0; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5027b83..f774e65 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. 
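Each sctp file in this series gains the same pr_fmt define, which is what lets the log strings drop their hand-written "SCTP: " prefixes. A minimal illustration of the convention; KBUILD_MODNAME is supplied by kbuild at compile time:

/* pr_fmt() must be defined before printk.h is pulled in; every
 * subsequent pr_err()/pr_warn()/pr_info() then prepends the prefix. */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/printk.h>

static void pr_fmt_demo(void)
{
	pr_err("Failed to create the SCTP control socket\n");
	/* emitted as: "sctp: Failed to create the SCTP control socket" */
}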
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/module.h> #include <linux/init.h> #include <linux/netdevice.h> @@ -707,8 +709,7 @@ static int sctp_ctl_sock_init(void) &init_net); if (err < 0) { - printk(KERN_ERR - "SCTP: Failed to create the SCTP control socket.\n"); + pr_err("Failed to create the SCTP control socket\n"); return err; } return 0; @@ -1206,7 +1207,7 @@ SCTP_STATIC __init int sctp_init(void) __get_free_pages(GFP_ATOMIC, order); } while (!sctp_assoc_hashtable && --order > 0); if (!sctp_assoc_hashtable) { - printk(KERN_ERR "SCTP: Failed association hash alloc.\n"); + pr_err("Failed association hash alloc\n"); status = -ENOMEM; goto err_ahash_alloc; } @@ -1220,7 +1221,7 @@ SCTP_STATIC __init int sctp_init(void) sctp_ep_hashtable = (struct sctp_hashbucket *) kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { - printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n"); + pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; goto err_ehash_alloc; } @@ -1239,7 +1240,7 @@ SCTP_STATIC __init int sctp_init(void) __get_free_pages(GFP_ATOMIC, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { - printk(KERN_ERR "SCTP: Failed bind hash alloc."); + pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } @@ -1248,8 +1249,7 @@ SCTP_STATIC __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } - printk(KERN_INFO "SCTP: Hash tables configured " - "(established %d bind %d)\n", + pr_info("Hash tables configured (established %d bind %d)\n", sctp_assoc_hashsize, sctp_port_hashsize); /* Disable ADDIP by default. */ @@ -1290,8 +1290,7 @@ SCTP_STATIC __init int sctp_init(void) /* Initialize the control inode/socket for handling OOTB packets. */ if ((status = sctp_ctl_sock_init())) { - printk (KERN_ERR - "SCTP: Failed to initialize the SCTP control sock.\n"); + pr_err("Failed to initialize the SCTP control sock\n"); goto err_ctl_sock_init; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 246f929..2cc46f0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -50,6 +50,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f5e5e27..b21b218 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -47,6 +47,8 @@ * be incorporated into the next SCTP release. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/skbuff.h> #include <linux/types.h> #include <linux/socket.h> @@ -1146,26 +1148,23 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, case SCTP_DISPOSITION_VIOLATION: if (net_ratelimit()) - printk(KERN_ERR "sctp protocol violation state %d " - "chunkid %d\n", state, subtype.chunk); + pr_err("protocol violation state %d chunkid %d\n", + state, subtype.chunk); break; case SCTP_DISPOSITION_NOT_IMPL: - printk(KERN_WARNING "sctp unimplemented feature in state %d, " - "event_type %d, event_id %d\n", - state, event_type, subtype.chunk); + pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", + state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_BUG: - printk(KERN_ERR "sctp bug in state %d, " - "event_type %d, event_id %d\n", + pr_err("bug in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); BUG(); break; default: - printk(KERN_ERR "sctp impossible disposition %d " - "in state %d, event_type %d, event_id %d\n", + pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); BUG(); break; @@ -1679,8 +1678,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_cmd_send_asconf(asoc); break; default: - printk(KERN_WARNING "Impossible command: %u, %p\n", - cmd->verb, cmd->obj.ptr); + pr_warn("Impossible command: %u, %p\n", + cmd->verb, cmd->obj.ptr); break; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index d344dc4..4b4eb7c 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -50,6 +50,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/ip.h> @@ -1138,18 +1140,16 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, if (unlikely(!link)) { if (from_addr.sa.sa_family == AF_INET6) { if (net_ratelimit()) - printk(KERN_WARNING - "%s association %p could not find address %pI6\n", - __func__, - asoc, - &from_addr.v6.sin6_addr); + pr_warn("%s association %p could not find address %pI6\n", + __func__, + asoc, + &from_addr.v6.sin6_addr); } else { if (net_ratelimit()) - printk(KERN_WARNING - "%s association %p could not find address %pI4\n", - __func__, - asoc, - &from_addr.v4.sin_addr.s_addr); + pr_warn("%s association %p could not find address %pI4\n", + __func__, + asoc, + &from_addr.v4.sin_addr.s_addr); } return SCTP_DISPOSITION_DISCARD; } diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 6d9b3aa..546d4387 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. 
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/skbuff.h> #include <net/sctp/sctp.h> #include <net/sctp/sm.h> @@ -66,15 +68,19 @@ static const sctp_sm_table_entry_t bug = { .name = "sctp_sf_bug" }; -#define DO_LOOKUP(_max, _type, _table) \ - if ((event_subtype._type > (_max))) { \ - printk(KERN_WARNING \ - "sctp table %p possible attack:" \ - " event %d exceeds max %d\n", \ - _table, event_subtype._type, _max); \ - return &bug; \ - } \ - return &_table[event_subtype._type][(int)state]; +#define DO_LOOKUP(_max, _type, _table) \ +({ \ + const sctp_sm_table_entry_t *rtn; \ + \ + if ((event_subtype._type > (_max))) { \ + pr_warn("table %p possible attack: event %d exceeds max %d\n", \ + _table, event_subtype._type, _max); \ + rtn = &bug; \ + } else \ + rtn = &_table[event_subtype._type][(int)state]; \ + \ + rtn; \ +}) const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, sctp_state_t state, @@ -83,21 +89,15 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(event_subtype.chunk, state); - break; case SCTP_EVENT_T_TIMEOUT: - DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, - timeout_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, + timeout_event_table); case SCTP_EVENT_T_OTHER: - DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, + other_event_table); case SCTP_EVENT_T_PRIMITIVE: - DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, - primitive_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, + primitive_event_table); default: /* Yikes! We got an illegal event type. */ return &bug; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ca44917..6a691d8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -57,6 +57,8 @@ * be incorporated into the next SCTP release. 
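The DO_LOOKUP() rewrite above turns a macro that used to return from its caller into a GNU C statement expression, so each switch case can simply return DO_LOOKUP(...). A minimal illustration of the construct:

/* In a ({ ... }) statement expression the value of the last statement
 * becomes the value of the whole construct (a GNU C extension used
 * throughout the kernel). */
#define MAX_OF(a, b)			\
({					\
	typeof(a) _a = (a);		\
	typeof(b) _b = (b);		\
	_a > _b ? _a : _b;		\
})

static int max_of_demo(void)
{
	return MAX_OF(3, 7);	/* evaluates to 7 */
}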
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/types.h> #include <linux/kernel.h> #include <linux/wait.h> @@ -2458,9 +2460,8 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, if (params.sack_delay == 0 && params.sack_freq == 0) return 0; } else if (optlen == sizeof(struct sctp_assoc_value)) { - printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " - "in delayed_ack socket option deprecated\n"); - printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n"); + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; @@ -2868,10 +2869,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int val; if (optlen == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in maxseg socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); if (copy_from_user(&val, optval, optlen)) return -EFAULT; params.assoc_id = 0; @@ -3121,10 +3120,8 @@ static int sctp_setsockopt_maxburst(struct sock *sk, int assoc_id = 0; if (optlen == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in max_burst socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); if (copy_from_user(&val, optval, optlen)) return -EFAULT; } else if (optlen == sizeof(struct sctp_assoc_value)) { @@ -3595,7 +3592,40 @@ out: /* The SCTP ioctl handler. */ SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) { - return -ENOIOCTLCMD; + int rc = -ENOTCONN; + + sctp_lock_sock(sk); + + /* + * SEQPACKET-style sockets in LISTENING state are valid, for + * SCTP, so only discard TCP-style sockets in LISTENING state. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + goto out; + + switch (cmd) { + case SIOCINQ: { + struct sk_buff *skb; + unsigned int amount = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount of this packet since + * that is all that will be read. 
+ */ + amount = skb->len; + } + rc = put_user(amount, (int __user *)arg); + } + break; + default: + rc = -ENOIOCTLCMD; + break; + } +out: + sctp_release_sock(sk); + return rc; } /* This is the function which gets called during socket creation to @@ -4281,9 +4311,8 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else if (len == sizeof(struct sctp_assoc_value)) { - printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " - "in delayed_ack socket option deprecated\n"); - printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n"); + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else @@ -4929,10 +4958,8 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in maxseg socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5023,10 +5050,8 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in max_burst socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5586,8 +5611,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog) tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { if (net_ratelimit()) { - printk(KERN_INFO - "SCTP: failed to load transform for %s: %ld\n", + pr_info("failed to load transform for %s: %ld\n", sctp_hmac_alg, PTR_ERR(tfm)); } return -ENOSYS; @@ -5716,13 +5740,12 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* The association is either gone or not ready. */ diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 132046c..d3ae493 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -48,6 +48,8 @@ * be incorporated into the next SCTP release. 
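With the sctp_ioctl() change above, SIOCINQ on an SCTP socket now reports the length of the next queued packet instead of failing with ENOIOCTLCMD. A userspace usage sketch, error handling elided:

#include <sys/ioctl.h>
#include <linux/sockios.h>	/* SIOCINQ */

/* Returns the size of the next message queued on an SCTP socket, or
 * -1 on error; note it reports only the packet at the queue head. */
static int next_msg_len(int sctp_fd)
{
	int avail = 0;

	if (ioctl(sctp_fd, SIOCINQ, &avail) < 0)
		return -1;
	return avail;
}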
*/ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include <linux/slab.h> #include <linux/types.h> #include <linux/random.h> @@ -244,10 +246,9 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) struct dst_entry *dst; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", - __func__, pmtu, - SCTP_DEFAULT_MINSEGMENT); + pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, + SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment size and disable * pmtu discovery on this transport. */ diff --git a/net/socket.c b/net/socket.c index 2270b94..717a5f1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -535,14 +535,13 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -int sock_tx_timestamp(struct msghdr *msg, struct sock *sk, - union skb_shared_tx *shtx) +int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) { - shtx->flags = 0; + *tx_flags = 0; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - shtx->hardware = 1; + *tx_flags |= SKBTX_HW_TSTAMP; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - shtx->software = 1; + *tx_flags |= SKBTX_SW_TSTAMP; return 0; } EXPORT_SYMBOL(sock_tx_timestamp); @@ -1919,7 +1918,8 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) * Afterwards, it will be a kernel pointer. Thus the compiler-assisted * checking falls down on this. */ - if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control, + if (copy_from_user(ctl_buf, + (void __user __force *)msg_sys.msg_control, ctl_len)) goto out_freectl; msg_sys.msg_control = ctl_buf; @@ -3054,14 +3054,19 @@ int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { mm_segment_t oldfs = get_fs(); + char __user *uoptval; + int __user *uoptlen; int err; + uoptval = (char __user __force *) optval; + uoptlen = (int __user __force *) optlen; + set_fs(KERNEL_DS); if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, optval, optlen); + err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); else - err = sock->ops->getsockopt(sock, level, optname, optval, - optlen); + err = sock->ops->getsockopt(sock, level, optname, uoptval, + uoptlen); set_fs(oldfs); return err; } @@ -3071,13 +3076,16 @@ int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen) { mm_segment_t oldfs = get_fs(); + char __user *uoptval; int err; + uoptval = (char __user __force *) optval; + set_fs(KERNEL_DS); if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, optval, optlen); + err = sock_setsockopt(sock, level, optname, uoptval, optlen); else - err = sock->ops->setsockopt(sock, level, optname, optval, + err = sock->ops->setsockopt(sock, level, optname, uoptval, optlen); set_fs(oldfs); return err; diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index a008c66..b11248c 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -143,6 +143,19 @@ static void bcbuf_decr_acks(struct sk_buff *buf) } +static void bclink_set_last_sent(void) +{ + if (bcl->next_out) + bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1); + else + bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); +} + +u32 tipc_bclink_get_last_sent(void) +{ + return bcl->fsm_msg_cnt; +} + /** * bclink_set_gap - set gap according to contents of current deferred pkt queue * @@ -237,8 +250,10 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) /* Try resolving broadcast link congestion, 
if necessary */ - if (unlikely(bcl->next_out)) + if (unlikely(bcl->next_out)) { tipc_link_push_queue(bcl); + bclink_set_last_sent(); + } if (unlikely(released && !list_empty(&bcl->waiting_ports))) tipc_link_wakeup_ports(bcl, 0); spin_unlock_bh(&bc_lock); @@ -395,7 +410,7 @@ int tipc_bclink_send_msg(struct sk_buff *buf) if (unlikely(res == -ELINKCONG)) buf_discard(buf); else - bcl->stats.sent_info++; + bclink_set_last_sent(); if (bcl->out_queue_size > bcl->stats.max_queue_sz) bcl->stats.max_queue_sz = bcl->out_queue_size; @@ -529,15 +544,6 @@ receive: tipc_node_unlock(node); } -u32 tipc_bclink_get_last_sent(void) -{ - u32 last_sent = mod(bcl->next_out_no - 1); - - if (bcl->next_out) - last_sent = mod(buf_seqno(bcl->next_out) - 1); - return last_sent; -} - u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr) { return (n_ptr->bclink.supported && @@ -570,6 +576,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, msg = buf_msg(buf); msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tipc_net_id); + bcl->stats.sent_info++; } /* Send buffer over bearers until all targets reached */ @@ -609,11 +616,13 @@ static int tipc_bcbearer_send(struct sk_buff *buf, bcbearer->remains = bcbearer->remains_new; } - /* Unable to reach all targets */ + /* + * Unable to reach all targets (indicate success, since currently + * there isn't code in place to properly block & unblock the + * pseudo-bearer used by the broadcast link) + */ - bcbearer->bearer.publ.blocked = 1; - bcl->stats.bearer_congs++; - return 1; + return TIPC_OK; } /** diff --git a/net/tipc/core.c b/net/tipc/core.c index 6964681..466b861 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -169,6 +169,7 @@ void tipc_core_stop(void) tipc_nametbl_stop(); tipc_ref_table_stop(); tipc_socket_stop(); + tipc_log_resize(0); } /** @@ -203,7 +204,9 @@ static int __init tipc_init(void) { int res; - tipc_log_resize(CONFIG_TIPC_LOG); + if (tipc_log_resize(CONFIG_TIPC_LOG) != 0) + warn("Unable to create log buffer\n"); + info("Activated (version " TIPC_MOD_VER " compiled " __DATE__ " " __TIME__ ")\n"); @@ -230,7 +233,6 @@ static void __exit tipc_exit(void) tipc_core_stop_net(); tipc_core_stop(); info("Deactivated\n"); - tipc_log_resize(0); } module_init(tipc_init); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index fc1fcf5..f28d1ae 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -203,6 +203,14 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr) return; } spin_lock_bh(&n_ptr->lock); + + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + spin_unlock_bh(&n_ptr->lock); + return; + } + link = n_ptr->links[b_ptr->identity]; if (!link) { dbg("creating link\n"); diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 6230d16..6e988ba 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -72,17 +72,26 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, { struct sk_buff *clone; struct net_device *dev; + int delta; clone = skb_clone(buf, GFP_ATOMIC); - if (clone) { - skb_reset_network_header(clone); - dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; - clone->dev = dev; - dev_hard_header(clone, dev, ETH_P_TIPC, - &dest->dev_addr.eth_addr, - dev->dev_addr, clone->len); - dev_queue_xmit(clone); + if (!clone) + return 0; + + dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; + delta = dev->hard_header_len - skb_headroom(buf); + + if ((delta > 0) && + pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(clone); + 
return 0; } + + skb_reset_network_header(clone); + clone->dev = dev; + dev_hard_header(clone, dev, ETH_P_TIPC, &dest->dev_addr.eth_addr, + dev->dev_addr, clone->len); + dev_queue_xmit(clone); return 0; } @@ -92,15 +101,12 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, * Accept only packets explicitly sent to this node, or broadcast packets; * ignores packets sent using Ethernet multicast, and traffic sent to other * nodes (which can happen if interface is running in promiscuous mode). - * Routine truncates any Ethernet padding/CRC appended to the message, - * and ensures message size matches actual length */ static int recv_msg(struct sk_buff *buf, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv; - u32 size; if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(buf); @@ -109,13 +115,9 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, if (likely(eb_ptr->bearer)) { if (likely(buf->pkt_type <= PACKET_BROADCAST)) { - size = msg_size((struct tipc_msg *)buf->data); - skb_trim(buf, size); - if (likely(buf->len == size)) { - buf->next = NULL; - tipc_recv_msg(buf, eb_ptr->bearer); - return 0; - } + buf->next = NULL; + tipc_recv_msg(buf, eb_ptr->bearer); + return 0; } } kfree_skb(buf); @@ -133,6 +135,16 @@ static int enable_bearer(struct tipc_bearer *tb_ptr) struct eth_bearer *eb_ptr = &eth_bearers[0]; struct eth_bearer *stop = &eth_bearers[MAX_ETH_BEARERS]; char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; + int pending_dev = 0; + + /* Find unused Ethernet bearer structure */ + + while (eb_ptr->dev) { + if (!eb_ptr->bearer) + pending_dev++; + if (++eb_ptr == stop) + return pending_dev ? -EAGAIN : -EDQUOT; + } /* Find device with specified name */
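
The rewritten send_msg() above no longer assumes a cloned buffer has room for the Ethernet header: it measures the shortfall against dev->hard_header_len and grows the headroom before pushing the header. A condensed sketch of that idiom, under the assumption that <linux/skbuff.h>, <linux/netdevice.h> and <linux/errno.h> are in scope; the helper name is illustrative:

/* Ensure 'skb' has headroom for the device's link-level header,
 * expanding the buffer head only when it would not fit. */
static int make_room_for_header(struct sk_buff *skb, struct net_device *dev)
{
	int delta = dev->hard_header_len - skb_headroom(skb);

	if ((delta > 0) &&
	    pskb_expand_head(skb, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC))
		return -ENOMEM;	/* caller is expected to free the skb */
	return 0;
}

diff --git a/net/tipc/link.c index a3616b9..a6a3102 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1802,6 +1802,15 @@ static int link_recv_buf_validate(struct sk_buff *buf) return pskb_may_pull(buf, hdr_size); } +/** + * tipc_recv_msg - process TIPC messages arriving from off-node + * @head: pointer to message buffer chain + * @tb_ptr: pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive.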
+ */ + void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) { read_lock_bh(&tipc_net_lock); @@ -1819,6 +1828,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) head = head->next; + /* Ensure bearer is still enabled */ + + if (unlikely(!b_ptr->active)) + goto cont; + /* Ensure message is well-formed */ if (unlikely(!link_recv_buf_validate(buf))) @@ -1855,13 +1869,22 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) goto cont; } - /* Locate unicast link endpoint that should handle message */ + /* Locate neighboring node that sent message */ n_ptr = tipc_node_find(msg_prevnode(msg)); if (unlikely(!n_ptr)) goto cont; tipc_node_lock(n_ptr); + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + tipc_node_unlock(n_ptr); + goto cont; + } + + /* Locate unicast link endpoint that should handle message */ + l_ptr = n_ptr->links[b_ptr->identity]; if (unlikely(!l_ptr)) { tipc_node_unlock(n_ptr); diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 8ba7962..c13c2c7 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -613,8 +613,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, } /* - * tipc_nametbl_translate(): Translate tipc_name -> tipc_portid. - * Very time-critical. + * tipc_nametbl_translate - translate name to port id * * Note: on entry 'destnode' is the search domain used during translation; * on exit it passes back the node address of the matching port (if any) @@ -685,7 +684,6 @@ found: } spin_unlock_bh(&seq->lock); not_found: - *destnode = 0; read_unlock_bh(&tipc_nametbl_lock); return 0; } @@ -877,7 +875,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, u32 index) { char portIdStr[27]; - char *scopeStr; + const char *scope_str[] = {"", " zone", " cluster", " node"}; struct publication *publ = sseq->zone_list; tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper); @@ -893,15 +891,8 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, tipc_node(publ->node), publ->ref); tipc_printf(buf, "%-26s ", portIdStr); if (depth > 3) { - if (publ->node != tipc_own_addr) - scopeStr = ""; - else if (publ->scope == TIPC_NODE_SCOPE) - scopeStr = "node"; - else if (publ->scope == TIPC_CLUSTER_SCOPE) - scopeStr = "cluster"; - else - scopeStr = "zone"; - tipc_printf(buf, "%-10u %s", publ->key, scopeStr); + tipc_printf(buf, "%-10u %s", publ->key, + scope_str[publ->scope]); } publ = publ->zone_list_next; @@ -951,24 +942,19 @@ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth, static void nametbl_header(struct print_buf *buf, u32 depth) { - tipc_printf(buf, "Type "); - - if (depth > 1) - tipc_printf(buf, "Lower Upper "); - if (depth > 2) - tipc_printf(buf, "Port Identity "); - if (depth > 3) - tipc_printf(buf, "Publication"); - - tipc_printf(buf, "\n-----------"); - - if (depth > 1) - tipc_printf(buf, "--------------------- "); - if (depth > 2) - tipc_printf(buf, "-------------------------- "); - if (depth > 3) - tipc_printf(buf, "------------------"); - + const char *header[] = { + "Type ", + "Lower Upper ", + "Port Identity ", + "Publication Scope" + }; + + int i; + + if (depth > 4) + depth = 4; + for (i = 0; i < depth; i++) + tipc_printf(buf, header[i]); tipc_printf(buf, "\n"); } diff --git a/net/tipc/net.c b/net/tipc/net.c index f61b769..7e05af4 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -248,6 +248,7 @@ void tipc_net_route_msg(struct sk_buff *buf) /* 
Handle message for another node */ msg_dbg(msg, "NET>SEND>: "); + skb_trim(buf, msg_size(msg)); tipc_link_send(buf, dnode, msg_link_selector(msg)); } diff --git a/net/tipc/node.c b/net/tipc/node.c index b634942..b702c7b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -237,8 +237,7 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr) int tipc_node_has_active_links(struct tipc_node *n_ptr) { - return (n_ptr && - ((n_ptr->active_links[0]) || (n_ptr->active_links[1]))); + return n_ptr->active_links[0] != NULL; } int tipc_node_has_redundant_links(struct tipc_node *n_ptr) @@ -384,6 +383,20 @@ static void node_established_contact(struct tipc_node *n_ptr) tipc_highest_allowed_slave); } +static void node_cleanup_finished(unsigned long node_addr) +{ + struct tipc_node *n_ptr; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(node_addr); + if (n_ptr) { + tipc_node_lock(n_ptr); + n_ptr->cleanup_required = 0; + tipc_node_unlock(n_ptr); + } + read_unlock_bh(&tipc_net_lock); +} + static void node_lost_contact(struct tipc_node *n_ptr) { struct cluster *c_ptr; @@ -458,6 +471,11 @@ static void node_lost_contact(struct tipc_node *n_ptr) tipc_k_signal((Handler)ns->handle_node_down, (unsigned long)ns->usr_handle); } + + /* Prevent re-contact with node until all cleanup is done */ + + n_ptr->cleanup_required = 1; + tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr); } /** diff --git a/net/tipc/node.h b/net/tipc/node.h index 6f990da..45f3db3 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -52,6 +52,7 @@ * @active_links: pointers to active links to node * @links: pointers to all links to node * @working_links: number of working links to node (both active and standby) + * @cleanup_required: non-zero if cleaning up after a prior loss of contact * @link_cnt: number of links to node * @permit_changeover: non-zero if node has redundant links to this system * @routers: bitmap (used for multicluster communication) @@ -78,6 +79,7 @@ struct tipc_node { struct link *links[MAX_BEARERS]; int link_cnt; int working_links; + int cleanup_required; int permit_changeover; u32 routers[512/32]; int last_router; diff --git a/net/tipc/port.c b/net/tipc/port.c index 0737680..d760336 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -588,19 +588,10 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf) if (!p_ptr) { err = TIPC_ERR_NO_PORT; } else if (p_ptr->publ.connected) { - if (port_peernode(p_ptr) != msg_orignode(msg)) + if ((port_peernode(p_ptr) != msg_orignode(msg)) || + (port_peerport(p_ptr) != msg_origport(msg))) { err = TIPC_ERR_NO_PORT; - if (port_peerport(p_ptr) != msg_origport(msg)) - err = TIPC_ERR_NO_PORT; - if (!err && msg_routed(msg)) { - u32 seqno = msg_transp_seqno(msg); - u32 myno = ++p_ptr->last_in_seqno; - if (seqno != myno) { - err = TIPC_ERR_NO_PORT; - abort_buf = port_build_self_abort_msg(p_ptr, err); - } - } - if (msg_type(msg) == CONN_ACK) { + } else if (msg_type(msg) == CONN_ACK) { int wakeup = tipc_port_congested(p_ptr) && p_ptr->publ.congested && p_ptr->wakeup; @@ -1473,7 +1464,7 @@ int tipc_forward2name(u32 ref, msg_set_destnode(msg, destnode); msg_set_destport(msg, destport); - if (likely(destport || destnode)) { + if (likely(destport)) { p_ptr->sent++; if (likely(destnode == tipc_own_addr)) return tipc_port_recv_sections(p_ptr, num_sect, msg_sect); @@ -1551,7 +1542,7 @@ int tipc_forward_buf2name(u32 ref, skb_push(buf, LONG_H_SIZE); skb_copy_to_linear_data(buf, msg, LONG_H_SIZE); msg_dbg(buf_msg(buf),"PREP:"); - if (likely(destport || destnode)) { + if 
(likely(destport)) { p_ptr->sent++; if (destnode == tipc_own_addr) return tipc_port_recv_msg(buf); diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 66e889b..f7ac94d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -64,6 +64,7 @@ struct tipc_sock { struct sock sk; struct tipc_port *p; struct tipc_portid peer_name; + long conn_timeout; }; #define tipc_sk(sk) ((struct tipc_sock *)(sk)) @@ -240,9 +241,9 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol, sock->state = state; sock_init_data(sock, sk); - sk->sk_rcvtimeo = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); sk->sk_backlog_rcv = backlog_rcv; tipc_sk(sk)->p = tp_ptr; + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); spin_unlock_bh(tp_ptr->lock); @@ -429,36 +430,55 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, * to handle any preventable race conditions, so TIPC will do the same ... * * TIPC sets the returned events as follows: - * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty - * or if a connection-oriented socket is does not have an active connection - * (i.e. a read operation will not block). - * b) POLLOUT is set except when a socket's connection has been terminated - * (i.e. a write operation will not block). - * c) POLLHUP is set when a socket's connection has been terminated. - * - * IMPORTANT: The fact that a read or write operation will not block does NOT - * imply that the operation will succeed! + * + * socket state flags set + * ------------ --------- + * unconnected no read flags + * no write flags + * + * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue + * no write flags + * + * connected POLLIN/POLLRDNORM if data in rx queue + * POLLOUT if port is not congested + * + * disconnecting POLLIN/POLLRDNORM/POLLHUP + * no write flags + * + * listening POLLIN if SYN in rx queue + * no write flags + * + * ready POLLIN/POLLRDNORM if data in rx queue + * [connectionless] POLLOUT (since port cannot be congested) + * + * IMPORTANT: The fact that a read or write operation is indicated does NOT + * imply that the operation will succeed, merely that it should be performed + * and will not block. 
*/ static unsigned int poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - u32 mask; + u32 mask = 0; poll_wait(file, sk_sleep(sk), wait); - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sock->state == SS_UNCONNECTED) || - (sock->state == SS_DISCONNECTING)) - mask = (POLLRDNORM | POLLIN); - else - mask = 0; - - if (sock->state == SS_DISCONNECTING) - mask |= POLLHUP; - else - mask |= POLLOUT; + switch ((int)sock->state) { + case SS_READY: + case SS_CONNECTED: + if (!tipc_sk_port(sk)->congested) + mask |= POLLOUT; + /* fall thru' */ + case SS_CONNECTING: + case SS_LISTENING: + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= (POLLIN | POLLRDNORM); + break; + case SS_DISCONNECTING: + mask = (POLLIN | POLLRDNORM | POLLHUP); + break; + } return mask; } @@ -1026,9 +1046,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, struct sk_buff *buf; struct tipc_msg *msg; unsigned int sz; - int sz_to_copy; + int sz_to_copy, target, needed; int sz_copied = 0; - int needed; char __user *crs = m->msg_iov->iov_base; unsigned char *buf_crs; u32 err; @@ -1050,6 +1069,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, goto exit; } + target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); + restart: /* Look for a message in receive queue; wait if necessary */ @@ -1138,7 +1159,7 @@ restart: if ((sz_copied < buf_len) && /* didn't get all requested data */ (!skb_queue_empty(&sk->sk_receive_queue) || - (flags & MSG_WAITALL)) && /* and more is ready or required */ + (sz_copied < target)) && /* and more is ready or required */ (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ (!err)) /* and haven't reached a FIN */ goto restart; @@ -1365,6 +1386,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, struct msghdr m = {NULL,}; struct sk_buff *buf; struct tipc_msg *msg; + long timeout; int res; lock_sock(sk); @@ -1379,7 +1401,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, /* For now, TIPC does not support the non-blocking form of connect() */ if (flags & O_NONBLOCK) { - res = -EWOULDBLOCK; + res = -EOPNOTSUPP; goto exit; } @@ -1425,11 +1447,12 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ + timeout = tipc_sk(sk)->conn_timeout; release_sock(sk); res = wait_event_interruptible_timeout(*sk_sleep(sk), (!skb_queue_empty(&sk->sk_receive_queue) || (sock->state != SS_CONNECTING)), - sk->sk_rcvtimeo); + timeout ? 
timeout : MAX_SCHEDULE_TIMEOUT); lock_sock(sk); if (res > 0) { @@ -1692,7 +1715,7 @@ static int setsockopt(struct socket *sock, res = tipc_set_portunreturnable(tport->ref, value); break; case TIPC_CONN_TIMEOUT: - sk->sk_rcvtimeo = msecs_to_jiffies(value); + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value); /* no need to set "res", since already 0 at this point */ break; default: @@ -1747,7 +1770,7 @@ static int getsockopt(struct socket *sock, res = tipc_portunreturnable(tport->ref, &value); break; case TIPC_CONN_TIMEOUT: - value = jiffies_to_msecs(sk->sk_rcvtimeo); + value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout); /* no need to set "res", since already 0 at this point */ break; case TIPC_NODE_RECVQ_DEPTH: diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0b39b24..c586da3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2033,11 +2033,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/wireless/core.c b/net/wireless/core.c index d6d046b..d587ad2 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -253,11 +253,16 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } + + return err; } wiphy_net_set(&rdev->wiphy, net); - return err; + err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); + WARN_ON(err); + + return 0; } static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) @@ -428,7 +433,7 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check ifmodes */ WARN_ON(!ifmodes); - ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1; + ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1; if (WARN_ON(ifmodes != wiphy->interface_modes)) wiphy->interface_modes = ifmodes; @@ -683,8 +688,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work); INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); - INIT_LIST_HEAD(&wdev->action_registrations); - spin_lock_init(&wdev->action_registrations_lock); + INIT_LIST_HEAD(&wdev->mgmt_registrations); + spin_lock_init(&wdev->mgmt_registrations_lock); mutex_lock(&rdev->devlist_mtx); list_add_rcu(&wdev->list, &rdev->netdev_list); @@ -804,7 +809,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, sysfs_remove_link(&dev->dev.kobj, "phy80211"); list_del_rcu(&wdev->list); rdev->devlist_generation++; - cfg80211_mlme_purge_actions(wdev); + cfg80211_mlme_purge_registrations(wdev); #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.keys); #endif diff --git a/net/wireless/core.h b/net/wireless/core.h index 63d57ae..58ab2c7 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -331,16 +331,17 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *resp_ie, size_t resp_ie_len, u16 status, bool wextev, struct cfg80211_bss *bss); -int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, - const u8 *match_data, int match_len); -void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid); -void cfg80211_mlme_purge_actions(struct 
wireless_dev *wdev); -int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_channel_type channel_type, - bool channel_type_valid, - const u8 *buf, size_t len, u64 *cookie); +int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, + u16 frame_type, const u8 *match_data, + int match_len); +void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); +void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); +int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, + const u8 *buf, size_t len, u64 *cookie); /* SME */ int __cfg80211_connect(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index d1a3fb9..8515b1e 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -149,7 +149,7 @@ void __cfg80211_send_deauth(struct net_device *dev, struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; int i; - bool found = false; + bool found = false, was_current = false; ASSERT_WDEV_LOCK(wdev); @@ -159,6 +159,7 @@ void __cfg80211_send_deauth(struct net_device *dev, cfg80211_put_bss(&wdev->current_bss->pub); wdev->current_bss = NULL; found = true; + was_current = true; } else for (i = 0; i < MAX_AUTH_BSSES; i++) { if (wdev->auth_bsses[i] && memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) { @@ -183,7 +184,7 @@ void __cfg80211_send_deauth(struct net_device *dev, nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL); - if (wdev->sme_state == CFG80211_SME_CONNECTED) { + if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) { u16 reason_code; bool from_ap; @@ -747,31 +748,51 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, } EXPORT_SYMBOL(cfg80211_new_sta); -struct cfg80211_action_registration { +struct cfg80211_mgmt_registration { struct list_head list; u32 nlpid; int match_len; + __le16 frame_type; + u8 match[]; }; -int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, - const u8 *match_data, int match_len) +int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, + u16 frame_type, const u8 *match_data, + int match_len) { - struct cfg80211_action_registration *reg, *nreg; + struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; + u16 mgmt_type; + + if (!wdev->wiphy->mgmt_stypes) + return -EOPNOTSUPP; + + if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) + return -EINVAL; + + if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) + return -EINVAL; + + mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; + if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) + return -EINVAL; nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); if (!nreg) return -ENOMEM; - spin_lock_bh(&wdev->action_registrations_lock); + spin_lock_bh(&wdev->mgmt_registrations_lock); - list_for_each_entry(reg, &wdev->action_registrations, list) { + list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); + if (frame_type != le16_to_cpu(reg->frame_type)) + continue; + if (memcmp(reg->match, match_data, mlen) == 0) { err = -EALREADY; break; @@ -786,62 +807,75 @@ int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; nreg->nlpid = snd_pid; - 
list_add(&nreg->list, &wdev->action_registrations); + nreg->frame_type = cpu_to_le16(frame_type); + list_add(&nreg->list, &wdev->mgmt_registrations); out: - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); return err; } -void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid) +void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid) { - struct cfg80211_action_registration *reg, *tmp; + struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->action_registrations_lock); + spin_lock_bh(&wdev->mgmt_registrations_lock); - list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) { + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlpid == nlpid) { list_del(&reg->list); kfree(reg); } } - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); } -void cfg80211_mlme_purge_actions(struct wireless_dev *wdev) +void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_action_registration *reg, *tmp; + struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->action_registrations_lock); + spin_lock_bh(&wdev->mgmt_registrations_lock); - list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) { + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(&reg->list); kfree(reg); } - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); } -int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_channel_type channel_type, - bool channel_type_valid, - const u8 *buf, size_t len, u64 *cookie) +int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, + const u8 *buf, size_t len, u64 *cookie) { struct wireless_dev *wdev = dev->ieee80211_ptr; const struct ieee80211_mgmt *mgmt; + u16 stype; + + if (!wdev->wiphy->mgmt_stypes) + return -EOPNOTSUPP; - if (rdev->ops->action == NULL) + if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (len < 24 + 1) return -EINVAL; mgmt = (const struct ieee80211_mgmt *) buf; - if (!ieee80211_is_action(mgmt->frame_control)) + + if (!ieee80211_is_mgmt(mgmt->frame_control)) return -EINVAL; - if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { + + stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; + if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) + return -EINVAL; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { /* Verify that we are associated with the destination AP */ wdev_lock(wdev); @@ -862,64 +896,75 @@ int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, return -EINVAL; /* Transmit the Action frame as requested by user space */ - return rdev->ops->action(&rdev->wiphy, dev, chan, channel_type, - channel_type_valid, buf, len, cookie); + return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, channel_type, + channel_type_valid, buf, len, cookie); }
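
With frame_type now carried in each registration, the receive path below matches an incoming frame against two conditions: the frame's type/subtype word must equal the registered frame_type, and the registered pattern must be a byte-wise prefix of the frame payload. A compressed restatement of that rule, assuming <linux/string.h> and the patch's struct cfg80211_mgmt_registration are in scope; the helper itself is illustrative:

/* True when a registration should fire for a received management frame
 * whose type/subtype is 'ftype' and whose payload is data[0..data_len). */
static bool mgmt_reg_matches(const struct cfg80211_mgmt_registration *reg,
			     __le16 ftype, const u8 *data, int data_len)
{
	if (reg->frame_type != ftype)
		return false;
	if (reg->match_len > data_len)
		return false;
	return memcmp(reg->match, data, reg->match_len) == 0;
}

-bool cfg80211_rx_action(struct net_device *dev, int freq, const u8 *buf, - size_t len, gfp_t gfp) +bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf, + size_t len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev =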
wiphy_to_dev(wiphy); - struct cfg80211_action_registration *reg; - const u8 *action_data; - int action_data_len; + struct cfg80211_mgmt_registration *reg; + const struct ieee80211_txrx_stypes *stypes = + &wiphy->mgmt_stypes[wdev->iftype]; + struct ieee80211_mgmt *mgmt = (void *)buf; + const u8 *data; + int data_len; bool result = false; + __le16 ftype = mgmt->frame_control & + cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); + u16 stype; - /* frame length - min size excluding category */ - action_data_len = len - (IEEE80211_MIN_ACTION_SIZE - 1); + stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; - /* action data starts with category */ - action_data = buf + IEEE80211_MIN_ACTION_SIZE - 1; + if (!(stypes->rx & BIT(stype))) + return false; - spin_lock_bh(&wdev->action_registrations_lock); + data = buf + ieee80211_hdrlen(mgmt->frame_control); + data_len = len - ieee80211_hdrlen(mgmt->frame_control); + + spin_lock_bh(&wdev->mgmt_registrations_lock); + + list_for_each_entry(reg, &wdev->mgmt_registrations, list) { + if (reg->frame_type != ftype) + continue; - list_for_each_entry(reg, &wdev->action_registrations, list) { - if (reg->match_len > action_data_len) + if (reg->match_len > data_len) continue; - if (memcmp(reg->match, action_data, reg->match_len)) + if (memcmp(reg->match, data, reg->match_len)) continue; /* found match! */ /* Indicate the received Action frame to user space */ - if (nl80211_send_action(rdev, dev, reg->nlpid, freq, - buf, len, gfp)) + if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq, + buf, len, gfp)) continue; result = true; break; } - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); return result; } -EXPORT_SYMBOL(cfg80211_rx_action); +EXPORT_SYMBOL(cfg80211_rx_mgmt); -void cfg80211_action_tx_status(struct net_device *dev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); /* Indicate TX status of the Action frame to user space */ - nl80211_send_action_tx_status(rdev, dev, cookie, buf, len, ack, gfp); + nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp); } -EXPORT_SYMBOL(cfg80211_action_tx_status); +EXPORT_SYMBOL(cfg80211_mgmt_tx_status); void cfg80211_cqm_rssi_notify(struct net_device *dev, enum nl80211_cqm_rssi_threshold_event rssi_event, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 37902a5..85a23de 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -136,6 +136,8 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { .len = sizeof(struct nl80211_sta_flag_update), }, [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, + [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, @@ -156,6 +158,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 }, + [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 }, }; /* policy for the attributes */ @@ -437,6 +440,8 @@ static int nl80211_send_wiphy(struct 
sk_buff *msg, u32 pid, u32 seq, int flags, struct ieee80211_rate *rate; int i; u16 ifmodes = dev->wiphy.interface_modes; + const struct ieee80211_txrx_stypes *mgmt_stypes = + dev->wiphy.mgmt_stypes; hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY); if (!hdr) @@ -471,6 +476,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, dev->wiphy.max_num_pmkids); + if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) + NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE); + nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); if (!nl_modes) goto nla_put_failure; @@ -587,7 +595,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(flush_pmksa, FLUSH_PMKSA); CMD(remain_on_channel, REMAIN_ON_CHANNEL); CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); - CMD(action, ACTION); + CMD(mgmt_tx, FRAME); if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); @@ -608,6 +616,55 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, nla_nest_end(msg, nl_cmds); + if (mgmt_stypes) { + u16 stypes; + struct nlattr *nl_ftypes, *nl_ifs; + enum nl80211_iftype ift; + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES); + if (!nl_ifs) + goto nla_put_failure; + + for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + goto nla_put_failure; + i = 0; + stypes = mgmt_stypes[ift].tx; + while (stypes) { + if (stypes & 1) + NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT); + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + + nla_nest_end(msg, nl_ifs); + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); + if (!nl_ifs) + goto nla_put_failure; + + for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + goto nla_put_failure; + i = 0; + stypes = mgmt_stypes[ift].rx; + while (stypes) { + if (stypes & 1) + NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT); + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + nla_nest_end(msg, nl_ifs); + } + return genlmsg_end(msg, hdr); nla_put_failure: @@ -3572,6 +3629,21 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; + if (key.idx >= 0) { + int i; + bool ok = false; + for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) { + if (key.p.cipher == rdev->wiphy.cipher_suites[i]) { + ok = true; + break; + } + } + if (!ok) { + err = -EINVAL; + goto out; + } + } + if (!rdev->ops->auth) { err = -EOPNOTSUPP; goto out; @@ -3624,7 +3696,8 @@ unlock_rtnl: return err; } -static int nl80211_crypto_settings(struct genl_info *info, +static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, + struct genl_info *info, struct cfg80211_crypto_settings *settings, int cipher_limit) { @@ -3632,6 +3705,19 @@ static int nl80211_crypto_settings(struct genl_info *info, settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT]; + if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { + u16 proto; + proto = nla_get_u16( + info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); + settings->control_port_ethertype = cpu_to_be16(proto); + if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && + proto != ETH_P_PAE) + return -EINVAL; + if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]) + settings->control_port_no_encrypt = true; + } else 
+ settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE); + if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { void *data; int len, i; @@ -3759,7 +3845,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_PREV_BSSID]) prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - err = nl80211_crypto_settings(info, &crypto, 1); + err = nl80211_crypto_settings(rdev, info, &crypto, 1); if (!err) err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid, ssid, ssid_len, ie, ie_len, use_mfp, @@ -4236,7 +4322,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.privacy = info->attrs[NL80211_ATTR_PRIVACY]; - err = nl80211_crypto_settings(info, &connect.crypto, + err = nl80211_crypto_settings(rdev, info, &connect.crypto, NL80211_MAX_NR_CIPHER_SUITES); if (err) return err; @@ -4717,17 +4803,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, return err; } -static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) +static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; struct net_device *dev; + u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION; int err; if (!info->attrs[NL80211_ATTR_FRAME_MATCH]) return -EINVAL; - if (nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]) < 1) - return -EINVAL; + if (info->attrs[NL80211_ATTR_FRAME_TYPE]) + frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]); rtnl_lock(); @@ -4742,12 +4829,13 @@ static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) } /* not much point in registering if we can't reply */ - if (!rdev->ops->action) { + if (!rdev->ops->mgmt_tx) { err = -EOPNOTSUPP; goto out; } - err = cfg80211_mlme_register_action(dev->ieee80211_ptr, info->snd_pid, + err = cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid, + frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH])); out: @@ -4758,7 +4846,7 @@ static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) return err; } -static int nl80211_action(struct sk_buff *skb, struct genl_info *info) +static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; struct net_device *dev; @@ -4781,7 +4869,7 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; - if (!rdev->ops->action) { + if (!rdev->ops->mgmt_tx) { err = -EOPNOTSUPP; goto out; } @@ -4824,17 +4912,17 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) } hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, - NL80211_CMD_ACTION); + NL80211_CMD_FRAME); if (IS_ERR(hdr)) { err = PTR_ERR(hdr); goto free_msg; } - err = cfg80211_mlme_action(rdev, dev, chan, channel_type, - channel_type_valid, - nla_data(info->attrs[NL80211_ATTR_FRAME]), - nla_len(info->attrs[NL80211_ATTR_FRAME]), - &cookie); + err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, channel_type, + channel_type_valid, + nla_data(info->attrs[NL80211_ATTR_FRAME]), + nla_len(info->attrs[NL80211_ATTR_FRAME]), + &cookie); if (err) goto free_msg; @@ -5333,14 +5421,14 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = NL80211_CMD_REGISTER_ACTION, - .doit = nl80211_register_action, + .cmd = NL80211_CMD_REGISTER_FRAME, + .doit = nl80211_register_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, { - .cmd = NL80211_CMD_ACTION, - 
.doit = nl80211_action, + .cmd = NL80211_CMD_FRAME, + .doit = nl80211_tx_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, @@ -6040,9 +6128,9 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, nl80211_mlme_mcgrp.id, gfp); } -int nl80211_send_action(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u32 nlpid, - int freq, const u8 *buf, size_t len, gfp_t gfp) +int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 nlpid, + int freq, const u8 *buf, size_t len, gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -6052,7 +6140,7 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev, if (!msg) return -ENOMEM; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION); + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME); if (!hdr) { nlmsg_free(msg); return -ENOMEM; @@ -6080,10 +6168,10 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u64 cookie, - const u8 *buf, size_t len, bool ack, - gfp_t gfp) +void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -6092,7 +6180,7 @@ void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); if (!hdr) { nlmsg_free(msg); return; @@ -6179,7 +6267,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) list_for_each_entry_rcu(wdev, &rdev->netdev_list, list) - cfg80211_mlme_unregister_actions(wdev, notify->pid); + cfg80211_mlme_unregister_socket(wdev, notify->pid); rcu_read_unlock(); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 2ad7fbc..30d2f93 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -74,13 +74,13 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp); -int nl80211_send_action(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u32 nlpid, int freq, - const u8 *buf, size_t len, gfp_t gfp); -void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u64 cookie, - const u8 *buf, size_t len, bool ack, - gfp_t gfp); +int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 nlpid, int freq, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); void nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index f180db0..b0d9a08 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -36,6 +36,7 @@ #include <linux/slab.h> #include <linux/list.h> #include <linux/random.h> +#include <linux/ctype.h> #include <linux/nl80211.h> #include <linux/platform_device.h> #include <net/cfg80211.h> @@ -181,14 +182,6 @@ static bool is_alpha2_set(const char *alpha2) return false; } -static bool is_alpha_upper(char letter) -{ - /* ASCII A - Z */ - if (letter >= 65 && letter <= 90) - return true; - 
return false; -} - static bool is_unknown_alpha2(const char *alpha2) { if (!alpha2) @@ -220,7 +213,7 @@ static bool is_an_alpha2(const char *alpha2) { if (!alpha2) return false; - if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1])) + if (isalpha(alpha2[0]) && isalpha(alpha2[1])) return true; return false; } @@ -1399,6 +1392,11 @@ static DECLARE_WORK(reg_work, reg_todo); static void queue_regulatory_request(struct regulatory_request *request) { + if (isalpha(request->alpha2[0])) + request->alpha2[0] = toupper(request->alpha2[0]); + if (isalpha(request->alpha2[1])) + request->alpha2[1] = toupper(request->alpha2[1]); + spin_lock(&reg_requests_lock); list_add_tail(&request->list, &reg_requests_list); spin_unlock(&reg_requests_lock); diff --git a/net/wireless/sysfs.c index 9f2cef3..74a9e3c 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -110,6 +110,13 @@ static int wiphy_resume(struct device *dev) return ret; } +static const void *wiphy_namespace(struct device *d) +{ + struct wiphy *wiphy = container_of(d, struct wiphy, dev); + + return wiphy_net(wiphy); +} + struct class ieee80211_class = { .name = "ieee80211", .owner = THIS_MODULE, @@ -120,6 +127,8 @@ struct class ieee80211_class = { #endif .suspend = wiphy_suspend, .resume = wiphy_resume, + .ns_type = &net_ns_type_operations, + .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) diff --git a/net/wireless/util.c index 0c8a1e8..bca32eb 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -183,7 +183,14 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, return -EINVAL; break; default: - return -EINVAL; + /* + * We don't know anything about this algorithm, + * allow using it -- but the driver must check + * all parameters! We still check below whether + * or not the driver supports this algorithm, + * of course. + */ + break; } if (params->seq) { @@ -221,7 +228,7 @@ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); -unsigned int ieee80211_hdrlen(__le16 fc) +unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; @@ -823,7 +830,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: /* not happening */ break; } diff --git a/net/wireless/wext-core.c index 8f5116f..dc675a3 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -611,7 +611,7 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) #endif #ifdef CONFIG_CFG80211_WEXT - if (dev->ieee80211_ptr && dev->ieee80211_ptr && + if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy && dev->ieee80211_ptr->wiphy->wext && dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) diff --git a/net/wireless/wext-sme.c index 9818198..6fffe62 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -197,6 +197,8 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev, wdev->wext.connect.ssid_len = len; wdev->wext.connect.crypto.control_port = false; + wdev->wext.connect.crypto.control_port_ethertype = + cpu_to_be16(ETH_P_PAE); err = cfg80211_mgd_wext_connect(rdev, wdev); out:
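
A closing note on the regulatory change above: dropping the open-coded is_alpha_upper() in favour of ctype.h works because queue_regulatory_request() now canonicalizes the country code once, at enqueue time, so every later comparison can use plain isalpha(). The same normalization pulled out as a standalone sketch; the helper name is illustrative:

#include <linux/ctype.h>

/* Upper-case a two-letter ISO 3166 alpha2 code in place, as the
 * request-queueing path now does before any request is compared. */
static void normalize_alpha2(char alpha2[2])
{
	int i;

	for (i = 0; i < 2; i++)
		if (isalpha(alpha2[i]))
			alpha2[i] = toupper(alpha2[i]);
}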