From 4e6cbfd09c66893e5134c9896e9af353c2322b66 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Thu, 29 Jul 2010 16:14:13 -0400 Subject: mac80211: support use of NAPI for bottom-half processing This patch implement basic infrastructure to support use of NAPI by mac80211-based hardware drivers. Because mac80211 devices can support multiple netdevs, a dummy netdev is used for interfacing with the NAPI code in the core of the network stack. That structure is hidden from the hardware drivers, but the actual napi_struct is exposed in the ieee80211_hw structure so that the poll routines in drivers can retrieve that structure. Hardware drivers can also specify their own weight value for NAPI polling. Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 5 +++++ net/mac80211/iface.c | 4 ++++ net/mac80211/main.c | 30 ++++++++++++++++++++++++++++++ 3 files changed, 39 insertions(+) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 65e0ed6..79d5645 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -870,6 +870,11 @@ struct ieee80211_local { struct dentry *keys; } debugfs; #endif + + /* dummy netdev for use w/ NAPI */ + struct net_device napi_dev; + + struct napi_struct napi; }; static inline struct ieee80211_sub_if_data * diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index ebbe264..c1008a9 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -187,6 +187,8 @@ static int ieee80211_open(struct net_device *dev) res = drv_start(local); if (res) goto err_del_bss; + if (local->ops->napi_poll) + napi_enable(&local->napi); /* we're brought up, everything changes */ hw_reconf_flags = ~0; ieee80211_led_radio(local, true); @@ -519,6 +521,8 @@ static int ieee80211_stop(struct net_device *dev) ieee80211_recalc_ps(local, -1); if (local->open_count == 0) { + if (local->ops->napi_poll) + napi_disable(&local->napi); ieee80211_clear_tx_pending(local); ieee80211_stop_device(local); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 798a91b..1ed956c 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -390,6 +390,30 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, } #endif +static int ieee80211_napi_poll(struct napi_struct *napi, int budget) +{ + struct ieee80211_local *local = + container_of(napi, struct ieee80211_local, napi); + + return local->ops->napi_poll(&local->hw, budget); +} + +void ieee80211_napi_schedule(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + napi_schedule(&local->napi); +} +EXPORT_SYMBOL(ieee80211_napi_schedule); + +void ieee80211_napi_complete(struct ieee80211_hw *hw) +{ + struct ieee80211_local *local = hw_to_local(hw); + + napi_complete(&local->napi); +} +EXPORT_SYMBOL(ieee80211_napi_complete); + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -494,6 +518,9 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, skb_queue_head_init(&local->skb_queue); skb_queue_head_init(&local->skb_queue_unreliable); + /* init dummy netdev for use w/ NAPI */ + init_dummy_netdev(&local->napi_dev); + return local_to_hw(local); } EXPORT_SYMBOL(ieee80211_alloc_hw); @@ -683,6 +710,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) goto fail_ifa; #endif + netif_napi_add(&local->napi_dev, &local->napi, ieee80211_napi_poll, + local->hw.napi_weight); + return 0; #ifdef CONFIG_INET -- cgit v1.1 From c61029c77fb68d7a182c0ae010f0f9dcae4e196c Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Thu, 5 Aug 2010 14:26:24 -0400 Subject: wireless: upcase alpha2 values in queue_regulatory_request This provides a little more flexibility for human users, and it allows us to use isalpha rather than the custom is_alpha_upper. Signed-off-by: John W. Linville --- net/wireless/reg.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/wireless/reg.c b/net/wireless/reg.c index f180db0..b0d9a08 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include #include @@ -181,14 +182,6 @@ static bool is_alpha2_set(const char *alpha2) return false; } -static bool is_alpha_upper(char letter) -{ - /* ASCII A - Z */ - if (letter >= 65 && letter <= 90) - return true; - return false; -} - static bool is_unknown_alpha2(const char *alpha2) { if (!alpha2) @@ -220,7 +213,7 @@ static bool is_an_alpha2(const char *alpha2) { if (!alpha2) return false; - if (is_alpha_upper(alpha2[0]) && is_alpha_upper(alpha2[1])) + if (isalpha(alpha2[0]) && isalpha(alpha2[1])) return true; return false; } @@ -1399,6 +1392,11 @@ static DECLARE_WORK(reg_work, reg_todo); static void queue_regulatory_request(struct regulatory_request *request) { + if (isalpha(request->alpha2[0])) + request->alpha2[0] = toupper(request->alpha2[0]); + if (isalpha(request->alpha2[1])) + request->alpha2[1] = toupper(request->alpha2[1]); + spin_lock(®_requests_lock); list_add_tail(&request->list, ®_requests_list); spin_unlock(®_requests_lock); -- cgit v1.1 From ffd2778bb984afe3cc264e22a125c06587020aa3 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Thu, 29 Jul 2010 17:36:43 +0200 Subject: mac80211: fix driver offchannel notification when the channel does not change When running in client mode and associating to an AP, the channel change is usually performed with the offchannel flag still set. However after the assoc is complete, the following channel change event is suppressed because the run time channel is already set to the operating channel. Fix this by sending channel change notifications to the driver even if only the offchannel flag changes. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville --- net/mac80211/main.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 1ed956c..18b8df9 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -99,11 +99,13 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) int ret = 0; int power; enum nl80211_channel_type channel_type; + u32 offchannel_flag; might_sleep(); scan_chan = local->scan_channel; + offchannel_flag = local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; if (scan_chan) { chan = scan_chan; channel_type = NL80211_CHAN_NO_HT; @@ -117,8 +119,9 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) channel_type = local->_oper_channel_type; local->hw.conf.flags &= ~IEEE80211_CONF_OFFCHANNEL; } + offchannel_flag ^= local->hw.conf.flags & IEEE80211_CONF_OFFCHANNEL; - if (chan != local->hw.conf.channel || + if (offchannel_flag || chan != local->hw.conf.channel || channel_type != local->hw.conf.channel_type) { local->hw.conf.channel = chan; local->hw.conf.channel_type = channel_type; -- cgit v1.1 From fc88518916793af8ad6a02e05ff254d95c36d875 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Jul 2010 13:23:12 +0200 Subject: mac80211: don't check rates on PLCP error frames Frames that failed PLCP error checks are most likely microwave transmissions (well, maybe not ...) and don't have a proper rate detected, so ignore it. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index fa0f37e..225e8ee 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2615,28 +2615,37 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) if (WARN_ON(!local->started)) goto drop; - if (status->flag & RX_FLAG_HT) { + if (likely(!(status->flag & RX_FLAG_FAILED_PLCP_CRC))) { /* - * rate_idx is MCS index, which can be [0-76] as documented on: - * - * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n - * - * Anything else would be some sort of driver or hardware error. - * The driver should catch hardware errors. + * Validate the rate, unless a PLCP error means that + * we probably can't have a valid rate here anyway. */ - if (WARN((status->rate_idx < 0 || - status->rate_idx > 76), - "Rate marked as an HT rate but passed " - "status->rate_idx is not " - "an MCS index [0-76]: %d (0x%02x)\n", - status->rate_idx, - status->rate_idx)) - goto drop; - } else { - if (WARN_ON(status->rate_idx < 0 || - status->rate_idx >= sband->n_bitrates)) - goto drop; - rate = &sband->bitrates[status->rate_idx]; + + if (status->flag & RX_FLAG_HT) { + /* + * rate_idx is MCS index, which can be [0-76] + * as documented on: + * + * http://wireless.kernel.org/en/developers/Documentation/ieee80211/802.11n + * + * Anything else would be some sort of driver or + * hardware error. The driver should catch hardware + * errors. + */ + if (WARN((status->rate_idx < 0 || + status->rate_idx > 76), + "Rate marked as an HT rate but passed " + "status->rate_idx is not " + "an MCS index [0-76]: %d (0x%02x)\n", + status->rate_idx, + status->rate_idx)) + goto drop; + } else { + if (WARN_ON(status->rate_idx < 0 || + status->rate_idx >= sband->n_bitrates)) + goto drop; + rate = &sband->bitrates[status->rate_idx]; + } } /* -- cgit v1.1 From a1699b75a1db31a1da2f0fc610ee696d02a19280 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 30 Jul 2010 16:46:07 +0200 Subject: mac80211: unify scan and work mutexes Having both scan and work mutexes is not just a bit too fine grained, it also creates issues when there's code that needs both since they then need to be acquired in the right order, which can be hard to do. Therefore, use just a single mutex for both. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 4 ++-- net/mac80211/main.c | 4 ++-- net/mac80211/mlme.c | 8 ++++---- net/mac80211/scan.c | 30 +++++++++++++++--------------- net/mac80211/work.c | 35 +++++++++++++++-------------------- 5 files changed, 38 insertions(+), 43 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 79d5645..fb4363e 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -634,7 +634,6 @@ struct ieee80211_local { /* * work stuff, potentially off-channel (in the future) */ - struct mutex work_mtx; struct list_head work_list; struct timer_list work_timer; struct work_struct work_work; @@ -746,9 +745,10 @@ struct ieee80211_local { */ struct mutex key_mtx; + /* mutex for scan and work locking */ + struct mutex mtx; /* Scanning and BSS list */ - struct mutex scan_mtx; unsigned long scanning; struct cfg80211_ssid scan_ssid; struct cfg80211_scan_request *int_scan_req; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 18b8df9..06b9608 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -482,7 +482,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, __hw_addr_init(&local->mc_list); mutex_init(&local->iflist_mtx); - mutex_init(&local->scan_mtx); + mutex_init(&local->mtx); mutex_init(&local->key_mtx); spin_lock_init(&local->filter_lock); @@ -791,7 +791,7 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) struct ieee80211_local *local = hw_to_local(hw); mutex_destroy(&local->iflist_mtx); - mutex_destroy(&local->scan_mtx); + mutex_destroy(&local->mtx); wiphy_free(local->hw.wiphy); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index b6c163a..17e9257 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1751,7 +1751,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_work *wk; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; @@ -1783,7 +1783,7 @@ void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, free_work(wk); break; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); cfg80211_send_deauth(sdata->dev, (u8 *)mgmt, skb->len); } @@ -2275,7 +2275,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, mutex_unlock(&ifmgd->mtx); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; @@ -2294,7 +2294,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, free_work(wk); break; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); /* * If somebody requests authentication and we haven't diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 872d7b6..f31f5497 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -255,7 +255,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) trace_api_scan_completed(local, aborted); - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); /* * It's ok to abort a not-yet-running scan (that @@ -267,7 +267,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) aborted = true; if (WARN_ON(!local->scan_req)) { - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return; } @@ -275,7 +275,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return; } @@ -291,7 +291,7 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) local->scan_channel = NULL; /* we only have to protect scan_req and hw/sw scan */ - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); if (was_hw_scan) @@ -639,15 +639,15 @@ void ieee80211_scan_work(struct work_struct *work) struct ieee80211_sub_if_data *sdata = local->scan_sdata; unsigned long next_delay = 0; - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); if (!sdata || !local->scan_req) { - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return; } if (local->hw_scan_req) { int rc = drv_hw_scan(local, sdata, local->hw_scan_req); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); if (rc) ieee80211_scan_completed(&local->hw, true); return; @@ -661,14 +661,14 @@ void ieee80211_scan_work(struct work_struct *work) local->scan_sdata = NULL; rc = __ieee80211_start_scan(sdata, req); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); if (rc) ieee80211_scan_completed(&local->hw, true); return; } - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); /* * Avoid re-scheduling when the sdata is going away. @@ -711,9 +711,9 @@ int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, { int res; - mutex_lock(&sdata->local->scan_mtx); + mutex_lock(&sdata->local->mtx); res = __ieee80211_start_scan(sdata, req); - mutex_unlock(&sdata->local->scan_mtx); + mutex_unlock(&sdata->local->mtx); return res; } @@ -726,7 +726,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, int ret = -EBUSY; enum ieee80211_band band; - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); /* busy scanning */ if (local->scan_req) @@ -761,7 +761,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, ret = __ieee80211_start_scan(sdata, sdata->local->int_scan_req); unlock: - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); return ret; } @@ -775,10 +775,10 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) * Only call this function when a scan can't be * queued -- mostly at suspend under RTNL. */ - mutex_lock(&local->scan_mtx); + mutex_lock(&local->mtx); abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) || (!local->scanning && local->scan_req); - mutex_unlock(&local->scan_mtx); + mutex_unlock(&local->mtx); if (abortscan) ieee80211_scan_completed(&local->hw, true); diff --git a/net/mac80211/work.c b/net/mac80211/work.c index 81d4ad6..b98af64 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -43,7 +43,7 @@ enum work_action { /* utils */ static inline void ASSERT_WORK_MTX(struct ieee80211_local *local) { - WARN_ON(!mutex_is_locked(&local->work_mtx)); + lockdep_assert_held(&local->mtx); } /* @@ -757,7 +757,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, mgmt = (struct ieee80211_mgmt *) skb->data; fc = le16_to_cpu(mgmt->frame_control); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { const u8 *bssid = NULL; @@ -833,7 +833,7 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, WARN(1, "unexpected: %d", rma); } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); if (rma != WORK_ACT_DONE) goto out; @@ -845,9 +845,9 @@ static void ieee80211_work_rx_queued_mgmt(struct ieee80211_local *local, case WORK_DONE_REQUEUE: synchronize_rcu(); wk->started = false; /* restart */ - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_add_tail(&wk->list, &local->work_list); - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); } out: @@ -890,7 +890,7 @@ static void ieee80211_work_work(struct work_struct *work) ieee80211_recalc_idle(local); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry_safe(wk, tmp, &local->work_list, list) { bool started = wk->started; @@ -995,17 +995,13 @@ static void ieee80211_work_work(struct work_struct *work) run_again(local, jiffies + HZ/2); } - mutex_lock(&local->scan_mtx); - if (list_empty(&local->work_list) && local->scan_req && !local->scanning) ieee80211_queue_delayed_work(&local->hw, &local->scan_work, round_jiffies_relative(0)); - mutex_unlock(&local->scan_mtx); - - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); ieee80211_recalc_idle(local); @@ -1035,16 +1031,15 @@ void ieee80211_add_work(struct ieee80211_work *wk) wk->started = false; local = wk->sdata->local; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_add_tail(&wk->list, &local->work_list); - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); ieee80211_queue_work(&local->hw, &local->work_work); } void ieee80211_work_init(struct ieee80211_local *local) { - mutex_init(&local->work_mtx); INIT_LIST_HEAD(&local->work_list); setup_timer(&local->work_timer, ieee80211_work_timer, (unsigned long)local); @@ -1057,7 +1052,7 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata) struct ieee80211_local *local = sdata->local; struct ieee80211_work *wk; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; @@ -1065,19 +1060,19 @@ void ieee80211_work_purge(struct ieee80211_sub_if_data *sdata) wk->started = true; wk->timeout = jiffies; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); /* run cleanups etc. */ ieee80211_work_work(&local->work_work); - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry(wk, &local->work_list, list) { if (wk->sdata != sdata) continue; WARN_ON(1); break; } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); } ieee80211_rx_result ieee80211_work_rx_mgmt(struct ieee80211_sub_if_data *sdata, @@ -1163,7 +1158,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata, struct ieee80211_work *wk, *tmp; bool found = false; - mutex_lock(&local->work_mtx); + mutex_lock(&local->mtx); list_for_each_entry_safe(wk, tmp, &local->work_list, list) { if ((unsigned long) wk == cookie) { wk->timeout = jiffies; @@ -1171,7 +1166,7 @@ int ieee80211_wk_cancel_remain_on_channel(struct ieee80211_sub_if_data *sdata, break; } } - mutex_unlock(&local->work_mtx); + mutex_unlock(&local->mtx); if (!found) return -ENOENT; -- cgit v1.1 From 1ac62ba7c985109868a18d959986425148481f47 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 1 Aug 2010 17:37:03 +0100 Subject: mac80211: Don't squash error codes in key setup functions ieee80211_add_key() currently returns -ENOMEM in case of any error, including a missing crypto algorithm. Change ieee80211_key_alloc() and ieee80211_aes_{key_setup_encrypt,cmac_key_setup}() to encode errors with ERR_PTR() rather than returning NULL, and change ieee80211_add_key() accordingly. Compile-tested only. Reported-by: Marcin Owsiany Signed-off-by: Ben Hutchings Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/aes_ccm.c | 6 ++---- net/mac80211/aes_cmac.c | 6 ++---- net/mac80211/cfg.c | 4 ++-- net/mac80211/key.c | 14 ++++++++------ 4 files changed, 14 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/aes_ccm.c b/net/mac80211/aes_ccm.c index a87cb3b..d2b03e0 100644 --- a/net/mac80211/aes_ccm.c +++ b/net/mac80211/aes_ccm.c @@ -138,10 +138,8 @@ struct crypto_cipher *ieee80211_aes_key_setup_encrypt(const u8 key[]) struct crypto_cipher *tfm; tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return NULL; - - crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN); + if (!IS_ERR(tfm)) + crypto_cipher_setkey(tfm, key, ALG_CCMP_KEY_LEN); return tfm; } diff --git a/net/mac80211/aes_cmac.c b/net/mac80211/aes_cmac.c index 3d097b3..b4d66cc 100644 --- a/net/mac80211/aes_cmac.c +++ b/net/mac80211/aes_cmac.c @@ -119,10 +119,8 @@ struct crypto_cipher * ieee80211_aes_cmac_key_setup(const u8 key[]) struct crypto_cipher *tfm; tfm = crypto_alloc_cipher("aes", 0, CRYPTO_ALG_ASYNC); - if (IS_ERR(tfm)) - return NULL; - - crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); + if (!IS_ERR(tfm)) + crypto_cipher_setkey(tfm, key, AES_CMAC_KEY_LEN); return tfm; } diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 29ac8e1..19c6146 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -150,8 +150,8 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key, params->seq_len, params->seq); - if (!key) - return -ENOMEM; + if (IS_ERR(key)) + return PTR_ERR(key); mutex_lock(&sdata->local->sta_mtx); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 1b9d87e..d6dbc8e 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -234,13 +234,13 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, size_t seq_len, const u8 *seq) { struct ieee80211_key *key; - int i, j; + int i, j, err; BUG_ON(idx < 0 || idx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS); key = kzalloc(sizeof(struct ieee80211_key) + key_len, GFP_KERNEL); if (!key) - return NULL; + return ERR_PTR(-ENOMEM); /* * Default to software encryption; we'll later upload the @@ -296,9 +296,10 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, * it does not need to be initialized for every packet. */ key->u.ccmp.tfm = ieee80211_aes_key_setup_encrypt(key_data); - if (!key->u.ccmp.tfm) { + if (IS_ERR(key->u.ccmp.tfm)) { + err = PTR_ERR(key->u.ccmp.tfm); kfree(key); - return NULL; + key = ERR_PTR(err); } } @@ -309,9 +310,10 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, */ key->u.aes_cmac.tfm = ieee80211_aes_cmac_key_setup(key_data); - if (!key->u.aes_cmac.tfm) { + if (IS_ERR(key->u.aes_cmac.tfm)) { + err = PTR_ERR(key->u.aes_cmac.tfm); kfree(key); - return NULL; + key = ERR_PTR(err); } } -- cgit v1.1 From aa0c86364f925c6f12195072562b18c5609ff082 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 5 Aug 2010 01:36:04 +0200 Subject: mac80211: put rx handlers into separate functions This patch takes the reorder logic from the RX path and moves it into separate routines to make the expired frame release accessible. Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/rx.c | 214 ++++++++++++++++++++++++++++++++---------------------- 1 file changed, 128 insertions(+), 86 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 225e8ee..8301b4a 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -583,6 +583,57 @@ static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw, */ #define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10) +static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, + struct tid_ampdu_rx *tid_agg_rx, + struct sk_buff_head *frames) +{ + int index; + + /* release the buffer until next missing frame */ + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + if (!tid_agg_rx->reorder_buf[index] && + tid_agg_rx->stored_mpdu_num > 1) { + /* + * No buffers ready to be released, but check whether any + * frames in the reorder buffer have timed out. + */ + int j; + int skipped = 1; + for (j = (index + 1) % tid_agg_rx->buf_size; j != index; + j = (j + 1) % tid_agg_rx->buf_size) { + if (!tid_agg_rx->reorder_buf[j]) { + skipped++; + continue; + } + if (!time_after(jiffies, tid_agg_rx->reorder_time[j] + + HT_RX_REORDER_BUF_TIMEOUT)) + break; + +#ifdef CONFIG_MAC80211_HT_DEBUG + if (net_ratelimit()) + printk(KERN_DEBUG "%s: release an RX reorder " + "frame due to timeout on earlier " + "frames\n", + wiphy_name(hw->wiphy)); +#endif + ieee80211_release_reorder_frame(hw, tid_agg_rx, + j, frames); + + /* + * Increment the head seq# also for the skipped slots. + */ + tid_agg_rx->head_seq_num = + (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; + skipped = 0; + } + } else while (tid_agg_rx->reorder_buf[index]) { + ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames); + index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % + tid_agg_rx->buf_size; + } +} + /* * As this function belongs to the RX path it must be under * rcu_read_lock protection. It returns false if the frame @@ -643,49 +694,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, tid_agg_rx->reorder_buf[index] = skb; tid_agg_rx->reorder_time[index] = jiffies; tid_agg_rx->stored_mpdu_num++; - /* release the buffer until next missing frame */ - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % - tid_agg_rx->buf_size; - if (!tid_agg_rx->reorder_buf[index] && - tid_agg_rx->stored_mpdu_num > 1) { - /* - * No buffers ready to be released, but check whether any - * frames in the reorder buffer have timed out. - */ - int j; - int skipped = 1; - for (j = (index + 1) % tid_agg_rx->buf_size; j != index; - j = (j + 1) % tid_agg_rx->buf_size) { - if (!tid_agg_rx->reorder_buf[j]) { - skipped++; - continue; - } - if (!time_after(jiffies, tid_agg_rx->reorder_time[j] + - HT_RX_REORDER_BUF_TIMEOUT)) - break; - -#ifdef CONFIG_MAC80211_HT_DEBUG - if (net_ratelimit()) - printk(KERN_DEBUG "%s: release an RX reorder " - "frame due to timeout on earlier " - "frames\n", - wiphy_name(hw->wiphy)); -#endif - ieee80211_release_reorder_frame(hw, tid_agg_rx, - j, frames); - - /* - * Increment the head seq# also for the skipped slots. - */ - tid_agg_rx->head_seq_num = - (tid_agg_rx->head_seq_num + skipped) & SEQ_MASK; - skipped = 0; - } - } else while (tid_agg_rx->reorder_buf[index]) { - ieee80211_release_reorder_frame(hw, tid_agg_rx, index, frames); - index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % - tid_agg_rx->buf_size; - } + ieee80211_sta_reorder_release(hw, tid_agg_rx, frames); return true; } @@ -2267,19 +2276,46 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, dev_kfree_skb(skb); } - -static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, - struct ieee80211_rx_data *rx, - struct sk_buff *skb, - struct ieee80211_rate *rate) +static void ieee80211_rx_handlers_result(struct ieee80211_rx_data *rx, + ieee80211_rx_result res) { - struct sk_buff_head reorder_release; - ieee80211_rx_result res = RX_DROP_MONITOR; + switch (res) { + case RX_DROP_MONITOR: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); + if (rx->sta) + rx->sta->rx_dropped++; + /* fall through */ + case RX_CONTINUE: { + struct ieee80211_rate *rate = NULL; + struct ieee80211_supported_band *sband; + struct ieee80211_rx_status *status; + + status = IEEE80211_SKB_RXCB((rx->skb)); + + sband = rx->local->hw.wiphy->bands[status->band]; + if (!(status->flag & RX_FLAG_HT)) + rate = &sband->bitrates[status->rate_idx]; - __skb_queue_head_init(&reorder_release); + ieee80211_rx_cooked_monitor(rx, rate); + break; + } + case RX_DROP_UNUSABLE: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_drop); + if (rx->sta) + rx->sta->rx_dropped++; + dev_kfree_skb(rx->skb); + break; + case RX_QUEUED: + I802_DEBUG_INC(rx->sdata->local->rx_handlers_queued); + break; + } +} - rx->skb = skb; - rx->sdata = sdata; +static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, + struct sk_buff_head *frames) +{ + ieee80211_rx_result res = RX_DROP_MONITOR; + struct sk_buff *skb; #define CALL_RXH(rxh) \ do { \ @@ -2288,17 +2324,7 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, goto rxh_next; \ } while (0); - /* - * NB: the rxh_next label works even if we jump - * to it from here because then the list will - * be empty, which is a trivial check - */ - CALL_RXH(ieee80211_rx_h_passive_scan) - CALL_RXH(ieee80211_rx_h_check) - - ieee80211_rx_reorder_ampdu(rx, &reorder_release); - - while ((skb = __skb_dequeue(&reorder_release))) { + while ((skb = __skb_dequeue(frames))) { /* * all the other fields are valid across frames * that belong to an aMPDU since they are on the @@ -2316,42 +2342,58 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, CALL_RXH(ieee80211_rx_h_remove_qos_control) CALL_RXH(ieee80211_rx_h_amsdu) #ifdef CONFIG_MAC80211_MESH - if (ieee80211_vif_is_mesh(&sdata->vif)) + if (ieee80211_vif_is_mesh(&rx->sdata->vif)) CALL_RXH(ieee80211_rx_h_mesh_fwding); #endif CALL_RXH(ieee80211_rx_h_data) /* special treatment -- needs the queue */ - res = ieee80211_rx_h_ctrl(rx, &reorder_release); + res = ieee80211_rx_h_ctrl(rx, frames); if (res != RX_CONTINUE) goto rxh_next; CALL_RXH(ieee80211_rx_h_action) CALL_RXH(ieee80211_rx_h_mgmt) + rxh_next: + ieee80211_rx_handlers_result(rx, res); + #undef CALL_RXH + } +} + +static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, + struct ieee80211_rx_data *rx, + struct sk_buff *skb, + struct ieee80211_rate *rate) +{ + struct sk_buff_head reorder_release; + ieee80211_rx_result res = RX_DROP_MONITOR; + + __skb_queue_head_init(&reorder_release); + + rx->skb = skb; + rx->sdata = sdata; + +#define CALL_RXH(rxh) \ + do { \ + res = rxh(rx); \ + if (res != RX_CONTINUE) \ + goto rxh_next; \ + } while (0); + + CALL_RXH(ieee80211_rx_h_passive_scan) + CALL_RXH(ieee80211_rx_h_check) + + ieee80211_rx_reorder_ampdu(rx, &reorder_release); + + ieee80211_rx_handlers(rx, &reorder_release); + return; rxh_next: - switch (res) { - case RX_DROP_MONITOR: - I802_DEBUG_INC(sdata->local->rx_handlers_drop); - if (rx->sta) - rx->sta->rx_dropped++; - /* fall through */ - case RX_CONTINUE: - ieee80211_rx_cooked_monitor(rx, rate); - break; - case RX_DROP_UNUSABLE: - I802_DEBUG_INC(sdata->local->rx_handlers_drop); - if (rx->sta) - rx->sta->rx_dropped++; - dev_kfree_skb(rx->skb); - break; - case RX_QUEUED: - I802_DEBUG_INC(sdata->local->rx_handlers_queued); - break; - } - } + ieee80211_rx_handlers_result(rx, res); + +#undef CALL_RXH } /* main receive path */ -- cgit v1.1 From 071d9ac253ff51154beb7e33967168e30bc96053 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 5 Aug 2010 01:36:36 +0200 Subject: mac80211: remove unused rate function parameter This patch removes a few stale parameters and variables which survived the last, large rx-path reorganization: "mac80211: correctly place aMPDU RX reorder code" Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/rx.c | 22 ++++++---------------- 1 file changed, 6 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8301b4a..d5b91b6 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -538,20 +538,12 @@ static void ieee80211_release_reorder_frame(struct ieee80211_hw *hw, int index, struct sk_buff_head *frames) { - struct ieee80211_supported_band *sband; - struct ieee80211_rate *rate = NULL; struct sk_buff *skb = tid_agg_rx->reorder_buf[index]; - struct ieee80211_rx_status *status; if (!skb) goto no_frame; - status = IEEE80211_SKB_RXCB(skb); - - /* release the reordered frames to stack */ - sband = hw->wiphy->bands[status->band]; - if (!(status->flag & RX_FLAG_HT)) - rate = &sband->bitrates[status->rate_idx]; + /* release the frame from the reorder ring buffer */ tid_agg_rx->stored_mpdu_num--; tid_agg_rx->reorder_buf[index] = NULL; __skb_queue_tail(frames, skb); @@ -2364,8 +2356,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, struct ieee80211_rx_data *rx, - struct sk_buff *skb, - struct ieee80211_rate *rate) + struct sk_buff *skb) { struct sk_buff_head reorder_release; ieee80211_rx_result res = RX_DROP_MONITOR; @@ -2489,8 +2480,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, * be called with rcu_read_lock protection. */ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, - struct sk_buff *skb, - struct ieee80211_rate *rate) + struct sk_buff *skb) { struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_local *local = hw_to_local(hw); @@ -2598,7 +2588,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, prev->name); goto next; } - ieee80211_invoke_rx_handlers(prev, &rx, skb_new, rate); + ieee80211_invoke_rx_handlers(prev, &rx, skb_new); next: prev = sdata; } @@ -2614,7 +2604,7 @@ next: } } if (prev) - ieee80211_invoke_rx_handlers(prev, &rx, skb, rate); + ieee80211_invoke_rx_handlers(prev, &rx, skb); else dev_kfree_skb(skb); } @@ -2709,7 +2699,7 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) return; } - __ieee80211_rx_handle_packet(hw, skb, rate); + __ieee80211_rx_handle_packet(hw, skb); rcu_read_unlock(); -- cgit v1.1 From 2bff8ebf32a7c5ec9e5f5eeffef94a8cb622f5f0 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Thu, 5 Aug 2010 01:36:41 +0200 Subject: mac80211: AMPDU rx reorder timeout timer This patch introduces a new timer, which will release queued-up MPDUs from the reorder buffer, whenever they've waited for more than HT_RX_REORDER_BUF_TIMEOUT (which is at around 100 ms). The advantage of having a dedicated timer, instead of relying on a constant stream of freshly arriving aMPDUs to release the old ones, is particularly observable when even a small fraction of MPDUs are forever lost at low network speeds. Previously under these circumstances frames would become stuck in the reorder buffer and the network stack of both HT peers throttled back, instead of revving up and gunning the pipes. Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/agg-rx.c | 22 +++++++++++++++ net/mac80211/ieee80211_i.h | 1 + net/mac80211/rx.c | 70 +++++++++++++++++++++++++++++++++++++++++----- net/mac80211/sta_info.h | 16 +++++++---- 4 files changed, 97 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 965b272..58eab9e 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -86,6 +86,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, tid, 0, reason); del_timer_sync(&tid_rx->session_timer); + del_timer_sync(&tid_rx->reorder_timer); call_rcu(&tid_rx->rcu_head, ieee80211_free_tid_rx); } @@ -120,6 +121,20 @@ static void sta_rx_agg_session_timer_expired(unsigned long data) ieee80211_queue_work(&sta->local->hw, &sta->ampdu_mlme.work); } +static void sta_rx_agg_reorder_timer_expired(unsigned long data) +{ + u8 *ptid = (u8 *)data; + u8 *timer_to_id = ptid - *ptid; + struct sta_info *sta = container_of(timer_to_id, struct sta_info, + timer_to_tid[0]); + + rcu_read_lock(); + spin_lock(&sta->lock); + ieee80211_release_reorder_timeout(sta, *ptid); + spin_unlock(&sta->lock); + rcu_read_unlock(); +} + static void ieee80211_send_addba_resp(struct ieee80211_sub_if_data *sdata, u8 *da, u16 tid, u8 dialog_token, u16 status, u16 policy, u16 buf_size, u16 timeout) @@ -251,11 +266,18 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, goto end; } + spin_lock_init(&tid_agg_rx->reorder_lock); + /* rx timer */ tid_agg_rx->session_timer.function = sta_rx_agg_session_timer_expired; tid_agg_rx->session_timer.data = (unsigned long)&sta->timer_to_tid[tid]; init_timer(&tid_agg_rx->session_timer); + /* rx reorder timer */ + tid_agg_rx->reorder_timer.function = sta_rx_agg_reorder_timer_expired; + tid_agg_rx->reorder_timer.data = (unsigned long)&sta->timer_to_tid[tid]; + init_timer(&tid_agg_rx->reorder_timer); + /* prepare reordering buffer */ tid_agg_rx->reorder_buf = kcalloc(buf_size, sizeof(struct sk_buff *), GFP_ATOMIC); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index fb4363e..b44e03a 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1136,6 +1136,7 @@ void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); void ieee80211_ba_session_work(struct work_struct *work); void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid); +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid); /* Spectrum management */ void ieee80211_process_measurement_req(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index d5b91b6..f24a0a1 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -572,6 +572,8 @@ static void ieee80211_release_reorder_frames(struct ieee80211_hw *hw, * frames that have not yet been received are assumed to be lost and the skb * can be released for processing. This may also release other skb's from the * reorder buffer if there are no additional gaps between the frames. + * + * Callers must hold tid_agg_rx->reorder_lock. */ #define HT_RX_REORDER_BUF_TIMEOUT (HZ / 10) @@ -579,7 +581,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, struct tid_ampdu_rx *tid_agg_rx, struct sk_buff_head *frames) { - int index; + int index, j; /* release the buffer until next missing frame */ index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % @@ -590,7 +592,6 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, * No buffers ready to be released, but check whether any * frames in the reorder buffer have timed out. */ - int j; int skipped = 1; for (j = (index + 1) % tid_agg_rx->buf_size; j != index; j = (j + 1) % tid_agg_rx->buf_size) { @@ -600,7 +601,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, } if (!time_after(jiffies, tid_agg_rx->reorder_time[j] + HT_RX_REORDER_BUF_TIMEOUT)) - break; + goto set_release_timer; #ifdef CONFIG_MAC80211_HT_DEBUG if (net_ratelimit()) @@ -624,6 +625,25 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; } + + if (tid_agg_rx->stored_mpdu_num) { + j = index = seq_sub(tid_agg_rx->head_seq_num, + tid_agg_rx->ssn) % tid_agg_rx->buf_size; + + for (; j != (index - 1) % tid_agg_rx->buf_size; + j = (j + 1) % tid_agg_rx->buf_size) { + if (tid_agg_rx->reorder_buf[j]) + break; + } + + set_release_timer: + + mod_timer(&tid_agg_rx->reorder_timer, + tid_agg_rx->reorder_time[j] + + HT_RX_REORDER_BUF_TIMEOUT); + } else { + del_timer(&tid_agg_rx->reorder_timer); + } } /* @@ -641,14 +661,16 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, u16 mpdu_seq_num = (sc & IEEE80211_SCTL_SEQ) >> 4; u16 head_seq_num, buf_size; int index; + bool ret = true; buf_size = tid_agg_rx->buf_size; head_seq_num = tid_agg_rx->head_seq_num; + spin_lock(&tid_agg_rx->reorder_lock); /* frame with out of date sequence number */ if (seq_less(mpdu_seq_num, head_seq_num)) { dev_kfree_skb(skb); - return true; + goto out; } /* @@ -669,7 +691,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, /* check if we already stored this frame */ if (tid_agg_rx->reorder_buf[index]) { dev_kfree_skb(skb); - return true; + goto out; } /* @@ -679,7 +701,8 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, if (mpdu_seq_num == tid_agg_rx->head_seq_num && tid_agg_rx->stored_mpdu_num == 0) { tid_agg_rx->head_seq_num = seq_inc(tid_agg_rx->head_seq_num); - return false; + ret = false; + goto out; } /* put the frame in the reordering buffer */ @@ -688,7 +711,9 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_hw *hw, tid_agg_rx->stored_mpdu_num++; ieee80211_sta_reorder_release(hw, tid_agg_rx, frames); - return true; + out: + spin_unlock(&tid_agg_rx->reorder_lock); + return ret; } /* @@ -2387,6 +2412,37 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, #undef CALL_RXH } +/* + * This function makes calls into the RX path. Therefore the + * caller must hold the sta_info->lock and everything has to + * be under rcu_read_lock protection as well. + */ +void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) +{ + struct sk_buff_head frames; + struct ieee80211_rx_data rx = { }; + + __skb_queue_head_init(&frames); + + /* construct rx struct */ + rx.sta = sta; + rx.sdata = sta->sdata; + rx.local = sta->local; + rx.queue = tid; + rx.flags |= IEEE80211_RX_RA_MATCH; + + if (unlikely(test_bit(SCAN_HW_SCANNING, &sta->local->scanning) || + test_bit(SCAN_OFF_CHANNEL, &sta->local->scanning))) + rx.flags |= IEEE80211_RX_IN_SCAN; + + spin_lock(&sta->ampdu_mlme.tid_rx[tid]->reorder_lock); + ieee80211_sta_reorder_release(&sta->local->hw, + sta->ampdu_mlme.tid_rx[tid], &frames); + spin_unlock(&sta->ampdu_mlme.tid_rx[tid]->reorder_lock); + + ieee80211_rx_handlers(&rx, &frames); +} + /* main receive path */ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 54262e7..810c5ce 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -103,6 +103,7 @@ struct tid_ampdu_tx { * @reorder_buf: buffer to reorder incoming aggregated MPDUs * @reorder_time: jiffies when skb was added * @session_timer: check if peer keeps Tx-ing on the TID (by timeout value) + * @reorder_timer: releases expired frames from the reorder buffer. * @head_seq_num: head sequence number in reordering buffer. * @stored_mpdu_num: number of MPDUs in reordering buffer * @ssn: Starting Sequence Number expected to be aggregated. @@ -110,20 +111,25 @@ struct tid_ampdu_tx { * @timeout: reset timer value (in TUs). * @dialog_token: dialog token for aggregation session * @rcu_head: RCU head used for freeing this struct + * @reorder_lock: serializes access to reorder buffer, see below. * * This structure is protected by RCU and the per-station * spinlock. Assignments to the array holding it must hold - * the spinlock, only the RX path can access it under RCU - * lock-free. The RX path, since it is single-threaded, - * can even modify the structure without locking since the - * only other modifications to it are done when the struct - * can not yet or no longer be found by the RX path. + * the spinlock. + * + * The @reorder_lock is used to protect the variables and + * arrays such as @reorder_buf, @reorder_time, @head_seq_num, + * @stored_mpdu_num and @reorder_time from being corrupted by + * concurrent access of the RX path and the expired frame + * release timer. */ struct tid_ampdu_rx { struct rcu_head rcu_head; + spinlock_t reorder_lock; struct sk_buff **reorder_buf; unsigned long *reorder_time; struct timer_list session_timer; + struct timer_list reorder_timer; u16 head_seq_num; u16 stored_mpdu_num; u16 ssn; -- cgit v1.1 From 3f3b6a8d90b6e762e2bb83e6a9e86d9534b56cdc Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2010 10:20:27 +0200 Subject: cfg80211: deauth doesn't always imply disconnect When an AP sends a deauth frame, or we send one to an AP, that only means we lost our connection if we were actually connected to that AP. Check this to avoid sending spurious "disconnected" events and breaking "iw ... link" reporting. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/mlme.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index d1a3fb9..ee0af32 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -149,7 +149,7 @@ void __cfg80211_send_deauth(struct net_device *dev, struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *)buf; const u8 *bssid = mgmt->bssid; int i; - bool found = false; + bool found = false, was_current = false; ASSERT_WDEV_LOCK(wdev); @@ -159,6 +159,7 @@ void __cfg80211_send_deauth(struct net_device *dev, cfg80211_put_bss(&wdev->current_bss->pub); wdev->current_bss = NULL; found = true; + was_current = true; } else for (i = 0; i < MAX_AUTH_BSSES; i++) { if (wdev->auth_bsses[i] && memcmp(wdev->auth_bsses[i]->pub.bssid, bssid, ETH_ALEN) == 0) { @@ -183,7 +184,7 @@ void __cfg80211_send_deauth(struct net_device *dev, nl80211_send_deauth(rdev, dev, buf, len, GFP_KERNEL); - if (wdev->sme_state == CFG80211_SME_CONNECTED) { + if (wdev->sme_state == CFG80211_SME_CONNECTED && was_current) { u16 reason_code; bool from_ap; -- cgit v1.1 From 1fdaa46e9f26ccbab5e0eb8c4d4f8e1fbf32c7df Mon Sep 17 00:00:00 2001 From: Andrea Gelmini Date: Thu, 5 Aug 2010 15:51:35 +0200 Subject: net: mac80211: Fix a typo. "userpace" -> "userspace" Signed-off-by: Andrea Gelmini Signed-off-by: John W. Linville --- net/mac80211/rc80211_pid_debugfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/rc80211_pid_debugfs.c b/net/mac80211/rc80211_pid_debugfs.c index 47438b4..135f36f 100644 --- a/net/mac80211/rc80211_pid_debugfs.c +++ b/net/mac80211/rc80211_pid_debugfs.c @@ -162,7 +162,7 @@ static ssize_t rate_control_pid_events_read(struct file *file, char __user *buf, file_info->next_entry = (file_info->next_entry + 1) % RC_PID_EVENT_RING_SIZE; - /* Print information about the event. Note that userpace needs to + /* Print information about the event. Note that userspace needs to * provide large enough buffers. */ length = length < RC_PID_PRINT_BUF_SIZE ? length : RC_PID_PRINT_BUF_SIZE; -- cgit v1.1 From 7da7cc1d42d8ce02cca16df8c021e6d657f1f8fd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2010 17:02:38 +0200 Subject: mac80211: per interface idle notification Sometimes we don't just need to know whether or not the device is idle, but also per interface. This adds that reporting capability to mac80211. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 8 +++++-- net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/iface.c | 53 +++++++++++++++++++++++++++++++++++++++------- net/mac80211/mlme.c | 17 +++++++++++++-- net/mac80211/scan.c | 2 ++ net/mac80211/work.c | 8 +++---- 6 files changed, 75 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index c691780..32af971 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -920,12 +920,14 @@ int ieee80211_ibss_join(struct ieee80211_sub_if_data *sdata, memcpy(sdata->u.ibss.ssid, params->ssid, IEEE80211_MAX_SSID_LEN); sdata->u.ibss.ssid_len = params->ssid_len; + mutex_unlock(&sdata->u.ibss.mtx); + + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); ieee80211_queue_work(&sdata->local->hw, &sdata->work); - mutex_unlock(&sdata->u.ibss.mtx); - return 0; } @@ -980,7 +982,9 @@ int ieee80211_ibss_leave(struct ieee80211_sub_if_data *sdata) mutex_unlock(&sdata->u.ibss.mtx); + mutex_lock(&local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&local->mtx); return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index b44e03a..98e783c 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -497,6 +497,9 @@ struct ieee80211_sub_if_data { */ bool ht_opmode_valid; + /* to detect idle changes */ + bool old_idle; + /* Fragment table for host-based reassembly */ struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; unsigned int fragment_next; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c1008a9..9459aee 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -309,7 +309,9 @@ static int ieee80211_open(struct net_device *dev) if (sdata->flags & IEEE80211_SDATA_PROMISC) atomic_inc(&local->iff_promiscs); + mutex_lock(&local->mtx); hw_reconf_flags |= __ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); local->open_count++; if (hw_reconf_flags) { @@ -516,7 +518,9 @@ static int ieee80211_stop(struct net_device *dev) sdata->bss = NULL; + mutex_lock(&local->mtx); hw_reconf_flags |= __ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); ieee80211_recalc_ps(local, -1); @@ -1199,28 +1203,61 @@ u32 __ieee80211_recalc_idle(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; int count = 0; + bool working = false, scanning = false; + struct ieee80211_work *wk; - if (!list_empty(&local->work_list)) - return ieee80211_idle_off(local, "working"); - - if (local->scanning) - return ieee80211_idle_off(local, "scanning"); +#ifdef CONFIG_PROVE_LOCKING + WARN_ON(debug_locks && !lockdep_rtnl_is_held() && + !lockdep_is_held(&local->iflist_mtx)); +#endif + lockdep_assert_held(&local->mtx); list_for_each_entry(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) + if (!ieee80211_sdata_running(sdata)) { + sdata->vif.bss_conf.idle = true; continue; + } + + sdata->old_idle = sdata->vif.bss_conf.idle; + /* do not count disabled managed interfaces */ if (sdata->vif.type == NL80211_IFTYPE_STATION && - !sdata->u.mgd.associated) + !sdata->u.mgd.associated) { + sdata->vif.bss_conf.idle = true; continue; + } /* do not count unused IBSS interfaces */ if (sdata->vif.type == NL80211_IFTYPE_ADHOC && - !sdata->u.ibss.ssid_len) + !sdata->u.ibss.ssid_len) { + sdata->vif.bss_conf.idle = true; continue; + } /* count everything else */ count++; } + list_for_each_entry(wk, &local->work_list, list) { + working = true; + wk->sdata->vif.bss_conf.idle = false; + } + + if (local->scan_sdata) { + scanning = true; + local->scan_sdata->vif.bss_conf.idle = false; + } + + list_for_each_entry(sdata, &local->interfaces, list) { + if (sdata->old_idle == sdata->vif.bss_conf.idle) + continue; + if (!ieee80211_sdata_running(sdata)) + continue; + ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_IDLE); + } + + if (working) + return ieee80211_idle_off(local, "working"); + if (scanning) + return ieee80211_idle_off(local, "scanning"); if (!count) return ieee80211_idle_on(local); else diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 17e9257..82e7cec 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1103,8 +1103,11 @@ static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid); ieee80211_set_disassoc(sdata, true); - ieee80211_recalc_idle(local); mutex_unlock(&ifmgd->mtx); + + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); /* * must be outside lock due to cfg80211, * but that's not a problem. @@ -1173,7 +1176,9 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, sdata->name, bssid, reason_code); ieee80211_set_disassoc(sdata, true); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return RX_MGMT_CFG80211_DEAUTH; } @@ -1203,7 +1208,9 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, sdata->name, mgmt->sa, reason_code); ieee80211_set_disassoc(sdata, true); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return RX_MGMT_CFG80211_DISASSOC; } @@ -1840,8 +1847,10 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) " after %dms, disconnecting.\n", bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ); ieee80211_set_disassoc(sdata, true); - ieee80211_recalc_idle(local); mutex_unlock(&ifmgd->mtx); + mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); /* * must be outside lock due to cfg80211, * but that's not a problem. @@ -2319,7 +2328,9 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, if (assoc_bss) sta_info_destroy_addr(sdata, bssid); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return 0; } @@ -2357,7 +2368,9 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, cookie, !req->local_state_change); sta_info_destroy_addr(sdata, bssid); + mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); + mutex_unlock(&sdata->local->mtx); return 0; } diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index f31f5497..31f233f 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -304,7 +304,9 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_offchannel_return(local, true); done: + mutex_lock(&local->mtx); ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); ieee80211_mesh_notify_scan_completed(local); diff --git a/net/mac80211/work.c b/net/mac80211/work.c index b98af64..ae344d1 100644 --- a/net/mac80211/work.c +++ b/net/mac80211/work.c @@ -888,10 +888,10 @@ static void ieee80211_work_work(struct work_struct *work) while ((skb = skb_dequeue(&local->work_skb_queue))) ieee80211_work_rx_queued_mgmt(local, skb); - ieee80211_recalc_idle(local); - mutex_lock(&local->mtx); + ieee80211_recalc_idle(local); + list_for_each_entry_safe(wk, tmp, &local->work_list, list) { bool started = wk->started; @@ -1001,10 +1001,10 @@ static void ieee80211_work_work(struct work_struct *work) &local->scan_work, round_jiffies_relative(0)); - mutex_unlock(&local->mtx); - ieee80211_recalc_idle(local); + mutex_unlock(&local->mtx); + list_for_each_entry_safe(wk, tmp, &free_work, list) { wk->done(wk, NULL); list_del(&wk->list); -- cgit v1.1 From d1f5b7a34aa5ff703c4966ea2652d4212ac75940 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2010 17:05:55 +0200 Subject: mac80211: allow drivers to request SM PS mode change Sometimes drivers have more information than the stack about how their antennas/chains are used, and may require that the SM PS mode be changed. This could happen, for example, when detecting that the user disconnected an antenna. Thus this patch introduces API to allow drivers to request SM PS mode changes. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ht.c | 28 ++++++++++++++++++++++++++++ net/mac80211/ieee80211_i.h | 6 +++++- net/mac80211/mlme.c | 3 +++ 3 files changed, 36 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 9d101fb..11f74f5 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -265,3 +265,31 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, return 0; } + +void ieee80211_request_smps_work(struct work_struct *work) +{ + struct ieee80211_sub_if_data *sdata = + container_of(work, struct ieee80211_sub_if_data, + u.mgd.request_smps_work); + + mutex_lock(&sdata->u.mgd.mtx); + __ieee80211_request_smps(sdata, sdata->u.mgd.driver_smps_mode); + mutex_unlock(&sdata->u.mgd.mtx); +} + +void ieee80211_request_smps(struct ieee80211_vif *vif, + enum ieee80211_smps_mode smps_mode) +{ + struct ieee80211_sub_if_data *sdata = vif_to_sdata(vif); + + if (WARN_ON(vif->type != NL80211_IFTYPE_STATION)) + return; + + if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF)) + smps_mode = IEEE80211_SMPS_AUTOMATIC; + + ieee80211_queue_work(&sdata->local->hw, + &sdata->u.mgd.request_smps_work); +} +/* this might change ... don't want non-open drivers using it */ +EXPORT_SYMBOL_GPL(ieee80211_request_smps); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 98e783c..1bf05bf 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -343,7 +343,10 @@ struct ieee80211_if_managed { unsigned long timers_running; /* used for quiesce/restart */ bool powersave; /* powersave requested for this iface */ enum ieee80211_smps_mode req_smps, /* requested smps mode */ - ap_smps; /* smps mode AP thinks we're in */ + ap_smps, /* smps mode AP thinks we're in */ + driver_smps_mode; /* smps mode request */ + + struct work_struct request_smps_work; unsigned int flags; @@ -1113,6 +1116,7 @@ void ieee80211_send_delba(struct ieee80211_sub_if_data *sdata, int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps, const u8 *da, const u8 *bssid); +void ieee80211_request_smps_work(struct work_struct *work); void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, u16 initiator, u16 reason); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 82e7cec..38996a4 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1926,6 +1926,8 @@ void ieee80211_sta_quiesce(struct ieee80211_sub_if_data *sdata) * time -- the code here is properly synchronised. */ + cancel_work_sync(&ifmgd->request_smps_work); + cancel_work_sync(&ifmgd->beacon_connection_loss_work); if (del_timer_sync(&ifmgd->timer)) set_bit(TMR_RUNNING_TIMER, &ifmgd->timers_running); @@ -1961,6 +1963,7 @@ void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata) INIT_WORK(&ifmgd->chswitch_work, ieee80211_chswitch_work); INIT_WORK(&ifmgd->beacon_connection_loss_work, ieee80211_beacon_connection_loss_work); + INIT_WORK(&ifmgd->request_smps_work, ieee80211_request_smps_work); setup_timer(&ifmgd->timer, ieee80211_sta_timer, (unsigned long) sdata); setup_timer(&ifmgd->bcn_mon_timer, ieee80211_sta_bcn_mon_timer, -- cgit v1.1 From 04600794958f1833f5571c6cde40f260ab557f55 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 5 Aug 2010 17:45:15 +0200 Subject: cfg80211: support sysfs namespaces Enable using network namespaces with wireless devices even when sysfs is enabled using the same infrastructure that was built for netdevs. Signed-off-by: Johannes Berg Acked-by: "Eric W. Biederman" Signed-off-by: John W. Linville --- net/core/net-sysfs.c | 3 ++- net/wireless/core.c | 7 ++++++- net/wireless/sysfs.c | 9 +++++++++ 3 files changed, 17 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index af4dfba..7d74854 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -789,12 +789,13 @@ static const void *net_netlink_ns(struct sock *sk) return sock_net(sk); } -static struct kobj_ns_type_operations net_ns_type_operations = { +struct kobj_ns_type_operations net_ns_type_operations = { .type = KOBJ_NS_TYPE_NET, .current_ns = net_current_ns, .netlink_ns = net_netlink_ns, .initial_ns = net_initial_ns, }; +EXPORT_SYMBOL_GPL(net_ns_type_operations); static void net_kobj_ns_exit(struct net *net) { diff --git a/net/wireless/core.c b/net/wireless/core.c index 541e2ff..c70909c 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -253,11 +253,16 @@ int cfg80211_switch_netns(struct cfg80211_registered_device *rdev, WARN_ON(err); wdev->netdev->features |= NETIF_F_NETNS_LOCAL; } + + return err; } wiphy_net_set(&rdev->wiphy, net); - return err; + err = device_rename(&rdev->wiphy.dev, dev_name(&rdev->wiphy.dev)); + WARN_ON(err); + + return 0; } static void cfg80211_rfkill_poll(struct rfkill *rfkill, void *data) diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 9f2cef3..74a9e3c 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -110,6 +110,13 @@ static int wiphy_resume(struct device *dev) return ret; } +static const void *wiphy_namespace(struct device *d) +{ + struct wiphy *wiphy = container_of(d, struct wiphy, dev); + + return wiphy_net(wiphy); +} + struct class ieee80211_class = { .name = "ieee80211", .owner = THIS_MODULE, @@ -120,6 +127,8 @@ struct class ieee80211_class = { #endif .suspend = wiphy_suspend, .resume = wiphy_resume, + .ns_type = &net_ns_type_operations, + .namespace = wiphy_namespace, }; int wiphy_sysfs_init(void) -- cgit v1.1 From 97359d1235eaf634fe706c9faa6e40181cc95fb8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Aug 2010 09:46:38 +0200 Subject: mac80211: use cipher suite selectors Currently, mac80211 translates the cfg80211 cipher suite selectors into ALG_* values. That isn't all too useful, and some drivers benefit from the distinction between WEP40 and WEP104 as well. Therefore, convert it all to use the cipher suite selectors. Signed-off-by: Johannes Berg Acked-by: Gertjan van Wingerde Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 44 ++++++++++-------------------------- net/mac80211/debugfs_key.c | 55 ++++++++++++++++++--------------------------- net/mac80211/driver-trace.h | 4 ++-- net/mac80211/key.c | 25 ++++++++++----------- net/mac80211/key.h | 4 +--- net/mac80211/rx.c | 18 ++++++++------- net/mac80211/tx.c | 22 +++++++++--------- net/mac80211/wep.c | 2 +- net/mac80211/wpa.c | 6 ++--- 9 files changed, 75 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 19c6146..9a35d9e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -116,7 +116,6 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, { struct ieee80211_sub_if_data *sdata; struct sta_info *sta = NULL; - enum ieee80211_key_alg alg; struct ieee80211_key *key; int err; @@ -125,31 +124,20 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, sdata = IEEE80211_DEV_TO_SUB_IF(dev); + /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: - case WLAN_CIPHER_SUITE_WEP104: - alg = ALG_WEP; - break; case WLAN_CIPHER_SUITE_TKIP: - alg = ALG_TKIP; - break; - case WLAN_CIPHER_SUITE_CCMP: - alg = ALG_CCMP; - break; - case WLAN_CIPHER_SUITE_AES_CMAC: - alg = ALG_AES_CMAC; + case WLAN_CIPHER_SUITE_WEP104: + if (IS_ERR(sdata->local->wep_tx_tfm)) + return -EINVAL; break; default: - return -EINVAL; + break; } - /* reject WEP and TKIP keys if WEP failed to initialize */ - if ((alg == ALG_WEP || alg == ALG_TKIP) && - IS_ERR(sdata->local->wep_tx_tfm)) - return -EINVAL; - - key = ieee80211_key_alloc(alg, key_idx, params->key_len, params->key, - params->seq_len, params->seq); + key = ieee80211_key_alloc(params->cipher, key_idx, params->key_len, + params->key, params->seq_len, params->seq); if (IS_ERR(key)) return PTR_ERR(key); @@ -247,10 +235,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, memset(¶ms, 0, sizeof(params)); - switch (key->conf.alg) { - case ALG_TKIP: - params.cipher = WLAN_CIPHER_SUITE_TKIP; + params.cipher = key->conf.cipher; + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_TKIP: iv32 = key->u.tkip.tx.iv32; iv16 = key->u.tkip.tx.iv16; @@ -268,8 +256,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq = seq; params.seq_len = 6; break; - case ALG_CCMP: - params.cipher = WLAN_CIPHER_SUITE_CCMP; + case WLAN_CIPHER_SUITE_CCMP: seq[0] = key->u.ccmp.tx_pn[5]; seq[1] = key->u.ccmp.tx_pn[4]; seq[2] = key->u.ccmp.tx_pn[3]; @@ -279,14 +266,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, params.seq = seq; params.seq_len = 6; break; - case ALG_WEP: - if (key->conf.keylen == 5) - params.cipher = WLAN_CIPHER_SUITE_WEP40; - else - params.cipher = WLAN_CIPHER_SUITE_WEP104; - break; - case ALG_AES_CMAC: - params.cipher = WLAN_CIPHER_SUITE_AES_CMAC; + case WLAN_CIPHER_SUITE_AES_CMAC: seq[0] = key->u.aes_cmac.tx_pn[5]; seq[1] = key->u.aes_cmac.tx_pn[4]; seq[2] = key->u.aes_cmac.tx_pn[3]; diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c index fa5e76e..1647f8d 100644 --- a/net/mac80211/debugfs_key.c +++ b/net/mac80211/debugfs_key.c @@ -64,26 +64,13 @@ static ssize_t key_algorithm_read(struct file *file, char __user *userbuf, size_t count, loff_t *ppos) { - char *alg; + char buf[15]; struct ieee80211_key *key = file->private_data; + u32 c = key->conf.cipher; - switch (key->conf.alg) { - case ALG_WEP: - alg = "WEP\n"; - break; - case ALG_TKIP: - alg = "TKIP\n"; - break; - case ALG_CCMP: - alg = "CCMP\n"; - break; - case ALG_AES_CMAC: - alg = "AES-128-CMAC\n"; - break; - default: - return 0; - } - return simple_read_from_buffer(userbuf, count, ppos, alg, strlen(alg)); + sprintf(buf, "%.2x-%.2x-%.2x:%d\n", + c >> 24, (c >> 16) & 0xff, (c >> 8) & 0xff, c & 0xff); + return simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf)); } KEY_OPS(algorithm); @@ -95,21 +82,22 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf, int len; struct ieee80211_key *key = file->private_data; - switch (key->conf.alg) { - case ALG_WEP: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: len = scnprintf(buf, sizeof(buf), "%08x %04x\n", key->u.tkip.tx.iv32, key->u.tkip.tx.iv16); break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: tpn = key->u.ccmp.tx_pn; len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], tpn[5]); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: tpn = key->u.aes_cmac.tx_pn; len = scnprintf(buf, sizeof(buf), "%02x%02x%02x%02x%02x%02x\n", tpn[0], tpn[1], tpn[2], tpn[3], tpn[4], @@ -130,11 +118,12 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, int i, len; const u8 *rpn; - switch (key->conf.alg) { - case ALG_WEP: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: len = scnprintf(buf, sizeof(buf), "\n"); break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: for (i = 0; i < NUM_RX_DATA_QUEUES; i++) p += scnprintf(p, sizeof(buf)+buf-p, "%08x %04x\n", @@ -142,7 +131,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, key->u.tkip.rx[i].iv16); len = p - buf; break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: for (i = 0; i < NUM_RX_DATA_QUEUES + 1; i++) { rpn = key->u.ccmp.rx_pn[i]; p += scnprintf(p, sizeof(buf)+buf-p, @@ -152,7 +141,7 @@ static ssize_t key_rx_spec_read(struct file *file, char __user *userbuf, } len = p - buf; break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: rpn = key->u.aes_cmac.rx_pn; p += scnprintf(p, sizeof(buf)+buf-p, "%02x%02x%02x%02x%02x%02x\n", @@ -174,11 +163,11 @@ static ssize_t key_replays_read(struct file *file, char __user *userbuf, char buf[20]; int len; - switch (key->conf.alg) { - case ALG_CCMP: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_CCMP: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.ccmp.replays); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.replays); break; @@ -196,8 +185,8 @@ static ssize_t key_icverrors_read(struct file *file, char __user *userbuf, char buf[20]; int len; - switch (key->conf.alg) { - case ALG_AES_CMAC: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_AES_CMAC: len = scnprintf(buf, sizeof(buf), "%u\n", key->u.aes_cmac.icverrors); break; diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index 5d5d2a9..b5a9558 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -336,7 +336,7 @@ TRACE_EVENT(drv_set_key, LOCAL_ENTRY VIF_ENTRY STA_ENTRY - __field(enum ieee80211_key_alg, alg) + __field(u32, cipher) __field(u8, hw_key_idx) __field(u8, flags) __field(s8, keyidx) @@ -346,7 +346,7 @@ TRACE_EVENT(drv_set_key, LOCAL_ASSIGN; VIF_ASSIGN; STA_ASSIGN; - __entry->alg = key->alg; + __entry->cipher = key->cipher; __entry->flags = key->flags; __entry->keyidx = key->keyidx; __entry->hw_key_idx = key->hw_key_idx; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index d6dbc8e..3203d1d 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -227,9 +227,7 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, } } -struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, - int idx, - size_t key_len, +struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, const u8 *key_data, size_t seq_len, const u8 *seq) { @@ -249,15 +247,16 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, key->conf.flags = 0; key->flags = 0; - key->conf.alg = alg; + key->conf.cipher = cipher; key->conf.keyidx = idx; key->conf.keylen = key_len; - switch (alg) { - case ALG_WEP: + switch (cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: key->conf.iv_len = WEP_IV_LEN; key->conf.icv_len = WEP_ICV_LEN; break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: key->conf.iv_len = TKIP_IV_LEN; key->conf.icv_len = TKIP_ICV_LEN; if (seq) { @@ -269,7 +268,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, } } break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: key->conf.iv_len = CCMP_HDR_LEN; key->conf.icv_len = CCMP_MIC_LEN; if (seq) { @@ -279,7 +278,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, seq[CCMP_PN_LEN - j - 1]; } break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: key->conf.iv_len = 0; key->conf.icv_len = sizeof(struct ieee80211_mmie); if (seq) @@ -290,7 +289,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, memcpy(key->conf.key, key_data, key_len); INIT_LIST_HEAD(&key->list); - if (alg == ALG_CCMP) { + if (cipher == WLAN_CIPHER_SUITE_CCMP) { /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. @@ -303,7 +302,7 @@ struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, } } - if (alg == ALG_AES_CMAC) { + if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. @@ -328,9 +327,9 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) if (key->local) ieee80211_key_disable_hw_accel(key); - if (key->conf.alg == ALG_CCMP) + if (key->conf.cipher == WLAN_CIPHER_SUITE_CCMP) ieee80211_aes_key_free(key->u.ccmp.tfm); - if (key->conf.alg == ALG_AES_CMAC) + if (key->conf.cipher == WLAN_CIPHER_SUITE_AES_CMAC) ieee80211_aes_cmac_key_free(key->u.aes_cmac.tfm); if (key->local) ieee80211_debugfs_key_remove(key); diff --git a/net/mac80211/key.h b/net/mac80211/key.h index b665bbb..53b5ce1 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -123,9 +123,7 @@ struct ieee80211_key { struct ieee80211_key_conf conf; }; -struct ieee80211_key *ieee80211_key_alloc(enum ieee80211_key_alg alg, - int idx, - size_t key_len, +struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, const u8 *key_data, size_t seq_len, const u8 *seq); /* diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index f24a0a1..ad24270 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -961,7 +961,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * pairwise or station-to-station keys, but for WEP we allow * using a key index as well. */ - if (rx->key && rx->key->conf.alg != ALG_WEP && + if (rx->key && rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 && !is_multicast_ether_addr(hdr->addr1)) rx->key = NULL; } @@ -977,8 +978,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) return RX_DROP_UNUSABLE; /* the hdr variable is invalid now! */ - switch (rx->key->conf.alg) { - case ALG_WEP: + switch (rx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: /* Check for weak IVs if possible */ if (rx->sta && ieee80211_is_data(fc) && (!(status->flag & RX_FLAG_IV_STRIPPED) || @@ -988,13 +990,13 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) result = ieee80211_crypto_wep_decrypt(rx); break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: result = ieee80211_crypto_tkip_decrypt(rx); break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: result = ieee80211_crypto_ccmp_decrypt(rx); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: result = ieee80211_crypto_aes_cmac_decrypt(rx); break; } @@ -1291,7 +1293,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) /* This is the first fragment of a new frame. */ entry = ieee80211_reassemble_add(rx->sdata, frag, seq, rx->queue, &(rx->skb)); - if (rx->key && rx->key->conf.alg == ALG_CCMP && + if (rx->key && rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP && ieee80211_has_protected(fc)) { int queue = ieee80211_is_mgmt(fc) ? NUM_RX_DATA_QUEUES : rx->queue; @@ -1320,7 +1322,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) int i; u8 pn[CCMP_PN_LEN], *rpn; int queue; - if (!rx->key || rx->key->conf.alg != ALG_CCMP) + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP) return RX_DROP_UNUSABLE; memcpy(pn, entry->last_pn, CCMP_PN_LEN); for (i = CCMP_PN_LEN - 1; i >= 0; i--) { diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index c54db96..bc4fefc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -543,15 +543,16 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) tx->key->tx_rx_count++; /* TODO: add threshold stuff again */ - switch (tx->key->conf.alg) { - case ALG_WEP: + switch (tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: if (ieee80211_is_auth(hdr->frame_control)) break; - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: if (!ieee80211_is_data_present(hdr->frame_control)) tx->key = NULL; break; - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: if (!ieee80211_is_data_present(hdr->frame_control) && !ieee80211_use_mfp(hdr->frame_control, tx->sta, tx->skb)) @@ -561,7 +562,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) IEEE80211_KEY_FLAG_SW_MGMT) && ieee80211_is_mgmt(hdr->frame_control); break; - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: if (!ieee80211_is_mgmt(hdr->frame_control)) tx->key = NULL; break; @@ -949,14 +950,15 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) if (!tx->key) return TX_CONTINUE; - switch (tx->key->conf.alg) { - case ALG_WEP: + switch (tx->key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: return ieee80211_crypto_wep_encrypt(tx); - case ALG_TKIP: + case WLAN_CIPHER_SUITE_TKIP: return ieee80211_crypto_tkip_encrypt(tx); - case ALG_CCMP: + case WLAN_CIPHER_SUITE_CCMP: return ieee80211_crypto_ccmp_encrypt(tx); - case ALG_AES_CMAC: + case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); } diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index 9ebc8d8..f27484c 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -240,7 +240,7 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local, keyidx = skb->data[hdrlen + 3] >> 6; - if (!key || keyidx != key->conf.keyidx || key->conf.alg != ALG_WEP) + if (!key || keyidx != key->conf.keyidx) return -1; klen = 3 + key->conf.keylen; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 8d59d27..b08ad94 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -36,8 +36,8 @@ ieee80211_tx_h_michael_mic_add(struct ieee80211_tx_data *tx) int tail; hdr = (struct ieee80211_hdr *)skb->data; - if (!tx->key || tx->key->conf.alg != ALG_TKIP || skb->len < 24 || - !ieee80211_is_data_present(hdr->frame_control)) + if (!tx->key || tx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || + skb->len < 24 || !ieee80211_is_data_present(hdr->frame_control)) return TX_CONTINUE; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -94,7 +94,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) if (status->flag & RX_FLAG_MMIC_STRIPPED) return RX_CONTINUE; - if (!rx->key || rx->key->conf.alg != ALG_TKIP || + if (!rx->key || rx->key->conf.cipher != WLAN_CIPHER_SUITE_TKIP || !ieee80211_has_protected(hdr->frame_control) || !ieee80211_is_data_present(hdr->frame_control)) return RX_CONTINUE; -- cgit v1.1 From 60ae0f20058d19ada94093dc3ef7ae0737597fba Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Aug 2010 09:46:39 +0200 Subject: mac80211: move key tfm setup There's no need to keep separate if statements for setting up the CCMP/AES-CMAC tfm structs; move that into the existing switch statement. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/key.c | 26 ++++++++++---------------- 1 file changed, 10 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 3203d1d..9c27c53 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -277,19 +277,6 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, key->u.ccmp.rx_pn[i][j] = seq[CCMP_PN_LEN - j - 1]; } - break; - case WLAN_CIPHER_SUITE_AES_CMAC: - key->conf.iv_len = 0; - key->conf.icv_len = sizeof(struct ieee80211_mmie); - if (seq) - for (j = 0; j < 6; j++) - key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1]; - break; - } - memcpy(key->conf.key, key_data, key_len); - INIT_LIST_HEAD(&key->list); - - if (cipher == WLAN_CIPHER_SUITE_CCMP) { /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. @@ -300,9 +287,13 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, kfree(key); key = ERR_PTR(err); } - } - - if (cipher == WLAN_CIPHER_SUITE_AES_CMAC) { + break; + case WLAN_CIPHER_SUITE_AES_CMAC: + key->conf.iv_len = 0; + key->conf.icv_len = sizeof(struct ieee80211_mmie); + if (seq) + for (j = 0; j < 6; j++) + key->u.aes_cmac.rx_pn[j] = seq[6 - j - 1]; /* * Initialize AES key state here as an optimization so that * it does not need to be initialized for every packet. @@ -314,7 +305,10 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, kfree(key); key = ERR_PTR(err); } + break; } + memcpy(key->conf.key, key_data, key_len); + INIT_LIST_HEAD(&key->list); return key; } -- cgit v1.1 From dc1580ddfc1f70636f6ef80a385902f7e8278deb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Aug 2010 09:46:40 +0200 Subject: mac80211: remove unused status flag checks The decryption code verifies whether or not a given frame was decrypted and verified by hardware. This is unnecessary, as the crypto RX handler already does it long before the decryption code is even invoked, so remove that code to avoid confusion. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 3 +++ net/mac80211/wpa.c | 26 ++++++-------------------- 2 files changed, 9 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ad24270..4fdbed58 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -899,6 +899,9 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (!is_multicast_ether_addr(hdr->addr1) && stakey) { rx->key = stakey; + if ((status->flag & RX_FLAG_DECRYPTED) && + (status->flag & RX_FLAG_IV_STRIPPED)) + return RX_CONTINUE; /* Skip decryption if the frame is not protected. */ if (!ieee80211_has_protected(fc)) return RX_CONTINUE; diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index b08ad94..43882b3 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -221,19 +221,13 @@ ieee80211_crypto_tkip_decrypt(struct ieee80211_rx_data *rx) if (!rx->sta || skb->len - hdrlen < 12) return RX_DROP_UNUSABLE; - if (status->flag & RX_FLAG_DECRYPTED) { - if (status->flag & RX_FLAG_IV_STRIPPED) { - /* - * Hardware took care of all processing, including - * replay protection, and stripped the ICV/IV so - * we cannot do any checks here. - */ - return RX_CONTINUE; - } - - /* let TKIP code verify IV, but skip decryption */ + /* + * Let TKIP code verify IV, but skip decryption. + * In the case where hardware checks the IV as well, + * we don't even get here, see ieee80211_rx_h_decrypt() + */ + if (status->flag & RX_FLAG_DECRYPTED) hwaccel = 1; - } res = ieee80211_tkip_decrypt_data(rx->local->wep_rx_tfm, key, skb->data + hdrlen, @@ -447,10 +441,6 @@ ieee80211_crypto_ccmp_decrypt(struct ieee80211_rx_data *rx) if (!rx->sta || data_len < 0) return RX_DROP_UNUSABLE; - if ((status->flag & RX_FLAG_DECRYPTED) && - (status->flag & RX_FLAG_IV_STRIPPED)) - return RX_CONTINUE; - ccmp_hdr2pn(pn, skb->data + hdrlen); queue = ieee80211_is_mgmt(hdr->frame_control) ? @@ -564,10 +554,6 @@ ieee80211_crypto_aes_cmac_decrypt(struct ieee80211_rx_data *rx) if (!ieee80211_is_mgmt(hdr->frame_control)) return RX_CONTINUE; - if ((status->flag & RX_FLAG_DECRYPTED) && - (status->flag & RX_FLAG_IV_STRIPPED)) - return RX_CONTINUE; - if (skb->len < 24 + sizeof(*mmie)) return RX_DROP_UNUSABLE; -- cgit v1.1 From 5daa8a8e691e28c6c725e7e91319b160b555c615 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Aug 2010 09:46:41 +0200 Subject: mac80211: dont advertise WEP if unavailable When WEP is unavailable, don't advertise it to cfg80211. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/main.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 06b9608..0afccda 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -536,6 +536,7 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) int channels, max_bitrates; bool supp_ht; static const u32 cipher_suites[] = { + /* keep WEP first, it may be removed below */ WLAN_CIPHER_SUITE_WEP40, WLAN_CIPHER_SUITE_WEP104, WLAN_CIPHER_SUITE_TKIP, @@ -623,6 +624,10 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) local->hw.wiphy->n_cipher_suites--; + if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) { + local->hw.wiphy->cipher_suites += 2; + local->hw.wiphy->n_cipher_suites -= 2; + } result = wiphy_register(local->hw.wiphy); if (result < 0) -- cgit v1.1 From afea0b7af7a0c070da8b2029d721abc930e5f96f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 10 Aug 2010 09:46:42 +0200 Subject: cfg80211: check if WEP is available for shared key auth When shared key auth is requested, cfg80211 should verify that the device is capable of WEP crypto which is required. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 37902a5..bb5b78e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3572,6 +3572,21 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; + if (key.idx >= 0) { + int i; + bool ok = false; + for (i = 0; i < rdev->wiphy.n_cipher_suites; i++) { + if (key.p.cipher == rdev->wiphy.cipher_suites[i]) { + ok = true; + break; + } + } + if (!ok) { + err = -EINVAL; + goto out; + } + } + if (!rdev->ops->auth) { err = -EOPNOTSUPP; goto out; -- cgit v1.1 From bfb564e7391340638afe4ad67744a8f3858e7566 Mon Sep 17 00:00:00 2001 From: Krishna Kumar Date: Wed, 4 Aug 2010 06:15:52 +0000 Subject: core: Factor out flow calculation from get_rps_cpu Factor out flow calculation code from get_rps_cpu, since other functions can use the same code. Revisions: v2 (Ben): Separate flow calcuation out and use in select queue. v3 (Arnd): Don't re-implement MIN. v4 (Changli): skb->data points to ethernet header in macvtap, and make a fast path. Tested macvtap with this patch. v5 (Changli): - Cache skb->rxhash in skb_get_rxhash - macvtap may not have pow(2) queues, so change code for queue selection. (Arnd): - Use first available queue if all fails. Signed-off-by: Krishna Kumar Signed-off-by: David S. Miller --- net/core/dev.c | 106 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 62 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 1ae6543..586a11c 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2259,69 +2259,41 @@ static inline void ____napi_schedule(struct softnet_data *sd, __raise_softirq_irqoff(NET_RX_SOFTIRQ); } -#ifdef CONFIG_RPS - -/* One global table that all flow-based protocols share. */ -struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; -EXPORT_SYMBOL(rps_sock_flow_table); - /* - * get_rps_cpu is called from netif_receive_skb and returns the target - * CPU from the RPS map of the receiving queue for a given skb. - * rcu_read_lock must be held on entry. + * __skb_get_rxhash: calculate a flow hash based on src/dst addresses + * and src/dst port numbers. Returns a non-zero hash number on success + * and 0 on failure. */ -static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, - struct rps_dev_flow **rflowp) +__u32 __skb_get_rxhash(struct sk_buff *skb) { + int nhoff, hash = 0; struct ipv6hdr *ip6; struct iphdr *ip; - struct netdev_rx_queue *rxqueue; - struct rps_map *map; - struct rps_dev_flow_table *flow_table; - struct rps_sock_flow_table *sock_flow_table; - int cpu = -1; u8 ip_proto; - u16 tcpu; u32 addr1, addr2, ihl; union { u32 v32; u16 v16[2]; } ports; - if (skb_rx_queue_recorded(skb)) { - u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); - goto done; - } - rxqueue = dev->_rx + index; - } else - rxqueue = dev->_rx; - - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) - goto done; - - if (skb->rxhash) - goto got_hash; /* Skip hash computation on packet header */ + nhoff = skb_network_offset(skb); switch (skb->protocol) { case __constant_htons(ETH_P_IP): - if (!pskb_may_pull(skb, sizeof(*ip))) + if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) skb->data; + ip = (struct iphdr *) skb->data + nhoff; ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; ihl = ip->ihl; break; case __constant_htons(ETH_P_IPV6): - if (!pskb_may_pull(skb, sizeof(*ip6))) + if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) skb->data; + ip6 = (struct ipv6hdr *) skb->data + nhoff; ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; @@ -2330,6 +2302,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, default: goto done; } + switch (ip_proto) { case IPPROTO_TCP: case IPPROTO_UDP: @@ -2338,8 +2311,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, case IPPROTO_AH: case IPPROTO_SCTP: case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4)) { - ports.v32 = * (__force u32 *) (skb->data + (ihl * 4)); + if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff + + (ihl * 4)); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); break; @@ -2352,11 +2326,55 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, /* get a consistent hash (same value on both flow directions) */ if (addr2 < addr1) swap(addr1, addr2); - skb->rxhash = jhash_3words(addr1, addr2, ports.v32, hashrnd); - if (!skb->rxhash) - skb->rxhash = 1; -got_hash: + hash = jhash_3words(addr1, addr2, ports.v32, hashrnd); + if (!hash) + hash = 1; + +done: + return hash; +} +EXPORT_SYMBOL(__skb_get_rxhash); + +#ifdef CONFIG_RPS + +/* One global table that all flow-based protocols share. */ +struct rps_sock_flow_table *rps_sock_flow_table __read_mostly; +EXPORT_SYMBOL(rps_sock_flow_table); + +/* + * get_rps_cpu is called from netif_receive_skb and returns the target + * CPU from the RPS map of the receiving queue for a given skb. + * rcu_read_lock must be held on entry. + */ +static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, + struct rps_dev_flow **rflowp) +{ + struct netdev_rx_queue *rxqueue; + struct rps_map *map; + struct rps_dev_flow_table *flow_table; + struct rps_sock_flow_table *sock_flow_table; + int cpu = -1; + u16 tcpu; + + if (skb_rx_queue_recorded(skb)) { + u16 index = skb_get_rx_queue(skb); + if (unlikely(index >= dev->num_rx_queues)) { + WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " + "on queue %u, but number of RX queues is %u\n", + dev->name, index, dev->num_rx_queues); + goto done; + } + rxqueue = dev->_rx + index; + } else + rxqueue = dev->_rx; + + if (!rxqueue->rps_map && !rxqueue->rps_flow_table) + goto done; + + if (!skb_get_rxhash(skb)) + goto done; + flow_table = rcu_dereference(rxqueue->rps_flow_table); sock_flow_table = rcu_dereference(rps_sock_flow_table); if (flow_table && sock_flow_table) { -- cgit v1.1 From 510a05edce43ec29ceb105677ad8e6ff58f02c72 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 5 Aug 2010 10:19:00 +0000 Subject: net/atm: Adjust confusing if indentation Outdent the code following an if. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r disable braces4@ position p1,p2; statement S1,S2; @@ ( if (...) { ... } | if (...) S1@p1 S2@p2 ) @script:python@ p1 << r.p1; p2 << r.p2; @@ if (p1[0].column == p2[0].column): cocci.print_main("branch",p1) cocci.print_secs("after",p2) // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/atm/common.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/atm/common.c b/net/atm/common.c index 940404a..1b9c52a 100644 --- a/net/atm/common.c +++ b/net/atm/common.c @@ -792,7 +792,7 @@ int vcc_getsockopt(struct socket *sock, int level, int optname, default: if (level == SOL_SOCKET) return -EINVAL; - break; + break; } if (!vcc->dev || !vcc->dev->ops->getsockopt) return -EINVAL; -- cgit v1.1 From bb8a10bbd10a45db0eb45bac520489bdbc0917ef Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 5 Aug 2010 10:29:38 +0000 Subject: net/decnet: Adjust confusing if indentation Indent the branch of an if. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r disable braces4@ position p1,p2; statement S1,S2; @@ ( if (...) { ... } | if (...) S1@p1 S2@p2 ) @script:python@ p1 << r.p1; p2 << r.p2; @@ if (p1[0].column == p2[0].column): cocci.print_main("branch",p1) cocci.print_secs("after",p2) // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/decnet/dn_nsp_out.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/decnet/dn_nsp_out.c b/net/decnet/dn_nsp_out.c index baeb1ea..2ef1152 100644 --- a/net/decnet/dn_nsp_out.c +++ b/net/decnet/dn_nsp_out.c @@ -693,22 +693,22 @@ void dn_nsp_send_conninit(struct sock *sk, unsigned char msgflg) aux = scp->accessdata.acc_userl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_user, aux); aux = scp->accessdata.acc_passl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_pass, aux); aux = scp->accessdata.acc_accl; *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); + memcpy(skb_put(skb, aux), scp->accessdata.acc_acc, aux); aux = (__u8)le16_to_cpu(scp->conndata_out.opt_optl); *skb_put(skb, 1) = aux; if (aux > 0) - memcpy(skb_put(skb,aux), scp->conndata_out.opt_data, aux); + memcpy(skb_put(skb, aux), scp->conndata_out.opt_data, aux); scp->persist = dn_nsp_persist(sk); scp->persist_fxn = dn_nsp_retrans_conninit; -- cgit v1.1 From 01414802054c382072b6cb9a1bdc6e243c74b2d5 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 17 Aug 2010 02:31:15 -0700 Subject: ethtool: Provide a default implementation of ethtool_ops::get_drvinfo The driver name and bus address for a net_device can normally be found through the driver model now. Instead of requiring drivers to provide this information redundantly through the ethtool_ops::get_drvinfo operation, use the driver model to do so if the driver does not define the operation. Since ETHTOOL_GDRVINFO no longer requires the driver to implement any operations, do not require net_device::ethtool_ops to be set either. Remove implementations of get_drvinfo and ethtool_ops that provide only this information. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 33 +++++++++++++++++++++++---------- 1 file changed, 23 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 7a85367..d2c4da5 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -205,18 +205,24 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo info; const struct ethtool_ops *ops = dev->ethtool_ops; - if (!ops->get_drvinfo) - return -EOPNOTSUPP; - memset(&info, 0, sizeof(info)); info.cmd = ETHTOOL_GDRVINFO; - ops->get_drvinfo(dev, &info); + if (ops && ops->get_drvinfo) { + ops->get_drvinfo(dev, &info); + } else if (dev->dev.parent && dev->dev.parent->driver) { + strlcpy(info.bus_info, dev_name(dev->dev.parent), + sizeof(info.bus_info)); + strlcpy(info.driver, dev->dev.parent->driver->name, + sizeof(info.driver)); + } else { + return -EOPNOTSUPP; + } /* * this method of obtaining string set info is deprecated; * Use ETHTOOL_GSSET_INFO instead. */ - if (ops->get_sset_count) { + if (ops && ops->get_sset_count) { int rc; rc = ops->get_sset_count(dev, ETH_SS_TEST); @@ -229,9 +235,9 @@ static noinline_for_stack int ethtool_get_drvinfo(struct net_device *dev, if (rc >= 0) info.n_priv_flags = rc; } - if (ops->get_regs_len) + if (ops && ops->get_regs_len) info.regdump_len = ops->get_regs_len(dev); - if (ops->get_eeprom_len) + if (ops && ops->get_eeprom_len) info.eedump_len = ops->get_eeprom_len(dev); if (copy_to_user(useraddr, &info, sizeof(info))) @@ -1402,12 +1408,19 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) if (!dev || !netif_device_present(dev)) return -ENODEV; - if (!dev->ethtool_ops) - return -EOPNOTSUPP; - if (copy_from_user(ðcmd, useraddr, sizeof(ethcmd))) return -EFAULT; + if (!dev->ethtool_ops) { + /* ETHTOOL_GDRVINFO does not require any driver support. + * It is also unprivileged and does not change anything, + * so we can take a shortcut to it. */ + if (ethcmd == ETHTOOL_GDRVINFO) + return ethtool_get_drvinfo(dev, useraddr); + else + return -EOPNOTSUPP; + } + /* Allow some commands to be done by anyone */ switch (ethcmd) { case ETHTOOL_GDRVINFO: -- cgit v1.1 From f81380925209bc60732a57eef41ab440c056aacf Mon Sep 17 00:00:00 2001 From: Anders Kaseorg <[andersk@ksplice.com]> Date: Tue, 17 Aug 2010 11:00:03 +0000 Subject: tipc: Fix log buffer memory leak if initialization fails Moves log buffer cleanup into tipc_core_stop() so that memory allocated for the log buffer is freed if tipc_core_start() is unsuccessful. Signed-off-by: Anders Kaseorg Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/core.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index 6964681..466b861 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -169,6 +169,7 @@ void tipc_core_stop(void) tipc_nametbl_stop(); tipc_ref_table_stop(); tipc_socket_stop(); + tipc_log_resize(0); } /** @@ -203,7 +204,9 @@ static int __init tipc_init(void) { int res; - tipc_log_resize(CONFIG_TIPC_LOG); + if (tipc_log_resize(CONFIG_TIPC_LOG) != 0) + warn("Unable to create log buffer\n"); + info("Activated (version " TIPC_MOD_VER " compiled " __DATE__ " " __TIME__ ")\n"); @@ -230,7 +233,6 @@ static void __exit tipc_exit(void) tipc_core_stop_net(); tipc_core_stop(); info("Deactivated\n"); - tipc_log_resize(0); } module_init(tipc_init); -- cgit v1.1 From 3720d40b201fe82dce1d8a64a31bfbf49c221771 Mon Sep 17 00:00:00 2001 From: Florian Westphal Date: Tue, 17 Aug 2010 11:00:04 +0000 Subject: tipc: add SO_RCVLOWAT support to stream socket receive path Add support for the SO_RCVLOWAT socket option to TIPC's stream socket type. Signed-off-by: Florian Westphal Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/socket.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 66e889b..69d0fd1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1026,9 +1026,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, struct sk_buff *buf; struct tipc_msg *msg; unsigned int sz; - int sz_to_copy; + int sz_to_copy, target, needed; int sz_copied = 0; - int needed; char __user *crs = m->msg_iov->iov_base; unsigned char *buf_crs; u32 err; @@ -1050,6 +1049,8 @@ static int recv_stream(struct kiocb *iocb, struct socket *sock, goto exit; } + target = sock_rcvlowat(sk, flags & MSG_WAITALL, buf_len); + restart: /* Look for a message in receive queue; wait if necessary */ @@ -1138,7 +1139,7 @@ restart: if ((sz_copied < buf_len) && /* didn't get all requested data */ (!skb_queue_empty(&sk->sk_receive_queue) || - (flags & MSG_WAITALL)) && /* and more is ready or required */ + (sz_copied < target)) && /* and more is ready or required */ (!(flags & MSG_PEEK)) && /* and aren't just peeking at data */ (!err)) /* and haven't reached a FIN */ goto restart; -- cgit v1.1 From 35997e3157eba16c6124d440bdf9272087129b2a Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:05 +0000 Subject: tipc: Provide correct error code for unsupported connect() operation Modify TIPC to return EOPNOTSUPP if an application attempts to perform a non-blocking connect() operation, which is not supported by TIPC. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 69d0fd1..b89c7b1 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1380,7 +1380,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, /* For now, TIPC does not support the non-blocking form of connect() */ if (flags & O_NONBLOCK) { - res = -EWOULDBLOCK; + res = -EOPNOTSUPP; goto exit; } -- cgit v1.1 From f662c07058f7e6365ae65080d772f9122f6f50a9 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:06 +0000 Subject: tipc: correct problems with misleading flags returned using poll() Prevent TIPC from incorrectly setting returned flags to poll() in the following cases: - an unconnected socket no longer indicates that it is always readable - an unconnected, connecting, or listening socket no longer indicates that it is always writable Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/socket.c | 61 ++++++++++++++++++++++++++++++++++++------------------- 1 file changed, 40 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index b89c7b1..7b81fdd 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -429,36 +429,55 @@ static int get_name(struct socket *sock, struct sockaddr *uaddr, * to handle any preventable race conditions, so TIPC will do the same ... * * TIPC sets the returned events as follows: - * a) POLLRDNORM and POLLIN are set if the socket's receive queue is non-empty - * or if a connection-oriented socket is does not have an active connection - * (i.e. a read operation will not block). - * b) POLLOUT is set except when a socket's connection has been terminated - * (i.e. a write operation will not block). - * c) POLLHUP is set when a socket's connection has been terminated. - * - * IMPORTANT: The fact that a read or write operation will not block does NOT - * imply that the operation will succeed! + * + * socket state flags set + * ------------ --------- + * unconnected no read flags + * no write flags + * + * connecting POLLIN/POLLRDNORM if ACK/NACK in rx queue + * no write flags + * + * connected POLLIN/POLLRDNORM if data in rx queue + * POLLOUT if port is not congested + * + * disconnecting POLLIN/POLLRDNORM/POLLHUP + * no write flags + * + * listening POLLIN if SYN in rx queue + * no write flags + * + * ready POLLIN/POLLRDNORM if data in rx queue + * [connectionless] POLLOUT (since port cannot be congested) + * + * IMPORTANT: The fact that a read or write operation is indicated does NOT + * imply that the operation will succeed, merely that it should be performed + * and will not block. */ static unsigned int poll(struct file *file, struct socket *sock, poll_table *wait) { struct sock *sk = sock->sk; - u32 mask; + u32 mask = 0; poll_wait(file, sk_sleep(sk), wait); - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sock->state == SS_UNCONNECTED) || - (sock->state == SS_DISCONNECTING)) - mask = (POLLRDNORM | POLLIN); - else - mask = 0; - - if (sock->state == SS_DISCONNECTING) - mask |= POLLHUP; - else - mask |= POLLOUT; + switch ((int)sock->state) { + case SS_READY: + case SS_CONNECTED: + if (!tipc_sk_port(sk)->congested) + mask |= POLLOUT; + /* fall thru' */ + case SS_CONNECTING: + case SS_LISTENING: + if (!skb_queue_empty(&sk->sk_receive_queue)) + mask |= (POLLIN | POLLRDNORM); + break; + case SS_DISCONNECTING: + mask = (POLLIN | POLLRDNORM | POLLHUP); + break; + } return mask; } -- cgit v1.1 From b02b69c8a403859ec72090742727e853d606a325 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:07 +0000 Subject: tipc: Check for disabled bearer when processing incoming messages Add a check to tipc_recv_msg() to ensure it discards messages arriving on a newly disabled bearer. This is needed to deal with a race condition that can arise if the bearer is in the midst of being disabled when it receives a message. Performing the check after tipc_net_lock has been taken ensures that TIPC's bearers are in a stable state while the message is being processed. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/link.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'net') diff --git a/net/tipc/link.c b/net/tipc/link.c index a3616b9..9d18c9b 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1802,6 +1802,15 @@ static int link_recv_buf_validate(struct sk_buff *buf) return pskb_may_pull(buf, hdr_size); } +/** + * tipc_recv_msg - process TIPC messages arriving from off-node + * @head: pointer to message buffer chain + * @tb_ptr: pointer to bearer message arrived on + * + * Invoked with no locks held. Bearer pointer must point to a valid bearer + * structure (i.e. cannot be NULL), but bearer can be inactive. + */ + void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) { read_lock_bh(&tipc_net_lock); @@ -1819,6 +1828,11 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) head = head->next; + /* Ensure bearer is still enabled */ + + if (unlikely(!b_ptr->active)) + goto cont; + /* Ensure message is well-formed */ if (unlikely(!link_recv_buf_validate(buf))) -- cgit v1.1 From 7e3e5d0950559d1118dccbdff3c765fffcf04fd5 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:08 +0000 Subject: tipc: Prevent crash when broadcast link cannot send to all nodes Allow TIPC's broadcast link to continue operation when it is unable to send a message to all nodes in the cluster. Previously, the broadcast link attempted to put the broadcast pseudo-bearer into a blocked state; however, this caused a crash because the associated bearer structure is only partially initialized. Further investigation has revealed some conceptual problems with blocking the pseudo-bearer; consequently, this functionality has been disabled for the time being and the undelivered message is eventually resent by the broadcast link's existing message retransmission mechanism (if possible). Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/bcast.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index a008c66..42b1737 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -609,11 +609,13 @@ static int tipc_bcbearer_send(struct sk_buff *buf, bcbearer->remains = bcbearer->remains_new; } - /* Unable to reach all targets */ + /* + * Unable to reach all targets (indicate success, since currently + * there isn't code in place to properly block & unblock the + * pseudo-bearer used by the broadcast link) + */ - bcbearer->bearer.publ.blocked = 1; - bcl->stats.bearer_congs++; - return 1; + return TIPC_OK; } /** -- cgit v1.1 From 5b1f7bdeb698547cc319c7a302a5acf585227a92 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:09 +0000 Subject: tipc: Fix premature broadcast advertisement by sending node Prevent a TIPC node from sending out a LINK_STATE message advertising a broadcast message that it is in the process of sending, but has not yet actually sent. Previously, it was possible for a link timeout to occur in between the time the broadcast link updated its "last message sent" counter and the time the broadcast message was passed to the broadcast bearer for transmission. This ensures that the code which issues the LINK_STATE message isn't informed of the new message until the broadcast bearer has had a chance to send it. Note: The "last message sent" value is stored in the "fsm_msg_count" field of the link structure used by the broadcast link. Since the broadcast link doesn't utilize the normal link FSM, this field can be re-used rather than adding a new field to the broadcast link. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/bcast.c | 30 +++++++++++++++++++----------- 1 file changed, 19 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index 42b1737..eefdd1a 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -143,6 +143,19 @@ static void bcbuf_decr_acks(struct sk_buff *buf) } +static void bclink_set_last_sent(void) +{ + if (bcl->next_out) + bcl->fsm_msg_cnt = mod(buf_seqno(bcl->next_out) - 1); + else + bcl->fsm_msg_cnt = mod(bcl->next_out_no - 1); +} + +u32 tipc_bclink_get_last_sent(void) +{ + return bcl->fsm_msg_cnt; +} + /** * bclink_set_gap - set gap according to contents of current deferred pkt queue * @@ -237,8 +250,10 @@ void tipc_bclink_acknowledge(struct tipc_node *n_ptr, u32 acked) /* Try resolving broadcast link congestion, if necessary */ - if (unlikely(bcl->next_out)) + if (unlikely(bcl->next_out)) { tipc_link_push_queue(bcl); + bclink_set_last_sent(); + } if (unlikely(released && !list_empty(&bcl->waiting_ports))) tipc_link_wakeup_ports(bcl, 0); spin_unlock_bh(&bc_lock); @@ -394,8 +409,10 @@ int tipc_bclink_send_msg(struct sk_buff *buf) res = tipc_link_send_buf(bcl, buf); if (unlikely(res == -ELINKCONG)) buf_discard(buf); - else + else { bcl->stats.sent_info++; + bclink_set_last_sent(); + } if (bcl->out_queue_size > bcl->stats.max_queue_sz) bcl->stats.max_queue_sz = bcl->out_queue_size; @@ -529,15 +546,6 @@ receive: tipc_node_unlock(node); } -u32 tipc_bclink_get_last_sent(void) -{ - u32 last_sent = mod(bcl->next_out_no - 1); - - if (bcl->next_out) - last_sent = mod(buf_seqno(bcl->next_out) - 1); - return last_sent; -} - u32 tipc_bclink_acks_missing(struct tipc_node *n_ptr) { return (n_ptr->bclink.supported && -- cgit v1.1 From 0048b826afae7c47afdc47c3854707581cafe3d8 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:10 +0000 Subject: tipc: Fix bug in broadcast link transmit statistics computation Modify TIPC's broadcast link so that it counts each piece of a fragmented message individually, rather than as treating the group as a single message. This ensures that proper correlation of sent and received traffic can be done when the broadcast link statistics are displayed, and is consistent with the way fragments are counted by TIPC's unicast links. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/bcast.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index eefdd1a..b11248c 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -409,10 +409,8 @@ int tipc_bclink_send_msg(struct sk_buff *buf) res = tipc_link_send_buf(bcl, buf); if (unlikely(res == -ELINKCONG)) buf_discard(buf); - else { - bcl->stats.sent_info++; + else bclink_set_last_sent(); - } if (bcl->out_queue_size > bcl->stats.max_queue_sz) bcl->stats.max_queue_sz = bcl->out_queue_size; @@ -578,6 +576,7 @@ static int tipc_bcbearer_send(struct sk_buff *buf, msg = buf_msg(buf); msg_set_non_seq(msg, 1); msg_set_mc_netid(msg, tipc_net_id); + bcl->stats.sent_info++; } /* Send buffer over bearers until all targets reached */ -- cgit v1.1 From 96d841b7038b8091af3530a008793f5577337d3a Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:11 +0000 Subject: tipc: Remove per-connection sequence number logic Remove validation of the per-connection sequence numbers on routable connections, since routable connections are not supported by TIPC. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/port.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/tipc/port.c b/net/tipc/port.c index 0737680..ebcbc21 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -588,19 +588,10 @@ void tipc_port_recv_proto_msg(struct sk_buff *buf) if (!p_ptr) { err = TIPC_ERR_NO_PORT; } else if (p_ptr->publ.connected) { - if (port_peernode(p_ptr) != msg_orignode(msg)) + if ((port_peernode(p_ptr) != msg_orignode(msg)) || + (port_peerport(p_ptr) != msg_origport(msg))) { err = TIPC_ERR_NO_PORT; - if (port_peerport(p_ptr) != msg_origport(msg)) - err = TIPC_ERR_NO_PORT; - if (!err && msg_routed(msg)) { - u32 seqno = msg_transp_seqno(msg); - u32 myno = ++p_ptr->last_in_seqno; - if (seqno != myno) { - err = TIPC_ERR_NO_PORT; - abort_buf = port_build_self_abort_msg(p_ptr, err); - } - } - if (msg_type(msg) == CONN_ACK) { + } else if (msg_type(msg) == CONN_ACK) { int wakeup = tipc_port_congested(p_ptr) && p_ptr->publ.congested && p_ptr->wakeup; -- cgit v1.1 From 76ae0d71d839b365faa7fdca0eec85a6d1a20d95 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:12 +0000 Subject: tipc: Optimize tipc_node_has_active_links() Eliminate unnecessary checking for null node pointer and redundant check of second active link array entry. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/node.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/tipc/node.c b/net/tipc/node.c index b634942..9408517 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -237,8 +237,7 @@ void tipc_node_link_down(struct tipc_node *n_ptr, struct link *l_ptr) int tipc_node_has_active_links(struct tipc_node *n_ptr) { - return (n_ptr && - ((n_ptr->active_links[0]) || (n_ptr->active_links[1]))); + return n_ptr->active_links[0] != NULL; } int tipc_node_has_redundant_links(struct tipc_node *n_ptr) -- cgit v1.1 From c2de58140a380172610b6a0f07f975abb2fbb311 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:14 +0000 Subject: tipc: Minor enhancements to name table display format Eliminate printing of dashes after name table column headers (to adhere more closely to the standard format used in tipc-config), and simplify name table display logic using array lookups rather than if-then-else logic. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/name_table.c | 44 ++++++++++++++++---------------------------- 1 file changed, 16 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 8ba7962..d504e49 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -877,7 +877,7 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, u32 index) { char portIdStr[27]; - char *scopeStr; + const char *scope_str[] = {"", " zone", " cluster", " node"}; struct publication *publ = sseq->zone_list; tipc_printf(buf, "%-10u %-10u ", sseq->lower, sseq->upper); @@ -893,15 +893,8 @@ static void subseq_list(struct sub_seq *sseq, struct print_buf *buf, u32 depth, tipc_node(publ->node), publ->ref); tipc_printf(buf, "%-26s ", portIdStr); if (depth > 3) { - if (publ->node != tipc_own_addr) - scopeStr = ""; - else if (publ->scope == TIPC_NODE_SCOPE) - scopeStr = "node"; - else if (publ->scope == TIPC_CLUSTER_SCOPE) - scopeStr = "cluster"; - else - scopeStr = "zone"; - tipc_printf(buf, "%-10u %s", publ->key, scopeStr); + tipc_printf(buf, "%-10u %s", publ->key, + scope_str[publ->scope]); } publ = publ->zone_list_next; @@ -951,24 +944,19 @@ static void nameseq_list(struct name_seq *seq, struct print_buf *buf, u32 depth, static void nametbl_header(struct print_buf *buf, u32 depth) { - tipc_printf(buf, "Type "); - - if (depth > 1) - tipc_printf(buf, "Lower Upper "); - if (depth > 2) - tipc_printf(buf, "Port Identity "); - if (depth > 3) - tipc_printf(buf, "Publication"); - - tipc_printf(buf, "\n-----------"); - - if (depth > 1) - tipc_printf(buf, "--------------------- "); - if (depth > 2) - tipc_printf(buf, "-------------------------- "); - if (depth > 3) - tipc_printf(buf, "------------------"); - + const char *header[] = { + "Type ", + "Lower Upper ", + "Port Identity ", + "Publication Scope" + }; + + int i; + + if (depth > 4) + depth = 4; + for (i = 0; i < depth; i++) + tipc_printf(buf, header[i]); tipc_printf(buf, "\n"); } -- cgit v1.1 From 564e83b51a12b794e3f63a2d872398e1ee21616f Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:15 +0000 Subject: tipc: Allow connect() to wait indefinitely Cause a socket whose TIPC_CONN_TIMEOUT option is zero to wait indefinitely for a response to a connection request using connect(). Previously, specifying a timeout of 0 ms resulted in an immediate timeout, which was inconsistent with the behavior specified by Posix for a socket's receive and send timeout. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/socket.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/tipc/socket.c b/net/tipc/socket.c index 7b81fdd..f7ac94d 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -64,6 +64,7 @@ struct tipc_sock { struct sock sk; struct tipc_port *p; struct tipc_portid peer_name; + long conn_timeout; }; #define tipc_sk(sk) ((struct tipc_sock *)(sk)) @@ -240,9 +241,9 @@ static int tipc_create(struct net *net, struct socket *sock, int protocol, sock->state = state; sock_init_data(sock, sk); - sk->sk_rcvtimeo = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); sk->sk_backlog_rcv = backlog_rcv; tipc_sk(sk)->p = tp_ptr; + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(CONN_TIMEOUT_DEFAULT); spin_unlock_bh(tp_ptr->lock); @@ -1385,6 +1386,7 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, struct msghdr m = {NULL,}; struct sk_buff *buf; struct tipc_msg *msg; + long timeout; int res; lock_sock(sk); @@ -1445,11 +1447,12 @@ static int connect(struct socket *sock, struct sockaddr *dest, int destlen, /* Wait until an 'ACK' or 'RST' arrives, or a timeout occurs */ + timeout = tipc_sk(sk)->conn_timeout; release_sock(sk); res = wait_event_interruptible_timeout(*sk_sleep(sk), (!skb_queue_empty(&sk->sk_receive_queue) || (sock->state != SS_CONNECTING)), - sk->sk_rcvtimeo); + timeout ? timeout : MAX_SCHEDULE_TIMEOUT); lock_sock(sk); if (res > 0) { @@ -1712,7 +1715,7 @@ static int setsockopt(struct socket *sock, res = tipc_set_portunreturnable(tport->ref, value); break; case TIPC_CONN_TIMEOUT: - sk->sk_rcvtimeo = msecs_to_jiffies(value); + tipc_sk(sk)->conn_timeout = msecs_to_jiffies(value); /* no need to set "res", since already 0 at this point */ break; default: @@ -1767,7 +1770,7 @@ static int getsockopt(struct socket *sock, res = tipc_portunreturnable(tport->ref, &value); break; case TIPC_CONN_TIMEOUT: - value = jiffies_to_msecs(sk->sk_rcvtimeo); + value = jiffies_to_msecs(tipc_sk(sk)->conn_timeout); /* no need to set "res", since already 0 at this point */ break; case TIPC_NODE_RECVQ_DEPTH: -- cgit v1.1 From 5a68d5ee000bb784c4856391b4861739c8bbd341 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Tue, 17 Aug 2010 11:00:16 +0000 Subject: tipc: Prevent missing name table entries when link flip-flops rapidly Ensure that TIPC does not re-establish communication with a neighboring node until it has finished updating all data structures containing information about that node to reflect the earlier loss of contact. Previously, it was possible for TIPC to perform its purge of name table entries relating to the node once contact had already been re-established, resulting in the unwanted removal of valid name table entries. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/discover.c | 8 ++++++++ net/tipc/link.c | 11 ++++++++++- net/tipc/node.c | 19 +++++++++++++++++++ net/tipc/node.h | 2 ++ 4 files changed, 39 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/tipc/discover.c b/net/tipc/discover.c index fc1fcf5..f28d1ae 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -203,6 +203,14 @@ void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr) return; } spin_lock_bh(&n_ptr->lock); + + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + spin_unlock_bh(&n_ptr->lock); + return; + } + link = n_ptr->links[b_ptr->identity]; if (!link) { dbg("creating link\n"); diff --git a/net/tipc/link.c b/net/tipc/link.c index 9d18c9b..a6a3102 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -1869,13 +1869,22 @@ void tipc_recv_msg(struct sk_buff *head, struct tipc_bearer *tb_ptr) goto cont; } - /* Locate unicast link endpoint that should handle message */ + /* Locate neighboring node that sent message */ n_ptr = tipc_node_find(msg_prevnode(msg)); if (unlikely(!n_ptr)) goto cont; tipc_node_lock(n_ptr); + /* Don't talk to neighbor during cleanup after last session */ + + if (n_ptr->cleanup_required) { + tipc_node_unlock(n_ptr); + goto cont; + } + + /* Locate unicast link endpoint that should handle message */ + l_ptr = n_ptr->links[b_ptr->identity]; if (unlikely(!l_ptr)) { tipc_node_unlock(n_ptr); diff --git a/net/tipc/node.c b/net/tipc/node.c index 9408517..b702c7b 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -383,6 +383,20 @@ static void node_established_contact(struct tipc_node *n_ptr) tipc_highest_allowed_slave); } +static void node_cleanup_finished(unsigned long node_addr) +{ + struct tipc_node *n_ptr; + + read_lock_bh(&tipc_net_lock); + n_ptr = tipc_node_find(node_addr); + if (n_ptr) { + tipc_node_lock(n_ptr); + n_ptr->cleanup_required = 0; + tipc_node_unlock(n_ptr); + } + read_unlock_bh(&tipc_net_lock); +} + static void node_lost_contact(struct tipc_node *n_ptr) { struct cluster *c_ptr; @@ -457,6 +471,11 @@ static void node_lost_contact(struct tipc_node *n_ptr) tipc_k_signal((Handler)ns->handle_node_down, (unsigned long)ns->usr_handle); } + + /* Prevent re-contact with node until all cleanup is done */ + + n_ptr->cleanup_required = 1; + tipc_k_signal((Handler)node_cleanup_finished, n_ptr->addr); } /** diff --git a/net/tipc/node.h b/net/tipc/node.h index 6f990da..45f3db3 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -52,6 +52,7 @@ * @active_links: pointers to active links to node * @links: pointers to all links to node * @working_links: number of working links to node (both active and standby) + * @cleanup_required: non-zero if cleaning up after a prior loss of contact * @link_cnt: number of links to node * @permit_changeover: non-zero if node has redundant links to this system * @routers: bitmap (used for multicluster communication) @@ -78,6 +79,7 @@ struct tipc_node { struct link *links[MAX_BEARERS]; int link_cnt; int working_links; + int cleanup_required; int permit_changeover; u32 routers[512/32]; int last_router; -- cgit v1.1 From 00093fab980d0a8950a64bdf9e346d0497b9a7e4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 14 Aug 2010 11:09:49 +0000 Subject: net/sched: remove unneeded NULL check There is no need to check "s". nla_data() doesn't return NULL. Also we already dereferenced "s" at this point so it would have oopsed ealier if it were NULL. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/sched/sch_api.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 408eea7..6fb3d41 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -360,7 +360,7 @@ static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt) tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16); } - if (!s || tsize != s->tsize || (!tab && tsize > 0)) + if (tsize != s->tsize || (!tab && tsize > 0)) return ERR_PTR(-EINVAL); spin_lock(&qdisc_stab_lock); -- cgit v1.1 From b3d18f15092a7db2f229cd7bc69fc40eac0774f4 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 16 Aug 2010 06:26:57 +0000 Subject: net/ax25: Use available error codes Error codes are stored in err, but the return value is always 0. Return err instead. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ local idexpression x; constant C; @@ if (...) { ... x = -C ... when != x ( return <+...x...+>; | return NULL; | return; | * return ...; ) } // Signed-off-by: Julia Lawall Acked-by: Ralf Baechle Signed-off-by: David S. Miller --- net/ax25/ax25_route.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ax25/ax25_route.c b/net/ax25/ax25_route.c index 7805945..a169084 100644 --- a/net/ax25/ax25_route.c +++ b/net/ax25/ax25_route.c @@ -412,7 +412,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) { ax25_uid_assoc *user; ax25_route *ax25_rt; - int err; + int err = 0; if ((ax25_rt = ax25_get_route(addr, NULL)) == NULL) return -EHOSTUNREACH; @@ -453,7 +453,7 @@ int ax25_rt_autobind(ax25_cb *ax25, ax25_address *addr) put: ax25_put_route(ax25_rt); - return 0; + return err; } struct sk_buff *ax25_rt_build_path(struct sk_buff *skb, ax25_address *src, -- cgit v1.1 From 49339ccae5ba361b62e829886117dbce4b77194f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Mon, 16 Aug 2010 06:28:19 +0000 Subject: net/ax25: Use available error codes Error codes are stored in err, but the return value is always 0. Return err instead. The semantic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @r@ local idexpression x; constant C; @@ if (...) { ... x = -C ... when != x ( return <+...x...+>; | return NULL; | return; | * return ...; ) } // Signed-off-by: Julia Lawall Acked-by: Ralf Baechle Signed-off-by: David S. Miller --- net/ax25/af_ax25.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index cfdfd7e..26eaebf 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1103,7 +1103,7 @@ done: out: release_sock(sk); - return 0; + return err; } /* -- cgit v1.1 From 2244d07bfa2097cb00600da91c715a8aa547917e Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 17 Aug 2010 08:59:14 +0000 Subject: net: simplify flags for tx timestamping This patch removes the abstraction introduced by the union skb_shared_tx in the shared skb data. The access of the different union elements at several places led to some confusion about accessing the shared tx_flags e.g. in skb_orphan_try(). http://marc.info/?l=linux-netdev&m=128084897415886&w=2 Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/can/raw.c | 4 ++-- net/core/dev.c | 6 +++--- net/core/skbuff.c | 2 +- net/ipv4/icmp.c | 4 ++-- net/ipv4/ip_output.c | 6 +++--- net/ipv4/raw.c | 2 +- net/ipv4/udp.c | 4 ++-- net/packet/af_packet.c | 4 ++-- net/socket.c | 9 ++++----- 9 files changed, 20 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index a10e333..7d77e67 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -647,12 +647,12 @@ static int raw_sendmsg(struct kiocb *iocb, struct socket *sock, err = memcpy_fromiovec(skb_put(skb, size), msg->msg_iov, size); if (err < 0) goto free_skb; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto free_skb; /* to be able to check the received tx sock reference in raw_rcv() */ - skb_tx(skb)->prevent_sk_orphan = 1; + skb_shinfo(skb)->tx_flags |= SKBTX_DRV_NEEDS_SK_REF; skb->dev = dev; skb->sk = sk; diff --git a/net/core/dev.c b/net/core/dev.c index 586a11c..c1dc8a9 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1902,14 +1902,14 @@ static int dev_gso_segment(struct sk_buff *skb) /* * Try to orphan skb early, right before transmission by the device. - * We cannot orphan skb if tx timestamp is requested, since - * drivers need to call skb_tstamp_tx() to send the timestamp. + * We cannot orphan skb if tx timestamp is requested or the sk-reference + * is needed on driver level for other reasons, e.g. see net/can/raw.c */ static inline void skb_orphan_try(struct sk_buff *skb) { struct sock *sk = skb->sk; - if (sk && !skb_tx(skb)->flags) { + if (sk && !skb_shinfo(skb)->tx_flags) { /* skb_tx_hash() wont be able to get sk. * We copy sk_hash into skb->rxhash */ diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 3a2513f..99ef721 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3016,7 +3016,7 @@ void skb_tstamp_tx(struct sk_buff *orig_skb, } else { /* * no hardware time stamps available, - * so keep the skb_shared_tx and only + * so keep the shared tx_flags and only * store software time stamp */ skb->tstamp = ktime_get_real(); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index a0d847c7..96bc7f9 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -379,7 +379,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) inet->tos = ip_hdr(skb)->tos; daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (icmp_param->replyopts.optlen) { ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) @@ -538,7 +538,7 @@ void icmp_send(struct sk_buff *skb_in, int type, int code, __be32 info) inet_sk(sk)->tos = tos; ipc.addr = iph->saddr; ipc.opt = &icmp_param.replyopts; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; { struct flowi fl = { diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 04b6989..e807492 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -953,7 +953,7 @@ alloc_new_skb: else /* only the initial fragment is time stamped */ - ipc->shtx.flags = 0; + ipc->tx_flags = 0; } if (skb == NULL) goto error; @@ -964,7 +964,7 @@ alloc_new_skb: skb->ip_summed = csummode; skb->csum = 0; skb_reserve(skb, hh_len); - *skb_tx(skb) = ipc->shtx; + skb_shinfo(skb)->tx_flags = ipc->tx_flags; /* * Find where to start putting bytes. @@ -1384,7 +1384,7 @@ void ip_send_reply(struct sock *sk, struct sk_buff *skb, struct ip_reply_arg *ar daddr = ipc.addr = rt->rt_src; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (replyopts.opt.optlen) { ipc.opt = &replyopts.opt; diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c index 009a7b2..1f85ef2 100644 --- a/net/ipv4/raw.c +++ b/net/ipv4/raw.c @@ -505,7 +505,7 @@ static int raw_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; ipc.oif = sk->sk_bound_dev_if; if (msg->msg_controllen) { diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 32e0bef..86e757e 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -797,7 +797,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, return -EOPNOTSUPP; ipc.opt = NULL; - ipc.shtx.flags = 0; + ipc.tx_flags = 0; if (up->pending) { /* @@ -845,7 +845,7 @@ int udp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, ipc.addr = inet->inet_saddr; ipc.oif = sk->sk_bound_dev_if; - err = sock_tx_timestamp(msg, sk, &ipc.shtx); + err = sock_tx_timestamp(sk, &ipc.tx_flags); if (err) return err; if (msg->msg_controllen) { diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 9a17f28..3616f27b 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -488,7 +488,7 @@ retry: skb->dev = dev; skb->priority = sk->sk_priority; skb->mark = sk->sk_mark; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto out_unlock; @@ -1209,7 +1209,7 @@ static int packet_snd(struct socket *sock, err = skb_copy_datagram_from_iovec(skb, offset, msg->msg_iov, 0, len); if (err) goto out_free; - err = sock_tx_timestamp(msg, sk, skb_tx(skb)); + err = sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags); if (err < 0) goto out_free; diff --git a/net/socket.c b/net/socket.c index 2270b94..7848d12 100644 --- a/net/socket.c +++ b/net/socket.c @@ -535,14 +535,13 @@ void sock_release(struct socket *sock) } EXPORT_SYMBOL(sock_release); -int sock_tx_timestamp(struct msghdr *msg, struct sock *sk, - union skb_shared_tx *shtx) +int sock_tx_timestamp(struct sock *sk, __u8 *tx_flags) { - shtx->flags = 0; + *tx_flags = 0; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_HARDWARE)) - shtx->hardware = 1; + *tx_flags |= SKBTX_HW_TSTAMP; if (sock_flag(sk, SOCK_TIMESTAMPING_TX_SOFTWARE)) - shtx->software = 1; + *tx_flags |= SKBTX_SW_TSTAMP; return 0; } EXPORT_SYMBOL(sock_tx_timestamp); -- cgit v1.1 From b92840900fb575004cac694e56fd0a43f54dc344 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 18 Aug 2010 10:44:34 +0000 Subject: atm: remove a net_device_stats clear No need to clear device stats in lec_open() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/atm/lec.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/atm/lec.c b/net/atm/lec.c index d98bde1..181d70c 100644 --- a/net/atm/lec.c +++ b/net/atm/lec.c @@ -220,7 +220,6 @@ static unsigned char *get_tr_dst(unsigned char *packet, unsigned char *rdesc) static int lec_open(struct net_device *dev) { netif_start_queue(dev); - memset(&dev->stats, 0, sizeof(struct net_device_stats)); return 0; } -- cgit v1.1 From 2d47b45951af087c1a4439c559309b0bf90a0718 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:00:56 +0000 Subject: net: rps: reset network header before calling skb_get_rxhash() skb_get_rxhash() assumes the network header pointer of the skb is set properly after the commit: commit bfb564e7391340638afe4ad67744a8f3858e7566 Author: Krishna Kumar Date: Wed Aug 4 06:15:52 2010 +0000 core: Factor out flow calculation from get_rps_cpu Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index c1dc8a9..cf87fde 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2372,6 +2372,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (!rxqueue->rps_map && !rxqueue->rps_flow_table) goto done; + skb_reset_network_header(skb); if (!skb_get_rxhash(skb)) goto done; -- cgit v1.1 From dbe5775bbc00116ed5699babfe17c54f32eb34c3 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:01:38 +0000 Subject: net: rps: skip fragment when computing rxhash Fragmented IP packets may have no transfer header, so when computing rxhash, we should skip them. Signed-off-by: Changli Gao Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index cf87fde..7e97e89 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2284,7 +2284,10 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) goto done; ip = (struct iphdr *) skb->data + nhoff; - ip_proto = ip->protocol; + if (ip->frag_off & htons(IP_MF | IP_OFFSET)) + ip_proto = 0; + else + ip_proto = ip->protocol; addr1 = (__force u32) ip->saddr; addr2 = (__force u32) ip->daddr; ihl = ip->ihl; -- cgit v1.1 From 12fcdefb3643607c47f39906a49056cf608bb545 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:04:32 +0000 Subject: net: rps: use proto_ports_offset() to handle the AH message correctly The SPI isn't at the beginning of an AH message. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 23 +++++++---------------- 1 file changed, 7 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7e97e89..da584f5 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2266,7 +2266,7 @@ static inline void ____napi_schedule(struct softnet_data *sd, */ __u32 __skb_get_rxhash(struct sk_buff *skb) { - int nhoff, hash = 0; + int nhoff, hash = 0, poff; struct ipv6hdr *ip6; struct iphdr *ip; u8 ip_proto; @@ -2306,24 +2306,15 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) goto done; } - switch (ip_proto) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - case IPPROTO_AH: - case IPPROTO_SCTP: - case IPPROTO_UDPLITE: - if (pskb_may_pull(skb, (ihl * 4) + 4 + nhoff)) { - ports.v32 = * (__force u32 *) (skb->data + nhoff + - (ihl * 4)); + ports.v32 = 0; + poff = proto_ports_offset(ip_proto); + if (poff >= 0) { + nhoff += ihl * 4 + poff; + if (pskb_may_pull(skb, nhoff + 4)) { + ports.v32 = * (__force u32 *) (skb->data + nhoff); if (ports.v16[1] < ports.v16[0]) swap(ports.v16[0], ports.v16[1]); - break; } - default: - ports.v32 = 0; - break; } /* get a consistent hash (same value on both flow directions) */ -- cgit v1.1 From 78d3307eded853f01c5e9aaa8c0768c2f75825a3 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:05:08 +0000 Subject: net_sched: cls_flow: use proto_ports_offset() to support AH message Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/sched/cls_flow.c | 67 +++++++++++++++++++++++++++++----------------------- 1 file changed, 38 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index e17096e..cd709f1 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -111,44 +111,41 @@ static u32 flow_get_proto(struct sk_buff *skb) } } -static int has_ports(u8 protocol) -{ - switch (protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ESP: - return 1; - default: - return 0; - } -} - static u32 flow_get_proto_src(struct sk_buff *skb) { switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - has_ports(iph->protocol) && - pskb_network_may_pull(skb, iph->ihl * 4 + 2)) - return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4)); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 2 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + poff)); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; - if (!pskb_network_may_pull(skb, sizeof(*iph) + 2)) + if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); - if (has_ports(iph->nexthdr)) - return ntohs(*(__be16 *)&iph[1]); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 2)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff)); + } break; } } @@ -161,24 +158,36 @@ static u32 flow_get_proto_dst(struct sk_buff *skb) switch (skb->protocol) { case htons(ETH_P_IP): { struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ip_hdr(skb); - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - has_ports(iph->protocol) && - pskb_network_may_pull(skb, iph->ihl * 4 + 4)) - return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + 2)); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + iph->ihl * 4 + + 2 + poff)); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; - if (!pskb_network_may_pull(skb, sizeof(*iph) + 4)) + if (!pskb_network_may_pull(skb, sizeof(*iph))) break; iph = ipv6_hdr(skb); - if (has_ports(iph->nexthdr)) - return ntohs(*(__be16 *)((void *)&iph[1] + 2)); + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + poff + 4)) { + iph = ipv6_hdr(skb); + return ntohs(*(__be16 *)((void *)iph + sizeof(*iph) + + poff + 2)); + } break; } } -- cgit v1.1 From 3d04ebb6ab2ac9a3bea7644f0d13cdf65002b870 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 20:34:40 +0000 Subject: netfilter: ipt_CLUSTERIP: use proto_ports_offset() to support AH message Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/ipv4/netfilter/ipt_CLUSTERIP.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_CLUSTERIP.c b/net/ipv4/netfilter/ipt_CLUSTERIP.c index 3a43cf3..1e26a48 100644 --- a/net/ipv4/netfilter/ipt_CLUSTERIP.c +++ b/net/ipv4/netfilter/ipt_CLUSTERIP.c @@ -29,6 +29,7 @@ #include #include #include +#include #define CLUSTERIP_VERSION "0.8" @@ -231,24 +232,22 @@ clusterip_hashfn(const struct sk_buff *skb, { const struct iphdr *iph = ip_hdr(skb); unsigned long hashval; - u_int16_t sport, dport; - const u_int16_t *ports; - - switch (iph->protocol) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - case IPPROTO_ICMP: - ports = (const void *)iph+iph->ihl*4; - sport = ports[0]; - dport = ports[1]; - break; - default: + u_int16_t sport = 0, dport = 0; + int poff; + + poff = proto_ports_offset(iph->protocol); + if (poff >= 0) { + const u_int16_t *ports; + u16 _ports[2]; + + ports = skb_header_pointer(skb, iph->ihl * 4 + poff, 4, _ports); + if (ports) { + sport = ports[0]; + dport = ports[1]; + } + } else { if (net_ratelimit()) pr_info("unknown protocol %u\n", iph->protocol); - sport = dport = 0; } switch (config->hash_mode) { -- cgit v1.1 From aca071c1c1c07bcc0b100b7c58e59790d6be6a69 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:06:39 +0000 Subject: netfilter: xt_hashlimit: use proto_ports_offset() to support AH message Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/netfilter/xt_hashlimit.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_hashlimit.c b/net/netfilter/xt_hashlimit.c index b46a839..9228ee0d 100644 --- a/net/netfilter/xt_hashlimit.c +++ b/net/netfilter/xt_hashlimit.c @@ -448,6 +448,7 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, { __be16 _ports[2], *ports; u8 nexthdr; + int poff; memset(dst, 0, sizeof(*dst)); @@ -492,19 +493,13 @@ hashlimit_init_dst(const struct xt_hashlimit_htable *hinfo, return 0; } - switch (nexthdr) { - case IPPROTO_TCP: - case IPPROTO_UDP: - case IPPROTO_UDPLITE: - case IPPROTO_SCTP: - case IPPROTO_DCCP: - ports = skb_header_pointer(skb, protoff, sizeof(_ports), + poff = proto_ports_offset(nexthdr); + if (poff >= 0) { + ports = skb_header_pointer(skb, protoff + poff, sizeof(_ports), &_ports); - break; - default: + } else { _ports[0] = _ports[1] = 0; ports = _ports; - break; } if (!ports) return -1; -- cgit v1.1 From b9959c2e4460b1df1d113d829180398588bb04b4 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 17 Aug 2010 19:07:35 +0000 Subject: net_sched: sch_sfq: use proto_ports_offset() to support AH message Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/sched/sch_sfq.c | 33 ++++++++++++++++----------------- 1 file changed, 16 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 201cbac..3cf478d 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -123,40 +123,39 @@ static unsigned sfq_hash(struct sfq_sched_data *q, struct sk_buff *skb) case htons(ETH_P_IP): { const struct iphdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) goto err; iph = ip_hdr(skb); h = (__force u32)iph->daddr; h2 = (__force u32)iph->saddr ^ iph->protocol; - if (!(iph->frag_off&htons(IP_MF|IP_OFFSET)) && - (iph->protocol == IPPROTO_TCP || - iph->protocol == IPPROTO_UDP || - iph->protocol == IPPROTO_UDPLITE || - iph->protocol == IPPROTO_SCTP || - iph->protocol == IPPROTO_DCCP || - iph->protocol == IPPROTO_ESP) && - pskb_network_may_pull(skb, iph->ihl * 4 + 4)) - h2 ^= *(((u32*)iph) + iph->ihl); + if (iph->frag_off & htons(IP_MF|IP_OFFSET)) + break; + poff = proto_ports_offset(iph->protocol); + if (poff >= 0 && + pskb_network_may_pull(skb, iph->ihl * 4 + 4 + poff)) { + iph = ip_hdr(skb); + h2 ^= *(u32*)((void *)iph + iph->ihl * 4 + poff); + } break; } case htons(ETH_P_IPV6): { struct ipv6hdr *iph; + int poff; if (!pskb_network_may_pull(skb, sizeof(*iph))) goto err; iph = ipv6_hdr(skb); h = (__force u32)iph->daddr.s6_addr32[3]; h2 = (__force u32)iph->saddr.s6_addr32[3] ^ iph->nexthdr; - if ((iph->nexthdr == IPPROTO_TCP || - iph->nexthdr == IPPROTO_UDP || - iph->nexthdr == IPPROTO_UDPLITE || - iph->nexthdr == IPPROTO_SCTP || - iph->nexthdr == IPPROTO_DCCP || - iph->nexthdr == IPPROTO_ESP) && - pskb_network_may_pull(skb, sizeof(*iph) + 4)) - h2 ^= *(u32*)&iph[1]; + poff = proto_ports_offset(iph->nexthdr); + if (poff >= 0 && + pskb_network_may_pull(skb, sizeof(*iph) + 4 + poff)) { + iph = ipv6_hdr(skb); + h2 ^= *(u32*)((void *)iph + sizeof(*iph) + poff); + } break; } default: -- cgit v1.1 From 49e8ab03ebcacd8e37660ffec20c0c46721a2800 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Aug 2010 06:10:45 +0000 Subject: net: build_ehash_secret() and rt_bind_peer() cleanups Now cmpxchg() is available on all arches, we can use it in build_ehash_secret() and rt_bind_peer() instead of using spinlocks. Signed-off-by: Eric Dumazet CC: Mathieu Desnoyers Signed-off-by: David S. Miller --- net/ipv4/af_inet.c | 8 +++----- net/ipv4/route.c | 9 +-------- 2 files changed, 4 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 6a1100c..f581f77 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -227,18 +227,16 @@ EXPORT_SYMBOL(inet_ehash_secret); /* * inet_ehash_secret must be set exactly once - * Instead of using a dedicated spinlock, we (ab)use inetsw_lock */ void build_ehash_secret(void) { u32 rnd; + do { get_random_bytes(&rnd, sizeof(rnd)); } while (rnd == 0); - spin_lock_bh(&inetsw_lock); - if (!inet_ehash_secret) - inet_ehash_secret = rnd; - spin_unlock_bh(&inetsw_lock); + + cmpxchg(&inet_ehash_secret, 0, rnd); } EXPORT_SYMBOL(build_ehash_secret); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3f56b6e..85a67c9 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1268,18 +1268,11 @@ skip_hashing: void rt_bind_peer(struct rtable *rt, int create) { - static DEFINE_SPINLOCK(rt_peer_lock); struct inet_peer *peer; peer = inet_getpeer(rt->rt_dst, create); - spin_lock_bh(&rt_peer_lock); - if (rt->peer == NULL) { - rt->peer = peer; - peer = NULL; - } - spin_unlock_bh(&rt_peer_lock); - if (peer) + if (peer && cmpxchg(&rt->peer, NULL, peer) != NULL) inet_putpeer(peer); } -- cgit v1.1 From eb4d40654505e47aa9d2035bb97f631fa61d14b4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Gr=C3=A9goire=20Baron?= Date: Wed, 18 Aug 2010 13:10:35 +0000 Subject: net/sched: add ACT_CSUM action to update packets checksums net/sched: add ACT_CSUM action to update packets checksums ACT_CSUM can be called just after ACT_PEDIT in order to re-compute some altered checksums in IPv4 and IPv6 packets. The following checksums are supported by this patch: - IPv4: IPv4 header, ICMP, IGMP, TCP, UDP & UDPLite - IPv6: ICMPv6, TCP, UDP & UDPLite It's possible to request in the same action to update different kind of checksums, if the packets flow mix TCP, UDP and UDPLite, ... An example of usage is done in the associated iproute2 patch. Version 3 changes: - remove useless goto instructions - improve IPv6 hop options decoding Version 2 changes: - coding style correction - remove useless arguments of some functions - use stack in tcf_csum_dump() - add tcf_csum_skb_nextlayer() to factor code Signed-off-by: Gregoire Baron Acked-by: jamal Signed-off-by: David S. Miller --- net/sched/Kconfig | 10 + net/sched/Makefile | 1 + net/sched/act_csum.c | 595 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 606 insertions(+) create mode 100644 net/sched/act_csum.c (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 2f691fb..522d5a9 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -518,6 +518,16 @@ config NET_ACT_SKBEDIT To compile this code as a module, choose M here: the module will be called act_skbedit. +config NET_ACT_CSUM + tristate "Checksum Updating" + depends on NET_CLS_ACT + ---help--- + Say Y here to update some common checksum after some direct + packet alterations. + + To compile this code as a module, choose M here: the + module will be called act_csum. + config NET_CLS_IND bool "Incoming device classification" depends on NET_CLS_U32 || NET_CLS_FW diff --git a/net/sched/Makefile b/net/sched/Makefile index f14e71b..960f5db 100644 --- a/net/sched/Makefile +++ b/net/sched/Makefile @@ -15,6 +15,7 @@ obj-$(CONFIG_NET_ACT_NAT) += act_nat.o obj-$(CONFIG_NET_ACT_PEDIT) += act_pedit.o obj-$(CONFIG_NET_ACT_SIMP) += act_simple.o obj-$(CONFIG_NET_ACT_SKBEDIT) += act_skbedit.o +obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c new file mode 100644 index 0000000..58d7f36 --- /dev/null +++ b/net/sched/act_csum.c @@ -0,0 +1,595 @@ +/* + * Checksum updating actions + * + * Copyright (c) 2010 Gregoire Baron + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the Free + * Software Foundation; either version 2 of the License, or (at your option) + * any later version. + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +#define CSUM_TAB_MASK 15 +static struct tcf_common *tcf_csum_ht[CSUM_TAB_MASK + 1]; +static u32 csum_idx_gen; +static DEFINE_RWLOCK(csum_lock); + +static struct tcf_hashinfo csum_hash_info = { + .htab = tcf_csum_ht, + .hmask = CSUM_TAB_MASK, + .lock = &csum_lock, +}; + +static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { + [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, +}; + +static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, + struct tc_action *a, int ovr, int bind) +{ + struct nlattr *tb[TCA_CSUM_MAX + 1]; + struct tc_csum *parm; + struct tcf_common *pc; + struct tcf_csum *p; + int ret = 0, err; + + if (nla == NULL) + return -EINVAL; + + err = nla_parse_nested(tb, TCA_CSUM_MAX, nla,csum_policy); + if (err < 0) + return err; + + if (tb[TCA_CSUM_PARMS] == NULL) + return -EINVAL; + parm = nla_data(tb[TCA_CSUM_PARMS]); + + pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); + if (!pc) { + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, &csum_idx_gen, &csum_hash_info); + if (IS_ERR(pc)) + return PTR_ERR(pc); + p = to_tcf_csum(pc); + ret = ACT_P_CREATED; + } else { + p = to_tcf_csum(pc); + if (!ovr) { + tcf_hash_release(pc, bind, &csum_hash_info); + return -EEXIST; + } + } + + spin_lock_bh(&p->tcf_lock); + p->tcf_action = parm->action; + p->update_flags = parm->update_flags; + spin_unlock_bh(&p->tcf_lock); + + if (ret == ACT_P_CREATED) + tcf_hash_insert(pc, &csum_hash_info); + + return ret; +} + +static int tcf_csum_cleanup(struct tc_action *a, int bind) +{ + struct tcf_csum *p = a->priv; + return tcf_hash_release(&p->common, bind, &csum_hash_info); +} + +/** + * tcf_csum_skb_nextlayer - Get next layer pointer + * @skb: sk_buff to use + * @ihl: previous summed headers length + * @ipl: complete packet length + * @jhl: next header length + * + * Check the expected next layer availability in the specified sk_buff. + * Return the next layer pointer if pass, NULL otherwise. + */ +static void *tcf_csum_skb_nextlayer(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl, + unsigned int jhl) +{ + int ntkoff = skb_network_offset(skb); + int hl = ihl + jhl; + + if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) || + (skb_cloned(skb) && + !skb_clone_writable(skb, hl + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC))) + return NULL; + else + return (void *)(skb_network_header(skb) + ihl); +} + +static int tcf_csum_ipv4_icmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct icmphdr *icmph; + + icmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmph)); + if (icmph == NULL) + return 0; + + icmph->checksum = 0; + skb->csum = csum_partial(icmph, ipl - ihl, 0); + icmph->checksum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_igmp(struct sk_buff *skb, + unsigned int ihl, unsigned int ipl) +{ + struct igmphdr *igmph; + + igmph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*igmph)); + if (igmph == NULL) + return 0; + + igmph->csum = 0; + skb->csum = csum_partial(igmph, ipl - ihl, 0); + igmph->csum = csum_fold(skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_icmp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct icmp6hdr *icmp6h; + + icmp6h = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*icmp6h)); + if (icmp6h == NULL) + return 0; + + icmp6h->icmp6_cksum = 0; + skb->csum = csum_partial(icmp6h, ipl - ihl, 0); + icmp6h->icmp6_cksum = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_ICMPV6, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_tcp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = tcp_v4_check(ipl - ihl, + iph->saddr, iph->daddr, skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv6_tcp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl) +{ + struct tcphdr *tcph; + + tcph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*tcph)); + if (tcph == NULL) + return 0; + + tcph->check = 0; + skb->csum = csum_partial(tcph, ipl - ihl, 0); + tcph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, + ipl - ihl, IPPROTO_TCP, + skb->csum); + + skb->ip_summed = CHECKSUM_NONE; + + return 1; +} + +static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* Support both UDP and UDPLITE checksum algorithms, + * Don't use udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use iph->tot_len, or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + if (udplite || udph->check) { + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, + ul, iph->protocol, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + } + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, + unsigned int ihl, unsigned int ipl, int udplite) +{ + struct udphdr *udph; + u16 ul; + + /* Support both UDP and UDPLITE checksum algorithms, + * Don't use udph->len to get the real length without any protocol check, + * UDPLITE uses udph->len for another thing, + * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. + */ + + udph = tcf_csum_skb_nextlayer(skb, ihl, ipl, sizeof(*udph)); + if (udph == NULL) + return 0; + + ul = ntohs(udph->len); + + udph->check = 0; + + if (udplite) { + if (ul == 0) + skb->csum = csum_partial(udph, ipl - ihl, 0); + + else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) + skb->csum = csum_partial(udph, ul, 0); + + else + goto ignore_obscure_skb; + } else { + if (ul != ipl - ihl) + goto ignore_obscure_skb; + + skb->csum = csum_partial(udph, ul, 0); + } + + udph->check = csum_ipv6_magic(&ip6h->saddr, &ip6h->daddr, ul, + udplite ? IPPROTO_UDPLITE : IPPROTO_UDP, + skb->csum); + + if (!udph->check) + udph->check = CSUM_MANGLED_0; + + skb->ip_summed = CHECKSUM_NONE; + +ignore_obscure_skb: + return 1; +} + +static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) +{ + struct iphdr *iph; + int ntkoff; + + ntkoff = skb_network_offset(skb); + + if (!pskb_may_pull(skb, sizeof(*iph) + ntkoff)) + goto fail; + + iph = ip_hdr(skb); + + switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) { + case IPPROTO_ICMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv4_icmp(skb, + iph->ihl * 4, ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_IGMP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) + if (!tcf_csum_ipv4_igmp(skb, + iph->ihl * 4, ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv4_tcp(skb, iph, + iph->ihl * 4, ntohs(iph->tot_len))) + goto fail; + break; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv4_udp(skb, iph, + iph->ihl * 4, ntohs(iph->tot_len), 0)) + goto fail; + break; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv4_udp(skb, iph, + iph->ihl * 4, ntohs(iph->tot_len), 1)) + goto fail; + break; + } + + if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) { + if (skb_cloned(skb) && + !skb_clone_writable(skb, sizeof(*iph) + ntkoff) && + pskb_expand_head(skb, 0, 0, GFP_ATOMIC)) + goto fail; + + ip_send_check(iph); + } + + return 1; + +fail: + return 0; +} + +static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, + unsigned int ixhl, unsigned int *pl) +{ + int off, len, optlen; + unsigned char *xh = (void *)ip6xh; + + off = sizeof(*ip6xh); + len = ixhl - off; + + while (len > 1) { + switch (xh[off]) + { + case IPV6_TLV_PAD0: + optlen = 1; + break; + case IPV6_TLV_JUMBO: + optlen = xh[off + 1] + 2; + if (optlen != 6 || len < 6 || (off & 3) != 2) + /* wrong jumbo option length/alignment */ + return 0; + *pl = ntohl(*(__be32 *)(xh + off + 2)); + goto done; + default: + optlen = xh[off + 1] + 2; + if (optlen > len) + /* ignore obscure options */ + goto done; + break; + } + off += optlen; + len -= optlen; + } + +done: + return 1; +} + +static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) +{ + struct ipv6hdr *ip6h; + struct ipv6_opt_hdr *ip6xh; + unsigned int hl, ixhl; + unsigned int pl; + int ntkoff; + u8 nexthdr; + + ntkoff = skb_network_offset(skb); + + hl = sizeof(*ip6h); + + if (!pskb_may_pull(skb, hl + ntkoff)) + goto fail; + + ip6h = ipv6_hdr(skb); + + pl = ntohs(ip6h->payload_len); + nexthdr = ip6h->nexthdr; + + do { + switch (nexthdr) { + case NEXTHDR_FRAGMENT: + goto ignore_skb; + case NEXTHDR_ROUTING: + case NEXTHDR_HOP: + case NEXTHDR_DEST: + if (!pskb_may_pull(skb, hl + sizeof(*ip6xh) + ntkoff)) + goto fail; + ip6xh = (void *)(skb_network_header(skb) + hl); + ixhl = ipv6_optlen(ip6xh); + if (!pskb_may_pull(skb, hl + ixhl + ntkoff)) + goto fail; + if ((nexthdr == NEXTHDR_HOP) && + !(tcf_csum_ipv6_hopopts(ip6xh, ixhl, &pl))) + goto fail; + nexthdr = ip6xh->nexthdr; + hl += ixhl; + break; + case IPPROTO_ICMPV6: + if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) + if (!tcf_csum_ipv6_icmp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_TCP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) + if (!tcf_csum_ipv6_tcp(skb, ip6h, + hl, pl + sizeof(*ip6h))) + goto fail; + goto done; + case IPPROTO_UDP: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) + if (!tcf_csum_ipv6_udp(skb, ip6h, + hl, pl + sizeof(*ip6h), 0)) + goto fail; + goto done; + case IPPROTO_UDPLITE: + if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) + if (!tcf_csum_ipv6_udp(skb, ip6h, + hl, pl + sizeof(*ip6h), 1)) + goto fail; + goto done; + default: + goto ignore_skb; + } + } while (pskb_may_pull(skb, hl + 1 + ntkoff)); + +done: +ignore_skb: + return 1; + +fail: + return 0; +} + +static int tcf_csum(struct sk_buff *skb, + struct tc_action *a, struct tcf_result *res) +{ + struct tcf_csum *p = a->priv; + int action; + u32 update_flags; + + spin_lock(&p->tcf_lock); + p->tcf_tm.lastuse = jiffies; + p->tcf_bstats.bytes += qdisc_pkt_len(skb); + p->tcf_bstats.packets++; + action = p->tcf_action; + update_flags = p->update_flags; + spin_unlock(&p->tcf_lock); + + if (unlikely(action == TC_ACT_SHOT)) + goto drop; + + switch (skb->protocol) { + case cpu_to_be16(ETH_P_IP): + if (!tcf_csum_ipv4(skb, update_flags)) + goto drop; + break; + case cpu_to_be16(ETH_P_IPV6): + if (!tcf_csum_ipv6(skb, update_flags)) + goto drop; + break; + } + + return action; + +drop: + spin_lock(&p->tcf_lock); + p->tcf_qstats.drops++; + spin_unlock(&p->tcf_lock); + return TC_ACT_SHOT; +} + +static int tcf_csum_dump(struct sk_buff *skb, + struct tc_action *a, int bind, int ref) +{ + unsigned char *b = skb_tail_pointer(skb); + struct tcf_csum *p = a->priv; + struct tc_csum opt = { + .update_flags = p->update_flags, + + .index = p->tcf_index, + .action = p->tcf_action, + .refcnt = p->tcf_refcnt - ref, + .bindcnt = p->tcf_bindcnt - bind, + }; + struct tcf_t t; + + NLA_PUT(skb, TCA_CSUM_PARMS, sizeof(opt), &opt); + t.install = jiffies_to_clock_t(jiffies - p->tcf_tm.install); + t.lastuse = jiffies_to_clock_t(jiffies - p->tcf_tm.lastuse); + t.expires = jiffies_to_clock_t(p->tcf_tm.expires); + NLA_PUT(skb, TCA_CSUM_TM, sizeof(t), &t); + + return skb->len; + +nla_put_failure: + nlmsg_trim(skb, b); + return -1; +} + +static struct tc_action_ops act_csum_ops = { + .kind = "csum", + .hinfo = &csum_hash_info, + .type = TCA_ACT_CSUM, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .cleanup = tcf_csum_cleanup, + .lookup = tcf_hash_search, + .init = tcf_csum_init, + .walk = tcf_generic_walker +}; + +MODULE_DESCRIPTION("Checksum updating actions"); +MODULE_LICENSE("GPL"); + +static int __init csum_init_module(void) +{ + return tcf_register_action(&act_csum_ops); +} + +static void __exit csum_cleanup_module(void) +{ + tcf_unregister_action(&act_csum_ops); +} + +module_init(csum_init_module); +module_exit(csum_cleanup_module); -- cgit v1.1 From 1003489e06c04d807c783a8958f2ccc9aed7a244 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 21 Aug 2010 06:13:28 +0000 Subject: net: rps: fix the wrong network header pointer __skb_get_rxhash() was broken after the commit: commit bfb564e7391340638afe4ad67744a8f3858e7566 Author: Krishna Kumar Date: Wed Aug 4 06:15:52 2010 +0000 core: Factor out flow calculation from get_rps_cpu Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index da584f5..4d74d26 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2283,7 +2283,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*ip) + nhoff)) goto done; - ip = (struct iphdr *) skb->data + nhoff; + ip = (struct iphdr *) (skb->data + nhoff); if (ip->frag_off & htons(IP_MF | IP_OFFSET)) ip_proto = 0; else @@ -2296,7 +2296,7 @@ __u32 __skb_get_rxhash(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff)) goto done; - ip6 = (struct ipv6hdr *) skb->data + nhoff; + ip6 = (struct ipv6hdr *) (skb->data + nhoff); ip_proto = ip6->nexthdr; addr1 = (__force u32) ip6->saddr.s6_addr32[3]; addr2 = (__force u32) ip6->daddr.s6_addr32[3]; -- cgit v1.1 From 00959ade36acadc00e757f87060bf6e4501d545f Mon Sep 17 00:00:00 2001 From: Dmitry Kozlov Date: Sat, 21 Aug 2010 23:05:39 -0700 Subject: PPTP: PPP over IPv4 (Point-to-Point Tunneling Protocol) PPP: introduce "pptp" module which implements point-to-point tunneling protocol using pppox framework NET: introduce the "gre" module for demultiplexing GRE packets on version criteria (required to pptp and ip_gre may coexists) NET: ip_gre: update to use the "gre" module This patch introduces then pptp support to the linux kernel which dramatically speeds up pptp vpn connections and decreases cpu usage in comparison of existing user-space implementation (poptop/pptpclient). There is accel-pptp project (https://sourceforge.net/projects/accel-pptp/) to utilize this module, it contains plugin for pppd to use pptp in client-mode and modified pptpd (poptop) to build high-performance pptp NAS. There was many changes from initial submitted patch, most important are: 1. using rcu instead of read-write locks 2. using static bitmap instead of dynamically allocated 3. using vmalloc for memory allocation instead of BITS_PER_LONG + __get_free_pages 4. fixed many coding style issues Thanks to Eric Dumazet. Signed-off-by: Dmitry Kozlov Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 7 +++ net/ipv4/Makefile | 1 + net/ipv4/gre.c | 151 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ net/ipv4/ip_gre.c | 14 ++--- 4 files changed, 166 insertions(+), 7 deletions(-) create mode 100644 net/ipv4/gre.c (limited to 'net') diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index 7c3a7d1..7458bda 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -215,8 +215,15 @@ config NET_IPIP be inserted in and removed from the running kernel whenever you want). Most people won't need this and can say N. +config NET_IPGRE_DEMUX + tristate "IP: GRE demultiplexer" + help + This is helper module to demultiplex GRE packets on GRE version field criteria. + Required by ip_gre and pptp modules. + config NET_IPGRE tristate "IP: GRE tunnels over IP" + depends on NET_IPGRE_DEMUX help Tunneling means encapsulating data of one protocol type within another protocol and sending it over a channel that understands the diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index 80ff87c..4978d22 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -20,6 +20,7 @@ obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_IP_MULTIPLE_TABLES) += fib_rules.o obj-$(CONFIG_IP_MROUTE) += ipmr.o obj-$(CONFIG_NET_IPIP) += ipip.o +obj-$(CONFIG_NET_IPGRE_DEMUX) += gre.o obj-$(CONFIG_NET_IPGRE) += ip_gre.o obj-$(CONFIG_SYN_COOKIES) += syncookies.o obj-$(CONFIG_INET_AH) += ah4.o diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c new file mode 100644 index 0000000..b546736d --- /dev/null +++ b/net/ipv4/gre.c @@ -0,0 +1,151 @@ +/* + * GRE over IPv4 demultiplexer driver + * + * Authors: Dmitry Kozlov (xeb@mail.ru) + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static DEFINE_SPINLOCK(gre_proto_lock); + +int gre_add_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version]) + goto err_out_unlock; + + rcu_assign_pointer(gre_proto[version], proto); + spin_unlock(&gre_proto_lock); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_add_protocol); + +int gre_del_protocol(const struct gre_protocol *proto, u8 version) +{ + if (version >= GREPROTO_MAX) + goto err_out; + + spin_lock(&gre_proto_lock); + if (gre_proto[version] != proto) + goto err_out_unlock; + rcu_assign_pointer(gre_proto[version], NULL); + spin_unlock(&gre_proto_lock); + synchronize_rcu(); + return 0; + +err_out_unlock: + spin_unlock(&gre_proto_lock); +err_out: + return -1; +} +EXPORT_SYMBOL_GPL(gre_del_protocol); + +static int gre_rcv(struct sk_buff *skb) +{ + const struct gre_protocol *proto; + u8 ver; + int ret; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->handler) + goto drop_unlock; + ret = proto->handler(skb); + rcu_read_unlock(); + return ret; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); + return NET_RX_DROP; +} + +static void gre_err(struct sk_buff *skb, u32 info) +{ + const struct gre_protocol *proto; + u8 ver; + + if (!pskb_may_pull(skb, 12)) + goto drop; + + ver = skb->data[1]&0x7f; + if (ver >= GREPROTO_MAX) + goto drop; + + rcu_read_lock(); + proto = rcu_dereference(gre_proto[ver]); + if (!proto || !proto->err_handler) + goto drop_unlock; + proto->err_handler(skb, info); + rcu_read_unlock(); + return; + +drop_unlock: + rcu_read_unlock(); +drop: + kfree_skb(skb); +} + +static const struct net_protocol net_gre_protocol = { + .handler = gre_rcv, + .err_handler = gre_err, + .netns_ok = 1, +}; + +static int __init gre_init(void) +{ + pr_info("GRE over IPv4 demultiplexor driver"); + + if (inet_add_protocol(&net_gre_protocol, IPPROTO_GRE) < 0) { + pr_err("gre: can't add protocol\n"); + return -EAGAIN; + } + + return 0; +} + +static void __exit gre_exit(void) +{ + inet_del_protocol(&net_gre_protocol, IPPROTO_GRE); +} + +module_init(gre_init); +module_exit(gre_exit); + +MODULE_DESCRIPTION("GRE over IPv4 demultiplexer driver"); +MODULE_AUTHOR("D. Kozlov (xeb@mail.ru)"); +MODULE_LICENSE("GPL"); + diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 945b20a..8517689 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -44,6 +44,7 @@ #include #include #include +#include #ifdef CONFIG_IPV6 #include @@ -1278,10 +1279,9 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) } -static const struct net_protocol ipgre_protocol = { - .handler = ipgre_rcv, - .err_handler = ipgre_err, - .netns_ok = 1, +static const struct gre_protocol ipgre_protocol = { + .handler = ipgre_rcv, + .err_handler = ipgre_err, }; static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) @@ -1663,7 +1663,7 @@ static int __init ipgre_init(void) if (err < 0) return err; - err = inet_add_protocol(&ipgre_protocol, IPPROTO_GRE); + err = gre_add_protocol(&ipgre_protocol, GREPROTO_CISCO); if (err < 0) { printk(KERN_INFO "ipgre init: can't add protocol\n"); goto add_proto_failed; @@ -1683,7 +1683,7 @@ out: tap_ops_failed: rtnl_link_unregister(&ipgre_link_ops); rtnl_link_failed: - inet_del_protocol(&ipgre_protocol, IPPROTO_GRE); + gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO); add_proto_failed: unregister_pernet_device(&ipgre_net_ops); goto out; @@ -1693,7 +1693,7 @@ static void __exit ipgre_fini(void) { rtnl_link_unregister(&ipgre_tap_ops); rtnl_link_unregister(&ipgre_link_ops); - if (inet_del_protocol(&ipgre_protocol, IPPROTO_GRE) < 0) + if (gre_del_protocol(&ipgre_protocol, GREPROTO_CISCO) < 0) printk(KERN_INFO "ipgre close: can't remove protocol\n"); unregister_pernet_device(&ipgre_net_ops); } -- cgit v1.1 From 81ce790bd75d49a0d119f5d7b27405e1d9b1bd57 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 19 Aug 2010 23:51:33 +0000 Subject: irda: use net_device_stats from struct net_device struct net_device has its own struct net_device_stats member, so use this one instead of a private copy in the irlan_cb struct. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/irda/irlan/irlan_eth.c | 32 +++++++++----------------------- 1 file changed, 9 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/irda/irlan/irlan_eth.c b/net/irda/irlan/irlan_eth.c index 5bb8353..8ee1ff6 100644 --- a/net/irda/irlan/irlan_eth.c +++ b/net/irda/irlan/irlan_eth.c @@ -45,13 +45,11 @@ static int irlan_eth_close(struct net_device *dev); static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, struct net_device *dev); static void irlan_eth_set_multicast_list( struct net_device *dev); -static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev); static const struct net_device_ops irlan_eth_netdev_ops = { .ndo_open = irlan_eth_open, .ndo_stop = irlan_eth_close, .ndo_start_xmit = irlan_eth_xmit, - .ndo_get_stats = irlan_eth_get_stats, .ndo_set_multicast_list = irlan_eth_set_multicast_list, .ndo_change_mtu = eth_change_mtu, .ndo_validate_addr = eth_validate_addr, @@ -208,10 +206,10 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, * tried :-) DB */ /* irttp_data_request already free the packet */ - self->stats.tx_dropped++; + dev->stats.tx_dropped++; } else { - self->stats.tx_packets++; - self->stats.tx_bytes += len; + dev->stats.tx_packets++; + dev->stats.tx_bytes += len; } return NETDEV_TX_OK; @@ -226,15 +224,16 @@ static netdev_tx_t irlan_eth_xmit(struct sk_buff *skb, int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) { struct irlan_cb *self = instance; + struct net_device *dev = self->dev; if (skb == NULL) { - ++self->stats.rx_dropped; + dev->stats.rx_dropped++; return 0; } if (skb->len < ETH_HLEN) { IRDA_DEBUG(0, "%s() : IrLAN frame too short (%d)\n", __func__, skb->len); - ++self->stats.rx_dropped; + dev->stats.rx_dropped++; dev_kfree_skb(skb); return 0; } @@ -244,10 +243,10 @@ int irlan_eth_receive(void *instance, void *sap, struct sk_buff *skb) * might have been previously set by the low level IrDA network * device driver */ - skb->protocol = eth_type_trans(skb, self->dev); /* Remove eth header */ + skb->protocol = eth_type_trans(skb, dev); /* Remove eth header */ - self->stats.rx_packets++; - self->stats.rx_bytes += skb->len; + dev->stats.rx_packets++; + dev->stats.rx_bytes += skb->len; netif_rx(skb); /* Eat it! */ @@ -348,16 +347,3 @@ static void irlan_eth_set_multicast_list(struct net_device *dev) else irlan_set_broadcast_filter(self, FALSE); } - -/* - * Function irlan_get_stats (dev) - * - * Get the current statistics for this device - * - */ -static struct net_device_stats *irlan_eth_get_stats(struct net_device *dev) -{ - struct irlan_cb *self = netdev_priv(dev); - - return &self->stats; -} -- cgit v1.1 From 739a91ef0625e0e4a40b835f4f891313c47915df Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 21 Aug 2010 06:23:15 +0000 Subject: net_sched: cls_flow: add key rxhash We can use rxhash to classify the traffic into flows. As rxhash maybe supplied by NIC or RPS, it is cheaper. Signed-off-by: Changli Gao Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/cls_flow.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'net') diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index cd709f1..5b271a1 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -306,6 +306,11 @@ static u32 flow_get_vlan_tag(const struct sk_buff *skb) return tag & VLAN_VID_MASK; } +static u32 flow_get_rxhash(struct sk_buff *skb) +{ + return skb_get_rxhash(skb); +} + static u32 flow_key_get(struct sk_buff *skb, int key) { switch (key) { @@ -343,6 +348,8 @@ static u32 flow_key_get(struct sk_buff *skb, int key) return flow_get_skgid(skb); case FLOW_KEY_VLAN_TAG: return flow_get_vlan_tag(skb); + case FLOW_KEY_RXHASH: + return flow_get_rxhash(skb); default: WARN_ON(1); return 0; -- cgit v1.1 From 2436243a39de56f03d38c74139261cc61bea8456 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sun, 22 Aug 2010 20:31:14 -0700 Subject: net/sched: need to include net/ip6_checksum.h for the declararion of csum_ipv6_magic. Fixes this build error on PowerPC (at least): net/sched/act_csum.c: In function 'tcf_csum_ipv6_icmp': net/sched/act_csum.c:178: error: implicit declaration of function 'csum_ipv6_magic' Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/sched/act_csum.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 58d7f36..be41f1c 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -29,6 +29,7 @@ #include #include #include +#include #include -- cgit v1.1 From 05532121da0728eaedac2a0a5c3cecad3a95d765 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sun, 22 Aug 2010 21:03:33 -0700 Subject: net: 802.1q: make vlan_hwaccel_do_receive() return void vlan_hwaccel_do_receive() always returns 0, so make it return void. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 3 +-- net/core/dev.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 01ddb04..07eeb5b 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -35,7 +35,7 @@ drop: } EXPORT_SYMBOL(__vlan_hwaccel_rx); -int vlan_hwaccel_do_receive(struct sk_buff *skb) +void vlan_hwaccel_do_receive(struct sk_buff *skb) { struct net_device *dev = skb->dev; struct vlan_rx_stats *rx_stats; @@ -69,7 +69,6 @@ int vlan_hwaccel_do_receive(struct sk_buff *skb) break; } u64_stats_update_end(&rx_stats->syncp); - return 0; } struct net_device *vlan_dev_real_dev(const struct net_device *dev) diff --git a/net/core/dev.c b/net/core/dev.c index 7cd5237..d569f88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2841,8 +2841,8 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); - if (vlan_tx_tag_present(skb) && vlan_hwaccel_do_receive(skb)) - return NET_RX_SUCCESS; + if (vlan_tx_tag_present(skb)) + vlan_hwaccel_do_receive(skb); /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) -- cgit v1.1 From c2368e795cec561229ef66a04ac51629b918a9e8 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 17:35:32 +0000 Subject: bridge: is PACKET_LOOPBACK unlikely()? While looking at using netdev_rx_handler_register for openvswitch Jesse Gross suggested that an unlikely() might be worthwhile in that code. I'm interested to see if its appropriate for the bridge code. Cc: Jesse Gross Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/bridge/br_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 826cd52..6d04cfd 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -141,7 +141,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb) const unsigned char *dest = eth_hdr(skb)->h_dest; int (*rhook)(struct sk_buff *skb); - if (skb->pkt_type == PACKET_LOOPBACK) + if (unlikely(skb->pkt_type == PACKET_LOOPBACK)) return skb; if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) -- cgit v1.1 From e88c64f0a42575e01c7ace903d0570bc0b7fcf85 Mon Sep 17 00:00:00 2001 From: Hagen Paul Pfeifer Date: Thu, 19 Aug 2010 06:33:05 +0000 Subject: tcp: allow effective reduction of TCP's rcv-buffer via setsockopt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Via setsockopt it is possible to reduce the socket RX buffer (SO_RCVBUF). TCP method to select the initial window and window scaling option in tcp_select_initial_window() currently misbehaves and do not consider a reduced RX socket buffer via setsockopt. Even though the server's RX buffer is reduced via setsockopt() to 256 byte (Initial Window 384 byte => 256 * 2 - (256 * 2 / 4)) the window scale option is still 7: 192.168.1.38.40676 > 78.47.222.210.5001: Flags [S], seq 2577214362, win 5840, options [mss 1460,sackOK,TS val 338417 ecr 0,nop,wscale 0], length 0 78.47.222.210.5001 > 192.168.1.38.40676: Flags [S.], seq 1570631029, ack 2577214363, win 384, options [mss 1452,sackOK,TS val 2435248895 ecr 338417,nop,wscale 7], length 0 192.168.1.38.40676 > 78.47.222.210.5001: Flags [.], ack 1, win 5840, options [nop,nop,TS val 338421 ecr 2435248895], length 0 Within tcp_select_initial_window() the original space argument - a representation of the rx buffer size - is expanded during tcp_select_initial_window(). Only sysctl_tcp_rmem[2], sysctl_rmem_max and window_clamp are considered to calculate the initial window. This patch adjust the window_clamp argument if the user explicitly reduce the receive buffer. Signed-off-by: Hagen Paul Pfeifer Cc: David S. Miller Cc: Patrick McHardy Cc: Eric Dumazet Cc: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index de3bd84..01b94b8 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -2429,6 +2429,12 @@ struct sk_buff *tcp_make_synack(struct sock *sk, struct dst_entry *dst, __u8 rcv_wscale; /* Set this up on the first call only */ req->window_clamp = tp->window_clamp ? : dst_metric(dst, RTAX_WINDOW); + + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (req->window_clamp > tcp_full_space(sk) || req->window_clamp == 0)) + req->window_clamp = tcp_full_space(sk); + /* tcp_full_space because it is guaranteed to be the first packet */ tcp_select_initial_window(tcp_full_space(sk), mss - (ireq->tstamp_ok ? TCPOLEN_TSTAMP_ALIGNED : 0), @@ -2555,6 +2561,11 @@ static void tcp_connect_init(struct sock *sk) tcp_initialize_rcv_mss(sk); + /* limit the window selection if the user enforce a smaller rx buffer */ + if (sk->sk_userlocks & SOCK_RCVBUF_LOCK && + (tp->window_clamp > tcp_full_space(sk) || tp->window_clamp == 0)) + tp->window_clamp = tcp_full_space(sk); + tcp_select_initial_window(tcp_full_space(sk), tp->advmss - (tp->rx_opt.ts_recent_stamp ? tp->tcp_header_len - sizeof(struct tcphdr) : 0), &tp->rcv_wnd, -- cgit v1.1 From 21dc330157454046dd7c494961277d76e1c957fe Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Aug 2010 00:13:46 -0700 Subject: net: Rename skb_has_frags to skb_has_frag_list SKBs can be "fragmented" in two ways, via a page array (called skb_shinfo(skb)->frags[]) and via a list of SKBs (called skb_shinfo(skb)->frag_list). Since skb_has_frags() tests the latter, it's name is confusing since it sounds more like it's testing the former. Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- net/core/skbuff.c | 18 +++++++++--------- net/ipv4/ip_fragment.c | 2 +- net/ipv4/ip_output.c | 2 +- net/ipv6/ip6_output.c | 2 +- net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- net/ipv6/reassembly.c | 2 +- 7 files changed, 16 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d569f88..859e30f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1930,7 +1930,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { return skb_is_nonlinear(skb) && - ((skb_has_frags(skb) && !(dev->features & NETIF_F_FRAGLIST)) || + ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) || (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || illegal_highdma(dev, skb)))); } @@ -3090,7 +3090,7 @@ enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) if (!(skb->dev->features & NETIF_F_GRO) || netpoll_rx_on(skb)) goto normal; - if (skb_is_gso(skb) || skb_has_frags(skb)) + if (skb_is_gso(skb) || skb_has_frag_list(skb)) goto normal; rcu_read_lock(); diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 99ef721..e2535fb 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -340,7 +340,7 @@ static void skb_release_data(struct sk_buff *skb) put_page(skb_shinfo(skb)->frags[i].page); } - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); kfree(skb->head); @@ -759,7 +759,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) skb_shinfo(n)->nr_frags = i; } - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { skb_shinfo(n)->frag_list = skb_shinfo(skb)->frag_list; skb_clone_fraglist(n); } @@ -822,7 +822,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) get_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_clone_fraglist(skb); skb_release_data(skb); @@ -1099,7 +1099,7 @@ drop_pages: for (; i < nfrags; i++) put_page(skb_shinfo(skb)->frags[i].page); - if (skb_has_frags(skb)) + if (skb_has_frag_list(skb)) skb_drop_fraglist(skb); goto done; } @@ -1194,7 +1194,7 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta) /* Optimization: no fragments, no reasons to preestimate * size of pulled pages. Superb. */ - if (!skb_has_frags(skb)) + if (!skb_has_frag_list(skb)) goto pull_pages; /* Estimate size of pulled pages. */ @@ -2323,7 +2323,7 @@ next_skb: st->frag_data = NULL; } - if (st->root_skb == st->cur_skb && skb_has_frags(st->root_skb)) { + if (st->root_skb == st->cur_skb && skb_has_frag_list(st->root_skb)) { st->cur_skb = skb_shinfo(st->root_skb)->frag_list; st->frag_idx = 0; goto next_skb; @@ -2889,7 +2889,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) return -ENOMEM; /* Easy case. Most of packets will go this way. */ - if (!skb_has_frags(skb)) { + if (!skb_has_frag_list(skb)) { /* A little of trouble, not enough of space for trailer. * This should not happen, when stack is tuned to generate * good frames. OK, on miss we reallocate and reserve even more @@ -2924,7 +2924,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) if (skb1->next == NULL && tailbits) { if (skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1) || + skb_has_frag_list(skb1) || skb_tailroom(skb1) < tailbits) ntail = tailbits + 128; } @@ -2933,7 +2933,7 @@ int skb_cow_data(struct sk_buff *skb, int tailbits, struct sk_buff **trailer) skb_cloned(skb1) || ntail || skb_shinfo(skb1)->nr_frags || - skb_has_frags(skb1)) { + skb_has_frag_list(skb1)) { struct sk_buff *skb2; /* Fuck, we are miserable poor guys... */ diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b7c4165..f4dc879 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -542,7 +542,7 @@ static int ip_frag_reasm(struct ipq *qp, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e807492..6d2753c 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -487,7 +487,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) * LATER: this step can be merged to real generation of fragments, * we can switch to copy when see the first bad fragment. */ - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { struct sk_buff *frag; int first_len = skb_pagelen(skb); int truesizes = 0; diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index d40b330..1838927 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -637,7 +637,7 @@ static int ip6_fragment(struct sk_buff *skb, int (*output)(struct sk_buff *)) } mtu -= hlen + sizeof(struct frag_hdr); - if (skb_has_frags(skb)) { + if (skb_has_frag_list(skb)) { int first_len = skb_pagelen(skb); int truesizes = 0; diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 13ef5bc..089c598 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -413,7 +413,7 @@ nf_ct_frag6_reasm(struct nf_ct_frag6_queue *fq, struct net_device *dev) /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index 545c414..8aea3f3 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -499,7 +499,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev, /* If the first fragment is fragmented itself, we split * it to two chunks: the first with data and paged part * and the second, holding only fragments. */ - if (skb_has_frags(head)) { + if (skb_has_frag_list(head)) { struct sk_buff *clone; int i, plen = 0; -- cgit v1.1 From 67b67e365f07d6dc70f3bb266af3268bac0a4836 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:36 +0000 Subject: ccid: ccid-2/3 code cosmetics This patch collects cosmetics-only changes to separate these from code changes: * update with regard to CodingStyle and whitespace changes, * documentation: - adding/revising comments, - remove CCID-3 RX socket documentation which is either duplicate or refers to fields that no longer exist, * expand embedded tfrc_tx_info struct inline for consistency, removing indirections via #define. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 4 ++-- net/dccp/ccids/ccid3.c | 30 ++++++++++++++++++------------ net/dccp/ccids/ccid3.h | 19 +++++++------------ 3 files changed, 27 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 9b3ae99..f564211 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -193,8 +193,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_ssthresh = hc->tx_cwnd / 2; if (hc->tx_ssthresh < 2) hc->tx_ssthresh = 2; - hc->tx_cwnd = 1; - hc->tx_pipe = 0; + hc->tx_cwnd = 1; + hc->tx_pipe = 0; /* clear state about stuff we sent */ hc->tx_seqt = hc->tx_seqh; diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 95f7529..a666f3f 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -218,9 +218,9 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 + * RTO is 0 if and only if no feedback has been received yet. */ - if (hc->tx_t_rto == 0 || /* no feedback received yet */ - hc->tx_p == 0) { + if (hc->tx_t_rto == 0 || hc->tx_p == 0) { /* halve send rate directly */ hc->tx_x = max(hc->tx_x / 2, @@ -256,7 +256,7 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) * Set new timeout for the nofeedback timer. * See comments in packet_recv() regarding the value of t_RTO. */ - if (unlikely(hc->tx_t_rto == 0)) /* no feedback yet */ + if (unlikely(hc->tx_t_rto == 0)) /* no feedback received yet */ t_nfb = TFRC_INITIAL_TIMEOUT; else t_nfb = max(hc->tx_t_rto, 2 * hc->tx_t_ipi); @@ -372,7 +372,7 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -386,7 +386,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_state != TFRC_SSTATE_NO_FBACK) return; - opt_recv = &hc->tx_options_received; now = ktime_get_real(); /* Estimate RTT from history if ACK number is valid */ @@ -489,11 +488,9 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, int rc = 0; const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv; + struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; - opt_recv = &hc->tx_options_received; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { opt_recv->ccid3or_seqno = dp->dccps_gsr; opt_recv->ccid3or_loss_event_rate = ~0; @@ -582,6 +579,7 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { const struct ccid3_hc_tx_sock *hc; + struct tfrc_tx_info tfrc; const void *val; /* Listen socks doesn't have a private CCID block */ @@ -591,10 +589,17 @@ static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, hc = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: - if (len < sizeof(hc->tx_tfrc)) + if (len < sizeof(tfrc)) return -EINVAL; - len = sizeof(hc->tx_tfrc); - val = &hc->tx_tfrc; + tfrc.tfrctx_x = hc->tx_x; + tfrc.tfrctx_x_recv = hc->tx_x_recv; + tfrc.tfrctx_x_calc = hc->tx_x_calc; + tfrc.tfrctx_rtt = hc->tx_rtt; + tfrc.tfrctx_p = hc->tx_p; + tfrc.tfrctx_rto = hc->tx_t_rto; + tfrc.tfrctx_ipi = hc->tx_t_ipi; + len = sizeof(tfrc); + val = &tfrc; break; default: return -ENOPROTOOPT; @@ -749,10 +754,11 @@ static u32 ccid3_first_li(struct sock *sk) x_recv = scaled_div32(hc->rx_bytes_recv, delta); if (x_recv == 0) { /* would also trigger divide-by-zero */ DCCP_WARN("X_recv==0\n"); - if ((x_recv = hc->rx_x_recv) == 0) { + if (hc->rx_x_recv == 0) { DCCP_BUG("stored value of X_recv is zero"); return ~0U; } + x_recv = hc->rx_x_recv; } fval = scaled_div(hc->rx_s, hc->rx_rtt); diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 0326357..b186424 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -95,14 +95,13 @@ enum ccid3_hc_tx_states { * @tx_options_received: Parsed set of retrieved options */ struct ccid3_hc_tx_sock { - struct tfrc_tx_info tx_tfrc; -#define tx_x tx_tfrc.tfrctx_x -#define tx_x_recv tx_tfrc.tfrctx_x_recv -#define tx_x_calc tx_tfrc.tfrctx_x_calc -#define tx_rtt tx_tfrc.tfrctx_rtt -#define tx_p tx_tfrc.tfrctx_p -#define tx_t_rto tx_tfrc.tfrctx_rto -#define tx_t_ipi tx_tfrc.tfrctx_ipi + u64 tx_x; + u64 tx_x_recv; + u32 tx_x_calc; + u32 tx_rtt; + u32 tx_p; + u32 tx_t_rto; + u32 tx_t_ipi; u16 tx_s; enum ccid3_hc_tx_states tx_state:8; u8 tx_last_win_count; @@ -131,16 +130,12 @@ enum ccid3_hc_rx_states { /** * struct ccid3_hc_rx_sock - CCID3 receiver half-connection socket - * @rx_x_recv: Receiver estimate of send rate (RFC 3448 4.3) - * @rx_rtt: Receiver estimate of rtt (non-standard) - * @rx_p: Current loss event rate (RFC 3448 5.4) * @rx_last_counter: Tracks window counter (RFC 4342, 8.1) * @rx_state: Receiver state, one of %ccid3_hc_rx_states * @rx_bytes_recv: Total sum of DCCP payload bytes * @rx_x_recv: Receiver estimate of send rate (RFC 3448, sec. 4.3) * @rx_rtt: Receiver estimate of RTT * @rx_tstamp_last_feedback: Time at which last feedback was sent - * @rx_tstamp_last_ack: Time at which last feedback was sent * @rx_hist: Packet history (loss detection + RTT sampling) * @rx_li_hist: Loss Interval database * @rx_s: Received packet size in bytes -- cgit v1.1 From 51c22bb510fefbb1a87c02dbd835383e6e7e3d36 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:37 +0000 Subject: dccp ccid-3: No more CCID control blocks in LISTEN state The CCIDs are activated as last of the features, at the end of the handshake, were the LISTEN state of the master socket is inherited into the server state of the child socket. Thus, the only states visible to CCIDs now are OPEN/PARTOPEN, and the closing states. This allows to remove tests which were previously necessary to protect against referencing a socket in the listening state (in CCID-3), but which now have become redundant. As a further byproduct of enabling the CCIDs only after the connection has been fully established, several typecast-initialisations of ccid3_hc_{rx,tx}_sock can now be eliminated: * the CCID is loaded, so it is not necessary to test if it is NULL, * if it is possible to load a CCID and leave the private area NULL, then this is a bug, which should crash loudly - and earlier, * the test for state==OPEN || state==PARTOPEN now reduces only to the closing phase (e.g. when the node has received an unexpected Reset). Signed-off-by: Gerrit Renker Acked-by: Ian McDonald Signed-off-by: David S. Miller --- net/dccp/ccids/ccid3.c | 40 +++++++--------------------------------- 1 file changed, 7 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index a666f3f..4340672 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -564,29 +564,17 @@ static void ccid3_hc_tx_exit(struct sock *sk) static void ccid3_hc_tx_get_info(struct sock *sk, struct tcp_info *info) { - struct ccid3_hc_tx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_tx_sk(sk); - info->tcpi_rto = hc->tx_t_rto; - info->tcpi_rtt = hc->tx_rtt; + info->tcpi_rto = ccid3_hc_tx_sk(sk)->tx_t_rto; + info->tcpi_rtt = ccid3_hc_tx_sk(sk)->tx_rtt; } static int ccid3_hc_tx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_tx_sock *hc; + const struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct tfrc_tx_info tfrc; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_tx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_TX_INFO: if (len < sizeof(tfrc)) @@ -706,14 +694,12 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); __be32 x_recv, pinv; if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; - hc = ccid3_hc_rx_sk(sk); - if (dccp_packet_without_ack(skb)) return 0; @@ -876,30 +862,18 @@ static void ccid3_hc_rx_exit(struct sock *sk) static void ccid3_hc_rx_get_info(struct sock *sk, struct tcp_info *info) { - const struct ccid3_hc_rx_sock *hc; - - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return; - - hc = ccid3_hc_rx_sk(sk); - info->tcpi_ca_state = hc->rx_state; + info->tcpi_ca_state = ccid3_hc_rx_sk(sk)->rx_state; info->tcpi_options |= TCPI_OPT_TIMESTAMPS; - info->tcpi_rcv_rtt = hc->rx_rtt; + info->tcpi_rcv_rtt = ccid3_hc_rx_sk(sk)->rx_rtt; } static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, u32 __user *optval, int __user *optlen) { - const struct ccid3_hc_rx_sock *hc; + const struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct tfrc_rx_info rx_info; const void *val; - /* Listen socks doesn't have a private CCID block */ - if (sk->sk_state == DCCP_LISTEN) - return -EINVAL; - - hc = ccid3_hc_rx_sk(sk); switch (optname) { case DCCP_SOCKOPT_CCID_RX_INFO: if (len < sizeof(rx_info)) -- cgit v1.1 From 30564e355511b434613aa42375317b5a07fc9f23 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:38 +0000 Subject: dccp ccid-2: Remove redundant sanity tests This removes the ccid2_hc_tx_check_sanity function: it is redundant. Details: The tx_check_sanity function performs three tests: 1) it checks that the circular TX list is sorted - in ascending order of sequence number (ccid2s_seq) - and time (ccid2s_sent), - in the direction from `tail' (hctx_seqt) to `head' (hctx_seqh); 2) it ensures that the entire list has the length seqbufc * CCID2_SEQBUF_LEN; 3) it ensures that pipe equals the number of packets that were not marked `acked' (ccid2s_acked) between `tail' and `head'. The following argues that each of these tests is redundant, this can be verified by going through the code. (1) is not necessary, since both time and GSS increase from one packet to the next, so that subsequent insertions in tx_packet_sent (which advance the `head' pointer) will be in ascending order of time and sequence number. In (2), the length of the list is always equal to seqbufc times CCID2_SEQBUF_LEN (set to 1024) unless allocation caused an earlier failure, because: * at initialisation (tx_init), there is one chunk of size 1024 and seqbufc=1; * subsequent calls to tx_alloc_seq take place whenever head->next == tail in tx_packet_sent; then a new chunk of size 1024 is inserted between head and tail, and seqbufc is incremented by one. To show that (3) is redundant requires looking at two cases. The `pipe' variable of the TX socket is incremented only in tx_packet_sent, and decremented in tx_packet_recv. When head == tail (TX history empty) then pipe should be 0, which is the case directly after initialisation and after a retransmission timeout has occurred (ccid2_hc_tx_rto_expire). The first case involves parsing Ack Vectors for packets recorded in the live portion of the buffer, between tail and head. For each packet marked by the receiver as received (state 0) or ECN-marked (state 1), pipe is decremented by one, so for all such packets the BUG_ON in tx_check_sanity will not trigger. The second case is the loss detection in the second half of tx_packet_recv, below the comment "Check for NUMDUPACK". The first while-loop here ensures that the sequence number of `seqp' is either above or equal to `high_ack', or otherwise equal to the highest sequence number sent so far (of the entry head->prev, as head points to the next unsent entry). The next while-loop ("while (1)") counts the number of acked packets starting from that position of seqp, going backwards in the direction from head->prev to tail. If NUMDUPACK=3 such packets were counted within this loop, `seqp' points to the last acknowledged packet of these, and the "if (done == NUMDUPACK)" block is entered next. The while-loop contained within that block in turn traverses the list backwards, from head to tail; the position of `seqp' is saved in the variable `last_acked'. For each packet not marked as `acked', a congestion event is triggered within the loop, and pipe is decremented. The loop terminates when `seqp' has reached `tail', whereupon tail is set to the position previously stored in `last_acked'. Thus, between `last_acked' and the previous position of `tail', - pipe has been decremented earlier if the packet was marked as state 0 or 1; - pipe was decremented if the packet was not marked as acked. That is, pipe has been decremented by the number of packets between `last_acked' and the previous position of `tail'. As a consequence, pipe now again reflects the number of packets which have not (yet) been acked between the new position of tail (at `last_acked') and head->prev, or 0 if head==tail. The result is that the BUG_ON condition in check_sanity will also not be triggered, hence the test (3) is also redundant. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 52 -------------------------------------------------- 1 file changed, 52 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f564211..28499e0 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -33,51 +33,8 @@ #ifdef CONFIG_IP_DCCP_CCID2_DEBUG static int ccid2_debug; #define ccid2_pr_debug(format, a...) DCCP_PR_DEBUG(ccid2_debug, format, ##a) - -static void ccid2_hc_tx_check_sanity(const struct ccid2_hc_tx_sock *hc) -{ - int len = 0; - int pipe = 0; - struct ccid2_seq *seqp = hc->tx_seqh; - - /* there is data in the chain */ - if (seqp != hc->tx_seqt) { - seqp = seqp->ccid2s_prev; - len++; - if (!seqp->ccid2s_acked) - pipe++; - - while (seqp != hc->tx_seqt) { - struct ccid2_seq *prev = seqp->ccid2s_prev; - - len++; - if (!prev->ccid2s_acked) - pipe++; - - /* packets are sent sequentially */ - BUG_ON(dccp_delta_seqno(seqp->ccid2s_seq, - prev->ccid2s_seq ) >= 0); - BUG_ON(time_before(seqp->ccid2s_sent, - prev->ccid2s_sent)); - - seqp = prev; - } - } - - BUG_ON(pipe != hc->tx_pipe); - ccid2_pr_debug("len of chain=%d\n", len); - - do { - seqp = seqp->ccid2s_prev; - len++; - } while (seqp != hc->tx_seqh); - - ccid2_pr_debug("total len=%d\n", len); - BUG_ON(len != hc->tx_seqbufc * CCID2_SEQBUF_LEN); -} #else #define ccid2_pr_debug(format, a...) -#define ccid2_hc_tx_check_sanity(hc) #endif static int ccid2_hc_tx_alloc_seq(struct ccid2_hc_tx_sock *hc) @@ -178,8 +135,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) ccid2_pr_debug("RTO_EXPIRE\n"); - ccid2_hc_tx_check_sanity(hc); - /* back-off timer */ hc->tx_rto <<= 1; @@ -204,7 +159,6 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) hc->tx_rpseq = 0; hc->tx_rpdupack = -1; ccid2_change_l_ack_ratio(sk, 1); - ccid2_hc_tx_check_sanity(hc); out: bh_unlock_sock(sk); sock_put(sk); @@ -312,7 +266,6 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) } } while (0); ccid2_pr_debug("=========\n"); - ccid2_hc_tx_check_sanity(hc); #endif } @@ -510,7 +463,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) int done = 0; unsigned int maxincr = 0; - ccid2_hc_tx_check_sanity(hc); /* check reverse path congestion */ seqno = DCCP_SKB_CB(skb)->dccpd_seq; @@ -694,8 +646,6 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_seqt = hc->tx_seqt->ccid2s_next; } - - ccid2_hc_tx_check_sanity(hc); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) @@ -730,8 +680,6 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_last_cong = jiffies; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); - - ccid2_hc_tx_check_sanity(hc); return 0; } -- cgit v1.1 From c38c92a84a9291a3d0eaf6a13650a11961ae964f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:39 +0000 Subject: dccp ccid-2: Simplify dec_pipe and rearming of RTO timer This removes the dec_pipe function and improves the way the RTO timer is rearmed when a new acknowledgment comes in. Details and justification for removal: -------------------------------------- 1) The BUG_ON in dec_pipe is never triggered: pipe is only decremented for TX history entries between tail and head, for which it had previously been incremented in tx_packet_sent; and it is not decremented twice for the same entry, since it is - either decremented when a corresponding Ack Vector cell in state 0 or 1 was received (and then ccid2s_acked==1), - or it is decremented when ccid2s_acked==0, as part of the loss detection in tx_packet_recv (and hence it can not have been decremented earlier). 2) Restarting the RTO timer happens for every single entry in each Ack Vector parsed by tx_packet_recv (according to RFC 4340, 11.4 this can happen up to 16192 times per Ack Vector). 3) The RTO timer should not be restarted when all outstanding data has been acknowledged. This is currently done similar to (2), in dec_pipe, when pipe has reached 0. The patch onsolidates the code which rearms the RTO timer, combining the segments from new_ack and dec_pipe. As a result, the code becomes clearer (compare with tcp_rearm_rto()). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 27 ++++++++------------------- 1 file changed, 8 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 28499e0..f7f5069 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -413,23 +413,6 @@ static inline void ccid2_new_ack(struct sock *sk, hc->tx_srtt, hc->tx_rttvar, hc->tx_rto, HZ, r); } - - /* we got a new ack, so re-start RTO timer */ - ccid2_hc_tx_kill_rto_timer(sk); - ccid2_start_rto_timer(sk); -} - -static void ccid2_hc_tx_dec_pipe(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - if (hc->tx_pipe == 0) - DCCP_BUG("pipe == 0"); - else - hc->tx_pipe--; - - if (hc->tx_pipe == 0) - ccid2_hc_tx_kill_rto_timer(sk); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -572,7 +555,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) seqp->ccid2s_acked = 1; ccid2_pr_debug("Got ack for %llu\n", (unsigned long long)seqp->ccid2s_seq); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) { done = 1; @@ -629,7 +612,7 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) * one ack vector. */ ccid2_congestion_event(sk, seqp); - ccid2_hc_tx_dec_pipe(sk); + hc->tx_pipe--; } if (seqp == hc->tx_seqt) break; @@ -646,6 +629,12 @@ static void ccid2_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_seqt = hc->tx_seqt->ccid2s_next; } + + /* restart RTO timer if not all outstanding data has been acked */ + if (hc->tx_pipe == 0) + sk_stop_timer(sk, &hc->tx_rtotimer); + else + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); } static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) -- cgit v1.1 From 231cc2aaf14bad3b2325be0b19b8385ff5e75485 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 22 Aug 2010 19:41:40 +0000 Subject: dccp ccid-2: Replace broken RTT estimator with better algorithm The current CCID-2 RTT estimator code is in parts broken and lags behind the suggestions in RFC2988 of using scaled variants for SRTT/RTTVAR. That code is replaced by the present patch, which reuses the Linux TCP RTT estimator code. Further details: ---------------- 1. The minimum RTO of previously one second has been replaced with TCP's, since RFC4341, sec. 5 says that the minimum of 1 sec. (suggested in RFC2988, 2.4) is not necessary. Instead, the TCP_RTO_MIN is used, which agrees with DCCP's concept of a default RTT (RFC 4340, 3.4). 2. The maximum RTO has been set to DCCP_RTO_MAX (64 sec), which agrees with RFC2988, (2.5). 3. De-inlined the function ccid2_new_ack(). 4. Added a FIXME: the RTT is sampled several times per Ack Vector, which will give the wrong estimate. It should be replaced with one sample per Ack. However, at the moment this can not be resolved easily, since - it depends on TX history code (which also needs some work), - the cleanest solution is not to use the `sent' time at all (saves 4 bytes per entry) and use DCCP timestamps / elapsed time to estimated the RTT, which however is non-trivial to get right (but needs to be done). Reasons for reusing the Linux TCP estimator algorithm: ------------------------------------------------------ Some time was spent to find a better alternative, using basic RFC2988 as a first step. Further analysis and experimentation showed that the Linux TCP RTO estimator is superior to a basic RFC2988 implementation. A summary is on http://www.erg.abdn.ac.uk/users/gerrit/dccp/notes/ccid2/rto_estimator/ In addition, this estimator fared well in a recent empirical evaluation: Rewaskar, Sushant, Jasleen Kaur and F. Donelson Smith. A Performance Study of Loss Detection/Recovery in Real-world TCP Implementations. Proceedings of 15th IEEE International Conference on Network Protocols (ICNP-07), 2007. Thus there is significant benefit in reusing the existing TCP code. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 169 +++++++++++++++++++++++++++---------------------- net/dccp/ccids/ccid2.h | 20 ++++-- 2 files changed, 108 insertions(+), 81 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index f7f5069..7af3106 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -113,19 +113,12 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_change_srtt(struct ccid2_hc_tx_sock *hc, long val) -{ - ccid2_pr_debug("change SRTT to %ld\n", val); - hc->tx_srtt = val; -} - static void ccid2_start_rto_timer(struct sock *sk); static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - long s; bh_lock_sock(sk); if (sock_owned_by_user(sk)) { @@ -137,10 +130,8 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) /* back-off timer */ hc->tx_rto <<= 1; - - s = hc->tx_rto / HZ; - if (s > 60) - hc->tx_rto = 60 * HZ; + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; ccid2_start_rto_timer(sk); @@ -168,7 +159,7 @@ static void ccid2_start_rto_timer(struct sock *sk) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - ccid2_pr_debug("setting RTO timeout=%ld\n", hc->tx_rto); + ccid2_pr_debug("setting RTO timeout=%u\n", hc->tx_rto); BUG_ON(timer_pending(&hc->tx_rtotimer)); sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); @@ -339,9 +330,86 @@ static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) ccid2_pr_debug("deleted RTO timer\n"); } -static inline void ccid2_new_ack(struct sock *sk, - struct ccid2_seq *seqp, - unsigned int *maxincr) +/** + * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm + * This code is almost identical with TCP's tcp_rtt_estimator(), since + * - it has a higher sampling frequency (recommended by RFC 1323), + * - the RTO does not collapse into RTT due to RTTVAR going towards zero, + * - it is simple (cf. more complex proposals such as Eifel timer or research + * which suggests that the gain should be set according to window size), + * - in tests it was found to work well with CCID2 [gerrit]. + */ +static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) +{ + struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); + long m = mrtt ? : 1; + + if (hc->tx_srtt == 0) { + /* First measurement m */ + hc->tx_srtt = m << 3; + hc->tx_mdev = m << 1; + + hc->tx_mdev_max = max(TCP_RTO_MIN, hc->tx_mdev); + hc->tx_rttvar = hc->tx_mdev_max; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + } else { + /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ + m -= (hc->tx_srtt >> 3); + hc->tx_srtt += m; + + /* Similarly, update scaled mdev with regard to |m| */ + if (m < 0) { + m = -m; + m -= (hc->tx_mdev >> 2); + /* + * This neutralises RTO increase when RTT < SRTT - mdev + * (see P. Sarolahti, A. Kuznetsov,"Congestion Control + * in Linux TCP", USENIX 2002, pp. 49-62). + */ + if (m > 0) + m >>= 3; + } else { + m -= (hc->tx_mdev >> 2); + } + hc->tx_mdev += m; + + if (hc->tx_mdev > hc->tx_mdev_max) { + hc->tx_mdev_max = hc->tx_mdev; + if (hc->tx_mdev_max > hc->tx_rttvar) + hc->tx_rttvar = hc->tx_mdev_max; + } + + /* + * Decay RTTVAR at most once per flight, exploiting that + * 1) pipe <= cwnd <= Sequence_Window = W (RFC 4340, 7.5.2) + * 2) AWL = GSS-W+1 <= GAR <= GSS (RFC 4340, 7.5.1) + * GAR is a useful bound for FlightSize = pipe. + * AWL is probably too low here, as it over-estimates pipe. + */ + if (after48(dccp_sk(sk)->dccps_gar, hc->tx_rtt_seq)) { + if (hc->tx_mdev_max < hc->tx_rttvar) + hc->tx_rttvar -= (hc->tx_rttvar - + hc->tx_mdev_max) >> 2; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; + hc->tx_mdev_max = TCP_RTO_MIN; + } + } + + /* + * Set RTO from SRTT and RTTVAR + * As in TCP, 4 * RTTVAR >= TCP_RTO_MIN, giving a minimum RTO of 200 ms. + * This agrees with RFC 4341, 5: + * "Because DCCP does not retransmit data, DCCP does not require + * TCP's recommended minimum timeout of one second". + */ + hc->tx_rto = (hc->tx_srtt >> 3) + hc->tx_rttvar; + + if (hc->tx_rto > DCCP_RTO_MAX) + hc->tx_rto = DCCP_RTO_MAX; +} + +static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, + unsigned int *maxincr) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); @@ -355,64 +423,15 @@ static inline void ccid2_new_ack(struct sock *sk, hc->tx_cwnd += 1; hc->tx_packets_acked = 0; } - - /* update RTO */ - if (hc->tx_srtt == -1 || - time_after(jiffies, hc->tx_lastrtt + hc->tx_srtt)) { - unsigned long r = (long)jiffies - (long)seqp->ccid2s_sent; - int s; - - /* first measurement */ - if (hc->tx_srtt == -1) { - ccid2_pr_debug("R: %lu Time=%lu seq=%llu\n", - r, jiffies, - (unsigned long long)seqp->ccid2s_seq); - ccid2_change_srtt(hc, r); - hc->tx_rttvar = r >> 1; - } else { - /* RTTVAR */ - long tmp = hc->tx_srtt - r; - long srtt; - - if (tmp < 0) - tmp *= -1; - - tmp >>= 2; - hc->tx_rttvar *= 3; - hc->tx_rttvar >>= 2; - hc->tx_rttvar += tmp; - - /* SRTT */ - srtt = hc->tx_srtt; - srtt *= 7; - srtt >>= 3; - tmp = r >> 3; - srtt += tmp; - ccid2_change_srtt(hc, srtt); - } - s = hc->tx_rttvar << 2; - /* clock granularity is 1 when based on jiffies */ - if (!s) - s = 1; - hc->tx_rto = hc->tx_srtt + s; - - /* must be at least a second */ - s = hc->tx_rto / HZ; - /* DCCP doesn't require this [but I like it cuz my code sux] */ -#if 1 - if (s < 1) - hc->tx_rto = HZ; -#endif - /* max 60 seconds */ - if (s > 60) - hc->tx_rto = HZ * 60; - - hc->tx_lastrtt = jiffies; - - ccid2_pr_debug("srtt: %ld rttvar: %ld rto: %ld (HZ=%d) R=%lu\n", - hc->tx_srtt, hc->tx_rttvar, - hc->tx_rto, HZ, r); - } + /* + * FIXME: RTT is sampled several times per acknowledgment (for each + * entry in the Ack Vector), instead of once per Ack (as in TCP SACK). + * This causes the RTT to be over-estimated, since the older entries + * in the Ack Vector have earlier sending times. + * The cleanest solution is to not use the ccid2s_sent field at all + * and instead use DCCP timestamps: requires changes in other places. + */ + ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) @@ -662,9 +681,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) if (ccid2_hc_tx_alloc_seq(hc)) return -ENOMEM; - hc->tx_rto = 3 * HZ; - ccid2_change_srtt(hc, -1); - hc->tx_rttvar = -1; + hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; hc->tx_last_cong = jiffies; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index 1ec6a30..b017843 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -42,7 +42,12 @@ struct ccid2_seq { * struct ccid2_hc_tx_sock - CCID2 TX half connection * @tx_{cwnd,ssthresh,pipe}: as per RFC 4341, section 5 * @tx_packets_acked: Ack counter for deriving cwnd growth (RFC 3465) - * @tx_lastrtt: time RTT was last measured + * @tx_srtt: smoothed RTT estimate, scaled by 2^3 + * @tx_mdev: smoothed RTT variation, scaled by 2^2 + * @tx_mdev_max: maximum of @mdev during one flight + * @tx_rttvar: moving average/maximum of @mdev_max + * @tx_rto: RTO value deriving from SRTT and RTTVAR (RFC 2988) + * @tx_rtt_seq: to decay RTTVAR at most once per flight * @tx_rpseq: last consecutive seqno * @tx_rpdupack: dupacks since rpseq */ @@ -55,11 +60,16 @@ struct ccid2_hc_tx_sock { int tx_seqbufc; struct ccid2_seq *tx_seqh; struct ccid2_seq *tx_seqt; - long tx_rto; - long tx_srtt; - long tx_rttvar; - unsigned long tx_lastrtt; + + /* RTT measurement: variables/principles are the same as in TCP */ + u32 tx_srtt, + tx_mdev, + tx_mdev_max, + tx_rttvar, + tx_rto; + u64 tx_rtt_seq:48; struct timer_list tx_rtotimer; + u64 tx_rpseq; int tx_rpdupack; unsigned long tx_last_cong; -- cgit v1.1 From 7abac686026ec1af38f6e766369dbfe4007949b6 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 23 Aug 2010 20:42:11 -0700 Subject: pkt_sched: Make act_csum depend upon INET. It uses ip_send_check() and stuff like that. Reported-by: Randy Dunlap Signed-off-by: David S. Miller --- net/sched/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sched/Kconfig b/net/sched/Kconfig index 522d5a9..a36270a 100644 --- a/net/sched/Kconfig +++ b/net/sched/Kconfig @@ -520,7 +520,7 @@ config NET_ACT_SKBEDIT config NET_ACT_CSUM tristate "Checksum Updating" - depends on NET_CLS_ACT + depends on NET_CLS_ACT && INET ---help--- Say Y here to update some common checksum after some direct packet alterations. -- cgit v1.1 From 0eec32ff350348e635b3b8d87b989117ce045d25 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 23 Aug 2010 03:27:58 +0000 Subject: net_sched: act_csum: coding style cleanup Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/sched/act_csum.c | 77 ++++++++++++++++++++++++++-------------------------- 1 file changed, 38 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index be41f1c..67dc7ce 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -42,9 +42,9 @@ static u32 csum_idx_gen; static DEFINE_RWLOCK(csum_lock); static struct tcf_hashinfo csum_hash_info = { - .htab = tcf_csum_ht, - .hmask = CSUM_TAB_MASK, - .lock = &csum_lock, + .htab = tcf_csum_ht, + .hmask = CSUM_TAB_MASK, + .lock = &csum_lock, }; static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { @@ -73,7 +73,8 @@ static int tcf_csum_init(struct nlattr *nla, struct nlattr *est, pc = tcf_hash_check(parm->index, a, bind, &csum_hash_info); if (!pc) { - pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, &csum_idx_gen, &csum_hash_info); + pc = tcf_hash_create(parm->index, est, a, sizeof(*p), bind, + &csum_idx_gen, &csum_hash_info); if (IS_ERR(pc)) return PTR_ERR(pc); p = to_tcf_csum(pc); @@ -230,8 +231,9 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, struct udphdr *udph; u16 ul; - /* Support both UDP and UDPLITE checksum algorithms, - * Don't use udph->len to get the real length without any protocol check, + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, * UDPLITE uses udph->len for another thing, * Use iph->tot_len, or just ipl. */ @@ -249,10 +251,8 @@ static int tcf_csum_ipv4_udp(struct sk_buff *skb, struct iphdr *iph, if (udplite) { if (ul == 0) skb->csum = csum_partial(udph, ipl - ihl, 0); - else if ((ul >= sizeof(*udph)) && (ul <= ipl - ihl)) skb->csum = csum_partial(udph, ul, 0); - else goto ignore_obscure_skb; } else { @@ -282,8 +282,9 @@ static int tcf_csum_ipv6_udp(struct sk_buff *skb, struct ipv6hdr *ip6h, struct udphdr *udph; u16 ul; - /* Support both UDP and UDPLITE checksum algorithms, - * Don't use udph->len to get the real length without any protocol check, + /* + * Support both UDP and UDPLITE checksum algorithms, Don't use + * udph->len to get the real length without any protocol check, * UDPLITE uses udph->len for another thing, * Use ip6h->payload_len + sizeof(*ip6h) ... , or just ipl. */ @@ -340,32 +341,32 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags) switch (iph->frag_off & htons(IP_OFFSET) ? 0 : iph->protocol) { case IPPROTO_ICMP: if (update_flags & TCA_CSUM_UPDATE_FLAG_ICMP) - if (!tcf_csum_ipv4_icmp(skb, - iph->ihl * 4, ntohs(iph->tot_len))) + if (!tcf_csum_ipv4_icmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) goto fail; break; case IPPROTO_IGMP: if (update_flags & TCA_CSUM_UPDATE_FLAG_IGMP) - if (!tcf_csum_ipv4_igmp(skb, - iph->ihl * 4, ntohs(iph->tot_len))) + if (!tcf_csum_ipv4_igmp(skb, iph->ihl * 4, + ntohs(iph->tot_len))) goto fail; break; case IPPROTO_TCP: if (update_flags & TCA_CSUM_UPDATE_FLAG_TCP) - if (!tcf_csum_ipv4_tcp(skb, iph, - iph->ihl * 4, ntohs(iph->tot_len))) + if (!tcf_csum_ipv4_tcp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len))) goto fail; break; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) - if (!tcf_csum_ipv4_udp(skb, iph, - iph->ihl * 4, ntohs(iph->tot_len), 0)) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 0)) goto fail; break; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) - if (!tcf_csum_ipv4_udp(skb, iph, - iph->ihl * 4, ntohs(iph->tot_len), 1)) + if (!tcf_csum_ipv4_udp(skb, iph, iph->ihl * 4, + ntohs(iph->tot_len), 1)) goto fail; break; } @@ -386,7 +387,7 @@ fail: } static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, - unsigned int ixhl, unsigned int *pl) + unsigned int ixhl, unsigned int *pl) { int off, len, optlen; unsigned char *xh = (void *)ip6xh; @@ -395,8 +396,7 @@ static int tcf_csum_ipv6_hopopts(struct ipv6_opt_hdr *ip6xh, len = ixhl - off; while (len > 1) { - switch (xh[off]) - { + switch (xh[off]) { case IPV6_TLV_PAD0: optlen = 1; break; @@ -476,14 +476,14 @@ static int tcf_csum_ipv6(struct sk_buff *skb, u32 update_flags) goto done; case IPPROTO_UDP: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDP) - if (!tcf_csum_ipv6_udp(skb, ip6h, - hl, pl + sizeof(*ip6h), 0)) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 0)) goto fail; goto done; case IPPROTO_UDPLITE: if (update_flags & TCA_CSUM_UPDATE_FLAG_UDPLITE) - if (!tcf_csum_ipv6_udp(skb, ip6h, - hl, pl + sizeof(*ip6h), 1)) + if (!tcf_csum_ipv6_udp(skb, ip6h, hl, + pl + sizeof(*ip6h), 1)) goto fail; goto done; default: @@ -544,7 +544,6 @@ static int tcf_csum_dump(struct sk_buff *skb, struct tcf_csum *p = a->priv; struct tc_csum opt = { .update_flags = p->update_flags, - .index = p->tcf_index, .action = p->tcf_action, .refcnt = p->tcf_refcnt - ref, @@ -566,17 +565,17 @@ nla_put_failure: } static struct tc_action_ops act_csum_ops = { - .kind = "csum", - .hinfo = &csum_hash_info, - .type = TCA_ACT_CSUM, - .capab = TCA_CAP_NONE, - .owner = THIS_MODULE, - .act = tcf_csum, - .dump = tcf_csum_dump, - .cleanup = tcf_csum_cleanup, - .lookup = tcf_hash_search, - .init = tcf_csum_init, - .walk = tcf_generic_walker + .kind = "csum", + .hinfo = &csum_hash_info, + .type = TCA_ACT_CSUM, + .capab = TCA_CAP_NONE, + .owner = THIS_MODULE, + .act = tcf_csum, + .dump = tcf_csum_dump, + .cleanup = tcf_csum_cleanup, + .lookup = tcf_hash_search, + .init = tcf_csum_init, + .walk = tcf_generic_walker }; MODULE_DESCRIPTION("Checksum updating actions"); -- cgit v1.1 From afdcba371f9748ac91608bb6c57f170aab7085b4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 23 Aug 2010 07:14:36 +0000 Subject: net: copy_rtnl_link_stats64() simplification No need to use a temporary struct rtnl_link_stats64 variable, just copy the source to skb buffer. Signed-off-by: Eric Dumazet Reviewed-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f78d821..b2a718d 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -612,36 +612,7 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a, static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b) { - struct rtnl_link_stats64 a; - - a.rx_packets = b->rx_packets; - a.tx_packets = b->tx_packets; - a.rx_bytes = b->rx_bytes; - a.tx_bytes = b->tx_bytes; - a.rx_errors = b->rx_errors; - a.tx_errors = b->tx_errors; - a.rx_dropped = b->rx_dropped; - a.tx_dropped = b->tx_dropped; - - a.multicast = b->multicast; - a.collisions = b->collisions; - - a.rx_length_errors = b->rx_length_errors; - a.rx_over_errors = b->rx_over_errors; - a.rx_crc_errors = b->rx_crc_errors; - a.rx_frame_errors = b->rx_frame_errors; - a.rx_fifo_errors = b->rx_fifo_errors; - a.rx_missed_errors = b->rx_missed_errors; - - a.tx_aborted_errors = b->tx_aborted_errors; - a.tx_carrier_errors = b->tx_carrier_errors; - a.tx_fifo_errors = b->tx_fifo_errors; - a.tx_heartbeat_errors = b->tx_heartbeat_errors; - a.tx_window_errors = b->tx_window_errors; - - a.rx_compressed = b->rx_compressed; - a.tx_compressed = b->tx_compressed; - memcpy(v, &a, sizeof(a)); + memcpy(v, b, sizeof(*b)); } /* All VF info */ -- cgit v1.1 From 0fdc100bdc4b7ab61ed632962c76dfe539047296 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 23 Aug 2010 10:24:18 +0000 Subject: ethtool: allow non-netadmin to query settings The SNMP daemon uses ethtool to determine the speed of network interfaces. This fails on Debian (and probably elsewhere) because for security SNMP daemon runs as non-root user (snmp). Note: A similar patch was rejected previously because of a concern about the possibility that on some hardware querying the ethtool settings requires access to the PHY and could slow the machine down. But the security risk of requiring SNMP daemon (and related services) to run as root far out weighs the risk of denial-of-service. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index d2c4da5..970eb98 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1423,6 +1423,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr) /* Allow some commands to be done by anyone */ switch (ethcmd) { + case ETHTOOL_GSET: case ETHTOOL_GDRVINFO: case ETHTOOL_GMSGLVL: case ETHTOOL_GCOALESCE: -- cgit v1.1 From 633adf1ad1c92c02bd3f10bbd73737a969179378 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Aug 2010 14:49:58 +0200 Subject: cfg80211: mark ieee80211_hdrlen const This function analyses only its single, value-passed argument, and has no side effects. Thus it can be const, which makes mac80211 smaller, for example: text data bss dec hex filename 362518 16720 884 380122 5ccda mac80211.ko (before) 362358 16720 884 379962 5cc3a mac80211.ko (after) a 160 byte saving in text size, and an optimisation because the function won't be called as often. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/util.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 0c8a1e8..1eb2416 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -221,7 +221,7 @@ const unsigned char bridge_tunnel_header[] __aligned(2) = { 0xaa, 0xaa, 0x03, 0x00, 0x00, 0xf8 }; EXPORT_SYMBOL(bridge_tunnel_header); -unsigned int ieee80211_hdrlen(__le16 fc) +unsigned int __attribute_const__ ieee80211_hdrlen(__le16 fc) { unsigned int hdrlen = 24; -- cgit v1.1 From ac4c977d16d843f12901595c91773dddb65768a9 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Aug 2010 15:37:29 +0200 Subject: mac80211: remove unused don't-encrypt flag When MFP is disabled, action frames will not be encrypted since they are management frames and the only management frames that can then be encrypted are authentication frames. Therefore, setting the don't-encrypt flag on action frames is unnecessary. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9a35d9e..f9a3177 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1555,8 +1555,6 @@ static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev, return -ENOLINK; break; case NL80211_IFTYPE_STATION: - if (!(sdata->u.mgd.flags & IEEE80211_STA_MFP_ENABLED)) - flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; break; default: return -EOPNOTSUPP; -- cgit v1.1 From 2e161f78e5f63a7f9fd25a766bb7f816a01eb14a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 12 Aug 2010 15:38:38 +0200 Subject: cfg80211/mac80211: extensible frame processing Allow userspace to register for more than just action frames by giving the frame subtype, and make it possible to use this in various modes as well. With some tweaks and some added functionality this will, in the future, also be usable in AP mode and be able to replace the cooked monitor interface currently used in that case. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 12 ++-- net/mac80211/ieee80211_i.h | 1 + net/mac80211/iface.c | 6 +- net/mac80211/main.c | 37 ++++++++++++ net/mac80211/rx.c | 137 +++++++++++++++++++++++++++++------------- net/mac80211/status.c | 2 +- net/mac80211/util.c | 6 +- net/wireless/core.c | 8 +-- net/wireless/core.h | 21 +++---- net/wireless/mlme.c | 144 +++++++++++++++++++++++++++++---------------- net/wireless/nl80211.c | 108 +++++++++++++++++++++++++--------- net/wireless/nl80211.h | 14 ++--- net/wireless/util.c | 2 +- 13 files changed, 344 insertions(+), 154 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f9a3177..94787d2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1521,11 +1521,11 @@ static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy, return ieee80211_wk_cancel_remain_on_channel(sdata, cookie); } -static int ieee80211_action(struct wiphy *wiphy, struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_channel_type channel_type, - bool channel_type_valid, - const u8 *buf, size_t len, u64 *cookie) +static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, + const u8 *buf, size_t len, u64 *cookie) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -1625,6 +1625,6 @@ struct cfg80211_ops mac80211_config_ops = { .set_bitrate_mask = ieee80211_set_bitrate_mask, .remain_on_channel = ieee80211_remain_on_channel, .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, - .action = ieee80211_action, + .mgmt_tx = ieee80211_mgmt_tx, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 1bf05bf..e73ae51 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -170,6 +170,7 @@ typedef unsigned __bitwise__ ieee80211_rx_result; #define IEEE80211_RX_RA_MATCH BIT(1) #define IEEE80211_RX_AMSDU BIT(2) #define IEEE80211_RX_FRAGMENTED BIT(3) +#define IEEE80211_MALFORMED_ACTION_FRM BIT(4) /* only add flags here that do not change with subframes of an aMPDU */ struct ieee80211_rx_data { diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9459aee..86f434f 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -177,7 +177,7 @@ static int ieee80211_open(struct net_device *dev) /* no special treatment */ break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: /* cannot happen */ WARN_ON(1); break; @@ -634,7 +634,7 @@ static void ieee80211_teardown_sdata(struct net_device *dev) case NL80211_IFTYPE_MONITOR: break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: BUG(); break; } @@ -886,7 +886,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, case NL80211_IFTYPE_AP_VLAN: break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: BUG(); break; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 0afccda..a53feac 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -417,6 +417,41 @@ void ieee80211_napi_complete(struct ieee80211_hw *hw) } EXPORT_SYMBOL(ieee80211_napi_complete); +/* There isn't a lot of sense in it, but you can transmit anything you like */ +static const struct ieee80211_txrx_stypes +ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { + [NL80211_IFTYPE_ADHOC] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_STATION] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + }, + [NL80211_IFTYPE_AP] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, + [NL80211_IFTYPE_AP_VLAN] = { + /* copy AP */ + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, +}; + struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, const struct ieee80211_ops *ops) { @@ -446,6 +481,8 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, if (!wiphy) return NULL; + wiphy->mgmt_stypes = ieee80211_default_mgmt_stypes; + wiphy->flags |= WIPHY_FLAG_NETNS_OK | WIPHY_FLAG_4ADDR_AP | WIPHY_FLAG_4ADDR_STATION; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 4fdbed58..aa41e38 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1940,13 +1940,36 @@ static void ieee80211_process_sa_query_req(struct ieee80211_sub_if_data *sdata, } static ieee80211_rx_result debug_noinline +ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) +{ + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + + /* + * From here on, look only at management frames. + * Data and control frames are already handled, + * and unknown (reserved) frames are useless. + */ + if (rx->skb->len < 24) + return RX_DROP_MONITOR; + + if (!ieee80211_is_mgmt(mgmt->frame_control)) + return RX_DROP_MONITOR; + + if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + return RX_DROP_MONITOR; + + if (ieee80211_drop_unencrypted_mgmt(rx)) + return RX_DROP_UNUSABLE; + + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline ieee80211_rx_h_action(struct ieee80211_rx_data *rx) { struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; - struct sk_buff *nskb; - struct ieee80211_rx_status *status; int len = rx->skb->len; if (!ieee80211_is_action(mgmt->frame_control)) @@ -1962,9 +1985,6 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (!(rx->flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_UNUSABLE; - if (ieee80211_drop_unencrypted_mgmt(rx)) - return RX_DROP_UNUSABLE; - switch (mgmt->u.action.category) { case WLAN_CATEGORY_BACK: /* @@ -2055,17 +2075,36 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) goto queue; } + return RX_CONTINUE; + invalid: - /* - * For AP mode, hostapd is responsible for handling any action - * frames that we didn't handle, including returning unknown - * ones. For all other modes we will return them to the sender, - * setting the 0x80 bit in the action category, as required by - * 802.11-2007 7.3.1.11. - */ - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - return RX_DROP_MONITOR; + rx->flags |= IEEE80211_MALFORMED_ACTION_FRM; + /* will return in the next handlers */ + return RX_CONTINUE; + + handled: + if (rx->sta) + rx->sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + + queue: + rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; + skb_queue_tail(&sdata->skb_queue, rx->skb); + ieee80211_queue_work(&local->hw, &sdata->work); + if (rx->sta) + rx->sta->rx_packets++; + return RX_QUEUED; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) +{ + struct ieee80211_rx_status *status; + + /* skip known-bad action frames and return them in the next handler */ + if (rx->flags & IEEE80211_MALFORMED_ACTION_FRM) + return RX_CONTINUE; /* * Getting here means the kernel doesn't know how to handle @@ -2075,10 +2114,44 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) */ status = IEEE80211_SKB_RXCB(rx->skb); - if (cfg80211_rx_action(rx->sdata->dev, status->freq, - rx->skb->data, rx->skb->len, - GFP_ATOMIC)) - goto handled; + if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq, + rx->skb->data, rx->skb->len, + GFP_ATOMIC)) { + if (rx->sta) + rx->sta->rx_packets++; + dev_kfree_skb(rx->skb); + return RX_QUEUED; + } + + + return RX_CONTINUE; +} + +static ieee80211_rx_result debug_noinline +ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct sk_buff *nskb; + struct ieee80211_sub_if_data *sdata = rx->sdata; + + if (!ieee80211_is_action(mgmt->frame_control)) + return RX_CONTINUE; + + /* + * For AP mode, hostapd is responsible for handling any action + * frames that we didn't handle, including returning unknown + * ones. For all other modes we will return them to the sender, + * setting the 0x80 bit in the action category, as required by + * 802.11-2007 7.3.1.11. + * Newer versions of hostapd shall also use the management frame + * registration mechanisms, but older ones still use cooked + * monitor interfaces so push all frames there. + */ + if (!(rx->flags & IEEE80211_MALFORMED_ACTION_FRM) && + (sdata->vif.type == NL80211_IFTYPE_AP || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) + return RX_DROP_MONITOR; /* do not return rejected action frames */ if (mgmt->u.action.category & 0x80) @@ -2097,20 +2170,8 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) ieee80211_tx_skb(rx->sdata, nskb); } - - handled: - if (rx->sta) - rx->sta->rx_packets++; dev_kfree_skb(rx->skb); return RX_QUEUED; - - queue: - rx->skb->pkt_type = IEEE80211_SDATA_QUEUE_TYPE_FRAME; - skb_queue_tail(&sdata->skb_queue, rx->skb); - ieee80211_queue_work(&local->hw, &sdata->work); - if (rx->sta) - rx->sta->rx_packets++; - return RX_QUEUED; } static ieee80211_rx_result debug_noinline @@ -2121,15 +2182,6 @@ ieee80211_rx_h_mgmt(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt = (void *)rx->skb->data; __le16 stype; - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) - return RX_DROP_MONITOR; - - if (rx->skb->len < 24) - return RX_DROP_MONITOR; - - if (ieee80211_drop_unencrypted_mgmt(rx)) - return RX_DROP_UNUSABLE; - rxs = ieee80211_work_rx_mgmt(rx->sdata, rx->skb); if (rxs != RX_CONTINUE) return rxs; @@ -2374,7 +2426,10 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, if (res != RX_CONTINUE) goto rxh_next; + CALL_RXH(ieee80211_rx_h_mgmt_check) CALL_RXH(ieee80211_rx_h_action) + CALL_RXH(ieee80211_rx_h_userspace_mgmt) + CALL_RXH(ieee80211_rx_h_action_return) CALL_RXH(ieee80211_rx_h_mgmt) rxh_next: @@ -2527,7 +2582,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, break; case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: /* should never get here */ WARN_ON(1); break; diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 10caec5..67a3584 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -296,7 +296,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) } if (info->flags & IEEE80211_TX_INTFL_NL80211_FRAME_TX) - cfg80211_action_tx_status( + cfg80211_mgmt_tx_status( skb->dev, (unsigned long) skb, skb->data, skb->len, !!(info->flags & IEEE80211_TX_STAT_ACK), GFP_ATOMIC); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 748387d..cd2b485 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -471,7 +471,7 @@ void ieee80211_iterate_active_interfaces( list_for_each_entry(sdata, &local->interfaces, list) { switch (sdata->vif.type) { - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: @@ -505,7 +505,7 @@ void ieee80211_iterate_active_interfaces_atomic( list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: @@ -1189,7 +1189,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) /* ignore virtual */ break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: WARN_ON(1); break; } diff --git a/net/wireless/core.c b/net/wireless/core.c index c70909c..d52630b 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -433,7 +433,7 @@ int wiphy_register(struct wiphy *wiphy) /* sanity check ifmodes */ WARN_ON(!ifmodes); - ifmodes &= ((1 << __NL80211_IFTYPE_AFTER_LAST) - 1) & ~1; + ifmodes &= ((1 << NUM_NL80211_IFTYPES) - 1) & ~1; if (WARN_ON(ifmodes != wiphy->interface_modes)) wiphy->interface_modes = ifmodes; @@ -685,8 +685,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, INIT_WORK(&wdev->cleanup_work, wdev_cleanup_work); INIT_LIST_HEAD(&wdev->event_list); spin_lock_init(&wdev->event_lock); - INIT_LIST_HEAD(&wdev->action_registrations); - spin_lock_init(&wdev->action_registrations_lock); + INIT_LIST_HEAD(&wdev->mgmt_registrations); + spin_lock_init(&wdev->mgmt_registrations_lock); mutex_lock(&rdev->devlist_mtx); list_add_rcu(&wdev->list, &rdev->netdev_list); @@ -806,7 +806,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, sysfs_remove_link(&dev->dev.kobj, "phy80211"); list_del_rcu(&wdev->list); rdev->devlist_generation++; - cfg80211_mlme_purge_actions(wdev); + cfg80211_mlme_purge_registrations(wdev); #ifdef CONFIG_CFG80211_WEXT kfree(wdev->wext.keys); #endif diff --git a/net/wireless/core.h b/net/wireless/core.h index 63d57ae..58ab2c7 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -331,16 +331,17 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, const u8 *resp_ie, size_t resp_ie_len, u16 status, bool wextev, struct cfg80211_bss *bss); -int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, - const u8 *match_data, int match_len); -void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid); -void cfg80211_mlme_purge_actions(struct wireless_dev *wdev); -int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_channel_type channel_type, - bool channel_type_valid, - const u8 *buf, size_t len, u64 *cookie); +int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, + u16 frame_type, const u8 *match_data, + int match_len); +void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid); +void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev); +int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, + const u8 *buf, size_t len, u64 *cookie); /* SME */ int __cfg80211_connect(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index ee0af32..8515b1e 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -748,31 +748,51 @@ void cfg80211_new_sta(struct net_device *dev, const u8 *mac_addr, } EXPORT_SYMBOL(cfg80211_new_sta); -struct cfg80211_action_registration { +struct cfg80211_mgmt_registration { struct list_head list; u32 nlpid; int match_len; + __le16 frame_type; + u8 match[]; }; -int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, - const u8 *match_data, int match_len) +int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, + u16 frame_type, const u8 *match_data, + int match_len) { - struct cfg80211_action_registration *reg, *nreg; + struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; + u16 mgmt_type; + + if (!wdev->wiphy->mgmt_stypes) + return -EOPNOTSUPP; + + if ((frame_type & IEEE80211_FCTL_FTYPE) != IEEE80211_FTYPE_MGMT) + return -EINVAL; + + if (frame_type & ~(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE)) + return -EINVAL; + + mgmt_type = (frame_type & IEEE80211_FCTL_STYPE) >> 4; + if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].rx & BIT(mgmt_type))) + return -EINVAL; nreg = kzalloc(sizeof(*reg) + match_len, GFP_KERNEL); if (!nreg) return -ENOMEM; - spin_lock_bh(&wdev->action_registrations_lock); + spin_lock_bh(&wdev->mgmt_registrations_lock); - list_for_each_entry(reg, &wdev->action_registrations, list) { + list_for_each_entry(reg, &wdev->mgmt_registrations, list) { int mlen = min(match_len, reg->match_len); + if (frame_type != le16_to_cpu(reg->frame_type)) + continue; + if (memcmp(reg->match, match_data, mlen) == 0) { err = -EALREADY; break; @@ -787,62 +807,75 @@ int cfg80211_mlme_register_action(struct wireless_dev *wdev, u32 snd_pid, memcpy(nreg->match, match_data, match_len); nreg->match_len = match_len; nreg->nlpid = snd_pid; - list_add(&nreg->list, &wdev->action_registrations); + nreg->frame_type = cpu_to_le16(frame_type); + list_add(&nreg->list, &wdev->mgmt_registrations); out: - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); return err; } -void cfg80211_mlme_unregister_actions(struct wireless_dev *wdev, u32 nlpid) +void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid) { - struct cfg80211_action_registration *reg, *tmp; + struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->action_registrations_lock); + spin_lock_bh(&wdev->mgmt_registrations_lock); - list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) { + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { if (reg->nlpid == nlpid) { list_del(®->list); kfree(reg); } } - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); } -void cfg80211_mlme_purge_actions(struct wireless_dev *wdev) +void cfg80211_mlme_purge_registrations(struct wireless_dev *wdev) { - struct cfg80211_action_registration *reg, *tmp; + struct cfg80211_mgmt_registration *reg, *tmp; - spin_lock_bh(&wdev->action_registrations_lock); + spin_lock_bh(&wdev->mgmt_registrations_lock); - list_for_each_entry_safe(reg, tmp, &wdev->action_registrations, list) { + list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { list_del(®->list); kfree(reg); } - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); } -int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, - struct net_device *dev, - struct ieee80211_channel *chan, - enum nl80211_channel_type channel_type, - bool channel_type_valid, - const u8 *buf, size_t len, u64 *cookie) +int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, + struct net_device *dev, + struct ieee80211_channel *chan, + enum nl80211_channel_type channel_type, + bool channel_type_valid, + const u8 *buf, size_t len, u64 *cookie) { struct wireless_dev *wdev = dev->ieee80211_ptr; const struct ieee80211_mgmt *mgmt; + u16 stype; + + if (!wdev->wiphy->mgmt_stypes) + return -EOPNOTSUPP; - if (rdev->ops->action == NULL) + if (!rdev->ops->mgmt_tx) return -EOPNOTSUPP; + if (len < 24 + 1) return -EINVAL; mgmt = (const struct ieee80211_mgmt *) buf; - if (!ieee80211_is_action(mgmt->frame_control)) + + if (!ieee80211_is_mgmt(mgmt->frame_control)) return -EINVAL; - if (mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { + + stype = le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE; + if (!(wdev->wiphy->mgmt_stypes[wdev->iftype].tx & BIT(stype >> 4))) + return -EINVAL; + + if (ieee80211_is_action(mgmt->frame_control) && + mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { /* Verify that we are associated with the destination AP */ wdev_lock(wdev); @@ -863,64 +896,75 @@ int cfg80211_mlme_action(struct cfg80211_registered_device *rdev, return -EINVAL; /* Transmit the Action frame as requested by user space */ - return rdev->ops->action(&rdev->wiphy, dev, chan, channel_type, - channel_type_valid, buf, len, cookie); + return rdev->ops->mgmt_tx(&rdev->wiphy, dev, chan, channel_type, + channel_type_valid, buf, len, cookie); } -bool cfg80211_rx_action(struct net_device *dev, int freq, const u8 *buf, - size_t len, gfp_t gfp) +bool cfg80211_rx_mgmt(struct net_device *dev, int freq, const u8 *buf, + size_t len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); - struct cfg80211_action_registration *reg; - const u8 *action_data; - int action_data_len; + struct cfg80211_mgmt_registration *reg; + const struct ieee80211_txrx_stypes *stypes = + &wiphy->mgmt_stypes[wdev->iftype]; + struct ieee80211_mgmt *mgmt = (void *)buf; + const u8 *data; + int data_len; bool result = false; + __le16 ftype = mgmt->frame_control & + cpu_to_le16(IEEE80211_FCTL_FTYPE | IEEE80211_FCTL_STYPE); + u16 stype; - /* frame length - min size excluding category */ - action_data_len = len - (IEEE80211_MIN_ACTION_SIZE - 1); + stype = (le16_to_cpu(mgmt->frame_control) & IEEE80211_FCTL_STYPE) >> 4; - /* action data starts with category */ - action_data = buf + IEEE80211_MIN_ACTION_SIZE - 1; + if (!(stypes->rx & BIT(stype))) + return false; - spin_lock_bh(&wdev->action_registrations_lock); + data = buf + ieee80211_hdrlen(mgmt->frame_control); + data_len = len - ieee80211_hdrlen(mgmt->frame_control); + + spin_lock_bh(&wdev->mgmt_registrations_lock); + + list_for_each_entry(reg, &wdev->mgmt_registrations, list) { + if (reg->frame_type != ftype) + continue; - list_for_each_entry(reg, &wdev->action_registrations, list) { - if (reg->match_len > action_data_len) + if (reg->match_len > data_len) continue; - if (memcmp(reg->match, action_data, reg->match_len)) + if (memcmp(reg->match, data, reg->match_len)) continue; /* found match! */ /* Indicate the received Action frame to user space */ - if (nl80211_send_action(rdev, dev, reg->nlpid, freq, - buf, len, gfp)) + if (nl80211_send_mgmt(rdev, dev, reg->nlpid, freq, + buf, len, gfp)) continue; result = true; break; } - spin_unlock_bh(&wdev->action_registrations_lock); + spin_unlock_bh(&wdev->mgmt_registrations_lock); return result; } -EXPORT_SYMBOL(cfg80211_rx_action); +EXPORT_SYMBOL(cfg80211_rx_mgmt); -void cfg80211_action_tx_status(struct net_device *dev, u64 cookie, - const u8 *buf, size_t len, bool ack, gfp_t gfp) +void cfg80211_mgmt_tx_status(struct net_device *dev, u64 cookie, + const u8 *buf, size_t len, bool ack, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct wiphy *wiphy = wdev->wiphy; struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); /* Indicate TX status of the Action frame to user space */ - nl80211_send_action_tx_status(rdev, dev, cookie, buf, len, ack, gfp); + nl80211_send_mgmt_tx_status(rdev, dev, cookie, buf, len, ack, gfp); } -EXPORT_SYMBOL(cfg80211_action_tx_status); +EXPORT_SYMBOL(cfg80211_mgmt_tx_status); void cfg80211_cqm_rssi_notify(struct net_device *dev, enum nl80211_cqm_rssi_threshold_event rssi_event, diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index bb5b78e..927ffbd 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -156,6 +156,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_WIPHY_TX_POWER_SETTING] = { .type = NLA_U32 }, [NL80211_ATTR_WIPHY_TX_POWER_LEVEL] = { .type = NLA_U32 }, + [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 }, }; /* policy for the attributes */ @@ -437,6 +438,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, struct ieee80211_rate *rate; int i; u16 ifmodes = dev->wiphy.interface_modes; + const struct ieee80211_txrx_stypes *mgmt_stypes = + dev->wiphy.mgmt_stypes; hdr = nl80211hdr_put(msg, pid, seq, flags, NL80211_CMD_NEW_WIPHY); if (!hdr) @@ -587,7 +590,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, CMD(flush_pmksa, FLUSH_PMKSA); CMD(remain_on_channel, REMAIN_ON_CHANNEL); CMD(set_bitrate_mask, SET_TX_BITRATE_MASK); - CMD(action, ACTION); + CMD(mgmt_tx, FRAME); if (dev->wiphy.flags & WIPHY_FLAG_NETNS_OK) { i++; NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); @@ -608,6 +611,53 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, nla_nest_end(msg, nl_cmds); + if (mgmt_stypes) { + u16 stypes; + struct nlattr *nl_ftypes, *nl_ifs; + enum nl80211_iftype ift; + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_TX_FRAME_TYPES); + if (!nl_ifs) + goto nla_put_failure; + + for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + goto nla_put_failure; + i = 0; + stypes = mgmt_stypes[ift].tx; + while (stypes) { + if (stypes & 1) + NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT); + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + + nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); + if (!nl_ifs) + goto nla_put_failure; + + for (ift = 0; ift < NUM_NL80211_IFTYPES; ift++) { + nl_ftypes = nla_nest_start(msg, ift); + if (!nl_ftypes) + goto nla_put_failure; + i = 0; + stypes = mgmt_stypes[ift].rx; + while (stypes) { + if (stypes & 1) + NLA_PUT_U16(msg, NL80211_ATTR_FRAME_TYPE, + (i << 4) | IEEE80211_FTYPE_MGMT); + stypes >>= 1; + i++; + } + nla_nest_end(msg, nl_ftypes); + } + nla_nest_end(msg, nl_ifs); + } + return genlmsg_end(msg, hdr); nla_put_failure: @@ -4732,17 +4782,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, return err; } -static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) +static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; struct net_device *dev; + u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION; int err; if (!info->attrs[NL80211_ATTR_FRAME_MATCH]) return -EINVAL; - if (nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH]) < 1) - return -EINVAL; + if (info->attrs[NL80211_ATTR_FRAME_TYPE]) + frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]); rtnl_lock(); @@ -4757,12 +4808,13 @@ static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) } /* not much point in registering if we can't reply */ - if (!rdev->ops->action) { + if (!rdev->ops->mgmt_tx) { err = -EOPNOTSUPP; goto out; } - err = cfg80211_mlme_register_action(dev->ieee80211_ptr, info->snd_pid, + err = cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid, + frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH])); out: @@ -4773,7 +4825,7 @@ static int nl80211_register_action(struct sk_buff *skb, struct genl_info *info) return err; } -static int nl80211_action(struct sk_buff *skb, struct genl_info *info) +static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; struct net_device *dev; @@ -4796,7 +4848,7 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; - if (!rdev->ops->action) { + if (!rdev->ops->mgmt_tx) { err = -EOPNOTSUPP; goto out; } @@ -4839,17 +4891,17 @@ static int nl80211_action(struct sk_buff *skb, struct genl_info *info) } hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, - NL80211_CMD_ACTION); + NL80211_CMD_FRAME); if (IS_ERR(hdr)) { err = PTR_ERR(hdr); goto free_msg; } - err = cfg80211_mlme_action(rdev, dev, chan, channel_type, - channel_type_valid, - nla_data(info->attrs[NL80211_ATTR_FRAME]), - nla_len(info->attrs[NL80211_ATTR_FRAME]), - &cookie); + err = cfg80211_mlme_mgmt_tx(rdev, dev, chan, channel_type, + channel_type_valid, + nla_data(info->attrs[NL80211_ATTR_FRAME]), + nla_len(info->attrs[NL80211_ATTR_FRAME]), + &cookie); if (err) goto free_msg; @@ -5348,14 +5400,14 @@ static struct genl_ops nl80211_ops[] = { .flags = GENL_ADMIN_PERM, }, { - .cmd = NL80211_CMD_REGISTER_ACTION, - .doit = nl80211_register_action, + .cmd = NL80211_CMD_REGISTER_FRAME, + .doit = nl80211_register_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, { - .cmd = NL80211_CMD_ACTION, - .doit = nl80211_action, + .cmd = NL80211_CMD_FRAME, + .doit = nl80211_tx_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, @@ -6055,9 +6107,9 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, nl80211_mlme_mcgrp.id, gfp); } -int nl80211_send_action(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u32 nlpid, - int freq, const u8 *buf, size_t len, gfp_t gfp) +int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 nlpid, + int freq, const u8 *buf, size_t len, gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -6067,7 +6119,7 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev, if (!msg) return -ENOMEM; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION); + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME); if (!hdr) { nlmsg_free(msg); return -ENOMEM; @@ -6095,10 +6147,10 @@ int nl80211_send_action(struct cfg80211_registered_device *rdev, return -ENOBUFS; } -void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u64 cookie, - const u8 *buf, size_t len, bool ack, - gfp_t gfp) +void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp) { struct sk_buff *msg; void *hdr; @@ -6107,7 +6159,7 @@ void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, if (!msg) return; - hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_ACTION_TX_STATUS); + hdr = nl80211hdr_put(msg, 0, 0, 0, NL80211_CMD_FRAME_TX_STATUS); if (!hdr) { nlmsg_free(msg); return; @@ -6194,7 +6246,7 @@ static int nl80211_netlink_notify(struct notifier_block * nb, list_for_each_entry_rcu(rdev, &cfg80211_rdev_list, list) list_for_each_entry_rcu(wdev, &rdev->netdev_list, list) - cfg80211_mlme_unregister_actions(wdev, notify->pid); + cfg80211_mlme_unregister_socket(wdev, notify->pid); rcu_read_unlock(); diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 2ad7fbc..30d2f93 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -74,13 +74,13 @@ void nl80211_send_sta_event(struct cfg80211_registered_device *rdev, struct net_device *dev, const u8 *mac_addr, struct station_info *sinfo, gfp_t gfp); -int nl80211_send_action(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u32 nlpid, int freq, - const u8 *buf, size_t len, gfp_t gfp); -void nl80211_send_action_tx_status(struct cfg80211_registered_device *rdev, - struct net_device *netdev, u64 cookie, - const u8 *buf, size_t len, bool ack, - gfp_t gfp); +int nl80211_send_mgmt(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u32 nlpid, int freq, + const u8 *buf, size_t len, gfp_t gfp); +void nl80211_send_mgmt_tx_status(struct cfg80211_registered_device *rdev, + struct net_device *netdev, u64 cookie, + const u8 *buf, size_t len, bool ack, + gfp_t gfp); void nl80211_send_cqm_rssi_notify(struct cfg80211_registered_device *rdev, diff --git a/net/wireless/util.c b/net/wireless/util.c index 1eb2416..8d961cc 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -823,7 +823,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* monitor can't bridge anyway */ break; case NL80211_IFTYPE_UNSPECIFIED: - case __NL80211_IFTYPE_AFTER_LAST: + case NUM_NL80211_IFTYPES: /* not happening */ break; } -- cgit v1.1 From ec550d246e38e1b4ea8604b5c71ccb72e38f3290 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 24 Aug 2010 14:45:09 -0700 Subject: net: ip_append_data() optim Compiler is not smart enough to avoid a conditional branch. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 6d2753c..e427620 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -837,10 +837,9 @@ int ip_append_data(struct sock *sk, inet->cork.length = 0; sk->sk_sndmsg_page = NULL; sk->sk_sndmsg_off = 0; - if ((exthdrlen = rt->dst.header_len) != 0) { - length += exthdrlen; - transhdrlen += exthdrlen; - } + exthdrlen = rt->dst.header_len; + length += exthdrlen; + transhdrlen += exthdrlen; } else { rt = (struct rtable *)inet->cork.dst; if (inet->cork.flags & IPCORK_OPT) -- cgit v1.1 From c2e3143e3c46ede22336316b3ff4746727c0d93a Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Aug 2010 14:48:10 -0700 Subject: tc: add meta match on receive hash Trivial extension to existing meta data match rules to allow matching on skb receive hash value. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/sched/em_meta.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index 3bcac8a..34da5e2 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -223,6 +223,11 @@ META_COLLECTOR(int_maclen) dst->value = skb->mac_len; } +META_COLLECTOR(int_rxhash) +{ + dst->value = skb_get_rxhash(skb); +} + /************************************************************************** * Netfilter **************************************************************************/ @@ -541,6 +546,7 @@ static struct meta_ops __meta_ops[TCF_META_TYPE_MAX+1][TCF_META_ID_MAX+1] = { [META_ID(SK_SENDMSG_OFF)] = META_FUNC(int_sk_sendmsg_off), [META_ID(SK_WRITE_PENDING)] = META_FUNC(int_sk_write_pend), [META_ID(VLAN_TAG)] = META_FUNC(int_vlan_tag), + [META_ID(RXHASH)] = META_FUNC(int_rxhash), } }; -- cgit v1.1 From 0fb9a9ec27718fbf7fa3153bc94becefb716ceeb Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 20 Aug 2010 16:25:38 -0700 Subject: net/mac80211: Use wiphy_ Standardize logging messages from printk(KERN_ "%s: " fmt , wiphy_name(foo), args); to wiphy_(foo, fmt, args); Signed-off-by: Joe Perches Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 6 +++--- net/mac80211/debugfs.c | 6 ++++-- net/mac80211/ibss.c | 4 ++-- net/mac80211/iface.c | 6 ++---- net/mac80211/key.c | 14 ++++++-------- net/mac80211/main.c | 15 +++++++-------- net/mac80211/mlme.c | 17 +++++++++-------- net/mac80211/rate.c | 9 ++++----- net/mac80211/rx.c | 13 +++++-------- net/mac80211/sta_info.c | 21 +++++++++------------ net/mac80211/status.c | 9 ++++----- net/mac80211/tx.c | 8 ++++---- 12 files changed, 59 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 94787d2..7693ebc 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1123,9 +1123,9 @@ static int ieee80211_set_txq_params(struct wiphy *wiphy, p.uapsd = false; if (drv_conf_tx(local, params->queue, &p)) { - printk(KERN_DEBUG "%s: failed to set TX queue " - "parameters for queue %d\n", - wiphy_name(local->hw.wiphy), params->queue); + wiphy_debug(local->hw.wiphy, + "failed to set TX queue parameters for queue %d\n", + params->queue); return -EINVAL; } diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index a694c59..e81ef4e 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -85,13 +85,15 @@ static ssize_t tsf_write(struct file *file, if (strncmp(buf, "reset", 5) == 0) { if (local->ops->reset_tsf) { drv_reset_tsf(local); - printk(KERN_INFO "%s: debugfs reset TSF\n", wiphy_name(local->hw.wiphy)); + wiphy_info(local->hw.wiphy, "debugfs reset TSF\n"); } } else { tsf = simple_strtoul(buf, NULL, 0); if (local->ops->set_tsf) { drv_set_tsf(local, tsf); - printk(KERN_INFO "%s: debugfs set TSF to %#018llx\n", wiphy_name(local->hw.wiphy), tsf); + wiphy_info(local->hw.wiphy, + "debugfs set TSF to %#018llx\n", tsf); + } } diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 32af971..1a3aae5 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -427,8 +427,8 @@ struct sta_info *ieee80211_ibss_add_sta(struct ieee80211_sub_if_data *sdata, return NULL; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Adding new IBSS station %pM (dev=%s)\n", - wiphy_name(local->hw.wiphy), addr, sdata->name); + wiphy_debug(local->hw.wiphy, "Adding new IBSS station %pM (dev=%s)\n", + addr, sdata->name); #endif sta = sta_info_alloc(sdata, addr, gfp); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 86f434f..9369710 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -1175,8 +1175,7 @@ static u32 ieee80211_idle_off(struct ieee80211_local *local, return 0; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: device no longer idle - %s\n", - wiphy_name(local->hw.wiphy), reason); + wiphy_debug(local->hw.wiphy, "device no longer idle - %s\n", reason); #endif local->hw.conf.flags &= ~IEEE80211_CONF_IDLE; @@ -1189,8 +1188,7 @@ static u32 ieee80211_idle_on(struct ieee80211_local *local) return 0; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: device now idle\n", - wiphy_name(local->hw.wiphy)); + wiphy_debug(local->hw.wiphy, "device now idle\n"); #endif drv_flush(local, false); diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 9c27c53..2ce2dbb 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -87,10 +87,9 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; if (ret && ret != -ENOSPC && ret != -EOPNOTSUPP) - printk(KERN_ERR "mac80211-%s: failed to set key " - "(%d, %pM) to hardware (%d)\n", - wiphy_name(key->local->hw.wiphy), - key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + wiphy_err(key->local->hw.wiphy, + "failed to set key (%d, %pM) to hardware (%d)\n", + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) @@ -121,10 +120,9 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) sta, &key->conf); if (ret) - printk(KERN_ERR "mac80211-%s: failed to remove key " - "(%d, %pM) from hardware (%d)\n", - wiphy_name(key->local->hw.wiphy), - key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + wiphy_err(key->local->hw.wiphy, + "failed to remove key (%d, %pM) from hardware (%d)\n", + key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a53feac..5756fba 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -713,16 +713,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) result = ieee80211_wep_init(local); if (result < 0) - printk(KERN_DEBUG "%s: Failed to initialize wep: %d\n", - wiphy_name(local->hw.wiphy), result); + wiphy_debug(local->hw.wiphy, "Failed to initialize wep: %d\n", + result); rtnl_lock(); result = ieee80211_init_rate_ctrl_alg(local, hw->rate_control_algorithm); if (result < 0) { - printk(KERN_DEBUG "%s: Failed to initialize rate control " - "algorithm\n", wiphy_name(local->hw.wiphy)); + wiphy_debug(local->hw.wiphy, + "Failed to initialize rate control algorithm\n"); goto fail_rate; } @@ -731,8 +731,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) result = ieee80211_if_add(local, "wlan%d", NULL, NL80211_IFTYPE_STATION, NULL); if (result) - printk(KERN_WARNING "%s: Failed to add default virtual iface\n", - wiphy_name(local->hw.wiphy)); + wiphy_warn(local->hw.wiphy, + "Failed to add default virtual iface\n"); } rtnl_unlock(); @@ -815,8 +815,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) if (skb_queue_len(&local->skb_queue) || skb_queue_len(&local->skb_queue_unreliable)) - printk(KERN_WARNING "%s: skb_queue not empty\n", - wiphy_name(local->hw.wiphy)); + wiphy_warn(local->hw.wiphy, "skb_queue not empty\n"); skb_queue_purge(&local->skb_queue); skb_queue_purge(&local->skb_queue_unreliable); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 38996a4..5282ac1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -778,16 +778,17 @@ static void ieee80211_sta_wmm_params(struct ieee80211_local *local, params.uapsd = uapsd; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: WMM queue=%d aci=%d acm=%d aifs=%d " - "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", - wiphy_name(local->hw.wiphy), queue, aci, acm, - params.aifs, params.cw_min, params.cw_max, params.txop, - params.uapsd); + wiphy_debug(local->hw.wiphy, + "WMM queue=%d aci=%d acm=%d aifs=%d " + "cWmin=%d cWmax=%d txop=%d uapsd=%d\n", + queue, aci, acm, + params.aifs, params.cw_min, params.cw_max, + params.txop, params.uapsd); #endif if (drv_conf_tx(local, queue, ¶ms)) - printk(KERN_DEBUG "%s: failed to set TX queue " - "parameters for queue %d\n", - wiphy_name(local->hw.wiphy), queue); + wiphy_debug(local->hw.wiphy, + "failed to set TX queue parameters for queue %d\n", + queue); } /* enable WMM or activate new settings */ diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 6d0bd19..f77a456 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -366,8 +366,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, ref = rate_control_alloc(name, local); if (!ref) { - printk(KERN_WARNING "%s: Failed to select rate control " - "algorithm\n", wiphy_name(local->hw.wiphy)); + wiphy_warn(local->hw.wiphy, + "Failed to select rate control algorithm\n"); return -ENOENT; } @@ -378,9 +378,8 @@ int ieee80211_init_rate_ctrl_alg(struct ieee80211_local *local, sta_info_flush(local, NULL); } - printk(KERN_DEBUG "%s: Selected rate control " - "algorithm '%s'\n", wiphy_name(local->hw.wiphy), - ref->ops->name); + wiphy_debug(local->hw.wiphy, "Selected rate control algorithm '%s'\n", + ref->ops->name); return 0; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index aa41e38..e1844f7 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -605,10 +605,8 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, #ifdef CONFIG_MAC80211_HT_DEBUG if (net_ratelimit()) - printk(KERN_DEBUG "%s: release an RX reorder " - "frame due to timeout on earlier " - "frames\n", - wiphy_name(hw->wiphy)); + wiphy_debug(hw->wiphy, + "release an RX reorder frame due to timeout on earlier frames\n"); #endif ieee80211_release_reorder_frame(hw, tid_agg_rx, j, frames); @@ -2698,10 +2696,9 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, skb_new = skb_copy(skb, GFP_ATOMIC); if (!skb_new) { if (net_ratelimit()) - printk(KERN_DEBUG "%s: failed to copy " - "multicast frame for %s\n", - wiphy_name(local->hw.wiphy), - prev->name); + wiphy_debug(local->hw.wiphy, + "failed to copy multicast frame for %s\n", + prev->name); goto next; } ieee80211_invoke_rx_handlers(prev, &rx, skb_new); diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 6d86f0c..687077e 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -174,8 +174,7 @@ static void __sta_info_free(struct ieee80211_local *local, } #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Destroyed STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Destroyed STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ kfree(sta); @@ -262,8 +261,7 @@ struct sta_info *sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->last_seq_ctrl[i] = cpu_to_le16(USHRT_MAX); #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Allocated STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Allocated STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ #ifdef CONFIG_MAC80211_MESH @@ -300,8 +298,9 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async) sta->uploaded = true; #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (async) - printk(KERN_DEBUG "%s: Finished adding IBSS STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, + "Finished adding IBSS STA %pM\n", + sta->sta.addr); #endif } @@ -411,8 +410,8 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) spin_unlock_irqrestore(&local->sta_lock, flags); #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Added IBSS STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Added IBSS STA %pM\n", + sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ ieee80211_queue_work(&local->hw, &local->sta_finish_work); @@ -459,8 +458,7 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU) } #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Inserted STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Inserted STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ /* move reference to rcu-protected */ @@ -690,8 +688,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta) #endif #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "%s: Removed STA %pM\n", - wiphy_name(local->hw.wiphy), sta->sta.addr); + wiphy_debug(local->hw.wiphy, "Removed STA %pM\n", sta->sta.addr); #endif /* CONFIG_MAC80211_VERBOSE_DEBUG */ cancel_work_sync(&sta->drv_unblock_wk); diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 67a3584..571b32b 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -114,11 +114,10 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, #ifdef CONFIG_MAC80211_VERBOSE_DEBUG if (net_ratelimit()) - printk(KERN_DEBUG "%s: dropped TX filtered frame, " - "queue_len=%d PS=%d @%lu\n", - wiphy_name(local->hw.wiphy), - skb_queue_len(&sta->tx_filtered), - !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies); + wiphy_debug(local->hw.wiphy, + "dropped TX filtered frame, queue_len=%d PS=%d @%lu\n", + skb_queue_len(&sta->tx_filtered), + !!test_sta_flags(sta, WLAN_STA_PS_STA), jiffies); #endif dev_kfree_skb(skb); } diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index bc4fefc..d51ec74 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -351,8 +351,8 @@ static void purge_old_ps_buffers(struct ieee80211_local *local) local->total_ps_buffered = total; #ifdef CONFIG_MAC80211_VERBOSE_PS_DEBUG - printk(KERN_DEBUG "%s: PS buffers full - purged %d frames\n", - wiphy_name(local->hw.wiphy), purged); + wiphy_debug(local->hw.wiphy, "PS buffers full - purged %d frames\n", + purged); #endif } @@ -1513,8 +1513,8 @@ static int ieee80211_skb_resize(struct ieee80211_local *local, I802_DEBUG_INC(local->tx_expand_skb_head); if (pskb_expand_head(skb, head_need, tail_need, GFP_ATOMIC)) { - printk(KERN_DEBUG "%s: failed to reallocate TX buffer\n", - wiphy_name(local->hw.wiphy)); + wiphy_debug(local->hw.wiphy, + "failed to reallocate TX buffer\n"); return -ENOMEM; } -- cgit v1.1 From ff67bb86d448c26cb9110e9681669dc4a8aa5e0a Mon Sep 17 00:00:00 2001 From: Wey-Yi Guy Date: Sat, 21 Aug 2010 07:23:29 -0700 Subject: mac80211: fix warning for un-used parameter mesh_hdr only used when CONFIG_MAC80211_MESH is defined Signed-off-by: Wey-Yi Guy Signed-off-by: John W. Linville --- net/mac80211/tx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index d51ec74..a6ac9fd 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1701,7 +1701,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, u16 ethertype, hdrlen, meshhdrlen = 0; __le16 fc; struct ieee80211_hdr hdr; - struct ieee80211s_hdr mesh_hdr; + struct ieee80211s_hdr mesh_hdr __maybe_unused; const u8 *encaps_data; int encaps_len, skip_header_bytes; int nh_pos, h_pos; -- cgit v1.1 From 258086a48b766d12a500f98834654ffa927ca475 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Sun, 22 Aug 2010 23:48:25 +0200 Subject: mac80211: cancel restart_work in ieee80211_unregister_hw Unlike most other workqueue-tasks, the restart_work is not scheduled onto mac80211's private per-interface workqueue, but onto one of the system-wide workqueues. Therefore the mac80211-stack has to cancel any pending restarts, before destroying the shared device context and handing back the memory. Otherwise - under very unlucky circumstances - there could be a stale work- item left, because some other kernel component might have delayed the execution of ieee80211_restart_work for too long. Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/main.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 5756fba..28415de 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -807,6 +807,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw) rtnl_unlock(); + cancel_work_sync(&local->restart_work); cancel_work_sync(&local->reconfig_filter); ieee80211_clear_tx_pending(local); -- cgit v1.1 From 74b70a4e38d542843fccfb367dce1ac861cc3890 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 24 Aug 2010 12:15:53 +0200 Subject: nl80211: fix missing nesting commit 95a6ccbb46c70cff376684c752831c014c87029d Author: Johannes Berg Date: Thu Aug 12 15:38:38 2010 +0200 cfg80211/mac80211: extensible frame processing introduced a netlink bug that caused parsing errors in userspace because it forgot to close a nesting, which would advertise a nesting length of zero to userspace, which then completely threw off parsing and led to Illegal nla->nla_type == 0 being printed by libnl. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 927ffbd..49f5ca3 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -636,6 +636,8 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, nla_nest_end(msg, nl_ftypes); } + nla_nest_end(msg, nl_ifs); + nl_ifs = nla_nest_start(msg, NL80211_ATTR_RX_FRAME_TYPES); if (!nl_ifs) goto nla_put_failure; -- cgit v1.1 From 2c15a0cf27a74213a714cc7be31685b841f7c1ac Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 24 Aug 2010 19:22:42 +0200 Subject: mac80211: fix rcu-unsafe pointer dereference This patch fixes a potential crash (null-pointer de- reference) which was introduced in my previous patch: "mac80211: AMPDU rx reorder timeout timer" During a BA teardown, the pointer to the soon-to-be-gone tid_ampdu_rx element will be nullified. Therefore the release timer mechanism has to be careful not to accidentally access the item without any RCU protection. Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/rx.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e1844f7..e67deb4 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2479,6 +2479,11 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) { struct sk_buff_head frames; struct ieee80211_rx_data rx = { }; + struct tid_ampdu_rx *tid_agg_rx; + + tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); + if (!tid_agg_rx) + return; __skb_queue_head_init(&frames); @@ -2493,10 +2498,9 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) test_bit(SCAN_OFF_CHANNEL, &sta->local->scanning))) rx.flags |= IEEE80211_RX_IN_SCAN; - spin_lock(&sta->ampdu_mlme.tid_rx[tid]->reorder_lock); - ieee80211_sta_reorder_release(&sta->local->hw, - sta->ampdu_mlme.tid_rx[tid], &frames); - spin_unlock(&sta->ampdu_mlme.tid_rx[tid]->reorder_lock); + spin_lock(&tid_agg_rx->reorder_lock); + ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames); + spin_unlock(&tid_agg_rx->reorder_lock); ieee80211_rx_handlers(&rx, &frames); } -- cgit v1.1 From 5eb5a52da6ef04604cf8faca43ec670f69f417d3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 25 Aug 2010 14:34:01 +0200 Subject: mac80211: fix mesh advertisement When a mac80211-based driver advertises mesh mode support, this will be advertised to userspace. However, if mac80211 was compiled without mesh support, then that won't actually be true. Fix this by removing the bit for mesh if mesh isn't compiled in. Since this synchronizes what we advertise to cfg80211 and actually support, it means we can now rely on cfg80211's interface type checks and need not check again in mac80211. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 33 --------------------------------- net/mac80211/main.c | 5 +++++ 2 files changed, 5 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 7693ebc..5814382 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -19,33 +19,6 @@ #include "rate.h" #include "mesh.h" -static bool nl80211_type_check(enum nl80211_iftype type) -{ - switch (type) { - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_MONITOR: -#ifdef CONFIG_MAC80211_MESH - case NL80211_IFTYPE_MESH_POINT: -#endif - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_WDS: - return true; - default: - return false; - } -} - -static bool nl80211_params_check(enum nl80211_iftype type, - struct vif_params *params) -{ - if (!nl80211_type_check(type)) - return false; - - return true; -} - static int ieee80211_add_iface(struct wiphy *wiphy, char *name, enum nl80211_iftype type, u32 *flags, struct vif_params *params) @@ -55,9 +28,6 @@ static int ieee80211_add_iface(struct wiphy *wiphy, char *name, struct ieee80211_sub_if_data *sdata; int err; - if (!nl80211_params_check(type, params)) - return -EINVAL; - err = ieee80211_if_add(local, name, &dev, type, params); if (err || type != NL80211_IFTYPE_MONITOR || !flags) return err; @@ -85,9 +55,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, if (ieee80211_sdata_running(sdata)) return -EBUSY; - if (!nl80211_params_check(type, params)) - return -EINVAL; - ret = ieee80211_if_change_type(sdata, type); if (ret) return ret; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 28415de..80db5ea 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -622,6 +622,11 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) /* mac80211 always supports monitor */ local->hw.wiphy->interface_modes |= BIT(NL80211_IFTYPE_MONITOR); +#ifndef CONFIG_MAC80211_MESH + /* mesh depends on Kconfig, but drivers should set it if they want */ + local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT); +#endif + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) -- cgit v1.1 From b2aff96327545aa5ceb25e3116be69c8b06de703 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Tue, 24 Aug 2010 04:39:49 +0000 Subject: net/netfilter/ipvs: Eliminate memory leak __ip_vs_service_get and __ip_vs_svc_fwm_get increment a reference count, so that reference count should be decremented before leaving the function in an error case. A simplified version of the semantic match that finds this problem is: (http://coccinelle.lip6.fr/) // @r exists@ local idexpression x; expression E; identifier f1; iterator I; @@ x = __ip_vs_service_get(...); <... when != x when != true (x == NULL || ...) when != if (...) { <+...x...+> } when != I (...) { <+...x...+> } ( x == NULL | x == E | x->f1 ) ...> * return ...; // Signed-off-by: Julia Lawall Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0f0c079..53a88af 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2155,7 +2155,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; - goto out_unlock; + goto out_drop_service; } switch (cmd) { @@ -2189,6 +2189,7 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; } +out_drop_service: if (svc) ip_vs_service_put(svc); -- cgit v1.1 From 944c794d6437e5301c8769cb242c2b919a5acf59 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 24 Aug 2010 13:08:10 +0000 Subject: bridge: fix locking comment The carrier check is not called from work queue in current code. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_if.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index c03d2c3..1707945 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -61,11 +61,7 @@ static int port_cost(struct net_device *dev) } -/* - * Check for port carrier transistions. - * Called from work queue to allow for calling functions that - * might sleep (such as speed check), and to debounce. - */ +/* Check for port carrier transistions. */ void br_port_carrier_check(struct net_bridge_port *p) { struct net_device *dev = p->dev; -- cgit v1.1 From aa7c6e5fa08bb5014b6432a690d28748f11e93fc Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 24 Aug 2010 13:12:56 +0000 Subject: bridge: avoid ethtool on non running interface If bridge port is offline, don't call ethtool to query speed. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_if.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 1707945..89ad25a 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -67,20 +67,21 @@ void br_port_carrier_check(struct net_bridge_port *p) struct net_device *dev = p->dev; struct net_bridge *br = p->br; - if (netif_carrier_ok(dev)) + if (netif_running(dev) && netif_carrier_ok(dev)) p->path_cost = port_cost(dev); - if (netif_running(br->dev)) { - spin_lock_bh(&br->lock); - if (netif_carrier_ok(dev)) { - if (p->state == BR_STATE_DISABLED) - br_stp_enable_port(p); - } else { - if (p->state != BR_STATE_DISABLED) - br_stp_disable_port(p); - } - spin_unlock_bh(&br->lock); + if (!netif_running(br->dev)) + return; + + spin_lock_bh(&br->lock); + if (netif_running(dev) && netif_carrier_ok(dev)) { + if (p->state == BR_STATE_DISABLED) + br_stp_enable_port(p); + } else { + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); } + spin_unlock_bh(&br->lock); } static void release_nbp(struct kobject *kobj) -- cgit v1.1 From 8870f8427b8fe30b2684b9e569e5ce038faf41ac Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 26 Aug 2010 13:21:26 -0700 Subject: IPVS: ICMPv6 checksum calculation Cc: Xiaoyu Du Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_core.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 4f8ddba4..69661db 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -637,10 +637,12 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, } /* And finally the ICMP checksum */ - icmph->icmp6_cksum = 0; - /* TODO IPv6: is this correct for ICMPv6? */ - ip_vs_checksum_complete(skb, icmp_offset); - skb->ip_summed = CHECKSUM_UNNECESSARY; + icmph->icmp6_cksum = ~csum_ipv6_magic(&iph->saddr, &iph->daddr, + skb->len - icmp_offset, + IPPROTO_ICMPV6, 0); + skb->csum_start = skb_network_header(skb) - skb->head + icmp_offset; + skb->csum_offset = offsetof(struct icmp6hdr, icmp6_cksum); + skb->ip_summed = CHECKSUM_PARTIAL; if (inout) IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, -- cgit v1.1 From bd144550489270a32a531e84a2b4bb6882096236 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 26 Aug 2010 02:54:29 +0000 Subject: IPVS: convert __ip_vs_sched_lock to a spinlock Also rename __ip_vs_sched_lock to ip_vs_sched_lock. Acked-by: Eric Dumazet Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_sched.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index bbc1ac7..727e45b 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -35,7 +35,7 @@ static LIST_HEAD(ip_vs_schedulers); /* lock for service table */ -static DEFINE_RWLOCK(__ip_vs_sched_lock); +static DEFINE_SPINLOCK(ip_vs_sched_lock); /* @@ -108,7 +108,7 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) IP_VS_DBG(2, "%s(): sched_name \"%s\"\n", __func__, sched_name); - read_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); list_for_each_entry(sched, &ip_vs_schedulers, n_list) { /* @@ -122,14 +122,14 @@ static struct ip_vs_scheduler *ip_vs_sched_getbyname(const char *sched_name) } if (strcmp(sched_name, sched->name)==0) { /* HIT */ - read_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); return sched; } if (sched->module) module_put(sched->module); } - read_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); return NULL; } @@ -184,10 +184,10 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) /* increase the module use count */ ip_vs_use_count_inc(); - write_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); if (!list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already linked\n", __func__, scheduler->name); @@ -200,7 +200,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) */ list_for_each_entry(sched, &ip_vs_schedulers, n_list) { if (strcmp(scheduler->name, sched->name) == 0) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); ip_vs_use_count_dec(); pr_err("%s(): [%s] scheduler already existed " "in the system\n", __func__, scheduler->name); @@ -211,7 +211,7 @@ int register_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Add it into the d-linked scheduler list */ list_add(&scheduler->n_list, &ip_vs_schedulers); - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); pr_info("[%s] scheduler registered.\n", scheduler->name); @@ -229,9 +229,9 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) return -EINVAL; } - write_lock_bh(&__ip_vs_sched_lock); + spin_lock_bh(&ip_vs_sched_lock); if (list_empty(&scheduler->n_list)) { - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); pr_err("%s(): [%s] scheduler is not in the list. failed\n", __func__, scheduler->name); return -EINVAL; @@ -241,7 +241,7 @@ int unregister_ip_vs_scheduler(struct ip_vs_scheduler *scheduler) * Remove it from the d-linked scheduler list */ list_del(&scheduler->n_list); - write_unlock_bh(&__ip_vs_sched_lock); + spin_unlock_bh(&ip_vs_sched_lock); /* decrease the module use count */ ip_vs_use_count_dec(); -- cgit v1.1 From 4f72816ef07fdf49338ee0e6764a3961d552994a Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 26 Aug 2010 02:54:30 +0000 Subject: IPVS: convert __ip_vs_securetcp_lock to a spinlock Also rename __ip_vs_securetcp_lock to ip_vs_securetcp_lock. Spinlock conversion was suggested by Eric Dumazet. Acked-by: Eric Dumazet Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 53a88af..8ca1a47 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -61,7 +61,7 @@ static DEFINE_RWLOCK(__ip_vs_svc_lock); static DEFINE_RWLOCK(__ip_vs_rs_lock); /* lock for state and timeout tables */ -static DEFINE_RWLOCK(__ip_vs_securetcp_lock); +static DEFINE_SPINLOCK(ip_vs_securetcp_lock); /* lock for drop entry handling */ static DEFINE_SPINLOCK(__ip_vs_dropentry_lock); @@ -204,7 +204,7 @@ static void update_defense_level(void) spin_unlock(&__ip_vs_droppacket_lock); /* secure_tcp */ - write_lock(&__ip_vs_securetcp_lock); + spin_lock(&ip_vs_securetcp_lock); switch (sysctl_ip_vs_secure_tcp) { case 0: if (old_secure_tcp >= 2) @@ -238,7 +238,7 @@ static void update_defense_level(void) old_secure_tcp = sysctl_ip_vs_secure_tcp; if (to_change >= 0) ip_vs_protocol_timeout_change(sysctl_ip_vs_secure_tcp>1); - write_unlock(&__ip_vs_securetcp_lock); + spin_unlock(&ip_vs_securetcp_lock); local_bh_enable(); } -- cgit v1.1 From dee06e4702b5a64b9ca81e650d66223c5b3e7f14 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Thu, 26 Aug 2010 02:54:31 +0000 Subject: ipvs: switch to GFP_KERNEL allocations Switch from GFP_ATOMIC allocations to GFP_KERNEL ones in ip_vs_add_service() and ip_vs_new_dest(), as we hold a mutex and are allowed to sleep in this context. Signed-off-by: Eric Dumazet Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_ctl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 8ca1a47..ca8ec8c 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -843,7 +843,7 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, return -EINVAL; } - dest = kzalloc(sizeof(struct ip_vs_dest), GFP_ATOMIC); + dest = kzalloc(sizeof(struct ip_vs_dest), GFP_KERNEL); if (dest == NULL) { pr_err("%s(): no memory.\n", __func__); return -ENOMEM; @@ -1177,7 +1177,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } #endif - svc = kzalloc(sizeof(struct ip_vs_service), GFP_ATOMIC); + svc = kzalloc(sizeof(struct ip_vs_service), GFP_KERNEL); if (svc == NULL) { IP_VS_DBG(1, "%s(): no memory\n", __func__); ret = -ENOMEM; -- cgit v1.1 From 145ce502e44b57c074c72cfdc855557e19026999 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 24 Aug 2010 13:21:08 +0000 Subject: net/sctp: Use pr_fmt and pr_ Change SCTP_DEBUG_PRINTK and SCTP_DEBUG_PRINTK_IPADDR to use do { print } while (0) guards. Add SCTP_DEBUG_PRINTK_CONT to fix errors in log when lines were continued. Add #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt Add a missing newline in "Failed bind hash alloc" Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/sctp/associola.c | 2 ++ net/sctp/chunk.c | 2 ++ net/sctp/inqueue.c | 2 ++ net/sctp/ipv6.c | 4 +++- net/sctp/objcnt.c | 5 +++-- net/sctp/output.c | 2 ++ net/sctp/outqueue.c | 34 ++++++++++++++++++---------------- net/sctp/probe.c | 4 +++- net/sctp/protocol.c | 17 ++++++++--------- net/sctp/sm_make_chunk.c | 2 ++ net/sctp/sm_sideeffect.c | 21 ++++++++++----------- net/sctp/sm_statefuns.c | 20 ++++++++++---------- net/sctp/sm_statetable.c | 42 +++++++++++++++++++++--------------------- net/sctp/socket.c | 39 +++++++++++++++------------------------ net/sctp/transport.c | 9 +++++---- 15 files changed, 106 insertions(+), 99 deletions(-) (limited to 'net') diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 0b85e52..5f1fb8b 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -48,6 +48,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c index 476caaf..6c85564 100644 --- a/net/sctp/chunk.c +++ b/net/sctp/chunk.c @@ -37,6 +37,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/sctp/inqueue.c b/net/sctp/inqueue.c index ccb6dc4..397296f 100644 --- a/net/sctp/inqueue.c +++ b/net/sctp/inqueue.c @@ -43,6 +43,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7326891..95e0c8e 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -47,6 +47,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -336,7 +338,7 @@ static void sctp_v6_get_saddr(struct sctp_sock *sk, memcpy(saddr, baddr, sizeof(union sctp_addr)); SCTP_DEBUG_PRINTK("saddr: %pI6\n", &saddr->v6.sin6_addr); } else { - printk(KERN_ERR "%s: asoc:%p Could not find a valid source " + pr_err("%s: asoc:%p Could not find a valid source " "address for the dest:%pI6\n", __func__, asoc, &daddr->v6.sin6_addr); } diff --git a/net/sctp/objcnt.c b/net/sctp/objcnt.c index f73ec0e..8ef8e7d 100644 --- a/net/sctp/objcnt.c +++ b/net/sctp/objcnt.c @@ -38,6 +38,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include @@ -134,8 +136,7 @@ void sctp_dbg_objcnt_init(void) ent = proc_create("sctp_dbg_objcnt", 0, proc_net_sctp, &sctp_objcnt_ops); if (!ent) - printk(KERN_WARNING - "sctp_dbg_objcnt: Unable to create /proc entry.\n"); + pr_warn("sctp_dbg_objcnt: Unable to create /proc entry.\n"); } /* Cleanup the objcount entry in the proc filesystem. */ diff --git a/net/sctp/output.c b/net/sctp/output.c index a646681..901764b 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -41,6 +41,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c index c04b2eb..8c6d379 100644 --- a/net/sctp/outqueue.c +++ b/net/sctp/outqueue.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include /* For struct list_head */ #include @@ -1463,23 +1465,23 @@ static void sctp_check_transmitted(struct sctp_outq *q, /* Display the end of the * current range. */ - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); } /* Start a new range. */ - SCTP_DEBUG_PRINTK(",%08x", tsn); + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); dbg_ack_tsn = tsn; break; case 1: /* The last TSN was NOT ACKed. */ if (dbg_last_kept_tsn != dbg_kept_tsn) { /* Display the end of current range. */ - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); } - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); /* FALL THROUGH... */ default: @@ -1526,18 +1528,18 @@ static void sctp_check_transmitted(struct sctp_outq *q, break; if (dbg_last_kept_tsn != dbg_kept_tsn) - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_kept_tsn); - SCTP_DEBUG_PRINTK(",%08x", tsn); + SCTP_DEBUG_PRINTK_CONT(",%08x", tsn); dbg_kept_tsn = tsn; break; case 0: if (dbg_last_ack_tsn != dbg_ack_tsn) - SCTP_DEBUG_PRINTK("-%08x", - dbg_last_ack_tsn); - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("-%08x", + dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("\n"); /* FALL THROUGH... */ default: @@ -1556,17 +1558,17 @@ static void sctp_check_transmitted(struct sctp_outq *q, switch (dbg_prt_state) { case 0: if (dbg_last_ack_tsn != dbg_ack_tsn) { - SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_ack_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_ack_tsn); } else { - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); } break; case 1: if (dbg_last_kept_tsn != dbg_kept_tsn) { - SCTP_DEBUG_PRINTK("-%08x\n", dbg_last_kept_tsn); + SCTP_DEBUG_PRINTK_CONT("-%08x\n", dbg_last_kept_tsn); } else { - SCTP_DEBUG_PRINTK("\n"); + SCTP_DEBUG_PRINTK_CONT("\n"); } } #endif /* SCTP_DEBUG */ diff --git a/net/sctp/probe.c b/net/sctp/probe.c index db3a42b..2e63e9d 100644 --- a/net/sctp/probe.c +++ b/net/sctp/probe.c @@ -22,6 +22,8 @@ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -192,7 +194,7 @@ static __init int sctpprobe_init(void) if (ret) goto remove_proc; - pr_info("SCTP probe registered (port=%d)\n", port); + pr_info("probe registered (port=%d)\n", port); return 0; diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index 5027b83..f774e65 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -707,8 +709,7 @@ static int sctp_ctl_sock_init(void) &init_net); if (err < 0) { - printk(KERN_ERR - "SCTP: Failed to create the SCTP control socket.\n"); + pr_err("Failed to create the SCTP control socket\n"); return err; } return 0; @@ -1206,7 +1207,7 @@ SCTP_STATIC __init int sctp_init(void) __get_free_pages(GFP_ATOMIC, order); } while (!sctp_assoc_hashtable && --order > 0); if (!sctp_assoc_hashtable) { - printk(KERN_ERR "SCTP: Failed association hash alloc.\n"); + pr_err("Failed association hash alloc\n"); status = -ENOMEM; goto err_ahash_alloc; } @@ -1220,7 +1221,7 @@ SCTP_STATIC __init int sctp_init(void) sctp_ep_hashtable = (struct sctp_hashbucket *) kmalloc(64 * sizeof(struct sctp_hashbucket), GFP_KERNEL); if (!sctp_ep_hashtable) { - printk(KERN_ERR "SCTP: Failed endpoint_hash alloc.\n"); + pr_err("Failed endpoint_hash alloc\n"); status = -ENOMEM; goto err_ehash_alloc; } @@ -1239,7 +1240,7 @@ SCTP_STATIC __init int sctp_init(void) __get_free_pages(GFP_ATOMIC, order); } while (!sctp_port_hashtable && --order > 0); if (!sctp_port_hashtable) { - printk(KERN_ERR "SCTP: Failed bind hash alloc."); + pr_err("Failed bind hash alloc\n"); status = -ENOMEM; goto err_bhash_alloc; } @@ -1248,8 +1249,7 @@ SCTP_STATIC __init int sctp_init(void) INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain); } - printk(KERN_INFO "SCTP: Hash tables configured " - "(established %d bind %d)\n", + pr_info("Hash tables configured (established %d bind %d)\n", sctp_assoc_hashsize, sctp_port_hashsize); /* Disable ADDIP by default. */ @@ -1290,8 +1290,7 @@ SCTP_STATIC __init int sctp_init(void) /* Initialize the control inode/socket for handling OOTB packets. */ if ((status = sctp_ctl_sock_init())) { - printk (KERN_ERR - "SCTP: Failed to initialize the SCTP control sock.\n"); + pr_err("Failed to initialize the SCTP control sock\n"); goto err_ctl_sock_init; } diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 246f929..2cc46f0 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -50,6 +50,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index f5e5e27..b21b218 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -47,6 +47,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1146,26 +1148,23 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype, case SCTP_DISPOSITION_VIOLATION: if (net_ratelimit()) - printk(KERN_ERR "sctp protocol violation state %d " - "chunkid %d\n", state, subtype.chunk); + pr_err("protocol violation state %d chunkid %d\n", + state, subtype.chunk); break; case SCTP_DISPOSITION_NOT_IMPL: - printk(KERN_WARNING "sctp unimplemented feature in state %d, " - "event_type %d, event_id %d\n", - state, event_type, subtype.chunk); + pr_warn("unimplemented feature in state %d, event_type %d, event_id %d\n", + state, event_type, subtype.chunk); break; case SCTP_DISPOSITION_BUG: - printk(KERN_ERR "sctp bug in state %d, " - "event_type %d, event_id %d\n", + pr_err("bug in state %d, event_type %d, event_id %d\n", state, event_type, subtype.chunk); BUG(); break; default: - printk(KERN_ERR "sctp impossible disposition %d " - "in state %d, event_type %d, event_id %d\n", + pr_err("impossible disposition %d in state %d, event_type %d, event_id %d\n", status, state, event_type, subtype.chunk); BUG(); break; @@ -1679,8 +1678,8 @@ static int sctp_cmd_interpreter(sctp_event_t event_type, sctp_cmd_send_asconf(asoc); break; default: - printk(KERN_WARNING "Impossible command: %u, %p\n", - cmd->verb, cmd->obj.ptr); + pr_warn("Impossible command: %u, %p\n", + cmd->verb, cmd->obj.ptr); break; } diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 24b2cd5..8b28443 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -50,6 +50,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -1138,18 +1140,16 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, if (unlikely(!link)) { if (from_addr.sa.sa_family == AF_INET6) { if (net_ratelimit()) - printk(KERN_WARNING - "%s association %p could not find address %pI6\n", - __func__, - asoc, - &from_addr.v6.sin6_addr); + pr_warn("%s association %p could not find address %pI6\n", + __func__, + asoc, + &from_addr.v6.sin6_addr); } else { if (net_ratelimit()) - printk(KERN_WARNING - "%s association %p could not find address %pI4\n", - __func__, - asoc, - &from_addr.v4.sin_addr.s_addr); + pr_warn("%s association %p could not find address %pI4\n", + __func__, + asoc, + &from_addr.v4.sin_addr.s_addr); } return SCTP_DISPOSITION_DISCARD; } diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c index 6d9b3aa..546d4387 100644 --- a/net/sctp/sm_statetable.c +++ b/net/sctp/sm_statetable.c @@ -46,6 +46,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -66,15 +68,19 @@ static const sctp_sm_table_entry_t bug = { .name = "sctp_sf_bug" }; -#define DO_LOOKUP(_max, _type, _table) \ - if ((event_subtype._type > (_max))) { \ - printk(KERN_WARNING \ - "sctp table %p possible attack:" \ - " event %d exceeds max %d\n", \ - _table, event_subtype._type, _max); \ - return &bug; \ - } \ - return &_table[event_subtype._type][(int)state]; +#define DO_LOOKUP(_max, _type, _table) \ +({ \ + const sctp_sm_table_entry_t *rtn; \ + \ + if ((event_subtype._type > (_max))) { \ + pr_warn("table %p possible attack: event %d exceeds max %d\n", \ + _table, event_subtype._type, _max); \ + rtn = &bug; \ + } else \ + rtn = &_table[event_subtype._type][(int)state]; \ + \ + rtn; \ +}) const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, sctp_state_t state, @@ -83,21 +89,15 @@ const sctp_sm_table_entry_t *sctp_sm_lookup_event(sctp_event_t event_type, switch (event_type) { case SCTP_EVENT_T_CHUNK: return sctp_chunk_event_lookup(event_subtype.chunk, state); - break; case SCTP_EVENT_T_TIMEOUT: - DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, - timeout_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_TIMEOUT_MAX, timeout, + timeout_event_table); case SCTP_EVENT_T_OTHER: - DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, other_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_OTHER_MAX, other, + other_event_table); case SCTP_EVENT_T_PRIMITIVE: - DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, - primitive_event_table); - break; - + return DO_LOOKUP(SCTP_EVENT_PRIMITIVE_MAX, primitive, + primitive_event_table); default: /* Yikes! We got an illegal event type. */ return &bug; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index ca44917..f4bec27 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -57,6 +57,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -2458,9 +2460,8 @@ static int sctp_setsockopt_delayed_ack(struct sock *sk, if (params.sack_delay == 0 && params.sack_freq == 0) return 0; } else if (optlen == sizeof(struct sctp_assoc_value)) { - printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " - "in delayed_ack socket option deprecated\n"); - printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n"); + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; @@ -2868,10 +2869,8 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, unsigned int val; if (optlen == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in maxseg socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); if (copy_from_user(&val, optval, optlen)) return -EFAULT; params.assoc_id = 0; @@ -3121,10 +3120,8 @@ static int sctp_setsockopt_maxburst(struct sock *sk, int assoc_id = 0; if (optlen == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in max_burst socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); if (copy_from_user(&val, optval, optlen)) return -EFAULT; } else if (optlen == sizeof(struct sctp_assoc_value)) { @@ -4281,9 +4278,8 @@ static int sctp_getsockopt_delayed_ack(struct sock *sk, int len, if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else if (len == sizeof(struct sctp_assoc_value)) { - printk(KERN_WARNING "SCTP: Use of struct sctp_assoc_value " - "in delayed_ack socket option deprecated\n"); - printk(KERN_WARNING "SCTP: Use struct sctp_sack_info instead\n"); + pr_warn("Use of struct sctp_assoc_value in delayed_ack socket option deprecated\n"); + pr_warn("Use struct sctp_sack_info instead\n"); if (copy_from_user(¶ms, optval, len)) return -EFAULT; } else @@ -4929,10 +4925,8 @@ static int sctp_getsockopt_maxseg(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in maxseg socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in maxseg socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5023,10 +5017,8 @@ static int sctp_getsockopt_maxburst(struct sock *sk, int len, struct sctp_association *asoc; if (len == sizeof(int)) { - printk(KERN_WARNING - "SCTP: Use of int in max_burst socket option deprecated\n"); - printk(KERN_WARNING - "SCTP: Use struct sctp_assoc_value instead\n"); + pr_warn("Use of int in max_burst socket option deprecated\n"); + pr_warn("Use struct sctp_assoc_value instead\n"); params.assoc_id = 0; } else if (len >= sizeof(struct sctp_assoc_value)) { len = sizeof(struct sctp_assoc_value); @@ -5586,8 +5578,7 @@ SCTP_STATIC int sctp_listen_start(struct sock *sk, int backlog) tfm = crypto_alloc_hash(sctp_hmac_alg, 0, CRYPTO_ALG_ASYNC); if (IS_ERR(tfm)) { if (net_ratelimit()) { - printk(KERN_INFO - "SCTP: failed to load transform for %s: %ld\n", + pr_info("failed to load transform for %s: %ld\n", sctp_hmac_alg, PTR_ERR(tfm)); } return -ENOSYS; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 132046c..d3ae493 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -48,6 +48,8 @@ * be incorporated into the next SCTP release. */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + #include #include #include @@ -244,10 +246,9 @@ void sctp_transport_update_pmtu(struct sctp_transport *t, u32 pmtu) struct dst_entry *dst; if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", - __func__, pmtu, - SCTP_DEFAULT_MINSEGMENT); + pr_warn("%s: Reported pmtu %d too low, using default minimum of %d\n", + __func__, pmtu, + SCTP_DEFAULT_MINSEGMENT); /* Use default minimum segment size and disable * pmtu discovery on this transport. */ -- cgit v1.1 From 53f91dc1f76922375ad7957ef29f48986722532d Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 24 Aug 2010 13:32:58 +0000 Subject: net: use scnprintf() to avoid potential buffer overflow strlcpy() returns the total length of the string they tried to create, so we should not use its return value without any check. scnprintf() returns the number of characters written into @buf not including the trailing '\0', so use it instead here. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/ethernet/eth.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 215c839..85e7b45 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -367,7 +367,7 @@ struct net_device *alloc_etherdev_mq(int sizeof_priv, unsigned int queue_count) EXPORT_SYMBOL(alloc_etherdev_mq); static size_t _format_mac_addr(char *buf, int buflen, - const unsigned char *addr, int len) + const unsigned char *addr, int len) { int i; char *cp = buf; @@ -376,7 +376,7 @@ static size_t _format_mac_addr(char *buf, int buflen, cp += scnprintf(cp, buflen - (cp - buf), "%02x", addr[i]); if (i == len - 1) break; - cp += strlcpy(cp, ":", buflen - (cp - buf)); + cp += scnprintf(cp, buflen - (cp - buf), ":"); } return cp - buf; } @@ -386,7 +386,7 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) size_t l; l = _format_mac_addr(buf, PAGE_SIZE, addr, len); - l += strlcpy(buf + l, "\n", PAGE_SIZE - l); + l += scnprintf(buf + l, PAGE_SIZE - l, "\n"); return ((ssize_t) l); } EXPORT_SYMBOL(sysfs_format_mac); -- cgit v1.1 From 40d0802b3eb47d57e2d57a5244a18cbbe9632e13 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 26 Aug 2010 22:03:08 -0700 Subject: gro: __napi_gro_receive() optimizations compare_ether_header() can have a special implementation on 64 bit arches if CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS is defined. __napi_gro_receive() and vlan_gro_common() can avoid a conditional branch to perform device match. On x86_64, __napi_gro_receive() has now 38 instructions instead of 53 As gcc-4.4.3 still choose to not inline it, add inline keyword to this performance critical function. Signed-off-by: Eric Dumazet CC: Herbert Xu Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 9 ++++++--- net/core/dev.c | 10 ++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 07eeb5b..3438c01 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -105,9 +105,12 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, goto drop; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = - p->dev == skb->dev && !compare_ether_header( - skb_mac_header(p), skb_gro_mac_header(skb)); + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= compare_ether_header(skb_mac_header(p), + skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } diff --git a/net/core/dev.c b/net/core/dev.c index 859e30f..63bd20a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3169,16 +3169,18 @@ normal: } EXPORT_SYMBOL(dev_gro_receive); -static gro_result_t +static inline gro_result_t __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { struct sk_buff *p; for (p = napi->gro_list; p; p = p->next) { - NAPI_GRO_CB(p)->same_flow = - (p->dev == skb->dev) && - !compare_ether_header(skb_mac_header(p), + unsigned long diffs; + + diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= compare_ether_header(skb_mac_header(p), skb_gro_mac_header(skb)); + NAPI_GRO_CB(p)->same_flow = !diffs; NAPI_GRO_CB(p)->flush = 0; } -- cgit v1.1 From 5f33c92d188add2a22ec524c03e0ab097e303d52 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Aug 2010 13:30:25 +0200 Subject: mac80211: remove unused scan expire define Since cfg80211 manages the BSS list completely, this define hasn't been used for a long time and will never be used again. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index e73ae51..9e225f0 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -50,12 +50,6 @@ struct ieee80211_local; * increased memory use (about 2 kB of RAM per entry). */ #define IEEE80211_FRAGMENT_MAX 4 -/* - * Time after which we ignore scan results and no longer report/use - * them in any way. - */ -#define IEEE80211_SCAN_RESULT_EXPIRE (10 * HZ) - #define TU_TO_EXP_TIME(x) (jiffies + usecs_to_jiffies((x) * 1024)) #define IEEE80211_DEFAULT_UAPSD_QUEUES \ -- cgit v1.1 From 8789d459bc5e837bf37d261453df96ef54018d7b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 26 Aug 2010 13:30:26 +0200 Subject: mac80211: allow scan to complete from any context The ieee80211_scan_completed() function was a frequent source of potential deadlocks, since it is called by drivers but may call back into drivers, so drivers had to make sure to call it without any locks held, which frequently lead to more complex code in drivers. Avoid that problem by allowing the function to be called in any context, and queueing the actual work it does. Also update the documentation for it to indicate this. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 6 ++++++ net/mac80211/scan.c | 34 ++++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9e225f0..3171332 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -596,11 +596,17 @@ enum queue_stop_reason { * determine if we are on the operating channel or not * @SCAN_OFF_CHANNEL: We're off our operating channel for scanning, * gets only set in conjunction with SCAN_SW_SCANNING + * @SCAN_COMPLETED: Set for our scan work function when the driver reported + * that the scan completed. + * @SCAN_ABORTED: Set for our scan work function when the driver reported + * a scan complete for an aborted scan. */ enum { SCAN_SW_SCANNING, SCAN_HW_SCANNING, SCAN_OFF_CHANNEL, + SCAN_COMPLETED, + SCAN_ABORTED, }; /** diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 31f233f..d60389b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -248,13 +248,11 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) return true; } -void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) { struct ieee80211_local *local = hw_to_local(hw); bool was_hw_scan; - trace_api_scan_completed(local, aborted); - mutex_lock(&local->mtx); /* @@ -312,6 +310,18 @@ void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) ieee80211_mesh_notify_scan_completed(local); ieee80211_queue_work(&local->hw, &local->work_work); } + +void ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +{ + struct ieee80211_local *local = hw_to_local(hw); + + trace_api_scan_completed(local, aborted); + + set_bit(SCAN_COMPLETED, &local->scanning); + if (aborted) + set_bit(SCAN_ABORTED, &local->scanning); + ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); +} EXPORT_SYMBOL(ieee80211_scan_completed); static int ieee80211_start_sw_scan(struct ieee80211_local *local) @@ -449,7 +459,7 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local, /* if no more bands/channels left, complete scan and advance to the idle state */ if (local->scan_channel_idx >= local->scan_req->n_channels) { - ieee80211_scan_completed(&local->hw, false); + __ieee80211_scan_completed(&local->hw, false); return 1; } @@ -641,6 +651,14 @@ void ieee80211_scan_work(struct work_struct *work) struct ieee80211_sub_if_data *sdata = local->scan_sdata; unsigned long next_delay = 0; + if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { + bool aborted; + + aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); + __ieee80211_scan_completed(&local->hw, aborted); + return; + } + mutex_lock(&local->mtx); if (!sdata || !local->scan_req) { mutex_unlock(&local->mtx); @@ -651,7 +669,7 @@ void ieee80211_scan_work(struct work_struct *work) int rc = drv_hw_scan(local, sdata, local->hw_scan_req); mutex_unlock(&local->mtx); if (rc) - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); return; } @@ -666,7 +684,7 @@ void ieee80211_scan_work(struct work_struct *work) mutex_unlock(&local->mtx); if (rc) - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); return; } @@ -676,7 +694,7 @@ void ieee80211_scan_work(struct work_struct *work) * Avoid re-scheduling when the sdata is going away. */ if (!ieee80211_sdata_running(sdata)) { - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); return; } @@ -783,5 +801,5 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) mutex_unlock(&local->mtx); if (abortscan) - ieee80211_scan_completed(&local->hw, true); + __ieee80211_scan_completed(&local->hw, true); } -- cgit v1.1 From 7d64b7cc1fc33bab24567903a93f699d11649c0b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 14:26:51 +0300 Subject: cfg80211: allow vendor specific cipher suites cfg80211 currently rejects all cipher suites it doesn't know about for key length checking purposes. This can lead to inconsistencies when a driver advertises an algorithm that cfg80211 doesn't know about. Remove this rejection so drivers can specify any algorithm they like. Signed-off-by: Johannes Berg Signed-off-by: Juuso Oikarinen Signed-off-by: John W. Linville --- net/wireless/util.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/util.c b/net/wireless/util.c index 8d961cc..bca32eb 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -183,7 +183,14 @@ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, return -EINVAL; break; default: - return -EINVAL; + /* + * We don't know anything about this algorithm, + * allow using it -- but the driver must check + * all parameters! We still check below whether + * or not the driver supports this algorithm, + * of course. + */ + break; } if (params->seq) { -- cgit v1.1 From 3ffc2a905b1faae4c0fe39d66f0752c3a4cbb3c7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 14:26:52 +0300 Subject: mac80211: allow vendor specific cipher suites Allow drivers to specify their own set of cipher suites to advertise vendor-specific ciphers. The driver is then required to implement hardware crypto offload for it. Signed-off-by: Johannes Berg Signed-off-by: Juuso Oikarinen Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 5 +++-- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/key.c | 38 ++++++++++++++++++++++++++++++-------- net/mac80211/key.h | 6 +++--- net/mac80211/main.c | 44 ++++++++++++++++++++++++++++++++++++++------ net/mac80211/rx.c | 6 ++++++ net/mac80211/tx.c | 12 ++++++++++-- 7 files changed, 92 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5814382..f149b4e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -119,9 +119,10 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } } - ieee80211_key_link(key, sdata, sta); + err = ieee80211_key_link(key, sdata, sta); + if (err) + ieee80211_key_free(sdata->local, key); - err = 0; out_unlock: mutex_unlock(&sdata->local->sta_mtx); diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3171332..7d2bb6f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -662,6 +662,8 @@ struct ieee80211_local { int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll; unsigned int filter_flags; /* FIF_* */ + bool wiphy_ciphers_allocated; + /* protects the aggregated multicast list and filter calls */ spinlock_t filter_lock; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 2ce2dbb..3570f8c 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -60,7 +60,7 @@ static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key) return NULL; } -static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) +static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) { struct ieee80211_sub_if_data *sdata; struct ieee80211_sta *sta; @@ -68,8 +68,10 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) might_sleep(); - if (!key->local->ops->set_key) - return; + if (!key->local->ops->set_key) { + ret = -EOPNOTSUPP; + goto out_unsupported; + } assert_key_lock(key->local); @@ -90,6 +92,24 @@ static void ieee80211_key_enable_hw_accel(struct ieee80211_key *key) wiphy_err(key->local->hw.wiphy, "failed to set key (%d, %pM) to hardware (%d)\n", key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); + +out_unsupported: + if (ret) { + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + case WLAN_CIPHER_SUITE_TKIP: + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_AES_CMAC: + /* all of these we can do in software */ + ret = 0; + break; + default: + ret = -EINVAL; + } + } + + return ret; } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) @@ -329,12 +349,12 @@ static void __ieee80211_key_destroy(struct ieee80211_key *key) kfree(key); } -void ieee80211_key_link(struct ieee80211_key *key, - struct ieee80211_sub_if_data *sdata, - struct sta_info *sta) +int ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta) { struct ieee80211_key *old_key; - int idx; + int idx, ret; BUG_ON(!sdata); BUG_ON(!key); @@ -389,9 +409,11 @@ void ieee80211_key_link(struct ieee80211_key *key, ieee80211_debugfs_key_add(key); - ieee80211_key_enable_hw_accel(key); + ret = ieee80211_key_enable_hw_accel(key); mutex_unlock(&sdata->local->key_mtx); + + return ret; } static void __ieee80211_key_free(struct ieee80211_key *key) diff --git a/net/mac80211/key.h b/net/mac80211/key.h index 53b5ce1..cb9a4a6 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -130,9 +130,9 @@ struct ieee80211_key *ieee80211_key_alloc(u32 cipher, int idx, size_t key_len, * Insert a key into data structures (sdata, sta if necessary) * to make it used, free old key. */ -void ieee80211_key_link(struct ieee80211_key *key, - struct ieee80211_sub_if_data *sdata, - struct sta_info *sta); +int __must_check ieee80211_key_link(struct ieee80211_key *key, + struct ieee80211_sub_if_data *sdata, + struct sta_info *sta); void ieee80211_key_free(struct ieee80211_local *local, struct ieee80211_key *key); void ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 80db5ea..15f0e96 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -662,13 +662,40 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (local->hw.wiphy->max_scan_ie_len) local->hw.wiphy->max_scan_ie_len -= local->scan_ies_len; - local->hw.wiphy->cipher_suites = cipher_suites; - local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); - if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) - local->hw.wiphy->n_cipher_suites--; + /* Set up cipher suites unless driver already did */ + if (!local->hw.wiphy->cipher_suites) { + local->hw.wiphy->cipher_suites = cipher_suites; + local->hw.wiphy->n_cipher_suites = ARRAY_SIZE(cipher_suites); + if (!(local->hw.flags & IEEE80211_HW_MFP_CAPABLE)) + local->hw.wiphy->n_cipher_suites--; + } if (IS_ERR(local->wep_tx_tfm) || IS_ERR(local->wep_rx_tfm)) { - local->hw.wiphy->cipher_suites += 2; - local->hw.wiphy->n_cipher_suites -= 2; + if (local->hw.wiphy->cipher_suites == cipher_suites) { + local->hw.wiphy->cipher_suites += 2; + local->hw.wiphy->n_cipher_suites -= 2; + } else { + u32 *suites; + int r, w = 0; + + /* Filter out WEP */ + + suites = kmemdup( + local->hw.wiphy->cipher_suites, + sizeof(u32) * local->hw.wiphy->n_cipher_suites, + GFP_KERNEL); + if (!suites) + return -ENOMEM; + for (r = 0; r < local->hw.wiphy->n_cipher_suites; r++) { + u32 suite = local->hw.wiphy->cipher_suites[r]; + if (suite == WLAN_CIPHER_SUITE_WEP40 || + suite == WLAN_CIPHER_SUITE_WEP104) + continue; + suites[w++] = suite; + } + local->hw.wiphy->cipher_suites = suites; + local->hw.wiphy->n_cipher_suites = w; + local->wiphy_ciphers_allocated = true; + } } result = wiphy_register(local->hw.wiphy); @@ -783,6 +810,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) fail_workqueue: wiphy_unregister(local->hw.wiphy); fail_wiphy_register: + if (local->wiphy_ciphers_allocated) + kfree(local->hw.wiphy->cipher_suites); kfree(local->int_scan_req); return result; } @@ -840,6 +869,9 @@ void ieee80211_free_hw(struct ieee80211_hw *hw) mutex_destroy(&local->iflist_mtx); mutex_destroy(&local->mtx); + if (local->wiphy_ciphers_allocated) + kfree(local->hw.wiphy->cipher_suites); + wiphy_free(local->hw.wiphy); } EXPORT_SYMBOL(ieee80211_free_hw); diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index e67deb4..6e5fb16 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1000,6 +1000,12 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) case WLAN_CIPHER_SUITE_AES_CMAC: result = ieee80211_crypto_aes_cmac_decrypt(rx); break; + default: + /* + * We can reach here only with HW-only algorithms + * but why didn't it decrypt the frame?! + */ + return RX_DROP_UNUSABLE; } /* either the frame has been decrypted or will be dropped */ diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index a6ac9fd..31a8903 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -947,6 +947,8 @@ ieee80211_tx_h_stats(struct ieee80211_tx_data *tx) static ieee80211_tx_result debug_noinline ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) { + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + if (!tx->key) return TX_CONTINUE; @@ -960,10 +962,16 @@ ieee80211_tx_h_encrypt(struct ieee80211_tx_data *tx) return ieee80211_crypto_ccmp_encrypt(tx); case WLAN_CIPHER_SUITE_AES_CMAC: return ieee80211_crypto_aes_cmac_encrypt(tx); + default: + /* handle hw-only algorithm */ + if (info->control.hw_key) { + ieee80211_tx_set_protected(tx); + return TX_CONTINUE; + } + break; + } - /* not reached */ - WARN_ON(1); return TX_DROP; } -- cgit v1.1 From c0692b8fe29fb4d4dad33487aabf3ed7e1e880c0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 14:26:53 +0300 Subject: cfg80211: allow changing port control protocol Some vendor specified mechanisms for 802.1X-style functionality use a different protocol than EAP (even if EAP is vendor-extensible). Allow setting the ethertype for the protocol when a driver has support for this. The default if unspecified is EAP, of course. Note: This is suitable only for station mode, not for AP implementation. Signed-off-by: Johannes Berg Signed-off-by: Juuso Oikarinen Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 25 ++++++++++++++++++++++--- net/wireless/wext-sme.c | 2 ++ 2 files changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 49f5ca3..85a23de 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -136,6 +136,8 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { .len = sizeof(struct nl80211_sta_flag_update), }, [NL80211_ATTR_CONTROL_PORT] = { .type = NLA_FLAG }, + [NL80211_ATTR_CONTROL_PORT_ETHERTYPE] = { .type = NLA_U16 }, + [NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT] = { .type = NLA_FLAG }, [NL80211_ATTR_PRIVACY] = { .type = NLA_FLAG }, [NL80211_ATTR_CIPHER_SUITE_GROUP] = { .type = NLA_U32 }, [NL80211_ATTR_WPA_VERSIONS] = { .type = NLA_U32 }, @@ -474,6 +476,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U8(msg, NL80211_ATTR_MAX_NUM_PMKIDS, dev->wiphy.max_num_pmkids); + if (dev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) + NLA_PUT_FLAG(msg, NL80211_ATTR_CONTROL_PORT_ETHERTYPE); + nl_modes = nla_nest_start(msg, NL80211_ATTR_SUPPORTED_IFTYPES); if (!nl_modes) goto nla_put_failure; @@ -3691,7 +3696,8 @@ unlock_rtnl: return err; } -static int nl80211_crypto_settings(struct genl_info *info, +static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, + struct genl_info *info, struct cfg80211_crypto_settings *settings, int cipher_limit) { @@ -3699,6 +3705,19 @@ static int nl80211_crypto_settings(struct genl_info *info, settings->control_port = info->attrs[NL80211_ATTR_CONTROL_PORT]; + if (info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]) { + u16 proto; + proto = nla_get_u16( + info->attrs[NL80211_ATTR_CONTROL_PORT_ETHERTYPE]); + settings->control_port_ethertype = cpu_to_be16(proto); + if (!(rdev->wiphy.flags & WIPHY_FLAG_CONTROL_PORT_PROTOCOL) && + proto != ETH_P_PAE) + return -EINVAL; + if (info->attrs[NL80211_ATTR_CONTROL_PORT_NO_ENCRYPT]) + settings->control_port_no_encrypt = true; + } else + settings->control_port_ethertype = cpu_to_be16(ETH_P_PAE); + if (info->attrs[NL80211_ATTR_CIPHER_SUITES_PAIRWISE]) { void *data; int len, i; @@ -3826,7 +3845,7 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_PREV_BSSID]) prev_bssid = nla_data(info->attrs[NL80211_ATTR_PREV_BSSID]); - err = nl80211_crypto_settings(info, &crypto, 1); + err = nl80211_crypto_settings(rdev, info, &crypto, 1); if (!err) err = cfg80211_mlme_assoc(rdev, dev, chan, bssid, prev_bssid, ssid, ssid_len, ie, ie_len, use_mfp, @@ -4303,7 +4322,7 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) connect.privacy = info->attrs[NL80211_ATTR_PRIVACY]; - err = nl80211_crypto_settings(info, &connect.crypto, + err = nl80211_crypto_settings(rdev, info, &connect.crypto, NL80211_MAX_NR_CIPHER_SUITES); if (err) return err; diff --git a/net/wireless/wext-sme.c b/net/wireless/wext-sme.c index 9818198..6fffe62 100644 --- a/net/wireless/wext-sme.c +++ b/net/wireless/wext-sme.c @@ -197,6 +197,8 @@ int cfg80211_mgd_wext_siwessid(struct net_device *dev, wdev->wext.connect.ssid_len = len; wdev->wext.connect.crypto.control_port = false; + wdev->wext.connect.crypto.control_port_ethertype = + cpu_to_be16(ETH_P_PAE); err = cfg80211_mgd_wext_connect(rdev, wdev); out: -- cgit v1.1 From a621fa4d6a7fdf9d34938d2e129a72624833eeeb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 14:26:54 +0300 Subject: mac80211: allow changing port control protocol Some vendor specified mechanisms for 802.1X-style functionality use a different protocol than EAP (even if EAP is vendor-extensible). Support this in mac80211 via the cfg80211 API for it. Signed-off-by: Johannes Berg Signed-off-by: Juuso Oikarinen Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/iface.c | 3 +++ net/mac80211/main.c | 3 +++ net/mac80211/mlme.c | 3 +++ net/mac80211/rx.c | 2 +- net/mac80211/tx.c | 20 +++++++++++++++++--- 6 files changed, 29 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 7d2bb6f..f648377 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -509,6 +509,8 @@ struct ieee80211_sub_if_data { struct ieee80211_key *default_mgmt_key; u16 sequence_number; + __be16 control_port_protocol; + bool control_port_no_encrypt; struct work_struct work; struct sk_buff_head skb_queue; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 9369710..810e676 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -855,6 +855,9 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, sdata->dev->netdev_ops = &ieee80211_dataif_ops; sdata->wdev.iftype = type; + sdata->control_port_protocol = cpu_to_be16(ETH_P_PAE); + sdata->control_port_no_encrypt = false; + /* only monitor differs */ sdata->dev->type = ARPHRD_ETHER; diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 15f0e96..964c267 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -627,6 +627,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) local->hw.wiphy->interface_modes &= ~BIT(NL80211_IFTYPE_MESH_POINT); #endif + /* mac80211 supports control port protocol changing */ + local->hw.wiphy->flags |= WIPHY_FLAG_CONTROL_PORT_PROTOCOL; + if (local->hw.flags & IEEE80211_HW_SIGNAL_DBM) local->hw.wiphy->signal_type = CFG80211_SIGNAL_TYPE_MBM; else if (local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5282ac1..0cb4296 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2262,6 +2262,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, else ifmgd->flags &= ~IEEE80211_STA_CONTROL_PORT; + sdata->control_port_protocol = req->crypto.control_port_ethertype; + sdata->control_port_no_encrypt = req->crypto.control_port_no_encrypt; + ieee80211_add_work(wk); return 0; } diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 6e5fb16..ac205a3 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -1527,7 +1527,7 @@ static bool ieee80211_frame_allowed(struct ieee80211_rx_data *rx, __le16 fc) * Allow EAPOL frames to us/the PAE group address regardless * of whether the frame was encrypted or not. */ - if (ehdr->h_proto == htons(ETH_P_PAE) && + if (ehdr->h_proto == rx->sdata->control_port_protocol && (compare_ether_addr(ehdr->h_dest, rx->sdata->vif.addr) == 0 || compare_ether_addr(ehdr->h_dest, pae_group_addr) == 0)) return true; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 31a8903..92764bb 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -509,6 +509,18 @@ ieee80211_tx_h_ps_buf(struct ieee80211_tx_data *tx) } static ieee80211_tx_result debug_noinline +ieee80211_tx_h_check_control_port_protocol(struct ieee80211_tx_data *tx) +{ + struct ieee80211_tx_info *info = IEEE80211_SKB_CB(tx->skb); + + if (unlikely(tx->sdata->control_port_protocol == tx->skb->protocol && + tx->sdata->control_port_no_encrypt)) + info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT; + + return TX_CONTINUE; +} + +static ieee80211_tx_result debug_noinline ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) { struct ieee80211_key *key = NULL; @@ -527,7 +539,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) else if ((key = rcu_dereference(tx->sdata->default_key))) tx->key = key; else if (tx->sdata->drop_unencrypted && - (tx->skb->protocol != cpu_to_be16(ETH_P_PAE)) && + (tx->skb->protocol != tx->sdata->control_port_protocol) && !(info->flags & IEEE80211_TX_CTL_INJECTED) && (!ieee80211_is_robust_mgmt_frame(hdr) || (ieee80211_is_action(hdr->frame_control) && @@ -1349,6 +1361,7 @@ static int invoke_tx_handlers(struct ieee80211_tx_data *tx) CALL_TXH(ieee80211_tx_h_dynamic_ps); CALL_TXH(ieee80211_tx_h_check_assoc); CALL_TXH(ieee80211_tx_h_ps_buf); + CALL_TXH(ieee80211_tx_h_check_control_port_protocol); CALL_TXH(ieee80211_tx_h_select_key); if (!(tx->local->hw.flags & IEEE80211_HW_HAS_RATE_CONTROL)) CALL_TXH(ieee80211_tx_h_rate_ctrl); @@ -1826,7 +1839,8 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, #endif case NL80211_IFTYPE_STATION: memcpy(hdr.addr1, sdata->u.mgd.bssid, ETH_ALEN); - if (sdata->u.mgd.use_4addr && ethertype != ETH_P_PAE) { + if (sdata->u.mgd.use_4addr && + cpu_to_be16(ethertype) != sdata->control_port_protocol) { fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS | IEEE80211_FCTL_TODS); /* RA TA DA SA */ memcpy(hdr.addr2, sdata->vif.addr, ETH_ALEN); @@ -1879,7 +1893,7 @@ netdev_tx_t ieee80211_subif_start_xmit(struct sk_buff *skb, if (!ieee80211_vif_is_mesh(&sdata->vif) && unlikely(!is_multicast_ether_addr(hdr.addr1) && !(sta_flags & WLAN_STA_AUTHORIZED) && - !(ethertype == ETH_P_PAE && + !(cpu_to_be16(ethertype) == sdata->control_port_protocol && compare_ether_addr(sdata->vif.addr, skb->data + ETH_ALEN) == 0))) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG -- cgit v1.1 From 2337db8db845ece2d4ab7673a343e285f1bfda85 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 13:36:49 +0200 Subject: mac80211: use subqueue helpers There are subqueue helpers so that we don't need to get the TX queue and then wake/stop it, use those helpers. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/tx.c | 3 +-- net/mac80211/util.c | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 92764bb..ccf3737 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -2092,8 +2092,7 @@ void ieee80211_tx_pending(unsigned long data) if (skb_queue_empty(&local->pending[i])) list_for_each_entry_rcu(sdata, &local->interfaces, list) - netif_tx_wake_queue( - netdev_get_tx_queue(sdata->dev, i)); + netif_wake_subqueue(sdata->dev, i); } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index cd2b485..ef686d3 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -284,7 +284,7 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, if (skb_queue_empty(&local->pending[queue])) { rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) - netif_tx_wake_queue(netdev_get_tx_queue(sdata->dev, queue)); + netif_wake_subqueue(sdata->dev, queue); rcu_read_unlock(); } else tasklet_schedule(&local->tx_pending_tasklet); @@ -323,7 +323,7 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, rcu_read_lock(); list_for_each_entry_rcu(sdata, &local->interfaces, list) - netif_tx_stop_queue(netdev_get_tx_queue(sdata->dev, queue)); + netif_stop_subqueue(sdata->dev, queue); rcu_read_unlock(); } -- cgit v1.1 From b9dcf712d1fb98bf279fcd453a42a763b104961d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 12:35:54 +0200 Subject: mac80211: clean up ifdown/cleanup paths There's a lot of redundant code in mac80211's interface cleanup/down, for example freeing AP beacons is done both when the interface is set DOWN as well as when it is torn down, of which only the former has any effect. Also, a bunch of things should be closer to where they matter, like the MLME timers that we should cancel when disassociating, rather than only when the interface is set DOWN. Clean up all this code. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/iface.c | 89 ++++++++++++++-------------------------------------- net/mac80211/mlme.c | 5 +++ 2 files changed, 28 insertions(+), 66 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 810e676..cc1c68d 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -370,12 +370,9 @@ static int ieee80211_stop(struct net_device *dev) * (because if we remove a STA after ops->remove_interface() * the driver will have removed the vif info already!) * - * We could relax this and only unlink the stations from the - * hash table and list but keep them on a per-sdata list that - * will be inserted back again when the interface is brought - * up again, but I don't currently see a use case for that, - * except with WDS which gets a STA entry created when it is - * brought up. + * This is relevant only in AP, WDS and mesh modes, since in + * all other modes we've already removed all stations when + * disconnecting etc. */ sta_info_flush(local, sdata); @@ -410,11 +407,21 @@ static int ieee80211_stop(struct net_device *dev) struct ieee80211_sub_if_data *vlan, *tmpsdata; struct beacon_data *old_beacon = sdata->u.ap.beacon; + /* sdata_running will return false, so this will disable */ + ieee80211_bss_info_change_notify(sdata, + BSS_CHANGED_BEACON_ENABLED); + /* remove beacon */ rcu_assign_pointer(sdata->u.ap.beacon, NULL); synchronize_rcu(); kfree(old_beacon); + /* free all potentially still buffered bcast frames */ + while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) { + local->total_ps_buffered--; + dev_kfree_skb(skb); + } + /* down all dependent devices, that is VLANs */ list_for_each_entry_safe(vlan, tmpsdata, &sdata->u.ap.vlans, u.vlan.list) @@ -454,27 +461,6 @@ static int ieee80211_stop(struct net_device *dev) ieee80211_configure_filter(local); break; - case NL80211_IFTYPE_STATION: - del_timer_sync(&sdata->u.mgd.chswitch_timer); - del_timer_sync(&sdata->u.mgd.timer); - del_timer_sync(&sdata->u.mgd.conn_mon_timer); - del_timer_sync(&sdata->u.mgd.bcn_mon_timer); - /* - * If any of the timers fired while we waited for it, it will - * have queued its work. Now the work will be running again - * but will not rearm the timer again because it checks - * whether the interface is running, which, at this point, - * it no longer is. - */ - cancel_work_sync(&sdata->u.mgd.chswitch_work); - cancel_work_sync(&sdata->u.mgd.monitor_work); - cancel_work_sync(&sdata->u.mgd.beacon_connection_loss_work); - - /* fall through */ - case NL80211_IFTYPE_ADHOC: - if (sdata->vif.type == NL80211_IFTYPE_ADHOC) - del_timer_sync(&sdata->u.ibss.timer); - /* fall through */ case NL80211_IFTYPE_MESH_POINT: if (ieee80211_vif_is_mesh(&sdata->vif)) { /* other_bss and allmulti are always set on mesh @@ -502,17 +488,19 @@ static int ieee80211_stop(struct net_device *dev) ieee80211_scan_cancel(local); /* - * Disable beaconing for AP and mesh, IBSS can't - * still be joined to a network at this point. + * Disable beaconing here for mesh only, AP and IBSS + * are already taken care of. */ - if (sdata->vif.type == NL80211_IFTYPE_AP || - sdata->vif.type == NL80211_IFTYPE_MESH_POINT) { + if (sdata->vif.type == NL80211_IFTYPE_MESH_POINT) ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BEACON_ENABLED); - } - /* free all remaining keys, there shouldn't be any */ + /* + * Free all remaining keys, there shouldn't be any, + * except maybe group keys in AP more or WDS? + */ ieee80211_free_keys(sdata); + drv_remove_interface(local, &sdata->vif); } @@ -593,8 +581,6 @@ static void ieee80211_teardown_sdata(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; - struct beacon_data *beacon; - struct sk_buff *skb; int flushed; int i; @@ -607,37 +593,8 @@ static void ieee80211_teardown_sdata(struct net_device *dev) __skb_queue_purge(&sdata->fragments[i].skb_list); sdata->fragment_next = 0; - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP: - beacon = sdata->u.ap.beacon; - rcu_assign_pointer(sdata->u.ap.beacon, NULL); - synchronize_rcu(); - kfree(beacon); - - while ((skb = skb_dequeue(&sdata->u.ap.ps_bc_buf))) { - local->total_ps_buffered--; - dev_kfree_skb(skb); - } - - break; - case NL80211_IFTYPE_MESH_POINT: - if (ieee80211_vif_is_mesh(&sdata->vif)) - mesh_rmc_free(sdata); - break; - case NL80211_IFTYPE_ADHOC: - if (WARN_ON(sdata->u.ibss.presp)) - kfree_skb(sdata->u.ibss.presp); - break; - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_WDS: - case NL80211_IFTYPE_AP_VLAN: - case NL80211_IFTYPE_MONITOR: - break; - case NL80211_IFTYPE_UNSPECIFIED: - case NUM_NL80211_IFTYPES: - BUG(); - break; - } + if (ieee80211_vif_is_mesh(&sdata->vif)) + mesh_rmc_free(sdata); flushed = sta_info_flush(local, sdata); WARN_ON(flushed); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0cb4296..c869447 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -991,6 +991,11 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, if (remove_sta) sta_info_destroy_addr(sdata, bssid); + + del_timer_sync(&sdata->u.mgd.conn_mon_timer); + del_timer_sync(&sdata->u.mgd.bcn_mon_timer); + del_timer_sync(&sdata->u.mgd.timer); + del_timer_sync(&sdata->u.mgd.chswitch_timer); } void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, -- cgit v1.1 From 26a58456be40d8181b884eb5b4e61e3f73ba94e0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 12:35:55 +0200 Subject: mac80211: switch to ieee80211_sdata_running Since the introduction of ieee80211_sdata_running(), some new code was introduced that uses netif_running() instead. Switch all these instances over. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 6 ++---- net/mac80211/main.c | 6 +++--- net/mac80211/util.c | 2 +- 3 files changed, 6 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f149b4e..f82b18e 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -81,16 +81,14 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, u8 key_idx, const u8 *mac_addr, struct key_params *params) { - struct ieee80211_sub_if_data *sdata; + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct sta_info *sta = NULL; struct ieee80211_key *key; int err; - if (!netif_running(dev)) + if (!ieee80211_sdata_running(sdata)) return -ENETDOWN; - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - /* reject WEP and TKIP keys if WEP failed to initialize */ switch (params->cipher) { case WLAN_CIPHER_SUITE_WEP40: diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 964c267..93194f6 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -339,9 +339,6 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, struct ieee80211_if_managed *ifmgd; int c = 0; - if (!netif_running(ndev)) - return NOTIFY_DONE; - /* Make sure it's our interface that got changed */ if (!wdev) return NOTIFY_DONE; @@ -352,6 +349,9 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, sdata = IEEE80211_DEV_TO_SUB_IF(ndev); bss_conf = &sdata->vif.bss_conf; + if (!ieee80211_sdata_running(sdata)) + return NOTIFY_DONE; + /* ARP filtering is only supported in managed mode */ if (sdata->vif.type != NL80211_IFTYPE_STATION) return NOTIFY_DONE; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index ef686d3..d38b376 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1308,7 +1308,7 @@ void ieee80211_recalc_smps(struct ieee80211_local *local, */ list_for_each_entry(sdata, &local->interfaces, list) { - if (!netif_running(sdata->dev)) + if (!ieee80211_sdata_running(sdata)) continue; if (sdata->vif.type != NL80211_IFTYPE_STATION) goto set; -- cgit v1.1 From bf533e0bfd77d9671adabdf134b1ac7f24bb0670 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 12:35:56 +0200 Subject: mac80211: simplify zero address checks The libertas_tf special code for zero addresses is a bit too complex, it compares against a stack value instead of using is_zero_ether_addr() and tries to update all interfaces even if just the one that's being brought up needs to be changed. Additionally, the repeated check for a valid MAC address need only be done if we actually changed it on the fly. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/iface.c | 38 +++++++++++++------------------------- 1 file changed, 13 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index cc1c68d..ea50732 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -103,10 +103,9 @@ static int ieee80211_open(struct net_device *dev) u32 changed = 0; int res; u32 hw_reconf_flags = 0; - u8 null_addr[ETH_ALEN] = {0}; /* fail early if user set an invalid address */ - if (compare_ether_addr(dev->dev_addr, null_addr) && + if (!is_zero_ether_addr(dev->dev_addr) && !is_valid_ether_addr(dev->dev_addr)) return -EADDRNOTAVAIL; @@ -195,33 +194,22 @@ static int ieee80211_open(struct net_device *dev) } /* - * Check all interfaces and copy the hopefully now-present - * MAC address to those that have the special null one. + * Copy the hopefully now-present MAC address to + * this interface, if it has the special null one. */ - list_for_each_entry(nsdata, &local->interfaces, list) { - struct net_device *ndev = nsdata->dev; - - /* - * No need to check running since we do not allow - * it to start up with this invalid address. - */ - if (compare_ether_addr(null_addr, ndev->dev_addr) == 0) { - memcpy(ndev->dev_addr, - local->hw.wiphy->perm_addr, - ETH_ALEN); - memcpy(ndev->perm_addr, ndev->dev_addr, ETH_ALEN); + if (is_zero_ether_addr(dev->dev_addr)) { + memcpy(dev->dev_addr, + local->hw.wiphy->perm_addr, + ETH_ALEN); + memcpy(dev->perm_addr, dev->dev_addr, ETH_ALEN); + + if (!is_valid_ether_addr(dev->dev_addr)) { + if (!local->open_count) + drv_stop(local); + return -EADDRNOTAVAIL; } } - /* - * Validate the MAC address for this device. - */ - if (!is_valid_ether_addr(dev->dev_addr)) { - if (!local->open_count) - drv_stop(local); - return -EADDRNOTAVAIL; - } - switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: /* no need to tell driver */ -- cgit v1.1 From 87490f6db38999fee7f6d3dbecc5b94730c7e010 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 12:35:57 +0200 Subject: mac80211: split out concurrent vif checks Split the concurrent virtual interface checks into a new function that can be used to check for any given new interface type. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/iface.c | 43 +++++++++++++++++++++++++++++-------------- 1 file changed, 29 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index ea50732..cba3d80 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -94,20 +94,14 @@ static inline int identical_mac_addr_allowed(int type1, int type2) type2 == NL80211_IFTYPE_AP_VLAN)); } -static int ieee80211_open(struct net_device *dev) +static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype iftype) { - struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - struct ieee80211_sub_if_data *nsdata; struct ieee80211_local *local = sdata->local; - struct sta_info *sta; - u32 changed = 0; - int res; - u32 hw_reconf_flags = 0; + struct ieee80211_sub_if_data *nsdata; + struct net_device *dev = sdata->dev; - /* fail early if user set an invalid address */ - if (!is_zero_ether_addr(dev->dev_addr) && - !is_valid_ether_addr(dev->dev_addr)) - return -EADDRNOTAVAIL; + ASSERT_RTNL(); /* we hold the RTNL here so can safely walk the list */ list_for_each_entry(nsdata, &local->interfaces, list) { @@ -124,7 +118,7 @@ static int ieee80211_open(struct net_device *dev) * belonging to the same hardware. Then, however, we're * faced with having to adopt two different TSF timers... */ - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && + if (iftype == NL80211_IFTYPE_ADHOC && nsdata->vif.type == NL80211_IFTYPE_ADHOC) return -EBUSY; @@ -138,19 +132,40 @@ static int ieee80211_open(struct net_device *dev) /* * check whether it may have the same address */ - if (!identical_mac_addr_allowed(sdata->vif.type, + if (!identical_mac_addr_allowed(iftype, nsdata->vif.type)) return -ENOTUNIQ; /* * can only add VLANs to enabled APs */ - if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN && + if (iftype == NL80211_IFTYPE_AP_VLAN && nsdata->vif.type == NL80211_IFTYPE_AP) sdata->bss = &nsdata->u.ap; } } + return 0; +} + +static int ieee80211_open(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_local *local = sdata->local; + struct sta_info *sta; + u32 changed = 0; + int res; + u32 hw_reconf_flags = 0; + + /* fail early if user set an invalid address */ + if (!is_zero_ether_addr(dev->dev_addr) && + !is_valid_ether_addr(dev->dev_addr)) + return -EADDRNOTAVAIL; + + res = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); + if (res) + return res; + switch (sdata->vif.type) { case NL80211_IFTYPE_WDS: if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) -- cgit v1.1 From 34d4bc4d41d282a66dafe1b01a7d46bad468cefb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 12:35:58 +0200 Subject: mac80211: support runtime interface type changes Add support to mac80211 for changing the interface type even when the interface is UP, if the driver supports it. To achieve this * add a new driver callback for switching, * split some of the interface up/down code out into new functions (do_open/do_stop), and * maintain an own __SDATA_RUNNING bit that will not be set during interface type, so that any other code doesn't use the interface. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 3 - net/mac80211/driver-ops.h | 14 ++++ net/mac80211/driver-trace.h | 25 +++++++ net/mac80211/ieee80211_i.h | 14 +++- net/mac80211/iface.c | 157 ++++++++++++++++++++++++++++++++++---------- 5 files changed, 176 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index f82b18e..5de1ca3 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -52,9 +52,6 @@ static int ieee80211_change_iface(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); int ret; - if (ieee80211_sdata_running(sdata)) - return -EBUSY; - ret = ieee80211_if_change_type(sdata, type); if (ret) return ret; diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 14123dc..6064b7b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -54,6 +54,20 @@ static inline int drv_add_interface(struct ieee80211_local *local, return ret; } +static inline int drv_change_interface(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + int ret; + + might_sleep(); + + trace_drv_change_interface(local, sdata, type); + ret = local->ops->change_interface(&local->hw, &sdata->vif, type); + trace_drv_return_int(local, ret); + return ret; +} + static inline void drv_remove_interface(struct ieee80211_local *local, struct ieee80211_vif *vif) { diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index b5a9558..f6f3d89 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -136,6 +136,31 @@ TRACE_EVENT(drv_add_interface, ) ); +TRACE_EVENT(drv_change_interface, + TP_PROTO(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type), + + TP_ARGS(local, sdata, type), + + TP_STRUCT__entry( + LOCAL_ENTRY + VIF_ENTRY + __field(u32, new_type) + ), + + TP_fast_assign( + LOCAL_ASSIGN; + VIF_ASSIGN; + __entry->new_type = type; + ), + + TP_printk( + LOCAL_PR_FMT VIF_PR_FMT " new type:%d", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type + ) +); + TRACE_EVENT(drv_remove_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata), diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f648377..d529bd5 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -472,6 +472,16 @@ enum ieee80211_sub_if_data_flags { IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3), }; +/** + * enum ieee80211_sdata_state_bits - virtual interface state bits + * @SDATA_STATE_RUNNING: virtual interface is up & running; this + * mirrors netif_running() but is separate for interface type + * change handling while the interface is up + */ +enum ieee80211_sdata_state_bits { + SDATA_STATE_RUNNING, +}; + struct ieee80211_sub_if_data { struct list_head list; @@ -485,6 +495,8 @@ struct ieee80211_sub_if_data { unsigned int flags; + unsigned long state; + int drop_unencrypted; char name[IFNAMSIZ]; @@ -1087,7 +1099,7 @@ void ieee80211_recalc_idle(struct ieee80211_local *local); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { - return netif_running(sdata->dev); + return test_bit(SDATA_STATE_RUNNING, &sdata->state); } /* tx handling */ diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index cba3d80..c1cc200 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -148,7 +148,12 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, return 0; } -static int ieee80211_open(struct net_device *dev) +/* + * NOTE: Be very careful when changing this function, it must NOT return + * an error on interface type changes that have been pre-checked, so most + * checks should be in ieee80211_check_concurrent_iface. + */ +static int ieee80211_do_open(struct net_device *dev, bool coming_up) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); struct ieee80211_local *local = sdata->local; @@ -157,15 +162,6 @@ static int ieee80211_open(struct net_device *dev) int res; u32 hw_reconf_flags = 0; - /* fail early if user set an invalid address */ - if (!is_zero_ether_addr(dev->dev_addr) && - !is_valid_ether_addr(dev->dev_addr)) - return -EADDRNOTAVAIL; - - res = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); - if (res) - return res; - switch (sdata->vif.type) { case NL80211_IFTYPE_WDS: if (!is_valid_ether_addr(sdata->u.wds.remote_addr)) @@ -258,9 +254,11 @@ static int ieee80211_open(struct net_device *dev) netif_carrier_on(dev); break; default: - res = drv_add_interface(local, &sdata->vif); - if (res) - goto err_stop; + if (coming_up) { + res = drv_add_interface(local, &sdata->vif); + if (res) + goto err_stop; + } if (ieee80211_vif_is_mesh(&sdata->vif)) { local->fif_other_bss++; @@ -316,7 +314,9 @@ static int ieee80211_open(struct net_device *dev) hw_reconf_flags |= __ieee80211_recalc_idle(local); mutex_unlock(&local->mtx); - local->open_count++; + if (coming_up) + local->open_count++; + if (hw_reconf_flags) { ieee80211_hw_config(local, hw_reconf_flags); /* @@ -331,6 +331,8 @@ static int ieee80211_open(struct net_device *dev) netif_tx_start_all_queues(dev); + set_bit(SDATA_STATE_RUNNING, &sdata->state); + return 0; err_del_interface: drv_remove_interface(local, &sdata->vif); @@ -344,19 +346,38 @@ static int ieee80211_open(struct net_device *dev) return res; } -static int ieee80211_stop(struct net_device *dev) +static int ieee80211_open(struct net_device *dev) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + int err; + + /* fail early if user set an invalid address */ + if (!is_zero_ether_addr(dev->dev_addr) && + !is_valid_ether_addr(dev->dev_addr)) + return -EADDRNOTAVAIL; + + err = ieee80211_check_concurrent_iface(sdata, sdata->vif.type); + if (err) + return err; + + return ieee80211_do_open(dev, true); +} + +static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, + bool going_down) +{ struct ieee80211_local *local = sdata->local; unsigned long flags; struct sk_buff *skb, *tmp; u32 hw_reconf_flags = 0; int i; + clear_bit(SDATA_STATE_RUNNING, &sdata->state); + /* * Stop TX on this interface first. */ - netif_tx_stop_all_queues(dev); + netif_tx_stop_all_queues(sdata->dev); /* * Purge work for this interface. @@ -394,11 +415,12 @@ static int ieee80211_stop(struct net_device *dev) if (sdata->vif.type == NL80211_IFTYPE_AP) local->fif_pspoll--; - netif_addr_lock_bh(dev); + netif_addr_lock_bh(sdata->dev); spin_lock_bh(&local->filter_lock); - __hw_addr_unsync(&local->mc_list, &dev->mc, dev->addr_len); + __hw_addr_unsync(&local->mc_list, &sdata->dev->mc, + sdata->dev->addr_len); spin_unlock_bh(&local->filter_lock); - netif_addr_unlock_bh(dev); + netif_addr_unlock_bh(sdata->dev); ieee80211_configure_filter(local); @@ -432,7 +454,8 @@ static int ieee80211_stop(struct net_device *dev) WARN_ON(!list_empty(&sdata->u.ap.vlans)); } - local->open_count--; + if (going_down) + local->open_count--; switch (sdata->vif.type) { case NL80211_IFTYPE_AP_VLAN: @@ -504,7 +527,8 @@ static int ieee80211_stop(struct net_device *dev) */ ieee80211_free_keys(sdata); - drv_remove_interface(local, &sdata->vif); + if (going_down) + drv_remove_interface(local, &sdata->vif); } sdata->bss = NULL; @@ -540,6 +564,13 @@ static int ieee80211_stop(struct net_device *dev) } } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); +} + +static int ieee80211_stop(struct net_device *dev) +{ + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + + ieee80211_do_stop(sdata, true); return 0; } @@ -857,9 +888,72 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, ieee80211_debugfs_add_netdev(sdata); } +static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, + enum nl80211_iftype type) +{ + struct ieee80211_local *local = sdata->local; + int ret, err; + + ASSERT_RTNL(); + + if (!local->ops->change_interface) + return -EBUSY; + + switch (sdata->vif.type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + /* + * Could maybe also all others here? + * Just not sure how that interacts + * with the RX/config path e.g. for + * mesh. + */ + break; + default: + return -EBUSY; + } + + switch (type) { + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + /* + * Could probably support everything + * but WDS here (WDS do_open can fail + * under memory pressure, which this + * code isn't prepared to handle). + */ + break; + default: + return -EBUSY; + } + + ret = ieee80211_check_concurrent_iface(sdata, type); + if (ret) + return ret; + + ieee80211_do_stop(sdata, false); + + ieee80211_teardown_sdata(sdata->dev); + + ret = drv_change_interface(local, sdata, type); + if (ret) + type = sdata->vif.type; + + ieee80211_setup_sdata(sdata, type); + + err = ieee80211_do_open(sdata->dev, false); + WARN(err, "type change: do_open returned %d", err); + + return ret; +} + int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, enum nl80211_iftype type) { + int ret; + ASSERT_RTNL(); if (type == sdata->vif.type) @@ -870,18 +964,15 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, type == NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; - /* - * We could, here, on changes between IBSS/STA/MESH modes, - * invoke an MLME function instead that disassociates etc. - * and goes into the requested mode. - */ - - if (ieee80211_sdata_running(sdata)) - return -EBUSY; - - /* Purge and reset type-dependent state. */ - ieee80211_teardown_sdata(sdata->dev); - ieee80211_setup_sdata(sdata, type); + if (ieee80211_sdata_running(sdata)) { + ret = ieee80211_runtime_change_iftype(sdata, type); + if (ret) + return ret; + } else { + /* Purge and reset type-dependent state. */ + ieee80211_teardown_sdata(sdata->dev); + ieee80211_setup_sdata(sdata, type); + } /* reset some values that shouldn't be kept across type changes */ sdata->vif.bss_conf.basic_rates = -- cgit v1.1 From 5b714c6a3753dad0798a70a049e15c7f6bc9446b Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 27 Aug 2010 13:45:28 +0200 Subject: mac80211: fix offchannel queue stop Somebody noticed this problem, and I outlined to them how to fix it, but haven't heard back from them. So while I was adding the state field I figured I could use it to fix it. The problem, as I understand it, is that when we go offchannel while the driver has a queue stopped, the driver will likely start draining the queue and then enable it while offchannel. This in turn will enable the interface queue, and that leads to transmitting data frames on the wrong channel. Fix this by keeping track of offchannel status per interface, and not enabling the interface queues on interfaces that are offchannel when the driver enables a queue. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 3 +++ net/mac80211/offchannel.c | 19 +++++++++++++++++-- net/mac80211/util.c | 5 ++++- 3 files changed, 24 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index d529bd5..9af50fb 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -477,9 +477,12 @@ enum ieee80211_sub_if_data_flags { * @SDATA_STATE_RUNNING: virtual interface is up & running; this * mirrors netif_running() but is separate for interface type * change handling while the interface is up + * @SDATA_STATE_OFFCHANNEL: This interface is currently in offchannel + * mode, so queues are stopped */ enum ieee80211_sdata_state_bits { SDATA_STATE_RUNNING, + SDATA_STATE_OFFCHANNEL, }; struct ieee80211_sub_if_data { diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index c36b191..eeacaa5 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -112,8 +112,10 @@ void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local) * used from user space controlled off-channel operations. */ if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_MONITOR) + sdata->vif.type != NL80211_IFTYPE_MONITOR) { + set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); netif_tx_stop_all_queues(sdata->dev); + } } mutex_unlock(&local->iflist_mtx); } @@ -131,6 +133,7 @@ void ieee80211_offchannel_stop_station(struct ieee80211_local *local) continue; if (sdata->vif.type == NL80211_IFTYPE_STATION) { + set_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); netif_tx_stop_all_queues(sdata->dev); if (sdata->u.mgd.associated) ieee80211_offchannel_ps_enable(sdata); @@ -155,8 +158,20 @@ void ieee80211_offchannel_return(struct ieee80211_local *local, ieee80211_offchannel_ps_disable(sdata); } - if (sdata->vif.type != NL80211_IFTYPE_MONITOR) + if (sdata->vif.type != NL80211_IFTYPE_MONITOR) { + clear_bit(SDATA_STATE_OFFCHANNEL, &sdata->state); + /* + * This may wake up queues even though the driver + * currently has them stopped. This is not very + * likely, since the driver won't have gotten any + * (or hardly any) new packets while we weren't + * on the right channel, and even if it happens + * it will at most lead to queueing up one more + * packet per queue in mac80211 rather than on + * the interface qdisc. + */ netif_tx_wake_all_queues(sdata->dev); + } /* re-enable beaconing */ if (enable_beaconing && diff --git a/net/mac80211/util.c b/net/mac80211/util.c index d38b376..bd40b11 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -283,8 +283,11 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, if (skb_queue_empty(&local->pending[queue])) { rcu_read_lock(); - list_for_each_entry_rcu(sdata, &local->interfaces, list) + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) + continue; netif_wake_subqueue(sdata->dev, queue); + } rcu_read_unlock(); } else tasklet_schedule(&local->tx_pending_tasklet); -- cgit v1.1 From 762c29164e2850d8c5e4c258cef0077b2584d111 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 27 Aug 2010 16:41:56 +0000 Subject: econet: kill unnecessary spin_lock_init() The spinlock aun_queue_lock is initialized statically. It is unnecessary to initialize by spin_lock_init() at module load time. This is detected by the semantic patch. // @def@ declarer name DEFINE_SPINLOCK; identifier spinlock; @@ DEFINE_SPINLOCK(spinlock); @@ identifier def.spinlock; @@ - spin_lock_init(&spinlock); // Signed-off-by: Akinobu Mita Cc: Julia Lawall Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/econet/af_econet.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index dc54bd0..baa98fb 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -1009,7 +1009,6 @@ static int __init aun_udp_initialise(void) struct sockaddr_in sin; skb_queue_head_init(&aun_queue); - spin_lock_init(&aun_queue_lock); setup_timer(&ab_cleanup_timer, ab_cleanup, 0); ab_cleanup_timer.expires = jiffies + (HZ*2); add_timer(&ab_cleanup_timer); @@ -1167,7 +1166,6 @@ static int __init econet_proto_init(void) goto out; sock_register(&econet_family_ops); #ifdef CONFIG_ECONET_AUNUDP - spin_lock_init(&aun_queue_lock); aun_udp_initialise(); #endif #ifdef CONFIG_ECONET_NATIVE -- cgit v1.1 From 6a499b242f7d1ebf13f5bc386e08e80603f79e2a Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Fri, 27 Aug 2010 19:08:26 +0000 Subject: phonet: use for_each_set_bit Replace open-coded loop with for_each_set_bit(). Signed-off-by: Akinobu Mita Cc: "David S. Miller" Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/phonet/pn_dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index b18e48f..d0a4294 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -292,8 +292,7 @@ static void phonet_route_autodel(struct net_device *dev) if (bitmap_empty(deleted, 64)) return; /* short-circuit RCU */ synchronize_rcu(); - for (i = find_first_bit(deleted, 64); i < 64; - i = find_next_bit(deleted, 64, i + 1)) { + for_each_set_bit(i, deleted, 64) { rtm_phonet_notify(RTM_DELROUTE, dev, i); dev_put(dev); } -- cgit v1.1 From 2c70b5196262f29b10eb1886293e0c2d75dc3dd9 Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Sun, 29 Aug 2010 17:04:53 +0000 Subject: IPVS: include net/ip6_checksum.h for csum_ipv6_magic Fixes this build error: net/netfilter/ipvs/ip_vs_core.c: In function 'ip_vs_nat_icmp_v6': net/netfilter/ipvs/ip_vs_core.c:640: error: implicit declaration of function 'csum_ipv6_magic' Signed-off-by: Stephen Rothwell Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_core.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 69661db..edbfb96 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -40,6 +40,7 @@ #include #include /* for icmp_send */ #include +#include #include #include -- cgit v1.1 From dca43c75e7e545694a9dd6288553f55c53e2a3a3 Mon Sep 17 00:00:00 2001 From: Jerry Chu Date: Fri, 27 Aug 2010 19:13:28 +0000 Subject: tcp: Add TCP_USER_TIMEOUT socket option. This patch provides a "user timeout" support as described in RFC793. The socket option is also needed for the the local half of RFC5482 "TCP User Timeout Option". TCP_USER_TIMEOUT is a TCP level socket option that takes an unsigned int, when > 0, to specify the maximum amount of time in ms that transmitted data may remain unacknowledged before TCP will forcefully close the corresponding connection and return ETIMEDOUT to the application. If 0 is given, TCP will continue to use the system default. Increasing the user timeouts allows a TCP connection to survive extended periods without end-to-end connectivity. Decreasing the user timeouts allows applications to "fail fast" if so desired. Otherwise it may take upto 20 minutes with the current system defaults in a normal WAN environment. The socket option can be made during any state of a TCP connection, but is only effective during the synchronized states of a connection (ESTABLISHED, FIN-WAIT-1, FIN-WAIT-2, CLOSE-WAIT, CLOSING, or LAST-ACK). Moreover, when used with the TCP keepalive (SO_KEEPALIVE) option, TCP_USER_TIMEOUT will overtake keepalive to determine when to close a connection due to keepalive failure. The option does not change in anyway when TCP retransmits a packet, nor when a keepalive probe will be sent. This option, like many others, will be inherited by an acceptor from its listener. Signed-off-by: H.K. Jerry Chu Signed-off-by: David S. Miller --- net/ipv4/tcp.c | 11 ++++++++++- net/ipv4/tcp_timer.c | 40 +++++++++++++++++++++++++--------------- 2 files changed, 35 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 176e11a..cf32545 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2391,7 +2391,12 @@ static int do_tcp_setsockopt(struct sock *sk, int level, err = tp->af_specific->md5_parse(sk, optval, optlen); break; #endif - + case TCP_USER_TIMEOUT: + /* Cap the max timeout in ms TCP will retry/retrans + * before giving up and aborting (ETIMEDOUT) a connection. + */ + icsk->icsk_user_timeout = msecs_to_jiffies(val); + break; default: err = -ENOPROTOOPT; break; @@ -2610,6 +2615,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_THIN_DUPACK: val = tp->thin_dupack; break; + + case TCP_USER_TIMEOUT: + val = jiffies_to_msecs(icsk->icsk_user_timeout); + break; default: return -ENOPROTOOPT; } diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index 808bb92..11569de 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -138,10 +138,10 @@ static void tcp_mtu_probing(struct inet_connection_sock *icsk, struct sock *sk) * retransmissions with an initial RTO of TCP_RTO_MIN. */ static bool retransmits_timed_out(struct sock *sk, - unsigned int boundary) + unsigned int boundary, + unsigned int timeout) { - unsigned int timeout, linear_backoff_thresh; - unsigned int start_ts; + unsigned int linear_backoff_thresh, start_ts; if (!inet_csk(sk)->icsk_retransmits) return false; @@ -151,14 +151,15 @@ static bool retransmits_timed_out(struct sock *sk, else start_ts = tcp_sk(sk)->retrans_stamp; - linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); - - if (boundary <= linear_backoff_thresh) - timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; - else - timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + - (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + if (likely(timeout == 0)) { + linear_backoff_thresh = ilog2(TCP_RTO_MAX/TCP_RTO_MIN); + if (boundary <= linear_backoff_thresh) + timeout = ((2 << boundary) - 1) * TCP_RTO_MIN; + else + timeout = ((2 << linear_backoff_thresh) - 1) * TCP_RTO_MIN + + (boundary - linear_backoff_thresh) * TCP_RTO_MAX; + } return (tcp_time_stamp - start_ts) >= timeout; } @@ -174,7 +175,7 @@ static int tcp_write_timeout(struct sock *sk) dst_negative_advice(sk); retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries; } else { - if (retransmits_timed_out(sk, sysctl_tcp_retries1)) { + if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0)) { /* Black hole detection */ tcp_mtu_probing(icsk, sk); @@ -187,14 +188,16 @@ static int tcp_write_timeout(struct sock *sk) retry_until = tcp_orphan_retries(sk, alive); do_reset = alive || - !retransmits_timed_out(sk, retry_until); + !retransmits_timed_out(sk, retry_until, 0); if (tcp_out_of_resources(sk, do_reset)) return 1; } } - if (retransmits_timed_out(sk, retry_until)) { + if (retransmits_timed_out(sk, retry_until, + (1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV) ? 0 : + icsk->icsk_user_timeout)) { /* Has it gone just too far? */ tcp_write_err(sk); return 1; @@ -436,7 +439,7 @@ out_reset_timer: icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX); - if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1)) + if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0)) __sk_dst_reset(sk); out:; @@ -556,7 +559,14 @@ static void tcp_keepalive_timer (unsigned long data) elapsed = keepalive_time_elapsed(tp); if (elapsed >= keepalive_time_when(tp)) { - if (icsk->icsk_probes_out >= keepalive_probes(tp)) { + /* If the TCP_USER_TIMEOUT option is enabled, use that + * to determine when to timeout instead. + */ + if ((icsk->icsk_user_timeout != 0 && + elapsed >= icsk->icsk_user_timeout && + icsk->icsk_probes_out > 0) || + (icsk->icsk_user_timeout == 0 && + icsk->icsk_probes_out >= keepalive_probes(tp))) { tcp_send_active_reset(sk, GFP_ATOMIC); tcp_write_err(sk); goto out; -- cgit v1.1 From d82b6f85c1d73340ef4a26bd0b247ac14610cd83 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:10 +0000 Subject: dccp ccid-2: Use u32 timestamps uniformly Since CCID-2 is de facto a mini implementation of TCP, it makes sense to share as much code as possible. Hence this patch aligns CCID-2 timestamping with TCP timestamping. This also halves the space consumption (on 64-bit systems). The necessary include file is already included by way of net/dccp.h. Redundant includes have been removed. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 14 ++++++-------- net/dccp/ccids/ccid2.h | 15 ++++++++++----- 2 files changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 7af3106..0cff637 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -25,8 +25,6 @@ */ #include #include "../feat.h" -#include "../ccid.h" -#include "../dccp.h" #include "ccid2.h" @@ -175,7 +173,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) hc->tx_seqh->ccid2s_seq = dp->dccps_gss; hc->tx_seqh->ccid2s_acked = 0; - hc->tx_seqh->ccid2s_sent = jiffies; + hc->tx_seqh->ccid2s_sent = ccid2_time_stamp; next = hc->tx_seqh->ccid2s_next; /* check if we need to alloc more space */ @@ -250,7 +248,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) struct ccid2_seq *seqp = hc->tx_seqt; while (seqp != hc->tx_seqh) { - ccid2_pr_debug("out seq=%llu acked=%d time=%lu\n", + ccid2_pr_debug("out seq=%llu acked=%d time=%u\n", (unsigned long long)seqp->ccid2s_seq, seqp->ccid2s_acked, seqp->ccid2s_sent); seqp = seqp->ccid2s_next; @@ -431,19 +429,19 @@ static void ccid2_new_ack(struct sock *sk, struct ccid2_seq *seqp, * The cleanest solution is to not use the ccid2s_sent field at all * and instead use DCCP timestamps: requires changes in other places. */ - ccid2_rtt_estimator(sk, jiffies - seqp->ccid2s_sent); + ccid2_rtt_estimator(sk, ccid2_time_stamp - seqp->ccid2s_sent); } static void ccid2_congestion_event(struct sock *sk, struct ccid2_seq *seqp) { struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - if (time_before(seqp->ccid2s_sent, hc->tx_last_cong)) { + if ((s32)(seqp->ccid2s_sent - hc->tx_last_cong) < 0) { ccid2_pr_debug("Multiple losses in an RTT---treating as one\n"); return; } - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; hc->tx_cwnd = hc->tx_cwnd / 2 ? : 1U; hc->tx_ssthresh = max(hc->tx_cwnd, 2U); @@ -683,7 +681,7 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) hc->tx_rto = DCCP_TIMEOUT_INIT; hc->tx_rpdupack = -1; - hc->tx_last_cong = jiffies; + hc->tx_last_cong = ccid2_time_stamp; setup_timer(&hc->tx_rtotimer, ccid2_hc_tx_rto_expire, (unsigned long)sk); return 0; diff --git a/net/dccp/ccids/ccid2.h b/net/dccp/ccids/ccid2.h index b017843..9731c2d 100644 --- a/net/dccp/ccids/ccid2.h +++ b/net/dccp/ccids/ccid2.h @@ -18,18 +18,23 @@ #ifndef _DCCP_CCID2_H_ #define _DCCP_CCID2_H_ -#include #include #include #include "../ccid.h" +#include "../dccp.h" + +/* + * CCID-2 timestamping faces the same issues as TCP timestamping. + * Hence we reuse/share as much of the code as possible. + */ +#define ccid2_time_stamp tcp_time_stamp + /* NUMDUPACK parameter from RFC 4341, p. 6 */ #define NUMDUPACK 3 -struct sock; - struct ccid2_seq { u64 ccid2s_seq; - unsigned long ccid2s_sent; + u32 ccid2s_sent; int ccid2s_acked; struct ccid2_seq *ccid2s_prev; struct ccid2_seq *ccid2s_next; @@ -72,7 +77,7 @@ struct ccid2_hc_tx_sock { u64 tx_rpseq; int tx_rpdupack; - unsigned long tx_last_cong; + u32 tx_last_cong; u64 tx_high_ack; }; -- cgit v1.1 From d26eeb07fd02de31848b59d19687daff0e93532f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:11 +0000 Subject: dccp ccid-2: Remove wrappers around sk_{reset,stop}_timer() This removes the wrappers around the sk timer functions, since not much is gained from using them: the BUG_ON in start_rto_timer will never trigger since that function is called only if: * the RTO timer expires (rto_expire, and then timer_pending() is false); * in tx_packet_sent only if !timer_pending() (BUG_ON is redundant here); * previously in new_ack, after stopping the timer (timer_pending() false). Removing the wrappers also clears the way for eventually replacing the RTO timer with the icsk-retransmission-timer, as it is already part of the DCCP socket. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 28 +++------------------------- 1 file changed, 3 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 0cff637..8c95813 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -111,8 +111,6 @@ static void ccid2_change_l_ack_ratio(struct sock *sk, u32 val) dp->dccps_l_ack_ratio = val; } -static void ccid2_start_rto_timer(struct sock *sk); - static void ccid2_hc_tx_rto_expire(unsigned long data) { struct sock *sk = (struct sock *)data; @@ -131,7 +129,7 @@ static void ccid2_hc_tx_rto_expire(unsigned long data) if (hc->tx_rto > DCCP_RTO_MAX) hc->tx_rto = DCCP_RTO_MAX; - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); /* adjust pipe, cwnd etc */ hc->tx_ssthresh = hc->tx_cwnd / 2; @@ -153,16 +151,6 @@ out: sock_put(sk); } -static void ccid2_start_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - ccid2_pr_debug("setting RTO timeout=%u\n", hc->tx_rto); - - BUG_ON(timer_pending(&hc->tx_rtotimer)); - sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); -} - static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); @@ -239,9 +227,7 @@ static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) } #endif - /* setup RTO timer */ - if (!timer_pending(&hc->tx_rtotimer)) - ccid2_start_rto_timer(sk); + sk_reset_timer(sk, &hc->tx_rtotimer, jiffies + hc->tx_rto); #ifdef CONFIG_IP_DCCP_CCID2_DEBUG do { @@ -320,14 +306,6 @@ out_invalid_option: return -1; } -static void ccid2_hc_tx_kill_rto_timer(struct sock *sk) -{ - struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); - - sk_stop_timer(sk, &hc->tx_rtotimer); - ccid2_pr_debug("deleted RTO timer\n"); -} - /** * ccid2_rtt_estimator - Sample RTT and compute RTO using RFC2988 algorithm * This code is almost identical with TCP's tcp_rtt_estimator(), since @@ -692,7 +670,7 @@ static void ccid2_hc_tx_exit(struct sock *sk) struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); int i; - ccid2_hc_tx_kill_rto_timer(sk); + sk_stop_timer(sk, &hc->tx_rtotimer); for (i = 0; i < hc->tx_seqbufc; i++) kfree(hc->tx_seqbuf[i]); -- cgit v1.1 From 22b71c8f4f3db8df92f5e7b081c265bc56c0bd2f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:12 +0000 Subject: tcp/dccp: Consolidate common code for RFC 3390 conversion This patch consolidates initial-window code common to TCP and CCID-2: * TCP uses RFC 3390 in a packet-oriented manner (tcp_input.c) and * CCID-2 uses RFC 3390 in packet-oriented manner (RFC 4341). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 8 ++------ net/ipv4/tcp_input.c | 17 ++--------------- 2 files changed, 4 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index 8c95813..b9c942a 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -641,12 +641,8 @@ static int ccid2_hc_tx_init(struct ccid *ccid, struct sock *sk) /* RFC 4341, 5: initialise ssthresh to arbitrarily high (max) value */ hc->tx_ssthresh = ~0U; - /* - * RFC 4341, 5: "The cwnd parameter is initialized to at most four - * packets for new connections, following the rules from [RFC3390]". - * We need to convert the bytes of RFC3390 into the packets of RFC 4341. - */ - hc->tx_cwnd = clamp(4380U / dp->dccps_mss_cache, 2U, 4U); + /* Use larger initial windows (RFC 4341, section 5). */ + hc->tx_cwnd = rfc3390_bytes_to_packets(dp->dccps_mss_cache); /* Make sure that Ack Ratio is enabled and within bounds. */ max_ratio = DIV_ROUND_UP(hc->tx_cwnd, 2); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e663b78..1bc87a0 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -805,25 +805,12 @@ void tcp_update_metrics(struct sock *sk) } } -/* Numbers are taken from RFC3390. - * - * John Heffner states: - * - * The RFC specifies a window of no more than 4380 bytes - * unless 2*MSS > 4380. Reading the pseudocode in the RFC - * is a bit misleading because they use a clamp at 4380 bytes - * rather than use a multiplier in the relevant range. - */ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) { __u32 cwnd = (dst ? dst_metric(dst, RTAX_INITCWND) : 0); - if (!cwnd) { - if (tp->mss_cache > 1460) - cwnd = 2; - else - cwnd = (tp->mss_cache > 1095) ? 3 : 4; - } + if (!cwnd) + cwnd = rfc3390_bytes_to_packets(tp->mss_cache); return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } -- cgit v1.1 From 4886fcad6e12572afbd230dfab1b268eace20d6d Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:13 +0000 Subject: dccp ccid-2: Share TCP's minimum RTO code Using a fixed RTO_MIN of 0.2 seconds was found to cause problems for CCID-2 over 802.11g: at least once per session there was a spurious timeout. It helped to then increase the the value of RTO_MIN over this link. Since the problem is the same as in TCP, this patch makes the solution from commit "05bb1fad1cde025a864a90cfeb98dcbefe78a44a" "[TCP]: Allow minimum RTO to be configurable via routing metrics." available to DCCP. This avoids reinventing the wheel, so that e.g. the following works in the expected way now also for CCID-2: > ip route change 10.0.0.2 rto_min 800 dev ath0 Luckily this useful rto_min function was recently moved to net/tcp.h, which simplifies sharing code originating from TCP. Documentation also updated (plus minor whitespace fixes). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/ccid2.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index b9c942a..dc18172 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -325,8 +325,9 @@ static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) hc->tx_srtt = m << 3; hc->tx_mdev = m << 1; - hc->tx_mdev_max = max(TCP_RTO_MIN, hc->tx_mdev); + hc->tx_mdev_max = max(hc->tx_mdev, tcp_rto_min(sk)); hc->tx_rttvar = hc->tx_mdev_max; + hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; } else { /* Update scaled SRTT as SRTT += 1/8 * (m - SRTT) */ @@ -367,7 +368,7 @@ static void ccid2_rtt_estimator(struct sock *sk, const long mrtt) hc->tx_rttvar -= (hc->tx_rttvar - hc->tx_mdev_max) >> 2; hc->tx_rtt_seq = dccp_sk(sk)->dccps_gss; - hc->tx_mdev_max = TCP_RTO_MIN; + hc->tx_mdev_max = tcp_rto_min(sk); } } -- cgit v1.1 From 89858ad14307a398961a0f1414b04053c1475e4f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 29 Aug 2010 19:23:14 +0000 Subject: dccp ccid-3: use per-route RTO or TCP RTO as fallback This makes RTAX_RTO_MIN also available to CCID-3, replacing the compile-time RTO lower bound with a per-route tunable value. The original Kconfig option solved the problem that a very low RTT (in the order of HZ) can trigger too frequent and unnecessary reductions of the sending rate. This tunable does not affect the initial RTO value of 2 seconds specified in RFC 5348, section 4.2 and Appendix B. But like the hardcoded Kconfig value, it allows to adapt to network conditions. The same effect as the original Kconfig option of 100ms is now achieved by > ip route replace to unicast 192.168.0.0/24 rto_min 100j dev eth0 (assuming HZ=1000). Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/dccp/ccids/Kconfig | 31 ------------------------------- net/dccp/ccids/ccid3.c | 11 +++++------ net/dccp/ccids/ccid3.h | 2 +- 3 files changed, 6 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/Kconfig b/net/dccp/ccids/Kconfig index 8408398..0581143 100644 --- a/net/dccp/ccids/Kconfig +++ b/net/dccp/ccids/Kconfig @@ -47,37 +47,6 @@ config IP_DCCP_CCID3_DEBUG If in doubt, say N. -config IP_DCCP_CCID3_RTO - int "Use higher bound for nofeedback timer" - default 100 - depends on IP_DCCP_CCID3 && EXPERIMENTAL - ---help--- - Use higher lower bound for nofeedback timer expiration. - - The TFRC nofeedback timer normally expires after the maximum of 4 - RTTs and twice the current send interval (RFC 3448, 4.3). On LANs - with a small RTT this can mean a high processing load and reduced - performance, since then the nofeedback timer is triggered very - frequently. - - This option enables to set a higher lower bound for the nofeedback - value. Values in units of milliseconds can be set here. - - A value of 0 disables this feature by enforcing the value specified - in RFC 3448. The following values have been suggested as bounds for - experimental use: - * 16-20ms to match the typical multimedia inter-frame interval - * 100ms as a reasonable compromise [default] - * 1000ms corresponds to the lower TCP RTO bound (RFC 2988, 2.4) - - The default of 100ms is a compromise between a large value for - efficient DCCP implementations, and a small value to avoid disrupting - the network in times of congestion. - - The purpose of the nofeedback timer is to slow DCCP down when there - is serious network congestion: experimenting with larger values should - therefore not be performed on WANs. - config IP_DCCP_TFRC_LIB def_bool y if IP_DCCP_CCID3 diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 4340672..278e170 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -460,13 +460,12 @@ done_computing_x: sk->sk_write_space(sk); /* - * Update timeout interval for the nofeedback timer. - * We use a configuration option to increase the lower bound. - * This can help avoid triggering the nofeedback timer too - * often ('spinning') on LANs with small RTTs. + * Update timeout interval for the nofeedback timer. In order to control + * rate halving on networks with very low RTTs (<= 1 ms), use per-route + * tunable RTAX_RTO_MIN value as the lower bound. */ - hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, (CONFIG_IP_DCCP_CCID3_RTO * - (USEC_PER_SEC / 1000))); + hc->tx_t_rto = max_t(u32, 4 * hc->tx_rtt, + USEC_PER_SEC/HZ * tcp_rto_min(sk)); /* * Schedule no feedback timer to expire in * max(t_RTO, 2 * s/X) = max(t_RTO, 2 * t_ipi) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index b186424..b7e569c 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -42,7 +42,7 @@ #include "lib/tfrc.h" #include "../ccid.h" -/* Two seconds as per RFC 3448 4.2 */ +/* Two seconds as per RFC 5348, 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) /* In usecs - half the scheduling granularity as per RFC3448 4.6 */ -- cgit v1.1 From 6dcd814bd08bc7989f7f3eac9bbe8b20aec0182a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Aug 2010 07:04:14 +0000 Subject: net: struct xfrm_tunnel in read_mostly section tunnel4_handlers chain being scanned for each incoming packet, make sure it doesnt share an often dirtied cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 2 +- net/ipv4/tunnel4.c | 4 ++-- net/ipv4/xfrm4_tunnel.c | 4 ++-- net/ipv6/sit.c | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index ec03673..3c6f8f3 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -744,7 +744,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) ipn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel ipip_handler = { +static struct xfrm_tunnel ipip_handler __read_mostly = { .handler = ipip_rcv, .err_handler = ipip_err, .priority = 1, diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 59186ca..942f02d 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -14,8 +14,8 @@ #include #include -static struct xfrm_tunnel *tunnel4_handlers; -static struct xfrm_tunnel *tunnel64_handlers; +static struct xfrm_tunnel *tunnel4_handlers __read_mostly; +static struct xfrm_tunnel *tunnel64_handlers __read_mostly; static DEFINE_MUTEX(tunnel4_mutex); static inline struct xfrm_tunnel **fam_handlers(unsigned short family) diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 41f5982..8280645 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -58,14 +58,14 @@ static int xfrm_tunnel_err(struct sk_buff *skb, u32 info) return -ENOENT; } -static struct xfrm_tunnel xfrm_tunnel_handler = { +static struct xfrm_tunnel xfrm_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, }; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -static struct xfrm_tunnel xfrm64_tunnel_handler = { +static struct xfrm_tunnel xfrm64_tunnel_handler __read_mostly = { .handler = xfrm_tunnel_rcv, .err_handler = xfrm_tunnel_err, .priority = 2, diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 4699cd3..86618eb 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1132,7 +1132,7 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) sitn->tunnels_wc[0] = tunnel; } -static struct xfrm_tunnel sit_handler = { +static struct xfrm_tunnel sit_handler __read_mostly = { .handler = ipip6_rcv, .err_handler = ipip6_err, .priority = 1, -- cgit v1.1 From 3ff2cfa55fb35bb5ea4490fbc82bb3c6771c121b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Aug 2010 10:27:10 +0000 Subject: ipv6: struct xfrm6_tunnel in read_mostly section tunnel6_handlers chain being scanned for each incoming packet, make sure it doesnt share an often dirtied cache line. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 4 ++-- net/ipv6/tunnel6.c | 4 ++-- net/ipv6/xfrm6_tunnel.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 0fd027f..29f99dd 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1372,13 +1372,13 @@ static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev) ip6n->tnls_wc[0] = t; } -static struct xfrm6_tunnel ip4ip6_handler = { +static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { .handler = ip4ip6_rcv, .err_handler = ip4ip6_err, .priority = 1, }; -static struct xfrm6_tunnel ip6ip6_handler = { +static struct xfrm6_tunnel ip6ip6_handler __read_mostly = { .handler = ip6ip6_rcv, .err_handler = ip6ip6_err, .priority = 1, diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index fc3c86a..d203e6d 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -30,8 +30,8 @@ #include #include -static struct xfrm6_tunnel *tunnel6_handlers; -static struct xfrm6_tunnel *tunnel46_handlers; +static struct xfrm6_tunnel *tunnel6_handlers __read_mostly; +static struct xfrm6_tunnel *tunnel46_handlers __read_mostly; static DEFINE_MUTEX(tunnel6_mutex); int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 2ce3a82..ac7584b 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -317,13 +317,13 @@ static const struct xfrm_type xfrm6_tunnel_type = { .output = xfrm6_tunnel_output, }; -static struct xfrm6_tunnel xfrm6_tunnel_handler = { +static struct xfrm6_tunnel xfrm6_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, }; -static struct xfrm6_tunnel xfrm46_tunnel_handler = { +static struct xfrm6_tunnel xfrm46_tunnel_handler __read_mostly = { .handler = xfrm6_tunnel_rcv, .err_handler = xfrm6_tunnel_err, .priority = 2, -- cgit v1.1 From 3ba06c6fbd651ed3377e584026d1c112b492cc8b Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Fri, 27 Aug 2010 22:21:13 +0300 Subject: mac80211: Fix signal strength average initialization for CQM events The ave_beacon_signal value uses 1/16 dB unit and as such, must be initialized with the signal level of the first Beacon frame multiplied by 16. This fixes an issue where the initial CQM events are reported incorrectly with a burst of events while the running average approaches the correct value after the incorrect initialization. This could cause user space -based roaming decision process to get quite confused at the moment when we would like to go through authentication and DHCP. Cc: stable@kernel.org Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c869447..7915726 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1553,7 +1553,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->last_beacon_signal = rx_status->signal; if (ifmgd->flags & IEEE80211_STA_RESET_SIGNAL_AVE) { ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; - ifmgd->ave_beacon_signal = rx_status->signal; + ifmgd->ave_beacon_signal = rx_status->signal * 16; ifmgd->last_cqm_event_signal = 0; } else { ifmgd->ave_beacon_signal = -- cgit v1.1 From 391a200a89bf85bd38f117f34898c24299e3d53d Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Fri, 27 Aug 2010 22:22:00 +0300 Subject: mac80211: Do not generate CQM events based on first Beacon frames The signal strength value in a single RX frame is not that reliable, so it is better to delay start of CQM events until there is a real average signal strength from more than a single Beacon frame available. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 7 +++++++ net/mac80211/mlme.c | 9 +++++++++ 2 files changed, 16 insertions(+) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9af50fb..16f7fb1 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -369,6 +369,13 @@ struct ieee80211_if_managed { int ave_beacon_signal; /* + * Number of Beacon frames used in ave_beacon_signal. This can be used + * to avoid generating less reliable cqm events that would be based + * only on couple of received frames. + */ + unsigned int count_beacon_signal; + + /* * Last Beacon frame signal strength average (ave_beacon_signal / 16) * that triggered a cqm event. 0 indicates that no event has been * generated for the current association. diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 7915726..0cb822c 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -54,6 +54,12 @@ */ #define IEEE80211_SIGNAL_AVE_WEIGHT 3 +/* + * How many Beacon frames need to have been used in average signal strength + * before starting to indicate signal change events. + */ +#define IEEE80211_SIGNAL_AVE_MIN_COUNT 4 + #define TMR_RUNNING_TIMER 0 #define TMR_RUNNING_CHANSW 1 @@ -1555,13 +1561,16 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, ifmgd->flags &= ~IEEE80211_STA_RESET_SIGNAL_AVE; ifmgd->ave_beacon_signal = rx_status->signal * 16; ifmgd->last_cqm_event_signal = 0; + ifmgd->count_beacon_signal = 1; } else { ifmgd->ave_beacon_signal = (IEEE80211_SIGNAL_AVE_WEIGHT * rx_status->signal * 16 + (16 - IEEE80211_SIGNAL_AVE_WEIGHT) * ifmgd->ave_beacon_signal) / 16; + ifmgd->count_beacon_signal++; } if (bss_conf->cqm_rssi_thold && + ifmgd->count_beacon_signal >= IEEE80211_SIGNAL_AVE_MIN_COUNT && !(local->hw.flags & IEEE80211_HW_SUPPORTS_CQM_RSSI)) { int sig = ifmgd->ave_beacon_signal / 16; int last_event = ifmgd->last_cqm_event_signal; -- cgit v1.1 From 3653910714a4a9b19aadb202c24f7b1ae61d3556 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sat, 28 Aug 2010 17:41:06 +0200 Subject: net/wireless: Remove double test The same expression is tested twice and the result is the same each time. The sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @expression@ expression E; @@ ( * E || ... || E | * E && ... && E ) // Signed-off-by: Julia Lawall Signed-off-by: John W. Linville --- net/wireless/wext-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 0ef17bc..4038593 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -611,7 +611,7 @@ struct iw_statistics *get_wireless_stats(struct net_device *dev) #endif #ifdef CONFIG_CFG80211_WEXT - if (dev->ieee80211_ptr && dev->ieee80211_ptr && + if (dev->ieee80211_ptr && dev->ieee80211_ptr->wiphy && dev->ieee80211_ptr->wiphy->wext && dev->ieee80211_ptr->wiphy->wext->get_wireless_stats) -- cgit v1.1 From 18145c69349f2ab60c470798f83b3a2639e2a8d9 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Mon, 30 Aug 2010 15:12:02 -0400 Subject: mac80211: cancel scan in ieee80211_restart_hw if software scan pending This function exists to clean-up after a hardware error or something similar. The restart is accomplished using the same infrastructure used to resume after a suspend. The suspend path cancels running scans, so it seems appropriate to do that here as well for software-based scans. If a hardware-based scan is pending, issue a warning message since this indicates that the drivers has failed to clean-up after itself. Signed-off-by: John W. Linville --- net/mac80211/main.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 93194f6..a06b6ee 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -305,7 +305,13 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) trace_api_restart_hw(local); - /* use this reason, __ieee80211_resume will unblock it */ + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s called with hardware scan in progress\n", __func__); + + if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) + ieee80211_scan_cancel(local); + + /* use this reason, ieee80211_reconfig will unblock it */ ieee80211_stop_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_SUSPEND); -- cgit v1.1 From 1a98214feef2221cd7c24b17cd688a5a9d85b2ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Mon, 30 Aug 2010 12:57:03 +0000 Subject: Phonet: restore flow control credits when sending fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pep.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index b2a3ae6..5034f0f 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -834,6 +834,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; + int err; if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { @@ -852,7 +853,10 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; - return pn_skb_send(sk, skb, &pipe_srv); + err = pn_skb_send(sk, skb, &pipe_srv); + if (err && pn_flow_safe(pn->tx_fc)) + atomic_inc(&pn->tx_credits); + return err; } static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, -- cgit v1.1 From 02ac3268a581639af241c254579160909373e12c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Mon, 30 Aug 2010 12:57:04 +0000 Subject: Phonet: correct sendmsg() error code from sock_alloc_send_skb() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pep.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 5034f0f..04e3419 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -876,7 +876,7 @@ static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, skb = sock_alloc_send_skb(sk, MAX_PNPIPE_HEADER + len, flags & MSG_DONTWAIT, &err); if (!skb) - return -ENOBUFS; + return err; skb_reserve(skb, MAX_PHONET_HEADER + 3); err = memcpy_fromiovec(skb_put(skb, len), msg->msg_iov, len); -- cgit v1.1 From 01b38606bded44bf8b7ca42e8fe5f2cad5d28121 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Mon, 30 Aug 2010 12:57:05 +0000 Subject: Phonet: do not set POLLOUT in case of send buffer overflow MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/socket.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 6e9848bf..7c91f73 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -281,7 +281,9 @@ static unsigned int pn_socket_poll(struct file *file, struct socket *sock, if (!mask && sk->sk_state == TCP_CLOSE_WAIT) return POLLHUP; - if (sk->sk_state == TCP_ESTABLISHED && atomic_read(&pn->tx_credits)) + if (sk->sk_state == TCP_ESTABLISHED && + atomic_read(&sk->sk_wmem_alloc) < sk->sk_sndbuf && + atomic_read(&pn->tx_credits)) mask |= POLLOUT | POLLWRNORM | POLLWRBAND; return mask; -- cgit v1.1 From 72ed62f7c9f0abe11231d073195a722ee43d6ec1 Mon Sep 17 00:00:00 2001 From: Jiri Pirko Date: Tue, 31 Aug 2010 04:58:18 +0000 Subject: vlan: Use vlan_dev_real_dev in vlan_hwaccel_do_receive [patch net-next-2.6] vlan: Use vlan_dev_real_dev in vlan_hwaccel_do_receive Use helper as in other places. Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller --- net/8021q/vlan_core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 3438c01..889f4ac 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -40,7 +40,7 @@ void vlan_hwaccel_do_receive(struct sk_buff *skb) struct net_device *dev = skb->dev; struct vlan_rx_stats *rx_stats; - skb->dev = vlan_dev_info(dev)->real_dev; + skb->dev = vlan_dev_real_dev(dev); netif_nit_deliver(skb); skb->dev = dev; -- cgit v1.1 From ba4fd9d8282f7f856f2287fe8be784d1dfdda28b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Aug 2010 01:57:35 +0000 Subject: pktgen: remove non used variable remove non used variable "queue" in pg_cleanup Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/pktgen.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 10a1ea7..386c228 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -3907,8 +3907,6 @@ static void __exit pg_cleanup(void) { struct pktgen_thread *t; struct list_head *q, *n; - wait_queue_head_t queue; - init_waitqueue_head(&queue); /* Stop all interfaces & threads */ -- cgit v1.1 From 1639ab6f7831f056286c64d98e8e5eb04e3bacac Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Aug 2010 10:23:47 +0000 Subject: gro: unexport tcp4_gro_receive and tcp4_gro_complete tcp4_gro_receive() and tcp4_gro_complete() dont need to be exported. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tcp_ipv4.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0207662..a0232f3 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2571,7 +2571,6 @@ struct sk_buff **tcp4_gro_receive(struct sk_buff **head, struct sk_buff *skb) return tcp_gro_receive(head, skb); } -EXPORT_SYMBOL(tcp4_gro_receive); int tcp4_gro_complete(struct sk_buff *skb) { @@ -2584,7 +2583,6 @@ int tcp4_gro_complete(struct sk_buff *skb) return tcp_gro_complete(skb); } -EXPORT_SYMBOL(tcp4_gro_complete); struct proto tcp_prot = { .name = "TCP", -- cgit v1.1 From 875168a9330d3aa6481ce62ce8fa77c7be0c75fb Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 30 Aug 2010 11:07:25 +0000 Subject: net: tunnels should use rcu_dereference tunnel4_handlers, tunnel64_handlers, tunnel6_handlers and tunnel46_handlers are protected by RCU, but we dont use appropriate rcu primitives to scan them. rcu_lock() is already held by caller. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tunnel4.c | 13 +++++++++---- net/ipv6/tunnel6.c | 11 ++++++++--- 2 files changed, 17 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index 942f02d..df59d163 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -73,6 +73,11 @@ int xfrm4_tunnel_deregister(struct xfrm_tunnel *handler, unsigned short family) } EXPORT_SYMBOL(xfrm4_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel4_rcv(struct sk_buff *skb) { struct xfrm_tunnel *handler; @@ -80,7 +85,7 @@ static int tunnel4_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->handler(skb)) return 0; @@ -99,7 +104,7 @@ static int tunnel64_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->handler(skb)) return 0; @@ -115,7 +120,7 @@ static void tunnel4_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel4_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel4_handlers, handler) if (!handler->err_handler(skb, info)) break; } @@ -125,7 +130,7 @@ static void tunnel64_err(struct sk_buff *skb, u32 info) { struct xfrm_tunnel *handler; - for (handler = tunnel64_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel64_handlers, handler) if (!handler->err_handler(skb, info)) break; } diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index d203e6d..3177fe0 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -88,6 +88,11 @@ int xfrm6_tunnel_deregister(struct xfrm6_tunnel *handler, unsigned short family) EXPORT_SYMBOL(xfrm6_tunnel_deregister); +#define for_each_tunnel_rcu(head, handler) \ + for (handler = rcu_dereference(head); \ + handler != NULL; \ + handler = rcu_dereference(handler->next)) \ + static int tunnel6_rcv(struct sk_buff *skb) { struct xfrm6_tunnel *handler; @@ -95,7 +100,7 @@ static int tunnel6_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct ipv6hdr))) goto drop; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->handler(skb)) return 0; @@ -113,7 +118,7 @@ static int tunnel46_rcv(struct sk_buff *skb) if (!pskb_may_pull(skb, sizeof(struct iphdr))) goto drop; - for (handler = tunnel46_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel46_handlers, handler) if (!handler->handler(skb)) return 0; @@ -129,7 +134,7 @@ static void tunnel6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, { struct xfrm6_tunnel *handler; - for (handler = tunnel6_handlers; handler; handler = handler->next) + for_each_tunnel_rcu(tunnel6_handlers, handler) if (!handler->err_handler(skb, opt, type, code, offset, info)) break; } -- cgit v1.1 From 86cac58b71227cc34a3d0e78f19585c0eff49ea3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 31 Aug 2010 18:25:32 +0000 Subject: skge: add GRO support - napi_gro_flush() is exported from net/core/dev.c, to avoid an irq_save/irq_restore in the packet receive path. - use napi_gro_receive() instead of netif_receive_skb() - use napi_gro_flush() before calling __napi_complete() - turn on NETIF_F_GRO by default - Tested on a Marvell 88E8001 Gigabit NIC Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 63bd20a..d8c43e7 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3063,7 +3063,7 @@ out: return netif_receive_skb(skb); } -static void napi_gro_flush(struct napi_struct *napi) +inline void napi_gro_flush(struct napi_struct *napi) { struct sk_buff *skb, *next; @@ -3076,6 +3076,7 @@ static void napi_gro_flush(struct napi_struct *napi) napi->gro_count = 0; napi->gro_list = NULL; } +EXPORT_SYMBOL(napi_gro_flush); enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff *skb) { -- cgit v1.1 From 6602cebb5bcac1fccf2850541f8bf9fcc8c86dee Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 1 Sep 2010 05:25:10 +0000 Subject: net: skbuff.c cleanup (skb->data - skb->head) can be changed by skb_headroom(skb) Remove some uses of NET_SKBUFF_DATA_USES_OFFSET, using (skb_end_pointer(skb) - skb->head) or (skb_tail_pointer(skb) - skb->head) : compiler does the right thing, and this is more readable for us ;) (struct skb_shared_info *) casts in pskb_expand_head() to help memcpy() to use aligned moves. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 47 +++++++++++++++-------------------------------- 1 file changed, 15 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index e2535fb..231dff0 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -685,16 +685,10 @@ static void copy_skb_header(struct sk_buff *new, const struct sk_buff *old) struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask) { - int headerlen = skb->data - skb->head; - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end + skb->data_len, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head + skb->data_len, gfp_mask); -#endif + int headerlen = skb_headroom(skb); + unsigned int size = (skb_end_pointer(skb) - skb->head) + skb->data_len; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) return NULL; @@ -726,20 +720,14 @@ EXPORT_SYMBOL(skb_copy); struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask) { - /* - * Allocate the copy buffer - */ - struct sk_buff *n; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - n = alloc_skb(skb->end, gfp_mask); -#else - n = alloc_skb(skb->end - skb->head, gfp_mask); -#endif + unsigned int size = skb_end_pointer(skb) - skb->head; + struct sk_buff *n = alloc_skb(size, gfp_mask); + if (!n) goto out; /* Set the data pointer */ - skb_reserve(n, skb->data - skb->head); + skb_reserve(n, skb_headroom(skb)); /* Set the tail pointer and length */ skb_put(n, skb_headlen(skb)); /* Copy the bytes */ @@ -791,11 +779,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, { int i; u8 *data; -#ifdef NET_SKBUFF_DATA_USES_OFFSET - int size = nhead + skb->end + ntail; -#else - int size = nhead + (skb->end - skb->head) + ntail; -#endif + int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; long off; BUG_ON(nhead < 0); @@ -810,13 +794,12 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, goto nodata; /* Copy only real data... and, alas, header. This should be - * optimized for the cases when header is void. */ -#ifdef NET_SKBUFF_DATA_USES_OFFSET - memcpy(data + nhead, skb->head, skb->tail); -#else - memcpy(data + nhead, skb->head, skb->tail - skb->head); -#endif - memcpy(data + size, skb_end_pointer(skb), + * optimized for the cases when header is void. + */ + memcpy(data + nhead, skb->head, skb_tail_pointer(skb) - skb->head); + + memcpy((struct skb_shared_info *)(data + size), + skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) -- cgit v1.1 From 85f72bc839705294b32b6c16b491c0422f0a71b3 Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 1 Sep 2010 16:12:28 -0400 Subject: mac80211: only cancel software-based scans on suspend Otherwise the hardware scan handler could access an invalid scan request structure. The driver should cancel any pending hardware scans during the suspend process anyway, so also add a warning if the hardware scan is still pending when the device resumes. Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 6 ++++++ net/mac80211/pm.c | 3 ++- 2 files changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 16f7fb1..4e635e2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1186,6 +1186,12 @@ int __ieee80211_suspend(struct ieee80211_hw *hw); static inline int __ieee80211_resume(struct ieee80211_hw *hw) { + struct ieee80211_local *local = hw_to_local(hw); + + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s: resume with hardware scan still in progress\n", + wiphy_name(hw->wiphy)); + return ieee80211_reconfig(hw_to_local(hw)); } #else diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index d287fde..ce671df 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -12,7 +12,8 @@ int __ieee80211_suspend(struct ieee80211_hw *hw) struct ieee80211_sub_if_data *sdata; struct sta_info *sta; - ieee80211_scan_cancel(local); + if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) + ieee80211_scan_cancel(local); ieee80211_stop_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_SUSPEND); -- cgit v1.1 From fa50d6457691d5c2d8a3430abf950435ef129cf1 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 31 Aug 2010 12:14:13 +0000 Subject: net: make rx_queue sysfs_ops const Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/net-sysfs.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 7d74854..76485a3 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -515,7 +515,7 @@ static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr, return attribute->store(queue, attribute, buf, count); } -static struct sysfs_ops rx_queue_sysfs_ops = { +static const struct sysfs_ops rx_queue_sysfs_ops = { .show = rx_queue_attr_show, .store = rx_queue_attr_store, }; -- cgit v1.1 From 0705c6f0e2d39333645bf77cf1efb94526ff1f82 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Wed, 1 Sep 2010 00:28:35 +0000 Subject: tcp: update also tcp_output with regard to RFC 5681 Thanks to Ilpo Jarvinen, this updates also the initial window setting for tcp_output with regard to RFC 5681. Signed-off-by: Gerrit Renker Signed-off-by: David S. Miller --- net/ipv4/tcp_output.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 01b94b8..ea09d2f 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -224,16 +224,10 @@ void tcp_select_initial_window(int __space, __u32 mss, } } - /* Set initial window to value enough for senders, - * following RFC2414. Senders, not following this RFC, - * will be satisfied with 2. - */ + /* Set initial window to value enough for senders, following RFC5681. */ if (mss > (1 << *rcv_wscale)) { - int init_cwnd = 4; - if (mss > 1460 * 3) - init_cwnd = 2; - else if (mss > 1460) - init_cwnd = 3; + int init_cwnd = rfc3390_bytes_to_packets(mss); + /* when initializing use the value from init_rcv_wnd * rather than the default from above */ -- cgit v1.1 From 95f4b45bc688b03107f5452ccda29496fc1b4ecf Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Sep 2010 09:19:32 -0700 Subject: net: another last_rx round Kill last_rx use in l2tp and two net drivers Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_eth.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_eth.c b/net/l2tp/l2tp_eth.c index 58c6c4c..f3468ba 100644 --- a/net/l2tp/l2tp_eth.c +++ b/net/l2tp/l2tp_eth.c @@ -144,7 +144,6 @@ static void l2tp_eth_dev_recv(struct l2tp_session *session, struct sk_buff *skb, nf_reset(skb); if (dev_forward_skb(dev, skb) == NET_RX_SUCCESS) { - dev->last_rx = jiffies; dev->stats.rx_packets++; dev->stats.rx_bytes += data_len; } else -- cgit v1.1 From 8ed2163ff3b6abc5143d46dea73e523b22a6f987 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Wed, 1 Sep 2010 22:19:14 +0000 Subject: ipvs: use pkts for SCTP too Use correctly the in_pkts packet counter also for SCTP Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman Signed-off-by: David S. Miller --- net/netfilter/ipvs/ip_vs_core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index edbfb96..b7ce5b4 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -1383,8 +1383,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && cp->protocol == IPPROTO_SCTP) { if ((cp->state == IP_VS_SCTP_S_ESTABLISHED && - (atomic_read(&cp->in_pkts) % - sysctl_ip_vs_sync_threshold[1] + (pkts % sysctl_ip_vs_sync_threshold[1] == sysctl_ip_vs_sync_threshold[0])) || (cp->old_state != cp->state && ((cp->state == IP_VS_SCTP_S_CLOSED) || @@ -1395,7 +1394,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, } } - if (af == AF_INET && + /* Keep this block last: TCP and others with pp->num_states <= 1 */ + else if (af == AF_INET && (ip_vs_sync_state & IP_VS_STATE_MASTER) && (((cp->protocol != IPPROTO_TCP || cp->state == IP_VS_TCP_S_ESTABLISHED) && -- cgit v1.1 From c07b68e841bd737e2ebeb57268d251c89ccc5010 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Sep 2010 03:53:46 +0000 Subject: net: dev_add_pack() & __dev_remove_pack() changes Add a small helper ptype_head() to get the head to manipulate dev_add_pack() & __dev_remove_pack() can use a spinlock without blocking BH, since softirq use RCU, and these functions are run from process context only. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 32 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d8c43e7..efd318d 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -371,6 +371,14 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) * --ANK (980803) */ +static inline struct list_head *ptype_head(const struct packet_type *pt) +{ + if (pt->type == htons(ETH_P_ALL)) + return &ptype_all; + else + return &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; +} + /** * dev_add_pack - add packet handler * @pt: packet type declaration @@ -386,16 +394,11 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev) void dev_add_pack(struct packet_type *pt) { - int hash; + struct list_head *head = ptype_head(pt); - spin_lock_bh(&ptype_lock); - if (pt->type == htons(ETH_P_ALL)) - list_add_rcu(&pt->list, &ptype_all); - else { - hash = ntohs(pt->type) & PTYPE_HASH_MASK; - list_add_rcu(&pt->list, &ptype_base[hash]); - } - spin_unlock_bh(&ptype_lock); + spin_lock(&ptype_lock); + list_add_rcu(&pt->list, head); + spin_unlock(&ptype_lock); } EXPORT_SYMBOL(dev_add_pack); @@ -414,15 +417,10 @@ EXPORT_SYMBOL(dev_add_pack); */ void __dev_remove_pack(struct packet_type *pt) { - struct list_head *head; + struct list_head *head = ptype_head(pt); struct packet_type *pt1; - spin_lock_bh(&ptype_lock); - - if (pt->type == htons(ETH_P_ALL)) - head = &ptype_all; - else - head = &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK]; + spin_lock(&ptype_lock); list_for_each_entry(pt1, head, list) { if (pt == pt1) { @@ -433,7 +431,7 @@ void __dev_remove_pack(struct packet_type *pt) printk(KERN_WARNING "dev_remove_pack: %p not found.\n", pt); out: - spin_unlock_bh(&ptype_lock); + spin_unlock(&ptype_lock); } EXPORT_SYMBOL(__dev_remove_pack); -- cgit v1.1 From deffd77759e3ceb936f0760cc54a213881577a83 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 2 Sep 2010 03:56:51 +0000 Subject: net: arp: code cleanup Clean the code up according to Documentation/CodingStyle. Don't initialize the variable dont_send in arp_process(). Remove the temporary varialbe flags in arp_state_to_flags(). Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/ipv4/arp.c | 226 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 123 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 96c1955..dcfe7e9 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -55,7 +55,7 @@ * Stuart Cheshire : Metricom and grat arp fixes * *** FOR 2.1 clean this up *** * Lawrence V. Stefani: (08/12/96) Added FDDI support. - * Alan Cox : Took the AP1000 nasty FDDI hack and + * Alan Cox : Took the AP1000 nasty FDDI hack and * folded into the mainstream FDDI code. * Ack spit, Linus how did you allow that * one in... @@ -120,7 +120,7 @@ EXPORT_SYMBOL(clip_tbl_hook); #endif #include -#include +#include #include @@ -173,32 +173,32 @@ const struct neigh_ops arp_broken_ops = { EXPORT_SYMBOL(arp_broken_ops); struct neigh_table arp_tbl = { - .family = AF_INET, - .entry_size = sizeof(struct neighbour) + 4, - .key_len = 4, - .hash = arp_hash, - .constructor = arp_constructor, - .proxy_redo = parp_redo, - .id = "arp_cache", - .parms = { - .tbl = &arp_tbl, - .base_reachable_time = 30 * HZ, - .retrans_time = 1 * HZ, - .gc_staletime = 60 * HZ, - .reachable_time = 30 * HZ, - .delay_probe_time = 5 * HZ, - .queue_len = 3, - .ucast_probes = 3, - .mcast_probes = 3, - .anycast_delay = 1 * HZ, - .proxy_delay = (8 * HZ) / 10, - .proxy_qlen = 64, - .locktime = 1 * HZ, + .family = AF_INET, + .entry_size = sizeof(struct neighbour) + 4, + .key_len = 4, + .hash = arp_hash, + .constructor = arp_constructor, + .proxy_redo = parp_redo, + .id = "arp_cache", + .parms = { + .tbl = &arp_tbl, + .base_reachable_time = 30 * HZ, + .retrans_time = 1 * HZ, + .gc_staletime = 60 * HZ, + .reachable_time = 30 * HZ, + .delay_probe_time = 5 * HZ, + .queue_len = 3, + .ucast_probes = 3, + .mcast_probes = 3, + .anycast_delay = 1 * HZ, + .proxy_delay = (8 * HZ) / 10, + .proxy_qlen = 64, + .locktime = 1 * HZ, }, - .gc_interval = 30 * HZ, - .gc_thresh1 = 128, - .gc_thresh2 = 512, - .gc_thresh3 = 1024, + .gc_interval = 30 * HZ, + .gc_thresh1 = 128, + .gc_thresh2 = 512, + .gc_thresh3 = 1024, }; EXPORT_SYMBOL(arp_tbl); @@ -233,7 +233,7 @@ static u32 arp_hash(const void *pkey, const struct net_device *dev) static int arp_constructor(struct neighbour *neigh) { - __be32 addr = *(__be32*)neigh->primary_key; + __be32 addr = *(__be32 *)neigh->primary_key; struct net_device *dev = neigh->dev; struct in_device *in_dev; struct neigh_parms *parms; @@ -296,16 +296,19 @@ static int arp_constructor(struct neighbour *neigh) neigh->ops = &arp_broken_ops; neigh->output = neigh->ops->output; return 0; +#else + break; #endif - ;} + } #endif if (neigh->type == RTN_MULTICAST) { neigh->nud_state = NUD_NOARP; arp_mc_map(addr, neigh->ha, dev, 1); - } else if (dev->flags&(IFF_NOARP|IFF_LOOPBACK)) { + } else if (dev->flags & (IFF_NOARP | IFF_LOOPBACK)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->dev_addr, dev->addr_len); - } else if (neigh->type == RTN_BROADCAST || dev->flags&IFF_POINTOPOINT) { + } else if (neigh->type == RTN_BROADCAST || + (dev->flags & IFF_POINTOPOINT)) { neigh->nud_state = NUD_NOARP; memcpy(neigh->ha, dev->broadcast, dev->addr_len); } @@ -315,7 +318,7 @@ static int arp_constructor(struct neighbour *neigh) else neigh->ops = &arp_generic_ops; - if (neigh->nud_state&NUD_VALID) + if (neigh->nud_state & NUD_VALID) neigh->output = neigh->ops->connected_output; else neigh->output = neigh->ops->output; @@ -334,7 +337,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) __be32 saddr = 0; u8 *dst_ha = NULL; struct net_device *dev = neigh->dev; - __be32 target = *(__be32*)neigh->primary_key; + __be32 target = *(__be32 *)neigh->primary_key; int probes = atomic_read(&neigh->probes); struct in_device *in_dev; @@ -347,7 +350,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(dev_net(dev), ip_hdr(skb)->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(dev_net(dev), + ip_hdr(skb)->saddr) == RTN_LOCAL) saddr = ip_hdr(skb)->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ @@ -369,16 +373,21 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!saddr) saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); - if ((probes -= neigh->parms->ucast_probes) < 0) { - if (!(neigh->nud_state&NUD_VALID)) - printk(KERN_DEBUG "trying to ucast probe in NUD_INVALID\n"); + probes -= neigh->parms->ucast_probes; + if (probes < 0) { + if (!(neigh->nud_state & NUD_VALID)) + printk(KERN_DEBUG + "trying to ucast probe in NUD_INVALID\n"); dst_ha = neigh->ha; read_lock_bh(&neigh->lock); - } else if ((probes -= neigh->parms->app_probes) < 0) { + } else { + probes -= neigh->parms->app_probes; + if (probes < 0) { #ifdef CONFIG_ARPD - neigh_app_ns(neigh); + neigh_app_ns(neigh); #endif - return; + return; + } } arp_send(ARPOP_REQUEST, ETH_P_ARP, target, dev, saddr, @@ -451,7 +460,8 @@ static int arp_filter(__be32 sip, __be32 tip, struct net_device *dev) * is allowed to use this function, it is scheduled to be removed. --ANK */ -static int arp_set_predefined(int addr_hint, unsigned char * haddr, __be32 paddr, struct net_device * dev) +static int arp_set_predefined(int addr_hint, unsigned char *haddr, + __be32 paddr, struct net_device *dev) { switch (addr_hint) { case RTN_LOCAL: @@ -483,7 +493,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) paddr = skb_rtable(skb)->rt_gateway; - if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(dev_net(dev), paddr), haddr, + paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -515,13 +526,14 @@ int arp_bind_neighbour(struct dst_entry *dst) return -EINVAL; if (n == NULL) { __be32 nexthop = ((struct rtable *)dst)->rt_gateway; - if (dev->flags&(IFF_LOOPBACK|IFF_POINTOPOINT)) + if (dev->flags & (IFF_LOOPBACK | IFF_POINTOPOINT)) nexthop = 0; n = __neigh_lookup_errno( #if defined(CONFIG_ATM_CLIP) || defined(CONFIG_ATM_CLIP_MODULE) - dev->type == ARPHRD_ATM ? clip_tbl_hook : + dev->type == ARPHRD_ATM ? + clip_tbl_hook : #endif - &arp_tbl, &nexthop, dev); + &arp_tbl, &nexthop, dev); if (IS_ERR(n)) return PTR_ERR(n); dst->neighbour = n; @@ -543,8 +555,8 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, if (!IN_DEV_PROXY_ARP(in_dev)) return 0; - - if ((imi = IN_DEV_MEDIUM_ID(in_dev)) == 0) + imi = IN_DEV_MEDIUM_ID(in_dev); + if (imi == 0) return 1; if (imi == -1) return 0; @@ -685,7 +697,7 @@ struct sk_buff *arp_create(int type, int ptype, __be32 dest_ip, arp->ar_pln = 4; arp->ar_op = htons(type); - arp_ptr=(unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); memcpy(arp_ptr, src_hw, dev->addr_len); arp_ptr += dev->addr_len; @@ -735,9 +747,8 @@ void arp_send(int type, int ptype, __be32 dest_ip, skb = arp_create(type, ptype, dest_ip, dev, src_ip, dest_hw, src_hw, target_hw); - if (skb == NULL) { + if (skb == NULL) return; - } arp_xmit(skb); } @@ -815,7 +826,7 @@ static int arp_process(struct sk_buff *skb) /* * Extract fields */ - arp_ptr= (unsigned char *)(arp+1); + arp_ptr = (unsigned char *)(arp + 1); sha = arp_ptr; arp_ptr += dev->addr_len; memcpy(&sip, arp_ptr, 4); @@ -869,16 +880,17 @@ static int arp_process(struct sk_buff *skb) addr_type = rt->rt_type; if (addr_type == RTN_LOCAL) { - int dont_send = 0; + int dont_send; - if (!dont_send) - dont_send |= arp_ignore(in_dev,sip,tip); + dont_send = arp_ignore(in_dev, sip, tip); if (!dont_send && IN_DEV_ARPFILTER(in_dev)) - dont_send |= arp_filter(sip,tip,dev); + dont_send |= arp_filter(sip, tip, dev); if (!dont_send) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); neigh_release(n); } } @@ -887,8 +899,7 @@ static int arp_process(struct sk_buff *skb) if (addr_type == RTN_UNICAST && (arp_fwd_proxy(in_dev, dev, rt) || arp_fwd_pvlan(in_dev, dev, rt, sip, tip) || - pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) - { + pneigh_lookup(&arp_tbl, net, &tip, dev, 0))) { n = neigh_event_ns(&arp_tbl, sha, &sip, dev); if (n) neigh_release(n); @@ -896,9 +907,12 @@ static int arp_process(struct sk_buff *skb) if (NEIGH_CB(skb)->flags & LOCALLY_ENQUEUED || skb->pkt_type == PACKET_HOST || in_dev->arp_parms->proxy_delay == 0) { - arp_send(ARPOP_REPLY,ETH_P_ARP,sip,dev,tip,sha,dev->dev_addr,sha); + arp_send(ARPOP_REPLY, ETH_P_ARP, sip, + dev, tip, sha, dev->dev_addr, + sha); } else { - pneigh_enqueue(&arp_tbl, in_dev->arp_parms, skb); + pneigh_enqueue(&arp_tbl, + in_dev->arp_parms, skb); return 0; } goto out; @@ -939,7 +953,8 @@ static int arp_process(struct sk_buff *skb) if (arp->ar_op != htons(ARPOP_REPLY) || skb->pkt_type != PACKET_HOST) state = NUD_STALE; - neigh_update(n, sha, state, override ? NEIGH_UPDATE_F_OVERRIDE : 0); + neigh_update(n, sha, state, + override ? NEIGH_UPDATE_F_OVERRIDE : 0); neigh_release(n); } @@ -975,7 +990,8 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev, arp->ar_pln != 4) goto freeskb; - if ((skb = skb_share_check(skb, GFP_ATOMIC)) == NULL) + skb = skb_share_check(skb, GFP_ATOMIC); + if (skb == NULL) goto out_of_mem; memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); @@ -1019,7 +1035,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { dev = dev_getbyhwaddr(net, r->arp_ha.sa_family, - r->arp_ha.sa_data); + r->arp_ha.sa_data); if (!dev) return -ENODEV; } @@ -1033,7 +1049,7 @@ static int arp_req_set_public(struct net *net, struct arpreq *r, } static int arp_req_set(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { __be32 ip; struct neighbour *neigh; @@ -1046,10 +1062,11 @@ static int arp_req_set(struct net *net, struct arpreq *r, if (r->arp_flags & ATF_PERM) r->arp_flags |= ATF_COM; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; dev = rt->dst.dev; ip_rt_put(rt); @@ -1083,9 +1100,9 @@ static int arp_req_set(struct net *net, struct arpreq *r, unsigned state = NUD_STALE; if (r->arp_flags & ATF_PERM) state = NUD_PERMANENT; - err = neigh_update(neigh, (r->arp_flags&ATF_COM) ? + err = neigh_update(neigh, (r->arp_flags & ATF_COM) ? r->arp_ha.sa_data : NULL, state, - NEIGH_UPDATE_F_OVERRIDE| + NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); } @@ -1094,12 +1111,12 @@ static int arp_req_set(struct net *net, struct arpreq *r, static unsigned arp_state_to_flags(struct neighbour *neigh) { - unsigned flags = 0; if (neigh->nud_state&NUD_PERMANENT) - flags = ATF_PERM|ATF_COM; + return ATF_PERM | ATF_COM; else if (neigh->nud_state&NUD_VALID) - flags = ATF_COM; - return flags; + return ATF_COM; + else + return 0; } /* @@ -1142,7 +1159,7 @@ static int arp_req_delete_public(struct net *net, struct arpreq *r, } static int arp_req_delete(struct net *net, struct arpreq *r, - struct net_device * dev) + struct net_device *dev) { int err; __be32 ip; @@ -1153,10 +1170,11 @@ static int arp_req_delete(struct net *net, struct arpreq *r, ip = ((struct sockaddr_in *)&r->arp_pa)->sin_addr.s_addr; if (dev == NULL) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = ip, - .tos = RTO_ONLINK } } }; - struct rtable * rt; - if ((err = ip_route_output_key(net, &rt, &fl)) != 0) + struct flowi fl = { .nl_u.ip4_u = { .daddr = ip, + .tos = RTO_ONLINK } }; + struct rtable *rt; + err = ip_route_output_key(net, &rt, &fl); + if (err != 0) return err; dev = rt->dst.dev; ip_rt_put(rt); @@ -1166,7 +1184,7 @@ static int arp_req_delete(struct net *net, struct arpreq *r, err = -ENXIO; neigh = neigh_lookup(&arp_tbl, &ip, dev); if (neigh) { - if (neigh->nud_state&~NUD_NOARP) + if (neigh->nud_state & ~NUD_NOARP) err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE| NEIGH_UPDATE_F_ADMIN); @@ -1186,24 +1204,24 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) struct net_device *dev = NULL; switch (cmd) { - case SIOCDARP: - case SIOCSARP: - if (!capable(CAP_NET_ADMIN)) - return -EPERM; - case SIOCGARP: - err = copy_from_user(&r, arg, sizeof(struct arpreq)); - if (err) - return -EFAULT; - break; - default: - return -EINVAL; + case SIOCDARP: + case SIOCSARP: + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + case SIOCGARP: + err = copy_from_user(&r, arg, sizeof(struct arpreq)); + if (err) + return -EFAULT; + break; + default: + return -EINVAL; } if (r.arp_pa.sa_family != AF_INET) return -EPFNOSUPPORT; if (!(r.arp_flags & ATF_PUBL) && - (r.arp_flags & (ATF_NETMASK|ATF_DONTPUB))) + (r.arp_flags & (ATF_NETMASK | ATF_DONTPUB))) return -EINVAL; if (!(r.arp_flags & ATF_NETMASK)) ((struct sockaddr_in *)&r.arp_netmask)->sin_addr.s_addr = @@ -1211,7 +1229,8 @@ int arp_ioctl(struct net *net, unsigned int cmd, void __user *arg) rtnl_lock(); if (r.arp_dev[0]) { err = -ENODEV; - if ((dev = __dev_get_by_name(net, r.arp_dev)) == NULL) + dev = __dev_get_by_name(net, r.arp_dev); + if (dev == NULL) goto out; /* Mmmm... It is wrong... ARPHRD_NETROM==0 */ @@ -1243,7 +1262,8 @@ out: return err; } -static int arp_netdev_event(struct notifier_block *this, unsigned long event, void *ptr) +static int arp_netdev_event(struct notifier_block *this, unsigned long event, + void *ptr) { struct net_device *dev = ptr; @@ -1311,12 +1331,13 @@ static char *ax2asc2(ax25_address *a, char *buf) for (n = 0, s = buf; n < 6; n++) { c = (a->ax25_call[n] >> 1) & 0x7F; - if (c != ' ') *s++ = c; + if (c != ' ') + *s++ = c; } *s++ = '-'; - - if ((n = ((a->ax25_call[6] >> 1) & 0x0F)) > 9) { + n = (a->ax25_call[6] >> 1) & 0x0F; + if (n > 9) { *s++ = '1'; n -= 10; } @@ -1325,10 +1346,9 @@ static char *ax2asc2(ax25_address *a, char *buf) *s++ = '\0'; if (*buf == '\0' || *buf == '-') - return "*"; + return "*"; return buf; - } #endif /* CONFIG_AX25 */ @@ -1408,10 +1428,10 @@ static void *arp_seq_start(struct seq_file *seq, loff_t *pos) /* ------------------------------------------------------------------------ */ static const struct seq_operations arp_seq_ops = { - .start = arp_seq_start, - .next = neigh_seq_next, - .stop = neigh_seq_stop, - .show = arp_seq_show, + .start = arp_seq_start, + .next = neigh_seq_next, + .stop = neigh_seq_stop, + .show = arp_seq_show, }; static int arp_seq_open(struct inode *inode, struct file *file) -- cgit v1.1 From 65e9b62d4503849b10bedfc29bff0473760cc597 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 3 Sep 2010 02:59:14 +0000 Subject: ipv6: add special mode accept_ra=2 to accept RA while configured as router The current IPv6 behavior is to not accept router advertisements while forwarding, i.e. configured as router. This does make sense, a router is typically not supposed to be auto configured. However there are exceptions and we should allow the current behavior to be overwritten. Therefore this patch enables the user to overrule the "if forwarding enabled then don't listen to RAs" rule by setting accept_ra to the special value of 2. An alternative would be to ignore the forwarding switch alltogether and solely accept RAs based on the value of accept_ra. However, I found that if not intended, accepting RAs as a router can lead to strange unwanted behavior therefore we it seems wise to only do so if the user explicitely asks for this behavior. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/ndisc.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 58841c4..69a0051 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1105,6 +1105,18 @@ errout: rtnl_set_sk_err(net, RTNLGRP_ND_USEROPT, err); } +static inline int accept_ra(struct inet6_dev *in6_dev) +{ + /* + * If forwarding is enabled, RA are not accepted unless the special + * hybrid mode (accept_ra=2) is enabled. + */ + if (in6_dev->cnf.forwarding && in6_dev->cnf.accept_ra < 2) + return 0; + + return in6_dev->cnf.accept_ra; +} + static void ndisc_router_discovery(struct sk_buff *skb) { struct ra_msg *ra_msg = (struct ra_msg *)skb_transport_header(skb); @@ -1158,8 +1170,7 @@ static void ndisc_router_discovery(struct sk_buff *skb) return; } - /* skip route and link configuration on routers */ - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + if (!accept_ra(in6_dev)) goto skip_linkparms; #ifdef CONFIG_IPV6_NDISC_NODETYPE @@ -1309,8 +1320,7 @@ skip_linkparms: NEIGH_UPDATE_F_ISROUTER); } - /* skip route and link configuration on routers */ - if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_ra) + if (!accept_ra(in6_dev)) goto out; #ifdef CONFIG_IPV6_ROUTE_INFO -- cgit v1.1 From c3bccac2fa76f1619dfe4fb7b9bee69de7f066d8 Mon Sep 17 00:00:00 2001 From: Thomas Graf Date: Fri, 3 Sep 2010 03:04:20 +0000 Subject: ipv6: add special mode forwarding=2 to send RS while configured as router Similar to accepting router advertisement, the IPv6 stack does not send router solicitations if forwarding is enabled. This patch enables this behavior to be overruled by setting forwarding to the special value 2. Signed-off-by: Thomas Graf Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index ab70a3f..5bc893e 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -2964,7 +2964,8 @@ static void addrconf_dad_completed(struct inet6_ifaddr *ifp) start sending router solicitations. */ - if (ifp->idev->cnf.forwarding == 0 && + if ((ifp->idev->cnf.forwarding == 0 || + ifp->idev->cnf.forwarding == 2) && ifp->idev->cnf.rtr_solicits > 0 && (dev->flags&IFF_LOOPBACK) == 0 && (ipv6_addr_type(&ifp->addr) & IPV6_ADDR_LINKLOCAL)) { -- cgit v1.1 From 52ee7a04a0f88815a71acdc604a854fb30dcbe45 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 3 Sep 2010 06:27:08 +0000 Subject: net: remove two kmemcheck annotations __alloc_skb() uses a memset() to clear all the beginning of skb, including bitfields contained in 'flags1' & 'flags2'. We dont need any more to use kmemcheck_annotate_bitfield() on these fields. However, we still need it for the clone part, which is not cleared. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 231dff0..c030cf8 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -202,8 +202,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb_reset_tail_pointer(skb); skb->end = skb->tail + size; - kmemcheck_annotate_bitfield(skb, flags1); - kmemcheck_annotate_bitfield(skb, flags2); #ifdef NET_SKBUFF_DATA_USES_OFFSET skb->mac_header = ~0U; #endif -- cgit v1.1 From 5d9c54c1e9ececcf7e99c4f014f9bec7ee3a7def Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Fri, 3 Sep 2010 08:33:39 +0000 Subject: tipc: Minor optimizations to name table translation code Optimizes TIPC's name table translation code to avoid unnecessary manipulation of the node address field of the resulting port id when name translation fails. This change is possible because a valid port id cannot have a reference field of zero, so examining the reference only is sufficient to determine if the translation was successful. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/name_table.c | 4 +--- net/tipc/port.c | 4 ++-- 2 files changed, 3 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index d504e49..c13c2c7 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -613,8 +613,7 @@ struct publication *tipc_nametbl_remove_publ(u32 type, u32 lower, } /* - * tipc_nametbl_translate(): Translate tipc_name -> tipc_portid. - * Very time-critical. + * tipc_nametbl_translate - translate name to port id * * Note: on entry 'destnode' is the search domain used during translation; * on exit it passes back the node address of the matching port (if any) @@ -685,7 +684,6 @@ found: } spin_unlock_bh(&seq->lock); not_found: - *destnode = 0; read_unlock_bh(&tipc_nametbl_lock); return 0; } diff --git a/net/tipc/port.c b/net/tipc/port.c index ebcbc21..d760336 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -1464,7 +1464,7 @@ int tipc_forward2name(u32 ref, msg_set_destnode(msg, destnode); msg_set_destport(msg, destport); - if (likely(destport || destnode)) { + if (likely(destport)) { p_ptr->sent++; if (likely(destnode == tipc_own_addr)) return tipc_port_recv_sections(p_ptr, num_sect, msg_sect); @@ -1542,7 +1542,7 @@ int tipc_forward_buf2name(u32 ref, skb_push(buf, LONG_H_SIZE); skb_copy_to_linear_data(buf, msg, LONG_H_SIZE); msg_dbg(buf_msg(buf),"PREP:"); - if (likely(destport || destnode)) { + if (likely(destport)) { p_ptr->sent++; if (destnode == tipc_own_addr) return tipc_port_recv_msg(buf); -- cgit v1.1 From 9fbfca013176f9b90d186f3b446fd93e4d972b25 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Fri, 3 Sep 2010 08:33:40 +0000 Subject: tipc: Ensure outgoing messages on Ethernet have sufficient headroom Add code to expand the headroom of an outgoing TIPC message if the sk_buff has insufficient room to hold the header for the associated Ethernet device. This change is necessary to ensure that messages TIPC does not create itself (eg. incoming messages that are being routed to another node) do not cause problems, since TIPC has no control over the amount of headroom available in such messages. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/eth_media.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 6230d16..81253d0 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -72,17 +72,26 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, { struct sk_buff *clone; struct net_device *dev; + int delta; clone = skb_clone(buf, GFP_ATOMIC); - if (clone) { - skb_reset_network_header(clone); - dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; - clone->dev = dev; - dev_hard_header(clone, dev, ETH_P_TIPC, - &dest->dev_addr.eth_addr, - dev->dev_addr, clone->len); - dev_queue_xmit(clone); + if (!clone) + return 0; + + dev = ((struct eth_bearer *)(tb_ptr->usr_handle))->dev; + delta = dev->hard_header_len - skb_headroom(buf); + + if ((delta > 0) && + pskb_expand_head(clone, SKB_DATA_ALIGN(delta), 0, GFP_ATOMIC)) { + kfree_skb(clone); + return 0; } + + skb_reset_network_header(clone); + clone->dev = dev; + dev_hard_header(clone, dev, ETH_P_TIPC, &dest->dev_addr.eth_addr, + dev->dev_addr, clone->len); + dev_queue_xmit(clone); return 0; } -- cgit v1.1 From d1fb62796cdac6899ebd4319e4a610684db063e9 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Fri, 3 Sep 2010 08:33:42 +0000 Subject: tipc: Fix misleading error code when enabling Ethernet bearers Cause TIPC to return EAGAIN if it is unable to enable a new Ethernet bearer because one or more recently disabled Ethernet bearers are temporarily consuming resources during shut down. (The previous error code, EDQUOT, is now returned only if all available Ethernet bearer data structures are fully enabled at the time the request to enable an additional bearer is received.) Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/eth_media.c | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'net') diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 81253d0..673fdf0 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -142,6 +142,16 @@ static int enable_bearer(struct tipc_bearer *tb_ptr) struct eth_bearer *eb_ptr = ð_bearers[0]; struct eth_bearer *stop = ð_bearers[MAX_ETH_BEARERS]; char *driver_name = strchr((const char *)tb_ptr->name, ':') + 1; + int pending_dev = 0; + + /* Find unused Ethernet bearer structure */ + + while (eb_ptr->dev) { + if (!eb_ptr->bearer) + pending_dev++; + if (++eb_ptr == stop) + return pending_dev ? -EAGAIN : -EDQUOT; + } /* Find device with specified name */ -- cgit v1.1 From 1fd63041c49c5c6ed1fe58b7bccc2de462d51e2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 2 Sep 2010 23:09:32 +0000 Subject: net: pskb_expand_head() optimization pskb_expand_head() blindly takes references on fragments before calling skb_release_data(), potentially releasing these references. We can add a fast path, avoiding these atomic operations, if we own the last reference on skb->head. Based on a previous patch from David Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/skbuff.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index c030cf8..2d1bc76 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -779,6 +779,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, u8 *data; int size = nhead + (skb_end_pointer(skb) - skb->head) + ntail; long off; + bool fastpath; BUG_ON(nhead < 0); @@ -800,14 +801,28 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb_shinfo(skb), offsetof(struct skb_shared_info, frags[skb_shinfo(skb)->nr_frags])); - for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) - get_page(skb_shinfo(skb)->frags[i].page); + /* Check if we can avoid taking references on fragments if we own + * the last reference on skb->head. (see skb_release_data()) + */ + if (!skb->cloned) + fastpath = true; + else { + int delta = skb->nohdr ? (1 << SKB_DATAREF_SHIFT) + 1 : 1; - if (skb_has_frag_list(skb)) - skb_clone_fraglist(skb); + fastpath = atomic_read(&skb_shinfo(skb)->dataref) == delta; + } - skb_release_data(skb); + if (fastpath) { + kfree(skb->head); + } else { + for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) + get_page(skb_shinfo(skb)->frags[i].page); + if (skb_has_frag_list(skb)) + skb_clone_fraglist(skb); + + skb_release_data(skb); + } off = (data + nhead) - skb->head; skb->head = data; -- cgit v1.1 From 29af9309dba25077f711f5f975977714bc43a0c3 Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Sun, 5 Sep 2010 09:00:21 +0000 Subject: net/9p/trans_fd.c: Fix unsigned return type The function has an unsigned return type, but returns a negative constant to indicate an error condition. The result of calling the function is always stored in a variable of type (signed) int, and thus unsigned can be dropped from the return type. A sematic match that finds this problem is as follows: (http://coccinelle.lip6.fr/) // @exists@ identifier f; constant C; @@ unsigned f(...) { <+... * return -C; ...+> } // Signed-off-by: Julia Lawall Signed-off-by: David S. Miller --- net/9p/trans_fd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index c85109d..078eb16 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -222,7 +222,7 @@ static void p9_conn_cancel(struct p9_conn *m, int err) } } -static unsigned int +static int p9_fd_poll(struct p9_client *client, struct poll_table_struct *pt) { int ret, n; -- cgit v1.1 From b31fa5bad576cd8180a5ad70c648333b44320d44 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 5 Sep 2010 21:31:11 +0000 Subject: net/caif: Use pr_fmt This patch standardizes caif message logging prefixes. Add #define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ Add missing "\n"s to some logging messages Convert pr_warning to pr_warn This changes the logging message prefix from CAIF: to caif: for all uses but caif_socket.c and chnl_net.c. Those now use their filename without extension. Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 22 +++++++++--------- net/caif/caif_socket.c | 19 ++++++++-------- net/caif/cfcnfg.c | 49 ++++++++++++++++++---------------------- net/caif/cfctrl.c | 59 +++++++++++++++++++++---------------------------- net/caif/cfdbgl.c | 4 +++- net/caif/cfdgml.c | 11 ++++----- net/caif/cffrml.c | 14 +++++++----- net/caif/cfmuxl.c | 14 +++++++----- net/caif/cfpkt_skbuff.c | 48 ++++++++++++++++++++-------------------- net/caif/cfrfml.c | 12 +++++----- net/caif/cfserl.c | 4 +++- net/caif/cfsrvl.c | 17 ++++++-------- net/caif/cfutill.c | 12 +++++----- net/caif/cfveil.c | 11 ++++----- net/caif/cfvidl.c | 6 +++-- net/caif/chnl_net.c | 46 ++++++++++++++++++-------------------- 16 files changed, 167 insertions(+), 181 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 0b586e9..1fdfe70 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -9,6 +9,8 @@ * and Sakari Ailus */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -214,7 +216,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, switch (what) { case NETDEV_REGISTER: - pr_info("CAIF: %s():register %s\n", __func__, dev->name); + pr_info("register %s\n", dev->name); caifd = caif_device_alloc(dev); if (caifd == NULL) break; @@ -225,14 +227,13 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; case NETDEV_UP: - pr_info("CAIF: %s(): up %s\n", __func__, dev->name); + pr_info("up %s\n", dev->name); caifd = caif_get(dev); if (caifd == NULL) break; caifdev = netdev_priv(dev); if (atomic_read(&caifd->state) == NETDEV_UP) { - pr_info("CAIF: %s():%s already up\n", - __func__, dev->name); + pr_info("%s already up\n", dev->name); break; } atomic_set(&caifd->state, what); @@ -273,7 +274,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, caifd = caif_get(dev); if (caifd == NULL) break; - pr_info("CAIF: %s():going down %s\n", __func__, dev->name); + pr_info("going down %s\n", dev->name); if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN || atomic_read(&caifd->state) == NETDEV_DOWN) @@ -295,11 +296,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, caifd = caif_get(dev); if (caifd == NULL) break; - pr_info("CAIF: %s(): down %s\n", __func__, dev->name); + pr_info("down %s\n", dev->name); if (atomic_read(&caifd->in_use)) - pr_warning("CAIF: %s(): " - "Unregistering an active CAIF device: %s\n", - __func__, dev->name); + pr_warn("Unregistering an active CAIF device: %s\n", + dev->name); cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer); dev_put(dev); atomic_set(&caifd->state, what); @@ -307,7 +307,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, case NETDEV_UNREGISTER: caifd = caif_get(dev); - pr_info("CAIF: %s(): unregister %s\n", __func__, dev->name); + pr_info("unregister %s\n", dev->name); atomic_set(&caifd->state, what); caif_device_destroy(dev); break; @@ -391,7 +391,7 @@ static int __init caif_device_init(void) int result; cfg = cfcnfg_create(); if (!cfg) { - pr_warning("CAIF: %s(): can't create cfcnfg.\n", __func__); + pr_warn("can't create cfcnfg\n"); goto err_cfcnfg_create_failed; } result = register_pernet_device(&caif_net_ops); diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 8ce9047..fd1f5df 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -157,8 +159,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { - trace_printk("CAIF: %s():" - " sending flow OFF (queue len = %d %d)\n", + trace_printk("CAIF: %s(): " + "sending flow OFF (queue len = %d %d)\n", __func__, atomic_read(&cf_sk->sk.sk_rmem_alloc), sk_rcvbuf_lowwater(cf_sk)); @@ -172,8 +174,8 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return err; if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); - trace_printk("CAIF: %s():" - " sending flow OFF due to rmem_schedule\n", + trace_printk("CAIF: %s(): " + "sending flow OFF due to rmem_schedule\n", __func__); dbfs_atomic_inc(&cnt.num_rx_flow_off); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); @@ -275,8 +277,7 @@ static void caif_ctrl_cb(struct cflayer *layr, break; default: - pr_debug("CAIF: %s(): Unexpected flow command %d\n", - __func__, flow); + pr_debug("Unexpected flow command %d\n", flow); } } @@ -536,8 +537,7 @@ static int transmit_skb(struct sk_buff *skb, struct caifsock *cf_sk, /* Slight paranoia, probably not needed. */ if (unlikely(loopcnt++ > 1000)) { - pr_warning("CAIF: %s(): transmit retries failed," - " error = %d\n", __func__, ret); + pr_warn("transmit retries failed, error = %d\n", ret); break; } @@ -902,8 +902,7 @@ static int caif_connect(struct socket *sock, struct sockaddr *uaddr, cf_sk->maxframe = dev->mtu - (headroom + tailroom); dev_put(dev); if (cf_sk->maxframe < 1) { - pr_warning("CAIF: %s(): CAIF Interface MTU too small (%d)\n", - __func__, dev->mtu); + pr_warn("CAIF Interface MTU too small (%d)\n", dev->mtu); err = -ENODEV; goto out; } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index 1c29189..ef93a13 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -3,6 +3,9 @@ * Author: Sjur Brendeland/sjur.brandeland@stericsson.com * License terms: GNU General Public License (GPL) version 2 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -78,7 +81,7 @@ struct cfcnfg *cfcnfg_create(void) /* Initiate this layer */ this = kzalloc(sizeof(struct cfcnfg), GFP_ATOMIC); if (!this) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } this->mux = cfmuxl_create(); @@ -106,7 +109,7 @@ struct cfcnfg *cfcnfg_create(void) layer_set_up(this->ctrl, this); return this; out_of_mem: - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); kfree(this->mux); kfree(this->ctrl); kfree(this); @@ -194,7 +197,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer) caif_assert(adap_layer != NULL); channel_id = adap_layer->id; if (adap_layer->dn == NULL || channel_id == 0) { - pr_err("CAIF: %s():adap_layer->id is 0\n", __func__); + pr_err("adap_layer->id is 0\n"); ret = -ENOTCONN; goto end; } @@ -204,9 +207,8 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer) layer_set_up(servl, NULL); ret = cfctrl_linkdown_req(cnfg->ctrl, channel_id, adap_layer); if (servl == NULL) { - pr_err("CAIF: %s(): PROTOCOL ERROR " - "- Error removing service_layer Channel_Id(%d)", - __func__, channel_id); + pr_err("PROTOCOL ERROR - Error removing service_layer Channel_Id(%d)", + channel_id); ret = -EINVAL; goto end; } @@ -216,18 +218,14 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer) phyinfo = cfcnfg_get_phyinfo(cnfg, phyid); if (phyinfo == NULL) { - pr_warning("CAIF: %s(): " - "No interface to send disconnect to\n", - __func__); + pr_warn("No interface to send disconnect to\n"); ret = -ENODEV; goto end; } if (phyinfo->id != phyid || phyinfo->phy_layer->id != phyid || phyinfo->frm_layer->id != phyid) { - pr_err("CAIF: %s(): " - "Inconsistency in phy registration\n", - __func__); + pr_err("Inconsistency in phy registration\n"); ret = -EINVAL; goto end; } @@ -276,21 +274,20 @@ int cfcnfg_add_adaptation_layer(struct cfcnfg *cnfg, { struct cflayer *frml; if (adap_layer == NULL) { - pr_err("CAIF: %s(): adap_layer is zero", __func__); + pr_err("adap_layer is zero\n"); return -EINVAL; } if (adap_layer->receive == NULL) { - pr_err("CAIF: %s(): adap_layer->receive is NULL", __func__); + pr_err("adap_layer->receive is NULL\n"); return -EINVAL; } if (adap_layer->ctrlcmd == NULL) { - pr_err("CAIF: %s(): adap_layer->ctrlcmd == NULL", __func__); + pr_err("adap_layer->ctrlcmd == NULL\n"); return -EINVAL; } frml = cnfg->phy_layers[param->phyid].frm_layer; if (frml == NULL) { - pr_err("CAIF: %s(): Specified PHY type does not exist!", - __func__); + pr_err("Specified PHY type does not exist!\n"); return -ENODEV; } caif_assert(param->phyid == cnfg->phy_layers[param->phyid].id); @@ -330,9 +327,7 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, struct net_device *netdev; if (adapt_layer == NULL) { - pr_debug("CAIF: %s(): link setup response " - "but no client exist, send linkdown back\n", - __func__); + pr_debug("link setup response but no client exist, send linkdown back\n"); cfctrl_linkdown_req(cnfg->ctrl, channel_id, NULL); return; } @@ -374,13 +369,11 @@ cfcnfg_linkup_rsp(struct cflayer *layer, u8 channel_id, enum cfctrl_srv serv, servicel = cfdbgl_create(channel_id, &phyinfo->dev_info); break; default: - pr_err("CAIF: %s(): Protocol error. " - "Link setup response - unknown channel type\n", - __func__); + pr_err("Protocol error. Link setup response - unknown channel type\n"); return; } if (!servicel) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } layer_set_dn(servicel, cnfg->mux); @@ -418,7 +411,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, } } if (*phyid == 0) { - pr_err("CAIF: %s(): No Available PHY ID\n", __func__); + pr_err("No Available PHY ID\n"); return; } @@ -427,7 +420,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, phy_driver = cfserl_create(CFPHYTYPE_FRAG, *phyid, stx); if (!phy_driver) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } @@ -436,7 +429,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, phy_driver = NULL; break; default: - pr_err("CAIF: %s(): %d", __func__, phy_type); + pr_err("%d\n", phy_type); return; break; } @@ -455,7 +448,7 @@ cfcnfg_add_phy_layer(struct cfcnfg *cnfg, enum cfcnfg_phy_type phy_type, phy_layer->type = phy_type; frml = cffrml_create(*phyid, fcs); if (!frml) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } cnfg->phy_layers[*phyid].frm_layer = frml; diff --git a/net/caif/cfctrl.c b/net/caif/cfctrl.c index 563145f..08f267a 100644 --- a/net/caif/cfctrl.c +++ b/net/caif/cfctrl.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -36,7 +38,7 @@ struct cflayer *cfctrl_create(void) struct cfctrl *this = kmalloc(sizeof(struct cfctrl), GFP_ATOMIC); if (!this) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfctrl, serv.layer) == 0); @@ -132,9 +134,7 @@ struct cfctrl_request_info *cfctrl_remove_req(struct cfctrl *ctrl, list_for_each_entry_safe(p, tmp, &ctrl->list, list) { if (cfctrl_req_eq(req, p)) { if (p != first) - pr_warning("CAIF: %s(): Requests are not " - "received in order\n", - __func__); + pr_warn("Requests are not received in order\n"); atomic_set(&ctrl->rsp_seq_no, p->sequence_no); @@ -177,7 +177,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid) int ret; struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } caif_assert(offsetof(struct cfctrl, serv.layer) == 0); @@ -189,8 +189,7 @@ void cfctrl_enum_req(struct cflayer *layer, u8 physlinkid) ret = cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt); if (ret < 0) { - pr_err("CAIF: %s(): Could not transmit enum message\n", - __func__); + pr_err("Could not transmit enum message\n"); cfpkt_destroy(pkt); } } @@ -208,7 +207,7 @@ int cfctrl_linkup_request(struct cflayer *layer, char utility_name[16]; struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return -ENOMEM; } cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_SETUP); @@ -253,13 +252,13 @@ int cfctrl_linkup_request(struct cflayer *layer, param->u.utility.paramlen); break; default: - pr_warning("CAIF: %s():Request setup of bad link type = %d\n", - __func__, param->linktype); + pr_warn("Request setup of bad link type = %d\n", + param->linktype); return -EINVAL; } req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return -ENOMEM; } req->client_layer = user_layer; @@ -276,8 +275,7 @@ int cfctrl_linkup_request(struct cflayer *layer, ret = cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt); if (ret < 0) { - pr_err("CAIF: %s(): Could not transmit linksetup request\n", - __func__); + pr_err("Could not transmit linksetup request\n"); cfpkt_destroy(pkt); return -ENODEV; } @@ -291,7 +289,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid, struct cfctrl *cfctrl = container_obj(layer); struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return -ENOMEM; } cfpkt_addbdy(pkt, CFCTRL_CMD_LINK_DESTROY); @@ -300,8 +298,7 @@ int cfctrl_linkdown_req(struct cflayer *layer, u8 channelid, ret = cfctrl->serv.layer.dn->transmit(cfctrl->serv.layer.dn, pkt); if (ret < 0) { - pr_err("CAIF: %s(): Could not transmit link-down request\n", - __func__); + pr_err("Could not transmit link-down request\n"); cfpkt_destroy(pkt); } return ret; @@ -313,7 +310,7 @@ void cfctrl_sleep_req(struct cflayer *layer) struct cfctrl *cfctrl = container_obj(layer); struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } cfpkt_addbdy(pkt, CFCTRL_CMD_SLEEP); @@ -330,7 +327,7 @@ void cfctrl_wake_req(struct cflayer *layer) struct cfctrl *cfctrl = container_obj(layer); struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } cfpkt_addbdy(pkt, CFCTRL_CMD_WAKE); @@ -347,7 +344,7 @@ void cfctrl_getstartreason_req(struct cflayer *layer) struct cfctrl *cfctrl = container_obj(layer); struct cfpkt *pkt = cfpkt_create(CFPKT_CTRL_PKT_LEN); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return; } cfpkt_addbdy(pkt, CFCTRL_CMD_START_REASON); @@ -364,12 +361,11 @@ void cfctrl_cancel_req(struct cflayer *layr, struct cflayer *adap_layer) struct cfctrl_request_info *p, *tmp; struct cfctrl *ctrl = container_obj(layr); spin_lock(&ctrl->info_list_lock); - pr_warning("CAIF: %s(): enter\n", __func__); + pr_warn("enter\n"); list_for_each_entry_safe(p, tmp, &ctrl->list, list) { if (p->client_layer == adap_layer) { - pr_warning("CAIF: %s(): cancel req :%d\n", __func__, - p->sequence_no); + pr_warn("cancel req :%d\n", p->sequence_no); list_del(&p->list); kfree(p); } @@ -520,9 +516,8 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) cfpkt_extr_head(pkt, ¶m, len); break; default: - pr_warning("CAIF: %s(): Request setup " - "- invalid link type (%d)", - __func__, serv); + pr_warn("Request setup - invalid link type (%d)\n", + serv); goto error; } @@ -532,9 +527,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) if (CFCTRL_ERR_BIT == (CFCTRL_ERR_BIT & cmdrsp) || cfpkt_erroneous(pkt)) { - pr_err("CAIF: %s(): Invalid O/E bit or parse " - "error on CAIF control channel", - __func__); + pr_err("Invalid O/E bit or parse error on CAIF control channel\n"); cfctrl->res.reject_rsp(cfctrl->serv.layer.up, 0, req ? req->client_layer @@ -556,8 +549,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) cfctrl->res.linkdestroy_rsp(cfctrl->serv.layer.up, linkid); break; case CFCTRL_CMD_LINK_ERR: - pr_err("CAIF: %s(): Frame Error Indication received\n", - __func__); + pr_err("Frame Error Indication received\n"); cfctrl->res.linkerror_ind(); break; case CFCTRL_CMD_ENUM: @@ -576,7 +568,7 @@ static int cfctrl_recv(struct cflayer *layer, struct cfpkt *pkt) cfctrl->res.radioset_rsp(); break; default: - pr_err("CAIF: %s(): Unrecognized Control Frame\n", __func__); + pr_err("Unrecognized Control Frame\n"); goto error; break; } @@ -595,8 +587,7 @@ static void cfctrl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, case CAIF_CTRLCMD_FLOW_OFF_IND: spin_lock(&this->info_list_lock); if (!list_empty(&this->list)) { - pr_debug("CAIF: %s(): Received flow off in " - "control layer", __func__); + pr_debug("Received flow off in control layer\n"); } spin_unlock(&this->info_list_lock); break; @@ -620,7 +611,7 @@ static int handle_loop(struct cfctrl *ctrl, int cmd, struct cfpkt *pkt) if (!ctrl->loop_linkused[linkid]) goto found; spin_unlock(&ctrl->loop_linkid_lock); - pr_err("CAIF: %s(): Out of link-ids\n", __func__); + pr_err("Out of link-ids\n"); return -EINVAL; found: if (!ctrl->loop_linkused[linkid]) diff --git a/net/caif/cfdbgl.c b/net/caif/cfdbgl.c index 676648c..496fda9 100644 --- a/net/caif/cfdbgl.c +++ b/net/caif/cfdbgl.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -17,7 +19,7 @@ struct cflayer *cfdbgl_create(u8 channel_id, struct dev_info *dev_info) { struct cfsrvl *dbg = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); if (!dbg) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfsrvl, layer) == 0); diff --git a/net/caif/cfdgml.c b/net/caif/cfdgml.c index ed9d53a..d3ed264 100644 --- a/net/caif/cfdgml.c +++ b/net/caif/cfdgml.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -26,7 +28,7 @@ struct cflayer *cfdgml_create(u8 channel_id, struct dev_info *dev_info) { struct cfsrvl *dgm = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); if (!dgm) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfsrvl, layer) == 0); @@ -49,14 +51,14 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt) caif_assert(layr->ctrlcmd != NULL); if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } if ((cmd & DGM_CMD_BIT) == 0) { if (cfpkt_extr_head(pkt, &dgmhdr, 3) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } @@ -75,8 +77,7 @@ static int cfdgml_receive(struct cflayer *layr, struct cfpkt *pkt) return 0; default: cfpkt_destroy(pkt); - pr_info("CAIF: %s(): Unknown datagram control %d (0x%x)\n", - __func__, cmd, cmd); + pr_info("Unknown datagram control %d (0x%x)\n", cmd, cmd); return -EPROTO; } } diff --git a/net/caif/cffrml.c b/net/caif/cffrml.c index e86a4ca..a445043 100644 --- a/net/caif/cffrml.c +++ b/net/caif/cffrml.c @@ -6,6 +6,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -32,7 +34,7 @@ struct cflayer *cffrml_create(u16 phyid, bool use_fcs) { struct cffrml *this = kmalloc(sizeof(struct cffrml), GFP_ATOMIC); if (!this) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cffrml, layer) == 0); @@ -83,7 +85,7 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) if (cfpkt_setlen(pkt, len) < 0) { ++cffrml_rcv_error; - pr_err("CAIF: %s():Framing length error (%d)\n", __func__, len); + pr_err("Framing length error (%d)\n", len); cfpkt_destroy(pkt); return -EPROTO; } @@ -99,14 +101,14 @@ static int cffrml_receive(struct cflayer *layr, struct cfpkt *pkt) cfpkt_add_trail(pkt, &tmp, 2); ++cffrml_rcv_error; ++cffrml_rcv_checsum_error; - pr_info("CAIF: %s(): Frame checksum error " - "(0x%x != 0x%x)\n", __func__, hdrchks, pktchks); + pr_info("Frame checksum error (0x%x != 0x%x)\n", + hdrchks, pktchks); return -EILSEQ; } } if (cfpkt_erroneous(pkt)) { ++cffrml_rcv_error; - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } @@ -132,7 +134,7 @@ static int cffrml_transmit(struct cflayer *layr, struct cfpkt *pkt) cfpkt_add_head(pkt, &tmp, 2); cfpkt_info(pkt)->hdr_len += 2; if (cfpkt_erroneous(pkt)) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); return -EPROTO; } ret = layr->dn->transmit(layr->dn, pkt); diff --git a/net/caif/cfmuxl.c b/net/caif/cfmuxl.c index 80c8d33..46f34b2 100644 --- a/net/caif/cfmuxl.c +++ b/net/caif/cfmuxl.c @@ -3,6 +3,9 @@ * Author: Sjur Brendeland/sjur.brandeland@stericsson.com * License terms: GNU General Public License (GPL) version 2 */ + +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -190,7 +193,7 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt) u8 id; struct cflayer *up; if (cfpkt_extr_head(pkt, &id, 1) < 0) { - pr_err("CAIF: %s(): erroneous Caif Packet\n", __func__); + pr_err("erroneous Caif Packet\n"); cfpkt_destroy(pkt); return -EPROTO; } @@ -199,8 +202,8 @@ static int cfmuxl_receive(struct cflayer *layr, struct cfpkt *pkt) up = get_up(muxl, id); spin_unlock(&muxl->receive_lock); if (up == NULL) { - pr_info("CAIF: %s():Received data on unknown link ID = %d " - "(0x%x) up == NULL", __func__, id, id); + pr_info("Received data on unknown link ID = %d (0x%x) up == NULL", + id, id); cfpkt_destroy(pkt); /* * Don't return ERROR, since modem misbehaves and sends out @@ -223,9 +226,8 @@ static int cfmuxl_transmit(struct cflayer *layr, struct cfpkt *pkt) struct caif_payload_info *info = cfpkt_info(pkt); dn = get_dn(muxl, cfpkt_info(pkt)->dev_info); if (dn == NULL) { - pr_warning("CAIF: %s(): Send data on unknown phy " - "ID = %d (0x%x)\n", - __func__, info->dev_info->id, info->dev_info->id); + pr_warn("Send data on unknown phy ID = %d (0x%x)\n", + info->dev_info->id, info->dev_info->id); return -ENOTCONN; } info->hdr_len += 1; diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c index c49a669..d7e865e 100644 --- a/net/caif/cfpkt_skbuff.c +++ b/net/caif/cfpkt_skbuff.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -12,11 +14,12 @@ #define PKT_PREFIX 48 #define PKT_POSTFIX 2 #define PKT_LEN_WHEN_EXTENDING 128 -#define PKT_ERROR(pkt, errmsg) do { \ - cfpkt_priv(pkt)->erronous = true; \ - skb_reset_tail_pointer(&pkt->skb); \ - pr_warning("CAIF: " errmsg);\ - } while (0) +#define PKT_ERROR(pkt, errmsg) \ +do { \ + cfpkt_priv(pkt)->erronous = true; \ + skb_reset_tail_pointer(&pkt->skb); \ + pr_warn(errmsg); \ +} while (0) struct cfpktq { struct sk_buff_head head; @@ -130,13 +133,13 @@ int cfpkt_extr_head(struct cfpkt *pkt, void *data, u16 len) return -EPROTO; if (unlikely(len > skb->len)) { - PKT_ERROR(pkt, "cfpkt_extr_head read beyond end of packet\n"); + PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } if (unlikely(len > skb_headlen(skb))) { if (unlikely(skb_linearize(skb) != 0)) { - PKT_ERROR(pkt, "cfpkt_extr_head linearize failed\n"); + PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } } @@ -156,11 +159,11 @@ int cfpkt_extr_trail(struct cfpkt *pkt, void *dta, u16 len) return -EPROTO; if (unlikely(skb_linearize(skb) != 0)) { - PKT_ERROR(pkt, "cfpkt_extr_trail linearize failed\n"); + PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } if (unlikely(skb->data + len > skb_tail_pointer(skb))) { - PKT_ERROR(pkt, "cfpkt_extr_trail read beyond end of packet\n"); + PKT_ERROR(pkt, "read beyond end of packet\n"); return -EPROTO; } from = skb_tail_pointer(skb) - len; @@ -202,7 +205,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) /* Make sure data is writable */ if (unlikely(skb_cow_data(skb, addlen, &lastskb) < 0)) { - PKT_ERROR(pkt, "cfpkt_add_body: cow failed\n"); + PKT_ERROR(pkt, "cow failed\n"); return -EPROTO; } /* @@ -211,8 +214,7 @@ int cfpkt_add_body(struct cfpkt *pkt, const void *data, u16 len) * lengths of the top SKB. */ if (lastskb != skb) { - pr_warning("CAIF: %s(): Packet is non-linear\n", - __func__); + pr_warn("Packet is non-linear\n"); skb->len += len; skb->data_len += len; } @@ -242,14 +244,14 @@ int cfpkt_add_head(struct cfpkt *pkt, const void *data2, u16 len) if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_headroom(skb) < len)) { - PKT_ERROR(pkt, "cfpkt_add_head: no headroom\n"); + PKT_ERROR(pkt, "no headroom\n"); return -EPROTO; } /* Make sure data is writable */ ret = skb_cow_data(skb, 0, &lastskb); if (unlikely(ret < 0)) { - PKT_ERROR(pkt, "cfpkt_add_head: cow failed\n"); + PKT_ERROR(pkt, "cow failed\n"); return ret; } @@ -283,7 +285,7 @@ inline u16 cfpkt_iterate(struct cfpkt *pkt, if (unlikely(is_erronous(pkt))) return -EPROTO; if (unlikely(skb_linearize(&pkt->skb) != 0)) { - PKT_ERROR(pkt, "cfpkt_iterate: linearize failed\n"); + PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } return iter_func(data, pkt->skb.data, cfpkt_getlen(pkt)); @@ -309,7 +311,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len) /* Need to expand SKB */ if (unlikely(!cfpkt_pad_trail(pkt, len - skb->len))) - PKT_ERROR(pkt, "cfpkt_setlen: skb_pad_trail failed\n"); + PKT_ERROR(pkt, "skb_pad_trail failed\n"); return cfpkt_getlen(pkt); } @@ -380,8 +382,7 @@ struct cfpkt *cfpkt_split(struct cfpkt *pkt, u16 pos) return NULL; if (skb->data + pos > skb_tail_pointer(skb)) { - PKT_ERROR(pkt, - "cfpkt_split: trying to split beyond end of packet"); + PKT_ERROR(pkt, "trying to split beyond end of packet\n"); return NULL; } @@ -455,17 +456,17 @@ int cfpkt_raw_append(struct cfpkt *pkt, void **buf, unsigned int buflen) return -EPROTO; /* Make sure SKB is writable */ if (unlikely(skb_cow_data(skb, 0, &lastskb) < 0)) { - PKT_ERROR(pkt, "cfpkt_raw_append: skb_cow_data failed\n"); + PKT_ERROR(pkt, "skb_cow_data failed\n"); return -EPROTO; } if (unlikely(skb_linearize(skb) != 0)) { - PKT_ERROR(pkt, "cfpkt_raw_append: linearize failed\n"); + PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } if (unlikely(skb_tailroom(skb) < buflen)) { - PKT_ERROR(pkt, "cfpkt_raw_append: buffer too short - failed\n"); + PKT_ERROR(pkt, "buffer too short - failed\n"); return -EPROTO; } @@ -483,14 +484,13 @@ int cfpkt_raw_extract(struct cfpkt *pkt, void **buf, unsigned int buflen) return -EPROTO; if (unlikely(buflen > skb->len)) { - PKT_ERROR(pkt, "cfpkt_raw_extract: buflen too large " - "- failed\n"); + PKT_ERROR(pkt, "buflen too large - failed\n"); return -EPROTO; } if (unlikely(buflen > skb_headlen(skb))) { if (unlikely(skb_linearize(skb) != 0)) { - PKT_ERROR(pkt, "cfpkt_raw_extract: linearize failed\n"); + PKT_ERROR(pkt, "linearize failed\n"); return -EPROTO; } } diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c index eb16020..6acaeba 100644 --- a/net/caif/cfrfml.c +++ b/net/caif/cfrfml.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -48,7 +50,7 @@ struct cflayer *cfrfml_create(u8 channel_id, struct dev_info *dev_info, kzalloc(sizeof(struct cfrfml), GFP_ATOMIC); if (!this) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } @@ -178,9 +180,7 @@ out: cfpkt_destroy(rfml->incomplete_frm); rfml->incomplete_frm = NULL; - pr_info("CAIF: %s(): " - "Connection error %d triggered on RFM link\n", - __func__, err); + pr_info("Connection error %d triggered on RFM link\n", err); /* Trigger connection error upon failure.*/ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, @@ -280,9 +280,7 @@ static int cfrfml_transmit(struct cflayer *layr, struct cfpkt *pkt) out: if (err != 0) { - pr_info("CAIF: %s(): " - "Connection error %d triggered on RFM link\n", - __func__, err); + pr_info("Connection error %d triggered on RFM link\n", err); /* Trigger connection error upon failure.*/ layr->up->ctrlcmd(layr->up, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, diff --git a/net/caif/cfserl.c b/net/caif/cfserl.c index a11fbd6..9297f7d 100644 --- a/net/caif/cfserl.c +++ b/net/caif/cfserl.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -34,7 +36,7 @@ struct cflayer *cfserl_create(int type, int instance, bool use_stx) { struct cfserl *this = kmalloc(sizeof(struct cfserl), GFP_ATOMIC); if (!this) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfserl, layer) == 0); diff --git a/net/caif/cfsrvl.c b/net/caif/cfsrvl.c index f40939a..ab5e542 100644 --- a/net/caif/cfsrvl.c +++ b/net/caif/cfsrvl.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -79,8 +81,7 @@ static void cfservl_ctrlcmd(struct cflayer *layr, enum caif_ctrlcmd ctrl, layr->up->ctrlcmd(layr->up, ctrl, phyid); break; default: - pr_warning("CAIF: %s(): " - "Unexpected ctrl in cfsrvl (%d)\n", __func__, ctrl); + pr_warn("Unexpected ctrl in cfsrvl (%d)\n", ctrl); /* We have both modem and phy flow on, send flow on */ layr->up->ctrlcmd(layr->up, ctrl, phyid); service->phy_flow_on = true; @@ -107,14 +108,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) u8 flow_on = SRVL_FLOW_ON; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", - __func__); + pr_warn("Out of memory\n"); return -ENOMEM; } if (cfpkt_add_head(pkt, &flow_on, 1) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", - __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } @@ -131,14 +130,12 @@ static int cfservl_modemcmd(struct cflayer *layr, enum caif_modemcmd ctrl) u8 flow_off = SRVL_FLOW_OFF; pkt = cfpkt_create(SRVL_CTRL_PKT_SIZE); if (!pkt) { - pr_warning("CAIF: %s(): Out of memory\n", - __func__); + pr_warn("Out of memory\n"); return -ENOMEM; } if (cfpkt_add_head(pkt, &flow_off, 1) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", - __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } diff --git a/net/caif/cfutill.c b/net/caif/cfutill.c index 02795af..efad410 100644 --- a/net/caif/cfutill.c +++ b/net/caif/cfutill.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -26,7 +28,7 @@ struct cflayer *cfutill_create(u8 channel_id, struct dev_info *dev_info) { struct cfsrvl *util = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); if (!util) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfsrvl, layer) == 0); @@ -47,7 +49,7 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt) caif_assert(layr->up->receive != NULL); caif_assert(layr->up->ctrlcmd != NULL); if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } @@ -64,16 +66,14 @@ static int cfutill_receive(struct cflayer *layr, struct cfpkt *pkt) cfpkt_destroy(pkt); return 0; case UTIL_REMOTE_SHUTDOWN: /* Remote Shutdown Request */ - pr_err("CAIF: %s(): REMOTE SHUTDOWN REQUEST RECEIVED\n", - __func__); + pr_err("REMOTE SHUTDOWN REQUEST RECEIVED\n"); layr->ctrlcmd(layr, CAIF_CTRLCMD_REMOTE_SHUTDOWN_IND, 0); service->open = false; cfpkt_destroy(pkt); return 0; default: cfpkt_destroy(pkt); - pr_warning("CAIF: %s(): Unknown service control %d (0x%x)\n", - __func__, cmd, cmd); + pr_warn("Unknown service control %d (0x%x)\n", cmd, cmd); return -EPROTO; } } diff --git a/net/caif/cfveil.c b/net/caif/cfveil.c index 77cc09f..3b425b1 100644 --- a/net/caif/cfveil.c +++ b/net/caif/cfveil.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -25,7 +27,7 @@ struct cflayer *cfvei_create(u8 channel_id, struct dev_info *dev_info) { struct cfsrvl *vei = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); if (!vei) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfsrvl, layer) == 0); @@ -47,7 +49,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt) if (cfpkt_extr_head(pkt, &cmd, 1) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } @@ -67,8 +69,7 @@ static int cfvei_receive(struct cflayer *layr, struct cfpkt *pkt) cfpkt_destroy(pkt); return 0; default: /* SET RS232 PIN */ - pr_warning("CAIF: %s():Unknown VEI control packet %d (0x%x)!\n", - __func__, cmd, cmd); + pr_warn("Unknown VEI control packet %d (0x%x)!\n", cmd, cmd); cfpkt_destroy(pkt); return -EPROTO; } @@ -86,7 +87,7 @@ static int cfvei_transmit(struct cflayer *layr, struct cfpkt *pkt) caif_assert(layr->dn->transmit != NULL); if (cfpkt_add_head(pkt, &tmp, 1) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); return -EPROTO; } diff --git a/net/caif/cfvidl.c b/net/caif/cfvidl.c index ada6ee2..bf6fef2 100644 --- a/net/caif/cfvidl.c +++ b/net/caif/cfvidl.c @@ -4,6 +4,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -21,7 +23,7 @@ struct cflayer *cfvidl_create(u8 channel_id, struct dev_info *dev_info) { struct cfsrvl *vid = kmalloc(sizeof(struct cfsrvl), GFP_ATOMIC); if (!vid) { - pr_warning("CAIF: %s(): Out of memory\n", __func__); + pr_warn("Out of memory\n"); return NULL; } caif_assert(offsetof(struct cfsrvl, layer) == 0); @@ -38,7 +40,7 @@ static int cfvidl_receive(struct cflayer *layr, struct cfpkt *pkt) { u32 videoheader; if (cfpkt_extr_head(pkt, &videoheader, 4) < 0) { - pr_err("CAIF: %s(): Packet is erroneous!\n", __func__); + pr_err("Packet is erroneous!\n"); cfpkt_destroy(pkt); return -EPROTO; } diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 4293e19..86aac24 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -5,6 +5,8 @@ * License terms: GNU General Public License (GPL) version 2 */ +#define pr_fmt(fmt) KBUILD_MODNAME ":%s(): " fmt, __func__ + #include #include #include @@ -29,7 +31,7 @@ #define CAIF_NET_DEFAULT_QUEUE_LEN 500 #undef pr_debug -#define pr_debug pr_warning +#define pr_debug pr_warn /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); @@ -142,8 +144,7 @@ static void chnl_flowctrl_cb(struct cflayer *layr, enum caif_ctrlcmd flow, int phyid) { struct chnl_net *priv = container_of(layr, struct chnl_net, chnl); - pr_debug("CAIF: %s(): NET flowctrl func called flow: %s\n", - __func__, + pr_debug("NET flowctrl func called flow: %s\n", flow == CAIF_CTRLCMD_FLOW_ON_IND ? "ON" : flow == CAIF_CTRLCMD_INIT_RSP ? "INIT" : flow == CAIF_CTRLCMD_FLOW_OFF_IND ? "OFF" : @@ -196,12 +197,12 @@ static int chnl_net_start_xmit(struct sk_buff *skb, struct net_device *dev) priv = netdev_priv(dev); if (skb->len > priv->netdev->mtu) { - pr_warning("CAIF: %s(): Size of skb exceeded MTU\n", __func__); + pr_warn("Size of skb exceeded MTU\n"); return -ENOSPC; } if (!priv->flowenabled) { - pr_debug("CAIF: %s(): dropping packets flow off\n", __func__); + pr_debug("dropping packets flow off\n"); return NETDEV_TX_BUSY; } @@ -237,7 +238,7 @@ static int chnl_net_open(struct net_device *dev) ASSERT_RTNL(); priv = netdev_priv(dev); if (!priv) { - pr_debug("CAIF: %s(): chnl_net_open: no priv\n", __func__); + pr_debug("chnl_net_open: no priv\n"); return -ENODEV; } @@ -246,18 +247,17 @@ static int chnl_net_open(struct net_device *dev) result = caif_connect_client(&priv->conn_req, &priv->chnl, &llifindex, &headroom, &tailroom); if (result != 0) { - pr_debug("CAIF: %s(): err: " - "Unable to register and open device," - " Err:%d\n", - __func__, - result); + pr_debug("err: " + "Unable to register and open device," + " Err:%d\n", + result); goto error; } lldev = dev_get_by_index(dev_net(dev), llifindex); if (lldev == NULL) { - pr_debug("CAIF: %s(): no interface?\n", __func__); + pr_debug("no interface?\n"); result = -ENODEV; goto error; } @@ -279,9 +279,7 @@ static int chnl_net_open(struct net_device *dev) dev_put(lldev); if (mtu < 100) { - pr_warning("CAIF: %s(): " - "CAIF Interface MTU too small (%d)\n", - __func__, mtu); + pr_warn("CAIF Interface MTU too small (%d)\n", mtu); result = -ENODEV; goto error; } @@ -296,33 +294,32 @@ static int chnl_net_open(struct net_device *dev) rtnl_lock(); if (result == -ERESTARTSYS) { - pr_debug("CAIF: %s(): wait_event_interruptible" - " woken by a signal\n", __func__); + pr_debug("wait_event_interruptible woken by a signal\n"); result = -ERESTARTSYS; goto error; } if (result == 0) { - pr_debug("CAIF: %s(): connect timeout\n", __func__); + pr_debug("connect timeout\n"); caif_disconnect_client(&priv->chnl); priv->state = CAIF_DISCONNECTED; - pr_debug("CAIF: %s(): state disconnected\n", __func__); + pr_debug("state disconnected\n"); result = -ETIMEDOUT; goto error; } if (priv->state != CAIF_CONNECTED) { - pr_debug("CAIF: %s(): connect failed\n", __func__); + pr_debug("connect failed\n"); result = -ECONNREFUSED; goto error; } - pr_debug("CAIF: %s(): CAIF Netdevice connected\n", __func__); + pr_debug("CAIF Netdevice connected\n"); return 0; error: caif_disconnect_client(&priv->chnl); priv->state = CAIF_DISCONNECTED; - pr_debug("CAIF: %s(): state disconnected\n", __func__); + pr_debug("state disconnected\n"); return result; } @@ -413,7 +410,7 @@ static void caif_netlink_parms(struct nlattr *data[], struct caif_connect_request *conn_req) { if (!data) { - pr_warning("CAIF: %s: no params data found\n", __func__); + pr_warn("no params data found\n"); return; } if (data[IFLA_CAIF_IPV4_CONNID]) @@ -442,8 +439,7 @@ static int ipcaif_newlink(struct net *src_net, struct net_device *dev, ret = register_netdevice(dev); if (ret) - pr_warning("CAIF: %s(): device rtml registration failed\n", - __func__); + pr_warn("device rtml registration failed\n"); return ret; } -- cgit v1.1 From f6c9322c3a0cd50a6996094327347c87cd2f4bd8 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Sun, 5 Sep 2010 22:08:09 +0000 Subject: net/caifcaif_dev.c: Use netdev_ Convert pr_("%s" ..., (struct netdev *)->name ...) to netdev_((struct netdev *), ...) Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 1fdfe70..0fd01dd 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -216,7 +216,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, switch (what) { case NETDEV_REGISTER: - pr_info("register %s\n", dev->name); + netdev_info(dev, "register\n"); caifd = caif_device_alloc(dev); if (caifd == NULL) break; @@ -227,13 +227,13 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, break; case NETDEV_UP: - pr_info("up %s\n", dev->name); + netdev_info(dev, "up\n"); caifd = caif_get(dev); if (caifd == NULL) break; caifdev = netdev_priv(dev); if (atomic_read(&caifd->state) == NETDEV_UP) { - pr_info("%s already up\n", dev->name); + netdev_info(dev, "already up\n"); break; } atomic_set(&caifd->state, what); @@ -274,7 +274,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, caifd = caif_get(dev); if (caifd == NULL) break; - pr_info("going down %s\n", dev->name); + netdev_info(dev, "going down\n"); if (atomic_read(&caifd->state) == NETDEV_GOING_DOWN || atomic_read(&caifd->state) == NETDEV_DOWN) @@ -296,10 +296,10 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, caifd = caif_get(dev); if (caifd == NULL) break; - pr_info("down %s\n", dev->name); + netdev_info(dev, "down\n"); if (atomic_read(&caifd->in_use)) - pr_warn("Unregistering an active CAIF device: %s\n", - dev->name); + netdev_warn(dev, + "Unregistering an active CAIF device\n"); cfcnfg_del_phy_layer(get_caif_conf(), &caifd->layer); dev_put(dev); atomic_set(&caifd->state, what); @@ -307,7 +307,7 @@ static int caif_device_notify(struct notifier_block *me, unsigned long what, case NETDEV_UNREGISTER: caifd = caif_get(dev); - pr_info("unregister %s\n", dev->name); + netdev_info(dev, "unregister\n"); atomic_set(&caifd->state, what); caif_device_destroy(dev); break; -- cgit v1.1 From db40980fcdb560d7992b0511df16cdd3f7e381f3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 6 Sep 2010 11:13:50 +0000 Subject: net: poll() optimizations No need to test twice sk->sk_shutdown & RCV_SHUTDOWN Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bluetooth/af_bluetooth.c | 5 ++--- net/core/datagram.c | 5 ++--- net/sctp/socket.c | 5 ++--- net/unix/af_unix.c | 5 ++--- 4 files changed, 8 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 421c45b..ed0f22f 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -297,13 +297,12 @@ unsigned int bt_sock_poll(struct file * file, struct socket *sock, poll_table *w mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; if (sk->sk_state == BT_CLOSED) diff --git a/net/core/datagram.c b/net/core/datagram.c index 251997a..4df1b7a 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -746,13 +746,12 @@ unsigned int datagram_poll(struct file *file, struct socket *sock, if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index f4bec27..cf6dcc9 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5707,13 +5707,12 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait) if (sk->sk_err || !skb_queue_empty(&sk->sk_error_queue)) mask |= POLLERR; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; /* Is it readable? Reconsider this code with TCP-style support. */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* The association is either gone or not ready. */ diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 4414a18..9077b4e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2024,11 +2024,10 @@ static unsigned int unix_poll(struct file *file, struct socket *sock, poll_table if (sk->sk_shutdown == SHUTDOWN_MASK) mask |= POLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) - mask |= POLLRDHUP; + mask |= POLLRDHUP | POLLIN | POLLRDNORM; /* readable? */ - if (!skb_queue_empty(&sk->sk_receive_queue) || - (sk->sk_shutdown & RCV_SHUTDOWN)) + if (!skb_queue_empty(&sk->sk_receive_queue)) mask |= POLLIN | POLLRDNORM; /* Connection-based need to check for termination and startup */ -- cgit v1.1 From 65040c33ee8d0199ab7686402bffdbf9e1e26cbe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Diego=20Elio=20=27Flameeyes=27=20Petten=C3=B2?= Date: Fri, 3 Sep 2010 03:47:03 +0000 Subject: sctp: implement SIOCINQ ioctl() (take 3) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This simple patch copies the current approach for SIOCINQ ioctl() from DCCP into SCTP so that the userland code working with SCTP can use a similar interface across different protocols to know how much space to allocate for a buffer. Signed-off-by: Diego Elio Pettenò Signed-off-by: David S. Miller --- net/sctp/socket.c | 35 ++++++++++++++++++++++++++++++++++- 1 file changed, 34 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index cf6dcc9..6a691d8 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3592,7 +3592,40 @@ out: /* The SCTP ioctl handler. */ SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) { - return -ENOIOCTLCMD; + int rc = -ENOTCONN; + + sctp_lock_sock(sk); + + /* + * SEQPACKET-style sockets in LISTENING state are valid, for + * SCTP, so only discard TCP-style sockets in LISTENING state. + */ + if (sctp_style(sk, TCP) && sctp_sstate(sk, LISTENING)) + goto out; + + switch (cmd) { + case SIOCINQ: { + struct sk_buff *skb; + unsigned int amount = 0; + + skb = skb_peek(&sk->sk_receive_queue); + if (skb != NULL) { + /* + * We will only return the amount of this packet since + * that is all that will be read. + */ + amount = skb->len; + } + rc = put_user(amount, (int __user *)arg); + } + break; + default: + rc = -ENOIOCTLCMD; + break; + } +out: + sctp_release_sock(sk); + return rc; } /* This is the function which gets called during socket creation to -- cgit v1.1 From 6febfca98f25c7ee5c3ff7fc85e048bf82230ad5 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Fri, 3 Sep 2010 23:12:37 +0000 Subject: net: rps: add the shortcut for one rps_cpus When there is only one rps_cpus, skb_get_rxhash() can be eliminated. Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/core/dev.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index efd318d..cdbbea3 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2343,7 +2343,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, struct rps_dev_flow **rflowp) { struct netdev_rx_queue *rxqueue; - struct rps_map *map; + struct rps_map *map = NULL; struct rps_dev_flow_table *flow_table; struct rps_sock_flow_table *sock_flow_table; int cpu = -1; @@ -2361,8 +2361,17 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } else rxqueue = dev->_rx; - if (!rxqueue->rps_map && !rxqueue->rps_flow_table) + if (rxqueue->rps_map) { + map = rcu_dereference(rxqueue->rps_map); + if (map && map->len == 1) { + tcpu = map->cpus[0]; + if (cpu_online(tcpu)) + cpu = tcpu; + goto done; + } + } else if (!rxqueue->rps_flow_table) { goto done; + } skb_reset_network_header(skb); if (!skb_get_rxhash(skb)) @@ -2407,7 +2416,6 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, } } - map = rcu_dereference(rxqueue->rps_map); if (map) { tcpu = map->cpus[((u64) skb->rxhash * map->len) >> 32]; -- cgit v1.1 From fb8621bb6c040a25ac2fc246653859f841a1f53d Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Tue, 7 Sep 2010 03:55:00 +0000 Subject: net: remove address space warnings in net/socket.c Casts __kernel to __user pointer require __force markup, so add it. Also sock_get/setsockopt() takes @optval and/or @optlen arguments as user pointers but were taking kernel pointers, use new variables 'uoptval' and/or 'uoptlen' to fix it. These remove following warnings from sparse: net/socket.c:1922:46: warning: cast adds address space to expression () net/socket.c:3061:61: warning: incorrect type in argument 4 (different address spaces) net/socket.c:3061:61: expected char [noderef] *optval net/socket.c:3061:61: got char *optval net/socket.c:3061:69: warning: incorrect type in argument 5 (different address spaces) net/socket.c:3061:69: expected int [noderef] *optlen net/socket.c:3061:69: got int *optlen net/socket.c:3063:67: warning: incorrect type in argument 4 (different address spaces) net/socket.c:3063:67: expected char [noderef] *optval net/socket.c:3063:67: got char *optval net/socket.c:3064:45: warning: incorrect type in argument 5 (different address spaces) net/socket.c:3064:45: expected int [noderef] *optlen net/socket.c:3064:45: got int *optlen net/socket.c:3078:61: warning: incorrect type in argument 4 (different address spaces) net/socket.c:3078:61: expected char [noderef] *optval net/socket.c:3078:61: got char *optval net/socket.c:3080:67: warning: incorrect type in argument 4 (different address spaces) net/socket.c:3080:67: expected char [noderef] *optval net/socket.c:3080:67: got char *optval Signed-off-by: Namhyung Kim Signed-off-by: David S. Miller --- net/socket.c | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 7848d12..717a5f1 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1918,7 +1918,8 @@ SYSCALL_DEFINE3(sendmsg, int, fd, struct msghdr __user *, msg, unsigned, flags) * Afterwards, it will be a kernel pointer. Thus the compiler-assisted * checking falls down on this. */ - if (copy_from_user(ctl_buf, (void __user *)msg_sys.msg_control, + if (copy_from_user(ctl_buf, + (void __user __force *)msg_sys.msg_control, ctl_len)) goto out_freectl; msg_sys.msg_control = ctl_buf; @@ -3053,14 +3054,19 @@ int kernel_getsockopt(struct socket *sock, int level, int optname, char *optval, int *optlen) { mm_segment_t oldfs = get_fs(); + char __user *uoptval; + int __user *uoptlen; int err; + uoptval = (char __user __force *) optval; + uoptlen = (int __user __force *) optlen; + set_fs(KERNEL_DS); if (level == SOL_SOCKET) - err = sock_getsockopt(sock, level, optname, optval, optlen); + err = sock_getsockopt(sock, level, optname, uoptval, uoptlen); else - err = sock->ops->getsockopt(sock, level, optname, optval, - optlen); + err = sock->ops->getsockopt(sock, level, optname, uoptval, + uoptlen); set_fs(oldfs); return err; } @@ -3070,13 +3076,16 @@ int kernel_setsockopt(struct socket *sock, int level, int optname, char *optval, unsigned int optlen) { mm_segment_t oldfs = get_fs(); + char __user *uoptval; int err; + uoptval = (char __user __force *) optval; + set_fs(KERNEL_DS); if (level == SOL_SOCKET) - err = sock_setsockopt(sock, level, optname, optval, optlen); + err = sock_setsockopt(sock, level, optname, uoptval, optlen); else - err = sock->ops->setsockopt(sock, level, optname, optval, + err = sock->ops->setsockopt(sock, level, optname, uoptval, optlen); set_fs(oldfs); return err; -- cgit v1.1 From 9e2effba2c16fc3bd47da605116485afe01e0be0 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 12 Mar 2010 16:22:32 -0800 Subject: RDS: Fix BUG_ONs to not fire when in a tasklet in_interrupt() is true in softirqs. The BUG_ONs are supposed to check for if irqs are disabled, so we should use BUG_ON(irqs_disabled()) instead, duh. Signed-off-by: Andy Grover --- net/rds/ib_rdma.c | 2 +- net/rds/rdma.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a54cd63..a92aebc 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -441,7 +441,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr) /* FIXME we need a way to tell a r/w MR * from a r/o MR */ - BUG_ON(in_interrupt()); + BUG_ON(irqs_disabled()); set_page_dirty(page); put_page(page); } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 75fd13b..3b442d4 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -440,7 +440,7 @@ void rds_rdma_free_op(struct rds_rdma_op *ro) * is the case for a RDMA_READ which copies from remote * to local memory */ if (!ro->r_write) { - BUG_ON(in_interrupt()); + BUG_ON(irqs_disabled()); set_page_dirty(page); } put_page(page); -- cgit v1.1 From 35b52c70534cb7193b218ec12efe6bc595312097 Mon Sep 17 00:00:00 2001 From: Tina Yang Date: Thu, 1 Apr 2010 14:09:00 -0700 Subject: RDS: Fix corrupted rds_mrs On second look at this bug (OFED #2002), it seems that the collision is not with the retransmission queue (packet acked by the peer), but with the local send completion. A theoretical sequence of events (from time t0 to t3) is thought to be as follows, Thread #1 t0: sock_release rds_release rds_send_drop_to /* wait on send completion */ t2: rds_rdma_drop_keys() /* destroy & free all mrs */ Thread #2 t1: rds_ib_send_cq_comp_handler rds_ib_send_unmap_rm rds_message_unmapped /* wake up #1 @ t0 */ t3: rds_message_put rds_message_purge rds_mr_put /* memory corruption detected */ The problem with the rds_rdma_drop_keys() is it could remove a mr's refcount more than its due (i.e. repeatedly as long as it still remains in the tree (mr->r_refcount > 0)). Theoretically it should remove only one reference - reference by the tree. /* Release any MRs associated with this socket */ while ((node = rb_first(&rs->rs_rdma_keys))) { mr = container_of(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; rds_mr_put(mr); } I think the correct way of doing it is to remove the mr from the tree and rds_destroy_mr it first, then a rds_mr_put() to decrement its reference count by one. Whichever thread holds the last reference will free the mr via rds_mr_put(). Signed-off-by: Tina Yang Signed-off-by: Andy Grover --- net/rds/rdma.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 3b442d4..463b458 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -130,14 +130,22 @@ void rds_rdma_drop_keys(struct rds_sock *rs) { struct rds_mr *mr; struct rb_node *node; + unsigned long flags; /* Release any MRs associated with this socket */ + spin_lock_irqsave(&rs->rs_rdma_lock, flags); while ((node = rb_first(&rs->rs_rdma_keys))) { mr = container_of(node, struct rds_mr, r_rb_node); if (mr->r_trans == rs->rs_transport) mr->r_invalidate = 0; + rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); + RB_CLEAR_NODE(&mr->r_rb_node); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + rds_destroy_mr(mr); rds_mr_put(mr); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); } + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); if (rs->rs_transport && rs->rs_transport->flush_mrs) rs->rs_transport->flush_mrs(); -- cgit v1.1 From 7c82eaf00ec7d460932be9314b29997006b799b6 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 19 Feb 2010 18:01:41 -0800 Subject: RDS: Rewrite rds_send_drop_to() for clarity This function has been the source of numerous bugs; it's just too complicated. Simplified to nest spinlocks cleanly within the second loop body, and kick out early if there are no rms to drop. This will be a little slower because conn lock is grabbed for each entry instead of "caching" the lock across rms, but this should be entirely irrelevant to fastpath performance. Signed-off-by: Andy Grover --- net/rds/send.c | 64 ++++++++++++++++++++++++++-------------------------------- 1 file changed, 29 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 9c1c6bc..aee58f9 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -619,9 +619,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) { struct rds_message *rm, *tmp; struct rds_connection *conn; - unsigned long flags, flags2; + unsigned long flags; LIST_HEAD(list); - int wake = 0; /* get all the messages we're dropping under the rs lock */ spin_lock_irqsave(&rs->rs_lock, flags); @@ -631,59 +630,54 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) dest->sin_port != rm->m_inc.i_hdr.h_dport)) continue; - wake = 1; list_move(&rm->m_sock_item, &list); rds_send_sndbuf_remove(rs, rm); clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags); } /* order flag updates with the rs lock */ - if (wake) - smp_mb__after_clear_bit(); + smp_mb__after_clear_bit(); spin_unlock_irqrestore(&rs->rs_lock, flags); - conn = NULL; + if (list_empty(&list)) + return; - /* now remove the messages from the conn list as needed */ + /* Remove the messages from the conn */ list_for_each_entry(rm, &list, m_sock_item) { - /* We do this here rather than in the loop above, so that - * we don't have to nest m_rs_lock under rs->rs_lock */ - spin_lock_irqsave(&rm->m_rs_lock, flags2); - /* If this is a RDMA operation, notify the app. */ - spin_lock(&rs->rs_lock); - __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); - spin_unlock(&rs->rs_lock); - rm->m_rs = NULL; - spin_unlock_irqrestore(&rm->m_rs_lock, flags2); + + conn = rm->m_inc.i_conn; + spin_lock_irqsave(&conn->c_lock, flags); /* - * If we see this flag cleared then we're *sure* that someone - * else beat us to removing it from the conn. If we race - * with their flag update we'll get the lock and then really - * see that the flag has been cleared. + * Maybe someone else beat us to removing rm from the conn. + * If we race with their flag update we'll get the lock and + * then really see that the flag has been cleared. */ - if (!test_bit(RDS_MSG_ON_CONN, &rm->m_flags)) + if (!test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { + spin_unlock_irqrestore(&conn->c_lock, flags); continue; - - if (conn != rm->m_inc.i_conn) { - if (conn) - spin_unlock_irqrestore(&conn->c_lock, flags); - conn = rm->m_inc.i_conn; - spin_lock_irqsave(&conn->c_lock, flags); } - if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) { - list_del_init(&rm->m_conn_item); - rds_message_put(rm); - } - } + /* + * Couldn't grab m_rs_lock in top loop (lock ordering), + * but we can now. + */ + spin_lock(&rm->m_rs_lock); - if (conn) + spin_lock(&rs->rs_lock); + __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); + spin_unlock(&rs->rs_lock); + + rm->m_rs = NULL; + spin_unlock(&rm->m_rs_lock); + + list_del_init(&rm->m_conn_item); + rds_message_put(rm); spin_unlock_irqrestore(&conn->c_lock, flags); + } - if (wake) - rds_wake_sk_sleep(rs); + rds_wake_sk_sleep(rs); while (!list_empty(&list)) { rm = list_entry(list.next, struct rds_message, m_sock_item); -- cgit v1.1 From 9de0864cf55927a7383b5ba6e48834ff3ef053de Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 16:50:54 -0700 Subject: RDS: Fix locking in send on m_rs_lock Do not nest m_rs_lock under c_lock Disable interrupts in {rdma,atomic}_send_complete Signed-off-by: Andy Grover --- net/rds/send.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index aee58f9..725fb04 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -415,8 +415,9 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rds_sock *rs = NULL; struct rds_rdma_op *ro; struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); ro = rm->m_rdma_op; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && @@ -433,7 +434,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) ro->r_notifier = NULL; } - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); @@ -647,8 +648,8 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) list_for_each_entry(rm, &list, m_sock_item) { conn = rm->m_inc.i_conn; - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock_irqsave(&conn->c_lock, flags); /* * Maybe someone else beat us to removing rm from the conn. * If we race with their flag update we'll get the lock and @@ -658,23 +659,23 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) spin_unlock_irqrestore(&conn->c_lock, flags); continue; } + list_del_init(&rm->m_conn_item); + spin_unlock_irqrestore(&conn->c_lock, flags); /* * Couldn't grab m_rs_lock in top loop (lock ordering), * but we can now. */ - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); rm->m_rs = NULL; - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); - list_del_init(&rm->m_conn_item); rds_message_put(rm); - spin_unlock_irqrestore(&conn->c_lock, flags); } rds_wake_sk_sleep(rs); -- cgit v1.1 From 2dc393573430f853e56e25bf4b41c34ba2aa8fd6 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 11 Jun 2010 13:49:13 -0700 Subject: RDS: move rds_shutdown_worker impl. to rds_conn_shutdown This fits better in connection.c, rather than threads.c. Signed-off-by: Andy Grover --- net/rds/connection.c | 53 +++++++++++++++++++++++++++++++++++++++++++++ net/rds/rds.h | 2 ++ net/rds/threads.c | 61 +++++++--------------------------------------------- 3 files changed, 63 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 7619b67..895e39c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -263,6 +263,59 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); +void rds_conn_shutdown(struct rds_connection *conn) +{ + /* shut it down unless it's down already */ + if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { + /* + * Quiesce the connection mgmt handlers before we start tearing + * things down. We don't hold the mutex for the entire + * duration of the shutdown operation, else we may be + * deadlocking with the CM handler. Instead, the CM event + * handler is supposed to check for state DISCONNECTING + */ + mutex_lock(&conn->c_cm_lock); + if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) + && !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { + rds_conn_error(conn, "shutdown called in state %d\n", + atomic_read(&conn->c_state)); + mutex_unlock(&conn->c_cm_lock); + return; + } + mutex_unlock(&conn->c_cm_lock); + + mutex_lock(&conn->c_send_lock); + conn->c_trans->conn_shutdown(conn); + rds_conn_reset(conn); + mutex_unlock(&conn->c_send_lock); + + if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { + /* This can happen - eg when we're in the middle of tearing + * down the connection, and someone unloads the rds module. + * Quite reproduceable with loopback connections. + * Mostly harmless. + */ + rds_conn_error(conn, + "%s: failed to transition to state DOWN, " + "current state is %d\n", + __func__, + atomic_read(&conn->c_state)); + return; + } + } + + /* Then reconnect if it's still live. + * The passive side of an IB loopback connection is never added + * to the conn hash, so we never trigger a reconnect on this + * conn - the reconnect is always triggered by the active peer. */ + cancel_delayed_work_sync(&conn->c_conn_w); + if (!hlist_unhashed(&conn->c_hash_node)) + rds_queue_reconnect(conn); +} + +/* + * Stop and free a connection. + */ void rds_conn_destroy(struct rds_connection *conn) { struct rds_message *rm, *rtmp; diff --git a/net/rds/rds.h b/net/rds/rds.h index c224b5b..1d3eef6 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -527,6 +527,7 @@ struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); +void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); @@ -681,6 +682,7 @@ extern unsigned int rds_sysctl_trace_level; int __init rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; +void rds_queue_reconnect(struct rds_connection *conn); void rds_connect_worker(struct work_struct *); void rds_shutdown_worker(struct work_struct *); void rds_send_worker(struct work_struct *); diff --git a/net/rds/threads.c b/net/rds/threads.c index 786c20e..6e2e43d 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -110,7 +110,7 @@ EXPORT_SYMBOL_GPL(rds_connect_complete); * We should *always* start with a random backoff; otherwise a broken connection * will always take several iterations to be re-established. */ -static void rds_queue_reconnect(struct rds_connection *conn) +void rds_queue_reconnect(struct rds_connection *conn) { unsigned long rand; @@ -156,58 +156,6 @@ void rds_connect_worker(struct work_struct *work) } } -void rds_shutdown_worker(struct work_struct *work) -{ - struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); - - /* shut it down unless it's down already */ - if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_DOWN)) { - /* - * Quiesce the connection mgmt handlers before we start tearing - * things down. We don't hold the mutex for the entire - * duration of the shutdown operation, else we may be - * deadlocking with the CM handler. Instead, the CM event - * handler is supposed to check for state DISCONNECTING - */ - mutex_lock(&conn->c_cm_lock); - if (!rds_conn_transition(conn, RDS_CONN_UP, RDS_CONN_DISCONNECTING) && - !rds_conn_transition(conn, RDS_CONN_ERROR, RDS_CONN_DISCONNECTING)) { - rds_conn_error(conn, "shutdown called in state %d\n", - atomic_read(&conn->c_state)); - mutex_unlock(&conn->c_cm_lock); - return; - } - mutex_unlock(&conn->c_cm_lock); - - mutex_lock(&conn->c_send_lock); - conn->c_trans->conn_shutdown(conn); - rds_conn_reset(conn); - mutex_unlock(&conn->c_send_lock); - - if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { - /* This can happen - eg when we're in the middle of tearing - * down the connection, and someone unloads the rds module. - * Quite reproduceable with loopback connections. - * Mostly harmless. - */ - rds_conn_error(conn, - "%s: failed to transition to state DOWN, " - "current state is %d\n", - __func__, - atomic_read(&conn->c_state)); - return; - } - } - - /* Then reconnect if it's still live. - * The passive side of an IB loopback connection is never added - * to the conn hash, so we never trigger a reconnect on this - * conn - the reconnect is always triggered by the active peer. */ - cancel_delayed_work(&conn->c_conn_w); - if (!hlist_unhashed(&conn->c_hash_node)) - rds_queue_reconnect(conn); -} - void rds_send_worker(struct work_struct *work) { struct rds_connection *conn = container_of(work, struct rds_connection, c_send_w.work); @@ -252,6 +200,13 @@ void rds_recv_worker(struct work_struct *work) } } +void rds_shutdown_worker(struct work_struct *work) +{ + struct rds_connection *conn = container_of(work, struct rds_connection, c_down_w); + + rds_conn_shutdown(conn); +} + void rds_threads_exit(void) { destroy_workqueue(rds_wq); -- cgit v1.1 From 8690bfa17aea4c42da1bcf90a7af93d161eca624 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 11:56:44 -0800 Subject: RDS: cleanup: remove "== NULL"s and "!= NULL"s in ptr comparisons Favor "if (foo)" style over "if (foo != NULL)". Signed-off-by: Andy Grover --- net/rds/af_rds.c | 2 +- net/rds/bind.c | 4 ++-- net/rds/cong.c | 6 +++--- net/rds/connection.c | 4 ++-- net/rds/ib_cm.c | 14 +++++++------- net/rds/ib_recv.c | 20 ++++++++++---------- net/rds/ib_send.c | 4 ++-- net/rds/ib_sysctl.c | 2 +- net/rds/info.c | 12 ++++++------ net/rds/iw_cm.c | 14 +++++++------- net/rds/iw_recv.c | 20 ++++++++++---------- net/rds/iw_send.c | 4 ++-- net/rds/iw_sysctl.c | 2 +- net/rds/loop.c | 2 +- net/rds/message.c | 4 ++-- net/rds/page.c | 4 ++-- net/rds/rdma.c | 18 +++++++++--------- net/rds/recv.c | 4 ++-- net/rds/send.c | 14 +++++++------- net/rds/sysctl.c | 2 +- net/rds/tcp.c | 4 ++-- net/rds/tcp_connect.c | 2 +- net/rds/tcp_listen.c | 4 ++-- net/rds/tcp_recv.c | 10 +++++----- net/rds/tcp_send.c | 2 +- net/rds/threads.c | 2 +- 26 files changed, 90 insertions(+), 90 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index aebfecb..63474e1 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -62,7 +62,7 @@ static int rds_release(struct socket *sock) struct rds_sock *rs; unsigned long flags; - if (sk == NULL) + if (!sk) goto out; rs = rds_sk_to_rs(sk); diff --git a/net/rds/bind.c b/net/rds/bind.c index 5d95fc0..65de5cb 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -121,7 +121,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) do { if (rover == 0) rover++; - if (rds_bind_tree_walk(addr, cpu_to_be16(rover), rs) == NULL) { + if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) { *port = cpu_to_be16(rover); ret = 0; break; @@ -184,7 +184,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; trans = rds_trans_get_preferred(sin->sin_addr.s_addr); - if (trans == NULL) { + if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); if (printk_ratelimit()) diff --git a/net/rds/cong.c b/net/rds/cong.c index 0871a29f..c741e90 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -141,7 +141,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) unsigned long flags; map = kzalloc(sizeof(struct rds_cong_map), GFP_KERNEL); - if (map == NULL) + if (!map) return NULL; map->m_addr = addr; @@ -159,7 +159,7 @@ static struct rds_cong_map *rds_cong_from_addr(__be32 addr) ret = rds_cong_tree_walk(addr, map); spin_unlock_irqrestore(&rds_cong_lock, flags); - if (ret == NULL) { + if (!ret) { ret = map; map = NULL; } @@ -205,7 +205,7 @@ int rds_cong_get_maps(struct rds_connection *conn) conn->c_lcong = rds_cong_from_addr(conn->c_laddr); conn->c_fcong = rds_cong_from_addr(conn->c_faddr); - if (conn->c_lcong == NULL || conn->c_fcong == NULL) + if (!(conn->c_lcong && conn->c_fcong)) return -ENOMEM; return 0; diff --git a/net/rds/connection.c b/net/rds/connection.c index 895e39c..9c9afb5 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -148,7 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, goto out; conn = kmem_cache_zalloc(rds_conn_slab, gfp); - if (conn == NULL) { + if (!conn) { conn = ERR_PTR(-ENOMEM); goto out; } @@ -502,7 +502,7 @@ int __init rds_conn_init(void) rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), 0, 0, NULL); - if (rds_conn_slab == NULL) + if (!rds_conn_slab) return -ENOMEM; rds_info_register_func(RDS_INFO_CONNECTIONS, rds_conn_info); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index f688327..b46bc2f 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -230,7 +230,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) * the rds_ibdev at all. */ rds_ibdev = ib_get_client_data(dev, &rds_ib_client); - if (rds_ibdev == NULL) { + if (!rds_ibdev) { if (printk_ratelimit()) printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", dev->name); @@ -306,7 +306,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -316,7 +316,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -324,14 +324,14 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - if (ic->i_sends == NULL) { + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; @@ -339,7 +339,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); - if (ic->i_recvs == NULL) { + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; @@ -693,7 +693,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) /* XXX too lazy? */ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); - if (ic == NULL) + if (!ic) return -ENOMEM; INIT_LIST_HEAD(&ic->ib_node); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index c74e990..d0ee9c1 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -53,7 +53,7 @@ static void rds_ib_frag_drop_page(struct rds_page_frag *frag) static void rds_ib_frag_free(struct rds_page_frag *frag) { rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); + BUG_ON(frag->f_page); kmem_cache_free(rds_ib_frag_slab, frag); } @@ -143,14 +143,14 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; - if (recv->r_ibinc == NULL) { + if (!recv->r_ibinc) { if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { rds_ib_stats_inc(s_ib_rx_alloc_limit); goto out; } recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, kptr_gfp); - if (recv->r_ibinc == NULL) { + if (!recv->r_ibinc) { atomic_dec(&rds_ib_allocation); goto out; } @@ -158,17 +158,17 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); } - if (recv->r_frag == NULL) { + if (!recv->r_frag) { recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) + if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); recv->r_frag->f_page = NULL; } - if (ic->i_frag.f_page == NULL) { + if (!ic->i_frag.f_page) { ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) + if (!ic->i_frag.f_page) goto out; ic->i_frag.f_offset = 0; } @@ -757,7 +757,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (ibinc == NULL) { + if (!ibinc) { ibinc = recv->r_ibinc; recv->r_ibinc = NULL; ic->i_ibinc = ibinc; @@ -940,13 +940,13 @@ int __init rds_ib_recv_init(void) rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", sizeof(struct rds_ib_incoming), 0, 0, NULL); - if (rds_ib_incoming_slab == NULL) + if (!rds_ib_incoming_slab) goto out; rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), 0, 0, NULL); - if (rds_ib_frag_slab == NULL) + if (!rds_ib_frag_slab) kmem_cache_destroy(rds_ib_incoming_slab); else ret = 0; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 17fa808..0b0090d 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -86,7 +86,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - if (rm->m_rdma_op != NULL) { + if (rm->m_rdma_op) { rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); /* If the user asked for a completion notification on this @@ -525,7 +525,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { + if (!ic->i_rm) { /* printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", be16_to_cpu(rm->m_inc.i_hdr.h_dport), diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index 03f01cb..c070524 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -135,7 +135,7 @@ void rds_ib_sysctl_exit(void) int __init rds_ib_sysctl_init(void) { rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); - if (rds_ib_sysctl_hdr == NULL) + if (!rds_ib_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/info.c b/net/rds/info.c index c45c417..4fdf1b6 100644 --- a/net/rds/info.c +++ b/net/rds/info.c @@ -76,7 +76,7 @@ void rds_info_register_func(int optname, rds_info_func func) BUG_ON(optname < RDS_INFO_FIRST || optname > RDS_INFO_LAST); spin_lock(&rds_info_lock); - BUG_ON(rds_info_funcs[offset] != NULL); + BUG_ON(rds_info_funcs[offset]); rds_info_funcs[offset] = func; spin_unlock(&rds_info_lock); } @@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(rds_info_deregister_func); */ void rds_info_iter_unmap(struct rds_info_iterator *iter) { - if (iter->addr != NULL) { + if (iter->addr) { kunmap_atomic(iter->addr, KM_USER0); iter->addr = NULL; } @@ -117,7 +117,7 @@ void rds_info_copy(struct rds_info_iterator *iter, void *data, unsigned long this; while (bytes) { - if (iter->addr == NULL) + if (!iter->addr) iter->addr = kmap_atomic(*iter->pages, KM_USER0); this = min(bytes, PAGE_SIZE - iter->offset); @@ -188,7 +188,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, >> PAGE_SHIFT; pages = kmalloc(nr_pages * sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } @@ -206,7 +206,7 @@ int rds_info_getsockopt(struct socket *sock, int optname, char __user *optval, call_func: func = rds_info_funcs[optname - RDS_INFO_FIRST]; - if (func == NULL) { + if (!func) { ret = -ENOPROTOOPT; goto out; } @@ -234,7 +234,7 @@ call_func: ret = -EFAULT; out: - for (i = 0; pages != NULL && i < nr_pages; i++) + for (i = 0; pages && i < nr_pages; i++) put_page(pages[i]); kfree(pages); diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c index b5dd6ac..712cf2d 100644 --- a/net/rds/iw_cm.c +++ b/net/rds/iw_cm.c @@ -257,7 +257,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) * the rds_iwdev at all. */ rds_iwdev = ib_get_client_data(dev, &rds_iw_client); - if (rds_iwdev == NULL) { + if (!rds_iwdev) { if (printk_ratelimit()) printk(KERN_NOTICE "RDS/IW: No client_data for device %s\n", dev->name); @@ -292,7 +292,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_send_ring.w_nr * sizeof(struct rds_header), &ic->i_send_hdrs_dma, GFP_KERNEL); - if (ic->i_send_hdrs == NULL) { + if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); goto out; @@ -302,7 +302,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_recv_ring.w_nr * sizeof(struct rds_header), &ic->i_recv_hdrs_dma, GFP_KERNEL); - if (ic->i_recv_hdrs == NULL) { + if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); goto out; @@ -310,14 +310,14 @@ static int rds_iw_setup_qp(struct rds_connection *conn) ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), &ic->i_ack_dma, GFP_KERNEL); - if (ic->i_ack == NULL) { + if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); goto out; } ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work)); - if (ic->i_sends == NULL) { + if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); goto out; @@ -325,7 +325,7 @@ static int rds_iw_setup_qp(struct rds_connection *conn) rds_iw_send_init_ring(ic); ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work)); - if (ic->i_recvs == NULL) { + if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); goto out; @@ -696,7 +696,7 @@ int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp) /* XXX too lazy? */ ic = kzalloc(sizeof(struct rds_iw_connection), GFP_KERNEL); - if (ic == NULL) + if (!ic) return -ENOMEM; INIT_LIST_HEAD(&ic->iw_node); diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 3d47906..48bcf4f 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -53,7 +53,7 @@ static void rds_iw_frag_drop_page(struct rds_page_frag *frag) static void rds_iw_frag_free(struct rds_page_frag *frag) { rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page != NULL); + BUG_ON(frag->f_page); kmem_cache_free(rds_iw_frag_slab, frag); } @@ -143,14 +143,14 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; - if (recv->r_iwinc == NULL) { + if (!recv->r_iwinc) { if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) { rds_iw_stats_inc(s_iw_rx_alloc_limit); goto out; } recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab, kptr_gfp); - if (recv->r_iwinc == NULL) { + if (!recv->r_iwinc) { atomic_dec(&rds_iw_allocation); goto out; } @@ -158,17 +158,17 @@ static int rds_iw_recv_refill_one(struct rds_connection *conn, rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr); } - if (recv->r_frag == NULL) { + if (!recv->r_frag) { recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp); - if (recv->r_frag == NULL) + if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); recv->r_frag->f_page = NULL; } - if (ic->i_frag.f_page == NULL) { + if (!ic->i_frag.f_page) { ic->i_frag.f_page = alloc_page(page_gfp); - if (ic->i_frag.f_page == NULL) + if (!ic->i_frag.f_page) goto out; ic->i_frag.f_offset = 0; } @@ -716,7 +716,7 @@ static void rds_iw_process_recv(struct rds_connection *conn, * into the inc and save the inc so we can hang upcoming fragments * off its list. */ - if (iwinc == NULL) { + if (!iwinc) { iwinc = recv->r_iwinc; recv->r_iwinc = NULL; ic->i_iwinc = iwinc; @@ -899,13 +899,13 @@ int __init rds_iw_recv_init(void) rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming", sizeof(struct rds_iw_incoming), 0, 0, NULL); - if (rds_iw_incoming_slab == NULL) + if (!rds_iw_incoming_slab) goto out; rds_iw_frag_slab = kmem_cache_create("rds_iw_frag", sizeof(struct rds_page_frag), 0, 0, NULL); - if (rds_iw_frag_slab == NULL) + if (!rds_iw_frag_slab) kmem_cache_destroy(rds_iw_incoming_slab); else ret = 0; diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 52182ff..dced532 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -86,7 +86,7 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - if (rm->m_rdma_op != NULL) { + if (rm->m_rdma_op) { rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); /* If the user asked for a completion notification on this @@ -556,7 +556,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (ic->i_rm == NULL) { + if (!ic->i_rm) { /* printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n", be16_to_cpu(rm->m_inc.i_hdr.h_dport), diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 1c4428a..3cb0587 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -125,7 +125,7 @@ void rds_iw_sysctl_exit(void) int __init rds_iw_sysctl_init(void) { rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); - if (rds_iw_sysctl_hdr == NULL) + if (!rds_iw_sysctl_hdr) return -ENOMEM; return 0; } diff --git a/net/rds/loop.c b/net/rds/loop.c index dd98793..a74b469 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -112,7 +112,7 @@ static int rds_loop_conn_alloc(struct rds_connection *conn, gfp_t gfp) unsigned long flags; lc = kzalloc(sizeof(struct rds_loop_connection), GFP_KERNEL); - if (lc == NULL) + if (!lc) return -ENOMEM; INIT_LIST_HEAD(&lc->loop_node); diff --git a/net/rds/message.c b/net/rds/message.c index 9a1d67e..809656c 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -240,7 +240,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in unsigned int i; rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) + if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); @@ -284,7 +284,7 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ while (total_len) { - if (sg_page(sg) == NULL) { + if (!sg_page(sg)) { ret = rds_page_remainder_alloc(sg, total_len, GFP_HIGHUSER); if (ret) diff --git a/net/rds/page.c b/net/rds/page.c index 595a952..e5b2527 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -116,7 +116,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, /* jump straight to allocation if we're trying for a huge page */ if (bytes >= PAGE_SIZE) { page = alloc_page(gfp); - if (page == NULL) { + if (!page) { ret = -ENOMEM; } else { sg_set_page(scat, page, PAGE_SIZE, 0); @@ -162,7 +162,7 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes, rem = &per_cpu(rds_page_remainders, get_cpu()); local_irq_save(flags); - if (page == NULL) { + if (!page) { ret = -ENOMEM; break; } diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 463b458..dee698b 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -189,7 +189,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, goto out; } - if (rs->rs_transport->get_mr == NULL) { + if (!rs->rs_transport->get_mr) { ret = -EOPNOTSUPP; goto out; } @@ -205,13 +205,13 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, /* XXX clamp nr_pages to limit the size of this alloc? */ pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } mr = kzalloc(sizeof(struct rds_mr), GFP_KERNEL); - if (mr == NULL) { + if (!mr) { ret = -ENOMEM; goto out; } @@ -244,7 +244,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, nents = ret; sg = kcalloc(nents, sizeof(*sg), GFP_KERNEL); - if (sg == NULL) { + if (!sg) { ret = -ENOMEM; goto out; } @@ -425,7 +425,7 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ - if (mr != NULL) { + if (mr) { if (mr->r_trans->sync_mr) mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); @@ -511,13 +511,13 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, } pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); - if (pages == NULL) { + if (!pages) { ret = -ENOMEM; goto out; } op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); - if (op == NULL) { + if (!op) { ret = -ENOMEM; goto out; } @@ -643,7 +643,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_rdma_op *op; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->m_rdma_op != NULL) + rm->m_rdma_op) return -EINVAL; op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); @@ -681,7 +681,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr == NULL) + if (!mr) err = -EINVAL; /* invalid r_key */ else atomic_inc(&mr->r_refcount); diff --git a/net/rds/recv.c b/net/rds/recv.c index c93588c..88f1f5a 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -210,7 +210,7 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, } rs = rds_find_bound(daddr, inc->i_hdr.h_dport); - if (rs == NULL) { + if (!rs) { rds_stats_inc(s_recv_drop_no_sock); goto out; } @@ -251,7 +251,7 @@ static int rds_next_incoming(struct rds_sock *rs, struct rds_incoming **inc) { unsigned long flags; - if (*inc == NULL) { + if (!*inc) { read_lock_irqsave(&rs->rs_recv_lock, flags); if (!list_empty(&rs->rs_recv_queue)) { *inc = list_entry(rs->rs_recv_queue.next, diff --git a/net/rds/send.c b/net/rds/send.c index 725fb04..817997d 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -164,7 +164,7 @@ int rds_send_xmit(struct rds_connection *conn) * offset and S/G temporaries. */ rm = conn->c_xmit_rm; - if (rm != NULL && + if (rm && conn->c_xmit_hdr_off == sizeof(struct rds_header) && conn->c_xmit_sg == rm->m_nents) { conn->c_xmit_rm = NULL; @@ -180,8 +180,8 @@ int rds_send_xmit(struct rds_connection *conn) /* If we're asked to send a cong map update, do so. */ - if (rm == NULL && test_and_clear_bit(0, &conn->c_map_queued)) { - if (conn->c_trans->xmit_cong_map != NULL) { + if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { + if (conn->c_trans->xmit_cong_map) { conn->c_map_offset = 0; conn->c_map_bytes = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; @@ -204,7 +204,7 @@ int rds_send_xmit(struct rds_connection *conn) * the connction. We can use this ref while holding the * send_sem.. rds_send_reset() is serialized with it. */ - if (rm == NULL) { + if (!rm) { unsigned int len; spin_lock_irqsave(&conn->c_lock, flags); @@ -224,7 +224,7 @@ int rds_send_xmit(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); - if (rm == NULL) { + if (!rm) { was_empty = 1; break; } @@ -875,7 +875,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; if ((rm->m_rdma_cookie || rm->m_rdma_op) && - conn->c_trans->xmit_rdma == NULL) { + !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", rm->m_rdma_op, conn->c_trans->xmit_rdma); @@ -961,7 +961,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) int ret = 0; rm = rds_message_alloc(0, GFP_ATOMIC); - if (rm == NULL) { + if (!rm) { ret = -ENOMEM; goto out; } diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 7829a20..5a6dd81 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -111,7 +111,7 @@ int __init rds_sysctl_init(void) rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; rds_sysctl_reg_table = register_sysctl_paths(rds_sysctl_path, rds_sysctl_rds_table); - if (rds_sysctl_reg_table == NULL) + if (!rds_sysctl_reg_table) return -ENOMEM; return 0; } diff --git a/net/rds/tcp.c b/net/rds/tcp.c index babf457..aebe103 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -200,7 +200,7 @@ static int rds_tcp_conn_alloc(struct rds_connection *conn, gfp_t gfp) struct rds_tcp_connection *tc; tc = kmem_cache_alloc(rds_tcp_conn_slab, gfp); - if (tc == NULL) + if (!tc) return -ENOMEM; tc->t_sock = NULL; @@ -283,7 +283,7 @@ int __init rds_tcp_init(void) rds_tcp_conn_slab = kmem_cache_create("rds_tcp_connection", sizeof(struct rds_tcp_connection), 0, 0, NULL); - if (rds_tcp_conn_slab == NULL) { + if (!rds_tcp_conn_slab) { ret = -ENOMEM; goto out; } diff --git a/net/rds/tcp_connect.c b/net/rds/tcp_connect.c index c397524..a65ee78 100644 --- a/net/rds/tcp_connect.c +++ b/net/rds/tcp_connect.c @@ -45,7 +45,7 @@ void rds_tcp_state_change(struct sock *sk) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { + if (!conn) { state_change = sk->sk_state_change; goto out; } diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 975183f..50b649e 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -116,7 +116,7 @@ void rds_tcp_listen_data_ready(struct sock *sk, int bytes) read_lock(&sk->sk_callback_lock); ready = sk->sk_user_data; - if (ready == NULL) { /* check for teardown race */ + if (!ready) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } @@ -178,7 +178,7 @@ void rds_tcp_listen_stop(void) struct socket *sock = rds_tcp_listen_sock; struct sock *sk; - if (sock == NULL) + if (!sock) return; sk = sock->sk; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 1aba687..ea73829 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -190,10 +190,10 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, * processing. */ while (left) { - if (tinc == NULL) { + if (!tinc) { tinc = kmem_cache_alloc(rds_tcp_incoming_slab, arg->gfp); - if (tinc == NULL) { + if (!tinc) { desc->error = -ENOMEM; goto out; } @@ -229,7 +229,7 @@ static int rds_tcp_data_recv(read_descriptor_t *desc, struct sk_buff *skb, if (left && tc->t_tinc_data_rem) { clone = skb_clone(skb, arg->gfp); - if (clone == NULL) { + if (!clone) { desc->error = -ENOMEM; goto out; } @@ -326,7 +326,7 @@ void rds_tcp_data_ready(struct sock *sk, int bytes) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { /* check for teardown race */ + if (!conn) { /* check for teardown race */ ready = sk->sk_data_ready; goto out; } @@ -347,7 +347,7 @@ int __init rds_tcp_recv_init(void) rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", sizeof(struct rds_tcp_incoming), 0, 0, NULL); - if (rds_tcp_incoming_slab == NULL) + if (!rds_tcp_incoming_slab) return -ENOMEM; return 0; } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index a28b895..e5f6cce 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -226,7 +226,7 @@ void rds_tcp_write_space(struct sock *sk) read_lock(&sk->sk_callback_lock); conn = sk->sk_user_data; - if (conn == NULL) { + if (!conn) { write_space = sk->sk_write_space; goto out; } diff --git a/net/rds/threads.c b/net/rds/threads.c index 6e2e43d..7a8ca7a 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -215,7 +215,7 @@ void rds_threads_exit(void) int __init rds_threads_init(void) { rds_wq = create_workqueue("krdsd"); - if (rds_wq == NULL) + if (!rds_wq) return -ENOMEM; return 0; -- cgit v1.1 From e779137aa76d38d5c33a98ed887092ae4e4f016f Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 12:15:02 -0800 Subject: RDS: break out rdma and data ops into nested structs in rds_message Clearly separate rdma-related variables in rm from data-related ones. This is in anticipation of adding atomic support. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 44 +++++++++++++++++++++++--------------------- net/rds/iw_send.c | 38 ++++++++++++++++++++------------------ net/rds/message.c | 30 +++++++++++++++--------------- net/rds/rdma.c | 9 +++++---- net/rds/rds.h | 16 +++++++++++----- net/rds/send.c | 30 +++++++++++++++--------------- net/rds/tcp_send.c | 14 +++++++------- 7 files changed, 96 insertions(+), 85 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 0b0090d..5375020 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -83,11 +83,11 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, - DMA_TO_DEVICE); + rm->data.m_sg, rm->data.m_nents, + DMA_TO_DEVICE); - if (rm->m_rdma_op) { - rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); + if (rm->rdma.m_rdma_op) { + rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -111,10 +111,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, */ rds_ib_send_rdma_complete(rm, wc_status); - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + if (rm->rdma.m_rdma_op->r_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -244,8 +244,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rm = rds_send_get_message(conn, send->s_op); if (rm) { - if (rm->m_rdma_op) - rds_ib_send_unmap_rdma(ic, rm->m_rdma_op); + if (rm->rdma.m_rdma_op) + rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op); rds_ib_send_rdma_complete(rm, wc.status); rds_message_put(rm); } @@ -532,18 +532,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_inc.i_hdr.h_flags, be32_to_cpu(rm->m_inc.i_hdr.h_len)); */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (rm->data.m_nents) { + rm->data.m_count = ib_dma_map_sg(dev, + rm->data.m_sg, + rm->data.m_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count); + if (rm->data.m_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.m_count = 0; } ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; @@ -559,10 +561,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.m_rdma_op) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -590,7 +592,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->m_sg[sg]; + scat = &rm->data.m_sg[sg]; sent = 0; i = 0; @@ -600,7 +602,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence) send_flags = IB_SEND_FENCE; /* @@ -619,7 +621,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) { unsigned int len; send = &ic->i_sends[pos]; @@ -697,7 +699,7 @@ add_header: sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { + if (scat == &rm->data.m_sg[rm->data.m_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index dced532..c187e8f 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -83,11 +83,11 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->m_sg, rm->m_nents, + rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->m_rdma_op) { - rds_iw_send_unmap_rdma(ic, rm->m_rdma_op); + if (rm->rdma.m_rdma_op) { + rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -111,10 +111,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->m_rdma_op->r_bytes); + if (rm->rdma.m_rdma_op->r_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -563,18 +563,20 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_inc.i_hdr.h_flags, be32_to_cpu(rm->m_inc.i_hdr.h_len)); */ - if (rm->m_nents) { - rm->m_count = ib_dma_map_sg(dev, - rm->m_sg, rm->m_nents, DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->m_count); - if (rm->m_count == 0) { + if (rm->data.m_nents) { + rm->data.m_count = ib_dma_map_sg(dev, + rm->data.m_sg, + rm->data.m_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count); + if (rm->data.m_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->m_count = 0; + rm->data.m_count = 0; } ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; @@ -590,10 +592,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->m_rdma_op) { + if (rm->rdma.m_rdma_op) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -621,7 +623,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->m_sg[sg]; + scat = &rm->data.m_sg[sg]; sent = 0; i = 0; @@ -631,7 +633,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->m_rdma_op && rm->m_rdma_op->r_fence) + if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence) send_flags = IB_SEND_FENCE; /* @@ -650,7 +652,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->m_sg[rm->m_count]; i++) { + for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) { unsigned int len; send = &ic->i_sends[pos]; @@ -728,7 +730,7 @@ add_header: sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->m_sg[rm->m_count]) { + if (scat == &rm->data.m_sg[rm->data.m_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; diff --git a/net/rds/message.c b/net/rds/message.c index 809656c..4421d16 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -63,17 +63,17 @@ static void rds_message_purge(struct rds_message *rm) if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->m_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->m_sg[i])); + for (i = 0; i < rm->data.m_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.m_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->m_sg[i])); + __free_page(sg_page(&rm->data.m_sg[i])); } - rm->m_nents = 0; + rm->data.m_nents = 0; - if (rm->m_rdma_op) - rds_rdma_free_op(rm->m_rdma_op); - if (rm->m_rdma_mr) - rds_mr_put(rm->m_rdma_mr); + if (rm->rdma.m_rdma_op) + rds_rdma_free_op(rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_mr) + rds_mr_put(rm->rdma.m_rdma_mr); } void rds_message_inc_purge(struct rds_incoming *inc) @@ -224,7 +224,7 @@ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) goto out; if (nents) - sg_init_table(rm->m_sg, nents); + sg_init_table(rm->data.m_sg, nents); atomic_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); @@ -245,10 +245,10 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->m_nents = ceil(total_len, PAGE_SIZE); + rm->data.m_nents = ceil(total_len, PAGE_SIZE); - for (i = 0; i < rm->m_nents; ++i) { - sg_set_page(&rm->m_sg[i], + for (i = 0; i < rm->data.m_nents; ++i) { + sg_set_page(&rm->data.m_sg[i], virt_to_page(page_addrs[i]), PAGE_SIZE, 0); } @@ -278,7 +278,7 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, /* * now allocate and copy in the data payload. */ - sg = rm->m_sg; + sg = rm->data.m_sg; iov = first_iov; iov_off = 0; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ @@ -289,7 +289,7 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, GFP_HIGHUSER); if (ret) goto out; - rm->m_nents++; + rm->data.m_nents++; sg_off = 0; } @@ -348,7 +348,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, iov = first_iov; iov_off = 0; - sg = rm->m_sg; + sg = rm->data.m_sg; vec_off = 0; copied = 0; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index dee698b..24274bb 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -643,14 +643,14 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_rdma_op *op; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->m_rdma_op) + rm->rdma.m_rdma_op) return -EINVAL; op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); if (IS_ERR(op)) return PTR_ERR(op); rds_stats_inc(s_send_rdma); - rm->m_rdma_op = op; + rm->rdma.m_rdma_op = op; return 0; } @@ -679,6 +679,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); + spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) @@ -689,7 +690,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); - rm->m_rdma_mr = mr; + rm->rdma.m_rdma_mr = mr; } return err; } @@ -707,5 +708,5 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, rm->m_rdma_cookie != 0) return -EINVAL; - return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->m_rdma_mr); + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.m_rdma_mr); } diff --git a/net/rds/rds.h b/net/rds/rds.h index 1d3eef6..07a750b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -259,12 +259,18 @@ struct rds_message { */ spinlock_t m_rs_lock; struct rds_sock *m_rs; - struct rds_rdma_op *m_rdma_op; rds_rdma_cookie_t m_rdma_cookie; - struct rds_mr *m_rdma_mr; - unsigned int m_nents; - unsigned int m_count; - struct scatterlist m_sg[0]; + struct { + struct { + struct rds_rdma_op *m_rdma_op; + struct rds_mr *m_rdma_mr; + } rdma; + struct { + unsigned int m_nents; + unsigned int m_count; + struct scatterlist m_sg[0]; + } data; + }; }; /* diff --git a/net/rds/send.c b/net/rds/send.c index 817997d..19dfd02 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -166,7 +166,7 @@ int rds_send_xmit(struct rds_connection *conn) rm = conn->c_xmit_rm; if (rm && conn->c_xmit_hdr_off == sizeof(struct rds_header) && - conn->c_xmit_sg == rm->m_nents) { + conn->c_xmit_sg == rm->data.m_nents) { conn->c_xmit_rm = NULL; conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; @@ -236,7 +236,7 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->m_rdma_op && + if (rm->rdma.m_rdma_op && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) @@ -268,8 +268,8 @@ int rds_send_xmit(struct rds_connection *conn) * keep this simple and require that the transport either * send the whole rdma or none of it. */ - if (rm->m_rdma_op && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, rm->m_rdma_op); + if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) { + ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op); if (ret) break; conn->c_xmit_rdma_sent = 1; @@ -279,7 +279,7 @@ int rds_send_xmit(struct rds_connection *conn) } if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->m_nents) { + conn->c_xmit_sg < rm->data.m_nents) { ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, @@ -295,7 +295,7 @@ int rds_send_xmit(struct rds_connection *conn) ret -= tmp; } - sg = &rm->m_sg[conn->c_xmit_sg]; + sg = &rm->data.m_sg[conn->c_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - conn->c_xmit_data_off); @@ -306,7 +306,7 @@ int rds_send_xmit(struct rds_connection *conn) sg++; conn->c_xmit_sg++; BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->m_nents); + conn->c_xmit_sg == rm->data.m_nents); } } } @@ -419,7 +419,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = rm->m_rdma_op; + ro = rm->rdma.m_rdma_op; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && ro && ro->r_notify && ro->r_notifier) { notifier = ro->r_notifier; @@ -453,7 +453,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status { struct rds_rdma_op *ro; - ro = rm->m_rdma_op; + ro = rm->rdma.m_rdma_op; if (ro && ro->r_notify && ro->r_notifier) { ro->r_notifier->n_status = status; list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); @@ -477,7 +477,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (rm->m_rdma_op == op) { + if (rm->rdma.m_rdma_op == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -485,7 +485,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (rm->m_rdma_op == op) { + if (rm->rdma.m_rdma_op == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -545,7 +545,7 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = rm->m_rdma_op; + struct rds_rdma_op *ro = rm->rdma.m_rdma_op; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); @@ -557,7 +557,7 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->m_rdma_op->r_notifier = NULL; + rm->rdma.m_rdma_op->r_notifier = NULL; } was_on_sock = 1; rm->m_rs = NULL; @@ -874,11 +874,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->m_rdma_op) && + if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) && !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - rm->m_rdma_op, conn->c_trans->xmit_rdma); + rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index e5f6cce..d63aa35 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -166,21 +166,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - while (sg < rm->m_nents) { + while (sg < rm->data.m_nents) { ret = tc->t_sock->ops->sendpage(tc->t_sock, - sg_page(&rm->m_sg[sg]), - rm->m_sg[sg].offset + off, - rm->m_sg[sg].length - off, + sg_page(&rm->data.m_sg[sg]), + rm->data.m_sg[sg].offset + off, + rm->data.m_sg[sg].length - off, MSG_DONTWAIT|MSG_NOSIGNAL); - rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->m_sg[sg]), - rm->m_sg[sg].offset + off, rm->m_sg[sg].length - off, + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.m_sg[sg]), + rm->data.m_sg[sg].offset + off, rm->data.m_sg[sg].length - off, ret); if (ret <= 0) break; off += ret; done += ret; - if (off == rm->m_sg[sg].length) { + if (off == rm->data.m_sg[sg].length) { off = 0; sg++; } -- cgit v1.1 From 3ef13f3c22aaea28aff383cb0883481d24885456 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 12:37:17 -0800 Subject: RDS: cleanup/fix rds_rdma_unuse First, it looks to me like the atomic_inc is wrong. We should be decrementing refcount only once here, no? It's already being done by the mr_put() at the end. Second, simplify the logic a bit by bailing early (with a warning) if !mr. Signed-off-by: Andy Grover --- net/rds/rdma.c | 27 +++++++++++++++------------ 1 file changed, 15 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 24274bb..5011efa 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -414,27 +414,30 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); - if (mr && (mr->r_use_once || force)) { + if (!mr) { + printk(KERN_ERR "rds: trying to unuse MR with unknown r_key %u!\n", r_key); + spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); + return; + } + + if (mr->r_use_once || force) { rb_erase(&mr->r_rb_node, &rs->rs_rdma_keys); RB_CLEAR_NODE(&mr->r_rb_node); zot_me = 1; - } else if (mr) - atomic_inc(&mr->r_refcount); + } spin_unlock_irqrestore(&rs->rs_rdma_lock, flags); /* May have to issue a dma_sync on this memory region. * Note we could avoid this if the operation was a RDMA READ, * but at this point we can't tell. */ - if (mr) { - if (mr->r_trans->sync_mr) - mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); + if (mr->r_trans->sync_mr) + mr->r_trans->sync_mr(mr->r_trans_private, DMA_FROM_DEVICE); - /* If the MR was marked as invalidate, this will - * trigger an async flush. */ - if (zot_me) - rds_destroy_mr(mr); - rds_mr_put(mr); - } + /* If the MR was marked as invalidate, this will + * trigger an async flush. */ + if (zot_me) + rds_destroy_mr(mr); + rds_mr_put(mr); } void rds_rdma_free_op(struct rds_rdma_op *ro) -- cgit v1.1 From fc445084f185cdd877bec323bfe724a361e2292a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 12:56:06 -0800 Subject: RDS: Explicitly allocate rm in sendmsg() r_m_copy_from_user used to allocate the rm as well as kernel buffers for the data, and then copy the data in. Now, sendmsg() allocates the rm, although the data buffer alloc still happens in r_m_copy_from_user. SGs are still allocated with rm, but now r_m_alloc_sgs() is used to reserve them. This allows multiple SG lists to be allocated from the one rm -- this is important once we also want to alloc our rdma sgl from this pool. Signed-off-by: Andy Grover --- net/rds/message.c | 51 ++++++++++++++++++++++++++++++--------------------- net/rds/rds.h | 7 +++++-- net/rds/send.c | 31 +++++++++++++++++++++++++++---- 3 files changed, 62 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 4421d16..3498cbcc 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -214,17 +214,22 @@ int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 o } EXPORT_SYMBOL_GPL(rds_message_add_rdma_dest_extension); -struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp) +/* + * Each rds_message is allocated with extra space for the scatterlist entries + * rds ops will need. This is to minimize memory allocation count. Then, each rds op + * can grab SGs when initializing its part of the rds_message. + */ +struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) { struct rds_message *rm; - rm = kzalloc(sizeof(struct rds_message) + - (nents * sizeof(struct scatterlist)), gfp); + rm = kzalloc(sizeof(struct rds_message) + extra_len, gfp); if (!rm) goto out; - if (nents) - sg_init_table(rm->data.m_sg, nents); + rm->m_used_sgs = 0; + rm->m_total_sgs = extra_len / sizeof(struct scatterlist); + atomic_set(&rm->m_refcount, 1); INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); @@ -234,6 +239,23 @@ out: return rm; } +/* + * RDS ops use this to grab SG entries from the rm's sg pool. + */ +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) +{ + struct scatterlist *sg_first = (struct scatterlist *) &rm[1]; + struct scatterlist *sg_ret; + + WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + + sg_ret = &sg_first[rm->m_used_sgs]; + + rm->m_used_sgs += nents; + + return sg_ret; +} + struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len) { struct rds_message *rm; @@ -256,22 +278,15 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in return rm; } -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len) { unsigned long to_copy; unsigned long iov_off; unsigned long sg_off; - struct rds_message *rm; struct iovec *iov; struct scatterlist *sg; - int ret; - - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); - if (rm == NULL) { - ret = -ENOMEM; - goto out; - } + int ret = 0; rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); @@ -320,14 +335,8 @@ struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, sg++; } - ret = 0; out: - if (ret) { - if (rm) - rds_message_put(rm); - rm = ERR_PTR(ret); - } - return rm; + return ret; } int rds_message_inc_copy_to_user(struct rds_incoming *inc, diff --git a/net/rds/rds.h b/net/rds/rds.h index 07a750b..d29c71a 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -268,9 +268,11 @@ struct rds_message { struct { unsigned int m_nents; unsigned int m_count; - struct scatterlist m_sg[0]; + struct scatterlist *m_sg; } data; }; + unsigned int m_used_sgs; + unsigned int m_total_sgs; }; /* @@ -573,7 +575,8 @@ rds_conn_connecting(struct rds_connection *conn) /* message.c */ struct rds_message *rds_message_alloc(unsigned int nents, gfp_t gfp); -struct rds_message *rds_message_copy_from_user(struct iovec *first_iov, +struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents); +int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, size_t total_len); struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned int total_len); void rds_message_populate_header(struct rds_header *hdr, __be16 sport, diff --git a/net/rds/send.c b/net/rds/send.c index 19dfd02..28d0944 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -758,6 +758,19 @@ out: return *queued; } +/* + * rds_message is getting to be quite complicated, and we'd like to allocate + * it all in one go. This figures out how big it needs to be up front. + */ +static int rds_rm_size(struct msghdr *msg, int data_len) +{ + int size = 0; + + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + + return size; +} + static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, struct msghdr *msg, int *allocated_mr) { @@ -845,13 +858,23 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - rm = rds_message_copy_from_user(msg->msg_iov, payload_len); - if (IS_ERR(rm)) { - ret = PTR_ERR(rm); - rm = NULL; + /* size of rm including all sgs */ + ret = rds_rm_size(msg, payload_len); + if (ret < 0) + goto out; + + rm = rds_message_alloc(ret, GFP_KERNEL); + if (!rm) { + ret = -ENOMEM; goto out; } + rm->data.m_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + /* XXX fix this to not allocate memory */ + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; + rm->m_daddr = daddr; /* rds_conn_create has a spinlock that runs with IRQ off. -- cgit v1.1 From 21f79afa5fda2820671a8f64c3d0e43bb118053b Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 12:57:27 -0800 Subject: RDS: fold rdma.h into rds.h RDMA is now an intrinsic part of RDS, so it's easier to just have a single header. Signed-off-by: Andy Grover --- net/rds/af_rds.c | 1 - net/rds/connection.c | 1 - net/rds/ib_rdma.c | 1 - net/rds/ib_send.c | 1 - net/rds/iw_rdma.c | 1 - net/rds/iw_send.c | 1 - net/rds/message.c | 1 - net/rds/rdma.c | 2 +- net/rds/rdma.h | 85 ---------------------------------------------------- net/rds/rds.h | 76 ++++++++++++++++++++++++++++++++++++++++++++++ net/rds/recv.c | 1 - net/rds/send.c | 1 - 12 files changed, 77 insertions(+), 95 deletions(-) delete mode 100644 net/rds/rdma.h (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 63474e1..ef09340 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -39,7 +39,6 @@ #include #include "rds.h" -#include "rdma.h" /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); diff --git a/net/rds/connection.c b/net/rds/connection.c index 9c9afb5..88bcaf3 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -37,7 +37,6 @@ #include "rds.h" #include "loop.h" -#include "rdma.h" #define RDS_CONNECTION_HASH_BITS 12 #define RDS_CONNECTION_HASH_ENTRIES (1 << RDS_CONNECTION_HASH_BITS) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a92aebc..0f3b5a2 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -34,7 +34,6 @@ #include #include "rds.h" -#include "rdma.h" #include "ib.h" diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 5375020..575fce4 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -36,7 +36,6 @@ #include #include "rds.h" -#include "rdma.h" #include "ib.h" static void rds_ib_send_rdma_complete(struct rds_message *rm, diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 13dc186..4e152e2 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -34,7 +34,6 @@ #include #include "rds.h" -#include "rdma.h" #include "iw.h" diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index c187e8f..62234b8 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -36,7 +36,6 @@ #include #include "rds.h" -#include "rdma.h" #include "iw.h" static void rds_iw_send_rdma_complete(struct rds_message *rm, diff --git a/net/rds/message.c b/net/rds/message.c index 3498cbcc..fb382fb 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -34,7 +34,6 @@ #include #include "rds.h" -#include "rdma.h" static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 5011efa..a21edad 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -35,7 +35,7 @@ #include #include /* for DMA_*_DEVICE */ -#include "rdma.h" +#include "rds.h" /* * XXX diff --git a/net/rds/rdma.h b/net/rds/rdma.h deleted file mode 100644 index 909c398..0000000 --- a/net/rds/rdma.h +++ /dev/null @@ -1,85 +0,0 @@ -#ifndef _RDS_RDMA_H -#define _RDS_RDMA_H - -#include -#include -#include - -#include "rds.h" - -struct rds_mr { - struct rb_node r_rb_node; - atomic_t r_refcount; - u32 r_key; - - /* A copy of the creation flags */ - unsigned int r_use_once:1; - unsigned int r_invalidate:1; - unsigned int r_write:1; - - /* This is for RDS_MR_DEAD. - * It would be nice & consistent to make this part of the above - * bit field here, but we need to use test_and_set_bit. - */ - unsigned long r_state; - struct rds_sock *r_sock; /* back pointer to the socket that owns us */ - struct rds_transport *r_trans; - void *r_trans_private; -}; - -/* Flags for mr->r_state */ -#define RDS_MR_DEAD 0 - -struct rds_rdma_op { - u32 r_key; - u64 r_remote_addr; - unsigned int r_write:1; - unsigned int r_fence:1; - unsigned int r_notify:1; - unsigned int r_recverr:1; - unsigned int r_mapped:1; - struct rds_notifier *r_notifier; - unsigned int r_bytes; - unsigned int r_nents; - unsigned int r_count; - struct scatterlist r_sg[0]; -}; - -static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) -{ - return r_key | (((u64) offset) << 32); -} - -static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) -{ - return cookie; -} - -static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) -{ - return cookie >> 32; -} - -int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); -int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); -int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); -void rds_rdma_drop_keys(struct rds_sock *rs); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg); -void rds_rdma_free_op(struct rds_rdma_op *ro); -void rds_rdma_send_complete(struct rds_message *rm, int); - -extern void __rds_put_mr_final(struct rds_mr *mr); -static inline void rds_mr_put(struct rds_mr *mr) -{ - if (atomic_dec_and_test(&mr->r_refcount)) - __rds_put_mr_final(mr); -} - -#endif diff --git a/net/rds/rds.h b/net/rds/rds.h index d29c71a..7c4adbe 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -206,6 +206,60 @@ struct rds_incoming { rds_rdma_cookie_t i_rdma_cookie; }; +struct rds_mr { + struct rb_node r_rb_node; + atomic_t r_refcount; + u32 r_key; + + /* A copy of the creation flags */ + unsigned int r_use_once:1; + unsigned int r_invalidate:1; + unsigned int r_write:1; + + /* This is for RDS_MR_DEAD. + * It would be nice & consistent to make this part of the above + * bit field here, but we need to use test_and_set_bit. + */ + unsigned long r_state; + struct rds_sock *r_sock; /* back pointer to the socket that owns us */ + struct rds_transport *r_trans; + void *r_trans_private; +}; + +/* Flags for mr->r_state */ +#define RDS_MR_DEAD 0 + +struct rds_rdma_op { + u32 r_key; + u64 r_remote_addr; + unsigned int r_write:1; + unsigned int r_fence:1; + unsigned int r_notify:1; + unsigned int r_recverr:1; + unsigned int r_mapped:1; + unsigned int r_active:1; + struct rds_notifier *r_notifier; + unsigned int r_bytes; + unsigned int r_nents; + unsigned int r_count; + struct scatterlist *r_sg; +}; + +static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) +{ + return r_key | (((u64) offset) << 32); +} + +static inline u32 rds_rdma_cookie_key(rds_rdma_cookie_t cookie) +{ + return cookie; +} + +static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) +{ + return cookie >> 32; +} + /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty @@ -654,6 +708,28 @@ struct rds_message *rds_send_get_message(struct rds_connection *, /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); +int rds_get_mr(struct rds_sock *rs, char __user *optval, int optlen); +int rds_get_mr_for_dest(struct rds_sock *rs, char __user *optval, int optlen); +int rds_free_mr(struct rds_sock *rs, char __user *optval, int optlen); +void rds_rdma_drop_keys(struct rds_sock *rs); +int rds_rdma_extra_size(struct rds_rdma_args *args); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); +void rds_rdma_free_op(struct rds_rdma_op *ro); +void rds_rdma_send_complete(struct rds_message *rm, int); + +extern void __rds_put_mr_final(struct rds_mr *mr); +static inline void rds_mr_put(struct rds_mr *mr) +{ + if (atomic_dec_and_test(&mr->r_refcount)) + __rds_put_mr_final(mr); +} /* stats.c */ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); diff --git a/net/rds/recv.c b/net/rds/recv.c index 88f1f5a..5188763 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -36,7 +36,6 @@ #include #include "rds.h" -#include "rdma.h" void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr) diff --git a/net/rds/send.c b/net/rds/send.c index 28d0944..89e26ff 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -37,7 +37,6 @@ #include #include "rds.h" -#include "rdma.h" /* When transmitting messages in rds_send_xmit, we need to emerge from * time to time and briefly release the CPU. Otherwise the softlock watchdog -- cgit v1.1 From ff87e97a9d70c9ae133d3d3d7792b26ab85f4297 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:13:15 -0800 Subject: RDS: make m_rdma_op a member of rds_message This eliminates a separate memory alloc, although it is now necessary to add an "r_active" flag, since it is no longer to use the m_rdma_op pointer as an indicator of if an rdma op is present. rdma SGs allocated from rm sg pool. rds_rm_size also gets bigger. It's a little inefficient to run through CMSGs twice, but it makes later steps a lot smoother. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 20 +++++----- net/rds/iw_send.c | 16 ++++---- net/rds/message.c | 9 +++-- net/rds/rdma.c | 113 +++++++++++++++++++++++++++++------------------------- net/rds/rds.h | 2 +- net/rds/send.c | 59 ++++++++++++++++++++-------- 6 files changed, 129 insertions(+), 90 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 575fce4..f0edfdb 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -85,8 +85,8 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->rdma.m_rdma_op) { - rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) { + rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -110,10 +110,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, */ rds_ib_send_rdma_complete(rm, wc_status); - if (rm->rdma.m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + if (rm->rdma.m_rdma_op.r_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -243,8 +243,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rm = rds_send_get_message(conn, send->s_op); if (rm) { - if (rm->rdma.m_rdma_op) - rds_ib_send_unmap_rdma(ic, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) + rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); rds_ib_send_rdma_complete(rm, wc.status); rds_message_put(rm); } @@ -560,10 +560,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.m_rdma_op) { + if (rm->rdma.m_rdma_op.r_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -601,7 +601,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence) + if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) send_flags = IB_SEND_FENCE; /* diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 62234b8..9b79a1b 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -85,8 +85,8 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->rdma.m_rdma_op) { - rds_iw_send_unmap_rdma(ic, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -110,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->rdma.m_rdma_op->r_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + if (rm->rdma.m_rdma_op.r_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op->r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.m_rdma_op) { + if (rm->rdma.m_rdma_op.r_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op->r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->rdma.m_rdma_op && rm->rdma.m_rdma_op->r_fence) + if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) send_flags = IB_SEND_FENCE; /* diff --git a/net/rds/message.c b/net/rds/message.c index fb382fb..4352ce7 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -69,8 +69,8 @@ static void rds_message_purge(struct rds_message *rm) } rm->data.m_nents = 0; - if (rm->rdma.m_rdma_op) - rds_rdma_free_op(rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active) + rds_rdma_free_op(&rm->rdma.m_rdma_op); if (rm->rdma.m_rdma_mr) rds_mr_put(rm->rdma.m_rdma_mr); } @@ -259,14 +259,17 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in { struct rds_message *rm; unsigned int i; + int num_sgs = ceil(total_len, PAGE_SIZE); + int extra_bytes = num_sgs * sizeof(struct scatterlist); - rm = rds_message_alloc(ceil(total_len, PAGE_SIZE), GFP_KERNEL); + rm = rds_message_alloc(extra_bytes, GFP_KERNEL); if (!rm) return ERR_PTR(-ENOMEM); set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); rm->data.m_nents = ceil(total_len, PAGE_SIZE); + rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs); for (i = 0; i < rm->data.m_nents; ++i) { sg_set_page(&rm->data.m_sg[i], diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a21edad..7ff3379 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -458,26 +458,60 @@ void rds_rdma_free_op(struct rds_rdma_op *ro) } kfree(ro->r_notifier); - kfree(ro); + ro->r_notifier = NULL; + ro->r_active = 0; +} + +/* + * Count the number of pages needed to describe an incoming iovec. + */ +static int rds_rdma_pages(struct rds_rdma_args *args) +{ + struct rds_iovec vec; + struct rds_iovec __user *local_vec; + unsigned int tot_pages = 0; + unsigned int nr_pages; + unsigned int i; + + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + + /* figure out the number of pages in the vector */ + for (i = 0; i < args->nr_local; i++) { + if (copy_from_user(&vec, &local_vec[i], + sizeof(struct rds_iovec))) + return -EFAULT; + + nr_pages = rds_pages_in_vec(&vec); + if (nr_pages == 0) + return -EINVAL; + + tot_pages += nr_pages; + } + + return tot_pages; +} + +int rds_rdma_extra_size(struct rds_rdma_args *args) +{ + return rds_rdma_pages(args) * sizeof(struct scatterlist); } /* * args is a pointer to an in-kernel copy in the sendmsg cmsg. */ -static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, - struct rds_rdma_args *args) +static int rds_rdma_prepare(struct rds_message *rm, + struct rds_sock *rs, + struct rds_rdma_args *args) { struct rds_iovec vec; - struct rds_rdma_op *op = NULL; + struct rds_rdma_op *op = &rm->rdma.m_rdma_op; unsigned int nr_pages; - unsigned int max_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec __user *local_vec; - struct scatterlist *sg; unsigned int nr; unsigned int i, j; - int ret; + int ret = 0; if (rs->rs_bound_addr == 0) { @@ -490,44 +524,21 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, goto out; } - nr_pages = 0; - max_pages = 0; - - local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; - - /* figure out the number of pages in the vector */ - for (i = 0; i < args->nr_local; i++) { - if (copy_from_user(&vec, &local_vec[i], - sizeof(struct rds_iovec))) { - ret = -EFAULT; - goto out; - } - - nr = rds_pages_in_vec(&vec); - if (nr == 0) { - ret = -EINVAL; - goto out; - } - - max_pages = max(nr, max_pages); - nr_pages += nr; - } - - pages = kcalloc(max_pages, sizeof(struct page *), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; + nr_pages = rds_rdma_pages(args); + if (nr_pages < 0) goto out; - } - op = kzalloc(offsetof(struct rds_rdma_op, r_sg[nr_pages]), GFP_KERNEL); - if (!op) { + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + if (!pages) { ret = -ENOMEM; goto out; } + op->r_sg = rds_message_alloc_sgs(rm, nr_pages); op->r_write = !!(args->flags & RDS_RDMA_READWRITE); op->r_fence = !!(args->flags & RDS_RDMA_FENCE); op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->r_active = 1; op->r_recverr = rs->rs_recverr; WARN_ON(!nr_pages); sg_init_table(op->r_sg, nr_pages); @@ -564,6 +575,8 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, (unsigned long long)args->remote_vec.addr, op->r_key); + local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; + for (i = 0; i < args->nr_local; i++) { if (copy_from_user(&vec, &local_vec[i], sizeof(struct rds_iovec))) { @@ -580,11 +593,6 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, rs->rs_user_addr = vec.addr; rs->rs_user_bytes = vec.bytes; - /* did the user change the vec under us? */ - if (nr > max_pages || op->r_nents + nr > nr_pages) { - ret = -EINVAL; - goto out; - } /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ @@ -599,6 +607,7 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, for (j = 0; j < nr; j++) { unsigned int offset = vec.addr & ~PAGE_MASK; + struct scatterlist *sg; sg = &op->r_sg[op->r_nents + j]; sg_set_page(sg, pages[j], @@ -628,12 +637,10 @@ static struct rds_rdma_op *rds_rdma_prepare(struct rds_sock *rs, ret = 0; out: kfree(pages); - if (ret) { - if (op) - rds_rdma_free_op(op); - op = ERR_PTR(ret); - } - return op; + if (ret) + rds_rdma_free_op(op); + + return ret; } /* @@ -643,17 +650,17 @@ out: int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg) { - struct rds_rdma_op *op; + int ret; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->rdma.m_rdma_op) + rm->rdma.m_rdma_op.r_active) return -EINVAL; - op = rds_rdma_prepare(rs, CMSG_DATA(cmsg)); - if (IS_ERR(op)) - return PTR_ERR(op); + ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg)); + if (ret) + return ret; + rds_stats_inc(s_send_rdma); - rm->rdma.m_rdma_op = op; return 0; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 7c4adbe..0bb4957 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -316,7 +316,7 @@ struct rds_message { rds_rdma_cookie_t m_rdma_cookie; struct { struct { - struct rds_rdma_op *m_rdma_op; + struct rds_rdma_op m_rdma_op; struct rds_mr *m_rdma_mr; } rdma; struct { diff --git a/net/rds/send.c b/net/rds/send.c index 89e26ff..72dbe7f 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -235,7 +235,7 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->rdma.m_rdma_op && + if (rm->rdma.m_rdma_op.r_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) @@ -267,8 +267,8 @@ int rds_send_xmit(struct rds_connection *conn) * keep this simple and require that the transport either * send the whole rdma or none of it. */ - if (rm->rdma.m_rdma_op && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, rm->rdma.m_rdma_op); + if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) { + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op); if (ret) break; conn->c_xmit_rdma_sent = 1; @@ -418,9 +418,9 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = rm->rdma.m_rdma_op; + ro = &rm->rdma.m_rdma_op; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro && ro->r_notify && ro->r_notifier) { + ro->r_active && ro->r_notify && ro->r_notifier) { notifier = ro->r_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -452,8 +452,8 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status { struct rds_rdma_op *ro; - ro = rm->rdma.m_rdma_op; - if (ro && ro->r_notify && ro->r_notifier) { + ro = &rm->rdma.m_rdma_op; + if (ro->r_active && ro->r_notify && ro->r_notifier) { ro->r_notifier->n_status = status; list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); ro->r_notifier = NULL; @@ -476,7 +476,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (rm->rdma.m_rdma_op == op) { + if (&rm->rdma.m_rdma_op == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -484,7 +484,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (rm->rdma.m_rdma_op == op) { + if (&rm->rdma.m_rdma_op == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -544,19 +544,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = rm->rdma.m_rdma_op; + struct rds_rdma_op *ro = &rm->rdma.m_rdma_op; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro && ro->r_notifier && (status || ro->r_notify)) { + if (ro->r_active && ro->r_notifier && + (status || ro->r_notify)) { notifier = ro->r_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->rdma.m_rdma_op->r_notifier = NULL; + rm->rdma.m_rdma_op.r_notifier = NULL; } was_on_sock = 1; rm->m_rs = NULL; @@ -763,9 +764,37 @@ out: */ static int rds_rm_size(struct msghdr *msg, int data_len) { + struct cmsghdr *cmsg; int size = 0; + int retval; + + for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { + if (!CMSG_OK(msg, cmsg)) + return -EINVAL; + + if (cmsg->cmsg_level != SOL_RDS) + continue; + + switch (cmsg->cmsg_type) { + case RDS_CMSG_RDMA_ARGS: + retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); + if (retval < 0) + return retval; + size += retval; + break; + + case RDS_CMSG_RDMA_DEST: + case RDS_CMSG_RDMA_MAP: + /* these are valid but do no add any size */ + break; + + default: + return -EINVAL; + } + + } - size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); return size; } @@ -896,11 +925,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op) && + if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) && !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); + &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } -- cgit v1.1 From f4dd96f7b27743e568cec519eff0f951c56833c6 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:17:31 -0800 Subject: RDS: make sure all sgs alloced are initialized rds_message_alloc_sgs() now returns correctly-initialized sg lists, so calleds need not do this themselves. Signed-off-by: Andy Grover --- net/rds/message.c | 2 +- net/rds/rdma.c | 3 +-- 2 files changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 4352ce7..7d678e2 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -249,7 +249,7 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); sg_ret = &sg_first[rm->m_used_sgs]; - + sg_init_table(sg_ret, nents); rm->m_used_sgs += nents; return sg_ret; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 7ff3379..4fda330 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -534,14 +534,13 @@ static int rds_rdma_prepare(struct rds_message *rm, goto out; } - op->r_sg = rds_message_alloc_sgs(rm, nr_pages); op->r_write = !!(args->flags & RDS_RDMA_READWRITE); op->r_fence = !!(args->flags & RDS_RDMA_FENCE); op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); op->r_active = 1; op->r_recverr = rs->rs_recverr; WARN_ON(!nr_pages); - sg_init_table(op->r_sg, nr_pages); + op->r_sg = rds_message_alloc_sgs(rm, nr_pages); if (op->r_notify || op->r_recverr) { /* We allocate an uninitialized notifier here, because -- cgit v1.1 From a63273d4992603979ddb181b6a8f07082839b39f Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:19:32 -0800 Subject: RDS: Clear up some confusing code in send_remove_from_sock The previous code was correct, but made the assumption that if r_notifier was non-NULL then either r_recverr or r_notify was true. Valid, but fragile. Changed to explicitly check r_recverr (shows up in greps for recverr now, too.) Signed-off-by: Andy Grover --- net/rds/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 72dbe7f..b751a8e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -551,7 +551,7 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) rds_send_sndbuf_remove(rs, rm); if (ro->r_active && ro->r_notifier && - (status || ro->r_notify)) { + (ro->r_notify || (ro->r_recverr && status))) { notifier = ro->r_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); -- cgit v1.1 From 15133f6e67d8d646d0744336b4daa3135452cb0d Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:33:38 -0800 Subject: RDS: Implement atomic operations Implement a CMSG-based interface to do FADD and CSWP ops. Alter send routines to handle atomic ops. Add atomic counters to stats. Add xmit_atomic() to struct rds_transport Inline rds_ib_send_unmap_rdma into unmap_rm Signed-off-by: Andy Grover --- net/rds/ib.c | 1 + net/rds/ib.h | 1 + net/rds/ib_rdma.c | 4 +- net/rds/ib_send.c | 140 +++++++++++++++++++++++++++++++++++++++++++++++------- net/rds/rdma.c | 73 ++++++++++++++++++++++++++++ net/rds/rds.h | 33 +++++++++++-- net/rds/send.c | 71 +++++++++++++++++++++++++-- net/rds/stats.c | 2 + 8 files changed, 302 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 8f2d6dd..f0d2965 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -264,6 +264,7 @@ struct rds_transport rds_ib_transport = { .xmit = rds_ib_xmit, .xmit_cong_map = NULL, .xmit_rdma = rds_ib_xmit_rdma, + .xmit_atomic = rds_ib_xmit_atomic, .recv = rds_ib_recv, .conn_alloc = rds_ib_conn_alloc, .conn_free = rds_ib_conn_free, diff --git a/net/rds/ib.h b/net/rds/ib.h index 64df4e7..d2fd0aa 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -336,6 +336,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, u32 *adv_credits, int need_posted, int max_posted); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0f3b5a2..242231f 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -298,7 +298,9 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | - IB_ACCESS_REMOTE_WRITE), + IB_ACCESS_REMOTE_WRITE| + IB_ACCESS_REMOTE_ATOMIC), + &pool->fmr_attr); if (IS_ERR(ibmr->fmr)) { err = PTR_ERR(ibmr->fmr); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index f0edfdb..b2bd164 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -62,15 +62,17 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, rds_rdma_send_complete(rm, notify_status); } -static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, - struct rds_rdma_op *op) +static void rds_ib_send_atomic_complete(struct rds_message *rm, + int wc_status) { - if (op->r_mapped) { - ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; - } + int notify_status; + + if (wc_status != IB_WC_SUCCESS) + notify_status = RDS_RDMA_OTHER_ERROR; + else + notify_status = RDS_RDMA_SUCCESS; + + rds_atomic_send_complete(rm, notify_status); } static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, @@ -86,7 +88,14 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, DMA_TO_DEVICE); if (rm->rdma.m_rdma_op.r_active) { - rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); + struct rds_rdma_op *op = &rm->rdma.m_rdma_op; + + if (op->r_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->r_sg, op->r_nents, + op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->r_mapped = 0; + } /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -116,6 +125,24 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); } + if (rm->atomic.op_active) { + struct rm_atomic_op *op = &rm->atomic; + + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; + } + + rds_ib_send_atomic_complete(rm, wc_status); + + if (rm->atomic.op_type == RDS_ATOMIC_TYPE_CSWP) + rds_stats_inc(s_atomic_cswp); + else + rds_stats_inc(s_atomic_fadd); + } + /* If anyone waited for this message to get flushed out, wake * them up now */ rds_message_unmapped(rm); @@ -158,12 +185,9 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) u32 i; for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - if (send->s_wr.opcode == 0xdead) + if (!send->s_rm || send->s_wr.opcode == 0xdead) continue; - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); - if (send->s_op) - rds_ib_send_unmap_rdma(ic, send->s_op); + rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); } } @@ -218,6 +242,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) break; case IB_WR_RDMA_WRITE: case IB_WR_RDMA_READ: + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: /* Nothing to be done - the SG list will be unmapped * when the SEND completes. */ break; @@ -243,8 +269,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rm = rds_send_get_message(conn, send->s_op); if (rm) { - if (rm->rdma.m_rdma_op.r_active) - rds_ib_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); + rds_ib_send_unmap_rm(ic, send, wc.status); rds_ib_send_rdma_complete(rm, wc.status); rds_message_put(rm); } @@ -736,6 +761,89 @@ out: return ret; } +/* + * Issue atomic operation. + * A simplified version of the rdma case, we always map 1 SG, and + * only 8 bytes, for the return value from the atomic operation. + */ +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +{ + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_send_work *send = NULL; + struct ib_send_wr *failed_wr; + struct rds_ib_device *rds_ibdev; + u32 pos; + u32 work_alloc; + int ret; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, 1, &pos); + if (work_alloc != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_ring_full); + ret = -ENOMEM; + goto out; + } + + /* address of send request in ring */ + send = &ic->i_sends[pos]; + send->s_queued = jiffies; + + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { + send->s_wr.opcode = IB_WR_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_compare; + send->s_wr.wr.atomic.swap = op->op_swap_add; + } else { /* FADD */ + send->s_wr.opcode = IB_WR_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_swap_add; + send->s_wr.wr.atomic.swap = 0; + } + send->s_wr.send_flags = IB_SEND_SIGNALED; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; + send->s_wr.wr.atomic.rkey = op->op_rkey; + + /* map 8 byte retval buffer to the device */ + ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); + rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); + if (ret != 1) { + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); + ret = -ENOMEM; /* XXX ? */ + goto out; + } + + /* Convert our struct scatterlist to struct ib_sge */ + send->s_sge[0].addr = ib_sg_dma_address(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].length = ib_sg_dma_len(ic->i_cm_id->device, op->op_sg); + send->s_sge[0].lkey = ic->i_mr->lkey; + + rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, + send->s_sge[0].addr, send->s_sge[0].length); + + failed_wr = &send->s_wr; + ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); + rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, + send, &send->s_wr, ret, failed_wr); + BUG_ON(failed_wr != &send->s_wr); + if (ret) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " + "returned %d\n", &conn->c_faddr, ret); + rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + goto out; + } + + if (unlikely(failed_wr != &send->s_wr)) { + printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret); + BUG_ON(failed_wr != &send->s_wr); + } + +out: + return ret; +} + int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) { struct rds_ib_connection *ic = conn->c_transport_data; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4fda330..a7019df 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -719,3 +719,76 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.m_rdma_mr); } + +/* + * Fill in rds_message for an atomic request. + */ +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) +{ + struct page *page = NULL; + struct rds_atomic_args *args; + int ret = 0; + + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_atomic_args)) + || rm->atomic.op_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); + + if (cmsg->cmsg_type == RDS_CMSG_ATOMIC_CSWP) { + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_swap_add = args->cswp.swap; + rm->atomic.op_compare = args->cswp.compare; + } else { + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_swap_add = args->fadd.add; + } + + rm->m_rdma_cookie = args->cookie; + rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_recverr = rs->rs_recverr; + rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); + + /* verify 8 byte-aligned */ + if (args->local_addr & 0x7) { + ret = -EFAULT; + goto err; + } + + ret = rds_pin_pages(args->local_addr, 1, &page, 1); + if (ret != 1) + goto err; + ret = 0; + + sg_set_page(rm->atomic.op_sg, page, 8, offset_in_page(args->local_addr)); + + if (rm->atomic.op_notify || rm->atomic.op_recverr) { + /* We allocate an uninitialized notifier here, because + * we don't want to do that in the completion handler. We + * would have to use GFP_ATOMIC there, and don't want to deal + * with failed allocations. + */ + rm->atomic.op_notifier = kmalloc(sizeof(*rm->atomic.op_notifier), GFP_KERNEL); + if (!rm->atomic.op_notifier) { + ret = -ENOMEM; + goto err; + } + + rm->atomic.op_notifier->n_user_token = args->user_token; + rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; + } + + rm->atomic.op_rkey = rds_rdma_cookie_key(rm->m_rdma_cookie); + rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); + + rm->atomic.op_active = 1; + + return ret; +err: + if (page) + put_page(page); + kfree(rm->atomic.op_notifier); + + return ret; +} diff --git a/net/rds/rds.h b/net/rds/rds.h index 0bb4957..830e2bb 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -97,6 +97,7 @@ struct rds_connection { unsigned int c_xmit_hdr_off; unsigned int c_xmit_data_off; unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_atomic_sent; spinlock_t c_lock; /* protect msg queues */ u64 c_next_tx_seq; @@ -260,6 +261,10 @@ static inline u32 rds_rdma_cookie_offset(rds_rdma_cookie_t cookie) return cookie >> 32; } +/* atomic operation types */ +#define RDS_ATOMIC_TYPE_CSWP 0 +#define RDS_ATOMIC_TYPE_FADD 1 + /* * m_sock_item and m_conn_item are on lists that are serialized under * conn->c_lock. m_sock_item has additional meaning in that once it is empty @@ -315,11 +320,27 @@ struct rds_message { struct rds_sock *m_rs; rds_rdma_cookie_t m_rdma_cookie; struct { - struct { + struct rm_atomic_op { + int op_type; + uint64_t op_swap_add; + uint64_t op_compare; + + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_active:1; + struct rds_notifier *op_notifier; + struct scatterlist *op_sg; + + struct rds_mr *op_rdma_mr; + } atomic; + struct rm_rdma_op { struct rds_rdma_op m_rdma_op; struct rds_mr *m_rdma_mr; } rdma; - struct { + struct rm_data_op { unsigned int m_nents; unsigned int m_count; struct scatterlist *m_sg; @@ -397,6 +418,7 @@ struct rds_transport { int (*xmit_cong_map)(struct rds_connection *conn, struct rds_cong_map *map, unsigned long offset); int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); @@ -546,6 +568,8 @@ struct rds_statistics { uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; + uint64_t s_atomic_cswp; + uint64_t s_atomic_fadd; }; /* af_rds.c */ @@ -722,7 +746,10 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rds_rdma_op *ro); -void rds_rdma_send_complete(struct rds_message *rm, int); +void rds_rdma_send_complete(struct rds_message *rm, int wc_status); +void rds_atomic_send_complete(struct rds_message *rm, int wc_status); +int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg); extern void __rds_put_mr_final(struct rds_mr *mr); static inline void rds_mr_put(struct rds_mr *mr) diff --git a/net/rds/send.c b/net/rds/send.c index b751a8e..f3f4e79 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -73,6 +73,7 @@ void rds_send_reset(struct rds_connection *conn) conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; conn->c_map_queued = 0; @@ -171,6 +172,7 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; /* Release the reference to the previous message. */ rds_message_put(rm); @@ -262,6 +264,17 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_rm = rm; } + + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + if (ret) + break; + conn->c_xmit_atomic_sent = 1; + /* The transport owns the mapped memory for now. + * You can't unmap it while it's on the send queue */ + set_bit(RDS_MSG_MAPPED, &rm->m_flags); + } + /* * Try and send an rdma message. Let's see if we can * keep this simple and require that the transport either @@ -443,6 +456,41 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) EXPORT_SYMBOL_GPL(rds_rdma_send_complete); /* + * Just like above, except looks at atomic op + */ +void rds_atomic_send_complete(struct rds_message *rm, int status) +{ + struct rds_sock *rs = NULL; + struct rm_atomic_op *ao; + struct rds_notifier *notifier; + + spin_lock(&rm->m_rs_lock); + + ao = &rm->atomic; + if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) + && ao->op_active && ao->op_notify && ao->op_notifier) { + notifier = ao->op_notifier; + rs = rm->m_rs; + sock_hold(rds_rs_to_sk(rs)); + + notifier->n_status = status; + spin_lock(&rs->rs_lock); + list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); + spin_unlock(&rs->rs_lock); + + ao->op_notifier = NULL; + } + + spin_unlock(&rm->m_rs_lock); + + if (rs) { + rds_wake_sk_sleep(rs); + sock_put(rds_rs_to_sk(rs)); + } +} +EXPORT_SYMBOL_GPL(rds_atomic_send_complete); + +/* * This is the same as rds_rdma_send_complete except we * don't do any locking - we have all the ingredients (message, * socket, socket lock) and can just move the notifier. @@ -788,6 +836,11 @@ static int rds_rm_size(struct msghdr *msg, int data_len) /* these are valid but do no add any size */ break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + size += sizeof(struct scatterlist); + break; + default: return -EINVAL; } @@ -813,7 +866,7 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, continue; /* As a side effect, RDMA_DEST and RDMA_MAP will set - * rm->m_rdma_cookie and rm->m_rdma_mr. + * rm->rdma.m_rdma_cookie and rm->rdma.m_rdma_mr. */ switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: @@ -829,6 +882,10 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, if (!ret) *allocated_mr = 1; break; + case RDS_CMSG_ATOMIC_CSWP: + case RDS_CMSG_ATOMIC_FADD: + ret = rds_cmsg_atomic(rs, rm, cmsg); + break; default: return -EINVAL; @@ -926,10 +983,18 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) && - !conn->c_trans->xmit_rdma) { + !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); + &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); + ret = -EOPNOTSUPP; + goto out; + } + + if (rm->atomic.op_active && !conn->c_trans->xmit_atomic) { + if (printk_ratelimit()) + printk(KERN_NOTICE "atomic_op %p conn xmit_atomic %p\n", + &rm->atomic, conn->c_trans->xmit_atomic); ret = -EOPNOTSUPP; goto out; } diff --git a/net/rds/stats.c b/net/rds/stats.c index 7598eb0..c66d95d 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -75,6 +75,8 @@ static const char *const rds_stat_names[] = { "cong_update_received", "cong_send_error", "cong_send_blocked", + "s_atomic_cswp", + "s_atomic_fadd", }; void rds_stats_info_copy(struct rds_info_iterator *iter, -- cgit v1.1 From 40589e74f7ba855f3a887c9d4abe9d100c5b039c Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 10:50:48 -0800 Subject: RDS: Base init_depth and responder_resources on hw values Instead of using a constant for initiator_depth and responder_resources, read the per-QP values when the device is enumerated, and then use these values when creating the connection. Signed-off-by: Andy Grover --- net/rds/ib.c | 3 +++ net/rds/ib.h | 2 ++ net/rds/ib_cm.c | 27 ++++++++++++++++++--------- net/rds/rdma.c | 3 +-- 4 files changed, 24 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index f0d2965..72a5116 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -91,6 +91,9 @@ void rds_ib_add_one(struct ib_device *device) min_t(unsigned int, dev_attr->max_fmr, fmr_pool_size) : fmr_pool_size; + rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); if (IS_ERR(rds_ibdev->pd)) diff --git a/net/rds/ib.h b/net/rds/ib.h index d2fd0aa..a303f13 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -164,6 +164,8 @@ struct rds_ib_device { unsigned int max_fmrs; int max_sge; unsigned int max_wrs; + unsigned int max_initiator_depth; + unsigned int max_responder_resources; spinlock_t spinlock; /* protect the above */ }; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index b46bc2f..3134336 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -153,18 +153,25 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, struct rdma_conn_param *conn_param, struct rds_ib_connect_private *dp, - u32 protocol_version) + u32 protocol_version, + u32 max_responder_resources, + u32 max_initiator_depth) { + struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_ib_device *rds_ibdev; + memset(conn_param, 0, sizeof(struct rdma_conn_param)); - /* XXX tune these? */ - conn_param->responder_resources = 1; - conn_param->initiator_depth = 1; + + rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + + conn_param->responder_resources = + min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); + conn_param->initiator_depth = + min_t(u32, rds_ibdev->max_initiator_depth, max_initiator_depth); conn_param->retry_count = min_t(unsigned int, rds_ib_retry_count, 7); conn_param->rnr_retry_count = 7; if (dp) { - struct rds_ib_connection *ic = conn->c_transport_data; - memset(dp, 0, sizeof(*dp)); dp->dp_saddr = conn->c_laddr; dp->dp_daddr = conn->c_faddr; @@ -479,7 +486,9 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version); + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp_rep, version, + event->param.conn.responder_resources, + event->param.conn.initiator_depth); /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); @@ -516,8 +525,8 @@ int rds_ib_cm_initiate_connect(struct rdma_cm_id *cm_id) goto out; } - rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION); - + rds_ib_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION, + UINT_MAX, UINT_MAX); ret = rdma_connect(cm_id, &conn_param); if (ret) rds_ib_conn_error(conn, "rdma_connect failed (%d)\n", ret); diff --git a/net/rds/rdma.c b/net/rds/rdma.c index a7019df..abbc297 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -745,7 +745,6 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, rm->atomic.op_swap_add = args->fadd.add; } - rm->m_rdma_cookie = args->cookie; rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); @@ -779,7 +778,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, rm->atomic.op_notifier->n_status = RDS_RDMA_SUCCESS; } - rm->atomic.op_rkey = rds_rdma_cookie_key(rm->m_rdma_cookie); + rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); rm->atomic.op_active = 1; -- cgit v1.1 From d22faec22c2ab2364fd8fc3c8159b0b5b28b0fd1 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 10:52:28 -0800 Subject: RDS: Do not mask address when pinning pages This does not appear to be necessary. Signed-off-by: Andy Grover --- net/rds/rdma.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index abbc297..1929cb8 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -238,7 +238,7 @@ static int __rds_rdma_map(struct rds_sock *rs, struct rds_get_mr_args *args, * r/o or r/w. We need to assume r/w, or we'll do a lot of RDMA to * the zero page. */ - ret = rds_pin_pages(args->vec.addr & PAGE_MASK, nr_pages, pages, 1); + ret = rds_pin_pages(args->vec.addr, nr_pages, pages, 1); if (ret < 0) goto out; @@ -595,7 +595,7 @@ static int rds_rdma_prepare(struct rds_message *rm, /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(vec.addr & PAGE_MASK, nr, pages, !op->r_write); + ret = rds_pin_pages(vec.addr, nr, pages, !op->r_write); if (ret < 0) goto out; -- cgit v1.1 From 6200ed7799d9225f363f157ab61f1566cfd80e19 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 10:53:05 -0800 Subject: RDS: Whitespace Tidy up some whitespace issues. Signed-off-by: Andy Grover --- net/rds/rdma.c | 1 - net/rds/recv.c | 4 ++-- net/rds/send.c | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 1929cb8..4df545a 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -688,7 +688,6 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, */ r_key = rds_rdma_cookie_key(rm->m_rdma_cookie); - spin_lock_irqsave(&rs->rs_rdma_lock, flags); mr = rds_mr_tree_walk(&rs->rs_rdma_keys, r_key, NULL); if (!mr) diff --git a/net/rds/recv.c b/net/rds/recv.c index 5188763..68800f0 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -333,10 +333,10 @@ int rds_notify_queue_get(struct rds_sock *rs, struct msghdr *msghdr) if (msghdr) { cmsg.user_token = notifier->n_user_token; - cmsg.status = notifier->n_status; + cmsg.status = notifier->n_status; err = put_cmsg(msghdr, SOL_RDS, RDS_CMSG_RDMA_STATUS, - sizeof(cmsg), &cmsg); + sizeof(cmsg), &cmsg); if (err) break; } diff --git a/net/rds/send.c b/net/rds/send.c index f3f4e79..5bc35d2 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -150,7 +150,7 @@ int rds_send_xmit(struct rds_connection *conn) */ if (conn->c_map_bytes) { ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, - conn->c_map_offset); + conn->c_map_offset); if (ret <= 0) break; -- cgit v1.1 From 809fa148a29467954280fe8b7f97c92403f6293c Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:41:46 -0800 Subject: RDS: inc_purge() transport function unused - remove it Signed-off-by: Andy Grover --- net/rds/ib.c | 1 - net/rds/ib.h | 1 - net/rds/ib_recv.c | 2 +- net/rds/iw.c | 1 - net/rds/iw.h | 1 - net/rds/iw_recv.c | 2 +- net/rds/loop.c | 1 - net/rds/message.c | 6 ------ net/rds/rds.h | 2 -- net/rds/tcp.c | 1 - net/rds/tcp.h | 1 - net/rds/tcp_recv.c | 2 +- 12 files changed, 3 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 72a5116..932dacb 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -274,7 +274,6 @@ struct rds_transport rds_ib_transport = { .conn_connect = rds_ib_conn_connect, .conn_shutdown = rds_ib_conn_shutdown, .inc_copy_to_user = rds_ib_inc_copy_to_user, - .inc_purge = rds_ib_inc_purge, .inc_free = rds_ib_inc_free, .cm_initiate_connect = rds_ib_cm_initiate_connect, .cm_handle_connect = rds_ib_cm_handle_connect, diff --git a/net/rds/ib.h b/net/rds/ib.h index a303f13..426035a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -301,7 +301,6 @@ void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, gfp_t page_gfp, int prefill); -void rds_ib_inc_purge(struct rds_incoming *inc); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index d0ee9c1..e294d00 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -273,7 +273,7 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, return ret; } -void rds_ib_inc_purge(struct rds_incoming *inc) +static void rds_ib_inc_purge(struct rds_incoming *inc) { struct rds_ib_incoming *ibinc; struct rds_page_frag *frag; diff --git a/net/rds/iw.c b/net/rds/iw.c index c8f3d35..e766aec 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -272,7 +272,6 @@ struct rds_transport rds_iw_transport = { .conn_connect = rds_iw_conn_connect, .conn_shutdown = rds_iw_conn_shutdown, .inc_copy_to_user = rds_iw_inc_copy_to_user, - .inc_purge = rds_iw_inc_purge, .inc_free = rds_iw_inc_free, .cm_initiate_connect = rds_iw_cm_initiate_connect, .cm_handle_connect = rds_iw_cm_handle_connect, diff --git a/net/rds/iw.h b/net/rds/iw.h index eef2f0c..6f08300 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -326,7 +326,6 @@ void rds_iw_recv_exit(void); int rds_iw_recv(struct rds_connection *conn); int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, gfp_t page_gfp, int prefill); -void rds_iw_inc_purge(struct rds_incoming *inc); void rds_iw_inc_free(struct rds_incoming *inc); int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 48bcf4f..7e929f7 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -273,7 +273,7 @@ int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, return ret; } -void rds_iw_inc_purge(struct rds_incoming *inc) +static void rds_iw_inc_purge(struct rds_incoming *inc) { struct rds_iw_incoming *iwinc; struct rds_page_frag *frag; diff --git a/net/rds/loop.c b/net/rds/loop.c index a74b469..9364de6 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -176,7 +176,6 @@ struct rds_transport rds_loop_transport = { .conn_connect = rds_loop_conn_connect, .conn_shutdown = rds_loop_conn_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, - .inc_purge = rds_message_inc_purge, .inc_free = rds_message_inc_free, .t_name = "loopback", }; diff --git a/net/rds/message.c b/net/rds/message.c index 7d678e2..f681690 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -75,12 +75,6 @@ static void rds_message_purge(struct rds_message *rm) rds_mr_put(rm->rdma.m_rdma_mr); } -void rds_message_inc_purge(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_purge(rm); -} - void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); diff --git a/net/rds/rds.h b/net/rds/rds.h index 830e2bb..0c610a1 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -422,7 +422,6 @@ struct rds_transport { int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); - void (*inc_purge)(struct rds_incoming *inc); void (*inc_free)(struct rds_incoming *inc); int (*cm_handle_connect)(struct rdma_cm_id *cm_id, @@ -668,7 +667,6 @@ int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *vers int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, size_t size); -void rds_message_inc_purge(struct rds_incoming *inc); void rds_message_inc_free(struct rds_incoming *inc); void rds_message_addref(struct rds_message *rm); void rds_message_put(struct rds_message *rm); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index aebe103..8318816 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -266,7 +266,6 @@ struct rds_transport rds_tcp_transport = { .conn_connect = rds_tcp_conn_connect, .conn_shutdown = rds_tcp_conn_shutdown, .inc_copy_to_user = rds_tcp_inc_copy_to_user, - .inc_purge = rds_tcp_inc_purge, .inc_free = rds_tcp_inc_free, .stats_info_copy = rds_tcp_stats_info_copy, .exit = rds_tcp_exit, diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 844fa6b..c639872 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -70,7 +70,6 @@ int __init rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); void rds_tcp_data_ready(struct sock *sk, int bytes); int rds_tcp_recv(struct rds_connection *conn); -void rds_tcp_inc_purge(struct rds_incoming *inc); void rds_tcp_inc_free(struct rds_incoming *inc); int rds_tcp_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index ea73829..49c9660 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -39,7 +39,7 @@ static struct kmem_cache *rds_tcp_incoming_slab; -void rds_tcp_inc_purge(struct rds_incoming *inc) +static void rds_tcp_inc_purge(struct rds_incoming *inc) { struct rds_tcp_incoming *tinc; tinc = container_of(inc, struct rds_tcp_incoming, ti_inc); -- cgit v1.1 From 9c030391e8741695ff6114703e4edccccb634479 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 12 Jan 2010 14:43:06 -0800 Subject: RDS/IB: eliminate duplicate code both atomics and rdmas need to convert ib-specific completion codes into RDS status codes. Rename rds_ib_rdma_send_complete to rds_ib_send_complete, and have it take a pointer to the function to call with the new error code. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 30 +++++++++++------------------- 1 file changed, 11 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index b2bd164..d2516d3 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -38,8 +38,13 @@ #include "rds.h" #include "ib.h" -static void rds_ib_send_rdma_complete(struct rds_message *rm, - int wc_status) +/* + * Convert IB-specific error message to RDS error message and call core + * completion handler. + */ +static void rds_ib_send_complete(struct rds_message *rm, + int wc_status, + void (*complete)(struct rds_message *rm, int status)) { int notify_status; @@ -59,20 +64,7 @@ static void rds_ib_send_rdma_complete(struct rds_message *rm, notify_status = RDS_RDMA_OTHER_ERROR; break; } - rds_rdma_send_complete(rm, notify_status); -} - -static void rds_ib_send_atomic_complete(struct rds_message *rm, - int wc_status) -{ - int notify_status; - - if (wc_status != IB_WC_SUCCESS) - notify_status = RDS_RDMA_OTHER_ERROR; - else - notify_status = RDS_RDMA_SUCCESS; - - rds_atomic_send_complete(rm, notify_status); + complete(rm, notify_status); } static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, @@ -117,7 +109,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, * operation itself unmapped the RDMA buffers, which takes care * of synching. */ - rds_ib_send_rdma_complete(rm, wc_status); + rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete); if (rm->rdma.m_rdma_op.r_write) rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); @@ -135,7 +127,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, op->op_mapped = 0; } - rds_ib_send_atomic_complete(rm, wc_status); + rds_ib_send_complete(rm, wc_status, rds_atomic_send_complete); if (rm->atomic.op_type == RDS_ATOMIC_TYPE_CSWP) rds_stats_inc(s_atomic_cswp); @@ -270,7 +262,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rm = rds_send_get_message(conn, send->s_op); if (rm) { rds_ib_send_unmap_rm(ic, send, wc.status); - rds_ib_send_rdma_complete(rm, wc.status); + rds_ib_send_complete(rm, wc.status, rds_rdma_send_complete); rds_message_put(rm); } } -- cgit v1.1 From f147dd9ecabf23fd63d2562ffe64252a0453ecde Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 13 Jan 2010 15:50:09 -0800 Subject: RDS/IB: Disallow connections less than RDS 3.1 RDS 3.0 connections (in OFED 1.3 and earlier) put the header at the end. 3.1 connections put it at the head. The code has significant added complexity in order to handle both configurations. In OFED 1.6 we can drop this and simplify the code by only supporting "header-first" configuration. This patch checks the protocol version, and if prior to 3.1, does not complete the connection. Signed-off-by: Andy Grover --- net/rds/ib_cm.c | 20 +++++++++++++++----- net/rds/ib_recv.c | 43 +------------------------------------------ 2 files changed, 16 insertions(+), 47 deletions(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 3134336..8b0c743 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -111,11 +111,21 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even } } - printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", - &conn->c_faddr, - RDS_PROTOCOL_MAJOR(conn->c_version), - RDS_PROTOCOL_MINOR(conn->c_version), - ic->i_flowctl ? ", flow control" : ""); + if (conn->c_version < RDS_PROTOCOL(3,1)) { + printk(KERN_NOTICE "RDS/IB: Connection to %pI4 version %u.%u failed," + " no longer supported\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version)); + rds_conn_destroy(conn); + return; + } else { + printk(KERN_NOTICE "RDS/IB: connected to %pI4 version %u.%u%s\n", + &conn->c_faddr, + RDS_PROTOCOL_MAJOR(conn->c_version), + RDS_PROTOCOL_MINOR(conn->c_version), + ic->i_flowctl ? ", flow control" : ""); + } /* * Init rings and fill recv. this needs to wait until protocol negotiation diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e294d00..a68a3a7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -557,47 +557,6 @@ u64 rds_ib_piggyb_ack(struct rds_ib_connection *ic) return rds_ib_get_ack(ic); } -static struct rds_header *rds_ib_get_header(struct rds_connection *conn, - struct rds_ib_recv_work *recv, - u32 data_len) -{ - struct rds_ib_connection *ic = conn->c_transport_data; - void *hdr_buff = &ic->i_recv_hdrs[recv - ic->i_recvs]; - void *addr; - u32 misplaced_hdr_bytes; - - /* - * Support header at the front (RDS 3.1+) as well as header-at-end. - * - * Cases: - * 1) header all in header buff (great!) - * 2) header all in data page (copy all to header buff) - * 3) header split across hdr buf + data page - * (move bit in hdr buff to end before copying other bit from data page) - */ - if (conn->c_version > RDS_PROTOCOL_3_0 || data_len == RDS_FRAG_SIZE) - return hdr_buff; - - if (data_len <= (RDS_FRAG_SIZE - sizeof(struct rds_header))) { - addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); - memcpy(hdr_buff, - addr + recv->r_frag->f_offset + data_len, - sizeof(struct rds_header)); - kunmap_atomic(addr, KM_SOFTIRQ0); - return hdr_buff; - } - - misplaced_hdr_bytes = (sizeof(struct rds_header) - (RDS_FRAG_SIZE - data_len)); - - memmove(hdr_buff + misplaced_hdr_bytes, hdr_buff, misplaced_hdr_bytes); - - addr = kmap_atomic(recv->r_frag->f_page, KM_SOFTIRQ0); - memcpy(hdr_buff, addr + recv->r_frag->f_offset + data_len, - sizeof(struct rds_header) - misplaced_hdr_bytes); - kunmap_atomic(addr, KM_SOFTIRQ0); - return hdr_buff; -} - /* * It's kind of lame that we're copying from the posted receive pages into * long-lived bitmaps. We could have posted the bitmaps and rdma written into @@ -710,7 +669,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, } data_len -= sizeof(struct rds_header); - ihdr = rds_ib_get_header(conn, recv, data_len); + ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs]; /* Validate the checksum. */ if (!rds_message_verify_checksum(ihdr)) { -- cgit v1.1 From 6f3d05db0da0b874afd2dd229bed715133532f8d Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 13 Jan 2010 16:29:37 -0800 Subject: RDS/IB: Remove dead code Signed-off-by: Andy Grover --- net/rds/ib_send.c | 14 -------------- 1 file changed, 14 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index d2516d3..e869084 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -542,12 +542,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* map the message the first time we see it */ if (!ic->i_rm) { - /* - printk(KERN_NOTICE "rds_ib_xmit prep msg dport=%u flags=0x%x len=%d\n", - be16_to_cpu(rm->m_inc.i_hdr.h_dport), - rm->m_inc.i_hdr.h_flags, - be32_to_cpu(rm->m_inc.i_hdr.h_len)); - */ if (rm->data.m_nents) { rm->data.m_count = ib_dma_map_sg(dev, rm->data.m_sg, @@ -684,14 +678,6 @@ add_header: * have been set up to point to the right header buffer. */ memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - if (0) { - struct rds_header *hdr = &ic->i_send_hdrs[pos]; - - printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n", - be16_to_cpu(hdr->h_dport), - hdr->h_flags, - be32_to_cpu(hdr->h_len)); - } if (adv_credits) { struct rds_header *hdr = &ic->i_send_hdrs[pos]; -- cgit v1.1 From 919ced4ce7d6ac62dd5be62d8993fe22a527d53a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 13 Jan 2010 16:32:24 -0800 Subject: RDS/IB: Remove ib_[header/data]_sge() functions These functions were to cope with differently ordered sg entries depending on RDS 3.0 or 3.1+. Now that we've dropped 3.0 compatibility we no longer need them. Also, modify usage sites for these to refer to sge[0] or [1] directly. Reorder code to initialize header sgs first. Signed-off-by: Andy Grover --- net/rds/ib.h | 24 ------------------------ net/rds/ib_recv.c | 22 +++++++++++----------- net/rds/ib_send.c | 32 ++++++++++++-------------------- 3 files changed, 23 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 426035a..dbf8d6e 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -356,28 +356,4 @@ extern unsigned long rds_ib_sysctl_max_recv_allocation; extern unsigned int rds_ib_sysctl_flow_control; extern ctl_table rds_ib_sysctl_table[]; -/* - * Helper functions for getting/setting the header and data SGEs in - * RDS packets (not RDMA) - * - * From version 3.1 onwards, header is in front of data in the sge. - */ -static inline struct ib_sge * -rds_ib_header_sge(struct rds_ib_connection *ic, struct ib_sge *sge) -{ - if (ic->conn->c_version > RDS_PROTOCOL_3_0) - return &sge[0]; - else - return &sge[1]; -} - -static inline struct ib_sge * -rds_ib_data_sge(struct rds_ib_connection *ic, struct ib_sge *sge) -{ - if (ic->conn->c_version > RDS_PROTOCOL_3_0) - return &sge[1]; - else - return &sge[0]; -} - #endif diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a68a3a7..e9fe08a 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -95,15 +95,15 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) recv->r_wr.sg_list = recv->r_sge; recv->r_wr.num_sge = RDS_IB_RECV_SGE; - sge = rds_ib_data_sge(ic, recv->r_sge); - sge->addr = 0; - sge->length = RDS_FRAG_SIZE; - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, recv->r_sge); + sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; + + sge = &recv->r_sge[1]; + sge->addr = 0; + sge->length = RDS_FRAG_SIZE; + sge->lkey = ic->i_mr->lkey; } } @@ -190,14 +190,14 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, recv->r_frag->f_offset = ic->i_frag.f_offset; recv->r_frag->f_mapped = dma_addr; - sge = rds_ib_data_sge(ic, recv->r_sge); - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; - - sge = rds_ib_header_sge(ic, recv->r_sge); + sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); sge->length = sizeof(struct rds_header); + sge = &recv->r_sge[1]; + sge->addr = dma_addr; + sge->length = RDS_FRAG_SIZE; + get_page(recv->r_frag->f_page); if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index e869084..46026d9 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -156,18 +156,14 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) send->s_wr.wr_id = i; send->s_wr.sg_list = send->s_sge; - send->s_wr.num_sge = 1; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.send_flags = 0; send->s_wr.ex.imm_data = 0; - sge = rds_ib_data_sge(ic, send->s_sge); - sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); + sge = &send->s_sge[0]; sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header)); sge->length = sizeof(struct rds_header); sge->lkey = ic->i_mr->lkey; + + send->s_sge[1].lkey = ic->i_mr->lkey; } } @@ -441,28 +437,24 @@ rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, send->s_wr.send_flags = send_flags; send->s_wr.opcode = IB_WR_SEND; - send->s_wr.num_sge = 2; + send->s_wr.num_sge = 1; send->s_wr.next = NULL; send->s_queued = jiffies; send->s_op = NULL; + sge = &send->s_sge[0]; + sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); + sge->length = sizeof(struct rds_header); + sge->lkey = ic->i_mr->lkey; + if (length != 0) { - sge = rds_ib_data_sge(ic, send->s_sge); + send->s_wr.num_sge = 2; + + sge = &send->s_sge[1]; sge->addr = buffer; sge->length = length; sge->lkey = ic->i_mr->lkey; - - sge = rds_ib_header_sge(ic, send->s_sge); - } else { - /* We're sending a packet with no payload. There is only - * one SGE */ - send->s_wr.num_sge = 1; - sge = &send->s_sge[0]; } - - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; } /* -- cgit v1.1 From da5a06cef5724737af4315715632f0a07dd5e116 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 14 Jan 2010 12:18:11 -0800 Subject: RDS: rewrite rds_ib_xmit Now that the header always goes first, it is possible to simplify rds_ib_xmit. Instead of having a path to handle 0-byte dgrams and another path to handle >0, these can both be handled in one path. This lets us eliminate xmit_populate_wr(). Rename sent to bytes_sent, to differentiate better from other variable named "send". Signed-off-by: Andy Grover --- net/rds/ib_send.c | 123 ++++++++++++++++++++---------------------------------- 1 file changed, 45 insertions(+), 78 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 46026d9..06c1d7e 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -425,38 +425,6 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } -static inline void -rds_ib_xmit_populate_wr(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, unsigned int pos, - unsigned long buffer, unsigned int length, - int send_flags) -{ - struct ib_sge *sge; - - WARN_ON(pos != send - ic->i_sends); - - send->s_wr.send_flags = send_flags; - send->s_wr.opcode = IB_WR_SEND; - send->s_wr.num_sge = 1; - send->s_wr.next = NULL; - send->s_queued = jiffies; - send->s_op = NULL; - - sge = &send->s_sge[0]; - sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header)); - sge->length = sizeof(struct rds_header); - sge->lkey = ic->i_mr->lkey; - - if (length != 0) { - send->s_wr.num_sge = 2; - - sge = &send->s_sge[1]; - sge->addr = buffer; - sge->length = length; - sge->lkey = ic->i_mr->lkey; - } -} - /* * This can be called multiple times for a given message. The first time * we see a message we map its scatterlist into the IB device so that @@ -483,11 +451,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, u32 pos; u32 i; u32 work_alloc; - u32 credit_alloc; + u32 credit_alloc = 0; u32 posted; u32 adv_credits = 0; int send_flags = 0; - int sent; + int bytes_sent = 0; int ret; int flow_controlled = 0; @@ -515,7 +483,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - credit_alloc = work_alloc; if (ic->i_flowctl) { credit_alloc = rds_ib_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT); adv_credits += posted; @@ -591,13 +558,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(adv_credits > 255); } - send = &ic->i_sends[pos]; - first = send; - prev = NULL; - scat = &rm->data.m_sg[sg]; - sent = 0; - i = 0; - /* Sometimes you want to put a fence between an RDMA * READ and the following SEND. * We could either do this all the time @@ -607,31 +567,45 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) send_flags = IB_SEND_FENCE; - /* - * We could be copying the header into the unused tail of the page. - * That would need to be changed in the future when those pages might - * be mapped userspace pages or page cache pages. So instead we always - * use a second sge and our long-lived ring of mapped headers. We send - * the header after the data so that the data payload can be aligned on - * the receiver. - */ + /* Each frag gets a header. Msgs may be 0 bytes */ + send = &ic->i_sends[pos]; + first = send; + prev = NULL; + scat = &rm->data.m_sg[sg]; + i = 0; + do { + unsigned int len = 0; - /* handle a 0-len message */ - if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) { - rds_ib_xmit_populate_wr(ic, send, pos, 0, 0, send_flags); - goto add_header; - } + /* Set up the header */ + send->s_wr.send_flags = send_flags; + send->s_wr.opcode = IB_WR_SEND; + send->s_wr.num_sge = 1; + send->s_wr.next = NULL; + send->s_queued = jiffies; + send->s_op = NULL; - /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) { - unsigned int len; + send->s_sge[0].addr = ic->i_send_hdrs_dma + + (pos * sizeof(struct rds_header)); + send->s_sge[0].length = sizeof(struct rds_header); - send = &ic->i_sends[pos]; + memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); + + /* Set up the data, if present */ + if (i < work_alloc + && scat != &rm->data.m_sg[rm->data.m_count]) { + len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); + send->s_wr.num_sge = 2; - len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); - rds_ib_xmit_populate_wr(ic, send, pos, - ib_sg_dma_address(dev, scat) + off, len, - send_flags); + send->s_sge[1].addr = ib_sg_dma_address(dev, scat) + off; + send->s_sge[1].length = len; + + bytes_sent += len; + off += len; + if (off == ib_sg_dma_len(dev, scat)) { + scat++; + off = 0; + } + } /* * We want to delay signaling completions just enough to get @@ -658,18 +632,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, rdsdebug("send %p wr %p num_sge %u next %p\n", send, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); - sent += len; - off += len; - if (off == ib_sg_dma_len(dev, scat)) { - scat++; - off = 0; - } - -add_header: - /* Tack on the header after the data. The header SGE should already - * have been set up to point to the right header buffer. */ - memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header)); - if (adv_credits) { struct rds_header *hdr = &ic->i_send_hdrs[pos]; @@ -685,12 +647,16 @@ add_header: prev = send; pos = (pos + 1) % ic->i_send_ring.w_nr; - } + send = &ic->i_sends[pos]; + i++; + + } while (i < work_alloc + && scat != &rm->data.m_sg[rm->data.m_count]); /* Account the RDS header in the number of bytes we sent, but just once. * The caller has no concept of fragmentation. */ if (hdr_off == 0) - sent += sizeof(struct rds_header); + bytes_sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ if (scat == &rm->data.m_sg[rm->data.m_count]) { @@ -699,6 +665,7 @@ add_header: ic->i_rm = NULL; } + /* Put back wrs & credits we didn't use */ if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; @@ -725,7 +692,7 @@ add_header: goto out; } - ret = sent; + ret = bytes_sent; out: BUG_ON(adv_credits); return ret; -- cgit v1.1 From 1d34f175712b59ad292ecbbaa8fc05402a1fd8ed Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 14 Jan 2010 15:08:33 -0800 Subject: RDS: Remove unsignaled_bytes sysctl Removed unsignaled_bytes sysctl and code to signal based on it. I believe unsignaled_wrs is more than sufficient for our purposes. Signed-off-by: Andy Grover --- net/rds/ib.h | 1 - net/rds/ib_send.c | 7 ------- net/rds/ib_sysctl.c | 13 ------------- 3 files changed, 21 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index dbf8d6e..1488181 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -138,7 +138,6 @@ struct rds_ib_connection { /* Batched completions */ unsigned int i_unsignaled_wrs; - long i_unsignaled_bytes; }; /* This assumes that atomic_t is at least 32 bits */ diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 06c1d7e..48724b7 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -518,7 +518,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; rds_message_addref(rm); ic->i_rm = rm; @@ -617,12 +616,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; } - ic->i_unsignaled_bytes -= len; - if (ic->i_unsignaled_bytes <= 0) { - ic->i_unsignaled_bytes = rds_ib_sysctl_max_unsig_bytes; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } - /* * Always signal the last one if we're stopping due to flow control. */ diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index c070524..b556c5c 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -49,10 +49,6 @@ unsigned long rds_ib_sysctl_max_unsig_wrs = 16; static unsigned long rds_ib_sysctl_max_unsig_wr_min = 1; static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; -unsigned long rds_ib_sysctl_max_unsig_bytes = (16 << 20); -static unsigned long rds_ib_sysctl_max_unsig_bytes_min = 1; -static unsigned long rds_ib_sysctl_max_unsig_bytes_max = ~0UL; - /* * This sysctl does nothing. * @@ -94,15 +90,6 @@ ctl_table rds_ib_sysctl_table[] = { .extra2 = &rds_ib_sysctl_max_unsig_wr_max, }, { - .procname = "max_unsignaled_bytes", - .data = &rds_ib_sysctl_max_unsig_bytes, - .maxlen = sizeof(unsigned long), - .mode = 0644, - .proc_handler = proc_doulongvec_minmax, - .extra1 = &rds_ib_sysctl_max_unsig_bytes_min, - .extra2 = &rds_ib_sysctl_max_unsig_bytes_max, - }, - { .procname = "max_recv_allocation", .data = &rds_ib_sysctl_max_recv_allocation, .maxlen = sizeof(unsigned long), -- cgit v1.1 From c8de3f1005e8359ea07083e37f3f993646e1adba Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 15 Jan 2010 15:55:26 -0800 Subject: RDS/IB: Make all flow control code conditional on i_flowctl Maybe things worked fine with the flow control code running even in the non-flow-control case, but making it explicitly conditional helps the non-fc case be easier to read. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 48724b7..d839b40 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -489,7 +489,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (credit_alloc < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc); work_alloc = credit_alloc; - flow_controlled++; + flow_controlled = 1; } if (work_alloc == 0) { set_bit(RDS_LL_SEND_FULL, &conn->c_flags); @@ -552,9 +552,11 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Update adv_credits since we reset the ACK_REQUIRED bit. */ - rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); - adv_credits += posted; - BUG_ON(adv_credits > 255); + if (ic->i_flowctl) { + rds_ib_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits); + adv_credits += posted; + BUG_ON(adv_credits > 255); + } } /* Sometimes you want to put a fence between an RDMA @@ -619,13 +621,13 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* * Always signal the last one if we're stopping due to flow control. */ - if (flow_controlled && i == (work_alloc-1)) + if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; rdsdebug("send %p wr %p num_sge %u next %p\n", send, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); - if (adv_credits) { + if (ic->i_flowctl && adv_credits) { struct rds_header *hdr = &ic->i_send_hdrs[pos]; /* add credit and redo the header checksum */ -- cgit v1.1 From d37c9359056f4f07b37e59810f0ece1031e280b2 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 19 Jan 2010 18:14:56 -0800 Subject: RDS: Move loop-only function to loop.c Also, try to better-document the locking around the rm and its m_inc in loop.c. Signed-off-by: Andy Grover --- net/rds/loop.c | 15 +++++++++++++-- net/rds/message.c | 6 ------ 2 files changed, 13 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/loop.c b/net/rds/loop.c index 9364de6..4a3dd49 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -64,7 +64,8 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, BUG_ON(hdr_off || sg || off); rds_inc_init(&rm->m_inc, conn, conn->c_laddr); - rds_message_addref(rm); /* for the inc */ + /* For the embedded inc. Matching put is in loop_inc_free() */ + rds_message_addref(rm); rds_recv_incoming(conn, conn->c_laddr, conn->c_faddr, &rm->m_inc, GFP_KERNEL, KM_USER0); @@ -77,6 +78,16 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, return sizeof(struct rds_header) + be32_to_cpu(rm->m_inc.i_hdr.h_len); } +/* + * See rds_loop_xmit(). Since our inc is embedded in the rm, we + * make sure the rm lives at least until the inc is done. + */ +static void rds_loop_inc_free(struct rds_incoming *inc) +{ + struct rds_message *rm = container_of(inc, struct rds_message, m_inc); + rds_message_put(rm); +} + static int rds_loop_xmit_cong_map(struct rds_connection *conn, struct rds_cong_map *map, unsigned long offset) @@ -176,6 +187,6 @@ struct rds_transport rds_loop_transport = { .conn_connect = rds_loop_conn_connect, .conn_shutdown = rds_loop_conn_shutdown, .inc_copy_to_user = rds_message_inc_copy_to_user, - .inc_free = rds_message_inc_free, + .inc_free = rds_loop_inc_free, .t_name = "loopback", }; diff --git a/net/rds/message.c b/net/rds/message.c index f681690..3ea05c8 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -89,12 +89,6 @@ void rds_message_put(struct rds_message *rm) } EXPORT_SYMBOL_GPL(rds_message_put); -void rds_message_inc_free(struct rds_incoming *inc) -{ - struct rds_message *rm = container_of(inc, struct rds_message, m_inc); - rds_message_put(rm); -} - void rds_message_populate_header(struct rds_header *hdr, __be16 sport, __be16 dport, u64 seq) { -- cgit v1.1 From 241eef3e2f51fe4ad50abacd7f79c4e2d468197e Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 19 Jan 2010 21:25:26 -0800 Subject: RDS: Implement silent atomics Signed-off-by: Andy Grover --- net/rds/ib.h | 2 +- net/rds/ib_send.c | 62 ++++++++++++++++++++++++++++--------------------------- net/rds/message.c | 2 ++ net/rds/rds.h | 3 ++- net/rds/send.c | 11 +++++++--- 5 files changed, 45 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 1488181..96769b8 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -336,7 +336,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, u32 *adv_credits, int need_posted, int max_posted); -int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index d839b40..e6745d8 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -225,15 +225,12 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) /* In the error case, wc.opcode sometimes contains garbage */ switch (send->s_wr.opcode) { case IB_WR_SEND: - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, wc.status); - break; case IB_WR_RDMA_WRITE: case IB_WR_RDMA_READ: case IB_WR_ATOMIC_FETCH_AND_ADD: case IB_WR_ATOMIC_CMP_AND_SWP: - /* Nothing to be done - the SG list will be unmapped - * when the SEND completes. */ + if (send->s_rm) + rds_ib_send_unmap_rm(ic, send, wc.status); break; default: if (printk_ratelimit()) @@ -425,6 +422,21 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } +static inline void rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) +{ + /* + * We want to delay signaling completions just enough to get + * the batching benefits but not so much that we create dead time + * on the wire. + */ + if (ic->i_unsignaled_wrs-- == 0 || notify) { + ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; + send->s_wr.send_flags |= IB_SEND_SIGNALED; + } +} + /* * This can be called multiple times for a given message. The first time * we see a message we map its scatterlist into the IB device so that @@ -517,7 +529,6 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, rm->data.m_count = 0; } - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; rds_message_addref(rm); ic->i_rm = rm; @@ -608,15 +619,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } } - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time - * on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; - } + rds_ib_set_wr_signal_state(ic, send, 0); /* * Always signal the last one if we're stopping due to flow control. @@ -656,7 +659,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* if we finished the message then send completion owns it */ if (scat == &rm->data.m_sg[rm->data.m_count]) { prev->s_rm = ic->i_rm; - prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + prev->s_wr.send_flags |= IB_SEND_SOLICITED; ic->i_rm = NULL; } @@ -698,9 +701,10 @@ out: * A simplified version of the rdma case, we always map 1 SG, and * only 8 bytes, for the return value from the atomic operation. */ -int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm) { struct rds_ib_connection *ic = conn->c_transport_data; + struct rm_atomic_op *op = &rm->atomic; struct rds_ib_send_work *send = NULL; struct ib_send_wr *failed_wr; struct rds_ib_device *rds_ibdev; @@ -731,12 +735,20 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_wr.wr.atomic.compare_add = op->op_swap_add; send->s_wr.wr.atomic.swap = 0; } - send->s_wr.send_flags = IB_SEND_SIGNALED; + rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.num_sge = 1; send->s_wr.next = NULL; send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; send->s_wr.wr.atomic.rkey = op->op_rkey; + /* + * If there is no data or rdma ops in the message, then + * we must fill in s_rm ourselves, so we properly clean up + * on completion. + */ + if (!rm->rdma.m_rdma_op.r_active && !rm->data.op_active) + send->s_rm = rm; + /* map 8 byte retval buffer to the device */ ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); @@ -836,14 +848,8 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; - /* - * We want to delay signaling completions just enough to get - * the batching benefits but not so much that we create dead time on the wire. - */ - if (ic->i_unsignaled_wrs-- == 0) { - ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; - send->s_wr.send_flags = IB_SEND_SIGNALED; - } + + rds_ib_set_wr_signal_state(ic, send, op->r_notify); send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; @@ -884,10 +890,6 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = ic->i_sends; } - /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) - prev->s_wr.send_flags = IB_SEND_SIGNALED; - if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; diff --git a/net/rds/message.c b/net/rds/message.c index 3ea05c8..a27e493 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -325,6 +325,8 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, sg++; } + rm->data.op_active = 1; + out: return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index 0c610a1..bf2349da 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -341,6 +341,7 @@ struct rds_message { struct rds_mr *m_rdma_mr; } rdma; struct rm_data_op { + unsigned int op_active:1; unsigned int m_nents; unsigned int m_count; struct scatterlist *m_sg; @@ -418,7 +419,7 @@ struct rds_transport { int (*xmit_cong_map)(struct rds_connection *conn, struct rds_cong_map *map, unsigned long offset); int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); - int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); + int (*xmit_atomic)(struct rds_connection *conn, struct rds_message *rm); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/send.c b/net/rds/send.c index 5bc35d2..42fb934 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -266,7 +266,7 @@ int rds_send_xmit(struct rds_connection *conn) if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { - ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); + ret = conn->c_trans->xmit_atomic(conn, rm); if (ret) break; conn->c_xmit_atomic_sent = 1; @@ -285,13 +285,18 @@ int rds_send_xmit(struct rds_connection *conn) if (ret) break; conn->c_xmit_rdma_sent = 1; + + /* rdmas need data sent, even if just the header */ + rm->data.op_active = 1; + /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); } - if (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->data.m_nents) { + if (rm->data.op_active + && (conn->c_xmit_hdr_off < sizeof(struct rds_header) || + conn->c_xmit_sg < rm->data.m_nents)) { ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, -- cgit v1.1 From 4324879df06ba4db01a0b455af2d003f117e6aa3 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 27 Jan 2010 16:07:30 -0800 Subject: RDS: Inline rdma_prepare into cmsg_rdma_args cmsg_rdma_args just calls rdma_prepare and does a little arg checking -- not quite enough to justify its existence. Plus, it is the only caller of rdma_prepare(). Signed-off-by: Andy Grover --- net/rds/rdma.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4df545a..91967c8 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -497,12 +497,13 @@ int rds_rdma_extra_size(struct rds_rdma_args *args) } /* - * args is a pointer to an in-kernel copy in the sendmsg cmsg. + * The application asks for a RDMA transfer. + * Extract all arguments and set up the rdma_op */ -static int rds_rdma_prepare(struct rds_message *rm, - struct rds_sock *rs, - struct rds_rdma_args *args) +int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, + struct cmsghdr *cmsg) { + struct rds_rdma_args *args; struct rds_iovec vec; struct rds_rdma_op *op = &rm->rdma.m_rdma_op; unsigned int nr_pages; @@ -513,6 +514,11 @@ static int rds_rdma_prepare(struct rds_message *rm, unsigned int i, j; int ret = 0; + if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) + || rm->rdma.m_rdma_op.r_active) + return -EINVAL; + + args = CMSG_DATA(cmsg); if (rs->rs_bound_addr == 0) { ret = -ENOTCONN; /* XXX not a great errno */ @@ -623,7 +629,6 @@ static int rds_rdma_prepare(struct rds_message *rm, op->r_nents += nr; } - if (nr_bytes > args->remote_vec.bytes) { rdsdebug("RDS nr_bytes %u remote_bytes %u do not match\n", nr_bytes, @@ -639,28 +644,9 @@ out: if (ret) rds_rdma_free_op(op); - return ret; -} - -/* - * The application asks for a RDMA transfer. - * Extract all arguments and set up the rdma_op - */ -int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, - struct cmsghdr *cmsg) -{ - int ret; - - if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) || - rm->rdma.m_rdma_op.r_active) - return -EINVAL; - - ret = rds_rdma_prepare(rm, rs, CMSG_DATA(cmsg)); - if (ret) - return ret; - rds_stats_inc(s_send_rdma); - return 0; + + return ret; } /* -- cgit v1.1 From d0ab25a83c4a08cd98b73a37d3f4c069f7b4f50b Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 27 Jan 2010 16:15:48 -0800 Subject: RDS: purge atomic resources too in rds_message_purge() Add atomic_free_op function, analogous to rdma_free_op, and call it in rds_message_purge(). Signed-off-by: Andy Grover --- net/rds/message.c | 5 +++++ net/rds/rdma.c | 16 ++++++++++++++++ net/rds/rds.h | 1 + 3 files changed, 22 insertions(+) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index a27e493..b53306c 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -73,6 +73,11 @@ static void rds_message_purge(struct rds_message *rm) rds_rdma_free_op(&rm->rdma.m_rdma_op); if (rm->rdma.m_rdma_mr) rds_mr_put(rm->rdma.m_rdma_mr); + + if (rm->atomic.op_active) + rds_atomic_free_op(&rm->atomic); + if (rm->atomic.op_rdma_mr) + rds_mr_put(rm->atomic.op_rdma_mr); } void rds_message_put(struct rds_message *rm) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 91967c8..0df86a3 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -462,6 +462,22 @@ void rds_rdma_free_op(struct rds_rdma_op *ro) ro->r_active = 0; } +void rds_atomic_free_op(struct rm_atomic_op *ao) +{ + struct page *page = sg_page(ao->op_sg); + + /* Mark page dirty if it was possibly modified, which + * is the case for a RDMA_READ which copies from remote + * to local memory */ + set_page_dirty(page); + put_page(page); + + kfree(ao->op_notifier); + ao->op_notifier = NULL; + ao->op_active = 0; +} + + /* * Count the number of pages needed to describe an incoming iovec. */ diff --git a/net/rds/rds.h b/net/rds/rds.h index bf2349da..32b3d46 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -745,6 +745,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); void rds_rdma_free_op(struct rds_rdma_op *ro); +void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, -- cgit v1.1 From f8b3aaf2ba8ca9e27b47f8bfdff07c8b968f2c05 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 1 Mar 2010 14:11:53 -0800 Subject: RDS: Remove struct rds_rdma_op A big changeset, but it's all pretty dumb. struct rds_rdma_op was already embedded in struct rm_rdma_op. Remove rds_rdma_op and put its members in rm_rdma_op. Rename members with "op_" prefix instead of "r_", for consistency. Of course this breaks a lot, so fixup the code accordingly. Signed-off-by: Andy Grover --- net/rds/ib.h | 4 ++-- net/rds/ib_send.c | 60 ++++++++++++++++++++++++------------------------ net/rds/iw.h | 4 ++-- net/rds/iw_send.c | 68 +++++++++++++++++++++++++++---------------------------- net/rds/message.c | 8 +++---- net/rds/rdma.c | 58 +++++++++++++++++++++++------------------------ net/rds/rds.h | 41 ++++++++++++++++----------------- net/rds/send.c | 50 ++++++++++++++++++++-------------------- 8 files changed, 145 insertions(+), 148 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 96769b8..d64b508 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -54,7 +54,7 @@ struct rds_ib_connect_private { struct rds_ib_send_work { struct rds_message *s_rm; - struct rds_rdma_op *s_op; + struct rm_rdma_op *s_op; struct ib_send_wr s_wr; struct ib_sge s_sge[RDS_IB_MAX_SGE]; unsigned long s_queued; @@ -331,7 +331,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_ib_send_init_ring(struct rds_ib_connection *ic); void rds_ib_send_clear_ring(struct rds_ib_connection *ic); -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index e6745d8..63981cd1 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -79,14 +79,14 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->rdma.m_rdma_op.r_active) { - struct rds_rdma_op *op = &rm->rdma.m_rdma_op; + if (rm->rdma.op_active) { + struct rm_rdma_op *op = &rm->rdma; - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } /* If the user asked for a completion notification on this @@ -111,10 +111,10 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, */ rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete); - if (rm->rdma.m_rdma_op.r_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); } if (rm->atomic.op_active) { @@ -540,10 +540,10 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.m_rdma_op.r_active) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -576,7 +576,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; /* Each frag gets a header. Msgs may be 0 bytes */ @@ -746,7 +746,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm) * we must fill in s_rm ourselves, so we properly clean up * on completion. */ - if (!rm->rdma.m_rdma_op.r_active && !rm->data.op_active) + if (!rm->rdma.op_active && !rm->data.op_active) send->s_rm = rm; /* map 8 byte retval buffer to the device */ @@ -788,7 +788,7 @@ out: return ret; } -int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_send_work *send = NULL; @@ -798,7 +798,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) struct rds_ib_device *rds_ibdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; u32 pos; u32 work_alloc; u32 i; @@ -810,25 +810,25 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } /* * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->r_count, rds_ibdev->max_sge); + i = ceil(op->op_count, rds_ibdev->max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -841,19 +841,19 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; - rds_ib_set_wr_signal_state(ic, send, op->r_notify); + rds_ib_set_wr_signal_state(ic, send, op->op_notify); - send->s_wr.opcode = op->r_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; + send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; + send->s_wr.wr.rdma.rkey = op->op_rkey; send->s_op = op; if (num_sge > rds_ibdev->max_sge) { @@ -868,7 +868,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat); diff --git a/net/rds/iw.h b/net/rds/iw.h index 6f08300..f112105 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -70,7 +70,7 @@ struct rds_iw_send_work { struct rds_message *s_rm; /* We should really put these into a union: */ - struct rds_rdma_op *s_op; + struct rm_rdma_op *s_op; struct rds_iw_mapping *s_mapping; struct ib_mr *s_mr; struct ib_fast_reg_page_list *s_page_list; @@ -357,7 +357,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context); void rds_iw_send_init_ring(struct rds_iw_connection *ic); void rds_iw_send_clear_ring(struct rds_iw_connection *ic); -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op); +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op); void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted, diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 9b79a1b..05ebf16 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -63,13 +63,13 @@ static void rds_iw_send_rdma_complete(struct rds_message *rm, } static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { - if (op->r_mapped) { + if (op->op_mapped) { ib_dma_unmap_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, - op->r_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->r_mapped = 0; + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; } } @@ -85,8 +85,8 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rm->data.m_sg, rm->data.m_nents, DMA_TO_DEVICE); - if (rm->rdma.m_rdma_op.r_active) { - rds_iw_send_unmap_rdma(ic, &rm->rdma.m_rdma_op); + if (rm->rdma.op_active) { + rds_iw_send_unmap_rdma(ic, &rm->rdma); /* If the user asked for a completion notification on this * message, we can implement three different semantics: @@ -110,10 +110,10 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, */ rds_iw_send_rdma_complete(rm, wc_status); - if (rm->rdma.m_rdma_op.r_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); + if (rm->rdma.op_write) + rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.m_rdma_op.r_bytes); + rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); } /* If anyone waited for this message to get flushed out, wake @@ -591,10 +591,10 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, /* If it has a RDMA op, tell the peer we did it. This is * used by the peer to release use-once RDMA MRs. */ - if (rm->rdma.m_rdma_op.r_active) { + if (rm->rdma.op_active) { struct rds_ext_header_rdma ext_hdr; - ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.m_rdma_op.r_key); + ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey); rds_message_add_extension(&rm->m_inc.i_hdr, RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr)); } @@ -632,7 +632,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, * or when requested by the user. Right now, we let * the application choose. */ - if (rm->rdma.m_rdma_op.r_active && rm->rdma.m_rdma_op.r_fence) + if (rm->rdma.op_active && rm->rdma.op_fence) send_flags = IB_SEND_FENCE; /* @@ -785,7 +785,7 @@ static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rd ib_update_fast_reg_key(send->s_mr, send->s_remap_count++); } -int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) +int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) { struct rds_iw_connection *ic = conn->c_transport_data; struct rds_iw_send_work *send = NULL; @@ -795,7 +795,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) struct rds_iw_device *rds_iwdev; struct scatterlist *scat; unsigned long len; - u64 remote_addr = op->r_remote_addr; + u64 remote_addr = op->op_remote_addr; u32 pos, fr_pos; u32 work_alloc; u32 i; @@ -807,21 +807,21 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client); /* map the message the first time we see it */ - if (!op->r_mapped) { - op->r_count = ib_dma_map_sg(ic->i_cm_id->device, - op->r_sg, op->r_nents, (op->r_write) ? - DMA_TO_DEVICE : DMA_FROM_DEVICE); - rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->r_count); - if (op->r_count == 0) { + if (!op->op_mapped) { + op->op_count = ib_dma_map_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, (op->op_write) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE); + rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count); + if (op->op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); ret = -ENOMEM; /* XXX ? */ goto out; } - op->r_mapped = 1; + op->op_mapped = 1; } - if (!op->r_write) { + if (!op->op_write) { /* Alloc space on the send queue for the fastreg */ work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos); if (work_alloc != 1) { @@ -836,7 +836,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->r_count, rds_iwdev->max_sge); + i = ceil(op->op_count, rds_iwdev->max_sge); work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -847,17 +847,17 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } send = &ic->i_sends[pos]; - if (!op->r_write) { + if (!op->op_write) { first = prev = &ic->i_sends[fr_pos]; } else { first = send; prev = NULL; } - scat = &op->r_sg[0]; + scat = &op->op_sg[0]; sent = 0; - num_sge = op->r_count; + num_sge = op->op_count; - for (i = 0; i < work_alloc && scat != &op->r_sg[op->r_count]; i++) { + for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; @@ -874,13 +874,13 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * for local access after RDS is finished with it, using * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed. */ - if (op->r_write) + if (op->op_write) send->s_wr.opcode = IB_WR_RDMA_WRITE; else send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV; send->s_wr.wr.rdma.remote_addr = remote_addr; - send->s_wr.wr.rdma.rkey = op->r_key; + send->s_wr.wr.rdma.rkey = op->op_rkey; send->s_op = op; if (num_sge > rds_iwdev->max_sge) { @@ -894,7 +894,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) if (prev) prev->s_wr.next = &send->s_wr; - for (j = 0; j < send->s_wr.num_sge && scat != &op->r_sg[op->r_count]; j++) { + for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) { len = ib_sg_dma_len(ic->i_cm_id->device, scat); if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) @@ -928,7 +928,7 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) } /* if we finished the message then send completion owns it */ - if (scat == &op->r_sg[op->r_count]) + if (scat == &op->op_sg[op->op_count]) first->s_wr.send_flags = IB_SEND_SIGNALED; if (i < work_alloc) { @@ -942,9 +942,9 @@ int rds_iw_xmit_rdma(struct rds_connection *conn, struct rds_rdma_op *op) * adapters do not allow using the lkey for this at all. To bypass this use a * fastreg_mr (or possibly a dma_mr) */ - if (!op->r_write) { + if (!op->op_write) { rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos], - op->r_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); + op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr); work_alloc++; } diff --git a/net/rds/message.c b/net/rds/message.c index b53306c..bca7eda 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -69,10 +69,10 @@ static void rds_message_purge(struct rds_message *rm) } rm->data.m_nents = 0; - if (rm->rdma.m_rdma_op.r_active) - rds_rdma_free_op(&rm->rdma.m_rdma_op); - if (rm->rdma.m_rdma_mr) - rds_mr_put(rm->rdma.m_rdma_mr); + if (rm->rdma.op_active) + rds_rdma_free_op(&rm->rdma); + if (rm->rdma.op_rdma_mr) + rds_mr_put(rm->rdma.op_rdma_mr); if (rm->atomic.op_active) rds_atomic_free_op(&rm->atomic); diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 0df86a3..8d22999 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -440,26 +440,26 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) rds_mr_put(mr); } -void rds_rdma_free_op(struct rds_rdma_op *ro) +void rds_rdma_free_op(struct rm_rdma_op *ro) { unsigned int i; - for (i = 0; i < ro->r_nents; i++) { - struct page *page = sg_page(&ro->r_sg[i]); + for (i = 0; i < ro->op_nents; i++) { + struct page *page = sg_page(&ro->op_sg[i]); /* Mark page dirty if it was possibly modified, which * is the case for a RDMA_READ which copies from remote * to local memory */ - if (!ro->r_write) { + if (!ro->op_write) { BUG_ON(irqs_disabled()); set_page_dirty(page); } put_page(page); } - kfree(ro->r_notifier); - ro->r_notifier = NULL; - ro->r_active = 0; + kfree(ro->op_notifier); + ro->op_notifier = NULL; + ro->op_active = 0; } void rds_atomic_free_op(struct rm_atomic_op *ao) @@ -521,7 +521,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, { struct rds_rdma_args *args; struct rds_iovec vec; - struct rds_rdma_op *op = &rm->rdma.m_rdma_op; + struct rm_rdma_op *op = &rm->rdma; unsigned int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; @@ -531,7 +531,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, int ret = 0; if (cmsg->cmsg_len < CMSG_LEN(sizeof(struct rds_rdma_args)) - || rm->rdma.m_rdma_op.r_active) + || rm->rdma.op_active) return -EINVAL; args = CMSG_DATA(cmsg); @@ -556,27 +556,27 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, goto out; } - op->r_write = !!(args->flags & RDS_RDMA_READWRITE); - op->r_fence = !!(args->flags & RDS_RDMA_FENCE); - op->r_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); - op->r_active = 1; - op->r_recverr = rs->rs_recverr; + op->op_write = !!(args->flags & RDS_RDMA_READWRITE); + op->op_fence = !!(args->flags & RDS_RDMA_FENCE); + op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_active = 1; + op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); - op->r_sg = rds_message_alloc_sgs(rm, nr_pages); + op->op_sg = rds_message_alloc_sgs(rm, nr_pages); - if (op->r_notify || op->r_recverr) { + if (op->op_notify || op->op_recverr) { /* We allocate an uninitialized notifier here, because * we don't want to do that in the completion handler. We * would have to use GFP_ATOMIC there, and don't want to deal * with failed allocations. */ - op->r_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); - if (!op->r_notifier) { + op->op_notifier = kmalloc(sizeof(struct rds_notifier), GFP_KERNEL); + if (!op->op_notifier) { ret = -ENOMEM; goto out; } - op->r_notifier->n_user_token = args->user_token; - op->r_notifier->n_status = RDS_RDMA_SUCCESS; + op->op_notifier->n_user_token = args->user_token; + op->op_notifier->n_status = RDS_RDMA_SUCCESS; } /* The cookie contains the R_Key of the remote memory region, and @@ -586,15 +586,15 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, * destination address (which is really an offset into the MR) * FIXME: We may want to move this into ib_rdma.c */ - op->r_key = rds_rdma_cookie_key(args->cookie); - op->r_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); + op->op_rkey = rds_rdma_cookie_key(args->cookie); + op->op_remote_addr = args->remote_vec.addr + rds_rdma_cookie_offset(args->cookie); nr_bytes = 0; rdsdebug("RDS: rdma prepare nr_local %llu rva %llx rkey %x\n", (unsigned long long)args->nr_local, (unsigned long long)args->remote_vec.addr, - op->r_key); + op->op_rkey); local_vec = (struct rds_iovec __user *)(unsigned long) args->local_vec_addr; @@ -617,7 +617,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, /* If it's a WRITE operation, we want to pin the pages for reading. * If it's a READ operation, we need to pin the pages for writing. */ - ret = rds_pin_pages(vec.addr, nr, pages, !op->r_write); + ret = rds_pin_pages(vec.addr, nr, pages, !op->op_write); if (ret < 0) goto out; @@ -630,7 +630,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, unsigned int offset = vec.addr & ~PAGE_MASK; struct scatterlist *sg; - sg = &op->r_sg[op->r_nents + j]; + sg = &op->op_sg[op->op_nents + j]; sg_set_page(sg, pages[j], min_t(unsigned int, vec.bytes, PAGE_SIZE - offset), offset); @@ -642,7 +642,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, vec.bytes -= sg->length; } - op->r_nents += nr; + op->op_nents += nr; } if (nr_bytes > args->remote_vec.bytes) { @@ -652,7 +652,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, ret = -EINVAL; goto out; } - op->r_bytes = nr_bytes; + op->op_bytes = nr_bytes; ret = 0; out: @@ -700,7 +700,7 @@ int rds_cmsg_rdma_dest(struct rds_sock *rs, struct rds_message *rm, if (mr) { mr->r_trans->sync_mr(mr->r_trans_private, DMA_TO_DEVICE); - rm->rdma.m_rdma_mr = mr; + rm->rdma.op_rdma_mr = mr; } return err; } @@ -718,7 +718,7 @@ int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, rm->m_rdma_cookie != 0) return -EINVAL; - return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.m_rdma_mr); + return __rds_rdma_map(rs, CMSG_DATA(cmsg), &rm->m_rdma_cookie, &rm->rdma.op_rdma_mr); } /* diff --git a/net/rds/rds.h b/net/rds/rds.h index 32b3d46..76eeb59 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -230,22 +230,6 @@ struct rds_mr { /* Flags for mr->r_state */ #define RDS_MR_DEAD 0 -struct rds_rdma_op { - u32 r_key; - u64 r_remote_addr; - unsigned int r_write:1; - unsigned int r_fence:1; - unsigned int r_notify:1; - unsigned int r_recverr:1; - unsigned int r_mapped:1; - unsigned int r_active:1; - struct rds_notifier *r_notifier; - unsigned int r_bytes; - unsigned int r_nents; - unsigned int r_count; - struct scatterlist *r_sg; -}; - static inline rds_rdma_cookie_t rds_rdma_make_cookie(u32 r_key, u32 offset) { return r_key | (((u64) offset) << 32); @@ -331,14 +315,27 @@ struct rds_message { unsigned int op_recverr:1; unsigned int op_mapped:1; unsigned int op_active:1; - struct rds_notifier *op_notifier; struct scatterlist *op_sg; + struct rds_notifier *op_notifier; struct rds_mr *op_rdma_mr; } atomic; struct rm_rdma_op { - struct rds_rdma_op m_rdma_op; - struct rds_mr *m_rdma_mr; + u32 op_rkey; + u64 op_remote_addr; + unsigned int op_write:1; + unsigned int op_fence:1; + unsigned int op_notify:1; + unsigned int op_recverr:1; + unsigned int op_mapped:1; + unsigned int op_active:1; + unsigned int op_bytes; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; + struct rds_notifier *op_notifier; + + struct rds_mr *op_rdma_mr; } rdma; struct rm_data_op { unsigned int op_active:1; @@ -418,7 +415,7 @@ struct rds_transport { unsigned int hdr_off, unsigned int sg, unsigned int off); int (*xmit_cong_map)(struct rds_connection *conn, struct rds_cong_map *map, unsigned long offset); - int (*xmit_rdma)(struct rds_connection *conn, struct rds_rdma_op *op); + int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rds_message *rm); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, @@ -727,7 +724,7 @@ int rds_send_acked_before(struct rds_connection *conn, u64 seq); void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, - struct rds_rdma_op *); + struct rm_rdma_op *); /* rdma.c */ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force); @@ -744,7 +741,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); int rds_cmsg_rdma_map(struct rds_sock *rs, struct rds_message *rm, struct cmsghdr *cmsg); -void rds_rdma_free_op(struct rds_rdma_op *ro); +void rds_rdma_free_op(struct rm_rdma_op *ro); void rds_atomic_free_op(struct rm_atomic_op *ao); void rds_rdma_send_complete(struct rds_message *rm, int wc_status); void rds_atomic_send_complete(struct rds_message *rm, int wc_status); diff --git a/net/rds/send.c b/net/rds/send.c index 42fb934..08df279 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -237,7 +237,7 @@ int rds_send_xmit(struct rds_connection *conn) * connection. * Therefore, we never retransmit messages with RDMA ops. */ - if (rm->rdma.m_rdma_op.r_active && + if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) @@ -280,8 +280,8 @@ int rds_send_xmit(struct rds_connection *conn) * keep this simple and require that the transport either * send the whole rdma or none of it. */ - if (rm->rdma.m_rdma_op.r_active && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, &rm->rdma.m_rdma_op); + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) break; conn->c_xmit_rdma_sent = 1; @@ -430,16 +430,16 @@ int rds_send_acked_before(struct rds_connection *conn, u64 seq) void rds_rdma_send_complete(struct rds_message *rm, int status) { struct rds_sock *rs = NULL; - struct rds_rdma_op *ro; + struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; spin_lock_irqsave(&rm->m_rs_lock, flags); - ro = &rm->rdma.m_rdma_op; + ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->r_active && ro->r_notify && ro->r_notifier) { - notifier = ro->r_notifier; + ro->op_active && ro->op_notify && ro->op_notifier) { + notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); @@ -448,7 +448,7 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); spin_unlock(&rs->rs_lock); - ro->r_notifier = NULL; + ro->op_notifier = NULL; } spin_unlock_irqrestore(&rm->m_rs_lock, flags); @@ -503,13 +503,13 @@ EXPORT_SYMBOL_GPL(rds_atomic_send_complete); static inline void __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { - struct rds_rdma_op *ro; + struct rm_rdma_op *ro; - ro = &rm->rdma.m_rdma_op; - if (ro->r_active && ro->r_notify && ro->r_notifier) { - ro->r_notifier->n_status = status; - list_add_tail(&ro->r_notifier->n_list, &rs->rs_notify_queue); - ro->r_notifier = NULL; + ro = &rm->rdma; + if (ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_notifier->n_status = status; + list_add_tail(&ro->op_notifier->n_list, &rs->rs_notify_queue); + ro->op_notifier = NULL; } /* No need to wake the app - caller does this */ @@ -521,7 +521,7 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status * So speed is not an issue here. */ struct rds_message *rds_send_get_message(struct rds_connection *conn, - struct rds_rdma_op *op) + struct rm_rdma_op *op) { struct rds_message *rm, *tmp, *found = NULL; unsigned long flags; @@ -529,7 +529,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, spin_lock_irqsave(&conn->c_lock, flags); list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (&rm->rdma.m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; goto out; @@ -537,7 +537,7 @@ struct rds_message *rds_send_get_message(struct rds_connection *conn, } list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (&rm->rdma.m_rdma_op == op) { + if (&rm->rdma == op) { atomic_inc(&rm->m_refcount); found = rm; break; @@ -597,20 +597,20 @@ void rds_send_remove_from_sock(struct list_head *messages, int status) spin_lock(&rs->rs_lock); if (test_and_clear_bit(RDS_MSG_ON_SOCK, &rm->m_flags)) { - struct rds_rdma_op *ro = &rm->rdma.m_rdma_op; + struct rm_rdma_op *ro = &rm->rdma; struct rds_notifier *notifier; list_del_init(&rm->m_sock_item); rds_send_sndbuf_remove(rs, rm); - if (ro->r_active && ro->r_notifier && - (ro->r_notify || (ro->r_recverr && status))) { - notifier = ro->r_notifier; + if (ro->op_active && ro->op_notifier && + (ro->op_notify || (ro->op_recverr && status))) { + notifier = ro->op_notifier; list_add_tail(¬ifier->n_list, &rs->rs_notify_queue); if (!notifier->n_status) notifier->n_status = status; - rm->rdma.m_rdma_op.r_notifier = NULL; + rm->rdma.op_notifier = NULL; } was_on_sock = 1; rm->m_rs = NULL; @@ -987,11 +987,11 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->rdma.m_rdma_op.r_active) && - !conn->c_trans->xmit_rdma) { + if ((rm->m_rdma_cookie || rm->rdma.op_active) && + !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", - &rm->rdma.m_rdma_op, conn->c_trans->xmit_rdma); + &rm->rdma, conn->c_trans->xmit_rdma); ret = -EOPNOTSUPP; goto out; } -- cgit v1.1 From 6c7cc6e4694dc464ae884332f2a322973497e3cf Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 27 Jan 2010 18:04:18 -0800 Subject: RDS: Rename data op members prefix from m_ to op_ For consistency. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 26 +++++++++++++------------- net/rds/iw_send.c | 24 ++++++++++++------------ net/rds/message.c | 22 +++++++++++----------- net/rds/rds.h | 6 +++--- net/rds/send.c | 10 +++++----- net/rds/tcp_send.c | 14 +++++++------- 6 files changed, 51 insertions(+), 51 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 63981cd1..95f1524 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -76,7 +76,7 @@ static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->data.m_sg, rm->data.m_nents, + rm->data.op_sg, rm->data.op_nents, DMA_TO_DEVICE); if (rm->rdma.op_active) { @@ -513,20 +513,20 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* map the message the first time we see it */ if (!ic->i_rm) { - if (rm->data.m_nents) { - rm->data.m_count = ib_dma_map_sg(dev, - rm->data.m_sg, - rm->data.m_nents, - DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count); - if (rm->data.m_count == 0) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_ib_stats_inc(s_ib_tx_sg_mapping_failure); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->data.m_count = 0; + rm->data.op_count = 0; } rds_message_addref(rm); @@ -583,7 +583,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->data.m_sg[sg]; + scat = &rm->data.op_sg[sg]; i = 0; do { unsigned int len = 0; @@ -604,7 +604,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* Set up the data, if present */ if (i < work_alloc - && scat != &rm->data.m_sg[rm->data.m_count]) { + && scat != &rm->data.op_sg[rm->data.op_count]) { len = min(RDS_FRAG_SIZE, ib_sg_dma_len(dev, scat) - off); send->s_wr.num_sge = 2; @@ -649,7 +649,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, i++; } while (i < work_alloc - && scat != &rm->data.m_sg[rm->data.m_count]); + && scat != &rm->data.op_sg[rm->data.op_count]); /* Account the RDS header in the number of bytes we sent, but just once. * The caller has no concept of fragmentation. */ @@ -657,7 +657,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, bytes_sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->data.m_sg[rm->data.m_count]) { + if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SOLICITED; ic->i_rm = NULL; diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c index 05ebf16..6280ea0 100644 --- a/net/rds/iw_send.c +++ b/net/rds/iw_send.c @@ -82,7 +82,7 @@ static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic, rdsdebug("ic %p send %p rm %p\n", ic, send, rm); ib_dma_unmap_sg(ic->i_cm_id->device, - rm->data.m_sg, rm->data.m_nents, + rm->data.op_sg, rm->data.op_nents, DMA_TO_DEVICE); if (rm->rdma.op_active) { @@ -562,20 +562,20 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, rm->m_inc.i_hdr.h_flags, be32_to_cpu(rm->m_inc.i_hdr.h_len)); */ - if (rm->data.m_nents) { - rm->data.m_count = ib_dma_map_sg(dev, - rm->data.m_sg, - rm->data.m_nents, - DMA_TO_DEVICE); - rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.m_count); - if (rm->data.m_count == 0) { + if (rm->data.op_nents) { + rm->data.op_count = ib_dma_map_sg(dev, + rm->data.op_sg, + rm->data.op_nents, + DMA_TO_DEVICE); + rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count); + if (rm->data.op_count == 0) { rds_iw_stats_inc(s_iw_tx_sg_mapping_failure); rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc); ret = -ENOMEM; /* XXX ? */ goto out; } } else { - rm->data.m_count = 0; + rm->data.op_count = 0; } ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs; @@ -622,7 +622,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->data.m_sg[sg]; + scat = &rm->data.op_sg[sg]; sent = 0; i = 0; @@ -651,7 +651,7 @@ int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm, } /* if there's data reference it with a chain of work reqs */ - for (; i < work_alloc && scat != &rm->data.m_sg[rm->data.m_count]; i++) { + for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) { unsigned int len; send = &ic->i_sends[pos]; @@ -729,7 +729,7 @@ add_header: sent += sizeof(struct rds_header); /* if we finished the message then send completion owns it */ - if (scat == &rm->data.m_sg[rm->data.m_count]) { + if (scat == &rm->data.op_sg[rm->data.op_count]) { prev->s_rm = ic->i_rm; prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; ic->i_rm = NULL; diff --git a/net/rds/message.c b/net/rds/message.c index bca7eda..4bd9504 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -62,12 +62,12 @@ static void rds_message_purge(struct rds_message *rm) if (unlikely(test_bit(RDS_MSG_PAGEVEC, &rm->m_flags))) return; - for (i = 0; i < rm->data.m_nents; i++) { - rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.m_sg[i])); + for (i = 0; i < rm->data.op_nents; i++) { + rdsdebug("putting data page %p\n", (void *)sg_page(&rm->data.op_sg[i])); /* XXX will have to put_page for page refs */ - __free_page(sg_page(&rm->data.m_sg[i])); + __free_page(sg_page(&rm->data.op_sg[i])); } - rm->data.m_nents = 0; + rm->data.op_nents = 0; if (rm->rdma.op_active) rds_rdma_free_op(&rm->rdma); @@ -261,11 +261,11 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in set_bit(RDS_MSG_PAGEVEC, &rm->m_flags); rm->m_inc.i_hdr.h_len = cpu_to_be32(total_len); - rm->data.m_nents = ceil(total_len, PAGE_SIZE); - rm->data.m_sg = rds_message_alloc_sgs(rm, num_sgs); + rm->data.op_nents = ceil(total_len, PAGE_SIZE); + rm->data.op_sg = rds_message_alloc_sgs(rm, num_sgs); - for (i = 0; i < rm->data.m_nents; ++i) { - sg_set_page(&rm->data.m_sg[i], + for (i = 0; i < rm->data.op_nents; ++i) { + sg_set_page(&rm->data.op_sg[i], virt_to_page(page_addrs[i]), PAGE_SIZE, 0); } @@ -288,7 +288,7 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, /* * now allocate and copy in the data payload. */ - sg = rm->data.m_sg; + sg = rm->data.op_sg; iov = first_iov; iov_off = 0; sg_off = 0; /* Dear gcc, sg->page will be null from kzalloc. */ @@ -299,7 +299,7 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, GFP_HIGHUSER); if (ret) goto out; - rm->data.m_nents++; + rm->data.op_nents++; sg_off = 0; } @@ -354,7 +354,7 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, iov = first_iov; iov_off = 0; - sg = rm->data.m_sg; + sg = rm->data.op_sg; vec_off = 0; copied = 0; diff --git a/net/rds/rds.h b/net/rds/rds.h index 76eeb59..d702849 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -339,9 +339,9 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; - unsigned int m_nents; - unsigned int m_count; - struct scatterlist *m_sg; + unsigned int op_nents; + unsigned int op_count; + struct scatterlist *op_sg; } data; }; unsigned int m_used_sgs; diff --git a/net/rds/send.c b/net/rds/send.c index 08df279..d60d313 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -166,7 +166,7 @@ int rds_send_xmit(struct rds_connection *conn) rm = conn->c_xmit_rm; if (rm && conn->c_xmit_hdr_off == sizeof(struct rds_header) && - conn->c_xmit_sg == rm->data.m_nents) { + conn->c_xmit_sg == rm->data.op_nents) { conn->c_xmit_rm = NULL; conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; @@ -296,7 +296,7 @@ int rds_send_xmit(struct rds_connection *conn) if (rm->data.op_active && (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->data.m_nents)) { + conn->c_xmit_sg < rm->data.op_nents)) { ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, @@ -312,7 +312,7 @@ int rds_send_xmit(struct rds_connection *conn) ret -= tmp; } - sg = &rm->data.m_sg[conn->c_xmit_sg]; + sg = &rm->data.op_sg[conn->c_xmit_sg]; while (ret) { tmp = min_t(int, ret, sg->length - conn->c_xmit_data_off); @@ -323,7 +323,7 @@ int rds_send_xmit(struct rds_connection *conn) sg++; conn->c_xmit_sg++; BUG_ON(ret != 0 && - conn->c_xmit_sg == rm->data.m_nents); + conn->c_xmit_sg == rm->data.op_nents); } } } @@ -959,7 +959,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - rm->data.m_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); /* XXX fix this to not allocate memory */ ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); if (ret) diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index d63aa35..53c1de5 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -166,21 +166,21 @@ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, goto out; } - while (sg < rm->data.m_nents) { + while (sg < rm->data.op_nents) { ret = tc->t_sock->ops->sendpage(tc->t_sock, - sg_page(&rm->data.m_sg[sg]), - rm->data.m_sg[sg].offset + off, - rm->data.m_sg[sg].length - off, + sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, + rm->data.op_sg[sg].length - off, MSG_DONTWAIT|MSG_NOSIGNAL); - rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.m_sg[sg]), - rm->data.m_sg[sg].offset + off, rm->data.m_sg[sg].length - off, + rdsdebug("tcp sendpage %p:%u:%u ret %d\n", (void *)sg_page(&rm->data.op_sg[sg]), + rm->data.op_sg[sg].offset + off, rm->data.op_sg[sg].length - off, ret); if (ret <= 0) break; off += ret; done += ret; - if (off == rm->data.m_sg[sg].length) { + if (off == rm->data.op_sg[sg].length) { off = 0; sg++; } -- cgit v1.1 From 5b2366bd2835919e2e6a836e837eab4a9274bd46 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 3 Feb 2010 19:36:44 -0800 Subject: RDS: Rewrite rds_send_xmit Simplify rds_send_xmit(). Send a congestion map (via xmit_cong_map) without decrementing send_quota. Move resetting of conn xmit variables to end of loop. Update comments. Implement a special case to turn off sending an rds header when there is an atomic op and no other data. Signed-off-by: Andy Grover --- net/rds/rds.h | 5 +-- net/rds/send.c | 131 +++++++++++++++++++++++++++++++-------------------------- 2 files changed, 73 insertions(+), 63 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index d702849..13ed30ac 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -96,8 +96,9 @@ struct rds_connection { unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; unsigned int c_xmit_data_off; - unsigned int c_xmit_rdma_sent; unsigned int c_xmit_atomic_sent; + unsigned int c_xmit_rdma_sent; + unsigned int c_xmit_data_sent; spinlock_t c_lock; /* protect msg queues */ u64 c_next_tx_seq; @@ -120,8 +121,6 @@ struct rds_connection { struct list_head c_map_item; unsigned long c_map_queued; - unsigned long c_map_offset; - unsigned long c_map_bytes; unsigned int c_unacked_packets; unsigned int c_unacked_bytes; diff --git a/net/rds/send.c b/net/rds/send.c index d60d313..66dc6b0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -72,8 +72,9 @@ void rds_send_reset(struct rds_connection *conn) conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; conn->c_xmit_atomic_sent = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_data_sent = 0; conn->c_map_queued = 0; @@ -137,69 +138,54 @@ int rds_send_xmit(struct rds_connection *conn) /* * spin trying to push headers and data down the connection until - * the connection doens't make forward progress. + * the connection doesn't make forward progress. */ while (--send_quota) { - /* - * See if need to send a congestion map update if we're - * between sending messages. The send_sem protects our sole - * use of c_map_offset and _bytes. - * Note this is used only by transports that define a special - * xmit_cong_map function. For all others, we create allocate - * a cong_map message and treat it just like any other send. - */ - if (conn->c_map_bytes) { - ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, - conn->c_map_offset); - if (ret <= 0) - break; - conn->c_map_offset += ret; - conn->c_map_bytes -= ret; - if (conn->c_map_bytes) - continue; - } - - /* If we're done sending the current message, clear the - * offset and S/G temporaries. - */ rm = conn->c_xmit_rm; - if (rm && - conn->c_xmit_hdr_off == sizeof(struct rds_header) && - conn->c_xmit_sg == rm->data.op_nents) { - conn->c_xmit_rm = NULL; - conn->c_xmit_sg = 0; - conn->c_xmit_hdr_off = 0; - conn->c_xmit_data_off = 0; - conn->c_xmit_rdma_sent = 0; - conn->c_xmit_atomic_sent = 0; - - /* Release the reference to the previous message. */ - rds_message_put(rm); - rm = NULL; - } - /* If we're asked to send a cong map update, do so. + /* + * If between sending messages, we can send a pending congestion + * map update. + * + * Transports either define a special xmit_cong_map function, + * or we allocate a cong_map message and treat it just like any + * other send. */ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { if (conn->c_trans->xmit_cong_map) { - conn->c_map_offset = 0; - conn->c_map_bytes = sizeof(struct rds_header) + + unsigned long map_offset = 0; + unsigned long map_bytes = sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; - continue; - } - rm = rds_cong_update_alloc(conn); - if (IS_ERR(rm)) { - ret = PTR_ERR(rm); - break; - } + while (map_bytes) { + ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, + map_offset); + if (ret <= 0) { + /* too far down the rabbithole! */ + mutex_unlock(&conn->c_send_lock); + rds_conn_error(conn, "Cong map xmit failed\n"); + goto out; + } + + map_offset += ret; + map_bytes -= ret; + } + } else { + /* send cong update like a normal rm */ + rm = rds_cong_update_alloc(conn); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + break; + } + rm->data.op_active = 1; - conn->c_xmit_rm = rm; + conn->c_xmit_rm = rm; + } } /* - * Grab the next message from the send queue, if there is one. + * If not already working on one, grab the next message. * * c_xmit_rm holds a ref while we're sending this message down * the connction. We can use this ref while holding the @@ -264,7 +250,6 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_rm = rm; } - if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { ret = conn->c_trans->xmit_atomic(conn, rm); if (ret) @@ -273,13 +258,20 @@ int rds_send_xmit(struct rds_connection *conn) /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); + + /* + * This is evil, muahaha. + * We permit 0-byte sends. (rds-ping depends on this.) + * BUT if there is an atomic op and no sent data, + * we turn off sending the header, to achieve + * "silent" atomics. + * But see below; RDMA op might toggle this back on! + */ + if (rm->data.op_nents == 0) + rm->data.op_active = 0; } - /* - * Try and send an rdma message. Let's see if we can - * keep this simple and require that the transport either - * send the whole rdma or none of it. - */ + /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) @@ -294,9 +286,7 @@ int rds_send_xmit(struct rds_connection *conn) set_bit(RDS_MSG_MAPPED, &rm->m_flags); } - if (rm->data.op_active - && (conn->c_xmit_hdr_off < sizeof(struct rds_header) || - conn->c_xmit_sg < rm->data.op_nents)) { + if (rm->data.op_active && !conn->c_xmit_data_sent) { ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, @@ -326,6 +316,27 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_sg == rm->data.op_nents); } } + + if (conn->c_xmit_hdr_off == sizeof(struct rds_header) && + (conn->c_xmit_sg == rm->data.op_nents)) + conn->c_xmit_data_sent = 1; + } + + /* + * A rm will only take multiple times through this loop + * if there is a data op. Thus, if the data is sent (or there was + * none), then we're done with the rm. + */ + if (!rm->data.op_active || conn->c_xmit_data_sent) { + conn->c_xmit_rm = NULL; + conn->c_xmit_sg = 0; + conn->c_xmit_hdr_off = 0; + conn->c_xmit_data_off = 0; + conn->c_xmit_rdma_sent = 0; + conn->c_xmit_atomic_sent = 0; + conn->c_xmit_data_sent = 0; + + rds_message_put(rm); } } @@ -350,7 +361,7 @@ int rds_send_xmit(struct rds_connection *conn) */ mutex_unlock(&conn->c_send_lock); - if (conn->c_map_bytes || (send_quota == 0 && !was_empty)) { + if (send_quota == 0 && !was_empty) { /* We exhausted the send quota, but there's work left to * do. Return and (re-)schedule the send worker. */ -- cgit v1.1 From 372cd7dedfd1ea93a9ae8d9c282e910dc1b76773 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 3 Feb 2010 19:40:32 -0800 Subject: RDS: Do not set op_active in r_m_copy_from_user(). Do not allocate sgs for data for 0-length datagrams Set data.op_active in rds_sendmsg() instead of rds_message_copy_from_user(). Signed-off-by: Andy Grover --- net/rds/message.c | 2 -- net/rds/send.c | 13 ++++++++----- 2 files changed, 8 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 4bd9504..f4c3be0 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -330,8 +330,6 @@ int rds_message_copy_from_user(struct rds_message *rm, struct iovec *first_iov, sg++; } - rm->data.op_active = 1; - out: return ret; } diff --git a/net/rds/send.c b/net/rds/send.c index 66dc6b0..ad89a63 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -970,11 +970,14 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); - /* XXX fix this to not allocate memory */ - ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); - if (ret) - goto out; + /* Attach data to the rm */ + if (payload_len) { + rm->data.op_sg = rds_message_alloc_sgs(rm, ceil(payload_len, PAGE_SIZE)); + ret = rds_message_copy_from_user(rm, msg->msg_iov, payload_len); + if (ret) + goto out; + } + rm->data.op_active = 1; rm->m_daddr = daddr; -- cgit v1.1 From ee4c7b47e46a9dea789aadb8279c8505f755b3ee Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 3 Feb 2010 19:41:52 -0800 Subject: RDS: Add a warning if trying to allocate 0 sgs rds_message_alloc_sgs() only works when nents is nonzero. Signed-off-by: Andy Grover --- net/rds/message.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index f4c3be0..1f73a73 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -240,6 +240,7 @@ struct scatterlist *rds_message_alloc_sgs(struct rds_message *rm, int nents) struct scatterlist *sg_ret; WARN_ON(rm->m_used_sgs + nents > rm->m_total_sgs); + WARN_ON(!nents); sg_ret = &sg_first[rm->m_used_sgs]; sg_init_table(sg_ret, nents); -- cgit v1.1 From 940786eb0a0faf3f30898a1cc7c1540d54c1aff6 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 19 Feb 2010 18:04:58 -0800 Subject: RDS: queue failure notifications for dropped atomic ops When dropping ops in the send queue, we notify the client of failed rdma ops they asked for notifications on, but not atomic ops. It should be for both. Signed-off-by: Andy Grover --- net/rds/send.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index ad89a63..cdca974 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -512,9 +512,10 @@ EXPORT_SYMBOL_GPL(rds_atomic_send_complete); * socket, socket lock) and can just move the notifier. */ static inline void -__rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) +__rds_send_complete(struct rds_sock *rs, struct rds_message *rm, int status) { struct rm_rdma_op *ro; + struct rm_atomic_op *ao; ro = &rm->rdma; if (ro->op_active && ro->op_notify && ro->op_notifier) { @@ -523,6 +524,13 @@ __rds_rdma_send_complete(struct rds_sock *rs, struct rds_message *rm, int status ro->op_notifier = NULL; } + ao = &rm->atomic; + if (ao->op_active && ao->op_notify && ao->op_notifier) { + ao->op_notifier->n_status = status; + list_add_tail(&ao->op_notifier->n_list, &rs->rs_notify_queue); + ao->op_notifier = NULL; + } + /* No need to wake the app - caller does this */ } @@ -733,7 +741,7 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest) spin_lock_irqsave(&rm->m_rs_lock, flags); spin_lock(&rs->rs_lock); - __rds_rdma_send_complete(rs, rm, RDS_RDMA_CANCELED); + __rds_send_complete(rs, rm, RDS_RDMA_CANCELED); spin_unlock(&rs->rs_lock); rm->m_rs = NULL; -- cgit v1.1 From 7e3bd65ebfd5d6cd76b8b979920c632d6e6b4b2a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 1 Mar 2010 16:04:59 -0800 Subject: RDS: Move some variables around for consistency Also, add a comment. Signed-off-by: Andy Grover --- net/rds/rdma.c | 3 +-- net/rds/rds.h | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 8d22999..5ba5146 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -747,6 +747,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); @@ -782,8 +783,6 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, rm->atomic.op_rkey = rds_rdma_cookie_key(args->cookie); rm->atomic.op_remote_addr = args->remote_addr + rds_rdma_cookie_offset(args->cookie); - rm->atomic.op_active = 1; - return ret; err: if (page) diff --git a/net/rds/rds.h b/net/rds/rds.h index 13ed30ac..46d190d 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -301,7 +301,13 @@ struct rds_message { */ spinlock_t m_rs_lock; struct rds_sock *m_rs; + + /* cookie to send to remote, in rds header */ rds_rdma_cookie_t m_rdma_cookie; + + unsigned int m_used_sgs; + unsigned int m_total_sgs; + struct { struct rm_atomic_op { int op_type; @@ -343,8 +349,6 @@ struct rds_message { struct scatterlist *op_sg; } data; }; - unsigned int m_used_sgs; - unsigned int m_total_sgs; }; /* -- cgit v1.1 From 2c3a5f9abb1dc5efdab8ba9a568b1661c65fd1e3 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 1 Mar 2010 16:10:40 -0800 Subject: RDS: Add flag for silent ops. Do atomic op before RDMA Add a flag to the API so users can indicate they want silent operations. This is needed because silent ops cannot be used with USE_ONCE MRs, so we can't just assume silent. Also, change send_xmit to do atomic op before rdma op if both are present, and centralize the hairy logic to determine if we want to attempt silent, or not. Signed-off-by: Andy Grover --- net/rds/rdma.c | 2 ++ net/rds/rds.h | 2 ++ net/rds/send.c | 55 +++++++++++++++++++++++++++++++------------------------ 3 files changed, 35 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 5ba5146..48781fe 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -559,6 +559,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, op->op_write = !!(args->flags & RDS_RDMA_READWRITE); op->op_fence = !!(args->flags & RDS_RDMA_FENCE); op->op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + op->op_silent = !!(args->flags & RDS_RDMA_SILENT); op->op_active = 1; op->op_recverr = rs->rs_recverr; WARN_ON(!nr_pages); @@ -747,6 +748,7 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + rm->atomic.op_silent = !!(args->flags & RDS_RDMA_SILENT); rm->atomic.op_active = 1; rm->atomic.op_recverr = rs->rs_recverr; rm->atomic.op_sg = rds_message_alloc_sgs(rm, 1); diff --git a/net/rds/rds.h b/net/rds/rds.h index 46d190d..23b9210 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -319,6 +319,7 @@ struct rds_message { unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; + unsigned int op_silent:1; unsigned int op_active:1; struct scatterlist *op_sg; struct rds_notifier *op_notifier; @@ -333,6 +334,7 @@ struct rds_message { unsigned int op_notify:1; unsigned int op_recverr:1; unsigned int op_mapped:1; + unsigned int op_silent:1; unsigned int op_active:1; unsigned int op_bytes; unsigned int op_nents; diff --git a/net/rds/send.c b/net/rds/send.c index cdca974..38567f3 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -250,42 +250,50 @@ int rds_send_xmit(struct rds_connection *conn) conn->c_xmit_rm = rm; } - if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { - ret = conn->c_trans->xmit_atomic(conn, rm); + /* The transport either sends the whole rdma or none of it */ + if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) break; - conn->c_xmit_atomic_sent = 1; + conn->c_xmit_rdma_sent = 1; + /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); - - /* - * This is evil, muahaha. - * We permit 0-byte sends. (rds-ping depends on this.) - * BUT if there is an atomic op and no sent data, - * we turn off sending the header, to achieve - * "silent" atomics. - * But see below; RDMA op might toggle this back on! - */ - if (rm->data.op_nents == 0) - rm->data.op_active = 0; } - /* The transport either sends the whole rdma or none of it */ - if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { - ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); + if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + ret = conn->c_trans->xmit_atomic(conn, rm); if (ret) break; - conn->c_xmit_rdma_sent = 1; - - /* rdmas need data sent, even if just the header */ - rm->data.op_active = 1; - + conn->c_xmit_atomic_sent = 1; /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); } + /* + * A number of cases require an RDS header to be sent + * even if there is no data. + * We permit 0-byte sends; rds-ping depends on this. + * However, if there are exclusively attached silent ops, + * we skip the hdr/data send, to enable silent operation. + */ + if (rm->data.op_nents == 0) { + int ops_present; + int all_ops_are_silent = 1; + + ops_present = (rm->atomic.op_active || rm->rdma.op_active); + if (rm->atomic.op_active && !rm->atomic.op_silent) + all_ops_are_silent = 0; + if (rm->rdma.op_active && !rm->rdma.op_silent) + all_ops_are_silent = 0; + + if (ops_present && all_ops_are_silent + && !rm->m_rdma_cookie) + rm->data.op_active = 0; + } + if (rm->data.op_active && !conn->c_xmit_data_sent) { ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, @@ -1009,8 +1017,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, if (ret) goto out; - if ((rm->m_rdma_cookie || rm->rdma.op_active) && - !conn->c_trans->xmit_rdma) { + if (rm->rdma.op_active && !conn->c_trans->xmit_rdma) { if (printk_ratelimit()) printk(KERN_NOTICE "rdma_op %p conn xmit_rdma %p\n", &rm->rdma, conn->c_trans->xmit_rdma); -- cgit v1.1 From aa0a4ef4ac3a3c5ffa35e32520bfbc0922ef3630 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 13 Apr 2010 12:00:35 -0700 Subject: RDS: Make sure cmsgs aren't used in improper ways It hasn't cropped up in the field, but this code ensures it is impossible to issue operations that pass an rdma cookie (DEST, MAP) in the same sendmsg call that's actually initiating rdma or atomic ops. Disallowing this perverse-but-technically-allowed usage makes silent RDMA heuristics slightly easier. Signed-off-by: Andy Grover --- net/rds/send.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 38567f3..69ab1040 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -846,6 +846,7 @@ static int rds_rm_size(struct msghdr *msg, int data_len) { struct cmsghdr *cmsg; int size = 0; + int cmsg_groups = 0; int retval; for (cmsg = CMSG_FIRSTHDR(msg); cmsg; cmsg = CMSG_NXTHDR(msg, cmsg)) { @@ -857,19 +858,23 @@ static int rds_rm_size(struct msghdr *msg, int data_len) switch (cmsg->cmsg_type) { case RDS_CMSG_RDMA_ARGS: + cmsg_groups |= 1; retval = rds_rdma_extra_size(CMSG_DATA(cmsg)); if (retval < 0) return retval; size += retval; + break; case RDS_CMSG_RDMA_DEST: case RDS_CMSG_RDMA_MAP: + cmsg_groups |= 2; /* these are valid but do no add any size */ break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: + cmsg_groups |= 1; size += sizeof(struct scatterlist); break; @@ -881,6 +886,10 @@ static int rds_rm_size(struct msghdr *msg, int data_len) size += ceil(data_len, PAGE_SIZE) * sizeof(struct scatterlist); + /* Ensure (DEST, MAP) are never used with (ARGS, ATOMIC) */ + if (cmsg_groups == 3) + return -EINVAL; + return size; } -- cgit v1.1 From ff3d7d36134ef7138803734fdbf91cc986ea7976 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 1 Mar 2010 14:03:09 -0800 Subject: RDS: Perform unmapping ops in stages Previously, RDS would wait until the final send WR had completed and then handle cleanup. With silent ops, we do not know if an atomic, rdma, or data op will be last. This patch handles any of these cases by keeping a pointer to the last op in the message in m_last_op. When the TX completion event fires, rds dispatches to per-op-type cleanup functions, and then does whole-message cleanup, if the last op equalled m_last_op. This patch also moves towards having op-specific functions take the op struct, instead of the overall rm struct. rds_ib_connection has a pointer to keep track of a a partially- completed data send operation. This patch changes it from an rds_message pointer to the narrower rm_data_op pointer, and modifies places that use this pointer as needed. Signed-off-by: Andy Grover --- net/rds/ib.h | 7 +- net/rds/ib_cm.c | 9 +- net/rds/ib_send.c | 242 ++++++++++++++++++++++++++++-------------------------- net/rds/rds.h | 4 +- net/rds/send.c | 6 +- 5 files changed, 142 insertions(+), 126 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index d64b508..202140a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -53,8 +53,7 @@ struct rds_ib_connect_private { }; struct rds_ib_send_work { - struct rds_message *s_rm; - struct rm_rdma_op *s_op; + void *s_op; struct ib_send_wr s_wr; struct ib_sge s_sge[RDS_IB_MAX_SGE]; unsigned long s_queued; @@ -92,7 +91,7 @@ struct rds_ib_connection { /* tx */ struct rds_ib_work_ring i_send_ring; - struct rds_message *i_rm; + struct rm_data_op *i_data_op; struct rds_header *i_send_hdrs; u64 i_send_hdrs_dma; struct rds_ib_send_work *i_sends; @@ -336,7 +335,7 @@ void rds_ib_send_add_credits(struct rds_connection *conn, unsigned int credits); void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted); int rds_ib_send_grab_credits(struct rds_ib_connection *ic, u32 wanted, u32 *adv_credits, int need_posted, int max_posted); -int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm); +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op); /* ib_stats.c */ DECLARE_PER_CPU(struct rds_ib_statistics, rds_ib_stats); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 8b0c743..b5b5ebb 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -673,9 +673,12 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) BUG_ON(ic->rds_ibdev); /* Clear pending transmit */ - if (ic->i_rm) { - rds_message_put(ic->i_rm); - ic->i_rm = NULL; + if (ic->i_data_op) { + struct rds_message *rm; + + rm = container_of(ic->i_data_op, struct rds_message, data); + rds_message_put(rm); + ic->i_data_op = NULL; } /* Clear the ACK state */ diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 95f1524..6461a15 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -67,80 +67,122 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } -static void rds_ib_send_unmap_rm(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, - int wc_status) +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) { - struct rds_message *rm = send->s_rm; - - rdsdebug("ic %p send %p rm %p\n", ic, send, rm); - - ib_dma_unmap_sg(ic->i_cm_id->device, - rm->data.op_sg, rm->data.op_nents, - DMA_TO_DEVICE); + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); +} - if (rm->rdma.op_active) { - struct rm_rdma_op *op = &rm->rdma; +static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, + struct rm_rdma_op *op, + int wc_status) +{ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); + op->op_mapped = 0; + } - if (op->op_mapped) { - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - op->op_mapped = 0; - } + /* If the user asked for a completion notification on this + * message, we can implement three different semantics: + * 1. Notify when we received the ACK on the RDS message + * that was queued with the RDMA. This provides reliable + * notification of RDMA status at the expense of a one-way + * packet delay. + * 2. Notify when the IB stack gives us the completion event for + * the RDMA operation. + * 3. Notify when the IB stack gives us the completion event for + * the accompanying RDS messages. + * Here, we implement approach #3. To implement approach #2, + * we would need to take an event for the rdma WR. To implement #1, + * don't call rds_rdma_send_complete at all, and fall back to the notify + * handling in the ACK processing code. + * + * Note: There's no need to explicitly sync any RDMA buffers using + * ib_dma_sync_sg_for_cpu - the completion for the RDMA + * operation itself unmapped the RDMA buffers, which takes care + * of synching. + */ + rds_ib_send_complete(container_of(op, struct rds_message, rdma), + wc_status, rds_rdma_send_complete); - /* If the user asked for a completion notification on this - * message, we can implement three different semantics: - * 1. Notify when we received the ACK on the RDS message - * that was queued with the RDMA. This provides reliable - * notification of RDMA status at the expense of a one-way - * packet delay. - * 2. Notify when the IB stack gives us the completion event for - * the RDMA operation. - * 3. Notify when the IB stack gives us the completion event for - * the accompanying RDS messages. - * Here, we implement approach #3. To implement approach #2, - * call rds_rdma_send_complete from the cq_handler. To implement #1, - * don't call rds_rdma_send_complete at all, and fall back to the notify - * handling in the ACK processing code. - * - * Note: There's no need to explicitly sync any RDMA buffers using - * ib_dma_sync_sg_for_cpu - the completion for the RDMA - * operation itself unmapped the RDMA buffers, which takes care - * of synching. - */ - rds_ib_send_complete(rm, wc_status, rds_rdma_send_complete); + if (op->op_write) + rds_stats_add(s_send_rdma_bytes, op->op_bytes); + else + rds_stats_add(s_recv_rdma_bytes, op->op_bytes); +} - if (rm->rdma.op_write) - rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes); - else - rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes); +static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, + struct rm_atomic_op *op, + int wc_status) +{ + /* unmap atomic recvbuf */ + if (op->op_mapped) { + ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, + DMA_FROM_DEVICE); + op->op_mapped = 0; } - if (rm->atomic.op_active) { - struct rm_atomic_op *op = &rm->atomic; - - /* unmap atomic recvbuf */ - if (op->op_mapped) { - ib_dma_unmap_sg(ic->i_cm_id->device, op->op_sg, 1, - DMA_FROM_DEVICE); - op->op_mapped = 0; - } + rds_ib_send_complete(container_of(op, struct rds_message, atomic), + wc_status, rds_atomic_send_complete); - rds_ib_send_complete(rm, wc_status, rds_atomic_send_complete); + if (op->op_type == RDS_ATOMIC_TYPE_CSWP) + rds_stats_inc(s_atomic_cswp); + else + rds_stats_inc(s_atomic_fadd); +} - if (rm->atomic.op_type == RDS_ATOMIC_TYPE_CSWP) - rds_stats_inc(s_atomic_cswp); - else - rds_stats_inc(s_atomic_fadd); +/* + * Unmap the resources associated with a struct send_work. + * + * Returns the rm for no good reason other than it is unobtainable + * other than by switching on wr.opcode, currently, and the caller, + * the event handler, needs it. + */ +static struct rds_message *rds_ib_send_unmap_op(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + int wc_status) +{ + struct rds_message *rm = NULL; + + /* In the error case, wc.opcode sometimes contains garbage */ + switch (send->s_wr.opcode) { + case IB_WR_SEND: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, data); + rds_ib_send_unmap_data(ic, send->s_op, wc_status); + } + break; + case IB_WR_RDMA_WRITE: + case IB_WR_RDMA_READ: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, rdma); + rds_ib_send_unmap_rdma(ic, send->s_op, wc_status); + } + break; + case IB_WR_ATOMIC_FETCH_AND_ADD: + case IB_WR_ATOMIC_CMP_AND_SWP: + if (send->s_op) { + rm = container_of(send->s_op, struct rds_message, atomic); + rds_ib_send_unmap_atomic(ic, send->s_op, wc_status); + } + break; + default: + if (printk_ratelimit()) + printk(KERN_NOTICE + "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", + __func__, send->s_wr.opcode); + break; } - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); + send->s_wr.opcode = 0xdead; - rds_message_put(rm); - send->s_rm = NULL; + return rm; } void rds_ib_send_init_ring(struct rds_ib_connection *ic) @@ -151,7 +193,6 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic) for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { struct ib_sge *sge; - send->s_rm = NULL; send->s_op = NULL; send->s_wr.wr_id = i; @@ -173,9 +214,8 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) u32 i; for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) { - if (!send->s_rm || send->s_wr.opcode == 0xdead) - continue; - rds_ib_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR); + if (send->s_op && send->s_wr.opcode != 0xdead) + rds_ib_send_unmap_op(ic, send, IB_WC_WR_FLUSH_ERR); } } @@ -189,6 +229,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) { struct rds_connection *conn = context; struct rds_ib_connection *ic = conn->c_transport_data; + struct rds_message *rm = NULL; struct ib_wc wc; struct rds_ib_send_work *send; u32 completed; @@ -222,42 +263,18 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; - /* In the error case, wc.opcode sometimes contains garbage */ - switch (send->s_wr.opcode) { - case IB_WR_SEND: - case IB_WR_RDMA_WRITE: - case IB_WR_RDMA_READ: - case IB_WR_ATOMIC_FETCH_AND_ADD: - case IB_WR_ATOMIC_CMP_AND_SWP: - if (send->s_rm) - rds_ib_send_unmap_rm(ic, send, wc.status); - break; - default: - if (printk_ratelimit()) - printk(KERN_NOTICE - "RDS/IB: %s: unexpected opcode 0x%x in WR!\n", - __func__, send->s_wr.opcode); - break; - } + rm = rds_ib_send_unmap_op(ic, send, wc.status); - send->s_wr.opcode = 0xdead; - send->s_wr.num_sge = 1; if (send->s_queued + HZ/2 < jiffies) rds_ib_stats_inc(s_ib_tx_stalled); - /* If a RDMA operation produced an error, signal this right - * away. If we don't, the subsequent SEND that goes with this - * RDMA will be canceled with ERR_WFLUSH, and the application - * never learn that the RDMA failed. */ - if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) { - struct rds_message *rm; - - rm = rds_send_get_message(conn, send->s_op); - if (rm) { - rds_ib_send_unmap_rm(ic, send, wc.status); - rds_ib_send_complete(rm, wc.status, rds_rdma_send_complete); - rds_message_put(rm); - } + if (&send->s_op == &rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + + rds_message_put(rm); + send->s_op = NULL; } oldest = (oldest + 1) % ic->i_send_ring.w_nr; @@ -512,7 +529,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } /* map the message the first time we see it */ - if (!ic->i_rm) { + if (!ic->i_data_op) { if (rm->data.op_nents) { rm->data.op_count = ib_dma_map_sg(dev, rm->data.op_sg, @@ -530,7 +547,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, } rds_message_addref(rm); - ic->i_rm = rm; + ic->i_data_op = &rm->data; /* Finalize the header */ if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags)) @@ -583,7 +600,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, send = &ic->i_sends[pos]; first = send; prev = NULL; - scat = &rm->data.op_sg[sg]; + scat = &ic->i_data_op->op_sg[sg]; i = 0; do { unsigned int len = 0; @@ -658,9 +675,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, /* if we finished the message then send completion owns it */ if (scat == &rm->data.op_sg[rm->data.op_count]) { - prev->s_rm = ic->i_rm; + prev->s_op = ic->i_data_op; prev->s_wr.send_flags |= IB_SEND_SOLICITED; - ic->i_rm = NULL; + ic->i_data_op = NULL; } /* Put back wrs & credits we didn't use */ @@ -681,9 +698,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); - if (prev->s_rm) { - ic->i_rm = prev->s_rm; - prev->s_rm = NULL; + if (prev->s_op) { + ic->i_data_op = prev->s_op; + prev->s_op = NULL; } rds_ib_conn_error(ic->conn, "ib_post_send failed\n"); @@ -701,10 +718,9 @@ out: * A simplified version of the rdma case, we always map 1 SG, and * only 8 bytes, for the return value from the atomic operation. */ -int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm) +int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) { struct rds_ib_connection *ic = conn->c_transport_data; - struct rm_atomic_op *op = &rm->atomic; struct rds_ib_send_work *send = NULL; struct ib_send_wr *failed_wr; struct rds_ib_device *rds_ibdev; @@ -741,14 +757,6 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rds_message *rm) send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; send->s_wr.wr.atomic.rkey = op->op_rkey; - /* - * If there is no data or rdma ops in the message, then - * we must fill in s_rm ourselves, so we properly clean up - * on completion. - */ - if (!rm->rdma.op_active && !rm->data.op_active) - send->s_rm = rm; - /* map 8 byte retval buffer to the device */ ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); rdsdebug("ic %p mapping atomic op %p. mapped %d pg\n", ic, op, ret); @@ -809,7 +817,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - /* map the message the first time we see it */ + /* map the op the first time we see it */ if (!op->op_mapped) { op->op_count = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, op->op_nents, (op->op_write) ? diff --git a/net/rds/rds.h b/net/rds/rds.h index 23b9210..7291f00 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -308,6 +308,8 @@ struct rds_message { unsigned int m_used_sgs; unsigned int m_total_sgs; + void *m_final_op; + struct { struct rm_atomic_op { int op_type; @@ -421,7 +423,7 @@ struct rds_transport { int (*xmit_cong_map)(struct rds_connection *conn, struct rds_cong_map *map, unsigned long offset); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); - int (*xmit_atomic)(struct rds_connection *conn, struct rds_message *rm); + int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv)(struct rds_connection *conn); int (*inc_copy_to_user)(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/send.c b/net/rds/send.c index 69ab1040..d1f364e 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -252,6 +252,7 @@ int rds_send_xmit(struct rds_connection *conn) /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rm->m_final_op = &rm->rdma; ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); if (ret) break; @@ -263,10 +264,12 @@ int rds_send_xmit(struct rds_connection *conn) } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { - ret = conn->c_trans->xmit_atomic(conn, rm); + rm->m_final_op = &rm->atomic; + ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); if (ret) break; conn->c_xmit_atomic_sent = 1; + /* The transport owns the mapped memory for now. * You can't unmap it while it's on the send queue */ set_bit(RDS_MSG_MAPPED, &rm->m_flags); @@ -295,6 +298,7 @@ int rds_send_xmit(struct rds_connection *conn) } if (rm->data.op_active && !conn->c_xmit_data_sent) { + rm->m_final_op = &rm->data; ret = conn->c_trans->xmit(conn, rm, conn->c_xmit_hdr_off, conn->c_xmit_sg, -- cgit v1.1 From e32b4a70495aac6a612e13f4c21db09dd756ff2c Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 3 Mar 2010 19:25:21 -0800 Subject: RDS/IB: Do not wait for send ring to be empty on conn shutdown Now that we are signaling send completions much less, we are likely to have dirty entries in the send queue when the connection is shut down (on rmmod, for example.) These are cleaned up a little further down in conn_shutdown, but if we wait on the ring_empty_wait for them, it'll never happen, and we hand on unload. Signed-off-by: Andy Grover --- net/rds/ib_cm.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index b5b5ebb..1a91af7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -620,8 +620,11 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) ic->i_cm_id, err); } + /* + * Don't wait for the send ring to be empty -- there may be completed + * non-signaled entries sitting on there. We unmap these below. + */ wait_event(rds_ib_ring_empty_wait, - rds_ib_ring_empty(&ic->i_send_ring) && rds_ib_ring_empty(&ic->i_recv_ring)); if (ic->i_send_hdrs) -- cgit v1.1 From 77dd550e5547846604ff6f90c4dc6bba4414e485 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 22 Mar 2010 15:22:04 -0700 Subject: RDS: Stop supporting old cong map sending method We now ask the transport to give us a rm for the congestion map, and then we handle it normally. Previously, the transport defined a function that we would call to send a congestion map. Convert TCP and loop transports to new cong map method. Signed-off-by: Andy Grover --- net/rds/ib.c | 1 - net/rds/iw.c | 1 - net/rds/loop.c | 19 ++++++------------- net/rds/rds.h | 6 ------ net/rds/send.c | 39 +++++++-------------------------------- net/rds/tcp.c | 1 - net/rds/tcp.h | 2 -- net/rds/tcp_send.c | 50 -------------------------------------------------- 8 files changed, 13 insertions(+), 106 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 932dacb..927c481 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -265,7 +265,6 @@ struct rds_transport rds_ib_transport = { .laddr_check = rds_ib_laddr_check, .xmit_complete = rds_ib_xmit_complete, .xmit = rds_ib_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_ib_xmit_rdma, .xmit_atomic = rds_ib_xmit_atomic, .recv = rds_ib_recv, diff --git a/net/rds/iw.c b/net/rds/iw.c index e766aec..467790d 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -264,7 +264,6 @@ struct rds_transport rds_iw_transport = { .laddr_check = rds_iw_laddr_check, .xmit_complete = rds_iw_xmit_complete, .xmit = rds_iw_xmit, - .xmit_cong_map = NULL, .xmit_rdma = rds_iw_xmit_rdma, .recv = rds_iw_recv, .conn_alloc = rds_iw_conn_alloc, diff --git a/net/rds/loop.c b/net/rds/loop.c index 4a3dd49..c390156 100644 --- a/net/rds/loop.c +++ b/net/rds/loop.c @@ -61,6 +61,12 @@ static int rds_loop_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { + /* Do not send cong updates to loopback */ + if (rm->m_inc.i_hdr.h_flags & RDS_FLAG_CONG_BITMAP) { + rds_cong_map_updated(conn->c_fcong, ~(u64) 0); + return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; + } + BUG_ON(hdr_off || sg || off); rds_inc_init(&rm->m_inc, conn, conn->c_laddr); @@ -88,18 +94,6 @@ static void rds_loop_inc_free(struct rds_incoming *inc) rds_message_put(rm); } -static int rds_loop_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, - unsigned long offset) -{ - BUG_ON(offset); - BUG_ON(map != conn->c_lcong); - - rds_cong_map_updated(conn->c_fcong, ~(u64) 0); - - return sizeof(struct rds_header) + RDS_CONG_MAP_BYTES; -} - /* we need to at least give the thread something to succeed */ static int rds_loop_recv(struct rds_connection *conn) { @@ -180,7 +174,6 @@ void rds_loop_exit(void) */ struct rds_transport rds_loop_transport = { .xmit = rds_loop_xmit, - .xmit_cong_map = rds_loop_xmit_cong_map, .recv = rds_loop_recv, .conn_alloc = rds_loop_conn_alloc, .conn_free = rds_loop_conn_free, diff --git a/net/rds/rds.h b/net/rds/rds.h index 7291f00..e81d7e4 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -393,10 +393,6 @@ struct rds_notifier { * transport is responsible for other serialization, including * rds_recv_incoming(). This is called in process context but * should try hard not to block. - * - * @xmit_cong_map: This asks the transport to send the local bitmap down the - * given connection. XXX get a better story about the bitmap - * flag and header. */ #define RDS_TRANS_IB 0 @@ -420,8 +416,6 @@ struct rds_transport { void (*xmit_complete)(struct rds_connection *conn); int (*xmit)(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); - int (*xmit_cong_map)(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); int (*xmit_rdma)(struct rds_connection *conn, struct rm_rdma_op *op); int (*xmit_atomic)(struct rds_connection *conn, struct rm_atomic_op *op); int (*recv)(struct rds_connection *conn); diff --git a/net/rds/send.c b/net/rds/send.c index d1f364e..8a0647a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -147,41 +147,16 @@ int rds_send_xmit(struct rds_connection *conn) /* * If between sending messages, we can send a pending congestion * map update. - * - * Transports either define a special xmit_cong_map function, - * or we allocate a cong_map message and treat it just like any - * other send. */ if (!rm && test_and_clear_bit(0, &conn->c_map_queued)) { - if (conn->c_trans->xmit_cong_map) { - unsigned long map_offset = 0; - unsigned long map_bytes = sizeof(struct rds_header) + - RDS_CONG_MAP_BYTES; - - while (map_bytes) { - ret = conn->c_trans->xmit_cong_map(conn, conn->c_lcong, - map_offset); - if (ret <= 0) { - /* too far down the rabbithole! */ - mutex_unlock(&conn->c_send_lock); - rds_conn_error(conn, "Cong map xmit failed\n"); - goto out; - } - - map_offset += ret; - map_bytes -= ret; - } - } else { - /* send cong update like a normal rm */ - rm = rds_cong_update_alloc(conn); - if (IS_ERR(rm)) { - ret = PTR_ERR(rm); - break; - } - rm->data.op_active = 1; - - conn->c_xmit_rm = rm; + rm = rds_cong_update_alloc(conn); + if (IS_ERR(rm)) { + ret = PTR_ERR(rm); + break; } + rm->data.op_active = 1; + + conn->c_xmit_rm = rm; } /* diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 8318816..3262992f5 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -258,7 +258,6 @@ struct rds_transport rds_tcp_transport = { .laddr_check = rds_tcp_laddr_check, .xmit_prepare = rds_tcp_xmit_prepare, .xmit_complete = rds_tcp_xmit_complete, - .xmit_cong_map = rds_tcp_xmit_cong_map, .xmit = rds_tcp_xmit, .recv = rds_tcp_recv, .conn_alloc = rds_tcp_conn_alloc, diff --git a/net/rds/tcp.h b/net/rds/tcp.h index c639872..16b1663 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -80,8 +80,6 @@ void rds_tcp_xmit_complete(struct rds_connection *conn); int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); void rds_tcp_write_space(struct sock *sk); -int rds_tcp_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset); /* tcp_stats.c */ DECLARE_PER_CPU(struct rds_tcp_statistics, rds_tcp_stats); diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index 53c1de5..2979fb4 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -77,56 +77,6 @@ int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) } /* the core send_sem serializes this with other xmit and shutdown */ -int rds_tcp_xmit_cong_map(struct rds_connection *conn, - struct rds_cong_map *map, unsigned long offset) -{ - static struct rds_header rds_tcp_map_header = { - .h_flags = RDS_FLAG_CONG_BITMAP, - }; - struct rds_tcp_connection *tc = conn->c_transport_data; - unsigned long i; - int ret; - int copied = 0; - - /* Some problem claims cpu_to_be32(constant) isn't a constant. */ - rds_tcp_map_header.h_len = cpu_to_be32(RDS_CONG_MAP_BYTES); - - if (offset < sizeof(struct rds_header)) { - ret = rds_tcp_sendmsg(tc->t_sock, - (void *)&rds_tcp_map_header + offset, - sizeof(struct rds_header) - offset); - if (ret <= 0) - return ret; - offset += ret; - copied = ret; - if (offset < sizeof(struct rds_header)) - return ret; - } - - offset -= sizeof(struct rds_header); - i = offset / PAGE_SIZE; - offset = offset % PAGE_SIZE; - BUG_ON(i >= RDS_CONG_MAP_PAGES); - - do { - ret = tc->t_sock->ops->sendpage(tc->t_sock, - virt_to_page(map->m_page_addrs[i]), - offset, PAGE_SIZE - offset, - MSG_DONTWAIT); - if (ret <= 0) - break; - copied += ret; - offset += ret; - if (offset == PAGE_SIZE) { - offset = 0; - i++; - } - } while (i < RDS_CONG_MAP_PAGES); - - return copied ? copied : ret; -} - -/* the core send_sem serializes this with other xmit and shutdown */ int rds_tcp_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off) { -- cgit v1.1 From f17a1a55fb672d7f64be7f2e940ef5669e5efa0a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 18 Mar 2010 17:19:52 -0700 Subject: RDS: Refill recv ring directly from tasklet Performance is better if we use allocations that don't block to refill the receive ring. Since the whole reason we were kicking out to the worker thread was so we could do blocking allocs, we no longer need to do this. Remove gfp params from rds_ib_recv_refill(); we always use GFP_NOWAIT. Signed-off-by: Andy Grover --- net/rds/ib.h | 3 +-- net/rds/ib_cm.c | 2 +- net/rds/ib_recv.c | 22 ++++++++-------------- 3 files changed, 10 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 202140a..4c2ee24 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -297,8 +297,7 @@ void rds_ib_flush_mrs(void); int __init rds_ib_recv_init(void); void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); -int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill); +int rds_ib_recv_refill(struct rds_connection *conn, int prefill); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 1a91af7..75eda9c8 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -135,7 +135,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even rds_ib_recv_init_ring(ic); /* Post receive buffers - as a side effect, this will update * the posted credit count. */ - rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1); + rds_ib_recv_refill(conn, 1); /* Tune RNR behavior */ rds_ib_tune_rnr(ic, &qp_attr); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index e9fe08a..8f041f7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -135,8 +135,7 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv, - gfp_t kptr_gfp, gfp_t page_gfp) + struct rds_ib_recv_work *recv) { struct rds_ib_connection *ic = conn->c_transport_data; dma_addr_t dma_addr; @@ -148,8 +147,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, rds_ib_stats_inc(s_ib_rx_alloc_limit); goto out; } - recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, - kptr_gfp); + recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT); if (!recv->r_ibinc) { atomic_dec(&rds_ib_allocation); goto out; @@ -159,7 +157,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, } if (!recv->r_frag) { - recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, kptr_gfp); + recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT); if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); @@ -167,7 +165,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, } if (!ic->i_frag.f_page) { - ic->i_frag.f_page = alloc_page(page_gfp); + ic->i_frag.f_page = alloc_page(GFP_NOWAIT); if (!ic->i_frag.f_page) goto out; ic->i_frag.f_offset = 0; @@ -221,8 +219,7 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. */ -int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, - gfp_t page_gfp, int prefill) +int rds_ib_recv_refill(struct rds_connection *conn, int prefill) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; @@ -241,7 +238,7 @@ int rds_ib_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv, kptr_gfp, page_gfp); + ret = rds_ib_recv_refill_one(conn, recv); if (ret) { ret = -1; break; @@ -856,11 +853,8 @@ void rds_ib_recv_tasklet_fn(unsigned long data) if (rds_ib_ring_empty(&ic->i_recv_ring)) rds_ib_stats_inc(s_ib_rx_ring_empty); - /* - * If the ring is running low, then schedule the thread to refill. - */ if (rds_ib_ring_low(&ic->i_recv_ring)) - queue_delayed_work(rds_wq, &conn->c_recv_w, 0); + rds_ib_recv_refill(conn, 0); } int rds_ib_recv(struct rds_connection *conn) @@ -875,7 +869,7 @@ int rds_ib_recv(struct rds_connection *conn) * we're really low and we want the caller to back off for a bit. */ mutex_lock(&ic->i_recv_mutex); - if (rds_ib_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0)) + if (rds_ib_recv_refill(conn, 0)) ret = -ENOMEM; else rds_ib_stats_inc(s_ib_rx_refill_from_thread); -- cgit v1.1 From 049ee3f500954176a87f22e6ee3e98aecb1b8958 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 23 Mar 2010 17:39:07 -0700 Subject: RDS: Change send lock from a mutex to a spinlock This change allows us to call rds_send_xmit() from a tasklet, which is crucial to our new operating model. * Change c_send_lock to a spinlock * Update stats fields "sem_" to "_lock" * Remove unneeded rds_conn_is_sending() About locking between shutdown and send -- send checks if the connection is up. Shutdown puts the connection into DISCONNECTING. After this, all threads entering send will exit immediately. However, a thread could be *in* send_xmit(), so shutdown acquires the c_send_lock to ensure everyone is out before proceeding with connection shutdown. Signed-off-by: Andy Grover --- net/rds/connection.c | 22 ++++++---------------- net/rds/rds.h | 6 +++--- net/rds/send.c | 15 +++++++-------- net/rds/stats.c | 4 ++-- 4 files changed, 18 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 88bcaf3..56aebe4 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -62,18 +62,6 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) -static inline int rds_conn_is_sending(struct rds_connection *conn) -{ - int ret = 0; - - if (!mutex_trylock(&conn->c_send_lock)) - ret = 1; - else - mutex_unlock(&conn->c_send_lock); - - return ret; -} - static struct rds_connection *rds_conn_lookup(struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) @@ -158,7 +146,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; - mutex_init(&conn->c_send_lock); + spin_lock_init(&conn->c_send_lock); INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); @@ -283,10 +271,12 @@ void rds_conn_shutdown(struct rds_connection *conn) } mutex_unlock(&conn->c_cm_lock); - mutex_lock(&conn->c_send_lock); + /* verify everybody's out of rds_send_xmit() */ + spin_lock_irq(&conn->c_send_lock); + spin_unlock_irq(&conn->c_send_lock); + conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); - mutex_unlock(&conn->c_send_lock); if (!rds_conn_transition(conn, RDS_CONN_DISCONNECTING, RDS_CONN_DOWN)) { /* This can happen - eg when we're in the middle of tearing @@ -476,7 +466,7 @@ static int rds_conn_info_visitor(struct rds_connection *conn, cinfo->flags = 0; rds_conn_info_set(cinfo->flags, - rds_conn_is_sending(conn), SENDING); + spin_is_locked(&conn->c_send_lock), SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, diff --git a/net/rds/rds.h b/net/rds/rds.h index e81d7e4..c3a668b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -91,7 +91,7 @@ struct rds_connection { struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - struct mutex c_send_lock; /* protect send ring */ + spinlock_t c_send_lock; /* protect send ring */ struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; @@ -548,8 +548,8 @@ struct rds_statistics { uint64_t s_recv_ping; uint64_t s_send_queue_empty; uint64_t s_send_queue_full; - uint64_t s_send_sem_contention; - uint64_t s_send_sem_queue_raced; + uint64_t s_send_lock_contention; + uint64_t s_send_lock_queue_raced; uint64_t s_send_immediate_retry; uint64_t s_send_delayed_retry; uint64_t s_send_drop_acked; diff --git a/net/rds/send.c b/net/rds/send.c index 8a0647a..d4feec6 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -116,19 +116,18 @@ int rds_send_xmit(struct rds_connection *conn) int was_empty = 0; LIST_HEAD(to_be_dropped); + if (!rds_conn_up(conn)) + goto out; + /* * sendmsg calls here after having queued its message on the send * queue. We only have one task feeding the connection at a time. If * another thread is already feeding the queue then we back off. This * avoids blocking the caller and trading per-connection data between * caches per message. - * - * The sem holder will issue a retry if they notice that someone queued - * a message after they stopped walking the send queue but before they - * dropped the sem. */ - if (!mutex_trylock(&conn->c_send_lock)) { - rds_stats_inc(s_send_sem_contention); + if (!spin_trylock_irqsave(&conn->c_send_lock, flags)) { + rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } @@ -346,7 +345,7 @@ int rds_send_xmit(struct rds_connection *conn) * stop processing the loop when the transport hasn't taken * responsibility for forward progress. */ - mutex_unlock(&conn->c_send_lock); + spin_unlock_irqrestore(&conn->c_send_lock, flags); if (send_quota == 0 && !was_empty) { /* We exhausted the send quota, but there's work left to @@ -360,7 +359,7 @@ int rds_send_xmit(struct rds_connection *conn) * spin lock */ spin_lock_irqsave(&conn->c_lock, flags); if (!list_empty(&conn->c_send_queue)) { - rds_stats_inc(s_send_sem_queue_raced); + rds_stats_inc(s_send_lock_queue_raced); ret = -EAGAIN; } spin_unlock_irqrestore(&conn->c_lock, flags); diff --git a/net/rds/stats.c b/net/rds/stats.c index c66d95d..b77be8b 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -57,8 +57,8 @@ static const char *const rds_stat_names[] = { "recv_ping", "send_queue_empty", "send_queue_full", - "send_sem_contention", - "send_sem_queue_raced", + "send_lock_contention", + "send_lock_queue_raced", "send_immediate_retry", "send_delayed_retry", "send_drop_acked", -- cgit v1.1 From 2ad8099b58f274dc23bc866ca259d7e5db87fa1a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 23 Mar 2010 17:48:04 -0700 Subject: RDS: rds_send_xmit() locking/irq fixes rds_message_put() cannot be called with irqs off, so move it after irqs are re-enabled. Spinlocks throughout the function do not to use _irqsave because the lock of c_send_lock at top already disabled irqs. Signed-off-by: Andy Grover --- net/rds/send.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index d4feec6..624a3dc 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -168,7 +168,7 @@ int rds_send_xmit(struct rds_connection *conn) if (!rm) { unsigned int len; - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock(&conn->c_lock); if (!list_empty(&conn->c_send_queue)) { rm = list_entry(conn->c_send_queue.next, @@ -183,7 +183,7 @@ int rds_send_xmit(struct rds_connection *conn) list_move_tail(&rm->m_conn_item, &conn->c_retrans); } - spin_unlock_irqrestore(&conn->c_lock, flags); + spin_unlock(&conn->c_lock); if (!rm) { was_empty = 1; @@ -199,11 +199,10 @@ int rds_send_xmit(struct rds_connection *conn) */ if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { - spin_lock_irqsave(&conn->c_lock, flags); + spin_lock(&conn->c_lock); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); - spin_unlock_irqrestore(&conn->c_lock, flags); - rds_message_put(rm); + spin_unlock(&conn->c_lock); continue; } @@ -326,10 +325,6 @@ int rds_send_xmit(struct rds_connection *conn) } } - /* Nuke any messages we decided not to retransmit. */ - if (!list_empty(&to_be_dropped)) - rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); - if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); @@ -347,6 +342,14 @@ int rds_send_xmit(struct rds_connection *conn) */ spin_unlock_irqrestore(&conn->c_send_lock, flags); + /* Nuke any messages we decided not to retransmit. */ + if (!list_empty(&to_be_dropped)) { + /* irqs on here, so we can put(), unlike above */ + list_for_each_entry(rm, &to_be_dropped, m_conn_item) + rds_message_put(rm); + rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); + } + if (send_quota == 0 && !was_empty) { /* We exhausted the send quota, but there's work left to * do. Return and (re-)schedule the send worker. -- cgit v1.1 From a7d3a281483684f77e350b045af7f80a149fc4c7 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 16:20:18 -0700 Subject: RDS: Call rds_send_xmit() directly from sendmsg() rds_sendmsg() is calling the send worker function to send the just-queued datagrams, presumably because it wants the behavior where anything not sent will re-call the send worker. We now ensure all queued datagrams are sent by retrying from the send completion handler, so this isn't needed any more. Signed-off-by: Andy Grover --- net/rds/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 624a3dc..15b715a 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1073,7 +1073,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, rds_stats_inc(s_send_queued); if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) - rds_send_worker(&conn->c_send_w.work); + rds_send_xmit(conn); rds_message_put(rm); return payload_len; -- cgit v1.1 From 2fa57129df61bf3fb7d90c5486fe15df94091f61 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 16:45:40 -0700 Subject: RDS: Bypass workqueue when queueing cong updates Now that rds_send_xmit() does not block, we can call it directly instead of going through the helper thread. Signed-off-by: Andy Grover --- net/rds/cong.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/cong.c b/net/rds/cong.c index c741e90..75ea686 100644 --- a/net/rds/cong.c +++ b/net/rds/cong.c @@ -221,7 +221,7 @@ void rds_cong_queue_updates(struct rds_cong_map *map) list_for_each_entry(conn, &map->m_conn_list, c_map_item) { if (!test_and_set_bit(0, &conn->c_map_queued)) { rds_stats_inc(s_cong_update_queued); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + rds_send_xmit(conn); } } -- cgit v1.1 From f2ec76f288118fb18449402d75383212cbcb6762 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 16:46:46 -0700 Subject: RDS: Use NOWAIT in message_map_pages() Can no longer block, so use NOWAIT. Signed-off-by: Andy Grover --- net/rds/message.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 1f73a73..dd915e3 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -256,7 +256,7 @@ struct rds_message *rds_message_map_pages(unsigned long *page_addrs, unsigned in int num_sgs = ceil(total_len, PAGE_SIZE); int extra_bytes = num_sgs * sizeof(struct scatterlist); - rm = rds_message_alloc(extra_bytes, GFP_KERNEL); + rm = rds_message_alloc(extra_bytes, GFP_NOWAIT); if (!rm) return ERR_PTR(-ENOMEM); -- cgit v1.1 From cf4b7389ee812817deeb11da1422004e01b50646 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 16:50:54 -0700 Subject: RDS: Fix locking in send on m_rs_lock Do not nest m_rs_lock under c_lock Disable interrupts in {rdma,atomic}_send_complete Signed-off-by: Andy Grover --- net/rds/send.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 15b715a..ecda3e6 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -468,8 +468,9 @@ void rds_atomic_send_complete(struct rds_message *rm, int status) struct rds_sock *rs = NULL; struct rm_atomic_op *ao; struct rds_notifier *notifier; + unsigned long flags; - spin_lock(&rm->m_rs_lock); + spin_lock_irqsave(&rm->m_rs_lock, flags); ao = &rm->atomic; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) @@ -486,7 +487,7 @@ void rds_atomic_send_complete(struct rds_message *rm, int status) ao->op_notifier = NULL; } - spin_unlock(&rm->m_rs_lock); + spin_unlock_irqrestore(&rm->m_rs_lock, flags); if (rs) { rds_wake_sk_sleep(rs); -- cgit v1.1 From ab1a6926f589c51e7a57ce7544d85272c4acc854 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 16:52:12 -0700 Subject: RDS: rds_message_unmapped() doesn't need to check if queue active If the queue has nobody on it, then wake_up does nothing. Signed-off-by: Andy Grover --- net/rds/message.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index dd915e3..9122b53 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -406,8 +406,7 @@ void rds_message_wait(struct rds_message *rm) void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); - if (waitqueue_active(&rds_message_flush_waitq)) - wake_up(&rds_message_flush_waitq); + wake_up(&rds_message_flush_waitq); } EXPORT_SYMBOL_GPL(rds_message_unmapped); -- cgit v1.1 From 51e2cba8b5936c13b40f0fa11aa4e84683dbc751 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 17:47:30 -0700 Subject: RDS: Move atomic stats from general to ib-specific area Signed-off-by: Andy Grover --- net/rds/ib.h | 2 ++ net/rds/ib_send.c | 4 ++-- net/rds/ib_stats.c | 2 ++ net/rds/rds.h | 2 -- net/rds/stats.c | 2 -- 5 files changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 4c2ee24..c506604 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -202,6 +202,8 @@ struct rds_ib_statistics { uint64_t s_ib_rdma_mr_pool_flush; uint64_t s_ib_rdma_mr_pool_wait; uint64_t s_ib_rdma_mr_pool_depleted; + uint64_t s_ib_atomic_cswp; + uint64_t s_ib_atomic_fadd; }; extern struct workqueue_struct *rds_ib_wq; diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 6461a15..657037d 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -132,9 +132,9 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, wc_status, rds_atomic_send_complete); if (op->op_type == RDS_ATOMIC_TYPE_CSWP) - rds_stats_inc(s_atomic_cswp); + rds_ib_stats_inc(s_ib_atomic_cswp); else - rds_stats_inc(s_atomic_fadd); + rds_ib_stats_inc(s_ib_atomic_fadd); } /* diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c index d2c904d..2d5965d 100644 --- a/net/rds/ib_stats.c +++ b/net/rds/ib_stats.c @@ -67,6 +67,8 @@ static const char *const rds_ib_stat_names[] = { "ib_rdma_mr_pool_flush", "ib_rdma_mr_pool_wait", "ib_rdma_mr_pool_depleted", + "ib_atomic_cswp", + "ib_atomic_fadd", }; unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, diff --git a/net/rds/rds.h b/net/rds/rds.h index c3a668b..2f19d49 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -566,8 +566,6 @@ struct rds_statistics { uint64_t s_cong_update_received; uint64_t s_cong_send_error; uint64_t s_cong_send_blocked; - uint64_t s_atomic_cswp; - uint64_t s_atomic_fadd; }; /* af_rds.c */ diff --git a/net/rds/stats.c b/net/rds/stats.c index b77be8b..344929a 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -75,8 +75,6 @@ static const char *const rds_stat_names[] = { "cong_update_received", "cong_send_error", "cong_send_blocked", - "s_atomic_cswp", - "s_atomic_fadd", }; void rds_stats_info_copy(struct rds_info_iterator *iter, -- cgit v1.1 From fcc5450c6386526034edc437e4cb2c67a6fdd7e9 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 17:08:49 -0700 Subject: RDS: Remove send_quota from send_xmit() The purpose of the send quota was really to give fairness when different connections were all using the same workq thread to send backlogged msgs -- they could only send so many before another connection could make progress. Now that each connection is pushing the backlog from its completion handler, they are all guaranteed to make progress and the quota isn't needed any longer. A thread *will* have to send all previously queued data, as well as any further msgs placed on the queue while while c_send_lock was held. In a pathological case a single process can get roped into doing this for long periods while other threads get off free. But, since it can only do this until the transport reports full, this is a bounded scenario. Signed-off-by: Andy Grover --- net/rds/send.c | 30 +++++++++++++++--------------- 1 file changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index ecda3e6..656c5c1 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -110,12 +110,11 @@ int rds_send_xmit(struct rds_connection *conn) struct rds_message *rm; unsigned long flags; unsigned int tmp; - unsigned int send_quota = send_batch_count; struct scatterlist *sg; int ret = 0; - int was_empty = 0; LIST_HEAD(to_be_dropped); +restart: if (!rds_conn_up(conn)) goto out; @@ -139,7 +138,7 @@ int rds_send_xmit(struct rds_connection *conn) * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. */ - while (--send_quota) { + while (1) { rm = conn->c_xmit_rm; @@ -185,10 +184,8 @@ int rds_send_xmit(struct rds_connection *conn) spin_unlock(&conn->c_lock); - if (!rm) { - was_empty = 1; + if (!rm) break; - } /* Unfortunately, the way Infiniband deals with * RDMA to a bad MR key is by moving the entire @@ -350,20 +347,23 @@ int rds_send_xmit(struct rds_connection *conn) rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } - if (send_quota == 0 && !was_empty) { - /* We exhausted the send quota, but there's work left to - * do. Return and (re-)schedule the send worker. - */ - ret = -EAGAIN; - } - - if (ret == 0 && was_empty) { + /* + * Other senders will see we have c_send_lock and exit. We + * need to recheck the send queue and race again for c_send_lock + * to make sure messages don't just sit on the send queue. + * + * If the transport cannot continue (i.e ret != 0), then it must + * call us when more room is available, such as from the tx + * completion handler. + */ + if (ret == 0) { /* A simple bit test would be way faster than taking the * spin lock */ spin_lock_irqsave(&conn->c_lock, flags); if (!list_empty(&conn->c_send_queue)) { rds_stats_inc(s_send_lock_queue_raced); - ret = -EAGAIN; + spin_unlock_irqrestore(&conn->c_lock, flags); + goto restart; } spin_unlock_irqrestore(&conn->c_lock, flags); } -- cgit v1.1 From a40aa9233aa22d69212d02f92e5b607bd4d658f4 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 29 Mar 2010 17:10:01 -0700 Subject: RDS: Do wait_event_interruptible instead of wait_event Can't see a reason not to allow signals to interrupt the wait. Signed-off-by: Andy Grover --- net/rds/message.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 9122b53..4cb1ed7 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -399,14 +399,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, */ void rds_message_wait(struct rds_message *rm) { - wait_event(rds_message_flush_waitq, + wait_event_interruptible(rds_message_flush_waitq, !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); } void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); - wake_up(&rds_message_flush_waitq); + wake_up_interruptible(&rds_message_flush_waitq); } EXPORT_SYMBOL_GPL(rds_message_unmapped); -- cgit v1.1 From acfcd4d4ec4ed8cb504f96d4fabb7a94029b362b Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 31 Mar 2010 18:56:25 -0700 Subject: RDS: Get pong working again Call send_xmit() directly from pong() Set pongs as op_active Signed-off-by: Andy Grover --- net/rds/send.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 656c5c1..de5693c 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1108,6 +1108,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) } rm->m_daddr = conn->c_faddr; + rm->data.op_active = 1; /* If the connection is down, trigger a connect. We may * have scheduled a delayed reconnect however - in this case @@ -1135,7 +1136,9 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rds_stats_inc(s_send_queued); rds_stats_inc(s_send_pong); - queue_delayed_work(rds_wq, &conn->c_send_w, 0); + if (!test_bit(RDS_LL_SEND_FULL, &conn->c_flags)) + rds_send_xmit(conn); + rds_message_put(rm); return 0; -- cgit v1.1 From 9e29db0e3645cafa980e68a9c717a761448389e1 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 15 Apr 2010 16:38:14 -0400 Subject: RDS: Use a generation counter to avoid rds_send_xmit loop rds_send_xmit is required to loop around after it releases the lock because someone else could done a trylock, found someone working on the list and backed off. But, once we drop our lock, it is possible that someone else does come in and make progress on the list. We should detect this and not loop around if another process is actually working on the list. This patch adds a generation counter that is bumped every time we get the lock and do some send work. If the retry notices someone else has bumped the generation counter, it does not need to loop around and continue working. Signed-off-by: Chris Mason Signed-off-by: Andy Grover --- net/rds/connection.c | 1 + net/rds/rds.h | 1 + net/rds/send.c | 11 +++++++---- 3 files changed, 9 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 56aebe4..7e4e9df 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -147,6 +147,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, conn->c_next_tx_seq = 1; spin_lock_init(&conn->c_send_lock); + atomic_set(&conn->c_send_generation, 1); INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); diff --git a/net/rds/rds.h b/net/rds/rds.h index 2f19d49..b57cb50 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -92,6 +92,7 @@ struct rds_connection { struct rds_cong_map *c_fcong; spinlock_t c_send_lock; /* protect send ring */ + atomic_t c_send_generation; struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; diff --git a/net/rds/send.c b/net/rds/send.c index de5693c..663fd60 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -112,6 +112,7 @@ int rds_send_xmit(struct rds_connection *conn) unsigned int tmp; struct scatterlist *sg; int ret = 0; + int gen = 0; LIST_HEAD(to_be_dropped); restart: @@ -134,6 +135,8 @@ restart: if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); + gen = atomic_inc_return(&conn->c_send_generation); + /* * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. @@ -359,13 +362,13 @@ restart: if (ret == 0) { /* A simple bit test would be way faster than taking the * spin lock */ - spin_lock_irqsave(&conn->c_lock, flags); + smp_mb(); if (!list_empty(&conn->c_send_queue)) { rds_stats_inc(s_send_lock_queue_raced); - spin_unlock_irqrestore(&conn->c_lock, flags); - goto restart; + if (gen == atomic_read(&conn->c_send_generation)) { + goto restart; + } } - spin_unlock_irqrestore(&conn->c_lock, flags); } out: return ret; -- cgit v1.1 From ce47f52f42e69d48d1b63fa618fce9cd7ffa9417 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Thu, 15 Apr 2010 17:19:29 -0700 Subject: RDS: Update comments in rds_send_xmit() Update comments to reflect changes in previous commit. Keeping as separate commits due to different authorship. Signed-off-by: Andy Grover --- net/rds/send.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/send.c b/net/rds/send.c index 663fd60..8e3fd99 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -353,15 +353,14 @@ restart: /* * Other senders will see we have c_send_lock and exit. We * need to recheck the send queue and race again for c_send_lock - * to make sure messages don't just sit on the send queue. + * to make sure messages don't just sit on the send queue, if + * somebody hasn't already beat us into the loop. * * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx * completion handler. */ if (ret == 0) { - /* A simple bit test would be way faster than taking the - * spin lock */ smp_mb(); if (!list_empty(&conn->c_send_queue)) { rds_stats_inc(s_send_lock_queue_raced); -- cgit v1.1 From 976673ee1b92d939168c8c1fbad3e16c45caa545 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 21 Apr 2010 13:04:43 -0700 Subject: rds: switch to rwlock on bind_lock The bind_lock is almost entirely readonly, but it gets hammered during normal operations and is a major bottleneck. This commit changes it to an rwlock, which takes it from 80% of the system time on a big numa machine down to much lower numbers. A better fix would involve RCU, which is done in a later commit Signed-off-by: Chris Mason --- net/rds/bind.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/bind.c b/net/rds/bind.c index 65de5cb..a65afff 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -43,7 +43,7 @@ * This is now called for every incoming frame so we arguably care much more * about it than we used to. */ -static DEFINE_SPINLOCK(rds_bind_lock); +static DEFINE_RWLOCK(rds_bind_lock); static struct rb_root rds_bind_tree = RB_ROOT; static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, @@ -88,13 +88,13 @@ struct rds_sock *rds_find_bound(__be32 addr, __be16 port) struct rds_sock *rs; unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, flags); + read_lock_irqsave(&rds_bind_lock, flags); rs = rds_bind_tree_walk(addr, port, NULL); if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - spin_unlock_irqrestore(&rds_bind_lock, flags); + read_unlock_irqrestore(&rds_bind_lock, flags); rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -116,7 +116,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); do { if (rover == 0) @@ -137,7 +137,7 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) rs, &addr, (int)ntohs(*port)); } - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); return ret; } @@ -146,7 +146,7 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; - spin_lock_irqsave(&rds_bind_lock, flags); + write_lock_irqsave(&rds_bind_lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", @@ -158,7 +158,7 @@ void rds_remove_bound(struct rds_sock *rs) rs->rs_bound_addr = 0; } - spin_unlock_irqrestore(&rds_bind_lock, flags); + write_unlock_irqrestore(&rds_bind_lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) -- cgit v1.1 From c83188dcd76b1f0c17c31b4bbd8de57c634b19f8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 21 Apr 2010 13:09:28 -0700 Subject: rds: per-rm flush_wait waitq This removes a global waitqueue used to wait for rds messages and replaces it with a waitqueue inside the rds_message struct. The global waitqueue turns into a global lock and significantly bottlenecks operations on large machines. Signed-off-by: Chris Mason --- net/rds/message.c | 7 +++---- net/rds/rds.h | 2 ++ 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/message.c b/net/rds/message.c index 4cb1ed7..96e2bf7 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -35,8 +35,6 @@ #include "rds.h" -static DECLARE_WAIT_QUEUE_HEAD(rds_message_flush_waitq); - static unsigned int rds_exthdr_size[__RDS_EXTHDR_MAX] = { [RDS_EXTHDR_NONE] = 0, [RDS_EXTHDR_VERSION] = sizeof(struct rds_ext_header_version), @@ -226,6 +224,7 @@ struct rds_message *rds_message_alloc(unsigned int extra_len, gfp_t gfp) INIT_LIST_HEAD(&rm->m_sock_item); INIT_LIST_HEAD(&rm->m_conn_item); spin_lock_init(&rm->m_rs_lock); + init_waitqueue_head(&rm->m_flush_wait); out: return rm; @@ -399,14 +398,14 @@ int rds_message_inc_copy_to_user(struct rds_incoming *inc, */ void rds_message_wait(struct rds_message *rm) { - wait_event_interruptible(rds_message_flush_waitq, + wait_event_interruptible(rm->m_flush_wait, !test_bit(RDS_MSG_MAPPED, &rm->m_flags)); } void rds_message_unmapped(struct rds_message *rm) { clear_bit(RDS_MSG_MAPPED, &rm->m_flags); - wake_up_interruptible(&rds_message_flush_waitq); + wake_up_interruptible(&rm->m_flush_wait); } EXPORT_SYMBOL_GPL(rds_message_unmapped); diff --git a/net/rds/rds.h b/net/rds/rds.h index b57cb50..c22bd7b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -301,6 +301,8 @@ struct rds_message { * -> rs->rs_lock */ spinlock_t m_rs_lock; + wait_queue_head_t m_flush_wait; + struct rds_sock *m_rs; /* cookie to send to remote, in rds header */ -- cgit v1.1 From 764f2dd92f5cd308d1c4372b33fea2b265c093f5 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 22 Apr 2010 21:59:15 -0400 Subject: rds: rcu-ize rds_ib_get_device() rds_ib_get_device is called very often as we turn an ip address into a corresponding device structure. It currently take a global spinlock as it walks different lists to find active devices. This commit changes the lists over to RCU, which isn't very complex because they are not updated very often at all. Signed-off-by: Chris Mason --- net/rds/ib.c | 1 + net/rds/ib_rdma.c | 24 ++++++++++++++++-------- 2 files changed, 17 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 927c481..7a2131d 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -137,6 +137,7 @@ void rds_ib_remove_one(struct ib_device *device) if (!rds_ibdev) return; + synchronize_rcu(); list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { list_del(&i_ipaddr->list); kfree(i_ipaddr); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 242231f..7240e58 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -32,6 +32,7 @@ */ #include #include +#include #include "rds.h" #include "ib.h" @@ -83,14 +84,14 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) struct rds_ib_ipaddr *i_ipaddr; list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - spin_lock_irq(&rds_ibdev->spinlock); - list_for_each_entry(i_ipaddr, &rds_ibdev->ipaddr_list, list) { + rcu_read_lock(); + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - spin_unlock_irq(&rds_ibdev->spinlock); + rcu_read_unlock(); return rds_ibdev; } } - spin_unlock_irq(&rds_ibdev->spinlock); + rcu_read_unlock(); } return NULL; @@ -107,7 +108,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) i_ipaddr->ipaddr = ipaddr; spin_lock_irq(&rds_ibdev->spinlock); - list_add_tail(&i_ipaddr->list, &rds_ibdev->ipaddr_list); + list_add_tail_rcu(&i_ipaddr->list, &rds_ibdev->ipaddr_list); spin_unlock_irq(&rds_ibdev->spinlock); return 0; @@ -116,16 +117,23 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { struct rds_ib_ipaddr *i_ipaddr, *next; + struct rds_ib_ipaddr *to_free = NULL; + spin_lock_irq(&rds_ibdev->spinlock); - list_for_each_entry_safe(i_ipaddr, next, &rds_ibdev->ipaddr_list, list) { + list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); + list_del_rcu(&i_ipaddr->list); + to_free = i_ipaddr; break; } } spin_unlock_irq(&rds_ibdev->spinlock); + + if (to_free) { + synchronize_rcu(); + kfree(to_free); + } } int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) -- cgit v1.1 From 4a81802b5e5e0b059627d7173c917711cf35e668 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 23 Apr 2010 11:04:21 -0700 Subject: RDS/IB: Remove unused variable in ib_remove_addr() Signed-off-by: Andy Grover --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 7240e58..7315fff 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -116,7 +116,7 @@ static int rds_ib_add_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) static void rds_ib_remove_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) { - struct rds_ib_ipaddr *i_ipaddr, *next; + struct rds_ib_ipaddr *i_ipaddr; struct rds_ib_ipaddr *to_free = NULL; -- cgit v1.1 From e4c52c98e04937ea87b0979a81354d0040d284f9 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 23 Apr 2010 10:49:53 -0700 Subject: RDS/IB: add _to_node() macros for numa and use {k,v}malloc_node() Allocate send/recv rings in memory that is node-local to the HCA. This significantly helps performance. Signed-off-by: Andy Grover --- net/rds/ib.c | 2 +- net/rds/ib.h | 6 ++++++ net/rds/ib_cm.c | 6 ++++-- net/rds/ib_rdma.c | 5 +++-- 4 files changed, 14 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 7a2131d..7d289d7 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -77,7 +77,7 @@ void rds_ib_add_one(struct ib_device *device) goto free_attr; } - rds_ibdev = kmalloc(sizeof *rds_ibdev, GFP_KERNEL); + rds_ibdev = kmalloc_node(sizeof *rds_ibdev, GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) goto free_attr; diff --git a/net/rds/ib.h b/net/rds/ib.h index c506604..4bc3e2f 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -3,6 +3,8 @@ #include #include +#include +#include #include "rds.h" #include "rdma_transport.h" @@ -167,6 +169,10 @@ struct rds_ib_device { spinlock_t spinlock; /* protect the above */ }; +#define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) +#define ibdev_to_node(ibdev) pcidev_to_node(to_pci_dev(ibdev->dma_device)) +#define rdsibdev_to_node(rdsibdev) ibdev_to_node(rdsibdev->dev) + /* bits for i_ack_flags */ #define IB_ACK_IN_FLIGHT 0 #define IB_ACK_REQUESTED 1 diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 75eda9c8..b5d0b60 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -347,7 +347,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn) goto out; } - ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); + ic->i_sends = vmalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), + ibdev_to_node(dev)); if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); @@ -355,7 +356,8 @@ static int rds_ib_setup_qp(struct rds_connection *conn) } memset(ic->i_sends, 0, ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work)); - ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work)); + ic->i_recvs = vmalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), + ibdev_to_node(dev)); if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 7315fff..cc341cd7 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -297,7 +297,7 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) rds_ib_flush_mr_pool(pool, 0); } - ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL); + ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); if (!ibmr) { err = -ENOMEM; goto out_no_cigar; @@ -376,7 +376,8 @@ static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibm if (page_cnt > fmr_message_size) return -EINVAL; - dma_pages = kmalloc(sizeof(u64) * page_cnt, GFP_ATOMIC); + dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC, + rdsibdev_to_node(rds_ibdev)); if (!dma_pages) return -ENOMEM; -- cgit v1.1 From 38a4e5e61344490f18241333d7b1b368a3a38748 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 11 May 2010 15:09:45 -0700 Subject: rds: Use RCU for the bind lookup searches The RDS bind lookups are somewhat expensive in terms of CPU time and locking overhead. This commit changes them into a faster RCU based hash tree instead of the rbtrees they were using before. On large NUMA systems it is a significant improvement. Signed-off-by: Chris Mason --- net/rds/af_rds.c | 8 +++++ net/rds/bind.c | 90 ++++++++++++++++++++++++++++--------------------------- net/rds/ib_rdma.c | 2 ++ net/rds/rds.h | 2 +- 4 files changed, 57 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index ef09340..f16d2a9 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -72,7 +72,15 @@ static int rds_release(struct socket *sock) * with the socket. */ rds_clear_recv_queue(rs); rds_cong_remove_socket(rs); + + /* + * the binding lookup hash uses rcu, we need to + * make sure we sychronize_rcu before we free our + * entry + */ rds_remove_bound(rs); + synchronize_rcu(); + rds_send_drop_to(rs, NULL); rds_rdma_drop_keys(rs); rds_notify_queue_get(rs, NULL); diff --git a/net/rds/bind.c b/net/rds/bind.c index a65afff..2f6b3fc 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -34,45 +34,52 @@ #include #include #include +#include #include "rds.h" -/* - * XXX this probably still needs more work.. no INADDR_ANY, and rbtrees aren't - * particularly zippy. - * - * This is now called for every incoming frame so we arguably care much more - * about it than we used to. - */ -static DEFINE_RWLOCK(rds_bind_lock); -static struct rb_root rds_bind_tree = RB_ROOT; +#define BIND_HASH_SIZE 1024 +static struct hlist_head bind_hash_table[BIND_HASH_SIZE]; +static DEFINE_SPINLOCK(rds_bind_lock); + +static struct hlist_head *hash_to_bucket(__be32 addr, __be16 port) +{ + return bind_hash_table + (jhash_2words((u32)addr, (u32)port, 0) & + (BIND_HASH_SIZE - 1)); +} -static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, - struct rds_sock *insert) +static struct rds_sock *rds_bind_lookup(__be32 addr, __be16 port, + struct rds_sock *insert) { - struct rb_node **p = &rds_bind_tree.rb_node; - struct rb_node *parent = NULL; struct rds_sock *rs; + struct hlist_node *node; + struct hlist_head *head = hash_to_bucket(addr, port); u64 cmp; u64 needle = ((u64)be32_to_cpu(addr) << 32) | be16_to_cpu(port); - while (*p) { - parent = *p; - rs = rb_entry(parent, struct rds_sock, rs_bound_node); - + rcu_read_lock(); + hlist_for_each_entry_rcu(rs, node, head, rs_bound_node) { cmp = ((u64)be32_to_cpu(rs->rs_bound_addr) << 32) | be16_to_cpu(rs->rs_bound_port); - if (needle < cmp) - p = &(*p)->rb_left; - else if (needle > cmp) - p = &(*p)->rb_right; - else + if (cmp == needle) { + rcu_read_unlock(); return rs; + } } + rcu_read_unlock(); if (insert) { - rb_link_node(&insert->rs_bound_node, parent, p); - rb_insert_color(&insert->rs_bound_node, &rds_bind_tree); + /* + * make sure our addr and port are set before + * we are added to the list, other people + * in rcu will find us as soon as the + * hlist_add_head_rcu is done + */ + insert->rs_bound_addr = addr; + insert->rs_bound_port = port; + rds_sock_addref(insert); + + hlist_add_head_rcu(&insert->rs_bound_node, head); } return NULL; } @@ -86,15 +93,13 @@ static struct rds_sock *rds_bind_tree_walk(__be32 addr, __be16 port, struct rds_sock *rds_find_bound(__be32 addr, __be16 port) { struct rds_sock *rs; - unsigned long flags; - read_lock_irqsave(&rds_bind_lock, flags); - rs = rds_bind_tree_walk(addr, port, NULL); + rs = rds_bind_lookup(addr, port, NULL); + if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD)) rds_sock_addref(rs); else rs = NULL; - read_unlock_irqrestore(&rds_bind_lock, flags); rdsdebug("returning rs %p for %pI4:%u\n", rs, &addr, ntohs(port)); @@ -116,28 +121,21 @@ static int rds_add_bound(struct rds_sock *rs, __be32 addr, __be16 *port) last = rover - 1; } - write_lock_irqsave(&rds_bind_lock, flags); + spin_lock_irqsave(&rds_bind_lock, flags); do { if (rover == 0) rover++; - if (!rds_bind_tree_walk(addr, cpu_to_be16(rover), rs)) { - *port = cpu_to_be16(rover); + if (!rds_bind_lookup(addr, cpu_to_be16(rover), rs)) { + *port = rs->rs_bound_port; ret = 0; + rdsdebug("rs %p binding to %pI4:%d\n", + rs, &addr, (int)ntohs(*port)); break; } } while (rover++ != last); - if (ret == 0) { - rs->rs_bound_addr = addr; - rs->rs_bound_port = *port; - rds_sock_addref(rs); - - rdsdebug("rs %p binding to %pI4:%d\n", - rs, &addr, (int)ntohs(*port)); - } - - write_unlock_irqrestore(&rds_bind_lock, flags); + spin_unlock_irqrestore(&rds_bind_lock, flags); return ret; } @@ -146,19 +144,19 @@ void rds_remove_bound(struct rds_sock *rs) { unsigned long flags; - write_lock_irqsave(&rds_bind_lock, flags); + spin_lock_irqsave(&rds_bind_lock, flags); if (rs->rs_bound_addr) { rdsdebug("rs %p unbinding from %pI4:%d\n", rs, &rs->rs_bound_addr, ntohs(rs->rs_bound_port)); - rb_erase(&rs->rs_bound_node, &rds_bind_tree); + hlist_del_init_rcu(&rs->rs_bound_node); rds_sock_put(rs); rs->rs_bound_addr = 0; } - write_unlock_irqrestore(&rds_bind_lock, flags); + spin_unlock_irqrestore(&rds_bind_lock, flags); } int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) @@ -198,5 +196,9 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) out: release_sock(sk); + + /* we might have called rds_remove_bound on error */ + if (ret) + synchronize_rcu(); return ret; } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index cc341cd7..4ba01b9 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -303,6 +303,8 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } + memset(ibmr, 0, sizeof(*ibmr)); + ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | diff --git a/net/rds/rds.h b/net/rds/rds.h index c22bd7b..241a085 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -452,7 +452,7 @@ struct rds_sock { * bound_addr used for both incoming and outgoing, no INADDR_ANY * support. */ - struct rb_node rs_bound_node; + struct hlist_node rs_bound_node; __be32 rs_bound_addr; __be32 rs_conn_addr; __be16 rs_bound_port; -- cgit v1.1 From 7e3f2952eeb1a0fe2aa9882fd1705a88f9d89b35 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 11 May 2010 15:11:11 -0700 Subject: rds: don't let RDS shutdown a connection while senders are present This is the first in a long line of patches that tries to fix races between RDS connection shutdown and RDS traffic. Here we are maintaining a count of active senders to make sure the connection doesn't go away while they are using it. Signed-off-by: Chris Mason --- net/rds/connection.c | 7 +++++++ net/rds/ib_recv.c | 12 ------------ net/rds/message.c | 5 ++++- net/rds/rds.h | 1 + net/rds/send.c | 17 ++++++++++++++--- 5 files changed, 26 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 7e4e9df..9c249f3 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -148,6 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, spin_lock_init(&conn->c_send_lock); atomic_set(&conn->c_send_generation, 1); + atomic_set(&conn->c_senders, 0); INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); @@ -276,6 +277,12 @@ void rds_conn_shutdown(struct rds_connection *conn) spin_lock_irq(&conn->c_send_lock); spin_unlock_irq(&conn->c_send_lock); + while(atomic_read(&conn->c_senders)) { + schedule_timeout(1); + spin_lock_irq(&conn->c_send_lock); + spin_unlock_irq(&conn->c_send_lock); + } + conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 8f041f7..24d1461 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -863,18 +863,6 @@ int rds_ib_recv(struct rds_connection *conn) int ret = 0; rdsdebug("conn %p\n", conn); - - /* - * If we get a temporary posting failure in this context then - * we're really low and we want the caller to back off for a bit. - */ - mutex_lock(&ic->i_recv_mutex); - if (rds_ib_recv_refill(conn, 0)) - ret = -ENOMEM; - else - rds_ib_stats_inc(s_ib_rx_refill_from_thread); - mutex_unlock(&ic->i_recv_mutex); - if (rds_conn_up(conn)) rds_ib_attempt_ack(ic); diff --git a/net/rds/message.c b/net/rds/message.c index 96e2bf7..84f937f 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -81,7 +81,10 @@ static void rds_message_purge(struct rds_message *rm) void rds_message_put(struct rds_message *rm) { rdsdebug("put rm %p ref %d\n", rm, atomic_read(&rm->m_refcount)); - + if (atomic_read(&rm->m_refcount) == 0) { +printk(KERN_CRIT "danger refcount zero on %p\n", rm); +WARN_ON(1); + } if (atomic_dec_and_test(&rm->m_refcount)) { BUG_ON(!list_empty(&rm->m_sock_item)); BUG_ON(!list_empty(&rm->m_conn_item)); diff --git a/net/rds/rds.h b/net/rds/rds.h index 241a085..4ab3d1a 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -93,6 +93,7 @@ struct rds_connection { spinlock_t c_send_lock; /* protect send ring */ atomic_t c_send_generation; + atomic_t c_senders; struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; diff --git a/net/rds/send.c b/net/rds/send.c index 8e3fd99..d35c43f 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -60,15 +60,23 @@ void rds_send_reset(struct rds_connection *conn) struct rds_message *rm, *tmp; unsigned long flags; + spin_lock_irqsave(&conn->c_send_lock, flags); if (conn->c_xmit_rm) { + rm = conn->c_xmit_rm; + conn->c_xmit_rm = NULL; /* Tell the user the RDMA op is no longer mapped by the * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ - rds_message_unmapped(conn->c_xmit_rm); - rds_message_put(conn->c_xmit_rm); - conn->c_xmit_rm = NULL; +printk(KERN_CRIT "send reset unmapping %p\n", rm); + rds_message_unmapped(rm); + spin_unlock_irqrestore(&conn->c_send_lock, flags); + + rds_message_put(rm); + } else { + spin_unlock_irqrestore(&conn->c_send_lock, flags); } + conn->c_xmit_sg = 0; conn->c_xmit_hdr_off = 0; conn->c_xmit_data_off = 0; @@ -131,6 +139,7 @@ restart: ret = -ENOMEM; goto out; } + atomic_inc(&conn->c_senders); if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); @@ -350,6 +359,8 @@ restart: rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } + atomic_dec(&conn->c_senders); + /* * Other senders will see we have c_send_lock and exit. We * need to recheck the send queue and race again for c_send_lock -- cgit v1.1 From c9e65383a20d9a656db70efbf67e57f8115ad776 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 11 May 2010 15:14:16 -0700 Subject: rds: Fix RDMA message reference counting The RDS send_xmit code was trying to get fancy with message counting and was dropping the final reference on the RDMA messages too early. This resulted in memory corruption and oopsen. The fix here is to always add a ref as the parts of the message passes through rds_send_xmit, and always drop a ref as the parts of the message go through completion handling. Signed-off-by: Chris Mason --- net/rds/ib_send.c | 11 ++++++----- net/rds/send.c | 11 ++++++++--- 2 files changed, 14 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 657037d..82459e5 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -268,11 +268,12 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) if (send->s_queued + HZ/2 < jiffies) rds_ib_stats_inc(s_ib_tx_stalled); - if (&send->s_op == &rm->m_final_op) { - /* If anyone waited for this message to get flushed out, wake - * them up now */ - rds_message_unmapped(rm); - + if (send->s_op) { + if (send->s_op == rm->m_final_op) { + /* If anyone waited for this message to get flushed out, wake + * them up now */ + rds_message_unmapped(rm); + } rds_message_put(rm); send->s_op = NULL; } diff --git a/net/rds/send.c b/net/rds/send.c index d35c43f..5c6d4a0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -68,7 +68,6 @@ void rds_send_reset(struct rds_connection *conn) * transport. This isn't entirely true (it's flushed out * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ -printk(KERN_CRIT "send reset unmapping %p\n", rm); rds_message_unmapped(rm); spin_unlock_irqrestore(&conn->c_send_lock, flags); @@ -234,10 +233,13 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { + rds_message_addref(rm); rm->m_final_op = &rm->rdma; ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) + if (ret) { + rds_message_put(rm); break; + } conn->c_xmit_rdma_sent = 1; /* The transport owns the mapped memory for now. @@ -246,10 +248,13 @@ restart: } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { + rds_message_addref(rm); rm->m_final_op = &rm->atomic; ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) + if (ret) { + rds_message_put(rm); break; + } conn->c_xmit_atomic_sent = 1; /* The transport owns the mapped memory for now. -- cgit v1.1 From abf454398c2ebafc629ebb8b149f5a752c79e919 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 11 May 2010 15:14:52 -0700 Subject: RDS: use locking on the connection hash list rds_conn_destroy really needs locking while it changes the connection hash. Signed-off-by: Chris Mason --- net/rds/connection.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 9c249f3..87df15b 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -321,7 +321,10 @@ void rds_conn_destroy(struct rds_connection *conn) "%pI4\n", conn, &conn->c_laddr, &conn->c_faddr); + /* Ensure conn will not be scheduled for reconnect */ + spin_lock_irq(&rds_conn_lock); hlist_del_init(&conn->c_hash_node); + spin_unlock_irq(&rds_conn_lock); /* wait for the rds thread to shut it down */ atomic_set(&conn->c_state, RDS_CONN_ERROR); -- cgit v1.1 From bcf50ef2ce3c5d8f2fe995259da16677898cb300 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 11 May 2010 15:15:15 -0700 Subject: rds: use RCU to protect the connection hash The connection hash was almost entirely RCU ready, this just makes the final couple of changes to use RCU instead of spinlocks for everything. Signed-off-by: Chris Mason --- net/rds/connection.c | 44 ++++++++++++++++++++++---------------------- 1 file changed, 22 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 87df15b..180b83a 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -62,6 +62,7 @@ static struct hlist_head *rds_conn_bucket(__be32 laddr, __be32 faddr) var |= RDS_INFO_CONNECTION_FLAG_##suffix; \ } while (0) +/* rcu read lock must be held or the connection spinlock */ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, __be32 laddr, __be32 faddr, struct rds_transport *trans) @@ -69,7 +70,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, struct rds_connection *conn, *ret = NULL; struct hlist_node *pos; - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { if (conn->c_faddr == faddr && conn->c_laddr == laddr && conn->c_trans == trans) { ret = conn; @@ -119,7 +120,8 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, unsigned long flags; int ret; - spin_lock_irqsave(&rds_conn_lock, flags); + + rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && !is_outgoing) { @@ -130,7 +132,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, parent = conn; conn = parent->c_passive; } - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); if (conn) goto out; @@ -227,7 +229,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, kmem_cache_free(rds_conn_slab, conn); conn = found; } else { - hlist_add_head(&conn->c_hash_node, head); + hlist_add_head_rcu(&conn->c_hash_node, head); rds_cong_add_conn(conn); rds_conn_count++; } @@ -306,8 +308,13 @@ void rds_conn_shutdown(struct rds_connection *conn) * to the conn hash, so we never trigger a reconnect on this * conn - the reconnect is always triggered by the active peer. */ cancel_delayed_work_sync(&conn->c_conn_w); - if (!hlist_unhashed(&conn->c_hash_node)) + rcu_read_lock(); + if (!hlist_unhashed(&conn->c_hash_node)) { + rcu_read_unlock(); rds_queue_reconnect(conn); + } else { + rcu_read_unlock(); + } } /* @@ -323,14 +330,12 @@ void rds_conn_destroy(struct rds_connection *conn) /* Ensure conn will not be scheduled for reconnect */ spin_lock_irq(&rds_conn_lock); - hlist_del_init(&conn->c_hash_node); + hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); - /* wait for the rds thread to shut it down */ - atomic_set(&conn->c_state, RDS_CONN_ERROR); - cancel_delayed_work(&conn->c_conn_w); - queue_work(rds_wq, &conn->c_down_w); - flush_workqueue(rds_wq); + synchronize_rcu(); + + rds_conn_shutdown(conn); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, @@ -369,17 +374,16 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, struct list_head *list; struct rds_connection *conn; struct rds_message *rm; - unsigned long flags; unsigned int total = 0; size_t i; len /= sizeof(struct rds_info_message); - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry(conn, pos, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { if (want_send) list = &conn->c_send_queue; else @@ -399,8 +403,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, spin_unlock(&conn->c_lock); } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); lens->nr = total; lens->each = sizeof(struct rds_info_message); @@ -430,19 +433,17 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, uint64_t buffer[(item_len + 7) / 8]; struct hlist_head *head; struct hlist_node *pos; - struct hlist_node *tmp; struct rds_connection *conn; - unsigned long flags; size_t i; - spin_lock_irqsave(&rds_conn_lock, flags); + rcu_read_lock(); lens->nr = 0; lens->each = item_len; for (i = 0, head = rds_conn_hash; i < ARRAY_SIZE(rds_conn_hash); i++, head++) { - hlist_for_each_entry_safe(conn, pos, tmp, head, c_hash_node) { + hlist_for_each_entry_rcu(conn, pos, head, c_hash_node) { /* XXX no c_lock usage.. */ if (!visitor(conn, buffer)) @@ -458,8 +459,7 @@ void rds_for_each_conn_info(struct socket *sock, unsigned int len, lens->nr++; } } - - spin_unlock_irqrestore(&rds_conn_lock, flags); + rcu_read_unlock(); } EXPORT_SYMBOL_GPL(rds_for_each_conn_info); -- cgit v1.1 From 1cc2228c599f173d77000a250bf0541294e1a7be Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 11 May 2010 16:15:35 -0700 Subject: rds: Fix reference counting on the for xmit_atomic and xmit_rdma This makes sure we have the proper number of references in rds_ib_xmit_atomic and rds_ib_xmit_rdma. We also consistently drop references the same way for all message types as the IOs end. Signed-off-by: Chris Mason --- net/rds/ib_send.c | 10 +++++++++- net/rds/send.c | 10 ++-------- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 82459e5..209dbc6 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -757,6 +757,8 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_wr.next = NULL; send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; send->s_wr.wr.atomic.rkey = op->op_rkey; + send->s_op = op; + rds_message_addref(container_of(send->s_op, struct rds_message, atomic)); /* map 8 byte retval buffer to the device */ ret = ib_dma_map_sg(ic->i_cm_id->device, op->op_sg, 1, DMA_FROM_DEVICE); @@ -857,13 +859,13 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) { send->s_wr.send_flags = 0; send->s_queued = jiffies; + send->s_op = NULL; rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; send->s_wr.wr.rdma.rkey = op->op_rkey; - send->s_op = op; if (num_sge > rds_ibdev->max_sge) { send->s_wr.num_sge = rds_ibdev->max_sge; @@ -899,6 +901,12 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send = ic->i_sends; } + /* give a reference to the last op */ + if (scat == &op->op_sg[op->op_count]) { + prev->s_op = op; + rds_message_addref(container_of(op, struct rds_message, rdma)); + } + if (i < work_alloc) { rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc - i); work_alloc = i; diff --git a/net/rds/send.c b/net/rds/send.c index 5c6d4a0..437f1e7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -233,13 +233,10 @@ restart: /* The transport either sends the whole rdma or none of it */ if (rm->rdma.op_active && !conn->c_xmit_rdma_sent) { - rds_message_addref(rm); rm->m_final_op = &rm->rdma; ret = conn->c_trans->xmit_rdma(conn, &rm->rdma); - if (ret) { - rds_message_put(rm); + if (ret) break; - } conn->c_xmit_rdma_sent = 1; /* The transport owns the mapped memory for now. @@ -248,13 +245,10 @@ restart: } if (rm->atomic.op_active && !conn->c_xmit_atomic_sent) { - rds_message_addref(rm); rm->m_final_op = &rm->atomic; ret = conn->c_trans->xmit_atomic(conn, &rm->atomic); - if (ret) { - rds_message_put(rm); + if (ret) break; - } conn->c_xmit_atomic_sent = 1; /* The transport owns the mapped memory for now. -- cgit v1.1 From a46ca94e7fb2c93a59e08b42fd77d8c478fda5fc Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 24 May 2010 13:14:59 -0700 Subject: RDS/IB: rds_ib_cm_handle_connect() forgot to unlock c_cm_lock rds_ib_cm_handle_connect() could return without unlocking the c_conn_lock if rds_setup_qp() failed. Rather than adding another imbalanced mutex_unlock() to this error path we only unlock the mutex once as we exit the function, reducing the likelyhood of making this same mistake in the future. We remove the previous mulitple return sites, leaving one unambigious return path. Signed-off-by: Zach Brown --- net/rds/ib_cm.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index b5d0b60..73253f7 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -428,7 +428,7 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, struct rds_ib_connection *ic = NULL; struct rdma_conn_param conn_param; u32 version; - int err, destroy = 1; + int err = 1, destroy = 1; /* Check whether the remote protocol version matches ours. */ version = rds_ib_protocol_compatible(event); @@ -467,7 +467,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* Wait and see - our connect may still be succeeding */ rds_ib_stats_inc(s_ib_connect_raced); } - mutex_unlock(&conn->c_cm_lock); goto out; } @@ -504,16 +503,14 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, /* rdma_accept() calls rdma_reject() internally if it fails */ err = rdma_accept(cm_id, &conn_param); - mutex_unlock(&conn->c_cm_lock); - if (err) { + if (err) rds_ib_conn_error(conn, "rdma_accept failed (%d)\n", err); - goto out; - } - - return 0; out: - rdma_reject(cm_id, NULL, 0); + if (conn) + mutex_unlock(&conn->c_cm_lock); + if (err) + rdma_reject(cm_id, NULL, 0); return destroy; } -- cgit v1.1 From 89bf9d4158b5a1b6bd00960eb2e47601ec8cc138 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 18 May 2010 15:44:50 -0700 Subject: RDS/IB: get the xmit max_sge from the RDS IB device on the connection rds_ib_xmit_rdma() was calling ib_get_client_data() to get at the rds_ibdevice just to get the max_sge for the transmit. This patch instead has it get it directly off the rds_ibdev which is stored on the connection. The current code won't free the rds_ibdev until all the IB connections that use it are freed. So it's safe to reference the rds_ibdev this way. In the future it also makes it easier to support proper reference counting of the rds_ibdev struct. As an additional bonus, this gets rid of the performance hit of calling in to the IB stack to look up the rds_ibdev. The current implementation in the IB stack acquires an interrupt blocking spinlock to protect the registration of client callback data. Signed-off-by: Zach Brown --- net/rds/ib_send.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 209dbc6..3f91e79 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -806,10 +806,10 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) struct rds_ib_send_work *first; struct rds_ib_send_work *prev; struct ib_send_wr *failed_wr; - struct rds_ib_device *rds_ibdev; struct scatterlist *scat; unsigned long len; u64 remote_addr = op->op_remote_addr; + u32 max_sge = ic->rds_ibdev->max_sge; u32 pos; u32 work_alloc; u32 i; @@ -818,8 +818,6 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int ret; int num_sge; - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - /* map the op the first time we see it */ if (!op->op_mapped) { op->op_count = ib_dma_map_sg(ic->i_cm_id->device, @@ -839,7 +837,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) * Instead of knowing how to return a partial rdma read/write we insist that there * be enough work requests to send the entire message. */ - i = ceil(op->op_count, rds_ibdev->max_sge); + i = ceil(op->op_count, max_sge); work_alloc = rds_ib_ring_alloc(&ic->i_send_ring, i, &pos); if (work_alloc != i) { @@ -867,9 +865,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send->s_wr.wr.rdma.remote_addr = remote_addr; send->s_wr.wr.rdma.rkey = op->op_rkey; - if (num_sge > rds_ibdev->max_sge) { - send->s_wr.num_sge = rds_ibdev->max_sge; - num_sge -= rds_ibdev->max_sge; + if (num_sge > max_sge) { + send->s_wr.num_sge = max_sge; + num_sge -= max_sge; } else { send->s_wr.num_sge = num_sge; } -- cgit v1.1 From 3e0249f9c05cb77b66f7f09644ca9ca208d991a9 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 18 May 2010 15:48:51 -0700 Subject: RDS/IB: add refcount tracking to struct rds_ib_device The RDS IB client .remove callback used to free the rds_ibdev for the given device unconditionally. This could race other users of the struct. This patch adds refcounting so that we only free the rds_ibdev once all of its users are done. Many rds_ibdev users are tied to connections. We give the connection a reference and change these users to reference the device in the connection instead of looking it up in the IB client data. The only user of the IB client data remaining is the first lookup of the device as connections are built up. Incrementing the reference count of a device found in the IB client data could race with final freeing so we use an RCU grace period to make sure that freeing won't happen until those lookups are done. MRs need the rds_ibdev to get at the pool that they're freed in to. They exist outside a connection and many MRs can reference different devices from one socket, so it was natural to have each MR hold a reference. MR refs can be dropped from interrupt handlers and final device teardown can block so we push it off to a work struct. Pool teardown had to be fixed to cancel its pending work instead of deadlocking waiting for all queued work, including itself, to finish. MRs get their reference from the global device list, which gets a reference. It is left unprotected by locks and remains racy. A simple global lock would be a significant bottleneck. More scalable (complicated) locking should be done carefully in a later patch. Signed-off-by: Zach Brown --- net/rds/ib.c | 129 ++++++++++++++++++++++++++++++++++++++++-------------- net/rds/ib.h | 4 ++ net/rds/ib_cm.c | 33 ++++++-------- net/rds/ib_rdma.c | 14 +++++- 4 files changed, 125 insertions(+), 55 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 7d289d7..1732f8e 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -59,6 +59,38 @@ struct list_head rds_ib_devices; DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); +/* + * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references + * from interrupt context so we push freing off into a work struct in krdsd. + */ +static void rds_ib_dev_free(struct work_struct *work) +{ + struct rds_ib_ipaddr *i_ipaddr, *i_next; + struct rds_ib_device *rds_ibdev = container_of(work, + struct rds_ib_device, free_work); + + if (rds_ibdev->mr_pool) + rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); + if (rds_ibdev->mr) + ib_dereg_mr(rds_ibdev->mr); + if (rds_ibdev->pd) + ib_dealloc_pd(rds_ibdev->pd); + + list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { + list_del(&i_ipaddr->list); + kfree(i_ipaddr); + } + + kfree(rds_ibdev); +} + +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) +{ + BUG_ON(atomic_read(&rds_ibdev->refcount) <= 0); + if (atomic_dec_and_test(&rds_ibdev->refcount)) + queue_work(rds_wq, &rds_ibdev->free_work); +} + void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; @@ -77,11 +109,14 @@ void rds_ib_add_one(struct ib_device *device) goto free_attr; } - rds_ibdev = kmalloc_node(sizeof *rds_ibdev, GFP_KERNEL, ibdev_to_node(device)); + rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, + ibdev_to_node(device)); if (!rds_ibdev) goto free_attr; spin_lock_init(&rds_ibdev->spinlock); + atomic_set(&rds_ibdev->refcount, 1); + INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); rds_ibdev->max_wrs = dev_attr->max_qp_wr; rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); @@ -96,67 +131,93 @@ void rds_ib_add_one(struct ib_device *device) rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); - if (IS_ERR(rds_ibdev->pd)) - goto free_dev; + if (IS_ERR(rds_ibdev->pd)) { + rds_ibdev->pd = NULL; + goto put_dev; + } - rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, - IB_ACCESS_LOCAL_WRITE); - if (IS_ERR(rds_ibdev->mr)) - goto err_pd; + rds_ibdev->mr = ib_get_dma_mr(rds_ibdev->pd, IB_ACCESS_LOCAL_WRITE); + if (IS_ERR(rds_ibdev->mr)) { + rds_ibdev->mr = NULL; + goto put_dev; + } rds_ibdev->mr_pool = rds_ib_create_mr_pool(rds_ibdev); if (IS_ERR(rds_ibdev->mr_pool)) { rds_ibdev->mr_pool = NULL; - goto err_mr; + goto put_dev; } INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); list_add_tail(&rds_ibdev->list, &rds_ib_devices); + atomic_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); + atomic_inc(&rds_ibdev->refcount); - goto free_attr; - -err_mr: - ib_dereg_mr(rds_ibdev->mr); -err_pd: - ib_dealloc_pd(rds_ibdev->pd); -free_dev: - kfree(rds_ibdev); +put_dev: + rds_ib_dev_put(rds_ibdev); free_attr: kfree(dev_attr); } +/* + * New connections use this to find the device to associate with the + * connection. It's not in the fast path so we're not concerned about the + * performance of the IB call. (As of this writing, it uses an interrupt + * blocking spinlock to serialize walking a per-device list of all registered + * clients.) + * + * RCU is used to handle incoming connections racing with device teardown. + * Rather than use a lock to serialize removal from the client_data and + * getting a new reference, we use an RCU grace period. The destruction + * path removes the device from client_data and then waits for all RCU + * readers to finish. + * + * A new connection can get NULL from this if its arriving on a + * device that is in the process of being removed. + */ +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) +{ + struct rds_ib_device *rds_ibdev; + + rcu_read_lock(); + rds_ibdev = ib_get_client_data(device, &rds_ib_client); + if (rds_ibdev) + atomic_inc(&rds_ibdev->refcount); + rcu_read_unlock(); + return rds_ibdev; +} + +/* + * The IB stack is letting us know that a device is going away. This can + * happen if the underlying HCA driver is removed or if PCI hotplug is removing + * the pci function, for example. + * + * This can be called at any time and can be racing with any other RDS path. + */ void rds_ib_remove_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct rds_ib_ipaddr *i_ipaddr, *i_next; rds_ibdev = ib_get_client_data(device, &rds_ib_client); if (!rds_ibdev) return; - synchronize_rcu(); - list_for_each_entry_safe(i_ipaddr, i_next, &rds_ibdev->ipaddr_list, list) { - list_del(&i_ipaddr->list); - kfree(i_ipaddr); - } - rds_ib_destroy_conns(rds_ibdev); - if (rds_ibdev->mr_pool) - rds_ib_destroy_mr_pool(rds_ibdev->mr_pool); - - ib_dereg_mr(rds_ibdev->mr); - - while (ib_dealloc_pd(rds_ibdev->pd)) { - rdsdebug("Failed to dealloc pd %p\n", rds_ibdev->pd); - msleep(1); - } + /* + * prevent future connection attempts from getting a reference to this + * device and wait for currently racing connection attempts to finish + * getting their reference + */ + ib_set_client_data(device, &rds_ib_client, NULL); + synchronize_rcu(); + rds_ib_dev_put(rds_ibdev); list_del(&rds_ibdev->list); - kfree(rds_ibdev); + rds_ib_dev_put(rds_ibdev); } struct ib_client rds_ib_client = { @@ -190,7 +251,7 @@ static int rds_ib_conn_info_visitor(struct rds_connection *conn, rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid); rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid); - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); + rds_ibdev = ic->rds_ibdev; iinfo->max_send_wr = ic->i_send_ring.w_nr; iinfo->max_recv_wr = ic->i_recv_ring.w_nr; iinfo->max_send_sge = rds_ibdev->max_sge; diff --git a/net/rds/ib.h b/net/rds/ib.h index 4bc3e2f..282ec69 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -167,6 +167,8 @@ struct rds_ib_device { unsigned int max_initiator_depth; unsigned int max_responder_resources; spinlock_t spinlock; /* protect the above */ + atomic_t refcount; + struct work_struct free_work; }; #define pcidev_to_node(pcidev) pcibus_to_node(pcidev->bus) @@ -251,6 +253,8 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, extern struct rds_transport rds_ib_transport; extern void rds_ib_add_one(struct ib_device *device); extern void rds_ib_remove_one(struct ib_device *device); +struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); +void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; extern unsigned int fmr_pool_size; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 73253f7..a9fb917 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -95,7 +95,6 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even { const struct rds_ib_connect_private *dp = NULL; struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_ib_device *rds_ibdev; struct ib_qp_attr qp_attr; int err; @@ -145,12 +144,11 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_even if (err) printk(KERN_NOTICE "ib_modify_qp(IB_QP_STATE, RTS): err=%d\n", err); - /* update ib_device with this local ipaddr & conn */ - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - err = rds_ib_update_ipaddr(rds_ibdev, conn->c_laddr); + /* update ib_device with this local ipaddr */ + err = rds_ib_update_ipaddr(ic->rds_ibdev, conn->c_laddr); if (err) - printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", err); - rds_ib_add_conn(rds_ibdev, conn); + printk(KERN_ERR "rds_ib_update_ipaddr failed (%d)\n", + err); /* If the peer gave us the last packet it saw, process this as if * we had received a regular ACK. */ @@ -168,12 +166,10 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, u32 max_initiator_depth) { struct rds_ib_connection *ic = conn->c_transport_data; - struct rds_ib_device *rds_ibdev; + struct rds_ib_device *rds_ibdev = ic->rds_ibdev; memset(conn_param, 0, sizeof(struct rdma_conn_param)); - rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); - conn_param->responder_resources = min_t(u32, rds_ibdev->max_responder_resources, max_responder_resources); conn_param->initiator_depth = @@ -241,18 +237,16 @@ static int rds_ib_setup_qp(struct rds_connection *conn) struct rds_ib_device *rds_ibdev; int ret; - /* rds_ib_add_one creates a rds_ib_device object per IB device, - * and allocates a protection domain, memory range and FMR pool - * for each. If that fails for any reason, it will not register - * the rds_ibdev at all. + /* + * It's normal to see a null device if an incoming connection races + * with device removal, so we don't print a warning. */ - rds_ibdev = ib_get_client_data(dev, &rds_ib_client); - if (!rds_ibdev) { - if (printk_ratelimit()) - printk(KERN_NOTICE "RDS/IB: No client_data for device %s\n", - dev->name); + rds_ibdev = rds_ib_get_client_data(dev); + if (!rds_ibdev) return -EOPNOTSUPP; - } + + /* add the conn now so that connection establishment has the dev */ + rds_ib_add_conn(rds_ibdev, conn); if (rds_ibdev->max_wrs < ic->i_send_ring.w_nr + 1) rds_ib_ring_resize(&ic->i_send_ring, rds_ibdev->max_wrs - 1); @@ -371,6 +365,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ic->i_send_cq, ic->i_recv_cq); out: + rds_ib_dev_put(rds_ibdev); return ret; } diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 4ba01b9..64b5ede 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -87,6 +87,7 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) rcu_read_lock(); list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { + atomic_inc(&rds_ibdev->refcount); rcu_read_unlock(); return rds_ibdev; } @@ -141,8 +142,10 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr) struct rds_ib_device *rds_ibdev_old; rds_ibdev_old = rds_ib_get_device(ipaddr); - if (rds_ibdev_old) + if (rds_ibdev_old) { rds_ib_remove_ipaddr(rds_ibdev_old, ipaddr); + rds_ib_dev_put(rds_ibdev_old); + } return rds_ib_add_ipaddr(rds_ibdev, ipaddr); } @@ -163,6 +166,7 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; + atomic_inc(&rds_ibdev->refcount); } void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn) @@ -182,6 +186,7 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * spin_unlock(&ib_nodev_conns_lock); ic->rds_ibdev = NULL; + rds_ib_dev_put(rds_ibdev); } void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) @@ -240,7 +245,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { - flush_workqueue(rds_wq); + cancel_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); @@ -597,6 +602,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate) queue_work(rds_wq, &pool->flush_worker); } } + + rds_ib_dev_put(rds_ibdev); } void rds_ib_flush_mrs(void) @@ -640,6 +647,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret); ibmr->device = rds_ibdev; + rds_ibdev = NULL; out: if (ret) { @@ -647,5 +655,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, rds_ib_free_mr(ibmr, 0); ibmr = ERR_PTR(ret); } + if (rds_ibdev) + rds_ib_dev_put(rds_ibdev); return ibmr; } -- cgit v1.1 From f3c6808d3d8513db2b0543538fc35c25a60fe7a7 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 24 May 2010 13:14:36 -0700 Subject: RDS: introduce rds_conn_connect_if_down() A few paths had the same block of code to queue a connection's connect work if it was in the right state. Let's move this in to a helper function. Signed-off-by: Zach Brown --- net/rds/connection.c | 12 ++++++++++++ net/rds/rds.h | 1 + net/rds/send.c | 16 ++-------------- 3 files changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 180b83a..5bd96d53 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -540,6 +540,18 @@ void rds_conn_drop(struct rds_connection *conn) EXPORT_SYMBOL_GPL(rds_conn_drop); /* + * If the connection is down, trigger a connect. We may have scheduled a + * delayed reconnect however - in this case we should not interfere. + */ +void rds_conn_connect_if_down(struct rds_connection *conn) +{ + if (rds_conn_state(conn) == RDS_CONN_DOWN && + !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) + queue_delayed_work(rds_wq, &conn->c_conn_w, 0); +} +EXPORT_SYMBOL_GPL(rds_conn_connect_if_down); + +/* * An error occurred on the connection */ void diff --git a/net/rds/rds.h b/net/rds/rds.h index 4ab3d1a..cba5f8b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -617,6 +617,7 @@ void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); +void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, struct rds_info_iterator *iter, struct rds_info_lengths *lens, diff --git a/net/rds/send.c b/net/rds/send.c index 437f1e7..a629599 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -1036,13 +1036,7 @@ int rds_sendmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, goto out; } - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, nonblock, rs); if (ret) { @@ -1122,13 +1116,7 @@ rds_send_pong(struct rds_connection *conn, __be16 dport) rm->m_daddr = conn->c_faddr; rm->data.op_active = 1; - /* If the connection is down, trigger a connect. We may - * have scheduled a delayed reconnect however - in this case - * we should not interfere. - */ - if (rds_conn_state(conn) == RDS_CONN_DOWN && - !test_and_set_bit(RDS_RECONNECT_PENDING, &conn->c_flags)) - queue_delayed_work(rds_wq, &conn->c_conn_w, 0); + rds_conn_connect_if_down(conn); ret = rds_cong_wait(conn->c_fcong, dport, 1, NULL); if (ret) -- cgit v1.1 From fc19de38be924728fea76026c0d1a6c4b6156084 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Mon, 24 May 2010 13:16:57 -0700 Subject: RDS/IB: disconnect when IB devices are removed Currently IB device removal destroys connections which are associated with the device. This prevents connections from being re-established when replacement devices are added. Instead we'll queue shutdown work on the connections as their devices are removed. When we see that devices are added we triger connection attempts on all connections that don't currently have a device. The result is that RDS sockets can resume device-independent work (bcopy, not RDMA) across IB device removal and restoration. Signed-off-by: Zach Brown --- net/rds/ib.c | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 1732f8e..b21e24f 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -59,6 +59,27 @@ struct list_head rds_ib_devices; DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); +void rds_ib_nodev_connect(void) +{ + struct rds_ib_connection *ic; + + spin_lock(&ib_nodev_conns_lock); + list_for_each_entry(ic, &ib_nodev_conns, ib_node) + rds_conn_connect_if_down(ic->conn); + spin_unlock(&ib_nodev_conns_lock); +} + +void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +{ + struct rds_ib_connection *ic; + unsigned long flags; + + spin_lock_irqsave(&rds_ibdev->spinlock, flags); + list_for_each_entry(ic, &rds_ibdev->conn_list, ib_node) + rds_conn_drop(ic->conn); + spin_unlock_irqrestore(&rds_ibdev->spinlock, flags); +} + /* * rds_ib_destroy_mr_pool() blocks on a few things and mrs drop references * from interrupt context so we push freing off into a work struct in krdsd. @@ -156,6 +177,8 @@ void rds_ib_add_one(struct ib_device *device) ib_set_client_data(device, &rds_ib_client, rds_ibdev); atomic_inc(&rds_ibdev->refcount); + rds_ib_nodev_connect(); + put_dev: rds_ib_dev_put(rds_ibdev); free_attr: @@ -205,7 +228,7 @@ void rds_ib_remove_one(struct ib_device *device) if (!rds_ibdev) return; - rds_ib_destroy_conns(rds_ibdev); + rds_ib_dev_shutdown(rds_ibdev); /* * prevent future connection attempts from getting a reference to this -- cgit v1.1 From 0b088e003ccf316a76c51be5dec2d70b93be3be8 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 24 May 2010 20:12:41 -0700 Subject: RDS: Use page_remainder_alloc() for recv bufs Instead of splitting up a page into RDS_FRAG_SIZE chunks ourselves, ask rds_page_remainder_alloc() to do it. While it is possible PAGE_SIZE > FRAG_SIZE, on x86en it isn't, so having duplicate "carve up a page into buffers" code seems excessive. The other modification this spawns is the use of a single struct scatterlist in rds_page_frag instead of a bare page ptr. This causes verbosity to increase in some places, and decrease in others. Finally, I decided to unify the lifetimes and alloc/free of rds_page_frag and its page. This is a nice simplification in itself, but will be extra-nice once we come to adding cmason's recycling patch. Signed-off-by: Andy Grover --- net/rds/ib.h | 7 +---- net/rds/ib_recv.c | 94 ++++++++++++++++--------------------------------------- net/rds/page.c | 1 + 3 files changed, 29 insertions(+), 73 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 282ec69..9bb7a74 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -28,13 +28,9 @@ extern struct list_head rds_ib_devices; * try and minimize the amount of memory tied up both the device and * socket receive queues. */ -/* page offset of the final full frag that fits in the page */ -#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE) struct rds_page_frag { struct list_head f_item; - struct page *f_page; - unsigned long f_offset; - dma_addr_t f_mapped; + struct scatterlist f_sg; }; struct rds_ib_incoming { @@ -107,7 +103,6 @@ struct rds_ib_connection { struct rds_header *i_recv_hdrs; u64 i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; - struct rds_page_frag i_frag; u64 i_ack_recv; /* last ACK received */ /* sending acks */ diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 24d1461..f6dbf16e 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -43,17 +43,11 @@ static struct kmem_cache *rds_ib_incoming_slab; static struct kmem_cache *rds_ib_frag_slab; static atomic_t rds_ib_allocation = ATOMIC_INIT(0); -static void rds_ib_frag_drop_page(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, frag->f_page); - __free_page(frag->f_page); - frag->f_page = NULL; -} - +/* Free frag and attached recv buffer f_sg */ static void rds_ib_frag_free(struct rds_page_frag *frag) { - rdsdebug("frag %p page %p\n", frag, frag->f_page); - BUG_ON(frag->f_page); + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + __free_page(sg_page(&frag->f_sg)); kmem_cache_free(rds_ib_frag_slab, frag); } @@ -71,12 +65,8 @@ static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, { struct rds_page_frag *frag = recv->r_frag; - rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page); - if (frag->f_mapped) - ib_dma_unmap_page(ic->i_cm_id->device, - frag->f_mapped, - RDS_FRAG_SIZE, DMA_FROM_DEVICE); - frag->f_mapped = 0; + rdsdebug("recv %p frag %p page %p\n", recv, frag, sg_page(&frag->f_sg)); + ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE); } void rds_ib_recv_init_ring(struct rds_ib_connection *ic) @@ -116,8 +106,6 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, } if (recv->r_frag) { rds_ib_recv_unmap_page(ic, recv); - if (recv->r_frag->f_page) - rds_ib_frag_drop_page(recv->r_frag); rds_ib_frag_free(recv->r_frag); recv->r_frag = NULL; } @@ -129,16 +117,12 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) for (i = 0; i < ic->i_recv_ring.w_nr; i++) rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); - - if (ic->i_frag.f_page) - rds_ib_frag_drop_page(&ic->i_frag); } static int rds_ib_recv_refill_one(struct rds_connection *conn, struct rds_ib_recv_work *recv) { struct rds_ib_connection *ic = conn->c_transport_data; - dma_addr_t dma_addr; struct ib_sge *sge; int ret = -ENOMEM; @@ -161,50 +145,27 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, if (!recv->r_frag) goto out; INIT_LIST_HEAD(&recv->r_frag->f_item); - recv->r_frag->f_page = NULL; - } - - if (!ic->i_frag.f_page) { - ic->i_frag.f_page = alloc_page(GFP_NOWAIT); - if (!ic->i_frag.f_page) + sg_init_table(&recv->r_frag->f_sg, 1); + ret = rds_page_remainder_alloc(&recv->r_frag->f_sg, + RDS_FRAG_SIZE, GFP_NOWAIT); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, recv->r_frag); + recv->r_frag = NULL; goto out; - ic->i_frag.f_offset = 0; + } } - dma_addr = ib_dma_map_page(ic->i_cm_id->device, - ic->i_frag.f_page, - ic->i_frag.f_offset, - RDS_FRAG_SIZE, - DMA_FROM_DEVICE); - if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr)) - goto out; - - /* - * Once we get the RDS_PAGE_LAST_OFF frag then rds_ib_frag_unmap() - * must be called on this recv. This happens as completions hit - * in order or on connection shutdown. - */ - recv->r_frag->f_page = ic->i_frag.f_page; - recv->r_frag->f_offset = ic->i_frag.f_offset; - recv->r_frag->f_mapped = dma_addr; + ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, + 1, DMA_FROM_DEVICE); + WARN_ON(ret != 1); sge = &recv->r_sge[0]; sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header); sge->length = sizeof(struct rds_header); sge = &recv->r_sge[1]; - sge->addr = dma_addr; - sge->length = RDS_FRAG_SIZE; - - get_page(recv->r_frag->f_page); - - if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) { - ic->i_frag.f_offset += RDS_FRAG_SIZE; - } else { - put_page(ic->i_frag.f_page); - ic->i_frag.f_page = NULL; - ic->i_frag.f_offset = 0; - } + sge->addr = sg_dma_address(&recv->r_frag->f_sg); + sge->length = sg_dma_len(&recv->r_frag->f_sg); ret = 0; out: @@ -247,8 +208,8 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill) /* XXX when can this fail? */ ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr); rdsdebug("recv %p ibinc %p page %p addr %lu ret %d\n", recv, - recv->r_ibinc, recv->r_frag->f_page, - (long) recv->r_frag->f_mapped, ret); + recv->r_ibinc, sg_page(&recv->r_frag->f_sg), + (long) sg_dma_address(&recv->r_frag->f_sg), ret); if (ret) { rds_ib_conn_error(conn, "recv post on " "%pI4 returned %d, disconnecting and " @@ -281,7 +242,6 @@ static void rds_ib_inc_purge(struct rds_incoming *inc) list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { list_del_init(&frag->f_item); - rds_ib_frag_drop_page(frag); rds_ib_frag_free(frag); } } @@ -333,13 +293,13 @@ int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, to_copy = min_t(unsigned long, to_copy, len - copied); rdsdebug("%lu bytes to user [%p, %zu] + %lu from frag " - "[%p, %lu] + %lu\n", + "[%p, %u] + %lu\n", to_copy, iov->iov_base, iov->iov_len, iov_off, - frag->f_page, frag->f_offset, frag_off); + sg_page(&frag->f_sg), frag->f_sg.offset, frag_off); /* XXX needs + offset for multiple recvs per page */ - ret = rds_page_copy_to_user(frag->f_page, - frag->f_offset + frag_off, + ret = rds_page_copy_to_user(sg_page(&frag->f_sg), + frag->f_sg.offset + frag_off, iov->iov_base + iov_off, to_copy); if (ret) { @@ -595,7 +555,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn, to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off); BUG_ON(to_copy & 7); /* Must be 64bit aligned. */ - addr = kmap_atomic(frag->f_page, KM_SOFTIRQ0); + addr = kmap_atomic(sg_page(&frag->f_sg), KM_SOFTIRQ0); src = addr + frag_off; dst = (void *)map->m_page_addrs[map_page] + map_off; @@ -698,12 +658,12 @@ static void rds_ib_process_recv(struct rds_connection *conn, * the inc is freed. We don't go that route, so we have to drop the * page ref ourselves. We can't just leave the page on the recv * because that confuses the dma mapping of pages and each recv's use - * of a partial page. We can leave the frag, though, it will be - * reused. + * of a partial page. * * FIXME: Fold this into the code path below. */ - rds_ib_frag_drop_page(recv->r_frag); + rds_ib_frag_free(recv->r_frag); + recv->r_frag = NULL; return; } diff --git a/net/rds/page.c b/net/rds/page.c index e5b2527..5e44f5ae 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -186,6 +186,7 @@ out: ret ? 0 : scat->length); return ret; } +EXPORT_SYMBOL_GPL(rds_page_remainder_alloc); static int rds_page_remainder_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) -- cgit v1.1 From 3427e854e1a0e76be8b3d75fc0fa878f59b43693 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Mon, 24 May 2010 20:28:49 -0700 Subject: RDS: Assume recv->r_frag is always NULL in refill_one() refill_one() should never be called on a recv struct that doesn't need a new r_frag allocated. Add a WARN and remove conditional around r_frag alloc code. Also, add a comment to explain why r_ibinc may or may not need refilling. Signed-off-by: Andy Grover --- net/rds/ib_recv.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index f6dbf16e..5c7e6ac 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -126,6 +126,10 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; + /* + * ibinc was taken from recv if recv contained the start of a message. + * recvs that were continuations will still have this allocated. + */ if (!recv->r_ibinc) { if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { rds_ib_stats_inc(s_ib_rx_alloc_limit); @@ -140,19 +144,18 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); } - if (!recv->r_frag) { - recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT); - if (!recv->r_frag) - goto out; - INIT_LIST_HEAD(&recv->r_frag->f_item); - sg_init_table(&recv->r_frag->f_sg, 1); - ret = rds_page_remainder_alloc(&recv->r_frag->f_sg, - RDS_FRAG_SIZE, GFP_NOWAIT); - if (ret) { - kmem_cache_free(rds_ib_frag_slab, recv->r_frag); - recv->r_frag = NULL; - goto out; - } + WARN_ON(recv->r_frag); /* leak! */ + recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT); + if (!recv->r_frag) + goto out; + INIT_LIST_HEAD(&recv->r_frag->f_item); + sg_init_table(&recv->r_frag->f_sg, 1); + ret = rds_page_remainder_alloc(&recv->r_frag->f_sg, + RDS_FRAG_SIZE, GFP_NOWAIT); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, recv->r_frag); + recv->r_frag = NULL; + goto out; } ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, -- cgit v1.1 From fc24f78085e8771670af42f2b8929b16a0c98a22 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Tue, 25 May 2010 11:20:09 -0700 Subject: RDS/IB: Remove ib_recv_unmap_page() All it does is call unmap_sg(), so just call that directly. The comment above unmap_page also may be incorrect, so we shouldn't hold on to it, either. Signed-off-by: Andy Grover --- net/rds/ib_recv.c | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 5c7e6ac..48add10 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -51,24 +51,6 @@ static void rds_ib_frag_free(struct rds_page_frag *frag) kmem_cache_free(rds_ib_frag_slab, frag); } -/* - * We map a page at a time. Its fragments are posted in order. This - * is called in fragment order as the fragments get send completion events. - * Only the last frag in the page performs the unmapping. - * - * It's OK for ring cleanup to call this in whatever order it likes because - * DMA is not in flight and so we can unmap while other ring entries still - * hold page references in their frags. - */ -static void rds_ib_recv_unmap_page(struct rds_ib_connection *ic, - struct rds_ib_recv_work *recv) -{ - struct rds_page_frag *frag = recv->r_frag; - - rdsdebug("recv %p frag %p page %p\n", recv, frag, sg_page(&frag->f_sg)); - ib_dma_unmap_sg(ic->i_cm_id->device, &frag->f_sg, 1, DMA_FROM_DEVICE); -} - void rds_ib_recv_init_ring(struct rds_ib_connection *ic) { struct rds_ib_recv_work *recv; @@ -105,7 +87,7 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, recv->r_ibinc = NULL; } if (recv->r_frag) { - rds_ib_recv_unmap_page(ic, recv); + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); rds_ib_frag_free(recv->r_frag); recv->r_frag = NULL; } @@ -768,7 +750,7 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, recv = &ic->i_recvs[rds_ib_ring_oldest(&ic->i_recv_ring)]; - rds_ib_recv_unmap_page(ic, recv); + ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); /* * Also process recvs in connecting state because it is possible -- cgit v1.1 From 33244125871734ebc0d8d147680a0d7e99385e0b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 26 May 2010 22:05:37 -0700 Subject: RDS/IB: Add caching of frags and incs This patch is based heavily on an initial patch by Chris Mason. Instead of freeing slab memory and pages, it keeps them, and funnels them back to be reused. The lock minimization strategy uses xchg and cmpxchg atomic ops for manipulation of pointers to list heads. We anchor the lists with a pointer to a list_head struct instead of a static list_head struct. We just have to carefully use the existing primitives with the difference between a pointer and a static head struct. For example, 'list_empty()' means that our anchor pointer points to a list with a single item instead of meaning that our static head element doesn't point to any list items. Original patch by Chris, with significant mods and fixes by Andy and Zach. Signed-off-by: Chris Mason Signed-off-by: Andy Grover Signed-off-by: Zach Brown --- net/rds/ib.h | 19 ++++ net/rds/ib_cm.c | 9 ++ net/rds/ib_recv.c | 321 +++++++++++++++++++++++++++++++++++++++++++++--------- 3 files changed, 297 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 9bb7a74..2efd9d1 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -21,6 +21,8 @@ #define RDS_IB_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */ +#define RDS_IB_RECYCLE_BATCH_COUNT 32 + extern struct list_head rds_ib_devices; /* @@ -30,14 +32,27 @@ extern struct list_head rds_ib_devices; */ struct rds_page_frag { struct list_head f_item; + struct list_head f_cache_entry; struct scatterlist f_sg; }; struct rds_ib_incoming { struct list_head ii_frags; + struct list_head ii_cache_entry; struct rds_incoming ii_inc; }; +struct rds_ib_cache_head { + struct list_head *first; + unsigned long count; +}; + +struct rds_ib_refill_cache { + struct rds_ib_cache_head *percpu; + struct list_head *xfer; + struct list_head *ready; +}; + struct rds_ib_connect_private { /* Add new fields at the end, and don't permute existing fields. */ __be32 dp_saddr; @@ -104,6 +119,8 @@ struct rds_ib_connection { u64 i_recv_hdrs_dma; struct rds_ib_recv_work *i_recvs; u64 i_ack_recv; /* last ACK received */ + struct rds_ib_refill_cache i_cache_incs; + struct rds_ib_refill_cache i_cache_frags; /* sending acks */ unsigned long i_ack_flags; @@ -304,6 +321,8 @@ void rds_ib_flush_mrs(void); int __init rds_ib_recv_init(void); void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); +void rds_ib_recv_free_caches(struct rds_ib_connection *ic); int rds_ib_recv_refill(struct rds_connection *conn, int prefill); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index a9fb917..10f6a88 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -709,12 +709,19 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) { struct rds_ib_connection *ic; unsigned long flags; + int ret; /* XXX too lazy? */ ic = kzalloc(sizeof(struct rds_ib_connection), GFP_KERNEL); if (!ic) return -ENOMEM; + ret = rds_ib_recv_alloc_caches(ic); + if (ret) { + kfree(ic); + return ret; + } + INIT_LIST_HEAD(&ic->ib_node); tasklet_init(&ic->i_recv_tasklet, rds_ib_recv_tasklet_fn, (unsigned long) ic); @@ -763,6 +770,8 @@ void rds_ib_conn_free(void *arg) list_del(&ic->ib_node); spin_unlock_irq(lock_ptr); + rds_ib_recv_free_caches(ic); + kfree(ic); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 48add10..5b429b7 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -43,14 +43,6 @@ static struct kmem_cache *rds_ib_incoming_slab; static struct kmem_cache *rds_ib_frag_slab; static atomic_t rds_ib_allocation = ATOMIC_INIT(0); -/* Free frag and attached recv buffer f_sg */ -static void rds_ib_frag_free(struct rds_page_frag *frag) -{ - rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); - __free_page(sg_page(&frag->f_sg)); - kmem_cache_free(rds_ib_frag_slab, frag); -} - void rds_ib_recv_init_ring(struct rds_ib_connection *ic) { struct rds_ib_recv_work *recv; @@ -79,6 +71,151 @@ void rds_ib_recv_init_ring(struct rds_ib_connection *ic) } } +/* + * The entire 'from' list, including the from element itself, is put on + * to the tail of the 'to' list. + */ +static void list_splice_entire_tail(struct list_head *from, + struct list_head *to) +{ + struct list_head *from_last = from->prev; + + list_splice_tail(from_last, to); + list_add_tail(from_last, to); +} + +static void rds_ib_cache_xfer_to_ready(struct rds_ib_refill_cache *cache) +{ + struct list_head *tmp; + + tmp = xchg(&cache->xfer, NULL); + if (tmp) { + if (cache->ready) + list_splice_entire_tail(tmp, cache->ready); + else + cache->ready = tmp; + } +} + +static int rds_ib_recv_alloc_cache(struct rds_ib_refill_cache *cache) +{ + struct rds_ib_cache_head *head; + int cpu; + + cache->percpu = alloc_percpu(struct rds_ib_cache_head); + if (!cache->percpu) + return -ENOMEM; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + head->first = NULL; + head->count = 0; + } + cache->xfer = NULL; + cache->ready = NULL; + + return 0; +} + +int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic) +{ + int ret; + + ret = rds_ib_recv_alloc_cache(&ic->i_cache_incs); + if (!ret) { + ret = rds_ib_recv_alloc_cache(&ic->i_cache_frags); + if (ret) + free_percpu(ic->i_cache_incs.percpu); + } + + return ret; +} + +static void rds_ib_cache_splice_all_lists(struct rds_ib_refill_cache *cache, + struct list_head *caller_list) +{ + struct rds_ib_cache_head *head; + int cpu; + + for_each_possible_cpu(cpu) { + head = per_cpu_ptr(cache->percpu, cpu); + if (head->first) { + list_splice_entire_tail(head->first, caller_list); + head->first = NULL; + } + } + + if (cache->ready) { + list_splice_entire_tail(cache->ready, caller_list); + cache->ready = NULL; + } +} + +void rds_ib_recv_free_caches(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *inc; + struct rds_ib_incoming *inc_tmp; + struct rds_page_frag *frag; + struct rds_page_frag *frag_tmp; + LIST_HEAD(list); + + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + rds_ib_cache_splice_all_lists(&ic->i_cache_incs, &list); + free_percpu(ic->i_cache_incs.percpu); + + list_for_each_entry_safe(inc, inc_tmp, &list, ii_cache_entry) { + list_del(&inc->ii_cache_entry); + WARN_ON(!list_empty(&inc->ii_frags)); + kmem_cache_free(rds_ib_incoming_slab, inc); + } + + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + rds_ib_cache_splice_all_lists(&ic->i_cache_frags, &list); + free_percpu(ic->i_cache_frags.percpu); + + list_for_each_entry_safe(frag, frag_tmp, &list, f_cache_entry) { + list_del(&frag->f_cache_entry); + WARN_ON(!list_empty(&frag->f_item)); + kmem_cache_free(rds_ib_frag_slab, frag); + } +} + +/* fwd decl */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache); +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache); + + +/* Recycle frag and attached recv buffer f_sg */ +static void rds_ib_frag_free(struct rds_ib_connection *ic, + struct rds_page_frag *frag) +{ + rdsdebug("frag %p page %p\n", frag, sg_page(&frag->f_sg)); + + rds_ib_recv_cache_put(&frag->f_cache_entry, &ic->i_cache_frags); +} + +/* Recycle inc after freeing attached frags */ +void rds_ib_inc_free(struct rds_incoming *inc) +{ + struct rds_ib_incoming *ibinc; + struct rds_page_frag *frag; + struct rds_page_frag *pos; + struct rds_ib_connection *ic = inc->i_conn->c_transport_data; + + ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + + /* Free attached frags */ + list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { + list_del_init(&frag->f_item); + rds_ib_frag_free(ic, frag); + } + BUG_ON(!list_empty(&ibinc->ii_frags)); + + rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); + rds_ib_recv_cache_put(&ibinc->ii_cache_entry, &ic->i_cache_incs); +} + static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, struct rds_ib_recv_work *recv) { @@ -88,7 +225,7 @@ static void rds_ib_recv_clear_one(struct rds_ib_connection *ic, } if (recv->r_frag) { ib_dma_unmap_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); - rds_ib_frag_free(recv->r_frag); + rds_ib_frag_free(ic, recv->r_frag); recv->r_frag = NULL; } } @@ -101,6 +238,61 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); } +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic) +{ + struct rds_ib_incoming *ibinc; + struct list_head *cache_item; + int avail_allocs; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_incs); + if (cache_item) { + ibinc = container_of(cache_item, struct rds_ib_incoming, ii_cache_entry); + } else { + avail_allocs = atomic_add_unless(&rds_ib_allocation, + 1, rds_ib_sysctl_max_recv_allocation); + if (!avail_allocs) { + rds_ib_stats_inc(s_ib_rx_alloc_limit); + return NULL; + } + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT); + if (!ibinc) { + atomic_dec(&rds_ib_allocation); + return NULL; + } + } + INIT_LIST_HEAD(&ibinc->ii_frags); + rds_inc_init(&ibinc->ii_inc, ic->conn, ic->conn->c_faddr); + + return ibinc; +} + +static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic) +{ + struct rds_page_frag *frag; + struct list_head *cache_item; + int ret; + + cache_item = rds_ib_recv_cache_get(&ic->i_cache_frags); + if (cache_item) { + frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); + } else { + frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT); + if (!frag) + return NULL; + + ret = rds_page_remainder_alloc(&frag->f_sg, + RDS_FRAG_SIZE, GFP_NOWAIT); + if (ret) { + kmem_cache_free(rds_ib_frag_slab, frag); + return NULL; + } + } + + INIT_LIST_HEAD(&frag->f_item); + + return frag; +} + static int rds_ib_recv_refill_one(struct rds_connection *conn, struct rds_ib_recv_work *recv) { @@ -108,37 +300,25 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, struct ib_sge *sge; int ret = -ENOMEM; + if (!ic->i_cache_incs.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); + if (!ic->i_cache_frags.ready) + rds_ib_cache_xfer_to_ready(&ic->i_cache_frags); + /* * ibinc was taken from recv if recv contained the start of a message. * recvs that were continuations will still have this allocated. */ if (!recv->r_ibinc) { - if (!atomic_add_unless(&rds_ib_allocation, 1, rds_ib_sysctl_max_recv_allocation)) { - rds_ib_stats_inc(s_ib_rx_alloc_limit); - goto out; - } - recv->r_ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT); - if (!recv->r_ibinc) { - atomic_dec(&rds_ib_allocation); + recv->r_ibinc = rds_ib_refill_one_inc(ic); + if (!recv->r_ibinc) goto out; - } - INIT_LIST_HEAD(&recv->r_ibinc->ii_frags); - rds_inc_init(&recv->r_ibinc->ii_inc, conn, conn->c_faddr); } WARN_ON(recv->r_frag); /* leak! */ - recv->r_frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT); + recv->r_frag = rds_ib_refill_one_frag(ic); if (!recv->r_frag) goto out; - INIT_LIST_HEAD(&recv->r_frag->f_item); - sg_init_table(&recv->r_frag->f_sg, 1); - ret = rds_page_remainder_alloc(&recv->r_frag->f_sg, - RDS_FRAG_SIZE, GFP_NOWAIT); - if (ret) { - kmem_cache_free(rds_ib_frag_slab, recv->r_frag); - recv->r_frag = NULL; - goto out; - } ret = ib_dma_map_sg(ic->i_cm_id->device, &recv->r_frag->f_sg, 1, DMA_FROM_DEVICE); @@ -160,8 +340,7 @@ out: /* * This tries to allocate and post unused work requests after making sure that * they have all the allocations they need to queue received fragments into - * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc - * pairs don't go unmatched. + * sockets. * * -1 is returned if posting fails due to temporary resource exhaustion. */ @@ -216,33 +395,71 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill) return ret; } -static void rds_ib_inc_purge(struct rds_incoming *inc) +/* + * We want to recycle several types of recv allocations, like incs and frags. + * To use this, the *_free() function passes in the ptr to a list_head within + * the recyclee, as well as the cache to put it on. + * + * First, we put the memory on a percpu list. When this reaches a certain size, + * We move it to an intermediate non-percpu list in a lockless manner, with some + * xchg/compxchg wizardry. + * + * N.B. Instead of a list_head as the anchor, we use a single pointer, which can + * be NULL and xchg'd. The list is actually empty when the pointer is NULL, and + * list_empty() will return true with one element is actually present. + */ +static void rds_ib_recv_cache_put(struct list_head *new_item, + struct rds_ib_refill_cache *cache) { - struct rds_ib_incoming *ibinc; - struct rds_page_frag *frag; - struct rds_page_frag *pos; + unsigned long flags; + struct rds_ib_cache_head *chp; + struct list_head *old; - ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); - rdsdebug("purging ibinc %p inc %p\n", ibinc, inc); + local_irq_save(flags); - list_for_each_entry_safe(frag, pos, &ibinc->ii_frags, f_item) { - list_del_init(&frag->f_item); - rds_ib_frag_free(frag); - } + chp = per_cpu_ptr(cache->percpu, smp_processor_id()); + if (!chp->first) + INIT_LIST_HEAD(new_item); + else /* put on front */ + list_add_tail(new_item, chp->first); + chp->first = new_item; + chp->count++; + + if (chp->count < RDS_IB_RECYCLE_BATCH_COUNT) + goto end; + + /* + * Return our per-cpu first list to the cache's xfer by atomically + * grabbing the current xfer list, appending it to our per-cpu list, + * and then atomically returning that entire list back to the + * cache's xfer list as long as it's still empty. + */ + do { + old = xchg(&cache->xfer, NULL); + if (old) + list_splice_entire_tail(old, chp->first); + old = cmpxchg(&cache->xfer, NULL, chp->first); + } while (old); + + chp->first = NULL; + chp->count = 0; +end: + local_irq_restore(flags); } -void rds_ib_inc_free(struct rds_incoming *inc) +static struct list_head *rds_ib_recv_cache_get(struct rds_ib_refill_cache *cache) { - struct rds_ib_incoming *ibinc; - - ibinc = container_of(inc, struct rds_ib_incoming, ii_inc); + struct list_head *head = cache->ready; + + if (head) { + if (!list_empty(head)) { + cache->ready = head->next; + list_del_init(head); + } else + cache->ready = NULL; + } - rds_ib_inc_purge(inc); - rdsdebug("freeing ibinc %p inc %p\n", ibinc, inc); - BUG_ON(!list_empty(&ibinc->ii_frags)); - kmem_cache_free(rds_ib_incoming_slab, ibinc); - atomic_dec(&rds_ib_allocation); - BUG_ON(atomic_read(&rds_ib_allocation) < 0); + return head; } int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, @@ -647,7 +864,7 @@ static void rds_ib_process_recv(struct rds_connection *conn, * * FIXME: Fold this into the code path below. */ - rds_ib_frag_free(recv->r_frag); + rds_ib_frag_free(ic, recv->r_frag); recv->r_frag = NULL; return; } -- cgit v1.1 From 037f18a3074753991656189a091a5fa371999107 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Wed, 26 May 2010 21:45:06 -0700 Subject: RDS: use friendly gfp masks for prefill When prefilling the rds frags, we end up doing a lot of allocations. We're not in atomic context here, and so there's no reason to dip into atomic reserves. This changes the prefills to use masks that allow waiting. Signed-off-by: Chris Mason --- net/rds/ib_recv.c | 27 ++++++++++++++++++--------- 1 file changed, 18 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 5b429b7..1add097 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -238,7 +238,8 @@ void rds_ib_recv_clear_ring(struct rds_ib_connection *ic) rds_ib_recv_clear_one(ic, &ic->i_recvs[i]); } -static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic) +static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *ic, + gfp_t slab_mask) { struct rds_ib_incoming *ibinc; struct list_head *cache_item; @@ -254,7 +255,7 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i rds_ib_stats_inc(s_ib_rx_alloc_limit); return NULL; } - ibinc = kmem_cache_alloc(rds_ib_incoming_slab, GFP_NOWAIT); + ibinc = kmem_cache_alloc(rds_ib_incoming_slab, slab_mask); if (!ibinc) { atomic_dec(&rds_ib_allocation); return NULL; @@ -266,7 +267,8 @@ static struct rds_ib_incoming *rds_ib_refill_one_inc(struct rds_ib_connection *i return ibinc; } -static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic) +static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic, + gfp_t slab_mask, gfp_t page_mask) { struct rds_page_frag *frag; struct list_head *cache_item; @@ -276,12 +278,12 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic if (cache_item) { frag = container_of(cache_item, struct rds_page_frag, f_cache_entry); } else { - frag = kmem_cache_alloc(rds_ib_frag_slab, GFP_NOWAIT); + frag = kmem_cache_alloc(rds_ib_frag_slab, slab_mask); if (!frag) return NULL; ret = rds_page_remainder_alloc(&frag->f_sg, - RDS_FRAG_SIZE, GFP_NOWAIT); + RDS_FRAG_SIZE, page_mask); if (ret) { kmem_cache_free(rds_ib_frag_slab, frag); return NULL; @@ -294,11 +296,18 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic } static int rds_ib_recv_refill_one(struct rds_connection *conn, - struct rds_ib_recv_work *recv) + struct rds_ib_recv_work *recv, int prefill) { struct rds_ib_connection *ic = conn->c_transport_data; struct ib_sge *sge; int ret = -ENOMEM; + gfp_t slab_mask = GFP_NOWAIT; + gfp_t page_mask = GFP_NOWAIT; + + if (prefill) { + slab_mask = GFP_KERNEL; + page_mask = GFP_HIGHUSER; + } if (!ic->i_cache_incs.ready) rds_ib_cache_xfer_to_ready(&ic->i_cache_incs); @@ -310,13 +319,13 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn, * recvs that were continuations will still have this allocated. */ if (!recv->r_ibinc) { - recv->r_ibinc = rds_ib_refill_one_inc(ic); + recv->r_ibinc = rds_ib_refill_one_inc(ic, slab_mask); if (!recv->r_ibinc) goto out; } WARN_ON(recv->r_frag); /* leak! */ - recv->r_frag = rds_ib_refill_one_frag(ic); + recv->r_frag = rds_ib_refill_one_frag(ic, slab_mask, page_mask); if (!recv->r_frag) goto out; @@ -363,7 +372,7 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill) } recv = &ic->i_recvs[pos]; - ret = rds_ib_recv_refill_one(conn, recv); + ret = rds_ib_recv_refill_one(conn, recv, prefill); if (ret) { ret = -1; break; -- cgit v1.1 From 671202f3491cccdb267f88ad59ba0635aeb2a22e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 4 Jun 2010 14:26:32 -0700 Subject: rds: remove unused rds_send_acked_before() rds_send_acked_before() wasn't blocking interrupts when acquiring c_lock from user context but nothing calls it. Rather than fix its use of c_lock we just remove the function. Signed-off-by: Zach Brown --- net/rds/rds.h | 1 - net/rds/send.c | 29 ----------------------------- 2 files changed, 30 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index cba5f8b..270ded7 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -724,7 +724,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); -int rds_send_acked_before(struct rds_connection *conn, u64 seq); void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, diff --git a/net/rds/send.c b/net/rds/send.c index a629599..b9e41af 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -405,35 +405,6 @@ static inline int rds_send_is_acked(struct rds_message *rm, u64 ack, } /* - * Returns true if there are no messages on the send and retransmit queues - * which have a sequence number greater than or equal to the given sequence - * number. - */ -int rds_send_acked_before(struct rds_connection *conn, u64 seq) -{ - struct rds_message *rm, *tmp; - int ret = 1; - - spin_lock(&conn->c_lock); - - list_for_each_entry_safe(rm, tmp, &conn->c_retrans, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; - } - - list_for_each_entry_safe(rm, tmp, &conn->c_send_queue, m_conn_item) { - if (be64_to_cpu(rm->m_inc.i_hdr.h_sequence) < seq) - ret = 0; - break; - } - - spin_unlock(&conn->c_lock); - - return ret; -} - -/* * This is pretty similar to what happens below in the ACK * handling code - except that we call here as soon as we get * the IB send completion on the RDMA op and the accompanying -- cgit v1.1 From 501dcccdb7a2335cde07d4acb56e636182d62944 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 4 Jun 2010 14:25:27 -0700 Subject: rds: block ints when acquiring c_lock in rds_conn_message_info() conn->c_lock is acquired in interrupt context. rds_conn_message_info() is called from user context and was acquiring c_lock without blocking interrupts, leading to possible deadlocks. Signed-off-by: Zach Brown --- net/rds/connection.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 5bd96d53..5bb0eec 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -375,6 +375,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, struct rds_connection *conn; struct rds_message *rm; unsigned int total = 0; + unsigned long flags; size_t i; len /= sizeof(struct rds_info_message); @@ -389,7 +390,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, else list = &conn->c_retrans; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&conn->c_lock, flags); /* XXX too lazy to maintain counts.. */ list_for_each_entry(rm, list, m_conn_item) { @@ -400,7 +401,7 @@ static void rds_conn_message_info(struct socket *sock, unsigned int len, conn->c_faddr, 0); } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&conn->c_lock, flags); } } rcu_read_unlock(); -- cgit v1.1 From 0f4b1c7e89e699f588807a914ec6e6396c851a72 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 4 Jun 2010 14:41:41 -0700 Subject: rds: fix rds_send_xmit() serialization rds_send_xmit() was changed to hold an interrupt masking spinlock instead of a mutex so that it could be called from the IB receive tasklet path. This broke the TCP transport because its xmit method can block and masks and unmasks interrupts. This patch serializes callers to rds_send_xmit() with a simple bit instead of the current spinlock or previous mutex. This enables rds_send_xmit() to be called from any context and to call functions which block. Getting rid of the c_send_lock exposes the bare c_lock acquisitions which are changed to block interrupts. A waitqueue is added so that rds_conn_shutdown() can wait for callers to leave rds_send_xmit() before tearing down partial send state. This lets us get rid of c_senders. rds_send_xmit() is changed to check the conn state after acquiring the RDS_IN_XMIT bit to resolve races with the shutdown path. Previously both worked with the conn state and then the lock in the same order, allowing them to race and execute the paths concurrently. rds_send_reset() isn't racing with rds_send_xmit() now that rds_conn_shutdown() properly ensures that rds_send_xmit() can't start once the conn state has been changed. We can remove its previous use of the spinlock. Finally, c_send_generation is redundant. Callers can race to test the c_flags bit by simply retrying instead of racing to test the c_send_generation atomic. Signed-off-by: Zach Brown --- net/rds/connection.c | 19 ++++-------- net/rds/ib_send.c | 2 +- net/rds/rds.h | 5 ++-- net/rds/send.c | 82 +++++++++++++++++++++++++++------------------------- net/rds/threads.c | 2 +- 5 files changed, 52 insertions(+), 58 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 5bb0eec..89871db 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -148,9 +148,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, spin_lock_init(&conn->c_lock); conn->c_next_tx_seq = 1; - spin_lock_init(&conn->c_send_lock); - atomic_set(&conn->c_send_generation, 1); - atomic_set(&conn->c_senders, 0); + init_waitqueue_head(&conn->c_waitq); INIT_LIST_HEAD(&conn->c_send_queue); INIT_LIST_HEAD(&conn->c_retrans); @@ -275,15 +273,8 @@ void rds_conn_shutdown(struct rds_connection *conn) } mutex_unlock(&conn->c_cm_lock); - /* verify everybody's out of rds_send_xmit() */ - spin_lock_irq(&conn->c_send_lock); - spin_unlock_irq(&conn->c_send_lock); - - while(atomic_read(&conn->c_senders)) { - schedule_timeout(1); - spin_lock_irq(&conn->c_send_lock); - spin_unlock_irq(&conn->c_send_lock); - } + wait_event(conn->c_waitq, + !test_bit(RDS_IN_XMIT, &conn->c_flags)); conn->c_trans->conn_shutdown(conn); rds_conn_reset(conn); @@ -477,8 +468,8 @@ static int rds_conn_info_visitor(struct rds_connection *conn, sizeof(cinfo->transport)); cinfo->flags = 0; - rds_conn_info_set(cinfo->flags, - spin_is_locked(&conn->c_send_lock), SENDING); + rds_conn_info_set(cinfo->flags, test_bit(RDS_IN_XMIT, &conn->c_flags), + SENDING); /* XXX Future: return the state rather than these funky bits */ rds_conn_info_set(cinfo->flags, atomic_read(&conn->c_state) == RDS_CONN_CONNECTING, diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 3f91e79..e88cb4a 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -321,7 +321,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) * credits (see rds_ib_send_add_credits below). * * The RDS send code is essentially single-threaded; rds_send_xmit - * grabs c_send_lock to ensure exclusive access to the send ring. + * sets RDS_IN_XMIT to ensure exclusive access to the send ring. * However, the ACK sending code is independent and can race with * message SENDs. * diff --git a/net/rds/rds.h b/net/rds/rds.h index 270ded7..4510344 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -80,6 +80,7 @@ enum { /* Bits for c_flags */ #define RDS_LL_SEND_FULL 0 #define RDS_RECONNECT_PENDING 1 +#define RDS_IN_XMIT 2 struct rds_connection { struct hlist_node c_hash_node; @@ -91,9 +92,6 @@ struct rds_connection { struct rds_cong_map *c_lcong; struct rds_cong_map *c_fcong; - spinlock_t c_send_lock; /* protect send ring */ - atomic_t c_send_generation; - atomic_t c_senders; struct rds_message *c_xmit_rm; unsigned long c_xmit_sg; unsigned int c_xmit_hdr_off; @@ -120,6 +118,7 @@ struct rds_connection { struct delayed_work c_conn_w; struct work_struct c_down_w; struct mutex c_cm_lock; /* protect conn state & cm */ + wait_queue_head_t c_waitq; struct list_head c_map_item; unsigned long c_map_queued; diff --git a/net/rds/send.c b/net/rds/send.c index b9e41af..81471b2 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -53,14 +53,14 @@ module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); /* - * Reset the send state. Caller must hold c_send_lock when calling here. + * Reset the send state. Callers must ensure that this doesn't race with + * rds_send_xmit(). */ void rds_send_reset(struct rds_connection *conn) { struct rds_message *rm, *tmp; unsigned long flags; - spin_lock_irqsave(&conn->c_send_lock, flags); if (conn->c_xmit_rm) { rm = conn->c_xmit_rm; conn->c_xmit_rm = NULL; @@ -69,11 +69,7 @@ void rds_send_reset(struct rds_connection *conn) * independently) but as the connection is down, there's * no ongoing RDMA to/from that memory */ rds_message_unmapped(rm); - spin_unlock_irqrestore(&conn->c_send_lock, flags); - rds_message_put(rm); - } else { - spin_unlock_irqrestore(&conn->c_send_lock, flags); } conn->c_xmit_sg = 0; @@ -98,6 +94,25 @@ void rds_send_reset(struct rds_connection *conn) spin_unlock_irqrestore(&conn->c_lock, flags); } +static int acquire_in_xmit(struct rds_connection *conn) +{ + return test_and_set_bit(RDS_IN_XMIT, &conn->c_flags) == 0; +} + +static void release_in_xmit(struct rds_connection *conn) +{ + clear_bit(RDS_IN_XMIT, &conn->c_flags); + smp_mb__after_clear_bit(); + /* + * We don't use wait_on_bit()/wake_up_bit() because our waking is in a + * hot path and finding waiters is very rare. We don't want to walk + * the system-wide hashed waitqueue buckets in the fast path only to + * almost never find waiters. + */ + if (waitqueue_active(&conn->c_waitq)) + wake_up_all(&conn->c_waitq); +} + /* * We're making the concious trade-off here to only send one message * down the connection at a time. @@ -119,12 +134,9 @@ int rds_send_xmit(struct rds_connection *conn) unsigned int tmp; struct scatterlist *sg; int ret = 0; - int gen = 0; LIST_HEAD(to_be_dropped); restart: - if (!rds_conn_up(conn)) - goto out; /* * sendmsg calls here after having queued its message on the send @@ -133,18 +145,25 @@ restart: * avoids blocking the caller and trading per-connection data between * caches per message. */ - if (!spin_trylock_irqsave(&conn->c_send_lock, flags)) { + if (!acquire_in_xmit(conn)) { rds_stats_inc(s_send_lock_contention); ret = -ENOMEM; goto out; } - atomic_inc(&conn->c_senders); + + /* + * rds_conn_shutdown() sets the conn state and then tests RDS_IN_XMIT, + * we do the opposite to avoid races. + */ + if (!rds_conn_up(conn)) { + release_in_xmit(conn); + ret = 0; + goto out; + } if (conn->c_trans->xmit_prepare) conn->c_trans->xmit_prepare(conn); - gen = atomic_inc_return(&conn->c_send_generation); - /* * spin trying to push headers and data down the connection until * the connection doesn't make forward progress. @@ -178,7 +197,7 @@ restart: if (!rm) { unsigned int len; - spin_lock(&conn->c_lock); + spin_lock_irqsave(&conn->c_lock, flags); if (!list_empty(&conn->c_send_queue)) { rm = list_entry(conn->c_send_queue.next, @@ -193,7 +212,7 @@ restart: list_move_tail(&rm->m_conn_item, &conn->c_retrans); } - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&conn->c_lock, flags); if (!rm) break; @@ -207,10 +226,10 @@ restart: */ if (rm->rdma.op_active && test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags)) { - spin_lock(&conn->c_lock); + spin_lock_irqsave(&conn->c_lock, flags); if (test_and_clear_bit(RDS_MSG_ON_CONN, &rm->m_flags)) list_move(&rm->m_conn_item, &to_be_dropped); - spin_unlock(&conn->c_lock); + spin_unlock_irqrestore(&conn->c_lock, flags); continue; } @@ -336,19 +355,7 @@ restart: if (conn->c_trans->xmit_complete) conn->c_trans->xmit_complete(conn); - /* - * We might be racing with another sender who queued a message but - * backed off on noticing that we held the c_send_lock. If we check - * for queued messages after dropping the sem then either we'll - * see the queued message or the queuer will get the sem. If we - * notice the queued message then we trigger an immediate retry. - * - * We need to be careful only to do this when we stopped processing - * the send queue because it was empty. It's the only way we - * stop processing the loop when the transport hasn't taken - * responsibility for forward progress. - */ - spin_unlock_irqrestore(&conn->c_send_lock, flags); + release_in_xmit(conn); /* Nuke any messages we decided not to retransmit. */ if (!list_empty(&to_be_dropped)) { @@ -358,13 +365,12 @@ restart: rds_send_remove_from_sock(&to_be_dropped, RDS_RDMA_DROPPED); } - atomic_dec(&conn->c_senders); - /* - * Other senders will see we have c_send_lock and exit. We - * need to recheck the send queue and race again for c_send_lock - * to make sure messages don't just sit on the send queue, if - * somebody hasn't already beat us into the loop. + * Other senders can queue a message after we last test the send queue + * but before we clear RDS_IN_XMIT. In that case they'd back off and + * not try and send their newly queued message. We need to check the + * send queue after having cleared RDS_IN_XMIT so that their message + * doesn't get stuck on the send queue. * * If the transport cannot continue (i.e ret != 0), then it must * call us when more room is available, such as from the tx @@ -374,9 +380,7 @@ restart: smp_mb(); if (!list_empty(&conn->c_send_queue)) { rds_stats_inc(s_send_lock_queue_raced); - if (gen == atomic_read(&conn->c_send_generation)) { - goto restart; - } + goto restart; } } out: diff --git a/net/rds/threads.c b/net/rds/threads.c index 7a8ca7a..2bab9bf 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -61,7 +61,7 @@ * * Transition to state DISCONNECTING/DOWN: * - Inside the shutdown worker; synchronizes with xmit path - * through c_send_lock, and with connection management callbacks + * through RDS_IN_XMIT, and with connection management callbacks * via c_cm_lock. * * For receive callbacks, we rely on the underlying transport -- cgit v1.1 From 6fa70da6081bbcf948801fd5ee0be4d222298a43 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Jun 2010 11:17:59 -0700 Subject: rds: recycle FMRs through lockless lists FRM allocation and recycling is performance critical and fairly lock intensive. The current code has a per connection lock that all processes bang on and it becomes a major bottleneck on large systems. This changes things to use a number of cmpxchg based lists instead, allowing us to go through the whole FMR lifecycle without locking inside RDS. Zach Brown pointed out that our usage of cmpxchg for xlist removal is racey if someone manages to remove and add back an FMR struct into the list while another CPU can see the FMR's address at the head of the list. The second CPU might assume the list hasn't changed when in fact any number of operations might have happened in between the deletion and reinsertion. This commit maintains a per cpu count of CPUs that are currently in xlist removal, and establishes a grace period to make sure that nobody can see an entry we have just removed from the list. Signed-off-by: Chris Mason --- net/rds/ib_rdma.c | 214 +++++++++++++++++++++++++++++++++++++++++++----------- net/rds/xlist.h | 110 ++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+), 42 deletions(-) create mode 100644 net/rds/xlist.h (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 64b5ede..8c40391 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -36,7 +36,10 @@ #include "rds.h" #include "ib.h" +#include "xlist.h" +static DEFINE_PER_CPU(unsigned long, clean_list_grace); +#define CLEAN_LIST_BUSY_BIT 0 /* * This is stored as mr->r_trans_private. @@ -45,7 +48,11 @@ struct rds_ib_mr { struct rds_ib_device *device; struct rds_ib_mr_pool *pool; struct ib_fmr *fmr; - struct list_head list; + + struct xlist_head xlist; + + /* unmap_list is for freeing */ + struct list_head unmap_list; unsigned int remap_count; struct scatterlist *sg; @@ -61,12 +68,14 @@ struct rds_ib_mr_pool { struct mutex flush_lock; /* serialize fmr invalidate */ struct work_struct flush_worker; /* flush worker */ - spinlock_t list_lock; /* protect variables below */ atomic_t item_count; /* total # of MRs */ atomic_t dirty_count; /* # dirty of MRs */ - struct list_head drop_list; /* MRs that have reached their max_maps limit */ - struct list_head free_list; /* unused MRs */ - struct list_head clean_list; /* unused & unamapped MRs */ + + struct xlist_head drop_list; /* MRs that have reached their max_maps limit */ + struct xlist_head free_list; /* unused MRs */ + struct xlist_head clean_list; /* global unused & unamapped MRs */ + wait_queue_head_t flush_wait; + atomic_t free_pinned; /* memory pinned by free MRs */ unsigned long max_items; unsigned long max_items_soft; @@ -74,7 +83,7 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all); +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -212,11 +221,11 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) if (!pool) return ERR_PTR(-ENOMEM); - INIT_LIST_HEAD(&pool->free_list); - INIT_LIST_HEAD(&pool->drop_list); - INIT_LIST_HEAD(&pool->clean_list); + INIT_XLIST_HEAD(&pool->free_list); + INIT_XLIST_HEAD(&pool->drop_list); + INIT_XLIST_HEAD(&pool->clean_list); mutex_init(&pool->flush_lock); - spin_lock_init(&pool->list_lock); + init_waitqueue_head(&pool->flush_wait); INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); pool->fmr_attr.max_pages = fmr_message_size; @@ -246,27 +255,50 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { cancel_work_sync(&pool->flush_worker); - rds_ib_flush_mr_pool(pool, 1); + rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); kfree(pool); } +static void refill_local(struct rds_ib_mr_pool *pool, struct xlist_head *xl, + struct rds_ib_mr **ibmr_ret) +{ + struct xlist_head *ibmr_xl; + ibmr_xl = xlist_del_head_fast(xl); + *ibmr_ret = list_entry(ibmr_xl, struct rds_ib_mr, xlist); +} + static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool) { struct rds_ib_mr *ibmr = NULL; - unsigned long flags; + struct xlist_head *ret; + unsigned long *flag; - spin_lock_irqsave(&pool->list_lock, flags); - if (!list_empty(&pool->clean_list)) { - ibmr = list_entry(pool->clean_list.next, struct rds_ib_mr, list); - list_del_init(&ibmr->list); - } - spin_unlock_irqrestore(&pool->list_lock, flags); + preempt_disable(); + flag = &__get_cpu_var(clean_list_grace); + set_bit(CLEAN_LIST_BUSY_BIT, flag); + ret = xlist_del_head(&pool->clean_list); + if (ret) + ibmr = list_entry(ret, struct rds_ib_mr, xlist); + clear_bit(CLEAN_LIST_BUSY_BIT, flag); + preempt_enable(); return ibmr; } +static inline void wait_clean_list_grace(void) +{ + int cpu; + unsigned long *flag; + + for_each_online_cpu(cpu) { + flag = &per_cpu(clean_list_grace, cpu); + while (test_bit(CLEAN_LIST_BUSY_BIT, flag)) + cpu_relax(); + } +} + static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) { struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; @@ -299,7 +331,9 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) /* We do have some empty MRs. Flush them out. */ rds_ib_stats_inc(s_ib_rdma_mr_pool_wait); - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, &ibmr); + if (ibmr) + return ibmr; } ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev)); @@ -494,33 +528,109 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr } /* + * given an xlist of mrs, put them all into the list_head for more processing + */ +static void xlist_append_to_list(struct xlist_head *xlist, struct list_head *list) +{ + struct rds_ib_mr *ibmr; + struct xlist_head splice; + struct xlist_head *cur; + struct xlist_head *next; + + splice.next = NULL; + xlist_splice(xlist, &splice); + cur = splice.next; + while (cur) { + next = cur->next; + ibmr = list_entry(cur, struct rds_ib_mr, xlist); + list_add_tail(&ibmr->unmap_list, list); + cur = next; + } +} + +/* + * this takes a list head of mrs and turns it into an xlist of clusters. + * each cluster has an xlist of MR_CLUSTER_SIZE mrs that are ready for + * reuse. + */ +static void list_append_to_xlist(struct rds_ib_mr_pool *pool, + struct list_head *list, struct xlist_head *xlist, + struct xlist_head **tail_ret) +{ + struct rds_ib_mr *ibmr; + struct xlist_head *cur_mr = xlist; + struct xlist_head *tail_mr = NULL; + + list_for_each_entry(ibmr, list, unmap_list) { + tail_mr = &ibmr->xlist; + tail_mr->next = NULL; + cur_mr->next = tail_mr; + cur_mr = tail_mr; + } + *tail_ret = tail_mr; +} + +/* * Flush our pool of MRs. * At a minimum, all currently unused MRs are unmapped. * If the number of MRs allocated exceeds the limit, we also try * to free as many MRs as needed to get back to this limit. */ -static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) +static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, + int free_all, struct rds_ib_mr **ibmr_ret) { struct rds_ib_mr *ibmr, *next; + struct xlist_head clean_xlist; + struct xlist_head *clean_tail; LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned long flags; unsigned int nfreed = 0, ncleaned = 0, free_goal; int ret = 0; rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); - mutex_lock(&pool->flush_lock); + if (ibmr_ret) { + DEFINE_WAIT(wait); + while(!mutex_trylock(&pool->flush_lock)) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + + prepare_to_wait(&pool->flush_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (xlist_empty(&pool->clean_list)) + schedule(); + + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + finish_wait(&pool->flush_wait, &wait); + goto out_nolock; + } + } + finish_wait(&pool->flush_wait, &wait); + } else + mutex_lock(&pool->flush_lock); + + if (ibmr_ret) { + ibmr = rds_ib_reuse_fmr(pool); + if (ibmr) { + *ibmr_ret = ibmr; + goto out; + } + } - spin_lock_irqsave(&pool->list_lock, flags); /* Get the list of all MRs to be dropped. Ordering matters - - * we want to put drop_list ahead of free_list. */ - list_splice_init(&pool->free_list, &unmap_list); - list_splice_init(&pool->drop_list, &unmap_list); + * we want to put drop_list ahead of free_list. + */ + xlist_append_to_list(&pool->drop_list, &unmap_list); + xlist_append_to_list(&pool->free_list, &unmap_list); if (free_all) - list_splice_init(&pool->clean_list, &unmap_list); - spin_unlock_irqrestore(&pool->list_lock, flags); + xlist_append_to_list(&pool->clean_list, &unmap_list); free_goal = rds_ib_flush_goal(pool, free_all); @@ -528,19 +638,20 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) goto out; /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */ - list_for_each_entry(ibmr, &unmap_list, list) + list_for_each_entry(ibmr, &unmap_list, unmap_list) list_add(&ibmr->fmr->list, &fmr_list); + ret = ib_unmap_fmr(&fmr_list); if (ret) printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret); /* Now we can destroy the DMA mapping and unpin any pages */ - list_for_each_entry_safe(ibmr, next, &unmap_list, list) { + list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) { unpinned += ibmr->sg_len; __rds_ib_teardown_mr(ibmr); if (nfreed < free_goal || ibmr->remap_count >= pool->fmr_attr.max_maps) { rds_ib_stats_inc(s_ib_rdma_mr_free); - list_del(&ibmr->list); + list_del(&ibmr->unmap_list); ib_dealloc_fmr(ibmr->fmr); kfree(ibmr); nfreed++; @@ -548,9 +659,27 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) ncleaned++; } - spin_lock_irqsave(&pool->list_lock, flags); - list_splice(&unmap_list, &pool->clean_list); - spin_unlock_irqrestore(&pool->list_lock, flags); + if (!list_empty(&unmap_list)) { + /* we have to make sure that none of the things we're about + * to put on the clean list would race with other cpus trying + * to pull items off. The xlist would explode if we managed to + * remove something from the clean list and then add it back again + * while another CPU was spinning on that same item in xlist_del_head. + * + * This is pretty unlikely, but just in case wait for an xlist grace period + * here before adding anything back into the clean list. + */ + wait_clean_list_grace(); + + list_append_to_xlist(pool, &unmap_list, &clean_xlist, &clean_tail); + if (ibmr_ret) + refill_local(pool, &clean_xlist, ibmr_ret); + + /* refill_local may have emptied our list */ + if (!xlist_empty(&clean_xlist)) + xlist_add(clean_xlist.next, clean_tail, &pool->clean_list); + + } atomic_sub(unpinned, &pool->free_pinned); atomic_sub(ncleaned, &pool->dirty_count); @@ -558,6 +687,9 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all) out: mutex_unlock(&pool->flush_lock); + if (waitqueue_active(&pool->flush_wait)) + wake_up(&pool->flush_wait); +out_nolock: return ret; } @@ -565,7 +697,7 @@ static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } void rds_ib_free_mr(void *trans_private, int invalidate) @@ -573,20 +705,17 @@ void rds_ib_free_mr(void *trans_private, int invalidate) struct rds_ib_mr *ibmr = trans_private; struct rds_ib_device *rds_ibdev = ibmr->device; struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; - unsigned long flags; rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len); /* Return it to the pool's free list */ - spin_lock_irqsave(&pool->list_lock, flags); if (ibmr->remap_count >= pool->fmr_attr.max_maps) - list_add(&ibmr->list, &pool->drop_list); + xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->drop_list); else - list_add(&ibmr->list, &pool->free_list); + xlist_add(&ibmr->xlist, &ibmr->xlist, &pool->free_list); atomic_add(ibmr->sg_len, &pool->free_pinned); atomic_inc(&pool->dirty_count); - spin_unlock_irqrestore(&pool->list_lock, flags); /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || @@ -595,7 +724,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) if (invalidate) { if (likely(!in_interrupt())) { - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ @@ -614,7 +743,7 @@ void rds_ib_flush_mrs(void) struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; if (pool) - rds_ib_flush_mr_pool(pool, 0); + rds_ib_flush_mr_pool(pool, 0, NULL); } } @@ -659,3 +788,4 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, rds_ib_dev_put(rds_ibdev); return ibmr; } + diff --git a/net/rds/xlist.h b/net/rds/xlist.h new file mode 100644 index 0000000..8c21aca --- /dev/null +++ b/net/rds/xlist.h @@ -0,0 +1,110 @@ +#ifndef _LINUX_XLIST_H +#define _LINUX_XLIST_H + +#include +#include +#include +#include + +struct xlist_head { + struct xlist_head *next; +}; + +/* + * XLIST_PTR_TAIL can be used to prevent double insertion. See + * xlist_protect() + */ +#define XLIST_PTR_TAIL ((struct xlist_head *)0x1) + +static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, struct xlist_head *head) +{ + struct xlist_head *cur; + struct xlist_head *check; + + while (1) { + cur = head->next; + tail->next = cur; + check = cmpxchg(&head->next, cur, new); + if (check == cur) + break; + } +} + +/* + * To avoid duplicate insertion by two CPUs of the same xlist item + * you can call xlist_protect. It will stuff XLIST_PTR_TAIL + * into the entry->next pointer with xchg, and only return 1 + * if there was a NULL there before. + * + * if xlist_protect returns zero, someone else is busy working + * on this entry. Getting a NULL into the entry in a race + * free manner is the caller's job. + */ +static inline int xlist_protect(struct xlist_head *entry) +{ + struct xlist_head *val; + + val = xchg(&entry->next, XLIST_PTR_TAIL); + if (val == NULL) + return 1; + return 0; +} + +static inline struct xlist_head *xlist_del_head(struct xlist_head *head) +{ + struct xlist_head *cur; + struct xlist_head *check; + struct xlist_head *next; + + while (1) { + cur = head->next; + if (!cur) + goto out; + + if (cur == XLIST_PTR_TAIL) { + cur = NULL; + goto out; + } + + next = cur->next; + check = cmpxchg(&head->next, cur, next); + if (check == cur) + goto out; + } +out: + return cur; +} + +static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head) +{ + struct xlist_head *cur; + + cur = head->next; + if (!cur || cur == XLIST_PTR_TAIL) + return NULL; + + head->next = cur->next; + return cur; +} + +static inline void xlist_splice(struct xlist_head *list, + struct xlist_head *head) +{ + struct xlist_head *cur; + + WARN_ON(head->next); + cur = xchg(&list->next, NULL); + head->next = cur; +} + +static inline void INIT_XLIST_HEAD(struct xlist_head *list) +{ + list->next = NULL; +} + +static inline int xlist_empty(struct xlist_head *head) +{ + return head->next == NULL || head->next == XLIST_PTR_TAIL; +} + +#endif -- cgit v1.1 From eabb732279f1a41ac9d066aeb56973ae505c4cbc Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Jun 2010 11:18:57 -0700 Subject: rds: more FMRs are faster When we add more FMRs, we flush them less often and so we go faster. Signed-off-by: Chris Mason --- net/rds/ib.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index 2efd9d1..d2dd23d 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -9,7 +9,7 @@ #include "rdma_transport.h" #define RDS_FMR_SIZE 256 -#define RDS_FMR_POOL_SIZE 4096 +#define RDS_FMR_POOL_SIZE 8192 #define RDS_IB_MAX_SGE 8 #define RDS_IB_RECV_SGE 2 -- cgit v1.1 From 7a0ff5dbdd0b4cb7ea8764da9d78f4bb2eebaf31 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 11 Jun 2010 11:26:02 -0700 Subject: RDS: use delayed work for the FMR flushes Using a delayed work queue helps us make sure a healthy number of FMRs have queued up over the limit. It makes for a large improvement in RDMA iops. Signed-off-by: Chris Mason --- net/rds/ib_rdma.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 8c40391..3a275af 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -66,7 +66,7 @@ struct rds_ib_mr { */ struct rds_ib_mr_pool { struct mutex flush_lock; /* serialize fmr invalidate */ - struct work_struct flush_worker; /* flush worker */ + struct delayed_work flush_worker; /* flush worker */ atomic_t item_count; /* total # of MRs */ atomic_t dirty_count; /* # dirty of MRs */ @@ -226,7 +226,7 @@ struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev) INIT_XLIST_HEAD(&pool->clean_list); mutex_init(&pool->flush_lock); init_waitqueue_head(&pool->flush_wait); - INIT_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); + INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker); pool->fmr_attr.max_pages = fmr_message_size; pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps; @@ -254,7 +254,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool) { - cancel_work_sync(&pool->flush_worker); + cancel_delayed_work_sync(&pool->flush_worker); rds_ib_flush_mr_pool(pool, 1, NULL); WARN_ON(atomic_read(&pool->item_count)); WARN_ON(atomic_read(&pool->free_pinned)); @@ -695,7 +695,7 @@ out_nolock: static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { - struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker); + struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); rds_ib_flush_mr_pool(pool, 0, NULL); } @@ -720,7 +720,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_work(rds_wq, &pool->flush_worker); + queue_delayed_work(rds_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { @@ -728,7 +728,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ - queue_work(rds_wq, &pool->flush_worker); + queue_delayed_work(rds_wq, &pool->flush_worker, 10); } } -- cgit v1.1 From c9455d9996ba84af1f534c7e3944ea6f35d2fc54 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 11 Jun 2010 15:18:51 -0700 Subject: RDS: whitespace --- net/rds/connection.c | 1 - net/rds/ib_rdma.c | 1 - 2 files changed, 2 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 89871db..7670b45 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -120,7 +120,6 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, unsigned long flags; int ret; - rcu_read_lock(); conn = rds_conn_lookup(head, laddr, faddr, trans); if (conn && conn->c_loopback && conn->c_trans != &rds_loop_transport && diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 3a275af..2a4ec11 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -349,7 +349,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE| IB_ACCESS_REMOTE_ATOMIC), - &pool->fmr_attr); if (IS_ERR(ibmr->fmr)) { err = PTR_ERR(ibmr->fmr); -- cgit v1.1 From fbf4d7e3d03587a983ee4e536251ea6c1c848ec2 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Fri, 11 Jun 2010 16:24:42 -0700 Subject: RDS: Remove unused XLIST_PTR_TAIL and xlist_protect() Not used. Signed-off-by: Andy Grover --- net/rds/xlist.h | 54 ++++++++++++------------------------------------------ 1 file changed, 12 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/rds/xlist.h b/net/rds/xlist.h index 8c21aca..e6b5190 100644 --- a/net/rds/xlist.h +++ b/net/rds/xlist.h @@ -10,13 +10,18 @@ struct xlist_head { struct xlist_head *next; }; -/* - * XLIST_PTR_TAIL can be used to prevent double insertion. See - * xlist_protect() - */ -#define XLIST_PTR_TAIL ((struct xlist_head *)0x1) +static inline void INIT_XLIST_HEAD(struct xlist_head *list) +{ + list->next = NULL; +} + +static inline int xlist_empty(struct xlist_head *head) +{ + return head->next == NULL; +} -static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, struct xlist_head *head) +static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, + struct xlist_head *head) { struct xlist_head *cur; struct xlist_head *check; @@ -30,26 +35,6 @@ static inline void xlist_add(struct xlist_head *new, struct xlist_head *tail, st } } -/* - * To avoid duplicate insertion by two CPUs of the same xlist item - * you can call xlist_protect. It will stuff XLIST_PTR_TAIL - * into the entry->next pointer with xchg, and only return 1 - * if there was a NULL there before. - * - * if xlist_protect returns zero, someone else is busy working - * on this entry. Getting a NULL into the entry in a race - * free manner is the caller's job. - */ -static inline int xlist_protect(struct xlist_head *entry) -{ - struct xlist_head *val; - - val = xchg(&entry->next, XLIST_PTR_TAIL); - if (val == NULL) - return 1; - return 0; -} - static inline struct xlist_head *xlist_del_head(struct xlist_head *head) { struct xlist_head *cur; @@ -61,11 +46,6 @@ static inline struct xlist_head *xlist_del_head(struct xlist_head *head) if (!cur) goto out; - if (cur == XLIST_PTR_TAIL) { - cur = NULL; - goto out; - } - next = cur->next; check = cmpxchg(&head->next, cur, next); if (check == cur) @@ -80,7 +60,7 @@ static inline struct xlist_head *xlist_del_head_fast(struct xlist_head *head) struct xlist_head *cur; cur = head->next; - if (!cur || cur == XLIST_PTR_TAIL) + if (!cur) return NULL; head->next = cur->next; @@ -97,14 +77,4 @@ static inline void xlist_splice(struct xlist_head *list, head->next = cur; } -static inline void INIT_XLIST_HEAD(struct xlist_head *list) -{ - list->next = NULL; -} - -static inline int xlist_empty(struct xlist_head *head) -{ - return head->next == NULL || head->next == XLIST_PTR_TAIL; -} - #endif -- cgit v1.1 From b6fb0df12db6c8b6bbcc7b5c9459b3bbf5f0cee6 Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 23 Jun 2010 18:06:30 -0700 Subject: RDS/IB: Make ib_recv_refill return void Signed-off-by: Andy Grover --- net/rds/ib.h | 2 +- net/rds/ib_recv.c | 6 +----- 2 files changed, 2 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index d2dd23d..6265ada 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -323,7 +323,7 @@ void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); void rds_ib_recv_free_caches(struct rds_ib_connection *ic); -int rds_ib_recv_refill(struct rds_connection *conn, int prefill); +void rds_ib_recv_refill(struct rds_connection *conn, int prefill); void rds_ib_inc_free(struct rds_incoming *inc); int rds_ib_inc_copy_to_user(struct rds_incoming *inc, struct iovec *iov, size_t size); diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 1add097..c8c6098 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -353,7 +353,7 @@ out: * * -1 is returned if posting fails due to temporary resource exhaustion. */ -int rds_ib_recv_refill(struct rds_connection *conn, int prefill) +void rds_ib_recv_refill(struct rds_connection *conn, int prefill) { struct rds_ib_connection *ic = conn->c_transport_data; struct rds_ib_recv_work *recv; @@ -367,14 +367,12 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill) if (pos >= ic->i_recv_ring.w_nr) { printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n", pos); - ret = -EINVAL; break; } recv = &ic->i_recvs[pos]; ret = rds_ib_recv_refill_one(conn, recv, prefill); if (ret) { - ret = -1; break; } @@ -388,7 +386,6 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill) "%pI4 returned %d, disconnecting and " "reconnecting\n", &conn->c_faddr, ret); - ret = -1; break; } @@ -401,7 +398,6 @@ int rds_ib_recv_refill(struct rds_connection *conn, int prefill) if (ret) rds_ib_ring_unalloc(&ic->i_recv_ring, 1); - return ret; } /* -- cgit v1.1 From 24fa163a4bae74b3378d30e1bc776568cfca8121 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 25 Jun 2010 14:59:49 -0700 Subject: RDS/IB: wait for IB dev freeing work to finish during rmmod The RDS IB client removal callback can queue work to drop the final reference to an IB device. We have to make sure that this function has returned before we complete rmmod or the work threads can try to execute freed code. Signed-off-by: Zach Brown --- net/rds/ib.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index b21e24f..fc14f63 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -336,11 +336,18 @@ static int rds_ib_laddr_check(__be32 addr) return ret; } +static void rds_ib_unregister_client(void) +{ + ib_unregister_client(&rds_ib_client); + /* wait for rds_ib_dev_free() to complete */ + flush_workqueue(rds_wq); +} + void rds_ib_exit(void) { rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); rds_ib_destroy_nodev_conns(); - ib_unregister_client(&rds_ib_client); + rds_ib_unregister_client(); rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); @@ -404,7 +411,7 @@ out_recv: out_sysctl: rds_ib_sysctl_exit(); out_ibreg: - ib_unregister_client(&rds_ib_client); + rds_ib_unregister_client(); out: return ret; } -- cgit v1.1 From 8aeb1ba6630ffd44001ae9833842794df0107676 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 25 Jun 2010 14:58:16 -0700 Subject: RDS/IB: destroy connections on rmmod IB connections were not being destroyed during rmmod. First, recently IB device removal callback was changed to disconnect connections that used the removing device rather than destroying them. So connections with devices during rmmod were not being destroyed. Second, rds_ib_destroy_nodev_conns() was being called before connections are disassociated with devices. It would almost never find connections in the nodev list. We first get rid of rds_ib_destroy_conns(), which is no longer called, and refactor the existing caller into the main body of the function and get rid of the list and lock wrappers. Then we call rds_ib_destroy_nodev_conns() *after* ib_unregister_client() has removed the IB device from all the conns and put the conns on the nodev list. The result is that IB connections are destroyed by rmmod. Signed-off-by: Zach Brown --- net/rds/ib.c | 2 +- net/rds/ib.h | 10 +--------- net/rds/ib_rdma.c | 9 ++++----- 3 files changed, 6 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index fc14f63..af1ef18 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -346,8 +346,8 @@ static void rds_ib_unregister_client(void) void rds_ib_exit(void) { rds_info_deregister_func(RDS_INFO_IB_CONNECTIONS, rds_ib_ic_info); - rds_ib_destroy_nodev_conns(); rds_ib_unregister_client(); + rds_ib_destroy_nodev_conns(); rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); diff --git a/net/rds/ib.h b/net/rds/ib.h index 6265ada..e9f9ddf 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -299,15 +299,7 @@ void rds_ib_cm_connect_complete(struct rds_connection *conn, int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr); void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn); -void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock); -static inline void rds_ib_destroy_nodev_conns(void) -{ - __rds_ib_destroy_conns(&ib_nodev_conns, &ib_nodev_conns_lock); -} -static inline void rds_ib_destroy_conns(struct rds_ib_device *rds_ibdev) -{ - __rds_ib_destroy_conns(&rds_ibdev->conn_list, &rds_ibdev->spinlock); -} +void rds_ib_destroy_nodev_conns(void); struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *); void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo); void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 2a4ec11..00f3995 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -198,16 +198,15 @@ void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection * rds_ib_dev_put(rds_ibdev); } -void __rds_ib_destroy_conns(struct list_head *list, spinlock_t *list_lock) +void rds_ib_destroy_nodev_conns(void) { struct rds_ib_connection *ic, *_ic; LIST_HEAD(tmp_list); /* avoid calling conn_destroy with irqs off */ - spin_lock_irq(list_lock); - list_splice(list, &tmp_list); - INIT_LIST_HEAD(list); - spin_unlock_irq(list_lock); + spin_lock_irq(&ib_nodev_conns_lock); + list_splice(&ib_nodev_conns, &tmp_list); + spin_unlock_irq(&ib_nodev_conns_lock); list_for_each_entry_safe(ic, _ic, &tmp_list, ib_node) rds_conn_destroy(ic->conn); -- cgit v1.1 From 515e079dab19cf774d1eec6e5f4ed65509e31ef1 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 6 Jul 2010 15:09:56 -0700 Subject: RDS/IB: create a work queue for FMR flushing This patch moves the FMR flushing work in to its own mult-threaded work queue. This is to maintain performance in preparation for returning the main krdsd work queue back to a single threaded work queue to avoid deep-rooted concurrency bugs. This is also good because it further separates FMRs, which might be removed some day, from the rest of the code base. Signed-off-by: Zach Brown --- net/rds/ib.c | 9 ++++++++- net/rds/ib.h | 2 ++ net/rds/ib_rdma.c | 25 +++++++++++++++++++++++-- 3 files changed, 33 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index af1ef18..d2007b9 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -351,6 +351,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); } struct rds_transport rds_ib_transport = { @@ -386,10 +387,14 @@ int __init rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -412,6 +417,8 @@ out_sysctl: rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index e9f9ddf..fd4ea69 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -308,6 +308,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); +int __init rds_ib_fmr_init(void); +void __exit rds_ib_fmr_exit(void); /* ib_recv.c */ int __init rds_ib_recv_init(void); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 00f3995..0eb5976 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -691,6 +691,26 @@ out_nolock: return ret; } +struct workqueue_struct *rds_ib_fmr_wq; + +int __init rds_ib_fmr_init(void) +{ + rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); + if (!rds_ib_fmr_wq) + return -ENOMEM; + return 0; +} + +/* + * By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void __exit rds_ib_fmr_exit(void) +{ + destroy_workqueue(rds_ib_fmr_wq); +} + static void rds_ib_mr_pool_flush_worker(struct work_struct *work) { struct rds_ib_mr_pool *pool = container_of(work, struct rds_ib_mr_pool, flush_worker.work); @@ -718,7 +738,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - queue_delayed_work(rds_wq, &pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { @@ -726,7 +746,8 @@ void rds_ib_free_mr(void *trans_private, int invalidate) } else { /* We get here if the user created a MR marked * as use_once and invalidate at the same time. */ - queue_delayed_work(rds_wq, &pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, + &pool->flush_worker, 10); } } -- cgit v1.1 From 80c51be56ffa257d3177f0d750d90be65d30c22f Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 6 Jul 2010 15:08:48 -0700 Subject: RDS: return to a single-threaded krdsd We were seeing very nasty bugs due to fundamental assumption the current code makes about concurrent work struct processing. The code simpy isn't able to handle concurrent connection shutdown work function execution today, for example, which is very much possible once a multi-threaded krdsd was introduced. The problem compounds as additional work structs are added to the mix. krdsd is no longer perforance critical now that send and receive posting and FMR flushing are done elsewhere, so the safest fix is to move back to the single threaded krdsd that the current code was built around. Signed-off-by: Zach Brown --- net/rds/threads.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/threads.c b/net/rds/threads.c index 2bab9bf..c08c220 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -214,7 +214,7 @@ void rds_threads_exit(void) int __init rds_threads_init(void) { - rds_wq = create_workqueue("krdsd"); + rds_wq = create_singlethread_workqueue("krdsd"); if (!rds_wq) return -ENOMEM; -- cgit v1.1 From d455ab64096b9a86849c7315c53e595330842db6 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 6 Jul 2010 15:04:34 -0700 Subject: RDS/IB: always process recv completions The recv refill path was leaking fragments because the recv event handler had marked a ring element as free without freeing its frag. This was happening because it wasn't processing receives when the conn wasn't marked up or connecting, as can be the case if it races with rmmod. Two observations support always processing receives in the callback. First, buildup should only post receives, thus triggering recv event handler calls, once it has built up all the state to handle them. Teardown should destroy the CQ and drain the ring before tearing down the state needed to process recvs. Both appear to be true today. Second, this test was fundamentally racy. There is nothing to stop rmmod and connection destruction from swooping in the moment after the conn state was sampled but before real receive procesing starts. Signed-off-by: Zach Brown --- net/rds/ib_recv.c | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index c8c6098..9c4208f 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -979,18 +979,22 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, * to get a recv completion _before_ the rdmacm ESTABLISHED * event is processed. */ - if (rds_conn_up(conn) || rds_conn_connecting(conn)) { + if (wc.status == IB_WC_SUCCESS) { + rds_ib_process_recv(conn, recv, wc.byte_len, state); + } else { /* We expect errors as the qp is drained during shutdown */ - if (wc.status == IB_WC_SUCCESS) { - rds_ib_process_recv(conn, recv, wc.byte_len, state); - } else { + if (rds_conn_up(conn) || rds_conn_connecting(conn)) rds_ib_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " - "reconnecting\n", &conn->c_faddr, - wc.status); - } + "%pI4 had status %u, disconnecting and " + "reconnecting\n", &conn->c_faddr, + wc.status); } + /* + * It's very important that we only free this ring entry if we've truly + * freed the resources allocated to the entry. The refilling path can + * leak if we don't. + */ rds_ib_ring_free(&ic->i_recv_ring, 1); } } -- cgit v1.1 From c20f5b9633bb0953bd2422f0f1430a2028cdbd0a Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 7 Jul 2010 16:46:26 -0700 Subject: RDS/IB: Use SLAB_HWCACHE_ALIGN flag for kmem_cache_create() We are *definitely* counting cycles as closely as DaveM, so ensure hwcache alignment for our recv ring control structs. Signed-off-by: Andy Grover --- net/rds/ib_recv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 9c4208f..37dab28 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1051,13 +1051,13 @@ int __init rds_ib_recv_init(void) rds_ib_incoming_slab = kmem_cache_create("rds_ib_incoming", sizeof(struct rds_ib_incoming), - 0, 0, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!rds_ib_incoming_slab) goto out; rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), - 0, 0, NULL); + 0, SLAB_HWCACHE_ALIGN, NULL); if (!rds_ib_frag_slab) kmem_cache_destroy(rds_ib_incoming_slab); else -- cgit v1.1 From ef87b7ea39a91906218a262686bcb8bad8b6b46e Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 9 Jul 2010 12:26:20 -0700 Subject: RDS: remove __init and __exit annotation The trivial amount of memory saved isn't worth the cost of dealing with section mismatches. Signed-off-by: Zach Brown --- net/rds/af_rds.c | 4 ++-- net/rds/connection.c | 2 +- net/rds/ib.c | 2 +- net/rds/ib.h | 10 +++++----- net/rds/ib_rdma.c | 4 ++-- net/rds/ib_recv.c | 2 +- net/rds/ib_sysctl.c | 2 +- net/rds/iw.c | 2 +- net/rds/iw.h | 6 +++--- net/rds/iw_recv.c | 2 +- net/rds/iw_sysctl.c | 2 +- net/rds/rdma_transport.c | 4 ++-- net/rds/rds.h | 10 +++++----- net/rds/stats.c | 2 +- net/rds/sysctl.c | 2 +- net/rds/tcp.c | 2 +- net/rds/tcp.h | 6 +++--- net/rds/tcp_listen.c | 2 +- net/rds/tcp_recv.c | 2 +- net/rds/threads.c | 2 +- 20 files changed, 35 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index f16d2a9..57ef0ec 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -521,7 +521,7 @@ out: spin_unlock_irqrestore(&rds_sock_lock, flags); } -static void __exit rds_exit(void) +static void rds_exit(void) { sock_unregister(rds_family_ops.family); proto_unregister(&rds_proto); @@ -536,7 +536,7 @@ static void __exit rds_exit(void) } module_exit(rds_exit); -static int __init rds_init(void) +static int rds_init(void) { int ret; diff --git a/net/rds/connection.c b/net/rds/connection.c index 7670b45..0de40d9 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -488,7 +488,7 @@ static void rds_conn_info(struct socket *sock, unsigned int len, sizeof(struct rds_info_connection)); } -int __init rds_conn_init(void) +int rds_conn_init(void) { rds_conn_slab = kmem_cache_create("rds_connection", sizeof(struct rds_connection), diff --git a/net/rds/ib.c b/net/rds/ib.c index d2007b9..3eb5617 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -381,7 +381,7 @@ struct rds_transport rds_ib_transport = { .t_type = RDS_TRANS_IB }; -int __init rds_ib_init(void) +int rds_ib_init(void) { int ret; diff --git a/net/rds/ib.h b/net/rds/ib.h index fd4ea69..acda2db 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -282,7 +282,7 @@ void rds_ib_conn_free(void *arg); int rds_ib_conn_connect(struct rds_connection *conn); void rds_ib_conn_shutdown(struct rds_connection *conn); void rds_ib_state_change(struct sock *sk); -int __init rds_ib_listen_init(void); +int rds_ib_listen_init(void); void rds_ib_listen_stop(void); void __rds_ib_conn_error(struct rds_connection *conn, const char *, ...); int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -308,11 +308,11 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); -int __init rds_ib_fmr_init(void); -void __exit rds_ib_fmr_exit(void); +int rds_ib_fmr_init(void); +void rds_ib_fmr_exit(void); /* ib_recv.c */ -int __init rds_ib_recv_init(void); +int rds_ib_recv_init(void); void rds_ib_recv_exit(void); int rds_ib_recv(struct rds_connection *conn); int rds_ib_recv_alloc_caches(struct rds_ib_connection *ic); @@ -363,7 +363,7 @@ unsigned int rds_ib_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_ib_sysctl_init(void); +int rds_ib_sysctl_init(void); void rds_ib_sysctl_exit(void); extern unsigned long rds_ib_sysctl_max_send_wr; extern unsigned long rds_ib_sysctl_max_recv_wr; diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0eb5976..3efdddc 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -693,7 +693,7 @@ out_nolock: struct workqueue_struct *rds_ib_fmr_wq; -int __init rds_ib_fmr_init(void) +int rds_ib_fmr_init(void) { rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); if (!rds_ib_fmr_wq) @@ -706,7 +706,7 @@ int __init rds_ib_fmr_init(void) * had their pools freed. As each pool is freed its work struct is waited on, * so the pool flushing work queue should be idle by the time we get here. */ -void __exit rds_ib_fmr_exit(void) +void rds_ib_fmr_exit(void) { destroy_workqueue(rds_ib_fmr_wq); } diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index 37dab28..f25c483 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1040,7 +1040,7 @@ int rds_ib_recv(struct rds_connection *conn) return ret; } -int __init rds_ib_recv_init(void) +int rds_ib_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index b556c5c..fc3da37 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -119,7 +119,7 @@ void rds_ib_sysctl_exit(void) unregister_sysctl_table(rds_ib_sysctl_hdr); } -int __init rds_ib_sysctl_init(void) +int rds_ib_sysctl_init(void) { rds_ib_sysctl_hdr = register_sysctl_paths(rds_ib_sysctl_path, rds_ib_sysctl_table); if (!rds_ib_sysctl_hdr) diff --git a/net/rds/iw.c b/net/rds/iw.c index 467790d..56808ca 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -287,7 +287,7 @@ struct rds_transport rds_iw_transport = { .t_prefer_loopback = 1, }; -int __init rds_iw_init(void) +int rds_iw_init(void) { int ret; diff --git a/net/rds/iw.h b/net/rds/iw.h index f112105..543e665 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -284,7 +284,7 @@ void rds_iw_conn_free(void *arg); int rds_iw_conn_connect(struct rds_connection *conn); void rds_iw_conn_shutdown(struct rds_connection *conn); void rds_iw_state_change(struct sock *sk); -int __init rds_iw_listen_init(void); +int rds_iw_listen_init(void); void rds_iw_listen_stop(void); void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...); int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id, @@ -321,7 +321,7 @@ void rds_iw_flush_mrs(void); void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); /* ib_recv.c */ -int __init rds_iw_recv_init(void); +int rds_iw_recv_init(void); void rds_iw_recv_exit(void); int rds_iw_recv(struct rds_connection *conn); int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp, @@ -370,7 +370,7 @@ unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); /* ib_sysctl.c */ -int __init rds_iw_sysctl_init(void); +int rds_iw_sysctl_init(void); void rds_iw_sysctl_exit(void); extern unsigned long rds_iw_sysctl_max_send_wr; extern unsigned long rds_iw_sysctl_max_recv_wr; diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c index 7e929f7..5e57347 100644 --- a/net/rds/iw_recv.c +++ b/net/rds/iw_recv.c @@ -887,7 +887,7 @@ int rds_iw_recv(struct rds_connection *conn) return ret; } -int __init rds_iw_recv_init(void) +int rds_iw_recv_init(void) { struct sysinfo si; int ret = -ENOMEM; diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 3cb0587..23e3a9a 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -122,7 +122,7 @@ void rds_iw_sysctl_exit(void) unregister_sysctl_table(rds_iw_sysctl_hdr); } -int __init rds_iw_sysctl_init(void) +int rds_iw_sysctl_init(void) { rds_iw_sysctl_hdr = register_sysctl_paths(rds_iw_sysctl_path, rds_iw_sysctl_table); if (!rds_iw_sysctl_hdr) diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index e599ba2..550d348 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -122,7 +122,7 @@ out: return ret; } -static int __init rds_rdma_listen_init(void) +static int rds_rdma_listen_init(void) { struct sockaddr_in sin; struct rdma_cm_id *cm_id; @@ -177,7 +177,7 @@ static void rds_rdma_listen_stop(void) } } -int __init rds_rdma_init(void) +int rds_rdma_init(void) { int ret; diff --git a/net/rds/rds.h b/net/rds/rds.h index 4510344..8a8a482 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -606,7 +606,7 @@ void rds_cong_exit(void); struct rds_message *rds_cong_update_alloc(struct rds_connection *conn); /* conn.c */ -int __init rds_conn_init(void); +int rds_conn_init(void); void rds_conn_exit(void); struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); @@ -769,14 +769,14 @@ DECLARE_PER_CPU_SHARED_ALIGNED(struct rds_statistics, rds_stats); put_cpu(); \ } while (0) #define rds_stats_add(member, count) rds_stats_add_which(rds_stats, member, count) -int __init rds_stats_init(void); +int rds_stats_init(void); void rds_stats_exit(void); void rds_stats_info_copy(struct rds_info_iterator *iter, uint64_t *values, const char *const *names, size_t nr); /* sysctl.c */ -int __init rds_sysctl_init(void); +int rds_sysctl_init(void); void rds_sysctl_exit(void); extern unsigned long rds_sysctl_sndbuf_min; extern unsigned long rds_sysctl_sndbuf_default; @@ -790,7 +790,7 @@ extern unsigned long rds_sysctl_trace_flags; extern unsigned int rds_sysctl_trace_level; /* threads.c */ -int __init rds_threads_init(void); +int rds_threads_init(void); void rds_threads_exit(void); extern struct workqueue_struct *rds_wq; void rds_queue_reconnect(struct rds_connection *conn); @@ -806,7 +806,7 @@ void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(__be32 addr); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); -int __init rds_trans_init(void); +int rds_trans_init(void); void rds_trans_exit(void); #endif diff --git a/net/rds/stats.c b/net/rds/stats.c index 344929a..10c759c 100644 --- a/net/rds/stats.c +++ b/net/rds/stats.c @@ -143,7 +143,7 @@ void rds_stats_exit(void) rds_info_deregister_func(RDS_INFO_COUNTERS, rds_stats_info); } -int __init rds_stats_init(void) +int rds_stats_init(void) { rds_info_register_func(RDS_INFO_COUNTERS, rds_stats_info); return 0; diff --git a/net/rds/sysctl.c b/net/rds/sysctl.c index 5a6dd81..25ad0c7 100644 --- a/net/rds/sysctl.c +++ b/net/rds/sysctl.c @@ -105,7 +105,7 @@ void rds_sysctl_exit(void) unregister_sysctl_table(rds_sysctl_reg_table); } -int __init rds_sysctl_init(void) +int rds_sysctl_init(void) { rds_sysctl_reconnect_min = msecs_to_jiffies(1); rds_sysctl_reconnect_min_jiffies = rds_sysctl_reconnect_min; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 3262992f5..eeb08e6 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -274,7 +274,7 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; -int __init rds_tcp_init(void) +int rds_tcp_init(void) { int ret; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index 16b1663..f5e6f7b 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -43,7 +43,7 @@ struct rds_tcp_statistics { }; /* tcp.c */ -int __init rds_tcp_init(void); +int rds_tcp_init(void); void rds_tcp_exit(void); void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); @@ -61,12 +61,12 @@ void rds_tcp_conn_shutdown(struct rds_connection *conn); void rds_tcp_state_change(struct sock *sk); /* tcp_listen.c */ -int __init rds_tcp_listen_init(void); +int rds_tcp_listen_init(void); void rds_tcp_listen_stop(void); void rds_tcp_listen_data_ready(struct sock *sk, int bytes); /* tcp_recv.c */ -int __init rds_tcp_recv_init(void); +int rds_tcp_recv_init(void); void rds_tcp_recv_exit(void); void rds_tcp_data_ready(struct sock *sk, int bytes); int rds_tcp_recv(struct rds_connection *conn); diff --git a/net/rds/tcp_listen.c b/net/rds/tcp_listen.c index 50b649e..ae27869 100644 --- a/net/rds/tcp_listen.c +++ b/net/rds/tcp_listen.c @@ -135,7 +135,7 @@ out: ready(sk, bytes); } -int __init rds_tcp_listen_init(void) +int rds_tcp_listen_init(void) { struct sockaddr_in sin; struct socket *sock = NULL; diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 49c9660..7017f3af 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -342,7 +342,7 @@ out: ready(sk, bytes); } -int __init rds_tcp_recv_init(void) +int rds_tcp_recv_init(void) { rds_tcp_incoming_slab = kmem_cache_create("rds_tcp_incoming", sizeof(struct rds_tcp_incoming), diff --git a/net/rds/threads.c b/net/rds/threads.c index c08c220..0fd90f8 100644 --- a/net/rds/threads.c +++ b/net/rds/threads.c @@ -212,7 +212,7 @@ void rds_threads_exit(void) destroy_workqueue(rds_wq); } -int __init rds_threads_init(void) +int rds_threads_init(void) { rds_wq = create_singlethread_workqueue("krdsd"); if (!rds_wq) -- cgit v1.1 From f046011cd73c372267befd10242988eb744649fe Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 14 Jul 2010 13:55:35 -0700 Subject: RDS/IB: track signaled sends We're seeing bugs today where IB connection shutdown clears the send ring while the tasklet is processing completed sends. Implementation details cause this to dereference a null pointer. Shutdown needs to wait for send completion to stop before tearing down the connection. We can't simply wait for the ring to empty because it may contain unsignaled sends that will never be processed. This patch tracks the number of signaled sends that we've posted and waits for them to complete. It also makes sure that the tasklet has finished executing. Signed-off-by: Zach Brown --- net/rds/ib.h | 1 + net/rds/ib_cm.c | 14 +++++++++++--- net/rds/ib_send.c | 47 ++++++++++++++++++++++++++++++++++++++++++----- 3 files changed, 54 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/rds/ib.h b/net/rds/ib.h index acda2db..a13ced5 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -108,6 +108,7 @@ struct rds_ib_connection { struct rds_header *i_send_hdrs; u64 i_send_hdrs_dma; struct rds_ib_send_work *i_sends; + atomic_t i_signaled_sends; /* rx */ struct tasklet_struct i_recv_tasklet; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 10f6a88..123c7d3 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -615,11 +615,18 @@ void rds_ib_conn_shutdown(struct rds_connection *conn) } /* - * Don't wait for the send ring to be empty -- there may be completed - * non-signaled entries sitting on there. We unmap these below. + * We want to wait for tx and rx completion to finish + * before we tear down the connection, but we have to be + * careful not to get stuck waiting on a send ring that + * only has unsignaled sends in it. We've shutdown new + * sends before getting here so by waiting for signaled + * sends to complete we're ensured that there will be no + * more tx processing. */ wait_event(rds_ib_ring_empty_wait, - rds_ib_ring_empty(&ic->i_recv_ring)); + rds_ib_ring_empty(&ic->i_recv_ring) && + (atomic_read(&ic->i_signaled_sends) == 0)); + tasklet_kill(&ic->i_recv_tasklet); if (ic->i_send_hdrs) ib_dma_free_coherent(dev, @@ -729,6 +736,7 @@ int rds_ib_conn_alloc(struct rds_connection *conn, gfp_t gfp) #ifndef KERNEL_HAS_ATOMIC64 spin_lock_init(&ic->i_ack_lock); #endif + atomic_set(&ic->i_signaled_sends, 0); /* * rds_ib_conn_shutdown() waits for these to be emptied so they diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index e88cb4a..15f7569 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -220,6 +220,18 @@ void rds_ib_send_clear_ring(struct rds_ib_connection *ic) } /* + * The only fast path caller always has a non-zero nr, so we don't + * bother testing nr before performing the atomic sub. + */ +static void rds_ib_sub_signaled(struct rds_ib_connection *ic, int nr) +{ + if ((atomic_sub_return(nr, &ic->i_signaled_sends) == 0) && + waitqueue_active(&rds_ib_ring_empty_wait)) + wake_up(&rds_ib_ring_empty_wait); + BUG_ON(atomic_read(&ic->i_signaled_sends) < 0); +} + +/* * The _oldest/_free ring operations here race cleanly with the alloc/unalloc * operations performed in the send path. As the sender allocs and potentially * unallocs the next free entry in the ring it doesn't alter which is @@ -236,6 +248,7 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) u32 oldest; u32 i = 0; int ret; + int nr_sig = 0; rdsdebug("cq %p conn %p\n", cq, conn); rds_ib_stats_inc(s_ib_tx_cq_call); @@ -262,6 +275,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) for (i = 0; i < completed; i++) { send = &ic->i_sends[oldest]; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; rm = rds_ib_send_unmap_op(ic, send, wc.status); @@ -282,6 +297,8 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) } rds_ib_ring_free(&ic->i_send_ring, completed); + rds_ib_sub_signaled(ic, nr_sig); + nr_sig = 0; if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) || test_bit(0, &conn->c_map_queued)) @@ -440,9 +457,9 @@ void rds_ib_advertise_credits(struct rds_connection *conn, unsigned int posted) set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags); } -static inline void rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, - struct rds_ib_send_work *send, - bool notify) +static inline int rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, + struct rds_ib_send_work *send, + bool notify) { /* * We want to delay signaling completions just enough to get @@ -452,7 +469,9 @@ static inline void rds_ib_set_wr_signal_state(struct rds_ib_connection *ic, if (ic->i_unsignaled_wrs-- == 0 || notify) { ic->i_unsignaled_wrs = rds_ib_sysctl_max_unsig_wrs; send->s_wr.send_flags |= IB_SEND_SIGNALED; + return 1; } + return 0; } /* @@ -488,6 +507,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, int bytes_sent = 0; int ret; int flow_controlled = 0; + int nr_sig = 0; BUG_ON(off % RDS_FRAG_SIZE); BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header)); @@ -645,6 +665,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (ic->i_flowctl && flow_controlled && i == (work_alloc-1)) send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED; + if (send->s_wr.send_flags & IB_SEND_SIGNALED) + nr_sig++; + rdsdebug("send %p wr %p num_sge %u next %p\n", send, &send->s_wr, send->s_wr.num_sge, send->s_wr.next); @@ -689,6 +712,9 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, if (ic->i_flowctl && i < credit_alloc) rds_ib_send_add_credits(conn, credit_alloc - i); + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + /* XXX need to worry about failed_wr and partial sends. */ failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); @@ -699,6 +725,7 @@ int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, printk(KERN_WARNING "RDS/IB: ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); if (prev->s_op) { ic->i_data_op = prev->s_op; prev->s_op = NULL; @@ -728,6 +755,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) u32 pos; u32 work_alloc; int ret; + int nr_sig = 0; rds_ibdev = ib_get_client_data(ic->i_cm_id->device, &rds_ib_client); @@ -752,7 +780,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_wr.wr.atomic.compare_add = op->op_swap_add; send->s_wr.wr.atomic.swap = 0; } - rds_ib_set_wr_signal_state(ic, send, op->op_notify); + nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.num_sge = 1; send->s_wr.next = NULL; send->s_wr.wr.atomic.remote_addr = op->op_remote_addr; @@ -778,6 +806,9 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) rdsdebug("rva %Lx rpa %Lx len %u\n", op->op_remote_addr, send->s_sge[0].addr, send->s_sge[0].length); + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + failed_wr = &send->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr); rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic, @@ -787,6 +818,7 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); goto out; } @@ -817,6 +849,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) int sent; int ret; int num_sge; + int nr_sig = 0; /* map the op the first time we see it */ if (!op->op_mapped) { @@ -859,7 +892,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) send->s_queued = jiffies; send->s_op = NULL; - rds_ib_set_wr_signal_state(ic, send, op->op_notify); + nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; send->s_wr.wr.rdma.remote_addr = remote_addr; @@ -910,6 +943,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) work_alloc = i; } + if (nr_sig) + atomic_add(nr_sig, &ic->i_signaled_sends); + failed_wr = &first->s_wr; ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr); rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic, @@ -919,6 +955,7 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op) printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 " "returned %d\n", &conn->c_faddr, ret); rds_ib_ring_unalloc(&ic->i_send_ring, work_alloc); + rds_ib_sub_signaled(ic, nr_sig); goto out; } -- cgit v1.1 From b4e1da3c9a0ac9b01f45a8578b7347e3a31f9fb8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 19 Jul 2010 17:02:41 -0700 Subject: RDS: properly use sg_init_table This is only needed to keep debugging code from bugging. Signed-off-by: Chris Mason --- net/rds/ib_recv.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index f25c483..a2f5f6f 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -282,6 +282,7 @@ static struct rds_page_frag *rds_ib_refill_one_frag(struct rds_ib_connection *ic if (!frag) return NULL; + sg_init_table(&frag->f_sg, 1); ret = rds_page_remainder_alloc(&frag->f_sg, RDS_FRAG_SIZE, page_mask); if (ret) { -- cgit v1.1 From 8576f374ac9537674e3cccb0a9d43fa2b7ebbf5b Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 19 Jul 2010 17:06:46 -0700 Subject: RDS: flush fmrs before allocating new ones Flushing FMRs is somewhat expensive, and is currently kicked off when the interrupt handler notices that we are getting low. The result of this is that FMR flushing only happens from the interrupt cpus. This spreads the load more effectively by triggering flushes just before we allocate a new FMR. Signed-off-by: Chris Mason --- net/rds/ib_rdma.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 3efdddc..0017964 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -38,6 +38,8 @@ #include "ib.h" #include "xlist.h" +struct workqueue_struct *rds_ib_fmr_wq; + static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 @@ -304,6 +306,9 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) struct rds_ib_mr *ibmr = NULL; int err = 0, iter = 0; + if (atomic_read(&pool->dirty_count) >= pool->max_items / 10) + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); + while (1) { ibmr = rds_ib_reuse_fmr(pool); if (ibmr) @@ -691,8 +696,6 @@ out_nolock: return ret; } -struct workqueue_struct *rds_ib_fmr_wq; - int rds_ib_fmr_init(void) { rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); -- cgit v1.1 From 1bde04a63d532c2540d6fdee0a661530a62b1686 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 14 Jul 2010 14:01:21 -0700 Subject: RDS/IB: print IB event strings as well as their number It's nice to not have to go digging in the code to see which event occurred. It's easy to throw together a quick array that maps the ib event enums to their strings. I didn't see anything in the stack that does this translation for us, but I also didn't look very hard. Signed-off-by: Zach Brown --- net/rds/ib_cm.c | 43 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 39 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 123c7d3..0e2fea8 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -38,6 +38,38 @@ #include "rds.h" #include "ib.h" +static char *rds_ib_event_type_strings[] = { +#define RDS_IB_EVENT_STRING(foo) [IB_EVENT_##foo] = __stringify(foo) + RDS_IB_EVENT_STRING(CQ_ERR), + RDS_IB_EVENT_STRING(QP_FATAL), + RDS_IB_EVENT_STRING(QP_REQ_ERR), + RDS_IB_EVENT_STRING(QP_ACCESS_ERR), + RDS_IB_EVENT_STRING(COMM_EST), + RDS_IB_EVENT_STRING(SQ_DRAINED), + RDS_IB_EVENT_STRING(PATH_MIG), + RDS_IB_EVENT_STRING(PATH_MIG_ERR), + RDS_IB_EVENT_STRING(DEVICE_FATAL), + RDS_IB_EVENT_STRING(PORT_ACTIVE), + RDS_IB_EVENT_STRING(PORT_ERR), + RDS_IB_EVENT_STRING(LID_CHANGE), + RDS_IB_EVENT_STRING(PKEY_CHANGE), + RDS_IB_EVENT_STRING(SM_CHANGE), + RDS_IB_EVENT_STRING(SRQ_ERR), + RDS_IB_EVENT_STRING(SRQ_LIMIT_REACHED), + RDS_IB_EVENT_STRING(QP_LAST_WQE_REACHED), + RDS_IB_EVENT_STRING(CLIENT_REREGISTER), +#undef RDS_IB_EVENT_STRING +}; + +static char *rds_ib_event_str(enum ib_event_type type) +{ + if (type < ARRAY_SIZE(rds_ib_event_type_strings) && + rds_ib_event_type_strings[type]) + return rds_ib_event_type_strings[type]; + else + return "unknown"; +}; + /* * Set the selected protocol version */ @@ -202,7 +234,8 @@ static void rds_ib_cm_fill_conn_param(struct rds_connection *conn, static void rds_ib_cq_event_handler(struct ib_event *event, void *data) { - rdsdebug("event %u data %p\n", event->event, data); + rdsdebug("event %u (%s) data %p\n", + event->event, rds_ib_event_str(event->event), data); } static void rds_ib_qp_event_handler(struct ib_event *event, void *data) @@ -210,16 +243,18 @@ static void rds_ib_qp_event_handler(struct ib_event *event, void *data) struct rds_connection *conn = data; struct rds_ib_connection *ic = conn->c_transport_data; - rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event); + rdsdebug("conn %p ic %p event %u (%s)\n", conn, ic, event->event, + rds_ib_event_str(event->event)); switch (event->event) { case IB_EVENT_COMM_EST: rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST); break; default: - rdsdebug("Fatal QP Event %u " + rdsdebug("Fatal QP Event %u (%s) " "- connection %pI4->%pI4, reconnecting\n", - event->event, &conn->c_laddr, &conn->c_faddr); + event->event, rds_ib_event_str(event->event), + &conn->c_laddr, &conn->c_faddr); rds_conn_drop(conn); break; } -- cgit v1.1 From ea819867b788728aca60717e4fdacb3df771f670 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Thu, 15 Jul 2010 12:34:33 -0700 Subject: RDS/IB: protect the list of IB devices The RDS IB device list wasn't protected by any locking. Traversal in both the get_mr and FMR flushing paths could race with additon and removal. List manipulation is done with RCU primatives and is protected by the write side of a rwsem. The list traversal in the get_mr fast path is protected by a rcu read critical section. The FMR list traversal is more problematic because it can block while traversing the list. We protect this with the read side of the rwsem. Signed-off-by: Zach Brown --- net/rds/ib.c | 27 ++++++++++++++++++++------- net/rds/ib.h | 1 + net/rds/ib_rdma.c | 8 +++++--- 3 files changed, 26 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/rds/ib.c b/net/rds/ib.c index 3eb5617..b12a395 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -53,6 +53,12 @@ MODULE_PARM_DESC(fmr_message_size, " Max size of a RDMA transfer"); module_param(rds_ib_retry_count, int, 0444); MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error"); +/* + * we have a clumsy combination of RCU and a rwsem protecting this list + * because it is used both in the get_mr fast path and while blocking in + * the FMR flushing path. + */ +DECLARE_RWSEM(rds_ib_devices_lock); struct list_head rds_ib_devices; /* NOTE: if also grabbing ibdev lock, grab this first */ @@ -171,7 +177,10 @@ void rds_ib_add_one(struct ib_device *device) INIT_LIST_HEAD(&rds_ibdev->ipaddr_list); INIT_LIST_HEAD(&rds_ibdev->conn_list); - list_add_tail(&rds_ibdev->list, &rds_ib_devices); + + down_write(&rds_ib_devices_lock); + list_add_tail_rcu(&rds_ibdev->list, &rds_ib_devices); + up_write(&rds_ib_devices_lock); atomic_inc(&rds_ibdev->refcount); ib_set_client_data(device, &rds_ib_client, rds_ibdev); @@ -230,16 +239,20 @@ void rds_ib_remove_one(struct ib_device *device) rds_ib_dev_shutdown(rds_ibdev); + /* stop connection attempts from getting a reference to this device. */ + ib_set_client_data(device, &rds_ib_client, NULL); + + down_write(&rds_ib_devices_lock); + list_del_rcu(&rds_ibdev->list); + up_write(&rds_ib_devices_lock); + /* - * prevent future connection attempts from getting a reference to this - * device and wait for currently racing connection attempts to finish - * getting their reference + * This synchronize rcu is waiting for readers of both the ib + * client data and the devices list to finish before we drop + * both of those references. */ - ib_set_client_data(device, &rds_ib_client, NULL); synchronize_rcu(); rds_ib_dev_put(rds_ibdev); - - list_del(&rds_ibdev->list); rds_ib_dev_put(rds_ibdev); } diff --git a/net/rds/ib.h b/net/rds/ib.h index a13ced5..2189fd4 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -23,6 +23,7 @@ #define RDS_IB_RECYCLE_BATCH_COUNT 32 +extern struct rw_semaphore rds_ib_devices_lock; extern struct list_head rds_ib_devices; /* diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 0017964..8f6e221 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -94,8 +94,8 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) struct rds_ib_device *rds_ibdev; struct rds_ib_ipaddr *i_ipaddr; - list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { - rcu_read_lock(); + rcu_read_lock(); + list_for_each_entry_rcu(rds_ibdev, &rds_ib_devices, list) { list_for_each_entry_rcu(i_ipaddr, &rds_ibdev->ipaddr_list, list) { if (i_ipaddr->ipaddr == ipaddr) { atomic_inc(&rds_ibdev->refcount); @@ -103,8 +103,8 @@ static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr) return rds_ibdev; } } - rcu_read_unlock(); } + rcu_read_unlock(); return NULL; } @@ -761,12 +761,14 @@ void rds_ib_flush_mrs(void) { struct rds_ib_device *rds_ibdev; + down_read(&rds_ib_devices_lock); list_for_each_entry(rds_ibdev, &rds_ib_devices, list) { struct rds_ib_mr_pool *pool = rds_ibdev->mr_pool; if (pool) rds_ib_flush_mr_pool(pool, 0, NULL); } + up_read(&rds_ib_devices_lock); } void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, -- cgit v1.1 From fe8ff6b58f040dd52d2db45972db8e0301847f1c Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 23 Jul 2010 10:30:45 -0700 Subject: RDS: lock rds_conn_count decrement in rds_conn_destroy() rds_conn_destroy() can race with all other modifications of the rds_conn_count but it was modifying the count without locking. Signed-off-by: Zach Brown --- net/rds/connection.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 0de40d9..75a1a37 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -313,6 +313,7 @@ void rds_conn_shutdown(struct rds_connection *conn) void rds_conn_destroy(struct rds_connection *conn) { struct rds_message *rm, *rtmp; + unsigned long flags; rdsdebug("freeing conn %p for %pI4 -> " "%pI4\n", conn, &conn->c_laddr, @@ -350,7 +351,9 @@ void rds_conn_destroy(struct rds_connection *conn) BUG_ON(!list_empty(&conn->c_retrans)); kmem_cache_free(rds_conn_slab, conn); + spin_lock_irqsave(&rds_conn_lock, flags); rds_conn_count--; + spin_unlock_irqrestore(&rds_conn_lock, flags); } EXPORT_SYMBOL_GPL(rds_conn_destroy); -- cgit v1.1 From 77510481c0c3980c8979ed236d63e59221fb8ce5 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Wed, 21 Jul 2010 15:13:25 -0700 Subject: RDS: remove old rs_transport comment rs_transport is now also used by the rdma paths once the socket is bound. We don't need this stale comment to tell us what cscope can. Signed-off-by: Zach Brown --- net/rds/rds.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/rds/rds.h b/net/rds/rds.h index 8a8a482..2ff7fc9 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -457,12 +457,6 @@ struct rds_sock { __be32 rs_conn_addr; __be16 rs_bound_port; __be16 rs_conn_port; - - /* - * This is only used to communicate the transport between bind and - * initiating connections. All other trans use is referenced through - * the connection. - */ struct rds_transport *rs_transport; /* -- cgit v1.1 From 5adb5bc65f93e52341c3fc9d03d4030dd375e256 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 23 Jul 2010 10:32:31 -0700 Subject: RDS: have sockets get transport module references Right now there's nothing to stop the various paths that use rs->rs_transport from racing with rmmod and executing freed transport code. The simple fix is to have binding to a transport also hold a reference to the transport's module, removing this class of races. We already had an unused t_owner field which was set for the modular transports and which wasn't set for the built-in loop transport. Signed-off-by: Zach Brown --- net/rds/af_rds.c | 2 ++ net/rds/connection.c | 5 ++++- net/rds/rds.h | 1 + net/rds/transport.c | 19 ++++++++++++++----- 4 files changed, 21 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 57ef0ec..8e3886d 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -90,6 +90,8 @@ static int rds_release(struct socket *sock) rds_sock_count--; spin_unlock_irqrestore(&rds_sock_lock, flags); + rds_trans_put(rs->rs_transport); + sock->sk = NULL; sock_put(sk); out: diff --git a/net/rds/connection.c b/net/rds/connection.c index 75a1a37..968b7a7 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -117,6 +117,7 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, { struct rds_connection *conn, *parent = NULL; struct hlist_head *head = rds_conn_bucket(laddr, faddr); + struct rds_transport *loop_trans; unsigned long flags; int ret; @@ -163,7 +164,9 @@ static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - if (rds_trans_get_preferred(faddr)) { + loop_trans = rds_trans_get_preferred(faddr); + if (loop_trans) { + rds_trans_put(loop_trans); conn->c_loopback = 1; if (is_outgoing && trans->t_prefer_loopback) { /* "outgoing" connection - and the transport diff --git a/net/rds/rds.h b/net/rds/rds.h index 2ff7fc9..aab5e94 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -798,6 +798,7 @@ void rds_connect_complete(struct rds_connection *conn); int rds_trans_register(struct rds_transport *trans); void rds_trans_unregister(struct rds_transport *trans); struct rds_transport *rds_trans_get_preferred(__be32 addr); +void rds_trans_put(struct rds_transport *trans); unsigned int rds_trans_stats_info_copy(struct rds_info_iterator *iter, unsigned int avail); int rds_trans_init(void); diff --git a/net/rds/transport.c b/net/rds/transport.c index 7e10679..7f2ac4f 100644 --- a/net/rds/transport.c +++ b/net/rds/transport.c @@ -71,19 +71,28 @@ void rds_trans_unregister(struct rds_transport *trans) } EXPORT_SYMBOL_GPL(rds_trans_unregister); +void rds_trans_put(struct rds_transport *trans) +{ + if (trans && trans->t_owner) + module_put(trans->t_owner); +} + struct rds_transport *rds_trans_get_preferred(__be32 addr) { struct rds_transport *ret = NULL; - int i; + struct rds_transport *trans; + unsigned int i; if (IN_LOOPBACK(ntohl(addr))) return &rds_loop_transport; down_read(&rds_trans_sem); - for (i = 0; i < RDS_TRANS_COUNT; i++) - { - if (transports[i] && (transports[i]->laddr_check(addr) == 0)) { - ret = transports[i]; + for (i = 0; i < RDS_TRANS_COUNT; i++) { + trans = transports[i]; + + if (trans && (trans->laddr_check(addr) == 0) && + (!trans->t_owner || try_module_get(trans->t_owner))) { + ret = trans; break; } } -- cgit v1.1 From ffcec0e110c198717eb0f6ac000c1e5397db9451 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 23 Jul 2010 10:36:58 -0700 Subject: RDS: don't call rds_conn_shutdown() from rds_conn_destroy() rds_conn_shutdown() can return before the connection is shut down when it encounters an existing state that it doesn't understand. This lets rds_conn_destroy() then start tearing down the conn from under paths that are still using it. It's more reliable the shutdown work and wait for krdsd to complete the shutdown callback. This stopped some hangs I was seeing where krdsd was trying to shut down a freed conn. Signed-off-by: Zach Brown --- net/rds/connection.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 968b7a7..519b4fe 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -312,6 +312,10 @@ void rds_conn_shutdown(struct rds_connection *conn) /* * Stop and free a connection. + * + * This can only be used in very limited circumstances. It assumes that once + * the conn has been shutdown that no one else is referencing the connection. + * We can only ensure this in the rmmod path in the current code. */ void rds_conn_destroy(struct rds_connection *conn) { @@ -326,10 +330,11 @@ void rds_conn_destroy(struct rds_connection *conn) spin_lock_irq(&rds_conn_lock); hlist_del_init_rcu(&conn->c_hash_node); spin_unlock_irq(&rds_conn_lock); - synchronize_rcu(); - rds_conn_shutdown(conn); + /* shut the connection down */ + rds_conn_drop(conn); + flush_work(&conn->c_down_w); /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, -- cgit v1.1 From 4518071ac1bcb76c64a55a3fddb39fb3d39add41 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Fri, 23 Jul 2010 10:37:33 -0700 Subject: RDS: cancel connection work structs as we shut down Nothing was canceling the send and receive work that might have been queued as a conn was being destroyed. Signed-off-by: Zach Brown --- net/rds/connection.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 519b4fe..870992e 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -336,6 +336,10 @@ void rds_conn_destroy(struct rds_connection *conn) rds_conn_drop(conn); flush_work(&conn->c_down_w); + /* make sure lingering queued work won't try to ref the conn */ + cancel_delayed_work_sync(&conn->c_send_w); + cancel_delayed_work_sync(&conn->c_recv_w); + /* tear down queued messages */ list_for_each_entry_safe(rm, rtmp, &conn->c_send_queue, -- cgit v1.1 From 59f740a6aeb2cde2f79fe0df38262d4c1ef35cd8 Mon Sep 17 00:00:00 2001 From: Zach Brown Date: Tue, 3 Aug 2010 13:52:47 -0700 Subject: RDS/IB: print string constants in more places This prints the constant identifier for work completion status and rdma cm event types, like we already do for IB event types. A core string array helper is added that each string type uses. Signed-off-by: Zach Brown --- net/rds/af_rds.c | 9 +++++++++ net/rds/ib.h | 1 + net/rds/ib_cm.c | 10 ++++------ net/rds/ib_recv.c | 12 +++++++----- net/rds/ib_send.c | 47 +++++++++++++++++++++++++++++++++++++++++------ net/rds/rdma_transport.c | 38 ++++++++++++++++++++++++++++++++++---- net/rds/rds.h | 1 + 7 files changed, 97 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index 8e3886d..bb6ad81 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -40,6 +40,15 @@ #include "rds.h" +char *rds_str_array(char **array, size_t elements, size_t index) +{ + if ((index < elements) && array[index]) + return array[index]; + else + return "unknown"; +} +EXPORT_SYMBOL(rds_str_array); + /* this is just used for stats gathering :/ */ static DEFINE_SPINLOCK(rds_sock_lock); static unsigned long rds_sock_count; diff --git a/net/rds/ib.h b/net/rds/ib.h index 2189fd4..7ad3d57 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -345,6 +345,7 @@ u32 rds_ib_ring_completed(struct rds_ib_work_ring *ring, u32 wr_id, u32 oldest); extern wait_queue_head_t rds_ib_ring_empty_wait; /* ib_send.c */ +char *rds_ib_wc_status_str(enum ib_wc_status status); void rds_ib_xmit_complete(struct rds_connection *conn); int rds_ib_xmit(struct rds_connection *conn, struct rds_message *rm, unsigned int hdr_off, unsigned int sg, unsigned int off); diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0e2fea8..bc3dbc1 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -39,7 +39,8 @@ #include "ib.h" static char *rds_ib_event_type_strings[] = { -#define RDS_IB_EVENT_STRING(foo) [IB_EVENT_##foo] = __stringify(foo) +#define RDS_IB_EVENT_STRING(foo) \ + [IB_EVENT_##foo] = __stringify(IB_EVENT_##foo) RDS_IB_EVENT_STRING(CQ_ERR), RDS_IB_EVENT_STRING(QP_FATAL), RDS_IB_EVENT_STRING(QP_REQ_ERR), @@ -63,11 +64,8 @@ static char *rds_ib_event_type_strings[] = { static char *rds_ib_event_str(enum ib_event_type type) { - if (type < ARRAY_SIZE(rds_ib_event_type_strings) && - rds_ib_event_type_strings[type]) - return rds_ib_event_type_strings[type]; - else - return "unknown"; + return rds_str_array(rds_ib_event_type_strings, + ARRAY_SIZE(rds_ib_event_type_strings), type); }; /* diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index a2f5f6f..e29e0ca 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -966,8 +966,9 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, struct rds_ib_recv_work *recv; while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_rx_cq_event); @@ -985,10 +986,11 @@ static inline void rds_poll_cq(struct rds_ib_connection *ic, } else { /* We expect errors as the qp is drained during shutdown */ if (rds_conn_up(conn) || rds_conn_connecting(conn)) - rds_ib_conn_error(conn, "recv completion on " - "%pI4 had status %u, disconnecting and " + rds_ib_conn_error(conn, "recv completion on %pI4 had " + "status %u (%s), disconnecting and " "reconnecting\n", &conn->c_faddr, - wc.status); + wc.status, + rds_ib_wc_status_str(wc.status)); } /* diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 15f7569..808544a 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -38,6 +38,40 @@ #include "rds.h" #include "ib.h" +static char *rds_ib_wc_status_strings[] = { +#define RDS_IB_WC_STATUS_STR(foo) \ + [IB_WC_##foo] = __stringify(IB_WC_##foo) + RDS_IB_WC_STATUS_STR(SUCCESS), + RDS_IB_WC_STATUS_STR(LOC_LEN_ERR), + RDS_IB_WC_STATUS_STR(LOC_QP_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_EEC_OP_ERR), + RDS_IB_WC_STATUS_STR(LOC_PROT_ERR), + RDS_IB_WC_STATUS_STR(WR_FLUSH_ERR), + RDS_IB_WC_STATUS_STR(MW_BIND_ERR), + RDS_IB_WC_STATUS_STR(BAD_RESP_ERR), + RDS_IB_WC_STATUS_STR(LOC_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ACCESS_ERR), + RDS_IB_WC_STATUS_STR(REM_OP_ERR), + RDS_IB_WC_STATUS_STR(RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(RNR_RETRY_EXC_ERR), + RDS_IB_WC_STATUS_STR(LOC_RDD_VIOL_ERR), + RDS_IB_WC_STATUS_STR(REM_INV_RD_REQ_ERR), + RDS_IB_WC_STATUS_STR(REM_ABORT_ERR), + RDS_IB_WC_STATUS_STR(INV_EECN_ERR), + RDS_IB_WC_STATUS_STR(INV_EEC_STATE_ERR), + RDS_IB_WC_STATUS_STR(FATAL_ERR), + RDS_IB_WC_STATUS_STR(RESP_TIMEOUT_ERR), + RDS_IB_WC_STATUS_STR(GENERAL_ERR), +#undef RDS_IB_WC_STATUS_STR +}; + +char *rds_ib_wc_status_str(enum ib_wc_status status) +{ + return rds_str_array(rds_ib_wc_status_strings, + ARRAY_SIZE(rds_ib_wc_status_strings), status); +} + /* * Convert IB-specific error message to RDS error message and call core * completion handler. @@ -257,8 +291,9 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) rdsdebug("ib_req_notify_cq send failed: %d\n", ret); while (ib_poll_cq(cq, 1, &wc) > 0) { - rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n", - (unsigned long long)wc.wr_id, wc.status, wc.byte_len, + rdsdebug("wc wr_id 0x%llx status %u (%s) byte_len %u imm_data %u\n", + (unsigned long long)wc.wr_id, wc.status, + rds_ib_wc_status_str(wc.status), wc.byte_len, be32_to_cpu(wc.ex.imm_data)); rds_ib_stats_inc(s_ib_tx_cq_event); @@ -306,10 +341,10 @@ void rds_ib_send_cq_comp_handler(struct ib_cq *cq, void *context) /* We expect errors as the qp is drained during shutdown */ if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) { - rds_ib_conn_error(conn, - "send completion on %pI4 " - "had status %u, disconnecting and reconnecting\n", - &conn->c_faddr, wc.status); + rds_ib_conn_error(conn, "send completion on %pI4 had status " + "%u (%s), disconnecting and reconnecting\n", + &conn->c_faddr, wc.status, + rds_ib_wc_status_str(wc.status)); } } } diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index 550d348..e6ed10a 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -36,6 +36,34 @@ static struct rdma_cm_id *rds_rdma_listen_id; +static char *rds_cm_event_strings[] = { +#define RDS_CM_EVENT_STRING(foo) \ + [RDMA_CM_EVENT_##foo] = __stringify(RDMA_CM_EVENT_##foo) + RDS_CM_EVENT_STRING(ADDR_RESOLVED), + RDS_CM_EVENT_STRING(ADDR_ERROR), + RDS_CM_EVENT_STRING(ROUTE_RESOLVED), + RDS_CM_EVENT_STRING(ROUTE_ERROR), + RDS_CM_EVENT_STRING(CONNECT_REQUEST), + RDS_CM_EVENT_STRING(CONNECT_RESPONSE), + RDS_CM_EVENT_STRING(CONNECT_ERROR), + RDS_CM_EVENT_STRING(UNREACHABLE), + RDS_CM_EVENT_STRING(REJECTED), + RDS_CM_EVENT_STRING(ESTABLISHED), + RDS_CM_EVENT_STRING(DISCONNECTED), + RDS_CM_EVENT_STRING(DEVICE_REMOVAL), + RDS_CM_EVENT_STRING(MULTICAST_JOIN), + RDS_CM_EVENT_STRING(MULTICAST_ERROR), + RDS_CM_EVENT_STRING(ADDR_CHANGE), + RDS_CM_EVENT_STRING(TIMEWAIT_EXIT), +#undef RDS_CM_EVENT_STRING +}; + +static char *rds_cm_event_str(enum rdma_cm_event_type type) +{ + return rds_str_array(rds_cm_event_strings, + ARRAY_SIZE(rds_cm_event_strings), type); +}; + int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event) { @@ -44,8 +72,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rds_transport *trans; int ret = 0; - rdsdebug("conn %p id %p handling event %u\n", conn, cm_id, - event->event); + rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id, + event->event, rds_cm_event_str(event->event)); if (cm_id->device->node_type == RDMA_NODE_RNIC) trans = &rds_iw_transport; @@ -109,7 +137,8 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, default: /* things like device disconnect? */ - printk(KERN_ERR "RDS: unknown event %u!\n", event->event); + printk(KERN_ERR "RDS: unknown event %u (%s)!\n", + event->event, rds_cm_event_str(event->event)); break; } @@ -117,7 +146,8 @@ out: if (conn) mutex_unlock(&conn->c_cm_lock); - rdsdebug("id %p event %u handling ret %d\n", cm_id, event->event, ret); + rdsdebug("id %p event %u (%s) handling ret %d\n", cm_id, event->event, + rds_cm_event_str(event->event), ret); return ret; } diff --git a/net/rds/rds.h b/net/rds/rds.h index aab5e94..aadaddb 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -566,6 +566,7 @@ struct rds_statistics { }; /* af_rds.c */ +char *rds_str_array(char **array, size_t elements, size_t index); void rds_sock_addref(struct rds_sock *rs); void rds_sock_put(struct rds_sock *rs); void rds_wake_sk_sleep(struct rds_sock *rs); -- cgit v1.1 From 20c72bd5f5f902e5a8745d51573699605bf8d21c Mon Sep 17 00:00:00 2001 From: Andy Grover Date: Wed, 25 Aug 2010 05:51:28 -0700 Subject: RDS: Implement masked atomic operations Add two CMSGs for masked versions of cswp and fadd. args struct modified to use a union for different atomic op type's arguments. Change IB to do masked atomic ops. Atomic op type in rds_message similarly unionized. Signed-off-by: Andy Grover --- net/rds/ib_send.c | 14 +++++++++----- net/rds/rdma.c | 33 +++++++++++++++++++++++++++------ net/rds/rds.h | 14 ++++++++++++-- net/rds/send.c | 4 ++++ 4 files changed, 52 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index 808544a..71f373c 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -807,13 +807,17 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op) send->s_queued = jiffies; if (op->op_type == RDS_ATOMIC_TYPE_CSWP) { - send->s_wr.opcode = IB_WR_ATOMIC_CMP_AND_SWP; - send->s_wr.wr.atomic.compare_add = op->op_compare; - send->s_wr.wr.atomic.swap = op->op_swap_add; + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP; + send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare; + send->s_wr.wr.atomic.swap = op->op_m_cswp.swap; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask; + send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask; } else { /* FADD */ - send->s_wr.opcode = IB_WR_ATOMIC_FETCH_AND_ADD; - send->s_wr.wr.atomic.compare_add = op->op_swap_add; + send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD; + send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add; send->s_wr.wr.atomic.swap = 0; + send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask; + send->s_wr.wr.atomic.swap_mask = 0; } nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify); send->s_wr.num_sge = 1; diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 48781fe..4806467 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -738,13 +738,34 @@ int rds_cmsg_atomic(struct rds_sock *rs, struct rds_message *rm, args = CMSG_DATA(cmsg); - if (cmsg->cmsg_type == RDS_CMSG_ATOMIC_CSWP) { - rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; - rm->atomic.op_swap_add = args->cswp.swap; - rm->atomic.op_compare = args->cswp.compare; - } else { + /* Nonmasked & masked cmsg ops converted to masked hw ops */ + switch (cmsg->cmsg_type) { + case RDS_CMSG_ATOMIC_FADD: + rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; + rm->atomic.op_m_fadd.add = args->fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = 0; + break; + case RDS_CMSG_MASKED_ATOMIC_FADD: rm->atomic.op_type = RDS_ATOMIC_TYPE_FADD; - rm->atomic.op_swap_add = args->fadd.add; + rm->atomic.op_m_fadd.add = args->m_fadd.add; + rm->atomic.op_m_fadd.nocarry_mask = args->m_fadd.nocarry_mask; + break; + case RDS_CMSG_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->cswp.compare; + rm->atomic.op_m_cswp.swap = args->cswp.swap; + rm->atomic.op_m_cswp.compare_mask = ~0; + rm->atomic.op_m_cswp.swap_mask = ~0; + break; + case RDS_CMSG_MASKED_ATOMIC_CSWP: + rm->atomic.op_type = RDS_ATOMIC_TYPE_CSWP; + rm->atomic.op_m_cswp.compare = args->m_cswp.compare; + rm->atomic.op_m_cswp.swap = args->m_cswp.swap; + rm->atomic.op_m_cswp.compare_mask = args->m_cswp.compare_mask; + rm->atomic.op_m_cswp.swap_mask = args->m_cswp.swap_mask; + break; + default: + BUG(); /* should never happen */ } rm->atomic.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); diff --git a/net/rds/rds.h b/net/rds/rds.h index aadaddb..8103dcf 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -316,8 +316,18 @@ struct rds_message { struct { struct rm_atomic_op { int op_type; - uint64_t op_swap_add; - uint64_t op_compare; + union { + struct { + uint64_t compare; + uint64_t swap; + uint64_t compare_mask; + uint64_t swap_mask; + } op_m_cswp; + struct { + uint64_t add; + uint64_t nocarry_mask; + } op_m_fadd; + }; u32 op_rkey; u64 op_remote_addr; diff --git a/net/rds/send.c b/net/rds/send.c index 81471b2..9b951a0 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -843,6 +843,8 @@ static int rds_rm_size(struct msghdr *msg, int data_len) case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: cmsg_groups |= 1; size += sizeof(struct scatterlist); break; @@ -894,6 +896,8 @@ static int rds_cmsg_send(struct rds_sock *rs, struct rds_message *rm, break; case RDS_CMSG_ATOMIC_CSWP: case RDS_CMSG_ATOMIC_FADD: + case RDS_CMSG_MASKED_ATOMIC_CSWP: + case RDS_CMSG_MASKED_ATOMIC_FADD: ret = rds_cmsg_atomic(rs, rm, cmsg); break; -- cgit v1.1 From e0386005ff2a729998735e10769d99e1acbc2dd1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2010 21:31:35 -0700 Subject: net: inet_add_protocol() can use cmpxchg() Use cmpxchg() to get rid of spinlocks in inet_add_protocol() and friends. inet_protos[] & inet6_protos[] are moved to read_mostly section Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/protocol.c | 31 +++++-------------------------- net/ipv6/protocol.c | 32 ++++---------------------------- 2 files changed, 9 insertions(+), 54 deletions(-) (limited to 'net') diff --git a/net/ipv4/protocol.c b/net/ipv4/protocol.c index f2d2973..65699c2 100644 --- a/net/ipv4/protocol.c +++ b/net/ipv4/protocol.c @@ -28,8 +28,7 @@ #include #include -const struct net_protocol *inet_protos[MAX_INET_PROTOS] ____cacheline_aligned_in_smp; -static DEFINE_SPINLOCK(inet_proto_lock); +const struct net_protocol *inet_protos[MAX_INET_PROTOS] __read_mostly; /* * Add a protocol handler to the hash tables @@ -37,20 +36,9 @@ static DEFINE_SPINLOCK(inet_proto_lock); int inet_add_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; + int hash = protocol & (MAX_INET_PROTOS - 1); - hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash]) { - ret = -1; - } else { - inet_protos[hash] = prot; - ret = 0; - } - spin_unlock_bh(&inet_proto_lock); - - return ret; + return !cmpxchg(&inet_protos[hash], NULL, prot) ? 0 : -1; } EXPORT_SYMBOL(inet_add_protocol); @@ -60,18 +48,9 @@ EXPORT_SYMBOL(inet_add_protocol); int inet_del_protocol(const struct net_protocol *prot, unsigned char protocol) { - int hash, ret; - - hash = protocol & (MAX_INET_PROTOS - 1); + int ret, hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet_proto_lock); - if (inet_protos[hash] == prot) { - inet_protos[hash] = NULL; - ret = 0; - } else { - ret = -1; - } - spin_unlock_bh(&inet_proto_lock); + ret = (cmpxchg(&inet_protos[hash], prot, NULL) == prot) ? 0 : -1; synchronize_net(); diff --git a/net/ipv6/protocol.c b/net/ipv6/protocol.c index 1fa3468..9bb936a 100644 --- a/net/ipv6/protocol.c +++ b/net/ipv6/protocol.c @@ -25,28 +25,14 @@ #include #include -const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS]; -static DEFINE_SPINLOCK(inet6_proto_lock); - +const struct inet6_protocol *inet6_protos[MAX_INET_PROTOS] __read_mostly; int inet6_add_protocol(const struct inet6_protocol *prot, unsigned char protocol) { - int ret, hash = protocol & (MAX_INET_PROTOS - 1); - - spin_lock_bh(&inet6_proto_lock); - - if (inet6_protos[hash]) { - ret = -1; - } else { - inet6_protos[hash] = prot; - ret = 0; - } - - spin_unlock_bh(&inet6_proto_lock); + int hash = protocol & (MAX_INET_PROTOS - 1); - return ret; + return !cmpxchg(&inet6_protos[hash], NULL, prot) ? 0 : -1; } - EXPORT_SYMBOL(inet6_add_protocol); /* @@ -57,20 +43,10 @@ int inet6_del_protocol(const struct inet6_protocol *prot, unsigned char protocol { int ret, hash = protocol & (MAX_INET_PROTOS - 1); - spin_lock_bh(&inet6_proto_lock); - - if (inet6_protos[hash] != prot) { - ret = -1; - } else { - inet6_protos[hash] = NULL; - ret = 0; - } - - spin_unlock_bh(&inet6_proto_lock); + ret = (cmpxchg(&inet6_protos[hash], prot, NULL) == prot) ? 0 : -1; synchronize_net(); return ret; } - EXPORT_SYMBOL(inet6_del_protocol); -- cgit v1.1 From a700d8be733bd593ea4797dfde17aed4f35213c0 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 8 Sep 2010 03:48:47 +0000 Subject: net/core: remove address space warnings on verify_iovec() move_addr_to_kernel() and copy_from_user() requires their argument as __user pointer but were missing proper markups. Add it. This removes following warnings from sparse. net/core/iovec.c:44:52: warning: incorrect type in argument 1 (different address spaces) net/core/iovec.c:44:52: expected void [noderef] *uaddr net/core/iovec.c:44:52: got void *msg_name net/core/iovec.c:55:34: warning: incorrect type in argument 2 (different address spaces) net/core/iovec.c:55:34: expected void const [noderef] *from net/core/iovec.c:55:34: got struct iovec *msg_iov Signed-off-by: Namhyung Kim Signed-off-by: David S. Miller --- net/core/iovec.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/iovec.c b/net/core/iovec.c index 1cd98df..f4657c2 100644 --- a/net/core/iovec.c +++ b/net/core/iovec.c @@ -41,7 +41,9 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, if (m->msg_namelen) { if (mode == VERIFY_READ) { - err = move_addr_to_kernel(m->msg_name, m->msg_namelen, + void __user *namep; + namep = (void __user __force *) m->msg_name; + err = move_addr_to_kernel(namep, m->msg_namelen, address); if (err < 0) return err; @@ -52,7 +54,7 @@ int verify_iovec(struct msghdr *m, struct iovec *iov, struct sockaddr *address, } size = m->msg_iovlen * sizeof(struct iovec); - if (copy_from_user(iov, m->msg_iov, size)) + if (copy_from_user(iov, (void __user __force *) m->msg_iov, size)) return -EFAULT; m->msg_iov = iov; -- cgit v1.1 From f39234d60617d37818b30991e6794643ce220296 Mon Sep 17 00:00:00 2001 From: Namhyung Kim Date: Wed, 8 Sep 2010 03:48:48 +0000 Subject: net/core: add lock context change annotations in net/core/sock.c __lock_sock() and __release_sock() releases and regrabs lock but were missing proper annotations. Add it. This removes following warning from sparse. (Currently __lock_sock() does not emit any warning about it but I think it is better to add also.) net/core/sock.c:1580:17: warning: context imbalance in '__release_sock' - unexpected unlock Signed-off-by: Namhyung Kim Signed-off-by: David S. Miller --- net/core/sock.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/core/sock.c b/net/core/sock.c index b05b9b6..f3a06c4 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1557,6 +1557,8 @@ struct sk_buff *sock_alloc_send_skb(struct sock *sk, unsigned long size, EXPORT_SYMBOL(sock_alloc_send_skb); static void __lock_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { DEFINE_WAIT(wait); @@ -1573,6 +1575,8 @@ static void __lock_sock(struct sock *sk) } static void __release_sock(struct sock *sk) + __releases(&sk->sk_lock.slock) + __acquires(&sk->sk_lock.slock) { struct sk_buff *skb = sk->sk_backlog.head; -- cgit v1.1 From 49d61e2390c92bd226fc395a6165eb5a65ae4de6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Sep 2010 05:33:43 +0000 Subject: tunnels: missing rcu_assign_pointer() xfrm4_tunnel_register() & xfrm6_tunnel_register() should use rcu_assign_pointer() to make sure previous writes (to handler->next) are committed to memory before chain insertion. deregister functions dont need a particular barrier. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/tunnel4.c | 2 +- net/ipv6/tunnel6.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/tunnel4.c b/net/ipv4/tunnel4.c index df59d163..9a17bd2 100644 --- a/net/ipv4/tunnel4.c +++ b/net/ipv4/tunnel4.c @@ -39,7 +39,7 @@ int xfrm4_tunnel_register(struct xfrm_tunnel *handler, unsigned short family) } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; diff --git a/net/ipv6/tunnel6.c b/net/ipv6/tunnel6.c index 3177fe0..d986472 100644 --- a/net/ipv6/tunnel6.c +++ b/net/ipv6/tunnel6.c @@ -51,7 +51,7 @@ int xfrm6_tunnel_register(struct xfrm6_tunnel *handler, unsigned short family) } handler->next = *pprev; - *pprev = handler; + rcu_assign_pointer(*pprev, handler); ret = 0; -- cgit v1.1 From b2abd4c033c3965ce670841dfb401f5f166222d5 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Wed, 8 Sep 2010 13:31:24 +0000 Subject: tipc: Optimize handling excess content on incoming messages Remove code that trimmed excess trailing info from incoming messages arriving over an Ethernet interface. TIPC now ignores the extra info while the message is being processed by the node, and only trims it off if the message is retransmitted to another node. (This latter step is done to ensure the extra info doesn't cause the sk_buff to exceed the outgoing interface's MTU limit.) The outgoing buffer is guaranteed to be linear. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/eth_media.c | 13 +++---------- net/tipc/net.c | 1 + 2 files changed, 4 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tipc/eth_media.c b/net/tipc/eth_media.c index 673fdf0..6e988ba 100644 --- a/net/tipc/eth_media.c +++ b/net/tipc/eth_media.c @@ -101,15 +101,12 @@ static int send_msg(struct sk_buff *buf, struct tipc_bearer *tb_ptr, * Accept only packets explicitly sent to this node, or broadcast packets; * ignores packets sent using Ethernet multicast, and traffic sent to other * nodes (which can happen if interface is running in promiscuous mode). - * Routine truncates any Ethernet padding/CRC appended to the message, - * and ensures message size matches actual length */ static int recv_msg(struct sk_buff *buf, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { struct eth_bearer *eb_ptr = (struct eth_bearer *)pt->af_packet_priv; - u32 size; if (!net_eq(dev_net(dev), &init_net)) { kfree_skb(buf); @@ -118,13 +115,9 @@ static int recv_msg(struct sk_buff *buf, struct net_device *dev, if (likely(eb_ptr->bearer)) { if (likely(buf->pkt_type <= PACKET_BROADCAST)) { - size = msg_size((struct tipc_msg *)buf->data); - skb_trim(buf, size); - if (likely(buf->len == size)) { - buf->next = NULL; - tipc_recv_msg(buf, eb_ptr->bearer); - return 0; - } + buf->next = NULL; + tipc_recv_msg(buf, eb_ptr->bearer); + return 0; } } kfree_skb(buf); diff --git a/net/tipc/net.c b/net/tipc/net.c index f61b769..7e05af4 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -248,6 +248,7 @@ void tipc_net_route_msg(struct sk_buff *buf) /* Handle message for another node */ msg_dbg(msg, "NET>SEND>: "); + skb_trim(buf, msg_size(msg)); tipc_link_send(buf, dnode, msg_link_selector(msg)); } -- cgit v1.1 From a034ee3cca5726b14107f281f4bed1c0fd44472a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 9 Sep 2010 23:32:28 +0000 Subject: fib: cleanups Use rcu_dereference_rtnl() helper Change hard coded constants in fib_flag_trans() 7 -> RTN_UNREACHABLE 8 -> RTN_PROHIBIT Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 55 ++++++++++++++++++++++------------------------------- 1 file changed, 23 insertions(+), 32 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 4a8e370..a96e5ec 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -186,9 +186,7 @@ static inline struct tnode *node_parent_rcu(struct node *node) { struct tnode *ret = node_parent(node); - return rcu_dereference_check(ret, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + return rcu_dereference_rtnl(ret); } /* Same as rcu_assign_pointer @@ -211,9 +209,7 @@ static inline struct node *tnode_get_child_rcu(struct tnode *tn, unsigned int i) { struct node *ret = tnode_get_child(tn, i); - return rcu_dereference_check(ret, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + return rcu_dereference_rtnl(ret); } static inline int tnode_child_length(const struct tnode *tn) @@ -459,8 +455,8 @@ static struct tnode *tnode_new(t_key key, int pos, int bits) tn->empty_children = 1<trie, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + n = rcu_dereference_rtnl(t->trie); while (n != NULL && NODE_TYPE(n) == T_TNODE) { tn = (struct tnode *) n; @@ -1748,16 +1741,14 @@ static struct leaf *leaf_walk_rcu(struct tnode *p, struct node *c) /* Node empty, walk back up to parent */ c = (struct node *) p; - } while ( (p = node_parent_rcu(c)) != NULL); + } while ((p = node_parent_rcu(c)) != NULL); return NULL; /* Root of trie */ } static struct leaf *trie_firstleaf(struct trie *t) { - struct tnode *n = (struct tnode *) rcu_dereference_check(t->trie, - rcu_read_lock_held() || - lockdep_rtnl_is_held()); + struct tnode *n = (struct tnode *)rcu_dereference_rtnl(t->trie); if (!n) return NULL; @@ -2043,14 +2034,14 @@ struct fib_trie_iter { struct seq_net_private p; struct fib_table *tb; struct tnode *tnode; - unsigned index; - unsigned depth; + unsigned int index; + unsigned int depth; }; static struct node *fib_trie_get_next(struct fib_trie_iter *iter) { struct tnode *tn = iter->tnode; - unsigned cindex = iter->index; + unsigned int cindex = iter->index; struct tnode *p; /* A single entry routing table */ @@ -2159,7 +2150,7 @@ static void trie_collect_stats(struct trie *t, struct trie_stat *s) */ static void trie_show_stats(struct seq_file *seq, struct trie_stat *stat) { - unsigned i, max, pointers, bytes, avdepth; + unsigned int i, max, pointers, bytes, avdepth; if (stat->leaves) avdepth = stat->totdepth*100 / stat->leaves; @@ -2356,7 +2347,8 @@ static void fib_trie_seq_stop(struct seq_file *seq, void *v) static void seq_indent(struct seq_file *seq, int n) { - while (n-- > 0) seq_puts(seq, " "); + while (n-- > 0) + seq_puts(seq, " "); } static inline const char *rtn_scope(char *buf, size_t len, enum rt_scope_t s) @@ -2388,7 +2380,7 @@ static const char *const rtn_type_names[__RTN_MAX] = { [RTN_XRESOLVE] = "XRESOLVE", }; -static inline const char *rtn_type(char *buf, size_t len, unsigned t) +static inline const char *rtn_type(char *buf, size_t len, unsigned int t) { if (t < __RTN_MAX && rtn_type_names[t]) return rtn_type_names[t]; @@ -2544,13 +2536,12 @@ static void fib_route_seq_stop(struct seq_file *seq, void *v) rcu_read_unlock(); } -static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) +static unsigned int fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) { - static unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, [8] = RTF_REJECT, - }; - unsigned flags = type2flags[type]; + unsigned int flags = 0; + if (type == RTN_UNREACHABLE || type == RTN_PROHIBIT) + flags = RTF_REJECT; if (fi && fi->fib_nh->nh_gw) flags |= RTF_GATEWAY; if (mask == htonl(0xFFFFFFFF)) @@ -2562,7 +2553,7 @@ static unsigned fib_flag_trans(int type, __be32 mask, const struct fib_info *fi) /* * This outputs /proc/net/route. * The format of the file is not supposed to be changed - * and needs to be same as fib_hash output to avoid breaking + * and needs to be same as fib_hash output to avoid breaking * legacy utilities */ static int fib_route_seq_show(struct seq_file *seq, void *v) @@ -2587,7 +2578,7 @@ static int fib_route_seq_show(struct seq_file *seq, void *v) list_for_each_entry_rcu(fa, &li->falh, fa_list) { const struct fib_info *fi = fa->fa_info; - unsigned flags = fib_flag_trans(fa->fa_type, mask, fi); + unsigned int flags = fib_flag_trans(fa->fa_type, mask, fi); int len; if (fa->fa_type == RTN_BROADCAST -- cgit v1.1 From 9ca7f8762299bb391c11a81c844224216e925b5c Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 8 Sep 2010 09:16:28 +0000 Subject: pkt_sched: remov unnecessary bh_disable Now that est_tree_lock is acquired with BH protection, the other call is unnecessary. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/gen_estimator.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c index 6743146..7c23733 100644 --- a/net/core/gen_estimator.c +++ b/net/core/gen_estimator.c @@ -274,9 +274,9 @@ void gen_kill_estimator(struct gnet_stats_basic_packed *bstats, while ((e = gen_find_node(bstats, rate_est))) { rb_erase(&e->node, &est_root); - write_lock_bh(&est_lock); + write_lock(&est_lock); e->bstats = NULL; - write_unlock_bh(&est_lock); + write_unlock(&est_lock); list_del_rcu(&e->list); call_rcu(&e->e_rcu, __gen_kill_estimator); -- cgit v1.1 From 83b6b1f5d13414d0cb5c4f0a567a6aec0af073bd Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 10 Sep 2010 07:00:25 +0000 Subject: flow: better memory management Allocate hash tables for every online cpus, not every possible ones. NUMA aware allocations. Dont use a full page on arches where PAGE_SIZE > 1024*sizeof(void *) misc: __percpu , __read_mostly, __cpuinit annotations flow_compare_t is just an "unsigned long" Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/flow.c | 78 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 42 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/core/flow.c b/net/core/flow.c index f67dcbf..b143b86 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -53,8 +53,7 @@ struct flow_flush_info { struct flow_cache { u32 hash_shift; - unsigned long order; - struct flow_cache_percpu *percpu; + struct flow_cache_percpu __percpu *percpu; struct notifier_block hotcpu_notifier; int low_watermark; int high_watermark; @@ -64,7 +63,7 @@ struct flow_cache { atomic_t flow_cache_genid = ATOMIC_INIT(0); EXPORT_SYMBOL(flow_cache_genid); static struct flow_cache flow_cache_global; -static struct kmem_cache *flow_cachep; +static struct kmem_cache *flow_cachep __read_mostly; static DEFINE_SPINLOCK(flow_cache_gc_lock); static LIST_HEAD(flow_cache_gc_list); @@ -181,11 +180,7 @@ static u32 flow_hash_code(struct flow_cache *fc, & (flow_cache_hash_size(fc) - 1)); } -#if (BITS_PER_LONG == 64) -typedef u64 flow_compare_t; -#else -typedef u32 flow_compare_t; -#endif +typedef unsigned long flow_compare_t; /* I hear what you're saying, use memcmp. But memcmp cannot make * important assumptions that we can here, such as alignment and @@ -357,62 +352,73 @@ void flow_cache_flush(void) put_online_cpus(); } -static void __init flow_cache_cpu_prepare(struct flow_cache *fc, - struct flow_cache_percpu *fcp) +static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu) { - fcp->hash_table = (struct hlist_head *) - __get_free_pages(GFP_KERNEL|__GFP_ZERO, fc->order); - if (!fcp->hash_table) - panic("NET: failed to allocate flow cache order %lu\n", fc->order); + struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); + size_t sz = sizeof(struct hlist_head) * flow_cache_hash_size(fc); - fcp->hash_rnd_recalc = 1; - fcp->hash_count = 0; - tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); + if (!fcp->hash_table) { + fcp->hash_table = kzalloc_node(sz, GFP_KERNEL, cpu_to_node(cpu)); + if (!fcp->hash_table) { + pr_err("NET: failed to allocate flow cache sz %zu\n", sz); + return -ENOMEM; + } + fcp->hash_rnd_recalc = 1; + fcp->hash_count = 0; + tasklet_init(&fcp->flush_tasklet, flow_cache_flush_tasklet, 0); + } + return 0; } -static int flow_cache_cpu(struct notifier_block *nfb, +static int __cpuinit flow_cache_cpu(struct notifier_block *nfb, unsigned long action, void *hcpu) { struct flow_cache *fc = container_of(nfb, struct flow_cache, hotcpu_notifier); - int cpu = (unsigned long) hcpu; + int res, cpu = (unsigned long) hcpu; struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu); - if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) + switch (action) { + case CPU_UP_PREPARE: + case CPU_UP_PREPARE_FROZEN: + res = flow_cache_cpu_prepare(fc, cpu); + if (res) + return notifier_from_errno(res); + break; + case CPU_DEAD: + case CPU_DEAD_FROZEN: __flow_cache_shrink(fc, fcp, 0); + break; + } return NOTIFY_OK; } -static int flow_cache_init(struct flow_cache *fc) +static int __init flow_cache_init(struct flow_cache *fc) { - unsigned long order; int i; fc->hash_shift = 10; fc->low_watermark = 2 * flow_cache_hash_size(fc); fc->high_watermark = 4 * flow_cache_hash_size(fc); - for (order = 0; - (PAGE_SIZE << order) < - (sizeof(struct hlist_head)*flow_cache_hash_size(fc)); - order++) - /* NOTHING */; - fc->order = order; fc->percpu = alloc_percpu(struct flow_cache_percpu); + if (!fc->percpu) + return -ENOMEM; - setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, - (unsigned long) fc); - fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; - add_timer(&fc->rnd_timer); - - for_each_possible_cpu(i) - flow_cache_cpu_prepare(fc, per_cpu_ptr(fc->percpu, i)); - + for_each_online_cpu(i) { + if (flow_cache_cpu_prepare(fc, i)) + return -ENOMEM; + } fc->hotcpu_notifier = (struct notifier_block){ .notifier_call = flow_cache_cpu, }; register_hotcpu_notifier(&fc->hotcpu_notifier); + setup_timer(&fc->rnd_timer, flow_cache_new_hashrnd, + (unsigned long) fc); + fc->rnd_timer.expires = jiffies + FLOW_HASH_RND_PERIOD; + add_timer(&fc->rnd_timer); + return 0; } -- cgit v1.1 From 942623166df3256821e8451273bc07737745e3cb Mon Sep 17 00:00:00 2001 From: Nikitas Angelinas Date: Wed, 8 Sep 2010 22:29:53 +0100 Subject: net/wireless: use ARRAY_SIZE macro in radiotap.c Replace sizeof(rtap_namespace_sizes) / sizeof(rtap_namespace_sizes[0]) with ARRAY_SIZE(rtap_namespace_sizes) in net/wireless/radiotap.c Signed-off-by: Nikitas Angelinas Signed-off-by: John W. Linville --- net/wireless/radiotap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index 1332c44..c774bc0 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -14,6 +14,7 @@ * See COPYING for more details. */ +#include #include #include #include @@ -45,7 +46,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = { }; static const struct ieee80211_radiotap_namespace radiotap_ns = { - .n_bits = sizeof(rtap_namespace_sizes) / sizeof(rtap_namespace_sizes[0]), + .n_bits = ARRAY_SIZE(rtap_namespace_sizes), .align_size = rtap_namespace_sizes, }; -- cgit v1.1 From a1e567c83f541432e687142570215b75bebb1338 Mon Sep 17 00:00:00 2001 From: Bill Jordan Date: Fri, 10 Sep 2010 11:22:32 -0400 Subject: nl80211: Uninitialized variable There is a path in nl80211_set_wiphy where result is tested but uninitialized. I am hitting this path when I attempt: sh# iw dev wlan0 set channel 10 command failed: Unknown error 1069727332 (-1069727332) Signed-off-by: William Jordan Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 85a23de..1d6ef24 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -833,7 +833,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev; struct net_device *netdev = NULL; struct wireless_dev *wdev; - int result, rem_txq_params = 0; + int result = 0, rem_txq_params = 0; struct nlattr *nl_txq_params; u32 changed; u8 retry_short = 0, retry_long = 0; -- cgit v1.1 From 740c1aa3b01251c3c324743f395621749b099065 Mon Sep 17 00:00:00 2001 From: Steve deRosier Date: Sat, 11 Sep 2010 20:01:31 -0700 Subject: mac80211: Fix dangling pointer in ieee80211_xmit hdr pointer is left dangling after call to ieee80211_skb_resize. This can cause guards around mesh path selection to fail. Signed-off-by: Steve deRosier Signed-off-by: John W. Linville --- net/mac80211/tx.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index ccf3737..e1733dc 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1609,6 +1609,7 @@ static void ieee80211_xmit(struct ieee80211_sub_if_data *sdata, return; } + hdr = (struct ieee80211_hdr *) skb->data; info->control.vif = &sdata->vif; if (ieee80211_vif_is_mesh(&sdata->vif) && -- cgit v1.1 From edeb78a7fa838b7fb9c2043680bd8da7cb5cb0e5 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Mon, 13 Sep 2010 14:46:42 +0200 Subject: mac80211: wait for scan work complete before restarting hw This is needed to avoid warning in ieee80211_restart_hw about hardware scan in progress. Signed-off-by: Stanislaw Gruszka Acked-by: Wey-Yi W Guy Signed-off-by: John W. Linville --- net/mac80211/main.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index a06b6ee..7fb1148 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -305,6 +305,9 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) trace_api_restart_hw(local); + /* wait for scan work complete */ + flush_workqueue(local->workqueue); + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), "%s called with hardware scan in progress\n", __func__); -- cgit v1.1 From a2c1e3dad516618cb0fbfb1a62c36d0b0744573a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 14 Sep 2010 21:34:14 +0200 Subject: mac80211: match only assigned bss in sta_info_get_bss sta_info_get_bss() is used to match STA pointers for VLAN/AP interfaces, but if the same station is also added to multiple other interfaces it will erroneously match because both pointers are NULL, fix this by ignoring NULL pointers here. Reported-by: Ben Greear Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/sta_info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 687077e..e356ff8 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -125,7 +125,7 @@ struct sta_info *sta_info_get_bss(struct ieee80211_sub_if_data *sdata, lockdep_is_held(&local->sta_mtx)); while (sta) { if ((sta->sdata == sdata || - sta->sdata->bss == sdata->bss) && + (sta->sdata->bss && sta->sdata->bss == sdata->bss)) && memcmp(sta->sta.addr, addr, ETH_ALEN) == 0) break; sta = rcu_dereference_check(sta->hnext, -- cgit v1.1 From 55b1804c678e679e1018671bd40b583eab124689 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Mon, 13 Sep 2010 18:24:01 +0000 Subject: net/irda: Use static const char * const where possible Signed-off-by: Joe Perches Signed-off-by: David S. Miller --- net/irda/irlan/irlan_event.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/irlan/irlan_event.c b/net/irda/irlan/irlan_event.c index cbcb4eb..43f1604 100644 --- a/net/irda/irlan/irlan_event.c +++ b/net/irda/irlan/irlan_event.c @@ -24,7 +24,7 @@ #include -char *irlan_state[] = { +const char * const irlan_state[] = { "IRLAN_IDLE", "IRLAN_QUERY", "IRLAN_CONN", -- cgit v1.1 From 25aa4efe4feb4150fe613169795df505c5018f2b Mon Sep 17 00:00:00 2001 From: andrew hendry Date: Tue, 14 Sep 2010 13:31:16 +0000 Subject: X.25 remove bkl in listen Listen updates socket values and needs lock_sock. Signed-off-by: Andrew Hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 5e86d4e..f6a8f17 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -507,14 +507,14 @@ static int x25_listen(struct socket *sock, int backlog) struct sock *sk = sock->sk; int rc = -EOPNOTSUPP; - lock_kernel(); + lock_sock(sk); if (sk->sk_state != TCP_LISTEN) { memset(&x25_sk(sk)->dest_addr, 0, X25_ADDR_LEN); sk->sk_max_ack_backlog = backlog; sk->sk_state = TCP_LISTEN; rc = 0; } - unlock_kernel(); + release_sock(sk); return rc; } -- cgit v1.1 From 90c27297a9bfb8ea11c0e3f73ad90c4c66e8501e Mon Sep 17 00:00:00 2001 From: andrew hendry Date: Tue, 14 Sep 2010 13:31:38 +0000 Subject: X.25 remove bkl in bind Accept updates socket values in 3 lines so wrapped with lock_sock. Signed-off-by: Andrew Hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index f6a8f17..bd6fce3 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -688,7 +688,6 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_x25 *addr = (struct sockaddr_x25 *)uaddr; int len, i, rc = 0; - lock_kernel(); if (!sock_flag(sk, SOCK_ZAPPED) || addr_len != sizeof(struct sockaddr_x25) || addr->sx25_family != AF_X25) { @@ -704,12 +703,13 @@ static int x25_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) } } + lock_sock(sk); x25_sk(sk)->source_addr = addr->sx25_addr; x25_insert_socket(sk); sock_reset_flag(sk, SOCK_ZAPPED); + release_sock(sk); SOCK_DEBUG(sk, "x25_bind: socket is bound\n"); out: - unlock_kernel(); return rc; } -- cgit v1.1 From 141646ce56735cedb2336b3cd21364287f0aa4c7 Mon Sep 17 00:00:00 2001 From: Andrew Hendry Date: Tue, 14 Sep 2010 20:38:54 -0700 Subject: X.25 remove bkl in accept Accept already has socket locking. [ Extend socket locking over TCP_LISTEN state test. -DaveM ] Signed-off-by: Andrew Hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index bd6fce3..04321ee 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -869,8 +869,7 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags) struct sk_buff *skb; int rc = -EINVAL; - lock_kernel(); - if (!sk || sk->sk_state != TCP_LISTEN) + if (!sk) goto out; rc = -EOPNOTSUPP; @@ -878,6 +877,10 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags) goto out; lock_sock(sk); + rc = -EINVAL; + if (sk->sk_state != TCP_LISTEN) + goto out2; + rc = x25_wait_for_data(sk, sk->sk_rcvtimeo); if (rc) goto out2; @@ -897,7 +900,6 @@ static int x25_accept(struct socket *sock, struct socket *newsock, int flags) out2: release_sock(sk); out: - unlock_kernel(); return rc; } -- cgit v1.1 From 21a4591794c82c1a73f9d45d6400f878648261e3 Mon Sep 17 00:00:00 2001 From: andrew hendry Date: Tue, 14 Sep 2010 13:32:03 +0000 Subject: X.25 remove bkl in connect Connect already has socket locking. Signed-off-by: Andrew Hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 04321ee..c1bbf9e 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -751,7 +751,6 @@ static int x25_connect(struct socket *sock, struct sockaddr *uaddr, struct x25_route *rt; int rc = 0; - lock_kernel(); lock_sock(sk); if (sk->sk_state == TCP_ESTABLISHED && sock->state == SS_CONNECTING) { sock->state = SS_CONNECTED; @@ -829,7 +828,6 @@ out_put_route: x25_route_put(rt); out: release_sock(sk); - unlock_kernel(); return rc; } -- cgit v1.1 From 20cbd3e120a0c20bebe420e1fed0e816730bb988 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 14 Sep 2010 20:16:59 +0200 Subject: dccp ccid-3: A lower bound for the inter-packet scheduling algorithm This fixes a subtle bug in the calculation of the inter-packet gap and shows that t_delta, as it is currently used, is not needed. The algorithm from RFC 5348, 8.3 below continually computes a send time t_nom, which is initialised with the current time t_now; t_gran = 1E6 / HZ specifies the scheduling granularity, s the packet size, and X the sending rate: t_distance = t_nom - t_now; // in microseconds t_delta = min(t_ipi, t_gran) / 2; // `delta' parameter in microseconds if (t_distance >= t_delta) { reschedule after (t_distance / 1000) milliseconds; } else { t_ipi = s / X; // inter-packet interval in usec t_nom += t_ipi; // compute the next send time send packet now; } Problem: -------- Rescheduling requires a conversion into milliseconds (sk_reset_timer()). The highest jiffy resolution with HZ=1000 is 1 millisecond, so using a higher granularity does not make much sense here. As a consequence, values of t_distance < 1000 are truncated to 0. This issue has so far been resolved by using instead if (t_distance >= t_delta + 1000) reschedule after (t_distance / 1000) milliseconds; This is unnecessarily large, a lower bound is t_delta' = max(t_delta, 1000). And it implies a further simplification: a) when HZ >= 500, then t_delta <= t_gran/2 = 10^6/(2*HZ) <= 1000, so that t_delta' = MAX(1000, t_delta) = 1000 (constant value); b) when HZ < 500, then t_delta = 1/2*MIN(rtt, t_ipi, t_gran) <= t_gran/2, so that 1000 <= t_delta' <= t_gran/2. The maximum error of using a constant t_delta in (b) is less than half a jiffy. Fix: ---- The patch replaces t_delta with a constant, whose value depends on CONFIG_HZ, changing the above algorithm to: if (t_distance >= t_delta') reschedule after (t_distance / 1000) milliseconds; where t_delta' = 10^6/(2*HZ) if HZ < 500, and t_delta' = 1000 otherwise. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 19 ++++++++----------- net/dccp/ccids/ccid3.h | 18 +++++++++++++----- 2 files changed, 21 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 278e170..e9ca098 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -91,19 +91,16 @@ static inline u64 rfc3390_initial_rate(struct sock *sk) return scaled_div(w_init << 6, hc->tx_rtt); } -/* - * Recalculate t_ipi and delta (should be called whenever X changes) +/** + * ccid3_update_send_interval - Calculate new t_ipi = s / X_inst + * This respects the granularity of X_inst (64 * bytes/second). */ static void ccid3_update_send_interval(struct ccid3_hc_tx_sock *hc) { - /* Calculate new t_ipi = s / X_inst (X_inst is in 64 * bytes/second) */ hc->tx_t_ipi = scaled_div32(((u64)hc->tx_s) << 6, hc->tx_x); - /* Calculate new delta by delta = min(t_ipi / 2, t_gran / 2) */ - hc->tx_delta = min_t(u32, hc->tx_t_ipi / 2, TFRC_OPSYS_HALF_TIME_GRAN); - - ccid3_pr_debug("t_ipi=%u, delta=%u, s=%u, X=%u\n", hc->tx_t_ipi, - hc->tx_delta, hc->tx_s, (unsigned)(hc->tx_x >> 6)); + ccid3_pr_debug("t_ipi=%u, s=%u, X=%u\n", hc->tx_t_ipi, + hc->tx_s, (unsigned)(hc->tx_x >> 6)); } static u32 ccid3_hc_tx_idle_rtt(struct ccid3_hc_tx_sock *hc, ktime_t now) @@ -332,15 +329,15 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) delay = ktime_us_delta(hc->tx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* - * Scheduling of packet transmissions [RFC 3448, 4.6] + * Scheduling of packet transmissions (RFC 5348, 8.3) * * if (t_now > t_nom - delta) * // send the packet now * else * // send the packet in (t_nom - t_now) milliseconds. */ - if (delay - (s64)hc->tx_delta >= 1000) - return (u32)delay / 1000L; + if (delay >= TFRC_T_DELTA) + return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hc, now); break; diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index b7e569c..4a00174 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -45,12 +45,22 @@ /* Two seconds as per RFC 5348, 4.2 */ #define TFRC_INITIAL_TIMEOUT (2 * USEC_PER_SEC) -/* In usecs - half the scheduling granularity as per RFC3448 4.6 */ -#define TFRC_OPSYS_HALF_TIME_GRAN (USEC_PER_SEC / (2 * HZ)) - /* Parameter t_mbi from [RFC 3448, 4.3]: backoff interval in seconds */ #define TFRC_T_MBI 64 +/* + * The t_delta parameter (RFC 5348, 8.3): delays of less than %USEC_PER_MSEC are + * rounded down to 0, since sk_reset_timer() here uses millisecond granularity. + * Hence we can use a constant t_delta = %USEC_PER_MSEC when HZ >= 500. A coarse + * resolution of HZ < 500 means that the error is below one timer tick (t_gran) + * when using the constant t_delta = t_gran / 2 = %USEC_PER_SEC / (2 * HZ). + */ +#if (HZ >= 500) +# define TFRC_T_DELTA USEC_PER_MSEC +#else +# define TFRC_T_DELTA (USEC_PER_SEC / (2 * HZ)) +#endif + enum ccid3_options { TFRC_OPT_LOSS_EVENT_RATE = 192, TFRC_OPT_LOSS_INTERVALS = 193, @@ -90,7 +100,6 @@ enum ccid3_hc_tx_states { * @tx_no_feedback_timer: Handle to no feedback timer * @tx_t_ld: Time last doubled during slow start * @tx_t_nom: Nominal send time of next packet - * @tx_delta: Send timer delta (RFC 3448, 4.6) in usecs * @tx_hist: Packet history * @tx_options_received: Parsed set of retrieved options */ @@ -109,7 +118,6 @@ struct ccid3_hc_tx_sock { struct timer_list tx_no_feedback_timer; ktime_t tx_t_ld; ktime_t tx_t_nom; - u32 tx_delta; struct tfrc_tx_hist_entry *tx_hist; struct ccid3_options_received tx_options_received; }; -- cgit v1.1 From d2c726309d88df3c5568486e4b5b9e4c3150903f Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 14 Sep 2010 20:18:00 +0200 Subject: dccp ccid-3: remove buggy RTT-sampling history lookup This removes the RTT-sampling function tfrc_tx_hist_rtt(), since 1. it suffered from complex passing of return values (the return value both indicated successful lookup while the value doubled as RTT sample); 2. when for some odd reason the sample value equalled 0, this triggered a bug warning about "bogus Ack", due to the ambiguity of the return value; 3. on a passive host which has not sent anything the TX history is empty and thus will lead to unwanted "bogus Ack" warnings such as ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-28197148 ccid3_hc_tx_packet_recv: server(e7b7d518): DATAACK with bogus ACK-26641606. The fix is to replace the implicit encoding by performing the steps manually. Furthermore, the "bogus Ack" warning has been removed, since it can actually be triggered due to several reasons (network reordering, old packet, (3) above), hence it is not very useful. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 34 ++++++++++++++++++-------------- net/dccp/ccids/lib/packet_history.c | 39 ------------------------------------- net/dccp/ccids/lib/packet_history.h | 22 ++++++++++++++++++--- 3 files changed, 38 insertions(+), 57 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index e9ca098..b2ddd20 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -370,6 +370,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hc->tx_options_received; + struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; u32 pinv, r_sample; @@ -383,17 +384,24 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_state != TFRC_SSTATE_NO_FBACK) return; - now = ktime_get_real(); - - /* Estimate RTT from history if ACK number is valid */ - r_sample = tfrc_tx_hist_rtt(hc->tx_hist, - DCCP_SKB_CB(skb)->dccpd_ack_seq, now); - if (r_sample == 0) { - DCCP_WARN("%s(%p): %s with bogus ACK-%llu\n", dccp_role(sk), sk, - dccp_packet_name(DCCP_SKB_CB(skb)->dccpd_type), - (unsigned long long)DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* + * Locate the acknowledged packet in the TX history. + * + * Returning "entry not found" here can for instance happen when + * - the host has not sent out anything (e.g. a passive server), + * - the Ack is outdated (packet with higher Ack number was received), + * - it is a bogus Ack (for a packet not sent on this connection). + */ + acked = tfrc_tx_hist_find_entry(hc->tx_hist, dccp_hdr_ack_seq(skb)); + if (acked == NULL) return; - } + /* For the sake of RTT sampling, ignore/remove all older entries */ + tfrc_tx_hist_purge(&acked->next); + + /* Update the moving average for the RTT estimate (RFC 3448, 4.3) */ + now = ktime_get_real(); + r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); + hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); /* Update receive rate in units of 64 * bytes/second */ hc->tx_x_recv = opt_recv->ccid3or_receive_rate; @@ -405,11 +413,7 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) hc->tx_p = 0; else /* can not exceed 100% */ hc->tx_p = scaled_div(1, pinv); - /* - * Validate new RTT sample and update moving average - */ - r_sample = dccp_sample_rtt(sk, r_sample); - hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); + /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 3a4f414..de8fe29 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -38,18 +38,6 @@ #include "packet_history.h" #include "../../dccp.h" -/** - * tfrc_tx_hist_entry - Simple singly-linked TX history list - * @next: next oldest entry (LIFO order) - * @seqno: sequence number of this entry - * @stamp: send time of packet with sequence number @seqno - */ -struct tfrc_tx_hist_entry { - struct tfrc_tx_hist_entry *next; - u64 seqno; - ktime_t stamp; -}; - /* * Transmitter History Routines */ @@ -71,15 +59,6 @@ void tfrc_tx_packet_history_exit(void) } } -static struct tfrc_tx_hist_entry * - tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) -{ - while (head != NULL && head->seqno != seqno) - head = head->next; - - return head; -} - int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno) { struct tfrc_tx_hist_entry *entry = kmem_cache_alloc(tfrc_tx_hist_slab, gfp_any()); @@ -107,24 +86,6 @@ void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp) *headp = NULL; } -u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, const u64 seqno, - const ktime_t now) -{ - u32 rtt = 0; - struct tfrc_tx_hist_entry *packet = tfrc_tx_hist_find_entry(head, seqno); - - if (packet != NULL) { - rtt = ktime_us_delta(now, packet->stamp); - /* - * Garbage-collect older (irrelevant) entries: - */ - tfrc_tx_hist_purge(&packet->next); - } - - return rtt; -} - - /* * Receiver History Routines */ diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index 7df6c52..7ee4a9d 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -40,12 +40,28 @@ #include #include "tfrc.h" -struct tfrc_tx_hist_entry; +/** + * tfrc_tx_hist_entry - Simple singly-linked TX history list + * @next: next oldest entry (LIFO order) + * @seqno: sequence number of this entry + * @stamp: send time of packet with sequence number @seqno + */ +struct tfrc_tx_hist_entry { + struct tfrc_tx_hist_entry *next; + u64 seqno; + ktime_t stamp; +}; + +static inline struct tfrc_tx_hist_entry * + tfrc_tx_hist_find_entry(struct tfrc_tx_hist_entry *head, u64 seqno) +{ + while (head != NULL && head->seqno != seqno) + head = head->next; + return head; +} extern int tfrc_tx_hist_add(struct tfrc_tx_hist_entry **headp, u64 seqno); extern void tfrc_tx_hist_purge(struct tfrc_tx_hist_entry **headp); -extern u32 tfrc_tx_hist_rtt(struct tfrc_tx_hist_entry *head, - const u64 seqno, const ktime_t now); /* Subtraction a-b modulo-16, respects circular wrap-around */ #define SUB16(a, b) (((a) + 16 - (b)) & 0xF) -- cgit v1.1 From 37efb03fbd0935f5f85a0538c46b53be5cf40504 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Tue, 14 Sep 2010 20:21:29 +0200 Subject: dccp ccid-3: Simplify and consolidate tx_parse_options This simplifies and consolidates the TX option-parsing code: 1. The Loss Intervals option is not currently used, so dead code related to this option is removed. I am aware of no plans to support the option, but if someone wants to implement it (e.g. for inter-op tests), it is better to start afresh than having to also update currently unused code. 2. The Loss Event and Receive Rate options have a lot of code in common (both are 32 bit, both have same length etc.), so this is consolidated. 3. The test against GSR is not necessary, because - on first loading CCID3, ccid_new() zeroes out all fields in the socket; - ccid3_hc_tx_packet_recv() treats 0 and ~0U equivalently, due to pinv = opt_recv->ccid3or_loss_event_rate; if (pinv == ~0U || pinv == 0) hctx->p = 0; - as a result, the sequence number field is removed from opt_recv. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 57 +++++++++++++------------------------------------- net/dccp/ccids/ccid3.h | 3 --- 2 files changed, 14 insertions(+), 46 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index b2ddd20..ce80591 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -485,60 +485,31 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, unsigned char len, u16 idx, unsigned char *value) { - int rc = 0; - const struct dccp_sock *dp = dccp_sk(sk); struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; - if (opt_recv->ccid3or_seqno != dp->dccps_gsr) { - opt_recv->ccid3or_seqno = dp->dccps_gsr; - opt_recv->ccid3or_loss_event_rate = ~0; - opt_recv->ccid3or_loss_intervals_idx = 0; - opt_recv->ccid3or_loss_intervals_len = 0; - opt_recv->ccid3or_receive_rate = 0; - } - switch (option) { + case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: if (unlikely(len != 4)) { - DCCP_WARN("%s(%p), invalid len %d " - "for TFRC_OPT_LOSS_EVENT_RATE\n", - dccp_role(sk), sk, len); - rc = -EINVAL; - } else { - opt_val = get_unaligned((__be32 *)value); - opt_recv->ccid3or_loss_event_rate = ntohl(opt_val); - ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", - dccp_role(sk), sk, - opt_recv->ccid3or_loss_event_rate); + DCCP_WARN("%s(%p), invalid len %d for %u\n", + dccp_role(sk), sk, len, option); + return -EINVAL; } - break; - case TFRC_OPT_LOSS_INTERVALS: - opt_recv->ccid3or_loss_intervals_idx = idx; - opt_recv->ccid3or_loss_intervals_len = len; - ccid3_pr_debug("%s(%p), LOSS_INTERVALS=(%u, %u)\n", - dccp_role(sk), sk, - opt_recv->ccid3or_loss_intervals_idx, - opt_recv->ccid3or_loss_intervals_len); - break; - case TFRC_OPT_RECEIVE_RATE: - if (unlikely(len != 4)) { - DCCP_WARN("%s(%p), invalid len %d " - "for TFRC_OPT_RECEIVE_RATE\n", - dccp_role(sk), sk, len); - rc = -EINVAL; - } else { - opt_val = get_unaligned((__be32 *)value); - opt_recv->ccid3or_receive_rate = ntohl(opt_val); + opt_val = ntohl(get_unaligned((__be32 *)value)); + + if (option == TFRC_OPT_RECEIVE_RATE) { + opt_recv->ccid3or_receive_rate = opt_val; ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", - dccp_role(sk), sk, - opt_recv->ccid3or_receive_rate); + dccp_role(sk), sk, opt_val); + } else { + opt_recv->ccid3or_loss_event_rate = opt_val; + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", + dccp_role(sk), sk, opt_val); } - break; } - - return rc; + return 0; } static int ccid3_hc_tx_init(struct ccid *ccid, struct sock *sk) diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 4a00174..9eb90b8 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -68,9 +68,6 @@ enum ccid3_options { }; struct ccid3_options_received { - u64 ccid3or_seqno:48, - ccid3or_loss_intervals_idx:16; - u16 ccid3or_loss_intervals_len; u32 ccid3or_loss_event_rate; u32 ccid3or_receive_rate; }; -- cgit v1.1 From 0abee5260babe0a12d468b485a28336551697925 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Wed, 15 Sep 2010 19:24:50 +0200 Subject: netfilter: nf_nat: add nf_nat_csum() Add a static function nf_nat_csum() to replace the duplicate code in nf_nat_mangle_udp_packet() and __nf_nat_mangle_tcp_packet(). Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_helper.c | 76 ++++++++++++++++---------------------- 1 file changed, 31 insertions(+), 45 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_helper.c b/net/ipv4/netfilter/nf_nat_helper.c index 4a0c6b5..31427fb 100644 --- a/net/ipv4/netfilter/nf_nat_helper.c +++ b/net/ipv4/netfilter/nf_nat_helper.c @@ -153,6 +153,35 @@ void nf_nat_set_seq_adjust(struct nf_conn *ct, enum ip_conntrack_info ctinfo, } EXPORT_SYMBOL_GPL(nf_nat_set_seq_adjust); +static void nf_nat_csum(struct sk_buff *skb, struct iphdr *iph, void *data, + int datalen, __sum16 *check, int oldlen) +{ + struct rtable *rt = skb_rtable(skb); + + if (skb->ip_summed != CHECKSUM_PARTIAL) { + if (!(rt->rt_flags & RTCF_LOCAL) && + skb->dev->features & NETIF_F_V4_CSUM) { + skb->ip_summed = CHECKSUM_PARTIAL; + skb->csum_start = skb_headroom(skb) + + skb_network_offset(skb) + + iph->ihl * 4; + skb->csum_offset = (void *)check - data; + *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, iph->protocol, 0); + } else { + *check = 0; + *check = csum_tcpudp_magic(iph->saddr, iph->daddr, + datalen, iph->protocol, + csum_partial(data, datalen, + 0)); + if (iph->protocol == IPPROTO_UDP && !*check) + *check = CSUM_MANGLED_0; + } + } else + inet_proto_csum_replace2(check, skb, + htons(oldlen), htons(datalen), 1); +} + /* Generic function for mangling variable-length address changes inside * NATed TCP connections (like the PORT XXX,XXX,XXX,XXX,XXX,XXX * command in FTP). @@ -169,7 +198,6 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len, bool adjust) { - struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct tcphdr *tcph; int oldlen, datalen; @@ -192,26 +220,7 @@ int __nf_nat_mangle_tcp_packet(struct sk_buff *skb, match_offset, match_len, rep_buffer, rep_len); datalen = skb->len - iph->ihl*4; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - skb->dev->features & NETIF_F_V4_CSUM) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - iph->ihl * 4; - skb->csum_offset = offsetof(struct tcphdr, check); - tcph->check = ~tcp_v4_check(datalen, - iph->saddr, iph->daddr, 0); - } else { - tcph->check = 0; - tcph->check = tcp_v4_check(datalen, - iph->saddr, iph->daddr, - csum_partial(tcph, - datalen, 0)); - } - } else - inet_proto_csum_replace2(&tcph->check, skb, - htons(oldlen), htons(datalen), 1); + nf_nat_csum(skb, iph, tcph, datalen, &tcph->check, oldlen); if (adjust && rep_len != match_len) nf_nat_set_seq_adjust(ct, ctinfo, tcph->seq, @@ -240,7 +249,6 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, const char *rep_buffer, unsigned int rep_len) { - struct rtable *rt = skb_rtable(skb); struct iphdr *iph; struct udphdr *udph; int datalen, oldlen; @@ -274,29 +282,7 @@ nf_nat_mangle_udp_packet(struct sk_buff *skb, if (!udph->check && skb->ip_summed != CHECKSUM_PARTIAL) return 1; - if (skb->ip_summed != CHECKSUM_PARTIAL) { - if (!(rt->rt_flags & RTCF_LOCAL) && - skb->dev->features & NETIF_F_V4_CSUM) { - skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + - skb_network_offset(skb) + - iph->ihl * 4; - skb->csum_offset = offsetof(struct udphdr, check); - udph->check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - 0); - } else { - udph->check = 0; - udph->check = csum_tcpudp_magic(iph->saddr, iph->daddr, - datalen, IPPROTO_UDP, - csum_partial(udph, - datalen, 0)); - if (!udph->check) - udph->check = CSUM_MANGLED_0; - } - } else - inet_proto_csum_replace2(&udph->check, skb, - htons(oldlen), htons(datalen), 1); + nf_nat_csum(skb, iph, udph, datalen, &udph->check, oldlen); return 1; } -- cgit v1.1 From e0de7c93b950b9e784894efc4b529c6958cb747a Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Tue, 14 Sep 2010 09:13:08 +0000 Subject: ethtool: Remove unimplemented flow specification types struct ethtool_rawip4_spec and struct ethtool_ether_spec are neither commented nor used by any driver, so remove them. Adjust padding in the user-visible unions that included these structures. Fix references to struct ethtool_rawip4_spec in ethtool_get_rx_ntuple(), which should use struct ethtool_usrip4_spec. struct ethtool_usrip4_spec cannot hold IPv6 host addresses and there is no separate structure that can, so remove ETH_RX_NFC_IP6 and the reference to it in niu. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 970eb98..fcd62757 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -673,19 +673,19 @@ static int ethtool_get_rx_ntuple(struct net_device *dev, void __user *useraddr) break; case IP_USER_FLOW: sprintf(p, "\tSrc IP addr: 0x%x\n", - fsc->fs.h_u.raw_ip4_spec.ip4src); + fsc->fs.h_u.usr_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tSrc IP mask: 0x%x\n", - fsc->fs.m_u.raw_ip4_spec.ip4src); + fsc->fs.m_u.usr_ip4_spec.ip4src); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP addr: 0x%x\n", - fsc->fs.h_u.raw_ip4_spec.ip4dst); + fsc->fs.h_u.usr_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; sprintf(p, "\tDest IP mask: 0x%x\n", - fsc->fs.m_u.raw_ip4_spec.ip4dst); + fsc->fs.m_u.usr_ip4_spec.ip4dst); p += ETH_GSTRING_LEN; num_strings++; break; -- cgit v1.1 From b7285b7912776a4492744949c747c88d539006fa Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 11:07:24 +0000 Subject: ipip: get rid of ipip_lock As RTNL is held while doing tunnels inserts and deletes, we can remove ipip_lock spinlock. My initial RCU conversion was conservative and converted the rwlock to spinlock, with no RTNL requirement. Use appropriate rcu annotations and modern lockdep checks as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 67 +++++++++++++++++++++++++++++---------------------------- 1 file changed, 34 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 3c6f8f3..8de8888 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -122,11 +122,11 @@ static int ipip_net_id __read_mostly; struct ipip_net { - struct ip_tunnel *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel *tunnels_r[HASH_SIZE]; - struct ip_tunnel *tunnels_l[HASH_SIZE]; - struct ip_tunnel *tunnels_wc[1]; - struct ip_tunnel **tunnels[4]; + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; @@ -135,9 +135,8 @@ static void ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipip_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -145,8 +144,8 @@ static DEFINE_SPINLOCK(ipip_lock); static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; struct ipip_net *ipn = net_generic(net, ipip_net_id); @@ -169,12 +168,12 @@ static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, +static struct ip_tunnel __rcu **__ipip_bucket(struct ipip_net *ipn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -188,7 +187,7 @@ static struct ip_tunnel **__ipip_bucket(struct ipip_net *ipn, return &ipn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, +static inline struct ip_tunnel __rcu **ipip_bucket(struct ipip_net *ipn, struct ip_tunnel *t) { return __ipip_bucket(ipn, &t->parms); @@ -196,13 +195,14 @@ static inline struct ip_tunnel **ipip_bucket(struct ipip_net *ipn, static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip_bucket(ipn, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipip_lock); - *tp = t->next; - spin_unlock_bh(&ipip_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip_bucket(ipn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -210,12 +210,10 @@ static void ipip_tunnel_unlink(struct ipip_net *ipn, struct ip_tunnel *t) static void ipip_tunnel_link(struct ipip_net *ipn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip_bucket(ipn, t); + struct ip_tunnel __rcu **tp = ipip_bucket(ipn, t); - spin_lock_bh(&ipip_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipip_lock); } static struct ip_tunnel * ipip_tunnel_locate(struct net *net, @@ -223,12 +221,15 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct ipip_net *ipn = net_generic(net, ipip_net_id); - for (tp = __ipip_bucket(ipn, parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip_bucket(ipn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr) return t; } @@ -268,16 +269,15 @@ failed_free: return NULL; } +/* called with RTNL */ static void ipip_tunnel_uninit(struct net_device *dev) { struct net *net = dev_net(dev); struct ipip_net *ipn = net_generic(net, ipip_net_id); - if (dev == ipn->fb_tunnel_dev) { - spin_lock_bh(&ipip_lock); - ipn->tunnels_wc[0] = NULL; - spin_unlock_bh(&ipip_lock); - } else + if (dev == ipn->fb_tunnel_dev) + rcu_assign_pointer(ipn->tunnels_wc[0], NULL); + else ipip_tunnel_unlink(ipn, netdev_priv(dev)); dev_put(dev); } @@ -741,7 +741,7 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; dev_hold(dev); - ipn->tunnels_wc[0] = tunnel; + rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); } static struct xfrm_tunnel ipip_handler __read_mostly = { @@ -760,11 +760,12 @@ static void ipip_destroy_tunnels(struct ipip_net *ipn, struct list_head *head) for (prio = 1; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ipn->tunnels[prio][h]; + struct ip_tunnel *t; + t = rtnl_dereference(ipn->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } -- cgit v1.1 From 1507850b400492fdedc3064d3b8db5e9a1c871e3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 11:07:53 +0000 Subject: gre: get rid of ipgre_lock As RTNL is held while doing tunnels inserts and deletes, we can remove ipgre_lock spinlock. My initial RCU conversion was conservative and converted the rwlock to spinlock, with no RTNL requirement. Use appropriate rcu annotations and modern lockdep checks as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 61 +++++++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 8517689..fc20e68 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -129,7 +129,7 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev); static int ipgre_net_id __read_mostly; struct ipgre_net { - struct ip_tunnel *tunnels[4][HASH_SIZE]; + struct ip_tunnel __rcu *tunnels[4][HASH_SIZE]; struct net_device *fb_tunnel_dev; }; @@ -159,9 +159,8 @@ struct ipgre_net { #define tunnels_l tunnels[1] #define tunnels_wc tunnels[0] /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipgre_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -174,8 +173,8 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, { struct net *net = dev_net(dev); int link = dev->ifindex; - unsigned h0 = HASH(remote); - unsigned h1 = HASH(key); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(key); struct ip_tunnel *t, *cand = NULL; struct ipgre_net *ign = net_generic(net, ipgre_net_id); int dev_type = (gre_proto == htons(ETH_P_TEB)) ? @@ -290,13 +289,13 @@ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, return NULL; } -static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, +static struct ip_tunnel __rcu **__ipgre_bucket(struct ipgre_net *ign, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; __be32 key = parms->i_key; - unsigned h = HASH(key); + unsigned int h = HASH(key); int prio = 0; if (local) @@ -309,7 +308,7 @@ static struct ip_tunnel **__ipgre_bucket(struct ipgre_net *ign, return &ign->tunnels[prio][h]; } -static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, +static inline struct ip_tunnel __rcu **ipgre_bucket(struct ipgre_net *ign, struct ip_tunnel *t) { return __ipgre_bucket(ign, &t->parms); @@ -317,23 +316,22 @@ static inline struct ip_tunnel **ipgre_bucket(struct ipgre_net *ign, static void ipgre_tunnel_link(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipgre_bucket(ign, t); + struct ip_tunnel __rcu **tp = ipgre_bucket(ign, t); - spin_lock_bh(&ipgre_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipgre_lock); } static void ipgre_tunnel_unlink(struct ipgre_net *ign, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipgre_bucket(ign, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipgre_lock); - *tp = t->next; - spin_unlock_bh(&ipgre_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipgre_bucket(ign, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -347,10 +345,13 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, __be32 local = parms->iph.saddr; __be32 key = parms->i_key; int link = parms->link; - struct ip_tunnel *t, **tp; + struct ip_tunnel *t; + struct ip_tunnel __rcu **tp; struct ipgre_net *ign = net_generic(net, ipgre_net_id); - for (tp = __ipgre_bucket(ign, parms); (t = *tp) != NULL; tp = &t->next) + for (tp = __ipgre_bucket(ign, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && key == t->parms.i_key && @@ -361,7 +362,7 @@ static struct ip_tunnel *ipgre_tunnel_find(struct net *net, return t; } -static struct ip_tunnel * ipgre_tunnel_locate(struct net *net, +static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { struct ip_tunnel *t, *nt; @@ -669,7 +670,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev u8 tos; __be16 df; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int gre_hlen; @@ -1013,7 +1014,7 @@ ipgre_tunnel_ioctl (struct net_device *dev, struct ifreq *ifr, int cmd) break; } } else { - unsigned nflags = 0; + unsigned int nflags = 0; t = netdev_priv(dev); @@ -1126,7 +1127,7 @@ static int ipgre_tunnel_change_mtu(struct net_device *dev, int new_mtu) static int ipgre_header(struct sk_buff *skb, struct net_device *dev, unsigned short type, - const void *daddr, const void *saddr, unsigned len) + const void *daddr, const void *saddr, unsigned int len) { struct ip_tunnel *t = netdev_priv(dev); struct iphdr *iph = (struct iphdr *)skb_push(skb, t->hlen); @@ -1275,7 +1276,7 @@ static void ipgre_fb_tunnel_init(struct net_device *dev) tunnel->hlen = sizeof(struct iphdr) + 4; dev_hold(dev); - ign->tunnels_wc[0] = tunnel; + rcu_assign_pointer(ign->tunnels_wc[0], tunnel); } @@ -1291,11 +1292,13 @@ static void ipgre_destroy_tunnels(struct ipgre_net *ign, struct list_head *head) for (prio = 0; prio < 4; prio++) { int h; for (h = 0; h < HASH_SIZE; h++) { - struct ip_tunnel *t = ign->tunnels[prio][h]; + struct ip_tunnel *t; + + t = rtnl_dereference(ign->tunnels[prio][h]); while (t != NULL) { unregister_netdevice_queue(t->dev, head); - t = t->next; + t = rtnl_dereference(t->next); } } } @@ -1522,7 +1525,7 @@ static int ipgre_changelink(struct net_device *dev, struct nlattr *tb[], t = nt; if (dev->type != ARPHRD_ETHER) { - unsigned nflags = 0; + unsigned int nflags = 0; if (ipv4_is_multicast(p.iph.daddr)) nflags = IFF_BROADCAST; -- cgit v1.1 From 3a43be3c328ff42327da0b141de360dc4587aab7 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 11:35:10 +0000 Subject: sit: get rid of ipip6_lock As RTNL is held while doing tunnels inserts and deletes, we can remove ipip6_lock spinlock. My initial RCU conversion was conservative and converted the rwlock to spinlock, with no RTNL requirement. Use appropriate rcu annotations and modern lockdep checks as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 64 ++++++++++++++++++++++++++++------------------------------ 1 file changed, 31 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 86618eb..6822481 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -68,19 +68,18 @@ static void ipip6_tunnel_setup(struct net_device *dev); static int sit_net_id __read_mostly; struct sit_net { - struct ip_tunnel *tunnels_r_l[HASH_SIZE]; - struct ip_tunnel *tunnels_r[HASH_SIZE]; - struct ip_tunnel *tunnels_l[HASH_SIZE]; - struct ip_tunnel *tunnels_wc[1]; - struct ip_tunnel **tunnels[4]; + struct ip_tunnel __rcu *tunnels_r_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_r[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_l[HASH_SIZE]; + struct ip_tunnel __rcu *tunnels_wc[1]; + struct ip_tunnel __rcu **tunnels[4]; struct net_device *fb_tunnel_dev; }; /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ipip6_lock); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) @@ -91,8 +90,8 @@ static DEFINE_SPINLOCK(ipip6_lock); static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, struct net_device *dev, __be32 remote, __be32 local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip_tunnel *t; struct sit_net *sitn = net_generic(net, sit_net_id); @@ -121,12 +120,12 @@ static struct ip_tunnel * ipip6_tunnel_lookup(struct net *net, return NULL; } -static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn, +static struct ip_tunnel __rcu **__ipip6_bucket(struct sit_net *sitn, struct ip_tunnel_parm *parms) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - unsigned h = 0; + unsigned int h = 0; int prio = 0; if (remote) { @@ -140,7 +139,7 @@ static struct ip_tunnel **__ipip6_bucket(struct sit_net *sitn, return &sitn->tunnels[prio][h]; } -static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn, +static inline struct ip_tunnel __rcu **ipip6_bucket(struct sit_net *sitn, struct ip_tunnel *t) { return __ipip6_bucket(sitn, &t->parms); @@ -148,13 +147,14 @@ static inline struct ip_tunnel **ipip6_bucket(struct sit_net *sitn, static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) { - struct ip_tunnel **tp; - - for (tp = ipip6_bucket(sitn, t); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ipip6_lock); - *tp = t->next; - spin_unlock_bh(&ipip6_lock); + struct ip_tunnel __rcu **tp; + struct ip_tunnel *iter; + + for (tp = ipip6_bucket(sitn, t); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -162,12 +162,10 @@ static void ipip6_tunnel_unlink(struct sit_net *sitn, struct ip_tunnel *t) static void ipip6_tunnel_link(struct sit_net *sitn, struct ip_tunnel *t) { - struct ip_tunnel **tp = ipip6_bucket(sitn, t); + struct ip_tunnel __rcu **tp = ipip6_bucket(sitn, t); - spin_lock_bh(&ipip6_lock); - t->next = *tp; + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ipip6_lock); } static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) @@ -187,17 +185,20 @@ static void ipip6_tunnel_clone_6rd(struct net_device *dev, struct sit_net *sitn) #endif } -static struct ip_tunnel * ipip6_tunnel_locate(struct net *net, +static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, struct ip_tunnel_parm *parms, int create) { __be32 remote = parms->iph.daddr; __be32 local = parms->iph.saddr; - struct ip_tunnel *t, **tp, *nt; + struct ip_tunnel *t, *nt; + struct ip_tunnel __rcu **tp; struct net_device *dev; char name[IFNAMSIZ]; struct sit_net *sitn = net_generic(net, sit_net_id); - for (tp = __ipip6_bucket(sitn, parms); (t = *tp) != NULL; tp = &t->next) { + for (tp = __ipip6_bucket(sitn, parms); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (local == t->parms.iph.saddr && remote == t->parms.iph.daddr && parms->link == t->parms.link) { @@ -340,7 +341,7 @@ ipip6_tunnel_add_prl(struct ip_tunnel *t, struct ip_tunnel_prl *a, int chg) ASSERT_RTNL(); - for (p = t->prl; p; p = p->next) { + for (p = rtnl_dereference(t->prl); p; p = rtnl_dereference(p->next)) { if (p->addr == a->addr) { if (chg) { p->flags = a->flags; @@ -451,15 +452,12 @@ static void ipip6_tunnel_uninit(struct net_device *dev) struct sit_net *sitn = net_generic(net, sit_net_id); if (dev == sitn->fb_tunnel_dev) { - spin_lock_bh(&ipip6_lock); - sitn->tunnels_wc[0] = NULL; - spin_unlock_bh(&ipip6_lock); - dev_put(dev); + rcu_assign_pointer(sitn->tunnels_wc[0], NULL); } else { ipip6_tunnel_unlink(sitn, netdev_priv(dev)); ipip6_tunnel_del_prl(netdev_priv(dev), NULL); - dev_put(dev); } + dev_put(dev); } @@ -590,7 +588,7 @@ __be32 try_6rd(struct in6_addr *v6dst, struct ip_tunnel *tunnel) #ifdef CONFIG_IPV6_SIT_6RD if (ipv6_prefix_equal(v6dst, &tunnel->ip6rd.prefix, tunnel->ip6rd.prefixlen)) { - unsigned pbw0, pbi0; + unsigned int pbw0, pbi0; int pbi1; u32 d; -- cgit v1.1 From 0a5f1d476aef43916abb2b8634ebff23ef2c05f6 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Sep 2010 10:22:36 +0000 Subject: irda/irnet: use noop_llseek There may be applications trying to seek on the irnet character device, so we should use noop_llseek to avoid returning an error when the default llseek changes to no_llseek. Signed-off-by: Arnd Bergmann Cc: Samuel Ortiz Cc: netdev@vger.kernel.org Signed-off-by: David S. Miller --- net/irda/irnet/irnet_ppp.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/irnet/irnet_ppp.h b/net/irda/irnet/irnet_ppp.h index b5df241..9402258 100644 --- a/net/irda/irnet/irnet_ppp.h +++ b/net/irda/irnet/irnet_ppp.h @@ -103,7 +103,8 @@ static const struct file_operations irnet_device_fops = .poll = dev_irnet_poll, .unlocked_ioctl = dev_irnet_ioctl, .open = dev_irnet_open, - .release = dev_irnet_close + .release = dev_irnet_close, + .llseek = noop_llseek, /* Also : llseek, readdir, mmap, flush, fsync, fasync, lock, readv, writev */ }; -- cgit v1.1 From 6482f554e2b9cbe733d63124765104f29cf0c9ad Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 15 Sep 2010 12:19:53 +0000 Subject: Phonet: remove dangling pipe if an endpoint is closed early MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closing a pipe endpoint is not normally allowed by the Phonet pipe, other than as a side after-effect of removing the pipe between two endpoints. But there is no way to prevent Linux userspace processes from being killed or suffering from bugs, so this can still happen. We might as well forcefully close Phonet pipe endpoints then. The cellular modem supports only a few existing pipes at a time. So we really should not leak them. This change instructs the modem to destroy the pipe if either of the pipe's endpoint (Linux socket) is closed too early. Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pep.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 04e3419..d0e7eb2 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -620,6 +620,28 @@ drop: return err; } +static int pipe_do_remove(struct sock *sk) +{ + struct pep_sock *pn = pep_sk(sk); + struct pnpipehdr *ph; + struct sk_buff *skb; + + skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + skb_reserve(skb, MAX_PNPIPE_HEADER); + __skb_push(skb, sizeof(*ph)); + skb_reset_transport_header(skb); + ph = pnp_hdr(skb); + ph->utid = 0; + ph->message_id = PNS_PIPE_REMOVE_REQ; + ph->pipe_handle = pn->pipe_handle; + ph->data[0] = PAD; + + return pn_skb_send(sk, skb, &pipe_srv); +} + /* associated socket ceases to exist */ static void pep_sock_close(struct sock *sk, long timeout) { @@ -638,7 +660,10 @@ static void pep_sock_close(struct sock *sk, long timeout) sk_for_each_safe(sknode, p, n, &pn->ackq) sk_del_node_init(sknode); sk->sk_state = TCP_CLOSE; - } + } else if ((1 << sk->sk_state) & (TCPF_SYN_RECV|TCPF_ESTABLISHED)) + /* Forcefully remove dangling Phonet pipe */ + pipe_do_remove(sk); + ifindex = pn->ifindex; pn->ifindex = 0; release_sock(sk); -- cgit v1.1 From 4e3d16ce5e82648d7f4dfd28b6cf8fe2e9a9efc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 15 Sep 2010 12:30:11 +0000 Subject: Phonet: resource routing backend MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When both destination device and object are nul, Phonet routes the packet according to the resource field. In fact, this is the most common pattern when sending Phonet "request" packets. In this case, the packet is delivered to whichever endpoint (socket) has registered the resource. This adds a new table so that Linux processes can register their Phonet sockets to Phonet resources, if they have adequate privileges. (Namespace support is not implemented at the moment.) Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/socket.c | 88 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) (limited to 'net') diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 7c91f73..4c29a23 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -565,3 +565,91 @@ const struct file_operations pn_sock_seq_fops = { .release = seq_release_net, }; #endif + +static struct { + struct sock *sk[256]; +} pnres; + +/* + * Find and hold socket based on resource. + */ +struct sock *pn_find_sock_by_res(struct net *net, u8 res) +{ + struct sock *sk; + + if (!net_eq(net, &init_net)) + return NULL; + + rcu_read_lock(); + sk = rcu_dereference(pnres.sk[res]); + if (sk) + sock_hold(sk); + rcu_read_unlock(); + return sk; +} + +static DEFINE_MUTEX(resource_mutex); + +int pn_sock_bind_res(struct sock *sk, u8 res) +{ + int ret = -EADDRINUSE; + + if (!net_eq(sock_net(sk), &init_net)) + return -ENOIOCTLCMD; + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + if (pn_socket_autobind(sk->sk_socket)) + return -EAGAIN; + + mutex_lock(&resource_mutex); + if (pnres.sk[res] == NULL) { + sock_hold(sk); + rcu_assign_pointer(pnres.sk[res], sk); + ret = 0; + } + mutex_unlock(&resource_mutex); + return ret; +} + +int pn_sock_unbind_res(struct sock *sk, u8 res) +{ + int ret = -ENOENT; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + mutex_lock(&resource_mutex); + if (pnres.sk[res] == sk) { + rcu_assign_pointer(pnres.sk[res], NULL); + ret = 0; + } + mutex_unlock(&resource_mutex); + + if (ret == 0) { + synchronize_rcu(); + sock_put(sk); + } + return ret; +} + +void pn_sock_unbind_all_res(struct sock *sk) +{ + unsigned res, match = 0; + + mutex_lock(&resource_mutex); + for (res = 0; res < 256; res++) { + if (pnres.sk[res] == sk) { + rcu_assign_pointer(pnres.sk[res], NULL); + match++; + } + } + mutex_unlock(&resource_mutex); + + if (match == 0) + return; + synchronize_rcu(); + while (match > 0) { + sock_put(sk); + match--; + } +} -- cgit v1.1 From 7417fa83c1a8b75a03bd9b9b358999f38e771eab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 15 Sep 2010 12:30:12 +0000 Subject: Phonet: hook resource routing to userspace via ioctl()'s MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I wish we could use something cleaner, such as bind(). But that would not work since resource subscription is orthogonal/in addition to the normal object ID allocated via bind(). This is similar to multicasting which also uses ioctl()'s. Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/datagram.c | 13 +++++++++++++ net/phonet/socket.c | 1 + 2 files changed, 14 insertions(+) (limited to 'net') diff --git a/net/phonet/datagram.c b/net/phonet/datagram.c index 1bd38db..2f03238 100644 --- a/net/phonet/datagram.c +++ b/net/phonet/datagram.c @@ -52,6 +52,19 @@ static int pn_ioctl(struct sock *sk, int cmd, unsigned long arg) answ = skb ? skb->len : 0; release_sock(sk); return put_user(answ, (int __user *)arg); + + case SIOCPNADDRESOURCE: + case SIOCPNDELRESOURCE: { + u32 res; + if (get_user(res, (u32 __user *)arg)) + return -EFAULT; + if (res >= 256) + return -EINVAL; + if (cmd == SIOCPNADDRESOURCE) + return pn_sock_bind_res(sk, res); + else + return pn_sock_unbind_res(sk, res); + } } return -ENOIOCTLCMD; diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 4c29a23..d4f41af 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -158,6 +158,7 @@ void pn_sock_unhash(struct sock *sk) spin_lock_bh(&pnsocks.lock); sk_del_node_init(sk); spin_unlock_bh(&pnsocks.lock); + pn_sock_unbind_all_res(sk); } EXPORT_SYMBOL(pn_sock_unhash); -- cgit v1.1 From b6a563b2af4ec5b0363cf89869ba234c0e2cd3da Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 15 Sep 2010 12:30:13 +0000 Subject: Phonet: look up the resource routing table when forwarding MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/af_phonet.c | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) (limited to 'net') diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c index 73aee7f..fd95beb 100644 --- a/net/phonet/af_phonet.c +++ b/net/phonet/af_phonet.c @@ -251,6 +251,16 @@ int pn_skb_send(struct sock *sk, struct sk_buff *skb, else if (phonet_address_lookup(net, daddr) == 0) { dev = phonet_device_get(net); skb->pkt_type = PACKET_LOOPBACK; + } else if (pn_sockaddr_get_object(target) == 0) { + /* Resource routing (small race until phonet_rcv()) */ + struct sock *sk = pn_find_sock_by_res(net, + target->spn_resource); + if (sk) { + sock_put(sk); + dev = phonet_device_get(net); + skb->pkt_type = PACKET_LOOPBACK; + } else + dev = phonet_route_output(net, daddr); } else dev = phonet_route_output(net, daddr); @@ -383,6 +393,13 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev, goto out; } + /* resource routing */ + if (pn_sockaddr_get_object(&sa) == 0) { + struct sock *sk = pn_find_sock_by_res(net, sa.spn_resource); + if (sk) + return sk_receive_skb(sk, skb, 0); + } + /* check if we are the destination */ if (phonet_address_lookup(net, pn_sockaddr_get_addr(&sa)) == 0) { /* Phonet packet input */ -- cgit v1.1 From 507215f8d04f9e61f38c975e61d93bcafd30815f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 15 Sep 2010 12:30:14 +0000 Subject: Phonet: list subscribed resources via proc_fs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pn_dev.c | 2 ++ net/phonet/socket.c | 96 +++++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 98 insertions(+) (limited to 'net') diff --git a/net/phonet/pn_dev.c b/net/phonet/pn_dev.c index d0a4294..947038d 100644 --- a/net/phonet/pn_dev.c +++ b/net/phonet/pn_dev.c @@ -373,6 +373,7 @@ int __init phonet_device_init(void) if (err) return err; + proc_net_fops_create(&init_net, "pnresource", 0, &pn_res_seq_fops); register_netdevice_notifier(&phonet_device_notifier); err = phonet_netlink_register(); if (err) @@ -385,6 +386,7 @@ void phonet_device_exit(void) rtnl_unregister_all(PF_PHONET); unregister_netdevice_notifier(&phonet_device_notifier); unregister_pernet_device(&phonet_net_ops); + proc_net_remove(&init_net, "pnresource"); } int phonet_route_add(struct net_device *dev, u8 daddr) diff --git a/net/phonet/socket.c b/net/phonet/socket.c index d4f41af..6bf6e3c 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -654,3 +654,99 @@ void pn_sock_unbind_all_res(struct sock *sk) match--; } } + +#ifdef CONFIG_PROC_FS +static struct sock **pn_res_get_idx(struct seq_file *seq, loff_t pos) +{ + struct net *net = seq_file_net(seq); + unsigned i; + + if (!net_eq(net, &init_net)) + return NULL; + + for (i = 0; i < 256; i++) { + if (pnres.sk[i] == NULL) + continue; + if (!pos) + return pnres.sk + i; + pos--; + } + return NULL; +} + +static struct sock **pn_res_get_next(struct seq_file *seq, struct sock **sk) +{ + struct net *net = seq_file_net(seq); + unsigned i; + + BUG_ON(!net_eq(net, &init_net)); + + for (i = (sk - pnres.sk) + 1; i < 256; i++) + if (pnres.sk[i]) + return pnres.sk + i; + return NULL; +} + +static void *pn_res_seq_start(struct seq_file *seq, loff_t *pos) + __acquires(resource_mutex) +{ + mutex_lock(&resource_mutex); + return *pos ? pn_res_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; +} + +static void *pn_res_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct sock **sk; + + if (v == SEQ_START_TOKEN) + sk = pn_res_get_idx(seq, 0); + else + sk = pn_res_get_next(seq, v); + (*pos)++; + return sk; +} + +static void pn_res_seq_stop(struct seq_file *seq, void *v) + __releases(resource_mutex) +{ + mutex_unlock(&resource_mutex); +} + +static int pn_res_seq_show(struct seq_file *seq, void *v) +{ + int len; + + if (v == SEQ_START_TOKEN) + seq_printf(seq, "%s%n", "rs uid inode", &len); + else { + struct sock **psk = v; + struct sock *sk = *psk; + + seq_printf(seq, "%02X %5d %lu%n", + psk - pnres.sk, sock_i_uid(sk), sock_i_ino(sk), &len); + } + seq_printf(seq, "%*s\n", 63 - len, ""); + return 0; +} + +static const struct seq_operations pn_res_seq_ops = { + .start = pn_res_seq_start, + .next = pn_res_seq_next, + .stop = pn_res_seq_stop, + .show = pn_res_seq_show, +}; + +static int pn_res_open(struct inode *inode, struct file *file) +{ + return seq_open_net(inode, file, &pn_res_seq_ops, + sizeof(struct seq_net_private)); +} + +const struct file_operations pn_res_seq_fops = { + .owner = THIS_MODULE, + .open = pn_res_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release_net, +}; +#endif -- cgit v1.1 From 9e0064a5456fd75fd7c70f6f3692c7f732f91a65 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 15 Sep 2010 21:34:41 -0700 Subject: phonet: Fix build warning. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/phonet/socket.c: In function ‘pn_res_seq_show’: net/phonet/socket.c:726: warning: format ‘%02X’ expects type ‘unsigned int’, but argument 3 has type ‘long int’ Signed-off-by: David S. Miller --- net/phonet/socket.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/socket.c b/net/phonet/socket.c index 6bf6e3c..aca8fba 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -723,7 +723,8 @@ static int pn_res_seq_show(struct seq_file *seq, void *v) struct sock *sk = *psk; seq_printf(seq, "%02X %5d %lu%n", - psk - pnres.sk, sock_i_uid(sk), sock_i_ino(sk), &len); + (int) (psk - pnres.sk), sock_i_uid(sk), + sock_i_ino(sk), &len); } seq_printf(seq, "%*s\n", 63 - len, ""); return 0; -- cgit v1.1 From 95ae6b228f814fc0528d0506ee9f18ac333d6851 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 04:04:31 +0000 Subject: ipv4: ip_ptr cleanups dev->ip_ptr is protected by rtnl and rcu. Yet some places dont use appropriate primitives and/or locking rules. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- net/ipv4/devinet.c | 4 ++-- net/ipv4/ipmr.c | 2 +- net/mac80211/main.c | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fc2dc93..5bdce97 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5286,7 +5286,7 @@ void netdev_run_todo(void) /* paranoia */ BUG_ON(atomic_read(&dev->refcnt)); - WARN_ON(dev->ip_ptr); + WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index da14c49..c2ff48f 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -209,7 +209,7 @@ static void inetdev_destroy(struct in_device *in_dev) inet_free_ifa(ifa); } - dev->ip_ptr = NULL; + rcu_assign_pointer(dev->ip_ptr, NULL); devinet_sysctl_unregister(in_dev); neigh_parms_release(&arp_tbl, in_dev->arp_parms); @@ -1059,7 +1059,7 @@ static int inetdev_event(struct notifier_block *this, unsigned long event, switch (event) { case NETDEV_REGISTER: printk(KERN_DEBUG "inetdev_event: bug\n"); - dev->ip_ptr = NULL; + rcu_assign_pointer(dev->ip_ptr, NULL); break; case NETDEV_UP: if (!inetdev_valid_mtu(dev->mtu)) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 179fcab..10b24c02 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -724,7 +724,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, case 0: if (vifc->vifc_flags == VIFF_USE_IFINDEX) { dev = dev_get_by_index(net, vifc->vifc_lcl_ifindex); - if (dev && dev->ip_ptr == NULL) { + if (dev && __in_dev_get_rtnl(dev) == NULL) { dev_put(dev); return -EADDRNOTAVAIL; } diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 4935b84..b8cf282 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -362,7 +362,7 @@ static int ieee80211_ifa_changed(struct notifier_block *nb, if (sdata->vif.type != NL80211_IFTYPE_STATION) return NOTIFY_DONE; - idev = sdata->dev->ip_ptr; + idev = __in_dev_get_rtnl(sdata->dev); if (!idev) return NOTIFY_DONE; -- cgit v1.1 From 16c3ea785fe4a383c6675dfe7316f3c815755bdd Mon Sep 17 00:00:00 2001 From: Brandon Philips Date: Wed, 15 Sep 2010 09:24:24 +0000 Subject: net: enable GRO by default for vlan devices Currently vlan devices don't have GRO by default as none of the Ethernet drivers add NETIF_F_GRO to their vlan_features. As GRO is a software feature add GRO to dev->vlan_features in register_netdevice() and let vlan_dev_init() take care that it gets enabled only when dev->features has NETIF_F_GRO too. Signed-off-by: Brandon Philips Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 5bdce97..09b3742 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5057,6 +5057,11 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; + /* Enable GRO for vlans by default if dev->features has GRO also. + * vlan_dev_init() will do the dev->features check. + */ + dev->vlan_features |= NETIF_F_GRO; + ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); if (ret) -- cgit v1.1 From 70a851667dbd7aa3f7be9609d6580ca9861230b3 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 16 Sep 2010 19:37:30 +0200 Subject: netfilter: use NFPROTO_IPV4 instead of AF_INET The field family of xt_target should be NFPROTO_IPV4, though NFPROTO_IPV4 and AF_INET are the same. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/xt_TPROXY.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index c61294d..21bb2af 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -76,7 +76,7 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par) static struct xt_target tproxy_tg_reg __read_mostly = { .name = "TPROXY", - .family = AF_INET, + .family = NFPROTO_IPV4, .table = "mangle", .target = tproxy_tg, .targetsize = sizeof(struct xt_tproxy_target_info), -- cgit v1.1 From 99ad3c53b36a056a472927de9c79eda231ecc6fe Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 16 Sep 2010 19:45:19 +0200 Subject: netfilter: nf_nat_core: don't check if the tuple is used if there is no other choice Eliminate nf_nat_used_tuple() to save some CPU cycles when there is no other choice. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_core.c | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 8c8632d..2c084b3 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -262,11 +262,17 @@ get_unique_tuple(struct nf_conntrack_tuple *tuple, proto = __nf_nat_proto_find(orig_tuple->dst.protonum); /* Only bother mapping if it's not already in range and unique */ - if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM) && - (!(range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) || - proto->in_range(tuple, maniptype, &range->min, &range->max)) && - !nf_nat_used_tuple(tuple, ct)) - goto out; + if (!(range->flags & IP_NAT_RANGE_PROTO_RANDOM)) { + if (range->flags & IP_NAT_RANGE_PROTO_SPECIFIED) { + if (proto->in_range(tuple, maniptype, &range->min, + &range->max) && + (range->min.all == range->max.all || + !nf_nat_used_tuple(tuple, ct))) + goto out; + } else if (!nf_nat_used_tuple(tuple, ct)) { + goto out; + } + } /* Last change: get protocol to try to obtain unique tuple. */ proto->unique_tuple(tuple, range, maniptype, ct); -- cgit v1.1 From ed0b6d7581b54455062f09ccac123814e70cd02f Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 16 Sep 2010 19:47:51 +0200 Subject: netfilter: nf_nat: no IP_NAT_RANGE_MAP_IPS flags when alloc_null_binding() When alloc_null_binding(), no IP_NAT_RNAGE_MAP_IPS in flags means no IP address translation is needed. It isn't necessary to specify the address explicitly. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_rule.c | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_rule.c b/net/ipv4/netfilter/nf_nat_rule.c index ebbd319..21c3042 100644 --- a/net/ipv4/netfilter/nf_nat_rule.c +++ b/net/ipv4/netfilter/nf_nat_rule.c @@ -106,16 +106,15 @@ alloc_null_binding(struct nf_conn *ct, unsigned int hooknum) { /* Force range to this IP; let proto decide mapping for per-proto parts (hence not IP_NAT_RANGE_PROTO_SPECIFIED). - Use reply in case it's already been mangled (eg local packet). */ - __be32 ip - = (HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC - ? ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip - : ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); - struct nf_nat_range range - = { IP_NAT_RANGE_MAP_IPS, ip, ip, { 0 }, { 0 } }; - - pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, &ip); + struct nf_nat_range range; + + range.flags = 0; + pr_debug("Allocating NULL binding for %p (%pI4)\n", ct, + HOOK2MANIP(hooknum) == IP_NAT_MANIP_SRC ? + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.dst.u3.ip : + &ct->tuplehash[IP_CT_DIR_REPLY].tuple.src.u3.ip); + return nf_nat_setup_info(ct, &range, HOOK2MANIP(hooknum)); } -- cgit v1.1 From b23909695c33f53df5f1d16696b1aa5b874c1904 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 16 Sep 2010 19:55:03 +0200 Subject: netfilter: nf_conntrack: fix the hash random initializing race nf_conntrack_alloc() isn't called with nf_conntrack_lock locked, so hash random initializing code maybe executed more than once on different CPUs. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index df3eedb..4c0ad9b 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -65,8 +65,7 @@ EXPORT_SYMBOL_GPL(nf_conntrack_max); DEFINE_PER_CPU(struct nf_conn, nf_conntrack_untracked); EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); -static int nf_conntrack_hash_rnd_initted; -static unsigned int nf_conntrack_hash_rnd; +static unsigned int nf_conntrack_hash_rnd __read_mostly; static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, u16 zone, unsigned int size, unsigned int rnd) @@ -574,10 +573,18 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, { struct nf_conn *ct; - if (unlikely(!nf_conntrack_hash_rnd_initted)) { - get_random_bytes(&nf_conntrack_hash_rnd, - sizeof(nf_conntrack_hash_rnd)); - nf_conntrack_hash_rnd_initted = 1; + if (unlikely(!nf_conntrack_hash_rnd)) { + unsigned int rand; + + /* + * Why not initialize nf_conntrack_rnd in a "init()" function ? + * Because there isn't enough entropy when system initializing, + * and we initialize it as late as possible. + */ + do { + get_random_bytes(&rand, sizeof(rand)); + } while (!rand); + cmpxchg(&nf_conntrack_hash_rnd, 0, rand); } /* We don't want any race condition at early drop stage */ -- cgit v1.1 From 9c376639297d3dd82d40e54c9cdca8da9dfc22f1 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Fri, 20 Aug 2010 15:13:59 -0700 Subject: include/net/cfg80211.h: wiphy_ messages use dev_printk The output becomes: [ 41.261941] ieee80211 phy0: Selected rate control algorithm 'minstrel_ht' Signed-off-by: Joe Perches Signed-off-by: John W. Linville --- net/wireless/core.c | 49 ------------------------------------------------- 1 file changed, 49 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index d52630b..b8191cf 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -912,52 +912,3 @@ static void __exit cfg80211_exit(void) destroy_workqueue(cfg80211_wq); } module_exit(cfg80211_exit); - -static int ___wiphy_printk(const char *level, const struct wiphy *wiphy, - struct va_format *vaf) -{ - if (!wiphy) - return printk("%s(NULL wiphy *): %pV", level, vaf); - - return printk("%s%s: %pV", level, wiphy_name(wiphy), vaf); -} - -int __wiphy_printk(const char *level, const struct wiphy *wiphy, - const char *fmt, ...) -{ - struct va_format vaf; - va_list args; - int r; - - va_start(args, fmt); - - vaf.fmt = fmt; - vaf.va = &args; - - r = ___wiphy_printk(level, wiphy, &vaf); - va_end(args); - - return r; -} -EXPORT_SYMBOL(__wiphy_printk); - -#define define_wiphy_printk_level(func, kern_level) \ -int func(const struct wiphy *wiphy, const char *fmt, ...) \ -{ \ - struct va_format vaf; \ - va_list args; \ - int r; \ - \ - va_start(args, fmt); \ - \ - vaf.fmt = fmt; \ - vaf.va = &args; \ - \ - r = ___wiphy_printk(kern_level, wiphy, &vaf); \ - va_end(args); \ - \ - return r; \ -} \ -EXPORT_SYMBOL(func); - -define_wiphy_printk_level(wiphy_debug, KERN_DEBUG); -- cgit v1.1 From f5521b13880f4f4f612e1d20dd4f565122d16e04 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 14 Sep 2010 22:06:53 +0200 Subject: mac80211: use correct station flags lock This code is modifying the station flags, and as such should hold the flags lock so it can do so atomically vs. other flags modifications and readers. This issue was introduced when this code was added in eccb8e8f, as it used the wrong lock (thus not fixing the race that was previously documented in a comment.) Cc: stable@kernel.org [2.6.31+] Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 5de1ca3..171e8ff 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -577,6 +577,7 @@ static void sta_apply_parameters(struct ieee80211_local *local, struct sta_info *sta, struct station_parameters *params) { + unsigned long flags; u32 rates; int i, j; struct ieee80211_supported_band *sband; @@ -585,7 +586,7 @@ static void sta_apply_parameters(struct ieee80211_local *local, sband = local->hw.wiphy->bands[local->oper_channel->band]; - spin_lock_bh(&sta->lock); + spin_lock_irqsave(&sta->flaglock, flags); mask = params->sta_flags_mask; set = params->sta_flags_set; @@ -612,7 +613,7 @@ static void sta_apply_parameters(struct ieee80211_local *local, if (set & BIT(NL80211_STA_FLAG_MFP)) sta->flags |= WLAN_STA_MFP; } - spin_unlock_bh(&sta->lock); + spin_unlock_irqrestore(&sta->flaglock, flags); /* * cfg80211 validates this (1-2007) and allows setting the AID -- cgit v1.1 From 46a5ebaf02d69e26ee0f47a0b8d2d9bc619240d4 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Sep 2010 13:28:15 +0200 Subject: cfg80211/mac80211: use lockdep_assert_held Instead of using a WARN_ON(!mutex_is_locked()) use lockdep_assert_held() which compiles away completely when lockdep isn't enabled, and also is a more accurate assertion since it checks that the current thread is holding the mutex. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/chan.c | 2 +- net/mac80211/key.c | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/sta_info.c | 2 +- net/mac80211/util.c | 4 ++-- net/wireless/core.h | 9 ++++++--- net/wireless/reg.c | 6 +++++- 7 files changed, 17 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c index 32be11e..5b24740 100644 --- a/net/mac80211/chan.c +++ b/net/mac80211/chan.c @@ -11,7 +11,7 @@ __ieee80211_get_channel_mode(struct ieee80211_local *local, { struct ieee80211_sub_if_data *sdata; - WARN_ON(!mutex_is_locked(&local->iflist_mtx)); + lockdep_assert_held(&local->iflist_mtx); list_for_each_entry(sdata, &local->interfaces, list) { if (sdata == ignore) diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 3570f8c..6a63d1a 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -49,7 +49,7 @@ static const u8 bcast_addr[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF }; static void assert_key_lock(struct ieee80211_local *local) { - WARN_ON(!mutex_is_locked(&local->key_mtx)); + lockdep_assert_held(&local->key_mtx); } static struct ieee80211_sta *get_sta_for_key(struct ieee80211_key *key) diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 0cb822c..bfb0eab 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -92,7 +92,7 @@ enum rx_mgmt_action { /* utils */ static inline void ASSERT_MGD_MTX(struct ieee80211_if_managed *ifmgd) { - WARN_ON(!mutex_is_locked(&ifmgd->mtx)); + lockdep_assert_held(&ifmgd->mtx); } /* diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index e356ff8..44e10a9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -280,7 +280,7 @@ static int sta_info_finish_insert(struct sta_info *sta, bool async) unsigned long flags; int err = 0; - WARN_ON(!mutex_is_locked(&local->sta_mtx)); + lockdep_assert_held(&local->sta_mtx); /* notify driver */ if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bd40b11..9f21a69 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1296,9 +1296,9 @@ void ieee80211_recalc_smps(struct ieee80211_local *local, int count = 0; if (forsdata) - WARN_ON(!mutex_is_locked(&forsdata->u.mgd.mtx)); + lockdep_assert_held(&forsdata->u.mgd.mtx); - WARN_ON(!mutex_is_locked(&local->iflist_mtx)); + lockdep_assert_held(&local->iflist_mtx); /* * This function could be improved to handle multiple diff --git a/net/wireless/core.h b/net/wireless/core.h index 58ab2c7..37580e0 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -95,7 +95,10 @@ extern struct mutex cfg80211_mutex; extern struct list_head cfg80211_rdev_list; extern int cfg80211_rdev_list_generation; -#define assert_cfg80211_lock() WARN_ON(!mutex_is_locked(&cfg80211_mutex)) +static inline void assert_cfg80211_lock(void) +{ + lockdep_assert_held(&cfg80211_mutex); +} /* * You can use this to mark a wiphy_idx as not having an associated wiphy. @@ -202,8 +205,8 @@ static inline void wdev_unlock(struct wireless_dev *wdev) mutex_unlock(&wdev->mtx); } -#define ASSERT_RDEV_LOCK(rdev) WARN_ON(!mutex_is_locked(&(rdev)->mtx)); -#define ASSERT_WDEV_LOCK(wdev) WARN_ON(!mutex_is_locked(&(wdev)->mtx)); +#define ASSERT_RDEV_LOCK(rdev) lockdep_assert_held(&(rdev)->mtx) +#define ASSERT_WDEV_LOCK(wdev) lockdep_assert_held(&(wdev)->mtx) enum cfg80211_event_type { EVENT_CONNECT_RESULT, diff --git a/net/wireless/reg.c b/net/wireless/reg.c index b0d9a08..d14bbf9 100644 --- a/net/wireless/reg.c +++ b/net/wireless/reg.c @@ -74,7 +74,11 @@ const struct ieee80211_regdomain *cfg80211_regdomain; * - last_request */ static DEFINE_MUTEX(reg_mutex); -#define assert_reg_lock() WARN_ON(!mutex_is_locked(®_mutex)) + +static inline void assert_reg_lock(void) +{ + lockdep_assert_held(®_mutex); +} /* Used to queue up regulatory hints */ static LIST_HEAD(reg_requests_list); -- cgit v1.1 From 2d2080c3c1d52e186166afc3efe5067291e618bf Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 15 Sep 2010 15:13:13 +0200 Subject: mac80211: set running state earlier When an interface is brought up, the recent changes to allow changing type-while-up only set the running bit after everything was done. This broke a number of things, including idle calculation for monitor interfaces, and it also broke WDS station insertion (although nobody noticed yet). Thus, change the code to set the running bit earlier, but keep it after the driver's add_interface was called because otherwise drivers may iterate over interfaces they haven't fully set up yet. Reported-by: Rajkumar Manoharan Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/iface.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index c1cc200..95908aa 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -280,6 +280,8 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) netif_carrier_on(dev); } + set_bit(SDATA_STATE_RUNNING, &sdata->state); + if (sdata->vif.type == NL80211_IFTYPE_WDS) { /* Create STA entry for the WDS peer */ sta = sta_info_alloc(sdata, sdata->u.wds.remote_addr, @@ -331,8 +333,6 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) netif_tx_start_all_queues(dev); - set_bit(SDATA_STATE_RUNNING, &sdata->state); - return 0; err_del_interface: drv_remove_interface(local, &sdata->vif); @@ -343,6 +343,7 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) sdata->bss = NULL; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) list_del(&sdata->u.vlan.list); + clear_bit(SDATA_STATE_RUNNING, &sdata->state); return res; } -- cgit v1.1 From 074ac8df9f93f2a35a356d92fd7f16cd846f0a03 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 16 Sep 2010 14:58:22 +0200 Subject: cfg80211/nl80211: introduce p2p device types This adds P2P-STA and P2P-GO as device types so we can distinguish between those and normal STA or AP (respectively) type interfaces. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/core.c | 2 ++ net/wireless/mlme.c | 3 ++- net/wireless/nl80211.c | 56 +++++++++++++++++++++++++++++++++++--------------- net/wireless/sme.c | 9 +++++--- net/wireless/util.c | 15 +++++++++++--- 5 files changed, 61 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index b8191cf..ff9615a 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -726,6 +726,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, dev->ethtool_ops = &cfg80211_ethtool_ops; if ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT || wdev->iftype == NL80211_IFTYPE_ADHOC) && !wdev->use_4addr) dev->priv_flags |= IFF_DONT_BRIDGE; break; @@ -734,6 +735,7 @@ static int cfg80211_netdev_notifier_call(struct notifier_block * nb, case NL80211_IFTYPE_ADHOC: cfg80211_leave_ibss(rdev, dev, true); break; + case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: wdev_lock(wdev); #ifdef CONFIG_CFG80211_WEXT diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 8515b1e..46f3711 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -882,7 +882,8 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, if (!wdev->current_bss || memcmp(wdev->current_bss->pub.bssid, mgmt->bssid, ETH_ALEN) != 0 || - (wdev->iftype == NL80211_IFTYPE_STATION && + ((wdev->iftype == NL80211_IFTYPE_STATION || + wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && memcmp(wdev->current_bss->pub.bssid, mgmt->da, ETH_ALEN) != 0)) { wdev_unlock(wdev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 1d6ef24..f15b1af 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -410,12 +410,14 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: break; case NL80211_IFTYPE_ADHOC: if (!wdev->current_bss) return -ENOLINK; break; case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: if (wdev->sme_state != CFG80211_SME_CONNECTED) return -ENOLINK; break; @@ -766,7 +768,8 @@ static bool nl80211_can_set_dev_channel(struct wireless_dev *wdev) wdev->iftype == NL80211_IFTYPE_AP || wdev->iftype == NL80211_IFTYPE_WDS || wdev->iftype == NL80211_IFTYPE_MESH_POINT || - wdev->iftype == NL80211_IFTYPE_MONITOR; + wdev->iftype == NL80211_IFTYPE_MONITOR || + wdev->iftype == NL80211_IFTYPE_P2P_GO; } static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, @@ -1693,7 +1696,8 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EOPNOTSUPP; goto out; } @@ -1785,7 +1789,8 @@ static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info) goto out; } - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EOPNOTSUPP; goto out; } @@ -2128,10 +2133,12 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) switch (dev->ieee80211_ptr->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: /* disallow mesh-specific things */ if (params.plink_action) err = -EINVAL; break; + case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: /* disallow everything but AUTHORIZED flag */ if (params.plink_action) @@ -2233,7 +2240,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) goto out_rtnl; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN) { + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EINVAL; goto out; } @@ -2286,7 +2294,8 @@ static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EINVAL; goto out; } @@ -2660,7 +2669,8 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) goto out; } - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EOPNOTSUPP; goto out; } @@ -3363,6 +3373,7 @@ static int nl80211_send_bss(struct sk_buff *msg, u32 pid, u32 seq, int flags, } switch (wdev->iftype) { + case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_STATION: if (intbss == wdev->current_bss) NLA_PUT_U32(msg, NL80211_BSS_STATUS, @@ -3649,7 +3660,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) goto out; } - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -3804,7 +3816,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) goto out; } - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -3888,7 +3901,8 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) goto out; } - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -3954,7 +3968,8 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) goto out; } - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -4332,7 +4347,8 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -4408,7 +4424,8 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) if (err) goto unlock_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -4496,7 +4513,8 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -4541,7 +4559,8 @@ static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) if (err) goto out_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION) { + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -4823,7 +4842,8 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) goto unlock_rtnl; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -4875,7 +4895,8 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) } if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto out; } @@ -5093,7 +5114,8 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, goto unlock_rdev; } - if (wdev->iftype != NL80211_IFTYPE_STATION) { + if (wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) { err = -EOPNOTSUPP; goto unlock_rdev; } diff --git a/net/wireless/sme.c b/net/wireless/sme.c index a8c2d6b..f161b98 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -411,7 +411,8 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid, ASSERT_WDEV_LOCK(wdev); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; if (wdev->sme_state != CFG80211_SME_CONNECTING) @@ -548,7 +549,8 @@ void __cfg80211_roamed(struct wireless_dev *wdev, const u8 *bssid, ASSERT_WDEV_LOCK(wdev); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; if (wdev->sme_state != CFG80211_SME_CONNECTED) @@ -644,7 +646,8 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, ASSERT_WDEV_LOCK(wdev); - if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION)) + if (WARN_ON(wdev->iftype != NL80211_IFTYPE_STATION && + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT)) return; if (wdev->sme_state != CFG80211_SME_CONNECTED) diff --git a/net/wireless/util.c b/net/wireless/util.c index bca32eb..fb5448f 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -326,7 +326,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) { case cpu_to_le16(IEEE80211_FCTL_TODS): if (unlikely(iftype != NL80211_IFTYPE_AP && - iftype != NL80211_IFTYPE_AP_VLAN)) + iftype != NL80211_IFTYPE_AP_VLAN && + iftype != NL80211_IFTYPE_P2P_GO)) return -1; break; case cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS): @@ -354,7 +355,8 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr, break; case cpu_to_le16(IEEE80211_FCTL_FROMDS): if ((iftype != NL80211_IFTYPE_STATION && - iftype != NL80211_IFTYPE_MESH_POINT) || + iftype != NL80211_IFTYPE_P2P_CLIENT && + iftype != NL80211_IFTYPE_MESH_POINT) || (is_multicast_ether_addr(dst) && !compare_ether_addr(src, addr))) return -1; @@ -431,6 +433,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, switch (iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: fc |= cpu_to_le16(IEEE80211_FCTL_FROMDS); /* DA BSSID SA */ memcpy(hdr.addr1, skb->data, ETH_ALEN); @@ -439,6 +442,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr, hdrlen = 24; break; case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: fc |= cpu_to_le16(IEEE80211_FCTL_TODS); /* BSSID SA DA */ memcpy(hdr.addr1, bssid, ETH_ALEN); @@ -778,7 +782,9 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, /* if it's part of a bridge, reject changing type to station/ibss */ if ((dev->priv_flags & IFF_BRIDGE_PORT) && - (ntype == NL80211_IFTYPE_ADHOC || ntype == NL80211_IFTYPE_STATION)) + (ntype == NL80211_IFTYPE_ADHOC || + ntype == NL80211_IFTYPE_STATION || + ntype == NL80211_IFTYPE_P2P_CLIENT)) return -EBUSY; if (ntype != otype) { @@ -789,6 +795,7 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, cfg80211_leave_ibss(rdev, dev, false); break; case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: cfg80211_disconnect(rdev, dev, WLAN_REASON_DEAUTH_LEAVING, true); break; @@ -817,9 +824,11 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev, if (dev->ieee80211_ptr->use_4addr) break; /* fall through */ + case NL80211_IFTYPE_P2P_CLIENT: case NL80211_IFTYPE_ADHOC: dev->priv_flags |= IFF_DONT_BRIDGE; break; + case NL80211_IFTYPE_P2P_GO: case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: case NL80211_IFTYPE_WDS: -- cgit v1.1 From 2ca27bcff7127da1aa7dd39cd2a6f7cb187e327f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 16 Sep 2010 14:58:23 +0200 Subject: mac80211: add p2p device type support When a driver advertises p2p device support, mac80211 will handle it, but internally it will rewrite the interface type to STA/AP rather than P2P-STA/GO since otherwise a lot of paths need to be touched that are otherwise identical. A p2p boolean tells drivers whether or not a given interface will be used for p2p or not. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 25 ++++++++++++++++++------- net/mac80211/driver-ops.h | 6 +++--- net/mac80211/driver-trace.h | 21 +++++++++++++-------- net/mac80211/iface.c | 29 ++++++++++++++++++++++++++--- net/mac80211/main.c | 15 +++++++++++++++ net/mac80211/rx.c | 4 +--- net/mac80211/util.c | 18 ++++-------------- 7 files changed, 80 insertions(+), 38 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 171e8ff..c981604 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1151,15 +1151,26 @@ static int ieee80211_scan(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_scan_request *req) { - struct ieee80211_sub_if_data *sdata; - - sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); - if (sdata->vif.type != NL80211_IFTYPE_STATION && - sdata->vif.type != NL80211_IFTYPE_ADHOC && - sdata->vif.type != NL80211_IFTYPE_MESH_POINT && - (sdata->vif.type != NL80211_IFTYPE_AP || sdata->u.ap.beacon)) + switch (ieee80211_vif_type_p2p(&sdata->vif)) { + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_MESH_POINT: + case NL80211_IFTYPE_P2P_CLIENT: + break; + case NL80211_IFTYPE_P2P_GO: + if (sdata->local->ops->hw_scan) + break; + /* FIXME: implement NoA while scanning in software */ + return -EOPNOTSUPP; + case NL80211_IFTYPE_AP: + if (sdata->u.ap.beacon) + return -EOPNOTSUPP; + break; + default: return -EOPNOTSUPP; + } return ieee80211_request_scan(sdata, req); } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 6064b7b..1698382 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -56,14 +56,14 @@ static inline int drv_add_interface(struct ieee80211_local *local, static inline int drv_change_interface(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum nl80211_iftype type) + enum nl80211_iftype type, bool p2p) { int ret; might_sleep(); - trace_drv_change_interface(local, sdata, type); - ret = local->ops->change_interface(&local->hw, &sdata->vif, type); + trace_drv_change_interface(local, sdata, type, p2p); + ret = local->ops->change_interface(&local->hw, &sdata->vif, type, p2p); trace_drv_return_int(local, ret); return ret; } diff --git a/net/mac80211/driver-trace.h b/net/mac80211/driver-trace.h index f6f3d89..6831fb1 100644 --- a/net/mac80211/driver-trace.h +++ b/net/mac80211/driver-trace.h @@ -25,12 +25,14 @@ static inline void trace_ ## name(proto) {} #define STA_PR_FMT " sta:%pM" #define STA_PR_ARG __entry->sta_addr -#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ +#define VIF_ENTRY __field(enum nl80211_iftype, vif_type) __field(void *, sdata) \ + __field(bool, p2p) \ __string(vif_name, sdata->dev ? sdata->dev->name : "") -#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ +#define VIF_ASSIGN __entry->vif_type = sdata->vif.type; __entry->sdata = sdata; \ + __entry->p2p = sdata->vif.p2p; \ __assign_str(vif_name, sdata->dev ? sdata->dev->name : "") -#define VIF_PR_FMT " vif:%s(%d)" -#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type +#define VIF_PR_FMT " vif:%s(%d%s)" +#define VIF_PR_ARG __get_str(vif_name), __entry->vif_type, __entry->p2p ? "/p2p" : "" /* * Tracing for driver callbacks. @@ -139,25 +141,28 @@ TRACE_EVENT(drv_add_interface, TRACE_EVENT(drv_change_interface, TP_PROTO(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, - enum nl80211_iftype type), + enum nl80211_iftype type, bool p2p), - TP_ARGS(local, sdata, type), + TP_ARGS(local, sdata, type, p2p), TP_STRUCT__entry( LOCAL_ENTRY VIF_ENTRY __field(u32, new_type) + __field(bool, new_p2p) ), TP_fast_assign( LOCAL_ASSIGN; VIF_ASSIGN; __entry->new_type = type; + __entry->new_p2p = p2p; ), TP_printk( - LOCAL_PR_FMT VIF_PR_FMT " new type:%d", - LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type + LOCAL_PR_FMT VIF_PR_FMT " new type:%d%s", + LOCAL_PR_ARG, VIF_PR_ARG, __entry->new_type, + __entry->new_p2p ? "/p2p" : "" ) ); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 95908aa..6678573 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -188,6 +188,8 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_P2P_GO: /* cannot happen */ WARN_ON(1); break; @@ -844,6 +846,7 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, /* and set some type-dependent values */ sdata->vif.type = type; + sdata->vif.p2p = false; sdata->dev->netdev_ops = &ieee80211_dataif_ops; sdata->wdev.iftype = type; @@ -857,10 +860,20 @@ static void ieee80211_setup_sdata(struct ieee80211_sub_if_data *sdata, INIT_WORK(&sdata->work, ieee80211_iface_work); switch (type) { + case NL80211_IFTYPE_P2P_GO: + type = NL80211_IFTYPE_AP; + sdata->vif.type = type; + sdata->vif.p2p = true; + /* fall through */ case NL80211_IFTYPE_AP: skb_queue_head_init(&sdata->u.ap.ps_bc_buf); INIT_LIST_HEAD(&sdata->u.ap.vlans); break; + case NL80211_IFTYPE_P2P_CLIENT: + type = NL80211_IFTYPE_STATION; + sdata->vif.type = type; + sdata->vif.p2p = true; + /* fall through */ case NL80211_IFTYPE_STATION: ieee80211_sta_setup_sdata(sdata); break; @@ -894,6 +907,8 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, { struct ieee80211_local *local = sdata->local; int ret, err; + enum nl80211_iftype internal_type = type; + bool p2p = false; ASSERT_RTNL(); @@ -926,11 +941,19 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, * code isn't prepared to handle). */ break; + case NL80211_IFTYPE_P2P_CLIENT: + p2p = true; + internal_type = NL80211_IFTYPE_STATION; + break; + case NL80211_IFTYPE_P2P_GO: + p2p = true; + internal_type = NL80211_IFTYPE_AP; + break; default: return -EBUSY; } - ret = ieee80211_check_concurrent_iface(sdata, type); + ret = ieee80211_check_concurrent_iface(sdata, internal_type); if (ret) return ret; @@ -938,7 +961,7 @@ static int ieee80211_runtime_change_iftype(struct ieee80211_sub_if_data *sdata, ieee80211_teardown_sdata(sdata->dev); - ret = drv_change_interface(local, sdata, type); + ret = drv_change_interface(local, sdata, internal_type, p2p); if (ret) type = sdata->vif.type; @@ -957,7 +980,7 @@ int ieee80211_if_change_type(struct ieee80211_sub_if_data *sdata, ASSERT_RTNL(); - if (type == sdata->vif.type) + if (type == ieee80211_vif_type_p2p(&sdata->vif)) return 0; /* Setting ad-hoc mode on non-IBSS channel is not supported. */ diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 7fb1148..18fdeca 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -459,6 +459,21 @@ ieee80211_default_mgmt_stypes[NUM_NL80211_IFTYPES] = { BIT(IEEE80211_STYPE_DEAUTH >> 4) | BIT(IEEE80211_STYPE_ACTION >> 4), }, + [NL80211_IFTYPE_P2P_CLIENT] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ACTION >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4), + }, + [NL80211_IFTYPE_P2P_GO] = { + .tx = 0xffff, + .rx = BIT(IEEE80211_STYPE_ASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_REASSOC_REQ >> 4) | + BIT(IEEE80211_STYPE_PROBE_REQ >> 4) | + BIT(IEEE80211_STYPE_DISASSOC >> 4) | + BIT(IEEE80211_STYPE_AUTH >> 4) | + BIT(IEEE80211_STYPE_DEAUTH >> 4) | + BIT(IEEE80211_STYPE_ACTION >> 4), + }, }; struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index ac205a3..c0368152 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2588,9 +2588,7 @@ static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, if (compare_ether_addr(sdata->u.wds.remote_addr, hdr->addr2)) return 0; break; - case NL80211_IFTYPE_MONITOR: - case NL80211_IFTYPE_UNSPECIFIED: - case NUM_NL80211_IFTYPES: + default: /* should never get here */ WARN_ON(1); break; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 9f21a69..737f426 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -474,16 +474,10 @@ void ieee80211_iterate_active_interfaces( list_for_each_entry(sdata, &local->interfaces, list) { switch (sdata->vif.type) { - case NUM_NL80211_IFTYPES: - case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: continue; - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: - case NL80211_IFTYPE_MESH_POINT: + default: break; } if (ieee80211_sdata_running(sdata)) @@ -508,16 +502,10 @@ void ieee80211_iterate_active_interfaces_atomic( list_for_each_entry_rcu(sdata, &local->interfaces, list) { switch (sdata->vif.type) { - case NUM_NL80211_IFTYPES: - case NL80211_IFTYPE_UNSPECIFIED: case NL80211_IFTYPE_MONITOR: case NL80211_IFTYPE_AP_VLAN: continue; - case NL80211_IFTYPE_AP: - case NL80211_IFTYPE_STATION: - case NL80211_IFTYPE_ADHOC: - case NL80211_IFTYPE_WDS: - case NL80211_IFTYPE_MESH_POINT: + default: break; } if (ieee80211_sdata_running(sdata)) @@ -1193,6 +1181,8 @@ int ieee80211_reconfig(struct ieee80211_local *local) break; case NL80211_IFTYPE_UNSPECIFIED: case NUM_NL80211_IFTYPES: + case NL80211_IFTYPE_P2P_CLIENT: + case NL80211_IFTYPE_P2P_GO: WARN_ON(1); break; } -- cgit v1.1 From be099e82e9cf6d5d65d044e9ef6fc8bee3c7a113 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 16 Sep 2010 15:12:29 -0400 Subject: mac80211: add helper for reseting the connection monitor This will be used in another place later. The connection monitor was added as of 2.6.35 so these fixes will be applicable to >= 2.6.35. Cc: stable@kernel.org Cc: Paul Stewart Cc: Amod Bodas Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 15 ++++++++++----- 2 files changed, 11 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4e635e2..737fd0f 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1041,6 +1041,7 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); /* IBSS code */ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local); diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index bfb0eab..2d86a4d 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -124,6 +124,15 @@ static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata) round_jiffies_up(jiffies + IEEE80211_BEACON_LOSS_TIME)); } +void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) +{ + if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) + return; + + mod_timer(&sdata->u.mgd.conn_mon_timer, + round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); +} + static int ecw2cw(int ecw) { return (1 << ecw) - 1; @@ -1018,11 +1027,7 @@ void ieee80211_sta_rx_notify(struct ieee80211_sub_if_data *sdata, if (is_multicast_ether_addr(hdr->addr1)) return; - if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) - return; - - mod_timer(&sdata->u.mgd.conn_mon_timer, - round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); + ieee80211_sta_reset_conn_monitor(sdata); } static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) -- cgit v1.1 From 0c699c3a75d4e8d0d2c317f83048d8fd3ffe692a Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 16 Sep 2010 15:12:30 -0400 Subject: mac80211: reset probe send counter upon connection timer reset Upon beacon loss we send probe requests after 30 seconds of idle time and we wait for each probe response 1/2 second. We send a total of 3 probe requests before giving up on the AP. In the case that we reset the connection idle monitor we should reset the probe requests count to 0. Right now this won't help in any way but the next patch will. This patch has fixes for stable kernel [2.6.35+]. Cc: stable@kernel.org Cc: Paul Stewart Cc: Amod Bodas Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2d86a4d..d7915ab 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -126,11 +126,15 @@ static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata) void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata) { + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; + if (sdata->local->hw.flags & IEEE80211_HW_CONNECTION_MONITOR) return; mod_timer(&sdata->u.mgd.conn_mon_timer, round_jiffies_up(jiffies + IEEE80211_CONNECTION_IDLE_TIME)); + + ifmgd->probe_send_count = 0; } static int ecw2cw(int ecw) -- cgit v1.1 From 4730d5977f3e12b828d354f7752cffd94bdf39e5 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 16 Sep 2010 15:12:31 -0400 Subject: mac80211: reset connection idle when going offchannel When we go offchannel mac80211 currently leaves alive the connection idle monitor. This should be instead postponed until we come back to our home channel, otherwise by the time we get back to the home channel we could be triggering unecesary probe requests. For APs that do not respond to unicast probe requests (Nexus One is a simple example) this means we essentially get disconnected after the probes fails. This patch has stable fixes for kernels [2.6.35+] Cc: stable@kernel.org Cc: Paul Stewart Cc: Amod Bodas Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/offchannel.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index eeacaa5..627a33e 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -22,12 +22,15 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) { struct ieee80211_local *local = sdata->local; + struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; local->offchannel_ps_enabled = false; /* FIXME: what to do when local->pspolling is true? */ del_timer_sync(&local->dynamic_ps_timer); + del_timer_sync(&ifmgd->conn_mon_timer); + cancel_work_sync(&local->dynamic_ps_enable_work); if (local->hw.conf.flags & IEEE80211_CONF_PS) { @@ -85,6 +88,8 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) mod_timer(&local->dynamic_ps_timer, jiffies + msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } + + ieee80211_sta_reset_conn_monitor(sdata); } void ieee80211_offchannel_stop_beaconing(struct ieee80211_local *local) -- cgit v1.1 From d3a910a8e4e846b9a767d35483f4dc7c6de7af82 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 16 Sep 2010 15:12:32 -0400 Subject: mac80211: make the beacon monitor available externally This will be used by other components next. The beacon monitor was added as of 2.6.34 so these fixes are applicable only to kernels >= 2.6.34. Cc: stable@kernel.org Cc: Paul Stewart Cc: Amod Bodas Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 8 ++++---- 2 files changed, 5 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 737fd0f..9346a6b 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1041,6 +1041,7 @@ void ieee80211_sta_restart(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_rx_queued_mgmt(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); +void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata); void ieee80211_sta_reset_conn_monitor(struct ieee80211_sub_if_data *sdata); /* IBSS code */ diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index d7915ab..07d03e7 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -115,7 +115,7 @@ static void run_again(struct ieee80211_if_managed *ifmgd, mod_timer(&ifmgd->timer, timeout); } -static void mod_beacon_timer(struct ieee80211_sub_if_data *sdata) +void ieee80211_sta_reset_beacon_monitor(struct ieee80211_sub_if_data *sdata) { if (sdata->local->hw.flags & IEEE80211_HW_BEACON_FILTER) return; @@ -1390,7 +1390,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk, * Also start the timer that will detect beacon loss. */ ieee80211_sta_rx_notify(sdata, (struct ieee80211_hdr *)mgmt); - mod_beacon_timer(sdata); + ieee80211_sta_reset_beacon_monitor(sdata); return true; } @@ -1493,7 +1493,7 @@ static void ieee80211_rx_mgmt_probe_resp(struct ieee80211_sub_if_data *sdata, * we have or will be receiving any beacons or data, so let's * schedule the timers again, just in case. */ - mod_beacon_timer(sdata); + ieee80211_sta_reset_beacon_monitor(sdata); mod_timer(&ifmgd->conn_mon_timer, round_jiffies_up(jiffies + @@ -1619,7 +1619,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, * Push the beacon loss detection into the future since * we are processing a beacon from the AP just now. */ - mod_beacon_timer(sdata); + ieee80211_sta_reset_beacon_monitor(sdata); ncrc = crc32_be(0, (void *)&mgmt->u.beacon.beacon_int, 4); ncrc = ieee802_11_parse_elems_crc(mgmt->u.beacon.variable, -- cgit v1.1 From 3bc3c0d748402e8c1f31b8569f5924d25d7b8e30 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 16 Sep 2010 15:12:33 -0400 Subject: mac80211: disable beacon monitor while going offchannel The beacon monitor should be disabled when going off channel to prevent spurious warnings and triggering connection deterioration work such as sending probe requests. Re-enable the beacon monitor once we come back to the home channel. This patch has fixes for stable kernels [2.6.34+]. Cc: stable@kernel.org Cc: Paul Stewart Cc: Amod Bodas Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/offchannel.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 627a33e..4b56409 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -29,6 +29,7 @@ static void ieee80211_offchannel_ps_enable(struct ieee80211_sub_if_data *sdata) /* FIXME: what to do when local->pspolling is true? */ del_timer_sync(&local->dynamic_ps_timer); + del_timer_sync(&ifmgd->bcn_mon_timer); del_timer_sync(&ifmgd->conn_mon_timer); cancel_work_sync(&local->dynamic_ps_enable_work); @@ -89,6 +90,7 @@ static void ieee80211_offchannel_ps_disable(struct ieee80211_sub_if_data *sdata) msecs_to_jiffies(local->hw.conf.dynamic_ps_timeout)); } + ieee80211_sta_reset_beacon_monitor(sdata); ieee80211_sta_reset_conn_monitor(sdata); } -- cgit v1.1 From f01a067d9e4598c71e3c9ee3a84859d2e8af4f8e Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Thu, 16 Sep 2010 15:12:34 -0400 Subject: mac80211: send last 3/5 probe requests as unicast Some buggy APs do not respond to unicast probe requests or send unicast probe requests very delayed so in the worst case we should try to send broadcast probe requests, otherwise we can get disconnected from these APs. Even if drivers do not have filters to disregard probe responses from foreign APs mac80211 will only process probe responses from our associated AP for re-arming connection monitoring. We need to do this since the beacon monitor does not push back the connection monitor by design so even if we are getting beacons from these type of APs our connection monitor currently relies heavily on the way the probe requests are received on the AP. An example of an AP affected by this is the Nexus One, but this has also been observed with random APs. We can probably optimize this later by using null funcs instead of probe requests. For more details refer to: http://code.google.com/p/chromium-os/issues/detail?id=5715 This patch has fixes for stable kernels [2.6.35+]. Cc: stable@kernel.org Cc: Paul Stewart Cc: Amod Bodas Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 07d03e7..8b733cf 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1038,10 +1038,19 @@ static void ieee80211_mgd_probe_ap_send(struct ieee80211_sub_if_data *sdata) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; const u8 *ssid; + u8 *dst = ifmgd->associated->bssid; + u8 unicast_limit = max(1, IEEE80211_MAX_PROBE_TRIES - 3); + + /* + * Try sending broadcast probe requests for the last three + * probe requests after the first ones failed since some + * buggy APs only support broadcast probe requests. + */ + if (ifmgd->probe_send_count >= unicast_limit) + dst = NULL; ssid = ieee80211_bss_get_ie(ifmgd->associated, WLAN_EID_SSID); - ieee80211_send_probe_req(sdata, ifmgd->associated->bssid, - ssid + 2, ssid[1], NULL, 0); + ieee80211_send_probe_req(sdata, dst, ssid + 2, ssid[1], NULL, 0); ifmgd->probe_send_count++; ifmgd->probe_timeout = jiffies + IEEE80211_PROBE_WAIT; -- cgit v1.1 From caeda9b926c608702c99f3432aae2c24298c3c1d Mon Sep 17 00:00:00 2001 From: Stephen Rothwell Date: Thu, 16 Sep 2010 21:39:16 -0700 Subject: net: include inetdevice.h for rcu_dereference_raw api change rcu_dereference_raw() now needs to know the type of its argument. Signed-off-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/core/dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 09b3742..c09ff09 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -129,6 +129,7 @@ #include #include #include +#include #include "net-sysfs.h" -- cgit v1.1 From 94767632623c7bf5b16a0cf963ec93a8ad9acca4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 15 Sep 2010 20:25:34 +0000 Subject: ip6tnl: get rid of ip6_tnl_lock As RTNL is held while doing tunnels inserts and deletes, we can remove ip6_tnl_lock spinlock. My initial RCU conversion was conservative and converted the rwlock to spinlock, with no RTNL requirement. Use appropriate rcu annotations and modern lockdep checks as well. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 58 +++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 29f99dd..9289cec 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -83,15 +83,14 @@ struct ip6_tnl_net { /* the IPv6 tunnel fallback device */ struct net_device *fb_tnl_dev; /* lists for storing tunnels in use */ - struct ip6_tnl *tnls_r_l[HASH_SIZE]; - struct ip6_tnl *tnls_wc[1]; - struct ip6_tnl **tnls[2]; + struct ip6_tnl __rcu *tnls_r_l[HASH_SIZE]; + struct ip6_tnl __rcu *tnls_wc[1]; + struct ip6_tnl __rcu **tnls[2]; }; /* - * Locking : hash tables are protected by RCU and a spinlock + * Locking : hash tables are protected by RCU and RTNL */ -static DEFINE_SPINLOCK(ip6_tnl_lock); static inline struct dst_entry *ip6_tnl_dst_check(struct ip6_tnl *t) { @@ -138,8 +137,8 @@ static inline void ip6_tnl_dst_store(struct ip6_tnl *t, struct dst_entry *dst) static struct ip6_tnl * ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local) { - unsigned h0 = HASH(remote); - unsigned h1 = HASH(local); + unsigned int h0 = HASH(remote); + unsigned int h1 = HASH(local); struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); @@ -167,7 +166,7 @@ ip6_tnl_lookup(struct net *net, struct in6_addr *remote, struct in6_addr *local) * Return: head of IPv6 tunnel list **/ -static struct ip6_tnl ** +static struct ip6_tnl __rcu ** ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p) { struct in6_addr *remote = &p->raddr; @@ -190,12 +189,10 @@ ip6_tnl_bucket(struct ip6_tnl_net *ip6n, struct ip6_tnl_parm *p) static void ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { - struct ip6_tnl **tp = ip6_tnl_bucket(ip6n, &t->parms); + struct ip6_tnl __rcu **tp = ip6_tnl_bucket(ip6n, &t->parms); - spin_lock_bh(&ip6_tnl_lock); - t->next = *tp; + rcu_assign_pointer(t->next , rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); - spin_unlock_bh(&ip6_tnl_lock); } /** @@ -206,13 +203,14 @@ ip6_tnl_link(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) static void ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) { - struct ip6_tnl **tp; - - for (tp = ip6_tnl_bucket(ip6n, &t->parms); *tp; tp = &(*tp)->next) { - if (t == *tp) { - spin_lock_bh(&ip6_tnl_lock); - *tp = t->next; - spin_unlock_bh(&ip6_tnl_lock); + struct ip6_tnl __rcu **tp; + struct ip6_tnl *iter; + + for (tp = ip6_tnl_bucket(ip6n, &t->parms); + (iter = rtnl_dereference(*tp)) != NULL; + tp = &iter->next) { + if (t == iter) { + rcu_assign_pointer(*tp, t->next); break; } } @@ -290,10 +288,13 @@ static struct ip6_tnl *ip6_tnl_locate(struct net *net, { struct in6_addr *remote = &p->raddr; struct in6_addr *local = &p->laddr; + struct ip6_tnl __rcu **tp; struct ip6_tnl *t; struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - for (t = *ip6_tnl_bucket(ip6n, p); t; t = t->next) { + for (tp = ip6_tnl_bucket(ip6n, p); + (t = rtnl_dereference(*tp)) != NULL; + tp = &t->next) { if (ipv6_addr_equal(local, &t->parms.laddr) && ipv6_addr_equal(remote, &t->parms.raddr)) return t; @@ -318,13 +319,10 @@ ip6_tnl_dev_uninit(struct net_device *dev) struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); - if (dev == ip6n->fb_tnl_dev) { - spin_lock_bh(&ip6_tnl_lock); - ip6n->tnls_wc[0] = NULL; - spin_unlock_bh(&ip6_tnl_lock); - } else { + if (dev == ip6n->fb_tnl_dev) + rcu_assign_pointer(ip6n->tnls_wc[0], NULL); + else ip6_tnl_unlink(ip6n, t); - } ip6_tnl_dst_reset(t); dev_put(dev); } @@ -1369,7 +1367,7 @@ static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev) ip6_tnl_dev_init_gen(dev); t->parms.proto = IPPROTO_IPV6; dev_hold(dev); - ip6n->tnls_wc[0] = t; + rcu_assign_pointer(ip6n->tnls_wc[0], t); } static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { @@ -1391,14 +1389,14 @@ static void __net_exit ip6_tnl_destroy_tunnels(struct ip6_tnl_net *ip6n) LIST_HEAD(list); for (h = 0; h < HASH_SIZE; h++) { - t = ip6n->tnls_r_l[h]; + t = rtnl_dereference(ip6n->tnls_r_l[h]); while (t != NULL) { unregister_netdevice_queue(t->dev, &list); - t = t->next; + t = rtnl_dereference(t->next); } } - t = ip6n->tnls_wc[0]; + t = rtnl_dereference(ip6n->tnls_wc[0]); unregister_netdevice_queue(t->dev, &list); unregister_netdevice_many(&list); } -- cgit v1.1 From 3575792e005dc9994f15ae72c1c6f401d134177d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Fri, 17 Sep 2010 14:18:16 +0200 Subject: ipvs: extend connection flags to 32 bits - the sync protocol supports 16 bits only, so bits 0..15 should be used only for flags that should go to backup server, bits 16 and above should be allocated for flags not sent to backup. - use IP_VS_CONN_F_DEST_MASK as mask of connection flags in destination that can be changed by user space - allow IP_VS_CONN_F_ONE_PACKET to be set in destination Signed-off-by: Julian Anastasov Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_conn.c | 16 ++++++++++------ net/netfilter/ipvs/ip_vs_core.c | 11 ++++++----- net/netfilter/ipvs/ip_vs_ctl.c | 5 +++-- 3 files changed, 19 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index b71c69a..9fe1da7 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -505,6 +505,8 @@ static inline int ip_vs_dest_totalconns(struct ip_vs_dest *dest) static inline void ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) { + unsigned int conn_flags; + /* if dest is NULL, then return directly */ if (!dest) return; @@ -512,16 +514,18 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) /* Increase the refcnt counter of the dest */ atomic_inc(&dest->refcnt); + conn_flags = atomic_read(&dest->conn_flags); + if (cp->protocol != IPPROTO_UDP) + conn_flags &= ~IP_VS_CONN_F_ONE_PACKET; /* Bind with the destination and its corresponding transmitter */ - if ((cp->flags & IP_VS_CONN_F_SYNC) && - (!(cp->flags & IP_VS_CONN_F_TEMPLATE))) + if (cp->flags & IP_VS_CONN_F_SYNC) { /* if the connection is not template and is created * by sync, preserve the activity flag. */ - cp->flags |= atomic_read(&dest->conn_flags) & - (~IP_VS_CONN_F_INACTIVE); - else - cp->flags |= atomic_read(&dest->conn_flags); + if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) + conn_flags &= ~IP_VS_CONN_F_INACTIVE; + } + cp->flags |= conn_flags; cp->dest = dest; IP_VS_DBG_BUF(7, "Bind-dest %s c:%s:%d v:%s:%d " diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 0c043b6..319991d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -194,7 +194,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_dest *dest; struct ip_vs_conn *ct; __be16 dport; /* destination port to forward */ - __be16 flags; + unsigned int flags; union nf_inet_addr snet; /* source network of the client, after masking */ @@ -382,7 +382,8 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; struct ip_vs_dest *dest; - __be16 _ports[2], *pptr, flags; + __be16 _ports[2], *pptr; + unsigned int flags; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); @@ -473,9 +474,9 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, if (sysctl_ip_vs_cache_bypass && svc->fwmark && unicast) { int ret, cs; struct ip_vs_conn *cp; - __u16 flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && - iph.protocol == IPPROTO_UDP)? - IP_VS_CONN_F_ONE_PACKET : 0; + unsigned int flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && + iph.protocol == IPPROTO_UDP)? + IP_VS_CONN_F_ONE_PACKET : 0; union nf_inet_addr daddr = { .all = { 0, 0, 0, 0 } }; ip_vs_service_put(svc); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index ca8ec8c..7bd41d2 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -765,7 +765,8 @@ __ip_vs_update_dest(struct ip_vs_service *svc, /* set the weight and the flags */ atomic_set(&dest->weight, udest->weight); - conn_flags = udest->conn_flags | IP_VS_CONN_F_INACTIVE; + conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; + conn_flags |= IP_VS_CONN_F_INACTIVE; /* check if local node and update the flags */ #ifdef CONFIG_IP_VS_IPV6 @@ -782,7 +783,7 @@ __ip_vs_update_dest(struct ip_vs_service *svc, } /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ - if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != 0) { + if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; } else { /* -- cgit v1.1 From 67c9660831f6b6b76866a0838466c83765ffbbd3 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 17 Sep 2010 11:56:18 -0700 Subject: ethtool: change ethtool_set_gro() to use ethtool_op_get_rx_csum To be able to switch on GRO on a device, ethtool_set_gro() checks this device provides a get_rx_csum() method. Some devices dont provide this method, while they do support RX checksumming. This patch allows bonding to support GRO : ethtool -K bond0 gro on Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/ethtool.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index fcd62757..248c25c 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -1181,8 +1181,11 @@ static int ethtool_set_gro(struct net_device *dev, char __user *useraddr) return -EFAULT; if (edata.data) { - if (!dev->ethtool_ops->get_rx_csum || - !dev->ethtool_ops->get_rx_csum(dev)) + u32 rxcsum = dev->ethtool_ops->get_rx_csum ? + dev->ethtool_ops->get_rx_csum(dev) : + ethtool_op_get_rx_csum(dev); + + if (!rxcsum) return -EINVAL; dev->features |= NETIF_F_GRO; } else -- cgit v1.1 From 3b27e105550f7c4a79ecb6d6a9c49c651c59ae9b Mon Sep 17 00:00:00 2001 From: David Lamparter Date: Fri, 17 Sep 2010 03:22:19 +0000 Subject: netns: keep vlan slaves on master netns move previously, if a vlan master device was moved from one network namespace to another, all 802.1q and macvlan slaves were deleted. we can use dev->reg_state to figure out whether dev_change_net_namespace is happening, since that won't set dev->reg_state NETREG_UNREGISTERING. so, this changes 8021q and macvlan to ignore NETDEV_UNREGISTER when reg_state is not NETREG_UNREGISTERING. Signed-off-by: David Lamparter Reviewed-by: "Eric W. Biederman" Acked-by: Daniel Lezcano Acked-by: Patrick McHardy Signed-off-by: David S. Miller --- net/8021q/vlan.c | 4 ++++ net/core/dev.c | 4 ++++ 2 files changed, 8 insertions(+) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index a2ad152..2c6c2bd 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -525,6 +525,10 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, break; case NETDEV_UNREGISTER: + /* twiddle thumbs on netns device moves */ + if (dev->reg_state != NETREG_UNREGISTERING) + break; + /* Delete all VLANs for this dev. */ grp->killall = 1; diff --git a/net/core/dev.c b/net/core/dev.c index c09ff09..2c7934f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5686,6 +5686,10 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char /* Notify protocols, that we are about to destroy this device. They should clean all the things. + + Note that dev->reg_state stays at NETREG_REGISTERED. + This is wanted because this way 8021q and macvlan know + the device is just moving and can keep their slaves up. */ call_netdevice_notifiers(NETDEV_UNREGISTER, dev); call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev); -- cgit v1.1 From be2902daee80b655cebd482b5ee91ffc29408121 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 16 Sep 2010 11:28:07 +0000 Subject: ethtool, ixgbe: Move RX n-tuple mask fixup to ethtool The ethtool utility does not set masks for flow parameters that are not specified, so if both value and mask are 0 then this must be treated as equivalent to a mask with all bits set. Currently that is done in the only driver that implements RX n-tuple filtering, ixgbe. Move it to the ethtool core. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 34 ++++++++++++++++++++++++++++++++++ 1 file changed, 34 insertions(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 248c25c..91ffce2 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -485,6 +485,38 @@ static void __rx_ntuple_filter_add(struct ethtool_rx_ntuple_list *list, list->count++; } +/* + * ethtool does not (or did not) set masks for flow parameters that are + * not specified, so if both value and mask are 0 then this must be + * treated as equivalent to a mask with all bits set. Implement that + * here rather than in drivers. + */ +static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs) +{ + struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec; + struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec; + + if (fs->flow_type != TCP_V4_FLOW && + fs->flow_type != UDP_V4_FLOW && + fs->flow_type != SCTP_V4_FLOW) + return; + + if (!(entry->ip4src | mask->ip4src)) + mask->ip4src = htonl(0xffffffff); + if (!(entry->ip4dst | mask->ip4dst)) + mask->ip4dst = htonl(0xffffffff); + if (!(entry->psrc | mask->psrc)) + mask->psrc = htons(0xffff); + if (!(entry->pdst | mask->pdst)) + mask->pdst = htons(0xffff); + if (!(entry->tos | mask->tos)) + mask->tos = 0xff; + if (!(fs->vlan_tag | fs->vlan_tag_mask)) + fs->vlan_tag_mask = 0xffff; + if (!(fs->data | fs->data_mask)) + fs->data_mask = 0xffffffffffffffffULL; +} + static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, void __user *useraddr) { @@ -499,6 +531,8 @@ static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev, if (copy_from_user(&cmd, useraddr, sizeof(cmd))) return -EFAULT; + rx_ntuple_fix_masks(&cmd.fs); + /* * Cache filter in dev struct for GET operation only if * the underlying driver doesn't have its own GET operation, and -- cgit v1.1 From 9b9d2e00bfa592aceda7b43da76c670df61faa97 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 18 Sep 2010 13:42:25 +0000 Subject: rds: signedness bug In the original code if the copy_from_user() fails in rds_rdma_pages() then the error handling fails and we get a stack trace from kmalloc(). Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/rds/rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4806467..1a41deb 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -522,7 +522,7 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, struct rds_rdma_args *args; struct rds_iovec vec; struct rm_rdma_op *op = &rm->rdma; - unsigned int nr_pages; + int nr_pages; unsigned int nr_bytes; struct page **pages = NULL; struct rds_iovec __user *local_vec; -- cgit v1.1 From f4fa7f3807d41b78056c6648b04bfadd737df21e Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 18 Sep 2010 13:42:59 +0000 Subject: rds: double unlock in rds_ib_cm_handle_connect() We unlock after we goto out. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/rds/ib_cm.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index bc3dbc1..ee369d2 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -521,7 +521,6 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, err = rds_ib_setup_qp(conn); if (err) { rds_ib_conn_error(conn, "rds_ib_setup_qp failed (%d)\n", err); - mutex_unlock(&conn->c_cm_lock); goto out; } -- cgit v1.1 From aef3ea33e85035f7c827c1db9155f97f4b7ee725 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sat, 18 Sep 2010 13:44:14 +0000 Subject: rds: spin_lock_irq() is not nestable This is basically just a cleanup. IRQs were disabled on the previous line so we don't need to do it again here. In the current code IRQs would get turned on one line earlier than intended. Signed-off-by: Dan Carpenter Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 4 ++-- net/rds/iw_rdma.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 8f6e221..b5a8841 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -171,9 +171,9 @@ void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *con BUG_ON(list_empty(&ic->ib_node)); list_del(&ic->ib_node); - spin_lock_irq(&rds_ibdev->spinlock); + spin_lock(&rds_ibdev->spinlock); list_add_tail(&ic->ib_node, &rds_ibdev->conn_list); - spin_unlock_irq(&rds_ibdev->spinlock); + spin_unlock(&rds_ibdev->spinlock); spin_unlock_irq(&ib_nodev_conns_lock); ic->rds_ibdev = rds_ibdev; diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 4e152e2..0e7accc 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -206,9 +206,9 @@ void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *con BUG_ON(list_empty(&ic->iw_node)); list_del(&ic->iw_node); - spin_lock_irq(&rds_iwdev->spinlock); + spin_lock(&rds_iwdev->spinlock); list_add_tail(&ic->iw_node, &rds_iwdev->conn_list); - spin_unlock_irq(&rds_iwdev->spinlock); + spin_unlock(&rds_iwdev->spinlock); spin_unlock_irq(&iw_nodev_conns_lock); ic->rds_iwdev = rds_iwdev; -- cgit v1.1 From 462fb2af9788a82a534f8184abfde31574e1cfa0 Mon Sep 17 00:00:00 2001 From: Bandan Das Date: Sun, 19 Sep 2010 09:34:33 +0000 Subject: bridge : Sanitize skb before it enters the IP stack Related dicussion here : http://lkml.org/lkml/2010/9/3/16 Introduce a function br_parse_ip_options that will audit the skb and possibly refill IP options before a packet enters the IP stack. If no options are present, the function will zero out the skb cb area so that it is not misinterpreted as options by some unsuspecting IP layer routine. If packet consistency fails, drop it. Signed-off-by: Bandan Das Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 107 +++++++++++++++++++++++++++++++++------------- net/ipv4/ip_options.c | 3 +- 2 files changed, 80 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 137f232..77f7b5f 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -209,6 +209,72 @@ static inline void nf_bridge_update_protocol(struct sk_buff *skb) skb->protocol = htons(ETH_P_PPP_SES); } +/* When handing a packet over to the IP layer + * check whether we have a skb that is in the + * expected format + */ + +int br_parse_ip_options(struct sk_buff *skb) +{ + struct ip_options *opt; + struct iphdr *iph; + struct net_device *dev = skb->dev; + u32 len; + + iph = ip_hdr(skb); + opt = &(IPCB(skb)->opt); + + /* Basic sanity checks */ + if (iph->ihl < 5 || iph->version != 4) + goto inhdr_error; + + if (!pskb_may_pull(skb, iph->ihl*4)) + goto inhdr_error; + + iph = ip_hdr(skb); + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) + goto inhdr_error; + + len = ntohs(iph->tot_len); + if (skb->len < len) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INTRUNCATEDPKTS); + goto drop; + } else if (len < (iph->ihl*4)) + goto inhdr_error; + + if (pskb_trim_rcsum(skb, len)) { + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INDISCARDS); + goto drop; + } + + /* Zero out the CB buffer if no options present */ + if (iph->ihl == 5) { + memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + return 0; + } + + opt->optlen = iph->ihl*4 - sizeof(struct iphdr); + if (ip_options_compile(dev_net(dev), opt, skb)) + goto inhdr_error; + + /* Check correct handling of SRR option */ + if (unlikely(opt->srr)) { + struct in_device *in_dev = __in_dev_get_rcu(dev); + if (in_dev && !IN_DEV_SOURCE_ROUTE(in_dev)) + goto drop; + + if (ip_options_rcv_srr(skb)) + goto drop; + } + + return 0; + +inhdr_error: + IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_INHDRERRORS); +drop: + return -1; +} + /* Fill in the header for fragmented IP packets handled by * the IPv4 connection tracking code. */ @@ -549,7 +615,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, { struct net_bridge_port *p; struct net_bridge *br; - struct iphdr *iph; __u32 len = nf_bridge_encap_header_len(skb); if (unlikely(!pskb_may_pull(skb, len))) @@ -578,28 +643,9 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, nf_bridge_pull_encap_header_rcsum(skb); - if (!pskb_may_pull(skb, sizeof(struct iphdr))) - goto inhdr_error; - - iph = ip_hdr(skb); - if (iph->ihl < 5 || iph->version != 4) - goto inhdr_error; - - if (!pskb_may_pull(skb, 4 * iph->ihl)) - goto inhdr_error; - - iph = ip_hdr(skb); - if (ip_fast_csum((__u8 *) iph, iph->ihl) != 0) - goto inhdr_error; - - len = ntohs(iph->tot_len); - if (skb->len < len || len < 4 * iph->ihl) - goto inhdr_error; - - pskb_trim_rcsum(skb, len); - - /* BUG: Should really parse the IP options here. */ - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); + if (br_parse_ip_options(skb)) + /* Drop invalid packet */ + goto out; nf_bridge_put(skb->nf_bridge); if (!nf_bridge_alloc(skb)) @@ -614,8 +660,6 @@ static unsigned int br_nf_pre_routing(unsigned int hook, struct sk_buff *skb, return NF_STOLEN; -inhdr_error: -// IP_INC_STATS_BH(IpInHdrErrors); out: return NF_DROP; } @@ -759,14 +803,19 @@ static unsigned int br_nf_forward_arp(unsigned int hook, struct sk_buff *skb, #if defined(CONFIG_NF_CONNTRACK_IPV4) || defined(CONFIG_NF_CONNTRACK_IPV4_MODULE) static int br_nf_dev_queue_xmit(struct sk_buff *skb) { + int ret; + if (skb->nfct != NULL && skb->protocol == htons(ETH_P_IP) && skb->len + nf_bridge_mtu_reduction(skb) > skb->dev->mtu && !skb_is_gso(skb)) { - /* BUG: Should really parse the IP options here. */ - memset(IPCB(skb), 0, sizeof(struct inet_skb_parm)); - return ip_fragment(skb, br_dev_queue_push_xmit); + if (br_parse_ip_options(skb)) + /* Drop invalid packet */ + return NF_DROP; + ret = ip_fragment(skb, br_dev_queue_push_xmit); } else - return br_dev_queue_push_xmit(skb); + ret = br_dev_queue_push_xmit(skb); + + return ret; } #else static int br_nf_dev_queue_xmit(struct sk_buff *skb) diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index ba9836c..1906fa3 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -466,7 +466,7 @@ error: } return -EINVAL; } - +EXPORT_SYMBOL(ip_options_compile); /* * Undo all the changes done by ip_options_compile(). @@ -646,3 +646,4 @@ int ip_options_rcv_srr(struct sk_buff *skb) } return 0; } +EXPORT_SYMBOL(ip_options_rcv_srr); -- cgit v1.1 From 8990f468ae9010ab0af4be8f51bf7ab833a67202 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Sep 2010 00:12:11 +0000 Subject: net: rx_dropped accounting Under load, netif_rx() can drop incoming packets but administrators dont have a chance to spot which device needs some tuning (RPS activation for example) This patch adds rx_dropped accounting in vlans and tunnels. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.h | 2 ++ net/8021q/vlan_dev.c | 9 +++++++-- net/ipv4/ip_gre.c | 6 ++++-- net/ipv4/ipip.c | 5 ++++- net/ipv6/ip6_tunnel.c | 5 ++++- net/ipv6/ip6mr.c | 4 +++- net/ipv6/sit.c | 5 ++++- 7 files changed, 28 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 8d9503a..b26ce34 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -25,6 +25,7 @@ struct vlan_priority_tci_mapping { * @rx_multicast: number of received multicast packets * @syncp: synchronization point for 64bit counters * @rx_errors: number of errors + * @rx_dropped: number of dropped packets */ struct vlan_rx_stats { u64 rx_packets; @@ -32,6 +33,7 @@ struct vlan_rx_stats { u64 rx_multicast; struct u64_stats_sync syncp; unsigned long rx_errors; + unsigned long rx_dropped; }; /** diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 3bccdd1..94a1fed 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -225,7 +225,10 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, } } - netif_rx(skb); + if (unlikely(netif_rx(skb) == NET_RX_DROP)) { + if (rx_stats) + rx_stats->rx_dropped++; + } rcu_read_unlock(); return NET_RX_SUCCESS; @@ -843,13 +846,15 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st accum.rx_packets += rxpackets; accum.rx_bytes += rxbytes; accum.rx_multicast += rxmulticast; - /* rx_errors is an ulong, not protected by syncp */ + /* rx_errors, rx_dropped are ulong, not protected by syncp */ accum.rx_errors += p->rx_errors; + accum.rx_dropped += p->rx_dropped; } stats->rx_packets = accum.rx_packets; stats->rx_bytes = accum.rx_bytes; stats->rx_errors = accum.rx_errors; stats->multicast = accum.rx_multicast; + stats->rx_dropped = accum.rx_dropped; } return stats; } diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fc20e68..714b6a8 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -647,9 +647,11 @@ static int ipgre_rcv(struct sk_buff *skb) skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); - netif_rx(skb); + if (netif_rx(skb) == NET_RX_DROP) + stats->rx_dropped++; + rcu_read_unlock(); - return(0); + return 0; } icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 8de8888..babd252 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -377,7 +377,10 @@ static int ipip_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + tunnel->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9289cec..f6d9f68 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -725,7 +725,10 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, skb_tunnel_rx(skb, t->dev); dscp_ecn_decapsulate(t, ipv6h, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + t->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 66078da..2640c9b 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -666,7 +666,9 @@ static int pim6_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, reg_dev); - netif_rx(skb); + if (netif_rx(skb) == NET_RX_DROP) + reg_dev->stats.rx_dropped++; + dev_put(reg_dev); return 0; drop: diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 6822481..8a03998 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -564,7 +564,10 @@ static int ipip6_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, tunnel->dev); ipip6_ecn_decapsulate(iph, skb); - netif_rx(skb); + + if (netif_rx(skb) == NET_RX_DROP) + tunnel->dev->stats.rx_dropped++; + rcu_read_unlock(); return 0; } -- cgit v1.1 From 4874c131d79695e3d372042781a408a1a8a762d8 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:06:50 +0200 Subject: dccp: Add packet type information to CCID-specific option parsing This 1. adds packet type information to ccid_hc_{rx,tx}_parse_options(). This is necessary, since table 3 in RFC 4340, 5.8 leaves it to the CCIDs to state which options may (not) appear on what packet type. 2. adds such a check for CCID-3's {Loss Event, Receive} Rate as specified in RFC 4340 8.3 ("Receive Rate options MUST NOT be sent on DCCP-Data packets") and 8.5 ("Loss Event Rate options MUST NOT be sent on DCCP-Data packets"). 3. removes an unused argument `idx' from ccid_hc_{rx,tx}_parse_options(). This is also no longer necessary, since the CCID-specific option-parsing routines are passed every single parameter of the type-length-value option encoding. Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 46 +++++++++++++++++++++++----------------------- net/dccp/ccids/ccid3.c | 14 ++++++++------ net/dccp/options.c | 16 ++++------------ 3 files changed, 35 insertions(+), 41 deletions(-) (limited to 'net') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 6df6f8a..6d16a90 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -62,18 +62,14 @@ struct ccid_operations { void (*ccid_hc_tx_exit)(struct sock *sk); void (*ccid_hc_rx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_rx_parse_options)(struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value); + int (*ccid_hc_rx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); int (*ccid_hc_rx_insert_options)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_recv)(struct sock *sk, struct sk_buff *skb); - int (*ccid_hc_tx_parse_options)(struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value); + int (*ccid_hc_tx_parse_options)(struct sock *sk, u8 pkt, + u8 opt, u8 *val, u8 len); int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, @@ -168,27 +164,31 @@ static inline void ccid_hc_tx_packet_recv(struct ccid *ccid, struct sock *sk, ccid->ccid_ops->ccid_hc_tx_packet_recv(sk, skb); } +/** + * ccid_hc_tx_parse_options - Parse CCID-specific options sent by the receiver + * @pkt: type of packet that @opt appears on (RFC 4340, 5.1) + * @opt: the CCID-specific option type (RFC 4340, 5.8 and 10.3) + * @val: value of @opt + * @len: length of @val in bytes + */ static inline int ccid_hc_tx_parse_options(struct ccid *ccid, struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value) + u8 pkt, u8 opt, u8 *val, u8 len) { - int rc = 0; - if (ccid->ccid_ops->ccid_hc_tx_parse_options != NULL) - rc = ccid->ccid_ops->ccid_hc_tx_parse_options(sk, option, len, idx, - value); - return rc; + if (ccid->ccid_ops->ccid_hc_tx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_tx_parse_options(sk, pkt, opt, val, len); } +/** + * ccid_hc_rx_parse_options - Parse CCID-specific options sent by the sender + * Arguments are analogous to ccid_hc_tx_parse_options() + */ static inline int ccid_hc_rx_parse_options(struct ccid *ccid, struct sock *sk, - unsigned char option, - unsigned char len, u16 idx, - unsigned char* value) + u8 pkt, u8 opt, u8 *val, u8 len) { - int rc = 0; - if (ccid->ccid_ops->ccid_hc_rx_parse_options != NULL) - rc = ccid->ccid_ops->ccid_hc_rx_parse_options(sk, option, len, idx, value); - return rc; + if (ccid->ccid_ops->ccid_hc_rx_parse_options == NULL) + return 0; + return ccid->ccid_ops->ccid_hc_rx_parse_options(sk, pkt, opt, val, len); } static inline int ccid_hc_rx_insert_options(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index ce80591..be1b8ba 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -481,9 +481,8 @@ done_computing_x: jiffies + usecs_to_jiffies(t_nfb)); } -static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, - unsigned char len, u16 idx, - unsigned char *value) +static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, + u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); struct ccid3_options_received *opt_recv = &hc->tx_options_received; @@ -492,12 +491,15 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, unsigned char option, switch (option) { case TFRC_OPT_RECEIVE_RATE: case TFRC_OPT_LOSS_EVENT_RATE: - if (unlikely(len != 4)) { + /* Must be ignored on Data packets, cf. RFC 4342 8.3 and 8.5 */ + if (packet_type == DCCP_PKT_DATA) + break; + if (unlikely(optlen != 4)) { DCCP_WARN("%s(%p), invalid len %d for %u\n", - dccp_role(sk), sk, len, option); + dccp_role(sk), sk, optlen, option); return -EINVAL; } - opt_val = ntohl(get_unaligned((__be32 *)value)); + opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { opt_recv->ccid3or_receive_rate = opt_val; diff --git a/net/dccp/options.c b/net/dccp/options.c index bfda087..e4983e3 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -226,23 +226,15 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: { - const u16 idx = value - options; - + case 128 ... 191: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, - opt, len, idx, - value) != 0) + pkt_type, opt, value, len)) goto out_invalid_option; - } break; - case 192 ... 255: { - const u16 idx = value - options; - + case 192 ... 255: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, - opt, len, idx, - value) != 0) + pkt_type, opt, value, len)) goto out_invalid_option; - } break; default: DCCP_CRIT("DCCP(%p): option %d(len=%d) not " -- cgit v1.1 From a18213d1d2a469956845b437f5d1d0401ab22e8b Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:08:00 +0200 Subject: dccp: Replace magic CCID-specific numbers by symbolic constants The constants DCCPO_{MIN,MAX}_CCID_SPECIFIC are nowhere used in the code, but instead for the CCID-specific options numbers are used. This patch unifies the use of CCID-specific option numbers, by adding symbolic names reflecting the definitions in RFC 4340, 10.3. Signed-off-by: Gerrit Renker --- net/dccp/options.c | 13 +++---------- 1 file changed, 3 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/dccp/options.c b/net/dccp/options.c index e4983e3..9271851 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -96,18 +96,11 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, } /* - * CCID-Specific Options (from RFC 4340, sec. 10.3): - * - * Option numbers 128 through 191 are for options sent from the - * HC-Sender to the HC-Receiver; option numbers 192 through 255 - * are for options sent from the HC-Receiver to the HC-Sender. - * * CCID-specific options are ignored during connection setup, as * negotiation may still be in progress (see RFC 4340, 10.3). * The same applies to Ack Vectors, as these depend on the CCID. - * */ - if (dreq != NULL && (opt >= 128 || + if (dreq != NULL && (opt >= DCCPO_MIN_RX_CCID_SPECIFIC || opt == DCCPO_ACK_VECTOR_0 || opt == DCCPO_ACK_VECTOR_1)) goto ignore_option; @@ -226,12 +219,12 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_pr_debug("%s rx opt: ELAPSED_TIME=%d\n", dccp_role(sk), elapsed_time); break; - case 128 ... 191: + case DCCPO_MIN_RX_CCID_SPECIFIC ... DCCPO_MAX_RX_CCID_SPECIFIC: if (ccid_hc_rx_parse_options(dp->dccps_hc_rx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; break; - case 192 ... 255: + case DCCPO_MIN_TX_CCID_SPECIFIC ... DCCPO_MAX_TX_CCID_SPECIFIC: if (ccid_hc_tx_parse_options(dp->dccps_hc_tx_ccid, sk, pkt_type, opt, value, len)) goto out_invalid_option; -- cgit v1.1 From 80763dfbac4ed1e6dfe6ec08ef748e0e9aec3260 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:08:24 +0200 Subject: dccp ccid-3: remove dead states This patch is thanks to an investigation by Leandro Sales de Melo and his colleagues. They worked out two state diagrams which highlight the fact that the xxx_TERM states in CCID-3/4 are in fact not necessary. And this can be confirmed by in turn looking at the code: the xxx_TERM states are only ever set in ccid3_hc_{rx,tx}_exit(): when CCID-3 sets the state to xxx_TERM, it is at a time where no more processing should be going on, hence it is not necessary to introduce a dedicated exit state - this is already implied by unloading the CCID. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 37 +++++++++---------------------------- net/dccp/ccids/ccid3.h | 2 -- 2 files changed, 9 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index be1b8ba..90bc0a7 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -54,7 +54,6 @@ static const char *ccid3_tx_state_name(enum ccid3_hc_tx_states state) [TFRC_SSTATE_NO_SENT] = "NO_SENT", [TFRC_SSTATE_NO_FBACK] = "NO_FBACK", [TFRC_SSTATE_FBACK] = "FBACK", - [TFRC_SSTATE_TERM] = "TERM", }; return ccid3_state_names[state]; @@ -208,10 +207,13 @@ static void ccid3_hc_tx_no_feedback_timer(unsigned long data) ccid3_pr_debug("%s(%p, state=%s) - entry\n", dccp_role(sk), sk, ccid3_tx_state_name(hc->tx_state)); + /* Ignore and do not restart after leaving the established state */ + if ((1 << sk->sk_state) & ~(DCCPF_OPEN | DCCPF_PARTOPEN)) + goto out; + + /* Reset feedback state to "no feedback received" */ if (hc->tx_state == TFRC_SSTATE_FBACK) ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - else if (hc->tx_state != TFRC_SSTATE_NO_FBACK) - goto out; /* * Determine new allowed sending rate X as per draft rfc3448bis-00, 4.4 @@ -287,8 +289,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) if (unlikely(skb->len == 0)) return -EBADMSG; - switch (hc->tx_state) { - case TFRC_SSTATE_NO_SENT: + if (hc->tx_state == TFRC_SSTATE_NO_SENT) { sk_reset_timer(sk, &hc->tx_no_feedback_timer, (jiffies + usecs_to_jiffies(TFRC_INITIAL_TIMEOUT))); hc->tx_last_win_count = 0; @@ -323,9 +324,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) ccid3_update_send_interval(hc); ccid3_hc_tx_set_state(sk, TFRC_SSTATE_NO_FBACK); - break; - case TFRC_SSTATE_NO_FBACK: - case TFRC_SSTATE_FBACK: + + } else { delay = ktime_us_delta(hc->tx_t_nom, now); ccid3_pr_debug("delay=%ld\n", (long)delay); /* @@ -340,10 +340,6 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return (u32)delay / USEC_PER_MSEC; ccid3_hc_tx_update_win_count(hc, now); - break; - case TFRC_SSTATE_TERM: - DCCP_BUG("%s(%p) - Illegal state TERM", dccp_role(sk), sk); - return -EINVAL; } /* prepare to send now (add options etc.) */ @@ -379,11 +375,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_DATAACK)) return; - /* ... and only in the established state */ - if (hc->tx_state != TFRC_SSTATE_FBACK && - hc->tx_state != TFRC_SSTATE_NO_FBACK) - return; - /* * Locate the acknowledged packet in the TX history. * @@ -529,9 +520,7 @@ static void ccid3_hc_tx_exit(struct sock *sk) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - ccid3_hc_tx_set_state(sk, TFRC_SSTATE_TERM); sk_stop_timer(sk, &hc->tx_no_feedback_timer); - tfrc_tx_hist_purge(&hc->tx_hist); } @@ -590,7 +579,6 @@ static const char *ccid3_rx_state_name(enum ccid3_hc_rx_states state) static const char *const ccid3_rx_state_names[] = { [TFRC_RSTATE_NO_DATA] = "NO_DATA", [TFRC_RSTATE_DATA] = "DATA", - [TFRC_RSTATE_TERM] = "TERM", }; return ccid3_rx_state_names[state]; @@ -616,14 +604,9 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk, { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); struct dccp_sock *dp = dccp_sk(sk); - ktime_t now; + ktime_t now = ktime_get_real(); s64 delta = 0; - if (unlikely(hc->rx_state == TFRC_RSTATE_TERM)) - return; - - now = ktime_get_real(); - switch (fbtype) { case CCID3_FBACK_INITIAL: hc->rx_x_recv = 0; @@ -827,8 +810,6 @@ static void ccid3_hc_rx_exit(struct sock *sk) { struct ccid3_hc_rx_sock *hc = ccid3_hc_rx_sk(sk); - ccid3_hc_rx_set_state(sk, TFRC_RSTATE_TERM); - tfrc_rx_hist_purge(&hc->rx_hist); tfrc_lh_cleanup(&hc->rx_li_hist); } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 9eb90b8..89b047b 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -77,7 +77,6 @@ enum ccid3_hc_tx_states { TFRC_SSTATE_NO_SENT = 1, TFRC_SSTATE_NO_FBACK, TFRC_SSTATE_FBACK, - TFRC_SSTATE_TERM, }; /** @@ -130,7 +129,6 @@ static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) enum ccid3_hc_rx_states { TFRC_RSTATE_NO_DATA = 1, TFRC_RSTATE_DATA, - TFRC_RSTATE_TERM = 127, }; /** -- cgit v1.1 From 792e6d3389061ad449429b9ba228eb758c247ea0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:10:52 +0200 Subject: dccp tfrc/ccid-3: computing the loss rate from the Loss Event Rate This adds a function to take care of the following, separate cases occurring in the computation of the Loss Rate p: * 1/(2^32-1) is mapped into 0% as per RFC 4342, 8.5; * 1/0 is mapped into 100%, the maximum; * to avoid that p = 1/x is rounded down to 0 when x is very large, since this means accidentally re-entering slow-start indicated by p == 0, the minimum resolution value of p is now returned instead; * a bug in ccid3_hc_rx_getsockopt is fixed: 1/0 was mapped into ~0U. Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 9 ++++----- net/dccp/ccids/lib/tfrc.h | 1 + net/dccp/ccids/lib/tfrc_equation.c | 14 ++++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 90bc0a7..9715eeb 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -400,10 +400,10 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Update loss event rate (which is scaled by 1e6) */ pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == ~0U || pinv == 0) /* see RFC 4342, 8.5 */ + if (pinv == 0) hc->tx_p = 0; - else /* can not exceed 100% */ - hc->tx_p = scaled_div(1, pinv); + else + hc->tx_p = tfrc_invert_loss_event_rate(pinv); /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 @@ -834,8 +834,7 @@ static int ccid3_hc_rx_getsockopt(struct sock *sk, const int optname, int len, return -EINVAL; rx_info.tfrcrx_x_recv = hc->rx_x_recv; rx_info.tfrcrx_rtt = hc->rx_rtt; - rx_info.tfrcrx_p = hc->rx_pinv == 0 ? ~0U : - scaled_div(1, hc->rx_pinv); + rx_info.tfrcrx_p = tfrc_invert_loss_event_rate(hc->rx_pinv); len = sizeof(rx_info); val = &rx_info; break; diff --git a/net/dccp/ccids/lib/tfrc.h b/net/dccp/ccids/lib/tfrc.h index 01bb48e..f8ee3f5 100644 --- a/net/dccp/ccids/lib/tfrc.h +++ b/net/dccp/ccids/lib/tfrc.h @@ -57,6 +57,7 @@ static inline u32 tfrc_ewma(const u32 avg, const u32 newval, const u8 weight) extern u32 tfrc_calc_x(u16 s, u32 R, u32 p); extern u32 tfrc_calc_x_reverse_lookup(u32 fvalue); +extern u32 tfrc_invert_loss_event_rate(u32 loss_event_rate); extern int tfrc_tx_packet_history_init(void); extern void tfrc_tx_packet_history_exit(void); diff --git a/net/dccp/ccids/lib/tfrc_equation.c b/net/dccp/ccids/lib/tfrc_equation.c index 22ca1cf..a052a43 100644 --- a/net/dccp/ccids/lib/tfrc_equation.c +++ b/net/dccp/ccids/lib/tfrc_equation.c @@ -687,3 +687,17 @@ u32 tfrc_calc_x_reverse_lookup(u32 fvalue) index = tfrc_binsearch(fvalue, 0); return (index + 1) * 1000000 / TFRC_CALC_X_ARRSIZE; } + +/** + * tfrc_invert_loss_event_rate - Compute p so that 10^6 corresponds to 100% + * When @loss_event_rate is large, there is a chance that p is truncated to 0. + * To avoid re-entering slow-start in that case, we set p = TFRC_SMALLEST_P > 0. + */ +u32 tfrc_invert_loss_event_rate(u32 loss_event_rate) +{ + if (loss_event_rate == UINT_MAX) /* see RFC 4342, 8.5 */ + return 0; + if (unlikely(loss_event_rate == 0)) /* map 1/0 into 100% */ + return 1000000; + return max_t(u32, scaled_div(1, loss_event_rate), TFRC_SMALLEST_P); +} -- cgit v1.1 From 536bb20b45dee3f9b77b0d250f8ed0733a5cb025 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Sun, 19 Sep 2010 20:14:23 +0200 Subject: dccp ccid-3: Remove redundant 'options_received' struct The `options_received' struct is redundant, since it re-duplicates the existing `p' and `x_recv' fields. This patch removes the sub-struct and migrates the format conversion operations to ccid3_hc_tx_parse_options(). Signed-off-by: Gerrit Renker --- net/dccp/ccids/ccid3.c | 24 ++++++++---------------- net/dccp/ccids/ccid3.h | 7 ------- 2 files changed, 8 insertions(+), 23 deletions(-) (limited to 'net') diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 9715eeb..c3f3a25 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -365,11 +365,10 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv = &hc->tx_options_received; struct tfrc_tx_hist_entry *acked; ktime_t now; unsigned long t_nfb; - u32 pinv, r_sample; + u32 r_sample; /* we are only interested in ACKs */ if (!(DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK || @@ -394,17 +393,6 @@ static void ccid3_hc_tx_packet_recv(struct sock *sk, struct sk_buff *skb) r_sample = dccp_sample_rtt(sk, ktime_us_delta(now, acked->stamp)); hc->tx_rtt = tfrc_ewma(hc->tx_rtt, r_sample, 9); - /* Update receive rate in units of 64 * bytes/second */ - hc->tx_x_recv = opt_recv->ccid3or_receive_rate; - hc->tx_x_recv <<= 6; - - /* Update loss event rate (which is scaled by 1e6) */ - pinv = opt_recv->ccid3or_loss_event_rate; - if (pinv == 0) - hc->tx_p = 0; - else - hc->tx_p = tfrc_invert_loss_event_rate(pinv); - /* * Update allowed sending rate X as per draft rfc3448bis-00, 4.2/3 */ @@ -476,7 +464,6 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, u8 option, u8 *optval, u8 optlen) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); - struct ccid3_options_received *opt_recv = &hc->tx_options_received; __be32 opt_val; switch (option) { @@ -493,11 +480,16 @@ static int ccid3_hc_tx_parse_options(struct sock *sk, u8 packet_type, opt_val = ntohl(get_unaligned((__be32 *)optval)); if (option == TFRC_OPT_RECEIVE_RATE) { - opt_recv->ccid3or_receive_rate = opt_val; + /* Receive Rate is kept in units of 64 bytes/second */ + hc->tx_x_recv = opt_val; + hc->tx_x_recv <<= 6; + ccid3_pr_debug("%s(%p), RECEIVE_RATE=%u\n", dccp_role(sk), sk, opt_val); } else { - opt_recv->ccid3or_loss_event_rate = opt_val; + /* Update the fixpoint Loss Event Rate fraction */ + hc->tx_p = tfrc_invert_loss_event_rate(opt_val); + ccid3_pr_debug("%s(%p), LOSS_EVENT_RATE=%u\n", dccp_role(sk), sk, opt_val); } diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 89b047b..1a9933c 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -67,11 +67,6 @@ enum ccid3_options { TFRC_OPT_RECEIVE_RATE = 194, }; -struct ccid3_options_received { - u32 ccid3or_loss_event_rate; - u32 ccid3or_receive_rate; -}; - /* TFRC sender states */ enum ccid3_hc_tx_states { TFRC_SSTATE_NO_SENT = 1, @@ -97,7 +92,6 @@ enum ccid3_hc_tx_states { * @tx_t_ld: Time last doubled during slow start * @tx_t_nom: Nominal send time of next packet * @tx_hist: Packet history - * @tx_options_received: Parsed set of retrieved options */ struct ccid3_hc_tx_sock { u64 tx_x; @@ -115,7 +109,6 @@ struct ccid3_hc_tx_sock { ktime_t tx_t_ld; ktime_t tx_t_nom; struct tfrc_tx_hist_entry *tx_hist; - struct ccid3_options_received tx_options_received; }; static inline struct ccid3_hc_tx_sock *ccid3_hc_tx_sk(const struct sock *sk) -- cgit v1.1 From f4bc17cdd205ebaa3807c2aa973719bb5ce6a5b2 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 21 Sep 2010 17:35:41 +0200 Subject: ipvs: netfilter connection tracking changes Add more code to IPVS to work with Netfilter connection tracking and fix some problems. - Allow IPVS to be compiled without connection tracking as in 2.6.35 and before. This can avoid keeping conntracks for all IPVS connections because this costs memory. ip_vs_ftp still depends on connection tracking and NAT as implemented for 2.6.36. - Add sysctl var "conntrack" to enable connection tracking for all IPVS connections. For loaded IPVS directors it needs tuning of nf_conntrack_max limit. - Add IP_VS_CONN_F_NFCT connection flag to request the connection to use connection tracking. This allows user space to provide this flag, for example, in dest->conn_flags. This can be useful to request connection tracking per real server instead of forcing it for all connections with the "conntrack" sysctl. This flag is set currently only by ip_vs_ftp and of course by "conntrack" sysctl. - Add ip_vs_nfct.c file to hold all connection tracking code, by this way main code should not depend of netfilter conntrack support. - Return back the ip_vs_post_routing handler as in 2.6.35 and use skb->ipvs_property=1 to allow IPVS to work without connection tracking Connection tracking: - most of the code is already in 2.6.36-rc - alter conntrack reply tuple for LVS-NAT connections when first packet from client is forwarded and conntrack state is NEW or RELATED. Additionally, alter reply for RELATED connections from real server, again for packet in original direction. - add IP_VS_XMIT_TUNNEL to confirm conntrack (without altering reply) for LVS-TUN early because we want to call nf_reset. It is needed because we add IPIP header and the original conntrack should be preserved, not destroyed. The transmitted IPIP packets can reuse same conntrack, so we do not set skb->ipvs_property. - try to destroy conntrack when the IPVS connection is destroyed. It is not fatal if conntrack disappears before that, it depends on the used timers. Fix problems from long time: - add skb->ip_summed = CHECKSUM_NONE for the LVS-TUN transmitters Signed-off-by: Julian Anastasov Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/Kconfig | 13 +- net/netfilter/ipvs/Makefile | 5 +- net/netfilter/ipvs/ip_vs_conn.c | 13 ++ net/netfilter/ipvs/ip_vs_core.c | 46 ++++++- net/netfilter/ipvs/ip_vs_ctl.c | 12 ++ net/netfilter/ipvs/ip_vs_ftp.c | 146 +------------------- net/netfilter/ipvs/ip_vs_nfct.c | 292 ++++++++++++++++++++++++++++++++++++++++ net/netfilter/ipvs/ip_vs_xmit.c | 98 +++++++------- 8 files changed, 430 insertions(+), 195 deletions(-) create mode 100644 net/netfilter/ipvs/ip_vs_nfct.c (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index 46a77d5..af3c9f4 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -3,7 +3,7 @@ # menuconfig IP_VS tristate "IP virtual server support" - depends on NET && INET && NETFILTER && NF_CONNTRACK + depends on NET && INET && NETFILTER ---help--- IP Virtual Server support will let you build a high-performance virtual server based on cluster of two or more real servers. This @@ -235,7 +235,8 @@ comment 'IPVS application helper' config IP_VS_FTP tristate "FTP protocol helper" - depends on IP_VS_PROTO_TCP && NF_NAT + depends on IP_VS_PROTO_TCP && NF_CONNTRACK && NF_NAT + select IP_VS_NFCT ---help--- FTP is a protocol that transfers IP address and/or port number in the payload. In the virtual server via Network Address Translation, @@ -247,4 +248,12 @@ config IP_VS_FTP If you want to compile it in kernel, say Y. To compile it as a module, choose M here. If unsure, say N. +config IP_VS_NFCT + bool "Netfilter connection tracking" + depends on NF_CONNTRACK + ---help--- + The Netfilter connection tracking support allows the IPVS + connection state to be exported to the Netfilter framework + for filtering purposes. + endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index e3baefd..349fe88 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -9,10 +9,13 @@ ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_UDP) += ip_vs_proto_udp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_AH_ESP) += ip_vs_proto_ah_esp.o ip_vs_proto-objs-$(CONFIG_IP_VS_PROTO_SCTP) += ip_vs_proto_sctp.o +ip_vs-extra_objs-y := +ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o + ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ ip_vs_est.o ip_vs_proto.o \ - $(ip_vs_proto-objs-y) + $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) # IPVS core diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 9fe1da7..a970d96 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -721,6 +721,9 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->control) ip_vs_control_del(cp); + if (cp->flags & IP_VS_CONN_F_NFCT) + ip_vs_conn_drop_conntrack(cp); + if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); @@ -816,6 +819,16 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, if (unlikely(pp && atomic_read(&pp->appcnt))) ip_vs_bind_app(cp, pp); + /* + * Allow conntrack to be preserved. By default, conntrack + * is created and destroyed for every packet. + * Sometimes keeping conntrack can be useful for + * IP_VS_CONN_F_ONE_PACKET too. + */ + + if (ip_vs_conntrack_enabled()) + cp->flags |= IP_VS_CONN_F_NFCT; + /* Hash it in the ip_vs_conn_tab finally */ ip_vs_conn_hash(cp); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 319991d..7fbc80d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -537,6 +537,23 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } +/* + * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING + * chain and is used to avoid double NAT and confirmation when we do + * not want to keep the conntrack structure + */ +static unsigned int ip_vs_post_routing(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + if (!skb->ipvs_property) + return NF_ACCEPT; + /* The packet was sent from IPVS, exit this chain */ + return NF_STOP; +} + __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -695,7 +712,10 @@ static int handle_response_icmp(int af, struct sk_buff *skb, /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); - skb->ipvs_property = 1; + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + skb->ipvs_property = 1; + else + ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; out: @@ -928,17 +948,19 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); - ip_vs_update_conntrack(skb, cp, 0); + if (!(cp->flags & IP_VS_CONN_F_NFCT)) + skb->ipvs_property = 1; + else + ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); - skb->ipvs_property = 1; - LeaveFunction(11); return NF_ACCEPT; drop: ip_vs_conn_put(cp); kfree_skb(skb); + LeaveFunction(11); return NF_STOLEN; } @@ -1483,6 +1505,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP_PRI_NAT_SRC-1, + }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1511,6 +1541,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* Before the netfilter connection tracking, exit from POST_ROUTING */ + { + .hook = ip_vs_post_routing, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_POST_ROUTING, + .priority = NF_IP6_PRI_NAT_SRC-1, + }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 7bd41d2..d2d842f 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -88,6 +88,9 @@ int sysctl_ip_vs_expire_nodest_conn = 0; int sysctl_ip_vs_expire_quiescent_template = 0; int sysctl_ip_vs_sync_threshold[2] = { 3, 50 }; int sysctl_ip_vs_nat_icmp_send = 0; +#ifdef CONFIG_IP_VS_NFCT +int sysctl_ip_vs_conntrack; +#endif #ifdef CONFIG_IP_VS_DEBUG @@ -1580,6 +1583,15 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_do_defense_mode, }, +#ifdef CONFIG_IP_VS_NFCT + { + .procname = "conntrack", + .data = &sysctl_ip_vs_conntrack, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, +#endif { .procname = "secure_tcp", .data = &sysctl_ip_vs_secure_tcp, diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 7e9af5b..9cd375f 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -20,17 +20,6 @@ * * Author: Wouter Gadeyne * - * - * Code for ip_vs_expect_related and ip_vs_expect_callback is taken from - * http://www.ssi.bg/~ja/nfct/: - * - * ip_vs_nfct.c: Netfilter connection tracking support for IPVS - * - * Portions Copyright (C) 2001-2002 - * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. - * - * Portions Copyright (C) 2003-2008 - * Julian Anastasov */ #define KMSG_COMPONENT "IPVS" @@ -58,16 +47,6 @@ #define SERVER_STRING "227 Entering Passive Mode (" #define CLIENT_STRING "PORT " -#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" -#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ - &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ - (T)->dst.protonum - -#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" -#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ - &((C)->vaddr.ip), ntohs((C)->vport), \ - &((C)->daddr.ip), ntohs((C)->dport), \ - (C)->protocol, (C)->state /* * List of ports (up to IP_VS_APP_MAX_PORTS) to be handled by helper @@ -85,6 +64,8 @@ static int ip_vs_ftp_pasv; static int ip_vs_ftp_init_conn(struct ip_vs_app *app, struct ip_vs_conn *cp) { + /* We use connection tracking for the command connection */ + cp->flags |= IP_VS_CONN_F_NFCT; return 0; } @@ -149,120 +130,6 @@ static int ip_vs_ftp_get_addrport(char *data, char *data_limit, } /* - * Called from init_conntrack() as expectfn handler. - */ -static void -ip_vs_expect_callback(struct nf_conn *ct, - struct nf_conntrack_expect *exp) -{ - struct nf_conntrack_tuple *orig, new_reply; - struct ip_vs_conn *cp; - - if (exp->tuple.src.l3num != PF_INET) - return; - - /* - * We assume that no NF locks are held before this callback. - * ip_vs_conn_out_get and ip_vs_conn_in_get should match their - * expectations even if they use wildcard values, now we provide the - * actual values from the newly created original conntrack direction. - * The conntrack is confirmed when packet reaches IPVS hooks. - */ - - /* RS->CLIENT */ - orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); - if (cp) { - /* Change reply CLIENT->RS to CLIENT->VS */ - new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found inout cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - new_reply.dst.u3 = cp->vaddr; - new_reply.dst.u.tcp.port = cp->vport; - IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE - ", inout cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - goto alter; - } - - /* CLIENT->VS */ - cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); - if (cp) { - /* Change reply VS->CLIENT to RS->CLIENT */ - new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " - FMT_TUPLE ", found outin cp=" FMT_CONN "\n", - __func__, ct, ct->status, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - new_reply.src.u3 = cp->daddr; - new_reply.src.u.tcp.port = cp->dport; - IP_VS_DBG(7, "%s(): ct=%p, new tuples=" FMT_TUPLE ", " - FMT_TUPLE ", outin cp=" FMT_CONN "\n", - __func__, ct, - ARG_TUPLE(orig), ARG_TUPLE(&new_reply), - ARG_CONN(cp)); - goto alter; - } - - IP_VS_DBG(7, "%s(): ct=%p, status=0x%lX, tuple=" FMT_TUPLE - " - unknown expect\n", - __func__, ct, ct->status, ARG_TUPLE(orig)); - return; - -alter: - /* Never alter conntrack for non-NAT conns */ - if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) - nf_conntrack_alter_reply(ct, &new_reply); - ip_vs_conn_put(cp); - return; -} - -/* - * Create NF conntrack expectation with wildcard (optional) source port. - * Then the default callback function will alter the reply and will confirm - * the conntrack entry when the first packet comes. - */ -static void -ip_vs_expect_related(struct sk_buff *skb, struct nf_conn *ct, - struct ip_vs_conn *cp, u_int8_t proto, - const __be16 *port, int from_rs) -{ - struct nf_conntrack_expect *exp; - - BUG_ON(!ct || ct == &nf_conntrack_untracked); - - exp = nf_ct_expect_alloc(ct); - if (!exp) - return; - - if (from_rs) - nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, - nf_ct_l3num(ct), &cp->daddr, &cp->caddr, - proto, port, &cp->cport); - else - nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, - nf_ct_l3num(ct), &cp->caddr, &cp->vaddr, - proto, port, &cp->vport); - - exp->expectfn = ip_vs_expect_callback; - - IP_VS_DBG(7, "%s(): ct=%p, expect tuple=" FMT_TUPLE "\n", - __func__, ct, ARG_TUPLE(&exp->tuple)); - nf_ct_expect_related(exp); - nf_ct_expect_put(exp); -} - -/* * Look at outgoing ftp packets to catch the response to a PASV command * from the server (inside-to-outside). * When we see one, we build a connection entry with the client address, @@ -335,7 +202,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, &cp->caddr, 0, &cp->vaddr, port, &from, port, - IP_VS_CONN_F_NO_CPORT, + IP_VS_CONN_F_NO_CPORT | + IP_VS_CONN_F_NFCT, cp->dest); if (!n_cp) return 0; @@ -371,8 +239,8 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, start-data, end-start, buf, buf_len); if (ret) - ip_vs_expect_related(skb, ct, n_cp, - IPPROTO_TCP, NULL, 0); + ip_vs_nfct_expect_related(skb, ct, n_cp, + IPPROTO_TCP, 0, 0); } /* @@ -487,7 +355,7 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), &cp->daddr, htons(ntohs(cp->dport)-1), - 0, + IP_VS_CONN_F_NFCT, cp->dest); if (!n_cp) return 0; diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c new file mode 100644 index 0000000..c038458 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -0,0 +1,292 @@ +/* + * ip_vs_nfct.c: Netfilter connection tracking support for IPVS + * + * Portions Copyright (C) 2001-2002 + * Antefacto Ltd, 181 Parnell St, Dublin 1, Ireland. + * + * Portions Copyright (C) 2003-2010 + * Julian Anastasov + * + * + * This code is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + * + * + * Authors: + * Ben North + * Julian Anastasov Reorganize and sync with latest kernels + * Hannes Eder Extend NFCT support for FTP, ipvs match + * + * + * Current status: + * + * - provide conntrack confirmation for new and related connections, by + * this way we can see their proper conntrack state in all hooks + * - support for all forwarding methods, not only NAT + * - FTP support (NAT), ability to support other NAT apps with expectations + * - to correctly create expectations for related NAT connections the proper + * NF conntrack support must be already installed, eg. ip_vs_ftp requires + * nf_conntrack_ftp ... iptables_nat for the same ports (but no iptables + * NAT rules are needed) + * - alter reply for NAT when forwarding packet in original direction: + * conntrack from client in NEW or RELATED (Passive FTP DATA) state or + * when RELATED conntrack is created from real server (Active FTP DATA) + * - if iptables_nat is not loaded the Passive FTP will not work (the + * PASV response can not be NAT-ed) but Active FTP should work + * + */ + +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +#define FMT_TUPLE "%pI4:%u->%pI4:%u/%u" +#define ARG_TUPLE(T) &(T)->src.u3.ip, ntohs((T)->src.u.all), \ + &(T)->dst.u3.ip, ntohs((T)->dst.u.all), \ + (T)->dst.protonum + +#define FMT_CONN "%pI4:%u->%pI4:%u->%pI4:%u/%u:%u" +#define ARG_CONN(C) &((C)->caddr.ip), ntohs((C)->cport), \ + &((C)->vaddr.ip), ntohs((C)->vport), \ + &((C)->daddr.ip), ntohs((C)->dport), \ + (C)->protocol, (C)->state + +void +ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) +{ + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + struct nf_conntrack_tuple new_tuple; + + if (ct == NULL || nf_ct_is_confirmed(ct) || nf_ct_is_untracked(ct) || + nf_ct_is_dying(ct)) + return; + + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ) + return; + + /* Alter reply only in original direction */ + if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL) + return; + + /* + * The connection is not yet in the hashtable, so we update it. + * CIP->VIP will remain the same, so leave the tuple in + * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the + * real-server we will see RIP->DIP. + */ + new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + /* + * This will also take care of UDP and other protocols. + */ + if (outin) { + new_tuple.src.u3 = cp->daddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.src.u.tcp.port = cp->dport; + } else { + new_tuple.dst.u3 = cp->vaddr; + if (new_tuple.dst.protonum != IPPROTO_ICMP && + new_tuple.dst.protonum != IPPROTO_ICMPV6) + new_tuple.dst.u.tcp.port = cp->vport; + } + IP_VS_DBG(7, "%s: Updating conntrack ct=%p, status=0x%lX, " + "ctinfo=%d, old reply=" FMT_TUPLE + ", new reply=" FMT_TUPLE ", cp=" FMT_CONN "\n", + __func__, ct, ct->status, ctinfo, + ARG_TUPLE(&ct->tuplehash[IP_CT_DIR_REPLY].tuple), + ARG_TUPLE(&new_tuple), ARG_CONN(cp)); + nf_conntrack_alter_reply(ct, &new_tuple); +} + +int ip_vs_confirm_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp) +{ + return nf_conntrack_confirm(skb); +} + +/* + * Called from init_conntrack() as expectfn handler. + */ +static void ip_vs_nfct_expect_callback(struct nf_conn *ct, + struct nf_conntrack_expect *exp) +{ + struct nf_conntrack_tuple *orig, new_reply; + struct ip_vs_conn *cp; + + if (exp->tuple.src.l3num != PF_INET) + return; + + /* + * We assume that no NF locks are held before this callback. + * ip_vs_conn_out_get and ip_vs_conn_in_get should match their + * expectations even if they use wildcard values, now we provide the + * actual values from the newly created original conntrack direction. + * The conntrack is confirmed when packet reaches IPVS hooks. + */ + + /* RS->CLIENT */ + orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply CLIENT->RS to CLIENT->VS */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found inout cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.dst.u3 = cp->vaddr; + new_reply.dst.u.tcp.port = cp->vport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " FMT_TUPLE + ", inout cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + /* CLIENT->VS */ + cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port); + if (cp) { + /* Change reply VS->CLIENT to RS->CLIENT */ + new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuples=" FMT_TUPLE ", " + FMT_TUPLE ", found outin cp=" FMT_CONN "\n", + __func__, ct, ct->status, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + new_reply.src.u3 = cp->daddr; + new_reply.src.u.tcp.port = cp->dport; + IP_VS_DBG(7, "%s: ct=%p, new tuples=" FMT_TUPLE ", " + FMT_TUPLE ", outin cp=" FMT_CONN "\n", + __func__, ct, + ARG_TUPLE(orig), ARG_TUPLE(&new_reply), + ARG_CONN(cp)); + goto alter; + } + + IP_VS_DBG(7, "%s: ct=%p, status=0x%lX, tuple=" FMT_TUPLE + " - unknown expect\n", + __func__, ct, ct->status, ARG_TUPLE(orig)); + return; + +alter: + /* Never alter conntrack for non-NAT conns */ + if (IP_VS_FWD_METHOD(cp) == IP_VS_CONN_F_MASQ) + nf_conntrack_alter_reply(ct, &new_reply); + ip_vs_conn_put(cp); + return; +} + +/* + * Create NF conntrack expectation with wildcard (optional) source port. + * Then the default callback function will alter the reply and will confirm + * the conntrack entry when the first packet comes. + * Use port 0 to expect connection from any port. + */ +void ip_vs_nfct_expect_related(struct sk_buff *skb, struct nf_conn *ct, + struct ip_vs_conn *cp, u_int8_t proto, + const __be16 port, int from_rs) +{ + struct nf_conntrack_expect *exp; + + if (ct == NULL || nf_ct_is_untracked(ct)) + return; + + exp = nf_ct_expect_alloc(ct); + if (!exp) + return; + + nf_ct_expect_init(exp, NF_CT_EXPECT_CLASS_DEFAULT, nf_ct_l3num(ct), + from_rs ? &cp->daddr : &cp->caddr, + from_rs ? &cp->caddr : &cp->vaddr, + proto, port ? &port : NULL, + from_rs ? &cp->cport : &cp->vport); + + exp->expectfn = ip_vs_nfct_expect_callback; + + IP_VS_DBG(7, "%s: ct=%p, expect tuple=" FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&exp->tuple)); + nf_ct_expect_related(exp); + nf_ct_expect_put(exp); +} +EXPORT_SYMBOL(ip_vs_nfct_expect_related); + +/* + * Our connection was terminated, try to drop the conntrack immediately + */ +void ip_vs_conn_drop_conntrack(struct ip_vs_conn *cp) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conn *ct; + struct nf_conntrack_tuple tuple; + + if (!cp->cport) + return; + + tuple = (struct nf_conntrack_tuple) { + .dst = { .protonum = cp->protocol, .dir = IP_CT_DIR_ORIGINAL } }; + tuple.src.u3 = cp->caddr; + tuple.src.u.all = cp->cport; + tuple.src.l3num = cp->af; + tuple.dst.u3 = cp->vaddr; + tuple.dst.u.all = cp->vport; + + IP_VS_DBG(7, "%s: dropping conntrack with tuple=" FMT_TUPLE + " for conn " FMT_CONN "\n", + __func__, ARG_TUPLE(&tuple), ARG_CONN(cp)); + + h = nf_conntrack_find_get(&init_net, NF_CT_DEFAULT_ZONE, &tuple); + if (h) { + ct = nf_ct_tuplehash_to_ctrack(h); + /* Show what happens instead of calling nf_ct_kill() */ + if (del_timer(&ct->timeout)) { + IP_VS_DBG(7, "%s: ct=%p, deleted conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + if (ct->timeout.function) + ct->timeout.function(ct->timeout.data); + } else { + IP_VS_DBG(7, "%s: ct=%p, no conntrack timer for tuple=" + FMT_TUPLE "\n", + __func__, ct, ARG_TUPLE(&tuple)); + } + nf_ct_put(ct); + } else { + IP_VS_DBG(7, "%s: no conntrack for tuple=" FMT_TUPLE "\n", + __func__, ARG_TUPLE(&tuple)); + } +} + diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 49df6be..8817afa 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -28,7 +28,6 @@ #include #include #include -#include #include #include @@ -194,12 +193,37 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) dst_release(old_dst); } -#define IP_VS_XMIT(pf, skb, rt) \ +#define IP_VS_XMIT_TUNNEL(skb, cp) \ +({ \ + int __ret = NF_ACCEPT; \ + \ + if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ + __ret = ip_vs_confirm_conntrack(skb, cp); \ + if (__ret == NF_ACCEPT) { \ + nf_reset(skb); \ + (skb)->ip_summed = CHECKSUM_NONE; \ + } \ + __ret; \ +}) + +#define IP_VS_XMIT_NAT(pf, skb, cp) \ do { \ - (skb)->ipvs_property = 1; \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + (skb)->ipvs_property = 1; \ + else \ + ip_vs_update_conntrack(skb, cp, 1); \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ - (rt)->dst.dev, dst_output); \ + skb_dst(skb)->dev, dst_output); \ +} while (0) + +#define IP_VS_XMIT(pf, skb, cp) \ +do { \ + if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ + (skb)->ipvs_property = 1; \ + skb_forward_csum(skb); \ + NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ + skb_dst(skb)->dev, dst_output); \ } while (0) @@ -271,7 +295,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -335,7 +359,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -349,36 +373,6 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, } #endif -void -ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin) -{ - struct nf_conn *ct = (struct nf_conn *)skb->nfct; - struct nf_conntrack_tuple new_tuple; - - if (ct == NULL || nf_ct_is_untracked(ct) || nf_ct_is_confirmed(ct)) - return; - - /* - * The connection is not yet in the hashtable, so we update it. - * CIP->VIP will remain the same, so leave the tuple in - * IP_CT_DIR_ORIGINAL untouched. When the reply comes back from the - * real-server we will see RIP->DIP. - */ - new_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; - if (outin) - new_tuple.src.u3 = cp->daddr; - else - new_tuple.dst.u3 = cp->vaddr; - /* - * This will also take care of UDP and other protocols. - */ - if (outin) - new_tuple.src.u.tcp.port = cp->dport; - else - new_tuple.dst.u.tcp.port = cp->vport; - nf_conntrack_alter_reply(ct, &new_tuple); -} - /* * NAT transmitter (only for outside-to-inside nat forwarding) * Not used for related ICMP @@ -434,8 +428,6 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp, 1); - /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -443,7 +435,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -451,8 +443,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, tx_error_icmp: dst_link_failure(skb); tx_error: - LeaveFunction(10); kfree_skb(skb); + LeaveFunction(10); return NF_STOLEN; tx_error_put: ip_rt_put(rt); @@ -512,8 +504,6 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); - ip_vs_update_conntrack(skb, cp, 1); - /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still MTU problem. */ @@ -521,7 +511,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -571,6 +561,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; + int ret; EnterFunction(10); @@ -655,7 +646,11 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ip_local_out(skb); + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); LeaveFunction(10); @@ -681,6 +676,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; + int ret; EnterFunction(10); @@ -761,7 +757,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - ip6_local_out(skb); + ret = IP_VS_XMIT_TUNNEL(skb, cp); + if (ret == NF_ACCEPT) + ip6_local_out(skb); + else if (ret == NF_DROP) + kfree_skb(skb); LeaveFunction(10); @@ -820,7 +820,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -873,7 +873,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); LeaveFunction(10); return NF_STOLEN; @@ -947,7 +947,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp); rc = NF_STOLEN; goto out; @@ -1022,7 +1022,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, rt); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp); rc = NF_STOLEN; goto out; -- cgit v1.1 From 8a8030407f55a6aaedb51167c1a2383311fcd707 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 21 Sep 2010 17:38:57 +0200 Subject: ipvs: make rerouting optional with snat_reroute Add new sysctl flag "snat_reroute". Recent kernels use ip_route_me_harder() to route LVS-NAT responses properly by VIP when there are multiple paths to client. But setups that do not have alternative default routes can skip this routing lookup by using snat_reroute=0. Signed-off-by: Julian Anastasov Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_core.c | 37 +++++++++++++++++++++++++++++-------- net/netfilter/ipvs/ip_vs_ctl.c | 8 ++++++++ 2 files changed, 37 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 7fbc80d..06c388b 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -929,20 +929,31 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_send_check(ip_hdr(skb)); } + /* + * nf_iterate does not expect change in the skb->dst->dev. + * It looks like it is not fatal to enable this code for hooks + * where our handlers are at the end of the chain list and + * when all next handlers use skb->dst->dev and not outdev. + * It will definitely route properly the inout NAT traffic + * when multiple paths are used. + */ + /* For policy routing, packets originating from this * machine itself may be routed differently to packets * passing through. We want this packet to be routed as * if it came from this machine itself. So re-compute * the routing information. */ + if (sysctl_ip_vs_snat_reroute) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else + if (af == AF_INET6) { + if (ip6_route_me_harder(skb) != 0) + goto drop; + } else #endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; + if (ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; + } IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); @@ -991,8 +1002,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { int related, verdict = ip_vs_out_icmp_v6(skb, &related); - if (related) + if (related) { + if (sysctl_ip_vs_snat_reroute && + NF_ACCEPT == verdict && + ip6_route_me_harder(skb)) + verdict = NF_DROP; return verdict; + } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else @@ -1000,8 +1016,13 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, if (unlikely(iph.protocol == IPPROTO_ICMP)) { int related, verdict = ip_vs_out_icmp(skb, &related); - if (related) + if (related) { + if (sysctl_ip_vs_snat_reroute && + NF_ACCEPT == verdict && + ip_route_me_harder(skb, RTN_LOCAL)) + verdict = NF_DROP; return verdict; + } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index d2d842f..e637cd0 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -91,6 +91,7 @@ int sysctl_ip_vs_nat_icmp_send = 0; #ifdef CONFIG_IP_VS_NFCT int sysctl_ip_vs_conntrack; #endif +int sysctl_ip_vs_snat_reroute = 1; #ifdef CONFIG_IP_VS_DEBUG @@ -1599,6 +1600,13 @@ static struct ctl_table vs_vars[] = { .mode = 0644, .proc_handler = proc_do_defense_mode, }, + { + .procname = "snat_reroute", + .data = &sysctl_ip_vs_snat_reroute, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec, + }, #if 0 { .procname = "timeout_established", -- cgit v1.1 From 99f07e91bef34db0fc8b1a224096e97f02dc0d56 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 21 Sep 2010 17:49:20 +0200 Subject: netfilter: save the hash of the tuple in the original direction for latter use Since we don't change the tuple in the original direction, we can save it in ct->tuplehash[IP_CT_DIR_REPLY].hnode.pprev for __nf_conntrack_confirm() use. __hash_conntrack() is split into two steps: hash_conntrack_raw() is used to get the raw hash, and __hash_bucket() is used to get the bucket id. In SYN-flood case, early_drop() doesn't need to recompute the hash again. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_core.c | 112 ++++++++++++++++++++++++++------------ 1 file changed, 78 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index 4c0ad9b..1eacf8d 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -67,29 +67,40 @@ EXPORT_PER_CPU_SYMBOL(nf_conntrack_untracked); static unsigned int nf_conntrack_hash_rnd __read_mostly; -static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, - u16 zone, unsigned int size, unsigned int rnd) +static u32 hash_conntrack_raw(const struct nf_conntrack_tuple *tuple, u16 zone) { unsigned int n; - u_int32_t h; /* The direction must be ignored, so we hash everything up to the * destination ports (which is a multiple of 4) and treat the last * three bytes manually. */ n = (sizeof(tuple->src) + sizeof(tuple->dst.u3)) / sizeof(u32); - h = jhash2((u32 *)tuple, n, - zone ^ rnd ^ (((__force __u16)tuple->dst.u.all << 16) | - tuple->dst.protonum)); + return jhash2((u32 *)tuple, n, zone ^ nf_conntrack_hash_rnd ^ + (((__force __u16)tuple->dst.u.all << 16) | + tuple->dst.protonum)); +} + +static u32 __hash_bucket(u32 hash, unsigned int size) +{ + return ((u64)hash * size) >> 32; +} + +static u32 hash_bucket(u32 hash, const struct net *net) +{ + return __hash_bucket(hash, net->ct.htable_size); +} - return ((u64)h * size) >> 32; +static u_int32_t __hash_conntrack(const struct nf_conntrack_tuple *tuple, + u16 zone, unsigned int size) +{ + return __hash_bucket(hash_conntrack_raw(tuple, zone), size); } static inline u_int32_t hash_conntrack(const struct net *net, u16 zone, const struct nf_conntrack_tuple *tuple) { - return __hash_conntrack(tuple, zone, net->ct.htable_size, - nf_conntrack_hash_rnd); + return __hash_conntrack(tuple, zone, net->ct.htable_size); } bool @@ -291,20 +302,20 @@ static void death_by_timeout(unsigned long ul_conntrack) * OR * - Caller must lock nf_conntrack_lock before calling this function */ -struct nf_conntrack_tuple_hash * -__nf_conntrack_find(struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) +static struct nf_conntrack_tuple_hash * +____nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; struct hlist_nulls_node *n; - unsigned int hash = hash_conntrack(net, zone, tuple); + unsigned int bucket = hash_bucket(hash, net); /* Disable BHs the entire time since we normally need to disable them * at least once for the stats anyway. */ local_bh_disable(); begin: - hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[hash], hnnode) { + hlist_nulls_for_each_entry_rcu(h, n, &net->ct.hash[bucket], hnnode) { if (nf_ct_tuple_equal(tuple, &h->tuple) && nf_ct_zone(nf_ct_tuplehash_to_ctrack(h)) == zone) { NF_CT_STAT_INC(net, found); @@ -318,7 +329,7 @@ begin: * not the expected one, we must restart lookup. * We probably met an item that was moved to another chain. */ - if (get_nulls_value(n) != hash) { + if (get_nulls_value(n) != bucket) { NF_CT_STAT_INC(net, search_restart); goto begin; } @@ -326,19 +337,27 @@ begin: return NULL; } + +struct nf_conntrack_tuple_hash * +__nf_conntrack_find(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return ____nf_conntrack_find(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} EXPORT_SYMBOL_GPL(__nf_conntrack_find); /* Find a connection corresponding to a tuple. */ -struct nf_conntrack_tuple_hash * -nf_conntrack_find_get(struct net *net, u16 zone, - const struct nf_conntrack_tuple *tuple) +static struct nf_conntrack_tuple_hash * +__nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple, u32 hash) { struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; rcu_read_lock(); begin: - h = __nf_conntrack_find(net, zone, tuple); + h = ____nf_conntrack_find(net, zone, tuple, hash); if (h) { ct = nf_ct_tuplehash_to_ctrack(h); if (unlikely(nf_ct_is_dying(ct) || @@ -356,6 +375,14 @@ begin: return h; } + +struct nf_conntrack_tuple_hash * +nf_conntrack_find_get(struct net *net, u16 zone, + const struct nf_conntrack_tuple *tuple) +{ + return __nf_conntrack_find_get(net, zone, tuple, + hash_conntrack_raw(tuple, zone)); +} EXPORT_SYMBOL_GPL(nf_conntrack_find_get); static void __nf_conntrack_hash_insert(struct nf_conn *ct, @@ -408,8 +435,11 @@ __nf_conntrack_confirm(struct sk_buff *skb) return NF_ACCEPT; zone = nf_ct_zone(ct); - hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple); - repl_hash = hash_conntrack(net, zone, &ct->tuplehash[IP_CT_DIR_REPLY].tuple); + /* reuse the hash saved before */ + hash = *(unsigned long *)&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev; + hash = hash_bucket(hash, net); + repl_hash = hash_conntrack(net, zone, + &ct->tuplehash[IP_CT_DIR_REPLY].tuple); /* We're not in hash table, and we refuse to set up related connections for unconfirmed conns. But packet copies and @@ -566,10 +596,11 @@ static noinline int early_drop(struct net *net, unsigned int hash) return dropped; } -struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, - const struct nf_conntrack_tuple *orig, - const struct nf_conntrack_tuple *repl, - gfp_t gfp) +static struct nf_conn * +__nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp, u32 hash) { struct nf_conn *ct; @@ -585,6 +616,9 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, get_random_bytes(&rand, sizeof(rand)); } while (!rand); cmpxchg(&nf_conntrack_hash_rnd, 0, rand); + + /* recompute the hash as nf_conntrack_hash_rnd is initialized */ + hash = hash_conntrack_raw(orig, zone); } /* We don't want any race condition at early drop stage */ @@ -592,8 +626,7 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, if (nf_conntrack_max && unlikely(atomic_read(&net->ct.count) > nf_conntrack_max)) { - unsigned int hash = hash_conntrack(net, zone, orig); - if (!early_drop(net, hash)) { + if (!early_drop(net, hash_bucket(hash, net))) { atomic_dec(&net->ct.count); if (net_ratelimit()) printk(KERN_WARNING @@ -623,7 +656,8 @@ struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple = *orig; ct->tuplehash[IP_CT_DIR_ORIGINAL].hnnode.pprev = NULL; ct->tuplehash[IP_CT_DIR_REPLY].tuple = *repl; - ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev = NULL; + /* save hash for reusing when confirming */ + *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; /* Don't set timer yet: wait for confirmation */ setup_timer(&ct->timeout, death_by_timeout, (unsigned long)ct); write_pnet(&ct->ct_net, net); @@ -650,6 +684,14 @@ out_free: return ERR_PTR(-ENOMEM); #endif } + +struct nf_conn *nf_conntrack_alloc(struct net *net, u16 zone, + const struct nf_conntrack_tuple *orig, + const struct nf_conntrack_tuple *repl, + gfp_t gfp) +{ + return __nf_conntrack_alloc(net, zone, orig, repl, gfp, 0); +} EXPORT_SYMBOL_GPL(nf_conntrack_alloc); void nf_conntrack_free(struct nf_conn *ct) @@ -671,7 +713,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_l3proto *l3proto, struct nf_conntrack_l4proto *l4proto, struct sk_buff *skb, - unsigned int dataoff) + unsigned int dataoff, u32 hash) { struct nf_conn *ct; struct nf_conn_help *help; @@ -685,7 +727,8 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, return NULL; } - ct = nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC); + ct = __nf_conntrack_alloc(net, zone, tuple, &repl_tuple, GFP_ATOMIC, + hash); if (IS_ERR(ct)) { pr_debug("Can't allocate conntrack.\n"); return (struct nf_conntrack_tuple_hash *)ct; @@ -762,6 +805,7 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, struct nf_conntrack_tuple_hash *h; struct nf_conn *ct; u16 zone = tmpl ? nf_ct_zone(tmpl) : NF_CT_DEFAULT_ZONE; + u32 hash; if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num, protonum, &tuple, l3proto, @@ -771,10 +815,11 @@ resolve_normal_ct(struct net *net, struct nf_conn *tmpl, } /* look for tuple match */ - h = nf_conntrack_find_get(net, zone, &tuple); + hash = hash_conntrack_raw(&tuple, zone); + h = __nf_conntrack_find_get(net, zone, &tuple, hash); if (!h) { h = init_conntrack(net, tmpl, &tuple, l3proto, l4proto, - skb, dataoff); + skb, dataoff, hash); if (!h) return NULL; if (IS_ERR(h)) @@ -1314,8 +1359,7 @@ int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp) ct = nf_ct_tuplehash_to_ctrack(h); hlist_nulls_del_rcu(&h->hnnode); bucket = __hash_conntrack(&h->tuple, nf_ct_zone(ct), - hashsize, - nf_conntrack_hash_rnd); + hashsize); hlist_nulls_add_head_rcu(&h->hnnode, &hash[bucket]); } } -- cgit v1.1 From 26c15cfd291f8b4ee40b4bbdf5e3772adfd704f5 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Tue, 21 Sep 2010 18:12:30 +0200 Subject: ipvs: changes related to service usecnt Change the usage of svc usecnt during command execution: - we check if svc is registered but we do not need to hold usecnt reference while under __ip_vs_mutex, only the packet handling needs it during scheduling - change __ip_vs_service_get to __ip_vs_service_find and __ip_vs_svc_fwm_get to __ip_vs_svc_fwm_find because now caller will increase svc->usecnt - put common code that calls update_service in __ip_vs_update_dest - put common code in ip_vs_unlink_service() and use it to unregister the service - add comment that svc should not be accessed after ip_vs_del_service anymore - all IP_VS_WAIT_WHILE calls are now unified: usecnt > 0 - Properly log the app ports As result, some problems are fixed: - possible use-after-free of svc in ip_vs_genl_set_cmd after ip_vs_del_service because our usecnt reference does not guarantee that svc is not freed on refcnt==0, eg. when no dests are moved to trash - possible usecnt leak in do_ip_vs_set_ctl after ip_vs_del_service when the service is not freed now, for example, when some destionations are moved into trash and svc->refcnt remains above 0. It is harmless because svc is not in hash anymore. Signed-off-by: Julian Anastasov Acked-by: Simon Horman Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_app.c | 6 +- net/netfilter/ipvs/ip_vs_ctl.c | 250 ++++++++++++++++------------------------- 2 files changed, 102 insertions(+), 154 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c index e76f87f..a475ede 100644 --- a/net/netfilter/ipvs/ip_vs_app.c +++ b/net/netfilter/ipvs/ip_vs_app.c @@ -103,8 +103,8 @@ ip_vs_app_inc_new(struct ip_vs_app *app, __u16 proto, __u16 port) goto out; list_add(&inc->a_list, &app->incs_list); - IP_VS_DBG(9, "%s application %s:%u registered\n", - pp->name, inc->name, inc->port); + IP_VS_DBG(9, "%s App %s:%u registered\n", + pp->name, inc->name, ntohs(inc->port)); return 0; @@ -130,7 +130,7 @@ ip_vs_app_inc_release(struct ip_vs_app *inc) pp->unregister_app(inc); IP_VS_DBG(9, "%s App %s:%u unregistered\n", - pp->name, inc->name, inc->port); + pp->name, inc->name, ntohs(inc->port)); list_del(&inc->a_list); diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e637cd0..e4ec8f3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -405,7 +405,7 @@ static int ip_vs_svc_unhash(struct ip_vs_service *svc) * Get service by {proto,addr,port} in the service table. */ static inline struct ip_vs_service * -__ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, +__ip_vs_service_find(int af, __u16 protocol, const union nf_inet_addr *vaddr, __be16 vport) { unsigned hash; @@ -420,7 +420,6 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, && (svc->port == vport) && (svc->protocol == protocol)) { /* HIT */ - atomic_inc(&svc->usecnt); return svc; } } @@ -433,7 +432,7 @@ __ip_vs_service_get(int af, __u16 protocol, const union nf_inet_addr *vaddr, * Get service by {fwmark} in the service table. */ static inline struct ip_vs_service * -__ip_vs_svc_fwm_get(int af, __u32 fwmark) +__ip_vs_svc_fwm_find(int af, __u32 fwmark) { unsigned hash; struct ip_vs_service *svc; @@ -444,7 +443,6 @@ __ip_vs_svc_fwm_get(int af, __u32 fwmark) list_for_each_entry(svc, &ip_vs_svc_fwm_table[hash], f_list) { if (svc->fwmark == fwmark && svc->af == af) { /* HIT */ - atomic_inc(&svc->usecnt); return svc; } } @@ -463,14 +461,14 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, /* * Check the table hashed by fwmark first */ - if (fwmark && (svc = __ip_vs_svc_fwm_get(af, fwmark))) + if (fwmark && (svc = __ip_vs_svc_fwm_find(af, fwmark))) goto out; /* * Check the table hashed by * for "full" addressed entries */ - svc = __ip_vs_service_get(af, protocol, vaddr, vport); + svc = __ip_vs_service_find(af, protocol, vaddr, vport); if (svc == NULL && protocol == IPPROTO_TCP @@ -480,7 +478,7 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, * Check if ftp service entry exists, the packet * might belong to FTP data connections. */ - svc = __ip_vs_service_get(af, protocol, vaddr, FTPPORT); + svc = __ip_vs_service_find(af, protocol, vaddr, FTPPORT); } if (svc == NULL @@ -488,10 +486,12 @@ ip_vs_service_get(int af, __u32 fwmark, __u16 protocol, /* * Check if the catch-all port (port zero) exists */ - svc = __ip_vs_service_get(af, protocol, vaddr, 0); + svc = __ip_vs_service_find(af, protocol, vaddr, 0); } out: + if (svc) + atomic_inc(&svc->usecnt); read_unlock(&__ip_vs_svc_lock); IP_VS_DBG_BUF(9, "lookup service: fwm %u %s %s:%u %s\n", @@ -510,14 +510,19 @@ __ip_vs_bind_svc(struct ip_vs_dest *dest, struct ip_vs_service *svc) dest->svc = svc; } -static inline void +static void __ip_vs_unbind_svc(struct ip_vs_dest *dest) { struct ip_vs_service *svc = dest->svc; dest->svc = NULL; - if (atomic_dec_and_test(&svc->refcnt)) + if (atomic_dec_and_test(&svc->refcnt)) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), atomic_read(&svc->usecnt)); kfree(svc); + } } @@ -762,8 +767,8 @@ ip_vs_zero_stats(struct ip_vs_stats *stats) * Update a destination in the given service */ static void -__ip_vs_update_dest(struct ip_vs_service *svc, - struct ip_vs_dest *dest, struct ip_vs_dest_user_kern *udest) +__ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, + struct ip_vs_dest_user_kern *udest, int add) { int conn_flags; @@ -818,6 +823,25 @@ __ip_vs_update_dest(struct ip_vs_service *svc, dest->flags &= ~IP_VS_DEST_F_OVERLOAD; dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + + if (add) + ip_vs_new_estimator(&dest->stats); + + write_lock_bh(&__ip_vs_svc_lock); + + /* Wait until all other svc users go away */ + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); + + if (add) { + list_add(&dest->n_list, &svc->destinations); + svc->num_dests++; + } + + /* call the update_service, because server weight may be changed */ + if (svc->scheduler->update_service) + svc->scheduler->update_service(svc); + + write_unlock_bh(&__ip_vs_svc_lock); } @@ -865,13 +889,12 @@ ip_vs_new_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest, atomic_set(&dest->activeconns, 0); atomic_set(&dest->inactconns, 0); atomic_set(&dest->persistconns, 0); - atomic_set(&dest->refcnt, 0); + atomic_set(&dest->refcnt, 1); INIT_LIST_HEAD(&dest->d_list); spin_lock_init(&dest->dst_lock); spin_lock_init(&dest->stats.lock); - __ip_vs_update_dest(svc, dest, udest); - ip_vs_new_estimator(&dest->stats); + __ip_vs_update_dest(svc, dest, udest, 1); *dest_p = dest; @@ -931,65 +954,22 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) IP_VS_DBG_ADDR(svc->af, &dest->vaddr), ntohs(dest->vport)); - __ip_vs_update_dest(svc, dest, udest); - /* * Get the destination from the trash */ list_del(&dest->n_list); - ip_vs_new_estimator(&dest->stats); - - write_lock_bh(&__ip_vs_svc_lock); - + __ip_vs_update_dest(svc, dest, udest, 1); + ret = 0; + } else { /* - * Wait until all other svc users go away. + * Allocate and initialize the dest structure */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - return 0; - } - - /* - * Allocate and initialize the dest structure - */ - ret = ip_vs_new_dest(svc, udest, &dest); - if (ret) { - return ret; + ret = ip_vs_new_dest(svc, udest, &dest); } - - /* - * Add the dest entry into the list - */ - atomic_inc(&dest->refcnt); - - write_lock_bh(&__ip_vs_svc_lock); - - /* - * Wait until all other svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - list_add(&dest->n_list, &svc->destinations); - svc->num_dests++; - - /* call the update_service function of its scheduler */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - LeaveFunction(2); - return 0; + return ret; } @@ -1028,19 +1008,7 @@ ip_vs_edit_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) return -ENOENT; } - __ip_vs_update_dest(svc, dest, udest); - - write_lock_bh(&__ip_vs_svc_lock); - - /* Wait until all other svc users go away */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); - - /* call the update_service, because server weight may be changed */ - if (svc->scheduler->update_service) - svc->scheduler->update_service(svc); - - write_unlock_bh(&__ip_vs_svc_lock); - + __ip_vs_update_dest(svc, dest, udest, 0); LeaveFunction(2); return 0; @@ -1067,6 +1035,10 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) * the destination into the trash. */ if (atomic_dec_and_test(&dest->refcnt)) { + IP_VS_DBG_BUF(3, "Removing destination %u/%s:%u\n", + dest->vfwmark, + IP_VS_DBG_ADDR(dest->af, &dest->addr), + ntohs(dest->port)); ip_vs_dst_reset(dest); /* simply decrease svc->refcnt here, let the caller check and release the service if nobody refers to it. @@ -1133,7 +1105,7 @@ ip_vs_del_dest(struct ip_vs_service *svc, struct ip_vs_dest_user_kern *udest) /* * Wait until all other svc users go away. */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); /* * Unlink dest from the service @@ -1190,7 +1162,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } /* I'm the first user of the service */ - atomic_set(&svc->usecnt, 1); + atomic_set(&svc->usecnt, 0); atomic_set(&svc->refcnt, 0); svc->af = u->af; @@ -1284,7 +1256,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) /* * Wait until all other svc users go away. */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); /* * Set the flags and timeout value @@ -1383,21 +1355,23 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* * Free the service if nobody refers to it */ - if (atomic_read(&svc->refcnt) == 0) + if (atomic_read(&svc->refcnt) == 0) { + IP_VS_DBG_BUF(3, "Removing service %u/%s:%u usecnt=%d\n", + svc->fwmark, + IP_VS_DBG_ADDR(svc->af, &svc->addr), + ntohs(svc->port), atomic_read(&svc->usecnt)); kfree(svc); + } /* decrease the module use count */ ip_vs_use_count_dec(); } /* - * Delete a service from the service list + * Unlink a service from list and try to delete it if its refcnt reached 0 */ -static int ip_vs_del_service(struct ip_vs_service *svc) +static void ip_vs_unlink_service(struct ip_vs_service *svc) { - if (svc == NULL) - return -EEXIST; - /* * Unhash it from the service table */ @@ -1408,11 +1382,21 @@ static int ip_vs_del_service(struct ip_vs_service *svc) /* * Wait until all the svc users go away. */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 1); + IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); __ip_vs_del_service(svc); write_unlock_bh(&__ip_vs_svc_lock); +} + +/* + * Delete a service from the service list + */ +static int ip_vs_del_service(struct ip_vs_service *svc) +{ + if (svc == NULL) + return -EEXIST; + ip_vs_unlink_service(svc); return 0; } @@ -1431,14 +1415,7 @@ static int ip_vs_flush(void) */ for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry_safe(svc, nxt, &ip_vs_svc_table[idx], s_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); + ip_vs_unlink_service(svc); } } @@ -1448,14 +1425,7 @@ static int ip_vs_flush(void) for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { list_for_each_entry_safe(svc, nxt, &ip_vs_svc_fwm_table[idx], f_list) { - write_lock_bh(&__ip_vs_svc_lock); - ip_vs_svc_unhash(svc); - /* - * Wait until all the svc users go away. - */ - IP_VS_WAIT_WHILE(atomic_read(&svc->usecnt) > 0); - __ip_vs_del_service(svc); - write_unlock_bh(&__ip_vs_svc_lock); + ip_vs_unlink_service(svc); } } @@ -2168,15 +2138,15 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) /* Lookup the exact service by or fwmark */ if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); + svc = __ip_vs_service_find(usvc.af, usvc.protocol, + &usvc.addr, usvc.port); else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); + svc = __ip_vs_svc_fwm_find(usvc.af, usvc.fwmark); if (cmd != IP_VS_SO_SET_ADD && (svc == NULL || svc->protocol != usvc.protocol)) { ret = -ESRCH; - goto out_drop_service; + goto out_unlock; } switch (cmd) { @@ -2210,10 +2180,6 @@ do_ip_vs_set_ctl(struct sock *sk, int cmd, void __user *user, unsigned int len) ret = -EINVAL; } -out_drop_service: - if (svc) - ip_vs_service_put(svc); - out_unlock: mutex_unlock(&__ip_vs_mutex); out_dec: @@ -2306,10 +2272,10 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, int ret = 0; if (get->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, get->fwmark); + svc = __ip_vs_svc_fwm_find(AF_INET, get->fwmark); else - svc = __ip_vs_service_get(AF_INET, get->protocol, &addr, - get->port); + svc = __ip_vs_service_find(AF_INET, get->protocol, &addr, + get->port); if (svc) { int count = 0; @@ -2337,7 +2303,6 @@ __ip_vs_get_dest_entries(const struct ip_vs_get_dests *get, } count++; } - ip_vs_service_put(svc); } else ret = -ESRCH; return ret; @@ -2458,15 +2423,14 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) entry = (struct ip_vs_service_entry *)arg; addr.ip = entry->addr; if (entry->fwmark) - svc = __ip_vs_svc_fwm_get(AF_INET, entry->fwmark); + svc = __ip_vs_svc_fwm_find(AF_INET, entry->fwmark); else - svc = __ip_vs_service_get(AF_INET, entry->protocol, - &addr, entry->port); + svc = __ip_vs_service_find(AF_INET, entry->protocol, + &addr, entry->port); if (svc) { ip_vs_copy_service(entry, svc); if (copy_to_user(user, entry, sizeof(*entry)) != 0) ret = -EFAULT; - ip_vs_service_put(svc); } else ret = -ESRCH; } @@ -2733,10 +2697,12 @@ nla_put_failure: } static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, - struct nlattr *nla, int full_entry) + struct nlattr *nla, int full_entry, + struct ip_vs_service **ret_svc) { struct nlattr *attrs[IPVS_SVC_ATTR_MAX + 1]; struct nlattr *nla_af, *nla_port, *nla_fwmark, *nla_protocol, *nla_addr; + struct ip_vs_service *svc; /* Parse mandatory identifying service fields first */ if (nla == NULL || @@ -2772,12 +2738,18 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, usvc->fwmark = 0; } + if (usvc->fwmark) + svc = __ip_vs_svc_fwm_find(usvc->af, usvc->fwmark); + else + svc = __ip_vs_service_find(usvc->af, usvc->protocol, + &usvc->addr, usvc->port); + *ret_svc = svc; + /* If a full entry was requested, check for the additional fields */ if (full_entry) { struct nlattr *nla_sched, *nla_flags, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; - struct ip_vs_service *svc; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; @@ -2790,16 +2762,8 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, nla_memcpy(&flags, nla_flags, sizeof(flags)); /* prefill flags from service if it already exists */ - if (usvc->fwmark) - svc = __ip_vs_svc_fwm_get(usvc->af, usvc->fwmark); - else - svc = __ip_vs_service_get(usvc->af, usvc->protocol, - &usvc->addr, usvc->port); - if (svc) { + if (svc) usvc->flags = svc->flags; - ip_vs_service_put(svc); - } else - usvc->flags = 0; /* set new flags from userland */ usvc->flags = (usvc->flags & ~flags.mask) | @@ -2815,17 +2779,11 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, static struct ip_vs_service *ip_vs_genl_find_service(struct nlattr *nla) { struct ip_vs_service_user_kern usvc; + struct ip_vs_service *svc; int ret; - ret = ip_vs_genl_parse_service(&usvc, nla, 0); - if (ret) - return ERR_PTR(ret); - - if (usvc.fwmark) - return __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - else - return __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); + ret = ip_vs_genl_parse_service(&usvc, nla, 0, &svc); + return ret ? ERR_PTR(ret) : svc; } static int ip_vs_genl_fill_dest(struct sk_buff *skb, struct ip_vs_dest *dest) @@ -2916,7 +2874,6 @@ static int ip_vs_genl_dump_dests(struct sk_buff *skb, nla_put_failure: cb->args[0] = idx; - ip_vs_service_put(svc); out_err: mutex_unlock(&__ip_vs_mutex); @@ -3129,17 +3086,10 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) ret = ip_vs_genl_parse_service(&usvc, info->attrs[IPVS_CMD_ATTR_SERVICE], - need_full_svc); + need_full_svc, &svc); if (ret) goto out; - /* Lookup the exact service by or fwmark */ - if (usvc.fwmark == 0) - svc = __ip_vs_service_get(usvc.af, usvc.protocol, - &usvc.addr, usvc.port); - else - svc = __ip_vs_svc_fwm_get(usvc.af, usvc.fwmark); - /* Unless we're adding a new service, the service must already exist */ if ((cmd != IPVS_CMD_NEW_SERVICE) && (svc == NULL)) { ret = -ESRCH; @@ -3173,6 +3123,7 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) break; case IPVS_CMD_DEL_SERVICE: ret = ip_vs_del_service(svc); + /* do not use svc, it can be freed */ break; case IPVS_CMD_NEW_DEST: ret = ip_vs_add_dest(svc, &udest); @@ -3191,8 +3142,6 @@ static int ip_vs_genl_set_cmd(struct sk_buff *skb, struct genl_info *info) } out: - if (svc) - ip_vs_service_put(svc); mutex_unlock(&__ip_vs_mutex); return ret; @@ -3238,7 +3187,6 @@ static int ip_vs_genl_get_cmd(struct sk_buff *skb, struct genl_info *info) goto out_err; } else if (svc) { ret = ip_vs_genl_fill_service(msg, svc); - ip_vs_service_put(svc); if (ret) goto nla_put_failure; } else { -- cgit v1.1 From a77f5db361ed9953b5b749353ea2c7fed2bf8d93 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 20 Sep 2010 08:42:17 +0000 Subject: ethtool: Allocate register dump buffer with vmalloc() Some NICs have huge register files which exceed the maximum heap allocation size. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/ethtool.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 91ffce2..dae2fd0 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -815,7 +815,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) if (regs.len > reglen) regs.len = reglen; - regbuf = kmalloc(reglen, GFP_USER); + regbuf = vmalloc(reglen); if (!regbuf) return -ENOMEM; @@ -830,7 +830,7 @@ static int ethtool_get_regs(struct net_device *dev, char __user *useraddr) ret = 0; out: - kfree(regbuf); + vfree(regbuf); return ret; } -- cgit v1.1 From 73da16c28ed0724755afdb95c7dd6b166381be10 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 21 Sep 2010 16:12:11 -0700 Subject: ethtool: Fix build due to lack of ethtool.h include. net/core/ethtool.c: In function 'ethtool_get_regs': net/core/ethtool.c:818:2: error: implicit declaration of function 'vmalloc' net/core/ethtool.c:818:9: warning: assignment makes pointer from integer without a cast net/core/ethtool.c:833:2: error: implicit declaration of function 'vfree' Signed-off-by: David S. Miller --- net/core/ethtool.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index dae2fd0..7d7e572 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -19,6 +19,7 @@ #include #include #include +#include #include /* -- cgit v1.1 From 82fd5b5d1ec370a50b3060418cde6a4ac8401117 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Mon, 20 Sep 2010 20:40:26 +0000 Subject: net: core: use kernel's converter from hex to bin Signed-off-by: Andy Shevchenko Signed-off-by: David S. Miller --- net/core/pktgen.c | 10 ++++------ net/core/utils.c | 13 +++++++------ 2 files changed, 11 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 386c228..2c0df0f 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -729,16 +729,14 @@ static int hex32_arg(const char __user *user_buffer, unsigned long maxlen, *num = 0; for (; i < maxlen; i++) { + int value; char c; *num <<= 4; if (get_user(c, &user_buffer[i])) return -EFAULT; - if ((c >= '0') && (c <= '9')) - *num |= c - '0'; - else if ((c >= 'a') && (c <= 'f')) - *num |= c - 'a' + 10; - else if ((c >= 'A') && (c <= 'F')) - *num |= c - 'A' + 10; + value = hex_to_bin(c); + if (value >= 0) + *num |= value; else break; } diff --git a/net/core/utils.c b/net/core/utils.c index f418544..ec6bb32 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -92,18 +92,19 @@ EXPORT_SYMBOL(in_aton); static inline int xdigit2bin(char c, int delim) { + int val; + if (c == delim || c == '\0') return IN6PTON_DELIM; if (c == ':') return IN6PTON_COLON_MASK; if (c == '.') return IN6PTON_DOT; - if (c >= '0' && c <= '9') - return (IN6PTON_XDIGIT | IN6PTON_DIGIT| (c - '0')); - if (c >= 'a' && c <= 'f') - return (IN6PTON_XDIGIT | (c - 'a' + 10)); - if (c >= 'A' && c <= 'F') - return (IN6PTON_XDIGIT | (c - 'A' + 10)); + + val = hex_to_bin(c); + if (val >= 0) + return val | IN6PTON_XDIGIT | (val < 10 ? IN6PTON_DIGIT : 0); + if (delim == -1) return IN6PTON_DELIM; return IN6PTON_UNKNOWN; -- cgit v1.1 From 756e64a0b106f1a2ca96889c39ea0d48131105c0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 21 Sep 2010 06:43:54 +0000 Subject: net: constify some ppp/pptp structs Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_ppp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c index ff954b3..39a21d0 100644 --- a/net/l2tp/l2tp_ppp.c +++ b/net/l2tp/l2tp_ppp.c @@ -1768,7 +1768,7 @@ static const struct proto_ops pppol2tp_ops = { .ioctl = pppox_ioctl, }; -static struct pppox_proto pppol2tp_proto = { +static const struct pppox_proto pppol2tp_proto = { .create = pppol2tp_create, .ioctl = pppol2tp_ioctl }; -- cgit v1.1 From 9c44c9fa78c5a449699491f0355189e2a09c242a Mon Sep 17 00:00:00 2001 From: Sjur Braendeland Date: Tue, 21 Sep 2010 10:40:06 +0000 Subject: caif: Remove buggy re-definition of pr_debug MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove debugging quirk redefining pr_debug to pr_warning. Signed-off-by: Sjur Brændeland Signed-off-by: David S. Miller --- net/caif/chnl_net.c | 3 --- 1 file changed, 3 deletions(-) (limited to 'net') diff --git a/net/caif/chnl_net.c b/net/caif/chnl_net.c index 86aac24..84a422c 100644 --- a/net/caif/chnl_net.c +++ b/net/caif/chnl_net.c @@ -30,9 +30,6 @@ #define CONNECT_TIMEOUT (5 * HZ) #define CAIF_NET_DEFAULT_QUEUE_LEN 500 -#undef pr_debug -#define pr_debug pr_warn - /*This list is protected by the rtnl lock. */ static LIST_HEAD(chnl_net_list); -- cgit v1.1 From b04367df66eb63444d38c43d15f5e39499d85ae6 Mon Sep 17 00:00:00 2001 From: Sjur Braendeland Date: Tue, 21 Sep 2010 11:44:44 +0000 Subject: caif: Minor fixes in log prints. Use pr_debug for flow control printouts, and refine an error printout. Signed-off-by: Sjur Braendeland Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 8 ++------ net/caif/cfcnfg.c | 2 +- 2 files changed, 3 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index fd1f5df..3943398 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -159,9 +159,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >= (unsigned)sk->sk_rcvbuf && rx_flow_is_on(cf_sk)) { - trace_printk("CAIF: %s(): " - "sending flow OFF (queue len = %d %d)\n", - __func__, + pr_debug("sending flow OFF (queue len = %d %d)\n", atomic_read(&cf_sk->sk.sk_rmem_alloc), sk_rcvbuf_lowwater(cf_sk)); set_rx_flow_off(cf_sk); @@ -174,9 +172,7 @@ static int caif_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) return err; if (!sk_rmem_schedule(sk, skb->truesize) && rx_flow_is_on(cf_sk)) { set_rx_flow_off(cf_sk); - trace_printk("CAIF: %s(): " - "sending flow OFF due to rmem_schedule\n", - __func__); + pr_debug("sending flow OFF due to rmem_schedule\n"); dbfs_atomic_inc(&cnt.num_rx_flow_off); caif_flow_ctrl(sk, CAIF_MODEMCMD_FLOW_OFF_REQ); } diff --git a/net/caif/cfcnfg.c b/net/caif/cfcnfg.c index ef93a13..41adafd1 100644 --- a/net/caif/cfcnfg.c +++ b/net/caif/cfcnfg.c @@ -197,7 +197,7 @@ int cfcnfg_disconn_adapt_layer(struct cfcnfg *cnfg, struct cflayer *adap_layer) caif_assert(adap_layer != NULL); channel_id = adap_layer->id; if (adap_layer->dn == NULL || channel_id == 0) { - pr_err("adap_layer->id is 0\n"); + pr_err("adap_layer->dn == NULL or adap_layer->id is 0\n"); ret = -ENOTCONN; goto end; } -- cgit v1.1 From e5e03ce1e5c6c015cabf274b24976dff408dc07f Mon Sep 17 00:00:00 2001 From: Sjur Braendeland Date: Tue, 21 Sep 2010 11:44:45 +0000 Subject: caif: Fix function NULL pointer check. Check that receive function pointer is not null before calling it. Signed-off-by: Sjur Braendeland Signed-off-by: David S. Miller --- net/caif/caif_dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_dev.c b/net/caif/caif_dev.c index 0fd01dd..b99369a 100644 --- a/net/caif/caif_dev.c +++ b/net/caif/caif_dev.c @@ -173,7 +173,7 @@ static int receive(struct sk_buff *skb, struct net_device *dev, net = dev_net(dev); pkt = cfpkt_fromnative(CAIF_DIR_IN, skb); caifd = caif_get(dev); - if (!caifd || !caifd->layer.up || !caifd->layer.up->ctrlcmd) + if (!caifd || !caifd->layer.up || !caifd->layer.up->receive) return NET_RX_DROP; if (caifd->layer.up->receive(caifd->layer.up, pkt)) -- cgit v1.1 From 9e2e8f14d48dbb6c31aeb739ae4fc8997b9dfe84 Mon Sep 17 00:00:00 2001 From: Sjur Braendeland Date: Tue, 21 Sep 2010 11:44:46 +0000 Subject: caif: Use default send and receive buffer size in caif_socket. CAIF sockets should use socket's default send and receive buffers sizes. Signed-off-by: Sjur Braendeland Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 7 ------- 1 file changed, 7 deletions(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 3943398..4d918f8 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -30,9 +30,6 @@ MODULE_LICENSE("GPL"); MODULE_ALIAS_NETPROTO(AF_CAIF); -#define CAIF_DEF_SNDBUF (4096*10) -#define CAIF_DEF_RCVBUF (4096*100) - /* * CAIF state is re-using the TCP socket states. * caif_states stored in sk_state reflect the state as reported by @@ -1118,10 +1115,6 @@ static int caif_create(struct net *net, struct socket *sock, int protocol, /* Store the protocol */ sk->sk_protocol = (unsigned char) protocol; - /* Sendbuf dictates the amount of outbound packets not yet sent */ - sk->sk_sndbuf = CAIF_DEF_SNDBUF; - sk->sk_rcvbuf = CAIF_DEF_RCVBUF; - /* * Lock in order to try to stop someone from opening the socket * too early. -- cgit v1.1 From 5b92b61f3891517d18d0573ad2c939c81b59ecfe Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Sep 2010 08:34:12 +0200 Subject: netfilter: nf_nat: better error handling of nf_ct_expect_related() in helpers This patch improves the situation in which the expectation table is full for conntrack NAT helpers. Basically, we give up if we don't find a place in the table instead of looping over nf_ct_expect_related() with a different port (we should only do this if it returns -EBUSY, for -EMFILE or -ESHUTDOWN I think that it's better to skip this). Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_amanda.c | 9 ++++++- net/ipv4/netfilter/nf_nat_ftp.c | 9 ++++++- net/ipv4/netfilter/nf_nat_h323.c | 53 +++++++++++++++++++++++++++++++++----- net/ipv4/netfilter/nf_nat_irc.c | 9 ++++++- net/ipv4/netfilter/nf_nat_sip.c | 27 ++++++++++++++++--- 5 files changed, 93 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_amanda.c b/net/ipv4/netfilter/nf_nat_amanda.c index c31b876..0f23b3f 100644 --- a/net/ipv4/netfilter/nf_nat_amanda.c +++ b/net/ipv4/netfilter/nf_nat_amanda.c @@ -44,9 +44,16 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_ftp.c b/net/ipv4/netfilter/nf_nat_ftp.c index 86e0e84f..dc73abb 100644 --- a/net/ipv4/netfilter/nf_nat_ftp.c +++ b/net/ipv4/netfilter/nf_nat_ftp.c @@ -79,9 +79,16 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index 5045196..790f316 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -222,13 +222,24 @@ static int nat_rtp_rtcp(struct sk_buff *skb, struct nf_conn *ct, /* Try to get a pair of ports. */ for (nated_port = ntohs(rtp_exp->tuple.dst.u.udp.port); nated_port != 0; nated_port += 2) { + int ret; + rtp_exp->tuple.dst.u.udp.port = htons(nated_port); - if (nf_ct_expect_related(rtp_exp) == 0) { + ret = nf_ct_expect_related(rtp_exp); + if (ret == 0) { rtcp_exp->tuple.dst.u.udp.port = htons(nated_port + 1); - if (nf_ct_expect_related(rtcp_exp) == 0) + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + nated_port = 0; break; - nf_ct_unexpect_related(rtp_exp); + } + } else if (ret != -EBUSY) { + nated_port = 0; + break; } } @@ -284,9 +295,16 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (; nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; break; + } } if (nated_port == 0) { /* No port available */ @@ -334,9 +352,16 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (; nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } } if (nated_port == 0) { /* No port available */ @@ -418,9 +443,16 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (; nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + nated_port = 0; break; + } } if (nated_port == 0) { /* No port available */ @@ -500,9 +532,16 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, /* Try to get same port: if not, try to change it. */ for (nated_port = ntohs(port); nated_port != 0; nated_port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(nated_port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) break; + else if (ret != -EBUSY) { + nated_port = 0; + break; + } } if (nated_port == 0) { /* No port available */ diff --git a/net/ipv4/netfilter/nf_nat_irc.c b/net/ipv4/netfilter/nf_nat_irc.c index ea83a88..535e1a8 100644 --- a/net/ipv4/netfilter/nf_nat_irc.c +++ b/net/ipv4/netfilter/nf_nat_irc.c @@ -45,9 +45,16 @@ static unsigned int help(struct sk_buff *skb, /* Try to get same port: if not, try to change it. */ for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { + int ret; + exp->tuple.dst.u.tcp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) diff --git a/net/ipv4/netfilter/nf_nat_sip.c b/net/ipv4/netfilter/nf_nat_sip.c index 11b538d..e40cf78 100644 --- a/net/ipv4/netfilter/nf_nat_sip.c +++ b/net/ipv4/netfilter/nf_nat_sip.c @@ -307,9 +307,16 @@ static unsigned int ip_nat_sip_expect(struct sk_buff *skb, unsigned int dataoff, exp->expectfn = ip_nat_sip_expected; for (; port != 0; port++) { + int ret; + exp->tuple.dst.u.udp.port = htons(port); - if (nf_ct_expect_related(exp) == 0) + ret = nf_ct_expect_related(exp); + if (ret == 0) + break; + else if (ret != -EBUSY) { + port = 0; break; + } } if (port == 0) @@ -480,13 +487,25 @@ static unsigned int ip_nat_sdp_media(struct sk_buff *skb, unsigned int dataoff, /* Try to get same pair of ports: if not, try to change them. */ for (port = ntohs(rtp_exp->tuple.dst.u.udp.port); port != 0; port += 2) { + int ret; + rtp_exp->tuple.dst.u.udp.port = htons(port); - if (nf_ct_expect_related(rtp_exp) != 0) + ret = nf_ct_expect_related(rtp_exp); + if (ret == -EBUSY) continue; + else if (ret < 0) { + port = 0; + break; + } rtcp_exp->tuple.dst.u.udp.port = htons(port + 1); - if (nf_ct_expect_related(rtcp_exp) == 0) + ret = nf_ct_expect_related(rtcp_exp); + if (ret == 0) break; - nf_ct_unexpect_related(rtp_exp); + else if (ret != -EBUSY) { + nf_ct_unexpect_related(rtp_exp); + port = 0; + break; + } } if (port == 0) -- cgit v1.1 From bcac0dfab191cb53b3f9b43c8014a34070ed58ff Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Sep 2010 08:35:36 +0200 Subject: netfilter: ctnetlink: missing validation of CTA_EXPECT_ZONE attribute This patch adds the missing validation of the CTA_EXPECT_ZONE attribute in the ctnetlink code. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_netlink.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 5bae1cd..37533a3 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1733,6 +1733,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_TIMEOUT] = { .type = NLA_U32 }, [CTA_EXPECT_ID] = { .type = NLA_U32 }, [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, + [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, }; static int -- cgit v1.1 From 8b008faf92ac8f7eeb65e8cd36077601af7c46db Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Wed, 22 Sep 2010 08:36:59 +0200 Subject: netfilter: ctnetlink: allow to specify the expectation flags With this patch, you can specify the expectation flags for user-space created expectations. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_netlink.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 37533a3..0804e0e 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1577,6 +1577,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); + NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); helper = rcu_dereference(nfct_help(master)->helper); if (helper) NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); @@ -1734,6 +1735,7 @@ static const struct nla_policy exp_nla_policy[CTA_EXPECT_MAX+1] = { [CTA_EXPECT_ID] = { .type = NLA_U32 }, [CTA_EXPECT_HELP_NAME] = { .type = NLA_NUL_STRING }, [CTA_EXPECT_ZONE] = { .type = NLA_U16 }, + [CTA_EXPECT_FLAGS] = { .type = NLA_U32 }, }; static int @@ -1933,9 +1935,13 @@ ctnetlink_create_expect(struct net *net, u16 zone, goto out; } + if (cda[CTA_EXPECT_FLAGS]) + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + else + exp->flags = 0; + exp->class = 0; exp->expectfn = NULL; - exp->flags = 0; exp->master = ct; exp->helper = NULL; memcpy(&exp->tuple, &tuple, sizeof(struct nf_conntrack_tuple)); -- cgit v1.1 From 40e192c3ff7b1af680b0b03e94cadf1dad5fb82e Mon Sep 17 00:00:00 2001 From: andrew hendry Date: Tue, 21 Sep 2010 15:24:25 +0000 Subject: X.25 remove bkl in getsockname Signed-off-by: Andrew Hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index c1bbf9e..dc6767ec 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -909,7 +909,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr, struct x25_sock *x25 = x25_sk(sk); int rc = 0; - lock_kernel(); if (peer) { if (sk->sk_state != TCP_ESTABLISHED) { rc = -ENOTCONN; @@ -923,7 +922,6 @@ static int x25_getname(struct socket *sock, struct sockaddr *uaddr, *uaddr_len = sizeof(*sx25); out: - unlock_kernel(); return rc; } -- cgit v1.1 From 768190fdc058cc7405330f7782782df084c25d61 Mon Sep 17 00:00:00 2001 From: andrew hendry Date: Tue, 21 Sep 2010 15:24:45 +0000 Subject: X.25 remove bkl in poll The x25_datagram_poll didn't add anything, removed it. Signed-off-by: Andrew Hendry Signed-off-by: David S. Miller --- net/x25/af_x25.c | 14 +------------- 1 file changed, 1 insertion(+), 13 deletions(-) (limited to 'net') diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index dc6767ec..f7af98d 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -925,18 +925,6 @@ out: return rc; } -static unsigned int x25_datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - int rc; - - lock_kernel(); - rc = datagram_poll(file, sock, wait); - unlock_kernel(); - - return rc; -} - int x25_rx_call_request(struct sk_buff *skb, struct x25_neigh *nb, unsigned int lci) { @@ -1744,7 +1732,7 @@ static const struct proto_ops x25_proto_ops = { .socketpair = sock_no_socketpair, .accept = x25_accept, .getname = x25_getname, - .poll = x25_datagram_poll, + .poll = datagram_poll, .ioctl = x25_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = compat_x25_ioctl, -- cgit v1.1 From a02cec2155fbea457eca8881870fd2de1a4c4c76 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 22 Sep 2010 20:43:57 +0000 Subject: net: return operator cleanup Change "return (EXPR);" to "return EXPR;" return is not a function, parentheses are not required. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/802/fc.c | 2 +- net/802/fddi.c | 12 ++++----- net/802/hippi.c | 2 +- net/802/tr.c | 2 +- net/8021q/vlan_core.c | 2 +- net/9p/client.c | 4 +-- net/bluetooth/rfcomm/core.c | 4 +-- net/core/flow.c | 4 +-- net/core/neighbour.c | 6 ++--- net/core/utils.c | 2 +- net/dccp/ccids/lib/loss_interval.c | 2 +- net/econet/af_econet.c | 4 +-- net/ethernet/eth.c | 2 +- net/ipv4/arp.c | 2 +- net/ipv4/datagram.c | 2 +- net/ipv4/inet_diag.c | 2 +- net/ipv4/ip_fragment.c | 4 +-- net/ipv4/ip_gre.c | 2 +- net/ipv4/netfilter/arp_tables.c | 2 +- net/ipv4/route.c | 2 +- net/ipv4/tcp_input.c | 10 ++++---- net/ipv4/tcp_minisocks.c | 2 +- net/ipv4/tcp_output.c | 8 +++--- net/ipv4/tcp_westwood.c | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/addrlabel.c | 5 ++-- net/ipv6/af_inet6.c | 6 ++--- net/ipv6/exthdrs_core.c | 4 +-- net/ipv6/ip6_output.c | 4 +-- net/ipv6/ndisc.c | 8 +++--- net/ipv6/netfilter/ip6_tables.c | 14 +++++------ net/ipv6/raw.c | 12 ++++----- net/ipv6/route.c | 14 +++++------ net/ipv6/tcp_ipv6.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- net/irda/af_irda.c | 14 +++++------ net/irda/discovery.c | 2 +- net/irda/ircomm/ircomm_tty.c | 4 +-- net/irda/irlmp.c | 2 +- net/irda/irlmp_frame.c | 2 +- net/irda/irnet/irnet_irda.c | 22 ++++++++--------- net/irda/irnet/irnet_ppp.c | 8 +++--- net/key/af_key.c | 4 +-- net/mac80211/rate.c | 2 +- net/rfkill/input.c | 2 +- net/rose/rose_link.c | 4 +-- net/sctp/protocol.c | 2 +- net/sctp/socket.c | 6 ++--- net/sunrpc/auth_gss/auth_gss.c | 2 +- net/sunrpc/auth_gss/gss_generic_token.c | 44 ++++++++++++++++----------------- net/sunrpc/auth_gss/gss_krb5_seqnum.c | 2 +- net/sunrpc/auth_gss/gss_mech_switch.c | 2 +- net/sunrpc/sched.c | 2 +- net/tipc/addr.c | 2 +- net/tipc/bcast.c | 2 +- net/tipc/bearer.c | 2 +- net/tipc/dbg.c | 4 +-- net/tipc/link.c | 6 ++--- net/tipc/link.h | 16 ++++++------ net/tipc/msg.h | 6 ++--- net/tipc/name_table.c | 2 +- net/tipc/node.c | 6 ++--- net/tipc/port.h | 2 +- net/tipc/socket.c | 2 +- net/tipc/subscr.c | 2 +- net/wireless/core.h | 2 +- 66 files changed, 169 insertions(+), 170 deletions(-) (limited to 'net') diff --git a/net/802/fc.c b/net/802/fc.c index 34cf1ee..1e49f2d 100644 --- a/net/802/fc.c +++ b/net/802/fc.c @@ -70,7 +70,7 @@ static int fc_header(struct sk_buff *skb, struct net_device *dev, if(daddr) { memcpy(fch->daddr,daddr,dev->addr_len); - return(hdr_len); + return hdr_len; } return -hdr_len; } diff --git a/net/802/fddi.c b/net/802/fddi.c index 3ef0ab0..94b3ad0 100644 --- a/net/802/fddi.c +++ b/net/802/fddi.c @@ -82,10 +82,10 @@ static int fddi_header(struct sk_buff *skb, struct net_device *dev, if (daddr != NULL) { memcpy(fddi->daddr, daddr, dev->addr_len); - return(hl); + return hl; } - return(-hl); + return -hl; } @@ -108,7 +108,7 @@ static int fddi_rebuild_header(struct sk_buff *skb) { printk("%s: Don't know how to resolve type %04X addresses.\n", skb->dev->name, ntohs(fddi->hdr.llc_snap.ethertype)); - return(0); + return 0; } } @@ -162,7 +162,7 @@ __be16 fddi_type_trans(struct sk_buff *skb, struct net_device *dev) /* Assume 802.2 SNAP frames, for now */ - return(type); + return type; } EXPORT_SYMBOL(fddi_type_trans); @@ -170,9 +170,9 @@ EXPORT_SYMBOL(fddi_type_trans); int fddi_change_mtu(struct net_device *dev, int new_mtu) { if ((new_mtu < FDDI_K_SNAP_HLEN) || (new_mtu > FDDI_K_SNAP_DLEN)) - return(-EINVAL); + return -EINVAL; dev->mtu = new_mtu; - return(0); + return 0; } EXPORT_SYMBOL(fddi_change_mtu); diff --git a/net/802/hippi.c b/net/802/hippi.c index cd3e8e9..91aca87 100644 --- a/net/802/hippi.c +++ b/net/802/hippi.c @@ -152,7 +152,7 @@ int hippi_change_mtu(struct net_device *dev, int new_mtu) if ((new_mtu < 68) || (new_mtu > 65280)) return -EINVAL; dev->mtu = new_mtu; - return(0); + return 0; } EXPORT_SYMBOL(hippi_change_mtu); diff --git a/net/802/tr.c b/net/802/tr.c index 1c6e596..5e20cf8 100644 --- a/net/802/tr.c +++ b/net/802/tr.c @@ -145,7 +145,7 @@ static int tr_header(struct sk_buff *skb, struct net_device *dev, { memcpy(trh->daddr,daddr,dev->addr_len); tr_source_route(skb, trh, dev); - return(hdr_len); + return hdr_len; } return -hdr_len; diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index 889f4ac..0eb486d 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -27,7 +27,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, else if (vlan_id) goto drop; - return (polling ? netif_receive_skb(skb) : netif_rx(skb)); + return polling ? netif_receive_skb(skb) : netif_rx(skb); drop: dev_kfree_skb_any(skb); diff --git a/net/9p/client.c b/net/9p/client.c index dc6f2f2..f34b9f5 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -61,13 +61,13 @@ static const match_table_t tokens = { inline int p9_is_proto_dotl(struct p9_client *clnt) { - return (clnt->proto_version == p9_proto_2000L); + return clnt->proto_version == p9_proto_2000L; } EXPORT_SYMBOL(p9_is_proto_dotl); inline int p9_is_proto_dotu(struct p9_client *clnt) { - return (clnt->proto_version == p9_proto_2000u); + return clnt->proto_version == p9_proto_2000u; } EXPORT_SYMBOL(p9_is_proto_dotu); diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 7dca91b..15ea84b 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -179,13 +179,13 @@ static unsigned char rfcomm_crc_table[256] = { /* FCS on 2 bytes */ static inline u8 __fcs(u8 *data) { - return (0xff - __crc(data)); + return 0xff - __crc(data); } /* FCS on 3 bytes */ static inline u8 __fcs2(u8 *data) { - return (0xff - rfcomm_crc_table[__crc(data) ^ data[2]]); + return 0xff - rfcomm_crc_table[__crc(data) ^ data[2]]; } /* Check FCS */ diff --git a/net/core/flow.c b/net/core/flow.c index b143b86..127c8a7 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -176,8 +176,8 @@ static u32 flow_hash_code(struct flow_cache *fc, { u32 *k = (u32 *) key; - return (jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) - & (flow_cache_hash_size(fc) - 1)); + return jhash2(k, (sizeof(*key) / sizeof(u32)), fcp->hash_rnd) + & (flow_cache_hash_size(fc) - 1); } typedef unsigned long flow_compare_t; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index a4e0a74..96b1a749a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -122,7 +122,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return (base ? (net_random() % base) + (base >> 1) : 0); + return base ? (net_random() % base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); @@ -766,9 +766,9 @@ next_elt: static __inline__ int neigh_max_probes(struct neighbour *n) { struct neigh_parms *p = n->parms; - return (n->nud_state & NUD_PROBE ? + return (n->nud_state & NUD_PROBE) ? p->ucast_probes : - p->ucast_probes + p->app_probes + p->mcast_probes); + p->ucast_probes + p->app_probes + p->mcast_probes; } static void neigh_invalidate(struct neighbour *neigh) diff --git a/net/core/utils.c b/net/core/utils.c index ec6bb32..5fea0ab 100644 --- a/net/core/utils.c +++ b/net/core/utils.c @@ -75,7 +75,7 @@ __be32 in_aton(const char *str) str++; } } - return(htonl(l)); + return htonl(l); } EXPORT_SYMBOL(in_aton); diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index 8fc3cbf..497723c 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -116,7 +116,7 @@ u8 tfrc_lh_update_i_mean(struct tfrc_loss_hist *lh, struct sk_buff *skb) cur->li_length = len; tfrc_lh_calc_i_mean(lh); - return (lh->i_mean < old_i_mean); + return lh->i_mean < old_i_mean; } /* Determine if `new_loss' does begin a new loss interval [RFC 4342, 10.2] */ diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index baa98fb..f8c1ae4 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -392,7 +392,7 @@ static int econet_sendmsg(struct kiocb *iocb, struct socket *sock, dev_queue_xmit(skb); dev_put(dev); mutex_unlock(&econet_mutex); - return(len); + return len; out_free: kfree_skb(skb); @@ -637,7 +637,7 @@ static int econet_create(struct net *net, struct socket *sock, int protocol, eo->num = protocol; econet_insert_socket(&econet_sklist, sk); - return(0); + return 0; out: return err; } diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c index 85e7b45..f00ef2f 100644 --- a/net/ethernet/eth.c +++ b/net/ethernet/eth.c @@ -387,6 +387,6 @@ ssize_t sysfs_format_mac(char *buf, const unsigned char *addr, int len) l = _format_mac_addr(buf, PAGE_SIZE, addr, len); l += scnprintf(buf + l, PAGE_SIZE - l, "\n"); - return ((ssize_t) l); + return (ssize_t)l; } EXPORT_SYMBOL(sysfs_format_mac); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index dcfe7e9..4083c18 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -567,7 +567,7 @@ static inline int arp_fwd_proxy(struct in_device *in_dev, if (out_dev) omi = IN_DEV_MEDIUM_ID(out_dev); - return (omi != imi && omi != -1); + return omi != imi && omi != -1; } /* diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 721a8a3..174be6c 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -73,6 +73,6 @@ int ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_id = jiffies; sk_dst_set(sk, &rt->dst); - return(0); + return 0; } EXPORT_SYMBOL(ip4_datagram_connect); diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index e5fa2dd..ba80426 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -425,7 +425,7 @@ static int inet_diag_bc_run(const void *bc, int len, bc += op->no; } } - return (len == 0); + return len == 0; } static int valid_cc(const void *bc, int len, int cc) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index f4dc879..1684408 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -116,11 +116,11 @@ static int ip4_frag_match(struct inet_frag_queue *q, void *a) struct ip4_create_arg *arg = a; qp = container_of(q, struct ipq, q); - return (qp->id == arg->iph->id && + return qp->id == arg->iph->id && qp->saddr == arg->iph->saddr && qp->daddr == arg->iph->daddr && qp->protocol == arg->iph->protocol && - qp->user == arg->user); + qp->user == arg->user; } /* Memory Tracking Functions. */ diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 714b6a8..0967d02 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -659,7 +659,7 @@ drop: rcu_read_unlock(); drop_nolock: kfree_skb(skb); - return(0); + return 0; } static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e8f4f9a..8b642f1 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -72,7 +72,7 @@ static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, for (i = 0; i < len; i++) ret |= (hdr_addr[i] ^ ap->addr[i]) & ap->mask[i]; - return (ret != 0); + return ret != 0; } /* diff --git a/net/ipv4/route.c b/net/ipv4/route.c index e24d48d..ae1d4a4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2791,7 +2791,7 @@ static int ipv4_dst_blackhole(struct net *net, struct rtable **rp, struct flowi dst_release(&(*rp)->dst); *rp = rt; - return (rt ? 0 : -ENOMEM); + return rt ? 0 : -ENOMEM; } int ip_route_output_flow(struct net *net, struct rtable **rp, struct flowi *flp, diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 1bc87a0..51966b3 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2301,7 +2301,7 @@ static inline int tcp_dupack_heuristics(struct tcp_sock *tp) static inline int tcp_skb_timedout(struct sock *sk, struct sk_buff *skb) { - return (tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto); + return tcp_time_stamp - TCP_SKB_CB(skb)->when > inet_csk(sk)->icsk_rto; } static inline int tcp_head_timedout(struct sock *sk) @@ -3398,8 +3398,8 @@ static void tcp_ack_probe(struct sock *sk) static inline int tcp_ack_is_dubious(const struct sock *sk, const int flag) { - return (!(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || - inet_csk(sk)->icsk_ca_state != TCP_CA_Open); + return !(flag & FLAG_NOT_DUP) || (flag & FLAG_CA_ALERT) || + inet_csk(sk)->icsk_ca_state != TCP_CA_Open; } static inline int tcp_may_raise_cwnd(const struct sock *sk, const int flag) @@ -3416,9 +3416,9 @@ static inline int tcp_may_update_window(const struct tcp_sock *tp, const u32 ack, const u32 ack_seq, const u32 nwin) { - return (after(ack, tp->snd_una) || + return after(ack, tp->snd_una) || after(ack_seq, tp->snd_wl1) || - (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd)); + (ack_seq == tp->snd_wl1 && nwin > tp->snd_wnd); } /* Update our send window. diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index f25b56c..43cf901 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -55,7 +55,7 @@ static __inline__ int tcp_in_window(u32 seq, u32 end_seq, u32 s_win, u32 e_win) return 1; if (after(end_seq, s_win) && before(seq, e_win)) return 1; - return (seq == e_win && seq == end_seq); + return seq == e_win && seq == end_seq; } /* diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index ea09d2f..05b1ecf 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1370,9 +1370,9 @@ static inline int tcp_nagle_check(const struct tcp_sock *tp, const struct sk_buff *skb, unsigned mss_now, int nonagle) { - return (skb->len < mss_now && + return skb->len < mss_now && ((nonagle & TCP_NAGLE_CORK) || - (!nonagle && tp->packets_out && tcp_minshall_check(tp)))); + (!nonagle && tp->packets_out && tcp_minshall_check(tp))); } /* Return non-zero if the Nagle test allows this packet to be @@ -1443,10 +1443,10 @@ int tcp_may_send_now(struct sock *sk) struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb = tcp_send_head(sk); - return (skb && + return skb && tcp_snd_test(sk, skb, tcp_current_mss(sk), (tcp_skb_is_last(sk, skb) ? - tp->nonagle : TCP_NAGLE_PUSH))); + tp->nonagle : TCP_NAGLE_PUSH)); } /* Trim TSO SKB to LEN bytes, put the remaining data into a new packet diff --git a/net/ipv4/tcp_westwood.c b/net/ipv4/tcp_westwood.c index 20151d6..a534dda 100644 --- a/net/ipv4/tcp_westwood.c +++ b/net/ipv4/tcp_westwood.c @@ -80,7 +80,7 @@ static void tcp_westwood_init(struct sock *sk) */ static inline u32 westwood_do_filter(u32 a, u32 b) { - return (((7 * a) + b) >> 3); + return ((7 * a) + b) >> 3; } static void westwood_filter(struct westwood *w, u32 delta) diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 5bc893e..89aa543 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -243,7 +243,7 @@ static inline bool addrconf_qdisc_ok(const struct net_device *dev) /* Check if a route is valid prefix route */ static inline int addrconf_is_prefix_route(const struct rt6_info *rt) { - return ((rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0); + return (rt->rt6i_flags & (RTF_GATEWAY | RTF_DEFAULT)) == 0; } static void addrconf_del_timer(struct inet6_ifaddr *ifp) diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c index f0e774c..921dcf6 100644 --- a/net/ipv6/addrlabel.c +++ b/net/ipv6/addrlabel.c @@ -513,10 +513,9 @@ static int ip6addrlbl_dump(struct sk_buff *skb, struct netlink_callback *cb) static inline int ip6addrlbl_msgsize(void) { - return (NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + return NLMSG_ALIGN(sizeof(struct ifaddrlblmsg)) + nla_total_size(16) /* IFAL_ADDRESS */ - + nla_total_size(4) /* IFAL_LABEL */ - ); + + nla_total_size(4); /* IFAL_LABEL */ } static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr* nlh, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 56b9bf2..6022098 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -467,7 +467,7 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, if (ipv6_addr_type(&sin->sin6_addr) & IPV6_ADDR_LINKLOCAL) sin->sin6_scope_id = sk->sk_bound_dev_if; *uaddr_len = sizeof(*sin); - return(0); + return 0; } EXPORT_SYMBOL(inet6_getname); @@ -488,7 +488,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCADDRT: case SIOCDELRT: - return(ipv6_route_ioctl(net, cmd, (void __user *)arg)); + return ipv6_route_ioctl(net, cmd, (void __user *)arg); case SIOCSIFADDR: return addrconf_add_ifaddr(net, (void __user *) arg); @@ -502,7 +502,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ - return(0); + return 0; } EXPORT_SYMBOL(inet6_ioctl); diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c index e1caa5d..14ed0a9 100644 --- a/net/ipv6/exthdrs_core.c +++ b/net/ipv6/exthdrs_core.c @@ -13,12 +13,12 @@ int ipv6_ext_hdr(u8 nexthdr) /* * find out if nexthdr is an extension header or a protocol */ - return ( (nexthdr == NEXTHDR_HOP) || + return (nexthdr == NEXTHDR_HOP) || (nexthdr == NEXTHDR_ROUTING) || (nexthdr == NEXTHDR_FRAGMENT) || (nexthdr == NEXTHDR_AUTH) || (nexthdr == NEXTHDR_NONE) || - (nexthdr == NEXTHDR_DEST) ); + (nexthdr == NEXTHDR_DEST); } /* diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 1838927..efbbbce 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -870,8 +870,8 @@ static inline int ip6_rt_check(struct rt6key *rt_key, struct in6_addr *fl_addr, struct in6_addr *addr_cache) { - return ((rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && - (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache))); + return (rt_key->plen != 128 || !ipv6_addr_equal(fl_addr, &rt_key->addr)) && + (addr_cache == NULL || !ipv6_addr_equal(fl_addr, addr_cache)); } static struct dst_entry *ip6_sk_dst_check(struct sock *sk, diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index 69a0051..b3dd844c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -228,12 +228,12 @@ static struct nd_opt_hdr *ndisc_next_option(struct nd_opt_hdr *cur, do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while(cur < end && cur->nd_opt_type != type); - return (cur <= end && cur->nd_opt_type == type ? cur : NULL); + return cur <= end && cur->nd_opt_type == type ? cur : NULL; } static inline int ndisc_is_useropt(struct nd_opt_hdr *opt) { - return (opt->nd_opt_type == ND_OPT_RDNSS); + return opt->nd_opt_type == ND_OPT_RDNSS; } static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, @@ -244,7 +244,7 @@ static struct nd_opt_hdr *ndisc_next_useropt(struct nd_opt_hdr *cur, do { cur = ((void *)cur) + (cur->nd_opt_len << 3); } while(cur < end && !ndisc_is_useropt(cur)); - return (cur <= end && ndisc_is_useropt(cur) ? cur : NULL); + return cur <= end && ndisc_is_useropt(cur) ? cur : NULL; } static struct ndisc_options *ndisc_parse_options(u8 *opt, int opt_len, @@ -319,7 +319,7 @@ static inline u8 *ndisc_opt_addr_data(struct nd_opt_hdr *p, int prepad = ndisc_addr_option_pad(dev->type); if (lladdrlen != NDISC_OPT_SPACE(dev->addr_len + prepad)) return NULL; - return (lladdr + prepad); + return lladdr + prepad; } int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int dir) diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 8e754be..6b331e9 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -82,13 +82,13 @@ EXPORT_SYMBOL_GPL(ip6t_alloc_initial_table); int ip6t_ext_hdr(u8 nexthdr) { - return ( (nexthdr == IPPROTO_HOPOPTS) || - (nexthdr == IPPROTO_ROUTING) || - (nexthdr == IPPROTO_FRAGMENT) || - (nexthdr == IPPROTO_ESP) || - (nexthdr == IPPROTO_AH) || - (nexthdr == IPPROTO_NONE) || - (nexthdr == IPPROTO_DSTOPTS) ); + return (nexthdr == IPPROTO_HOPOPTS) || + (nexthdr == IPPROTO_ROUTING) || + (nexthdr == IPPROTO_FRAGMENT) || + (nexthdr == IPPROTO_ESP) || + (nexthdr == IPPROTO_AH) || + (nexthdr == IPPROTO_NONE) || + (nexthdr == IPPROTO_DSTOPTS); } /* Returns whether matches rule or not. */ diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index e677937..45e6efb7 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -764,7 +764,7 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, return -EINVAL; if (sin6->sin6_family && sin6->sin6_family != AF_INET6) - return(-EAFNOSUPPORT); + return -EAFNOSUPPORT; /* port is the proto value [0..255] carried in nexthdr */ proto = ntohs(sin6->sin6_port); @@ -772,10 +772,10 @@ static int rawv6_sendmsg(struct kiocb *iocb, struct sock *sk, if (!proto) proto = inet->inet_num; else if (proto != inet->inet_num) - return(-EINVAL); + return -EINVAL; if (proto > 255) - return(-EINVAL); + return -EINVAL; daddr = &sin6->sin6_addr; if (np->sndflow) { @@ -985,7 +985,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, /* You may get strange result with a positive odd offset; RFC2292bis agrees with me. */ if (val > 0 && (val&1)) - return(-EINVAL); + return -EINVAL; if (val < 0) { rp->checksum = 0; } else { @@ -997,7 +997,7 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname, break; default: - return(-ENOPROTOOPT); + return -ENOPROTOOPT; } } @@ -1190,7 +1190,7 @@ static int rawv6_init_sk(struct sock *sk) default: break; } - return(0); + return 0; } struct proto rawv6_prot = { diff --git a/net/ipv6/route.c b/net/ipv6/route.c index d126365..25b0bed 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -217,14 +217,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev, static __inline__ int rt6_check_expired(const struct rt6_info *rt) { - return (rt->rt6i_flags & RTF_EXPIRES && - time_after(jiffies, rt->rt6i_expires)); + return (rt->rt6i_flags & RTF_EXPIRES) && + time_after(jiffies, rt->rt6i_expires); } static inline int rt6_need_strict(struct in6_addr *daddr) { - return (ipv6_addr_type(daddr) & - (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK)); + return ipv6_addr_type(daddr) & + (IPV6_ADDR_MULTICAST | IPV6_ADDR_LINKLOCAL | IPV6_ADDR_LOOPBACK); } /* @@ -440,7 +440,7 @@ static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict) __func__, match); net = dev_net(rt0->rt6i_dev); - return (match ? match : net->ipv6.ip6_null_entry); + return match ? match : net->ipv6.ip6_null_entry; } #ifdef CONFIG_IPV6_ROUTE_INFO @@ -859,7 +859,7 @@ int ip6_dst_blackhole(struct sock *sk, struct dst_entry **dstp, struct flowi *fl dst_release(*dstp); *dstp = new; - return (new ? 0 : -ENOMEM); + return new ? 0 : -ENOMEM; } EXPORT_SYMBOL_GPL(ip6_dst_blackhole); @@ -1070,7 +1070,7 @@ static int ip6_dst_gc(struct dst_ops *ops) net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; out: net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; - return (atomic_read(&ops->entries) > rt_max_size); + return atomic_read(&ops->entries) > rt_max_size; } /* Clean host part of a prefix. Not necessary in radix tree, diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fe6d404..8d93f6d 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -139,7 +139,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, return -EINVAL; if (usin->sin6_family != AF_INET6) - return(-EAFNOSUPPORT); + return -EAFNOSUPPORT; memset(&fl, 0, sizeof(fl)); diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 6baeabb..39676ea 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops) struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); xfrm6_policy_afinfo.garbage_collect(net); - return (atomic_read(&ops->entries) > ops->gc_thresh * 2); + return atomic_read(&ops->entries) > ops->gc_thresh * 2; } static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index fd55b51..bf36351 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -573,9 +573,9 @@ static int irda_find_lsap_sel(struct irda_sock *self, char *name) /* Requested object/attribute doesn't exist */ if((self->errno == IAS_CLASS_UNKNOWN) || (self->errno == IAS_ATTRIB_UNKNOWN)) - return (-EADDRNOTAVAIL); + return -EADDRNOTAVAIL; else - return (-EHOSTUNREACH); + return -EHOSTUNREACH; } /* Get the remote TSAP selector */ @@ -663,7 +663,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) __func__, name); self->daddr = DEV_ADDR_ANY; kfree(discoveries); - return(-ENOTUNIQ); + return -ENOTUNIQ; } /* First time we found that one, save it ! */ daddr = self->daddr; @@ -677,7 +677,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) IRDA_DEBUG(0, "%s(), unexpected IAS query failure\n", __func__); self->daddr = DEV_ADDR_ANY; kfree(discoveries); - return(-EHOSTUNREACH); + return -EHOSTUNREACH; break; } } @@ -689,7 +689,7 @@ static int irda_discover_daddr_and_lsap_sel(struct irda_sock *self, char *name) IRDA_DEBUG(1, "%s(), cannot discover service ''%s'' in any device !!!\n", __func__, name); self->daddr = DEV_ADDR_ANY; - return(-EADDRNOTAVAIL); + return -EADDRNOTAVAIL; } /* Revert back to discovered device & service */ @@ -2465,9 +2465,9 @@ bed: /* Requested object/attribute doesn't exist */ if((self->errno == IAS_CLASS_UNKNOWN) || (self->errno == IAS_ATTRIB_UNKNOWN)) - return (-EADDRNOTAVAIL); + return -EADDRNOTAVAIL; else - return (-EHOSTUNREACH); + return -EHOSTUNREACH; } /* Translate from internal to user structure */ diff --git a/net/irda/discovery.c b/net/irda/discovery.c index c1c8ae9..36c3f03 100644 --- a/net/irda/discovery.c +++ b/net/irda/discovery.c @@ -315,7 +315,7 @@ struct irda_device_info *irlmp_copy_discoveries(hashbin_t *log, int *pn, /* Get the actual number of device in the buffer and return */ *pn = i; - return(buffer); + return buffer; } #ifdef CONFIG_PROC_FS diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c index faa82ca..a39cca8 100644 --- a/net/irda/ircomm/ircomm_tty.c +++ b/net/irda/ircomm/ircomm_tty.c @@ -449,8 +449,8 @@ static int ircomm_tty_open(struct tty_struct *tty, struct file *filp) } #ifdef SERIAL_DO_RESTART - return ((self->flags & ASYNC_HUP_NOTIFY) ? - -EAGAIN : -ERESTARTSYS); + return (self->flags & ASYNC_HUP_NOTIFY) ? + -EAGAIN : -ERESTARTSYS; #else return -EAGAIN; #endif diff --git a/net/irda/irlmp.c b/net/irda/irlmp.c index 0e7d8bd..6115a44 100644 --- a/net/irda/irlmp.c +++ b/net/irda/irlmp.c @@ -939,7 +939,7 @@ struct irda_device_info *irlmp_get_discoveries(int *pn, __u16 mask, int nslots) } /* Return current cached discovery log */ - return(irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE)); + return irlmp_copy_discoveries(irlmp->cachelog, pn, mask, TRUE); } EXPORT_SYMBOL(irlmp_get_discoveries); diff --git a/net/irda/irlmp_frame.c b/net/irda/irlmp_frame.c index 3750884..062e63b 100644 --- a/net/irda/irlmp_frame.c +++ b/net/irda/irlmp_frame.c @@ -448,7 +448,7 @@ static struct lsap_cb *irlmp_find_lsap(struct lap_cb *self, __u8 dlsap_sel, (self->cache.slsap_sel == slsap_sel) && (self->cache.dlsap_sel == dlsap_sel)) { - return (self->cache.lsap); + return self->cache.lsap; } #endif diff --git a/net/irda/irnet/irnet_irda.c b/net/irda/irnet/irnet_irda.c index e98e40d..7f17a80 100644 --- a/net/irda/irnet/irnet_irda.c +++ b/net/irda/irnet/irnet_irda.c @@ -238,7 +238,7 @@ irnet_ias_to_tsap(irnet_socket * self, DEXIT(IRDA_SR_TRACE, "\n"); /* Return the TSAP */ - return(dtsap_sel); + return dtsap_sel; } /*------------------------------------------------------------------*/ @@ -301,7 +301,7 @@ irnet_connect_tsap(irnet_socket * self) { clear_bit(0, &self->ttp_connect); DERROR(IRDA_SR_ERROR, "connect aborted!\n"); - return(err); + return err; } /* Connect to remote device */ @@ -312,7 +312,7 @@ irnet_connect_tsap(irnet_socket * self) { clear_bit(0, &self->ttp_connect); DERROR(IRDA_SR_ERROR, "connect aborted!\n"); - return(err); + return err; } /* The above call is non-blocking. @@ -321,7 +321,7 @@ irnet_connect_tsap(irnet_socket * self) * See you there ;-) */ DEXIT(IRDA_SR_TRACE, "\n"); - return(err); + return err; } /*------------------------------------------------------------------*/ @@ -362,10 +362,10 @@ irnet_discover_next_daddr(irnet_socket * self) /* The above request is non-blocking. * After a while, IrDA will call us back in irnet_discovervalue_confirm() * We will then call irnet_ias_to_tsap() and come back here again... */ - return(0); + return 0; } else - return(1); + return 1; } /*------------------------------------------------------------------*/ @@ -436,7 +436,7 @@ irnet_discover_daddr_and_lsap_sel(irnet_socket * self) /* Follow me in irnet_discovervalue_confirm() */ DEXIT(IRDA_SR_TRACE, "\n"); - return(0); + return 0; } /*------------------------------------------------------------------*/ @@ -485,7 +485,7 @@ irnet_dname_to_daddr(irnet_socket * self) /* No luck ! */ DEBUG(IRDA_SR_INFO, "cannot discover device ``%s'' !!!\n", self->rname); kfree(discoveries); - return(-EADDRNOTAVAIL); + return -EADDRNOTAVAIL; } @@ -527,7 +527,7 @@ irda_irnet_create(irnet_socket * self) INIT_WORK(&self->disconnect_work, irnet_ppp_disconnect); DEXIT(IRDA_SOCK_TRACE, "\n"); - return(0); + return 0; } /*------------------------------------------------------------------*/ @@ -601,7 +601,7 @@ irda_irnet_connect(irnet_socket * self) * We will finish the connection procedure in irnet_connect_tsap(). */ DEXIT(IRDA_SOCK_TRACE, "\n"); - return(0); + return 0; } /*------------------------------------------------------------------*/ @@ -733,7 +733,7 @@ irnet_daddr_to_dname(irnet_socket * self) /* No luck ! */ DEXIT(IRDA_SERV_INFO, ": cannot discover device 0x%08x !!!\n", self->daddr); kfree(discoveries); - return(-EADDRNOTAVAIL); + return -EADDRNOTAVAIL; } /*------------------------------------------------------------------*/ diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index dfe7b38..69f1fa6 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -166,7 +166,7 @@ irnet_ctrl_write(irnet_socket * ap, } /* Success : we have parsed all commands successfully */ - return(count); + return count; } #ifdef INITIAL_DISCOVERY @@ -300,7 +300,7 @@ irnet_ctrl_read(irnet_socket * ap, } DEXIT(CTRL_TRACE, "\n"); - return(strlen(event)); + return strlen(event); } #endif /* INITIAL_DISCOVERY */ @@ -409,7 +409,7 @@ irnet_ctrl_read(irnet_socket * ap, } DEXIT(CTRL_TRACE, "\n"); - return(strlen(event)); + return strlen(event); } /*------------------------------------------------------------------*/ @@ -623,7 +623,7 @@ dev_irnet_poll(struct file * file, mask |= irnet_ctrl_poll(ap, file, wait); DEXIT(FS_TRACE, " - mask=0x%X\n", mask); - return(mask); + return mask; } /*------------------------------------------------------------------*/ diff --git a/net/key/af_key.c b/net/key/af_key.c index 43040e9..d87c22d 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -565,12 +565,12 @@ pfkey_proto2satype(uint16_t proto) static uint8_t pfkey_proto_to_xfrm(uint8_t proto) { - return (proto == IPSEC_PROTO_ANY ? 0 : proto); + return proto == IPSEC_PROTO_ANY ? 0 : proto; } static uint8_t pfkey_proto_from_xfrm(uint8_t proto) { - return (proto ? proto : IPSEC_PROTO_ANY); + return proto ? proto : IPSEC_PROTO_ANY; } static inline int pfkey_sockaddr_len(sa_family_t family) diff --git a/net/mac80211/rate.c b/net/mac80211/rate.c index 4f772de..b0cc385 100644 --- a/net/mac80211/rate.c +++ b/net/mac80211/rate.c @@ -207,7 +207,7 @@ static bool rc_no_data_or_no_ack(struct ieee80211_tx_rate_control *txrc) fc = hdr->frame_control; - return ((info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc)); + return (info->flags & IEEE80211_TX_CTL_NO_ACK) || !ieee80211_is_data(fc); } static void rc_send_low_broadcast(s8 *idx, u32 basic_rates, u8 max_rate_idx) diff --git a/net/rfkill/input.c b/net/rfkill/input.c index 3713d7e..1bca6d4 100644 --- a/net/rfkill/input.c +++ b/net/rfkill/input.c @@ -142,7 +142,7 @@ static unsigned long rfkill_last_scheduled; static unsigned long rfkill_ratelimit(const unsigned long last) { const unsigned long delay = msecs_to_jiffies(RFKILL_OPS_DELAY); - return (time_after(jiffies, last + delay)) ? 0 : delay; + return time_after(jiffies, last + delay) ? 0 : delay; } static void rfkill_schedule_ratelimited(void) diff --git a/net/rose/rose_link.c b/net/rose/rose_link.c index a750a28..fa5f564 100644 --- a/net/rose/rose_link.c +++ b/net/rose/rose_link.c @@ -114,7 +114,7 @@ static int rose_send_frame(struct sk_buff *skb, struct rose_neigh *neigh) if (ax25s) ax25_cb_put(ax25s); - return (neigh->ax25 != NULL); + return neigh->ax25 != NULL; } /* @@ -137,7 +137,7 @@ static int rose_link_up(struct rose_neigh *neigh) if (ax25s) ax25_cb_put(ax25s); - return (neigh->ax25 != NULL); + return neigh->ax25 != NULL; } /* diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f774e65..1ef29c7 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -799,7 +799,7 @@ static void sctp_inet_skb_msgname(struct sk_buff *skb, char *msgname, int *len) static int sctp_inet_af_supported(sa_family_t family, struct sctp_sock *sp) { /* PF_INET only supports AF_INET addresses. */ - return (AF_INET == family); + return AF_INET == family; } /* Address matching with wildcards allowed. */ diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 6a691d8..535659f 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3884,7 +3884,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, } out: - return (retval); + return retval; } @@ -3940,7 +3940,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, } out: - return (retval); + return retval; } /* 7.1.12 Enable/Disable message fragmentation (SCTP_DISABLE_FRAGMENTS) @@ -5594,7 +5594,7 @@ static int sctp_get_port(struct sock *sk, unsigned short snum) /* Note: sk->sk_num gets filled in if ephemeral port request. */ ret = sctp_get_port_local(sk, &addr); - return (ret ? 1 : 0); + return ret ? 1 : 0; } /* diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index dcfc66b..597c493 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1049,7 +1049,7 @@ gss_match(struct auth_cred *acred, struct rpc_cred *rc, int flags) out: if (acred->machine_cred != gss_cred->gc_machine_cred) return 0; - return (rc->cr_uid == acred->uid); + return rc->cr_uid == acred->uid; } /* diff --git a/net/sunrpc/auth_gss/gss_generic_token.c b/net/sunrpc/auth_gss/gss_generic_token.c index 310b78e..c586e92 100644 --- a/net/sunrpc/auth_gss/gss_generic_token.c +++ b/net/sunrpc/auth_gss/gss_generic_token.c @@ -76,19 +76,19 @@ static int der_length_size( int length) { if (length < (1<<7)) - return(1); + return 1; else if (length < (1<<8)) - return(2); + return 2; #if (SIZEOF_INT == 2) else - return(3); + return 3; #else else if (length < (1<<16)) - return(3); + return 3; else if (length < (1<<24)) - return(4); + return 4; else - return(5); + return 5; #endif } @@ -121,14 +121,14 @@ der_read_length(unsigned char **buf, int *bufsize) int ret; if (*bufsize < 1) - return(-1); + return -1; sf = *(*buf)++; (*bufsize)--; if (sf & 0x80) { if ((sf &= 0x7f) > ((*bufsize)-1)) - return(-1); + return -1; if (sf > SIZEOF_INT) - return (-1); + return -1; ret = 0; for (; sf; sf--) { ret = (ret<<8) + (*(*buf)++); @@ -138,7 +138,7 @@ der_read_length(unsigned char **buf, int *bufsize) ret = sf; } - return(ret); + return ret; } /* returns the length of a token, given the mech oid and the body size */ @@ -148,7 +148,7 @@ g_token_size(struct xdr_netobj *mech, unsigned int body_size) { /* set body_size to sequence contents size */ body_size += 2 + (int) mech->len; /* NEED overflow check */ - return(1 + der_length_size(body_size) + body_size); + return 1 + der_length_size(body_size) + body_size; } EXPORT_SYMBOL_GPL(g_token_size); @@ -186,27 +186,27 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size, int ret = 0; if ((toksize-=1) < 0) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if (*buf++ != 0x60) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if ((seqsize = der_read_length(&buf, &toksize)) < 0) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if (seqsize != toksize) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if ((toksize-=1) < 0) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if (*buf++ != 0x06) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if ((toksize-=1) < 0) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; toid.len = *buf++; if ((toksize-=toid.len) < 0) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; toid.data = buf; buf+=toid.len; @@ -217,17 +217,17 @@ g_verify_token_header(struct xdr_netobj *mech, int *body_size, to return G_BAD_TOK_HEADER if the token header is in fact bad */ if ((toksize-=2) < 0) - return(G_BAD_TOK_HEADER); + return G_BAD_TOK_HEADER; if (ret) - return(ret); + return ret; if (!ret) { *buf_in = buf; *body_size = toksize; } - return(ret); + return ret; } EXPORT_SYMBOL_GPL(g_verify_token_header); diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c index 415c013..62ac90c 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c +++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c @@ -162,5 +162,5 @@ krb5_get_seq_num(struct krb5_ctx *kctx, *seqnum = ((plain[0]) | (plain[1] << 8) | (plain[2] << 16) | (plain[3] << 24)); - return (0); + return 0; } diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 2689de3..8b40610 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -331,7 +331,7 @@ gss_delete_sec_context(struct gss_ctx **context_handle) *context_handle); if (!*context_handle) - return(GSS_S_NO_CONTEXT); + return GSS_S_NO_CONTEXT; if ((*context_handle)->internal_ctx_id) (*context_handle)->mech_type->gm_ops ->gss_delete_sec_context((*context_handle) diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index cace604..aa5dbda 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -376,7 +376,7 @@ int rpc_queue_empty(struct rpc_wait_queue *queue) spin_lock_bh(&queue->lock); res = queue->qlen; spin_unlock_bh(&queue->lock); - return (res == 0); + return res == 0; } EXPORT_SYMBOL_GPL(rpc_queue_empty); diff --git a/net/tipc/addr.c b/net/tipc/addr.c index c048543..2ddc351 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -89,7 +89,7 @@ int tipc_addr_domain_valid(u32 addr) int tipc_addr_node_valid(u32 addr) { - return (tipc_addr_domain_valid(addr) && tipc_node(addr)); + return tipc_addr_domain_valid(addr) && tipc_node(addr); } int tipc_in_scope(u32 domain, u32 addr) diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index b11248c..ecfaac1 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -184,7 +184,7 @@ static void bclink_set_gap(struct tipc_node *n_ptr) static int bclink_ack_allowed(u32 n) { - return((n % TIPC_MIN_LINK_WIN) == tipc_own_tag); + return (n % TIPC_MIN_LINK_WIN) == tipc_own_tag; } diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 52ae17b..9c10c6b 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -63,7 +63,7 @@ static int media_name_valid(const char *name) len = strlen(name); if ((len + 1) > TIPC_MAX_MEDIA_NAME) return 0; - return (strspn(name, tipc_alphabet) == len); + return strspn(name, tipc_alphabet) == len; } /** diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c index 1885a7e..6569d45 100644 --- a/net/tipc/dbg.c +++ b/net/tipc/dbg.c @@ -134,7 +134,7 @@ void tipc_printbuf_reset(struct print_buf *pb) int tipc_printbuf_empty(struct print_buf *pb) { - return (!pb->buf || (pb->crs == pb->buf)); + return !pb->buf || (pb->crs == pb->buf); } /** @@ -169,7 +169,7 @@ int tipc_printbuf_validate(struct print_buf *pb) tipc_printf(pb, err); } } - return (pb->crs - pb->buf + 1); + return pb->crs - pb->buf + 1; } /** diff --git a/net/tipc/link.c b/net/tipc/link.c index a6a3102..b8cf1e9 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -239,13 +239,13 @@ int tipc_link_is_up(struct link *l_ptr) { if (!l_ptr) return 0; - return (link_working_working(l_ptr) || link_working_unknown(l_ptr)); + return link_working_working(l_ptr) || link_working_unknown(l_ptr); } int tipc_link_is_active(struct link *l_ptr) { - return ((l_ptr->owner->active_links[0] == l_ptr) || - (l_ptr->owner->active_links[1] == l_ptr)); + return (l_ptr->owner->active_links[0] == l_ptr) || + (l_ptr->owner->active_links[1] == l_ptr); } /** diff --git a/net/tipc/link.h b/net/tipc/link.h index 2e5385c..26151d3 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -279,12 +279,12 @@ static inline int between(u32 lower, u32 upper, u32 n) static inline int less_eq(u32 left, u32 right) { - return (mod(right - left) < 32768u); + return mod(right - left) < 32768u; } static inline int less(u32 left, u32 right) { - return (less_eq(left, right) && (mod(right) != mod(left))); + return less_eq(left, right) && (mod(right) != mod(left)); } static inline u32 lesser(u32 left, u32 right) @@ -299,32 +299,32 @@ static inline u32 lesser(u32 left, u32 right) static inline int link_working_working(struct link *l_ptr) { - return (l_ptr->state == WORKING_WORKING); + return l_ptr->state == WORKING_WORKING; } static inline int link_working_unknown(struct link *l_ptr) { - return (l_ptr->state == WORKING_UNKNOWN); + return l_ptr->state == WORKING_UNKNOWN; } static inline int link_reset_unknown(struct link *l_ptr) { - return (l_ptr->state == RESET_UNKNOWN); + return l_ptr->state == RESET_UNKNOWN; } static inline int link_reset_reset(struct link *l_ptr) { - return (l_ptr->state == RESET_RESET); + return l_ptr->state == RESET_RESET; } static inline int link_blocked(struct link *l_ptr) { - return (l_ptr->exp_msg_count || l_ptr->blocked); + return l_ptr->exp_msg_count || l_ptr->blocked; } static inline int link_congested(struct link *l_ptr) { - return (l_ptr->out_queue_size >= l_ptr->queue_limit[0]); + return l_ptr->out_queue_size >= l_ptr->queue_limit[0]; } #endif diff --git a/net/tipc/msg.h b/net/tipc/msg.h index 995d2da..031aad1 100644 --- a/net/tipc/msg.h +++ b/net/tipc/msg.h @@ -104,7 +104,7 @@ static inline u32 msg_user(struct tipc_msg *m) static inline u32 msg_isdata(struct tipc_msg *m) { - return (msg_user(m) <= TIPC_CRITICAL_IMPORTANCE); + return msg_user(m) <= TIPC_CRITICAL_IMPORTANCE; } static inline void msg_set_user(struct tipc_msg *m, u32 n) @@ -289,7 +289,7 @@ static inline void msg_set_destnode(struct tipc_msg *m, u32 a) static inline int msg_is_dest(struct tipc_msg *m, u32 d) { - return(msg_short(m) || (msg_destnode(m) == d)); + return msg_short(m) || (msg_destnode(m) == d); } static inline u32 msg_routed(struct tipc_msg *m) @@ -632,7 +632,7 @@ static inline void msg_set_bcast_tag(struct tipc_msg *m, u32 n) static inline u32 msg_max_pkt(struct tipc_msg *m) { - return (msg_bits(m, 9, 16, 0xffff) * 4); + return msg_bits(m, 9, 16, 0xffff) * 4; } static inline void msg_set_max_pkt(struct tipc_msg *m, u32 n) diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index c13c2c7..9ca4b06 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -116,7 +116,7 @@ DEFINE_RWLOCK(tipc_nametbl_lock); static int hash(int x) { - return(x & (tipc_nametbl_size - 1)); + return x & (tipc_nametbl_size - 1); } /** diff --git a/net/tipc/node.c b/net/tipc/node.c index b702c7b..7c49cd0 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -242,17 +242,17 @@ int tipc_node_has_active_links(struct tipc_node *n_ptr) int tipc_node_has_redundant_links(struct tipc_node *n_ptr) { - return (n_ptr->working_links > 1); + return n_ptr->working_links > 1; } static int tipc_node_has_active_routes(struct tipc_node *n_ptr) { - return (n_ptr && (n_ptr->last_router >= 0)); + return n_ptr && (n_ptr->last_router >= 0); } int tipc_node_is_up(struct tipc_node *n_ptr) { - return (tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr)); + return tipc_node_has_active_links(n_ptr) || tipc_node_has_active_routes(n_ptr); } struct tipc_node *tipc_node_attach_link(struct link *l_ptr) diff --git a/net/tipc/port.h b/net/tipc/port.h index 8d1652a..e74bd956 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -157,7 +157,7 @@ static inline u32 tipc_peer_node(struct port *p_ptr) static inline int tipc_port_congested(struct port *p_ptr) { - return((p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2)); + return (p_ptr->sent - p_ptr->acked) >= (TIPC_FLOW_CONTROL_WIN * 2); } /** diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f7ac94d..33217fc 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -1195,7 +1195,7 @@ static int rx_queue_full(struct tipc_msg *msg, u32 queue_size, u32 base) if (msg_connected(msg)) threshold *= 4; - return (queue_size >= threshold); + return queue_size >= threshold; } /** diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index ab6eab4..1a5b9a6 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -604,6 +604,6 @@ int tipc_ispublished(struct tipc_name const *name) { u32 domain = 0; - return(tipc_nametbl_translate(name->type, name->instance,&domain) != 0); + return tipc_nametbl_translate(name->type, name->instance, &domain) != 0; } diff --git a/net/wireless/core.h b/net/wireless/core.h index 37580e0..5d89310 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -86,7 +86,7 @@ struct cfg80211_registered_device *wiphy_to_dev(struct wiphy *wiphy) static inline bool wiphy_idx_valid(int wiphy_idx) { - return (wiphy_idx >= 0); + return wiphy_idx >= 0; } -- cgit v1.1 From 8dcb20038ade81f9a87c024e7f12ec74f0e95f33 Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 28 Aug 2010 19:36:10 +0300 Subject: mac80211: Filter ProbeReq SuppRates based on TX rate mask If the TX rate set has been masked, the removed rates can also be removed from the Supported Rates and Extended Supported Rates IEs in Probe Request frames. Signed-off-by: Jouni Malinen Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 2 +- net/mac80211/scan.c | 2 +- net/mac80211/util.c | 37 ++++++++++++++++++++++--------------- 3 files changed, 24 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 9346a6b..3641563 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1256,7 +1256,7 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, const u8 *key, u8 key_len, u8 key_idx); int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, const u8 *ie, size_t ie_len, - enum ieee80211_band band); + enum ieee80211_band band, u32 rate_mask); void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index d60389b..1623e9d 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -242,7 +242,7 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) local->hw_scan_req->n_channels = n_chans; ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie, - req->ie, req->ie_len, band); + req->ie, req->ie_len, band, (u32) -1); local->hw_scan_req->ie_len = ielen; return true; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 737f426..bfd19d7 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -895,26 +895,33 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, const u8 *ie, size_t ie_len, - enum ieee80211_band band) + enum ieee80211_band band, u32 rate_mask) { struct ieee80211_supported_band *sband; u8 *pos; size_t offset = 0, noffset; int supp_rates_len, i; + u8 rates[32]; + int num_rates; + int ext_rates_len; sband = local->hw.wiphy->bands[band]; pos = buffer; - supp_rates_len = min_t(int, sband->n_bitrates, 8); + num_rates = 0; + for (i = 0; i < sband->n_bitrates; i++) { + if ((BIT(i) & rate_mask) == 0) + continue; /* skip rate */ + rates[num_rates++] = (u8) (sband->bitrates[i].bitrate / 5); + } + + supp_rates_len = min_t(int, num_rates, 8); *pos++ = WLAN_EID_SUPP_RATES; *pos++ = supp_rates_len; - - for (i = 0; i < supp_rates_len; i++) { - int rate = sband->bitrates[i].bitrate; - *pos++ = (u8) (rate / 5); - } + memcpy(pos, rates, supp_rates_len); + pos += supp_rates_len; /* insert "request information" if in custom IEs */ if (ie && ie_len) { @@ -932,14 +939,12 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, offset = noffset; } - if (sband->n_bitrates > i) { + ext_rates_len = num_rates - supp_rates_len; + if (ext_rates_len > 0) { *pos++ = WLAN_EID_EXT_SUPP_RATES; - *pos++ = sband->n_bitrates - i; - - for (; i < sband->n_bitrates; i++) { - int rate = sband->bitrates[i].bitrate; - *pos++ = (u8) (rate / 5); - } + *pos++ = ext_rates_len; + memcpy(pos, rates + supp_rates_len, ext_rates_len); + pos += ext_rates_len; } /* insert custom IEs that go before HT */ @@ -1018,7 +1023,9 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, } buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len, - local->hw.conf.channel->band); + local->hw.conf.channel->band, + sdata->rc_rateidx_mask + [local->hw.conf.channel->band]); skb = ieee80211_probereq_get(&local->hw, &sdata->vif, ssid, ssid_len, -- cgit v1.1 From 651b52254fc061f02d965524e71de4333a009a5a Mon Sep 17 00:00:00 2001 From: Jouni Malinen Date: Sat, 28 Aug 2010 19:37:51 +0300 Subject: mac80211: Add DS Parameter Set into Probe Request on 2.4 GHz IEEE Std 802.11k-2008 added DS Parameter Set information element into Probe Request frames as an optional information on 2.4 GHz band (and mandatory, if radio measurements are enabled). This allows APs to filter out Probe Request frames that may be received from neighboring overlapping channels and by doing so, reduce the number of unnecessary frames in the air. Make mac80211 add this IE into Probe Request frames whenever the channel is known (i.e., whenever hwscan is not used). Signed-off-by: Jouni Malinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 3 ++- net/mac80211/scan.c | 3 ++- net/mac80211/util.c | 16 ++++++++++++++-- 3 files changed, 18 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 3641563..78a8d92 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1256,7 +1256,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, const u8 *key, u8 key_len, u8 key_idx); int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, const u8 *ie, size_t ie_len, - enum ieee80211_band band, u32 rate_mask); + enum ieee80211_band band, u32 rate_mask, + u8 channel); void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, const u8 *ssid, size_t ssid_len, const u8 *ie, size_t ie_len); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 1623e9d..5171a95 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -242,7 +242,8 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) local->hw_scan_req->n_channels = n_chans; ielen = ieee80211_build_preq_ies(local, (u8 *)local->hw_scan_req->ie, - req->ie, req->ie_len, band, (u32) -1); + req->ie, req->ie_len, band, (u32) -1, + 0); local->hw_scan_req->ie_len = ielen; return true; diff --git a/net/mac80211/util.c b/net/mac80211/util.c index bfd19d7..aba025d 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -895,7 +895,8 @@ void ieee80211_send_auth(struct ieee80211_sub_if_data *sdata, int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, const u8 *ie, size_t ie_len, - enum ieee80211_band band, u32 rate_mask) + enum ieee80211_band band, u32 rate_mask, + u8 channel) { struct ieee80211_supported_band *sband; u8 *pos; @@ -947,6 +948,12 @@ int ieee80211_build_preq_ies(struct ieee80211_local *local, u8 *buffer, pos += ext_rates_len; } + if (channel && sband->band == IEEE80211_BAND_2GHZ) { + *pos++ = WLAN_EID_DS_PARAMS; + *pos++ = 1; + *pos++ = channel; + } + /* insert custom IEs that go before HT */ if (ie && ie_len) { static const u8 before_ht[] = { @@ -1013,6 +1020,7 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, struct ieee80211_mgmt *mgmt; size_t buf_len; u8 *buf; + u8 chan; /* FIXME: come up with a proper value */ buf = kmalloc(200 + ie_len, GFP_KERNEL); @@ -1022,10 +1030,14 @@ void ieee80211_send_probe_req(struct ieee80211_sub_if_data *sdata, u8 *dst, return; } + chan = ieee80211_frequency_to_channel( + local->hw.conf.channel->center_freq); + buf_len = ieee80211_build_preq_ies(local, buf, ie, ie_len, local->hw.conf.channel->band, sdata->rc_rateidx_mask - [local->hw.conf.channel->band]); + [local->hw.conf.channel->band], + chan); skb = ieee80211_probereq_get(&local->hw, &sdata->vif, ssid, ssid_len, -- cgit v1.1 From eb7d3066cf864342e8ae6a5c1126a1602c4d06c0 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Tue, 21 Sep 2010 21:36:18 +0200 Subject: mac80211: clear txflags for ps-filtered frames This patch fixes stale mac80211_tx_control_flags for filtered / retried frames. Because ieee80211_handle_filtered_frame feeds skbs back into the tx path, they have to be stripped of some tx flags so they won't confuse the stack, driver or device. Cc: Acked-by: Johannes Berg Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/status.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 571b32b..dd85006 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -58,6 +58,7 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local, info->control.vif = &sta->sdata->vif; info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING | IEEE80211_TX_INTFL_RETRANSMISSION; + info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS; sta->tx_filtered_count++; -- cgit v1.1 From 59104f062435c7816e39ee5ed504a69cb8037f10 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 20 Sep 2010 20:16:27 +0000 Subject: ip: take care of last fragment in ip_append_data While investigating a bit, I found ip_fragment() slow path was taken because ip_append_data() provides following layout for a send(MTU + N*(MTU - 20)) syscall : - one skb with 1500 (mtu) bytes - N fragments of 1480 (mtu-20) bytes (before adding IP header) last fragment gets 17 bytes of trail data because of following bit: if (datalen == length + fraggap) alloclen += rt->dst.trailer_len; Then esp4 adds 16 bytes of data (while trailer_len is 17... hmm... another bug ?) In ip_fragment(), we notice last fragment is too big (1496 + 20) > mtu, so we take slow path, building another skb chain. In order to avoid taking slow path, we should correct ip_append_data() to make sure last fragment has real trail space, under mtu... Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_output.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index e427620..3551b6d 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -926,16 +926,19 @@ alloc_new_skb: !(rt->dst.dev->features&NETIF_F_SG)) alloclen = mtu; else - alloclen = datalen + fragheaderlen; + alloclen = fraglen; /* The last fragment gets additional space at tail. * Note, with MSG_MORE we overallocate on fragments, * because we have no idea what fragment will be * the last. */ - if (datalen == length + fraggap) + if (datalen == length + fraggap) { alloclen += rt->dst.trailer_len; - + /* make sure mtu is not reached */ + if (datalen > mtu - fragheaderlen - rt->dst.trailer_len) + datalen -= ALIGN(rt->dst.trailer_len, 8); + } if (transhdrlen) { skb = sock_alloc_send_skb(sk, alloclen + hh_len + 15, -- cgit v1.1 From c5256c51232d8312755e8de2b514c426b19b101a Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 00:46:11 +0000 Subject: net: propagate NETIF_F_HIGHDMA to vlans Automatically allows vlans to get NETIF_F_HIGHDMA if underlying device supports it. On 32bit arches (and more precisely if CONFIG_HIGHMEM is enabled), it can help to reduce cost of illegal_highdma() and __skb_linearize() calls. Tested on tg3 , bnx2, bonding, this worked very well. This is a generalization of a patch provided by Yi Zou & Jeff Kirsher. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 2c7934f..e0c0b86 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5058,10 +5058,11 @@ int register_netdevice(struct net_device *dev) if (dev->features & NETIF_F_SG) dev->features |= NETIF_F_GSO; - /* Enable GRO for vlans by default if dev->features has GRO also. - * vlan_dev_init() will do the dev->features check. + /* Enable GRO and NETIF_F_HIGHDMA for vlans by default, + * vlan_dev_init() will do the dev->features check, so these features + * are enabled only if supported by underlying device. */ - dev->vlan_features |= NETIF_F_GRO; + dev->vlan_features |= (NETIF_F_GRO | NETIF_F_HIGHDMA); ret = call_netdevice_notifiers(NETDEV_POST_INIT, dev); ret = notifier_to_errno(ret); -- cgit v1.1 From 1b4bf461f05d56ced6d6b8f3b4831adc7076f565 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 17:26:35 +0000 Subject: rps: allocate rx queues in register_netdevice only Instead of having two places were we allocate dev->_rx, introduce netif_alloc_rx_queues() helper and call it only from register_netdevice(), not from alloc_netdev_mq() Goal is to let drivers change dev->num_rx_queues after allocating netdev and before registering it. This also removes a lot of ifdefs in net/core/dev.c Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 76 +++++++++++++++++++++++++--------------------------------- 1 file changed, 32 insertions(+), 44 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index e0c0b86..72e9983 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -4964,6 +4964,34 @@ void netif_stacked_transfer_operstate(const struct net_device *rootdev, } EXPORT_SYMBOL(netif_stacked_transfer_operstate); +static int netif_alloc_rx_queues(struct net_device *dev) +{ +#ifdef CONFIG_RPS + unsigned int i, count = dev->num_rx_queues; + + if (count) { + struct netdev_rx_queue *rx; + + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", + count); + return -ENOMEM; + } + dev->_rx = rx; + atomic_set(&rx->count, count); + + /* + * Set a pointer to first element in the array which holds the + * reference count. + */ + for (i = 0; i < count; i++) + rx[i].first = rx; + } +#endif + return 0; +} + /** * register_netdevice - register a network device * @dev: device to register @@ -5001,24 +5029,10 @@ int register_netdevice(struct net_device *dev) dev->iflink = -1; -#ifdef CONFIG_RPS - if (!dev->num_rx_queues) { - /* - * Allocate a single RX queue if driver never called - * alloc_netdev_mq - */ - - dev->_rx = kzalloc(sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!dev->_rx) { - ret = -ENOMEM; - goto out; - } + ret = netif_alloc_rx_queues(dev); + if (ret) + goto out; - dev->_rx->first = dev->_rx; - atomic_set(&dev->_rx->count, 1); - dev->num_rx_queues = 1; - } -#endif /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5415,10 +5429,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, struct net_device *dev; size_t alloc_size; struct net_device *p; -#ifdef CONFIG_RPS - struct netdev_rx_queue *rx; - int i; -#endif BUG_ON(strlen(name) >= sizeof(dev->name)); @@ -5444,29 +5454,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, goto free_p; } -#ifdef CONFIG_RPS - rx = kcalloc(queue_count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "rx queues.\n"); - goto free_tx; - } - - atomic_set(&rx->count, queue_count); - - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ - for (i = 0; i < queue_count; i++) - rx[i].first = rx; -#endif dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; if (dev_addr_init(dev)) - goto free_rx; + goto free_tx; dev_mc_init(dev); dev_uc_init(dev); @@ -5478,7 +5471,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->real_num_tx_queues = queue_count; #ifdef CONFIG_RPS - dev->_rx = rx; dev->num_rx_queues = queue_count; #endif @@ -5496,11 +5488,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; -free_rx: -#ifdef CONFIG_RPS - kfree(rx); free_tx: -#endif kfree(tx); free_p: kfree(p); -- cgit v1.1 From 83180af0b0ea166adf8249f4513beb7355f9b4c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 23 Sep 2010 21:46:03 +0000 Subject: net: fix rcu use in ip_route_output_slow __in_dev_get_rtnl(dev_out) is called while RTNL is not held, thus triggers a lockdep fault. At this point, we only perform a raw test of dev_out->ip_ptr being NULL, we dont need to make sure ip_ptr cant changed right after. We can use rcu_dereference_raw() for this. Reported-by: Andrew Morton Acked-by: Paul E. McKenney Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ae1d4a4..98beda4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2579,7 +2579,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, goto out; /* RACE: Check return value of inet_select_addr instead. */ - if (__in_dev_get_rtnl(dev_out) == NULL) { + if (rcu_dereference_raw(dev_out->ip_ptr) == NULL) { dev_put(dev_out); goto out; /* Wrong error code */ } -- cgit v1.1 From 295bafb47b0d365e1b4f747dffef29e590f13233 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Wed, 22 Sep 2010 20:29:01 -0700 Subject: mac80211: Support multiple VIFS per AP in debugfs. Create 'stations' sub-directory under each netdev:[vif-name] directory to hold all stations for that network device. Signed-off-by: Ben Greear Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/debugfs.c | 1 - net/mac80211/debugfs_netdev.c | 3 +++ net/mac80211/debugfs_sta.c | 2 +- net/mac80211/ieee80211_i.h | 2 +- 4 files changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index e81ef4e..ebd5b69 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -368,7 +368,6 @@ void debugfs_hw_add(struct ieee80211_local *local) if (!phyd) return; - local->debugfs.stations = debugfs_create_dir("stations", phyd); local->debugfs.keys = debugfs_create_dir("keys", phyd); DEBUGFS_ADD(frequency); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 20b2998..3e124305 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -409,6 +409,9 @@ void ieee80211_debugfs_add_netdev(struct ieee80211_sub_if_data *sdata) sprintf(buf, "netdev:%s", sdata->name); sdata->debugfs.dir = debugfs_create_dir(buf, sdata->local->hw.wiphy->debugfsdir); + if (sdata->debugfs.dir) + sdata->debugfs.subdir_stations = debugfs_create_dir("stations", + sdata->debugfs.dir); add_files(sdata); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 76839d4..6b7ff9f 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -300,7 +300,7 @@ STA_OPS(ht_capa); void ieee80211_sta_debugfs_add(struct sta_info *sta) { - struct dentry *stations_dir = sta->local->debugfs.stations; + struct dentry *stations_dir = sta->sdata->debugfs.subdir_stations; u8 mac[3*ETH_ALEN]; sta->debugfs.add_has_run = true; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 78a8d92..40f7472 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -564,6 +564,7 @@ struct ieee80211_sub_if_data { #ifdef CONFIG_MAC80211_DEBUGFS struct { struct dentry *dir; + struct dentry *subdir_stations; struct dentry *default_key; struct dentry *default_mgmt_key; } debugfs; @@ -899,7 +900,6 @@ struct ieee80211_local { #ifdef CONFIG_MAC80211_DEBUGFS struct local_debugfsdentries { struct dentry *rcdir; - struct dentry *stations; struct dentry *keys; } debugfs; #endif -- cgit v1.1 From 686b9cb994f5f74be790df4cd12873dfdc8a6984 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 23 Sep 2010 09:44:36 -0700 Subject: mac80211/ath9k: Support AMPDU with multiple VIFs. The old ieee80211_find_sta_by_hw method didn't properly find VIFS when there was more than one per AP. This caused AMPDU logic in ath9k to get the wrong VIF when trying to account for transmitted SKBs. This patch changes ieee80211_find_sta_by_hw to take a localaddr argument to distinguish between VIFs with the same AP but different local addresses. The method name is changed to ieee80211_find_sta_by_ifaddr. Signed-off-by: Ben Greear Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/sta_info.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index 44e10a9..ca2cba9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -838,13 +838,20 @@ void ieee80211_sta_expire(struct ieee80211_sub_if_data *sdata, mutex_unlock(&local->sta_mtx); } -struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw, - const u8 *addr) +struct ieee80211_sta *ieee80211_find_sta_by_ifaddr(struct ieee80211_hw *hw, + const u8 *addr, + const u8 *localaddr) { struct sta_info *sta, *nxt; - /* Just return a random station ... first in list ... */ + /* + * Just return a random station if localaddr is NULL + * ... first in list. + */ for_each_sta_info(hw_to_local(hw), addr, sta, nxt) { + if (localaddr && + compare_ether_addr(sta->sdata->vif.addr, localaddr) != 0) + continue; if (!sta->uploaded) return NULL; return &sta->sta; @@ -852,7 +859,7 @@ struct ieee80211_sta *ieee80211_find_sta_by_hw(struct ieee80211_hw *hw, return NULL; } -EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_hw); +EXPORT_SYMBOL_GPL(ieee80211_find_sta_by_ifaddr); struct ieee80211_sta *ieee80211_find_sta(struct ieee80211_vif *vif, const u8 *addr) -- cgit v1.1 From 56af326830757f3e8a1742770d15dfd6e3c40e85 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 23 Sep 2010 10:22:24 -0700 Subject: mac80211: Support receiving data frames on multiple vifs. When using multiple STA interfaces on the same radio, some data packets need to be received on all interfaces (broadcast, for instance). Make the STA loop look similar to the mgt-data loop. Also, add logic to check RX_FLAG_MMIC_ERROR for last interface in mgt-data loop. Signed-off-by: Ben Greear Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 80 ++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 70 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 308e502..50c0803 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2609,7 +2609,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, int prepares; struct ieee80211_sub_if_data *prev = NULL; struct sk_buff *skb_new; - struct sta_info *sta, *tmp; + struct sta_info *sta, *tmp, *prev_sta; bool found_sta = false; int err = 0; @@ -2640,22 +2640,74 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, ieee80211_verify_alignment(&rx); if (ieee80211_is_data(fc)) { + prev_sta = NULL; for_each_sta_info(local, hdr->addr2, sta, tmp) { - rx.sta = sta; found_sta = true; - rx.sdata = sta->sdata; + if (!prev_sta) { + prev_sta = sta; + continue; + } + + rx.sta = prev_sta; + rx.sdata = prev_sta->sdata; rx.flags |= IEEE80211_RX_RA_MATCH; prepares = prepare_for_handlers(rx.sdata, &rx, hdr); - if (prepares) { - if (status->flag & RX_FLAG_MMIC_ERROR) { - if (rx.flags & IEEE80211_RX_RA_MATCH) - ieee80211_rx_michael_mic_report(hdr, &rx); - } else - prev = rx.sdata; + if (!prepares) + goto next_sta; + + if (status->flag & RX_FLAG_MMIC_ERROR) { + if (rx.flags & IEEE80211_RX_RA_MATCH) + ieee80211_rx_michael_mic_report(hdr, &rx); + goto next_sta; + } + + /* + * frame was destined for the previous interface + * so invoke RX handlers for it + */ + skb_new = skb_copy(skb, GFP_ATOMIC); + if (!skb_new) { + if (net_ratelimit()) + wiphy_debug(local->hw.wiphy, + "failed to copy multicast" + " frame for %s\n", + prev_sta->sdata->name); + goto next_sta; + } + ieee80211_invoke_rx_handlers(prev_sta->sdata, &rx, + skb_new); +next_sta: + prev_sta = sta; + } /* for all STA info */ + + if (prev_sta) { + rx.sta = prev_sta; + rx.sdata = prev_sta->sdata; + + rx.flags |= IEEE80211_RX_RA_MATCH; + prepares = prepare_for_handlers(rx.sdata, &rx, hdr); + if (!prepares) + prev_sta = NULL; + + if (prev_sta && status->flag & RX_FLAG_MMIC_ERROR) { + if (rx.flags & IEEE80211_RX_RA_MATCH) + ieee80211_rx_michael_mic_report(hdr, &rx); + prev_sta = NULL; } } - } + + + if (prev_sta) { + ieee80211_invoke_rx_handlers(prev_sta->sdata, &rx, skb); + return; + } else { + if (found_sta) { + dev_kfree_skb(skb); + return; + } + } + } /* if data frame */ if (!found_sta) { list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) @@ -2718,6 +2770,14 @@ next: if (!prepares) prev = NULL; + + if (prev && status->flag & RX_FLAG_MMIC_ERROR) { + rx.sdata = prev; + if (rx.flags & IEEE80211_RX_RA_MATCH) + ieee80211_rx_michael_mic_report(hdr, + &rx); + prev = NULL; + } } } if (prev) -- cgit v1.1 From 92e44948b2b3b2db8f39f17033f98ae2356156a5 Mon Sep 17 00:00:00 2001 From: Teemu Paasikivi Date: Fri, 24 Sep 2010 07:23:55 +0300 Subject: nl80211: Fix exit from nl80211_set_power_save If interface does not existk, when nl80211_set_power_save is called, (eg. module has been unloaded) it has been causing kernel panic. Added new goto target to avoid crash if get_rdev_dev_by_info_ifindex does not return dev and rdev pointers. Signed-off-by: Teemu Paasikivi Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index f15b1af..4ff827e 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4990,7 +4990,7 @@ static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info) err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); if (err) - goto unlock_rdev; + goto unlock_rtnl; wdev = dev->ieee80211_ptr; @@ -5014,6 +5014,7 @@ static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info) unlock_rdev: cfg80211_unlock_rdev(rdev); dev_put(dev); +unlock_rtnl: rtnl_unlock(); out: -- cgit v1.1 From 7c1e183186377e84e6f4e457be0514887f2df4ef Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Fri, 24 Sep 2010 15:52:49 -0400 Subject: Revert "mac80211: fix use-after-free" This reverts commit cd87a2d3a33d75a646f1aa1aa2ee5bf712d6f963. Author reports it conflicts with proper fixes, applied hereafter. Signed-off-by: John W. Linville --- net/mac80211/rx.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 50c0803..29a582d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2286,6 +2286,9 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, struct net_device *prev_dev = NULL; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + if (status->flag & RX_FLAG_INTERNAL_CMTR) + goto out_free_skb; + if (skb_headroom(skb) < sizeof(*rthdr) && pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC)) goto out_free_skb; @@ -2344,6 +2347,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, } else goto out_free_skb; + status->flag |= RX_FLAG_INTERNAL_CMTR; return; out_free_skb: -- cgit v1.1 From 20b01f80f72426e7ed2e773220da4357925383d5 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Sep 2010 11:21:05 +0200 Subject: mac80211: remove prepare_for_handlers sdata argument The first argument to prepare_for_handlers is always the sdata that can just be stored in rx data directly (and even already is, in two of four code paths.) Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 29a582d..f59f6f5 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2513,10 +2513,10 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) /* main receive path */ -static int prepare_for_handlers(struct ieee80211_sub_if_data *sdata, - struct ieee80211_rx_data *rx, +static int prepare_for_handlers(struct ieee80211_rx_data *rx, struct ieee80211_hdr *hdr) { + struct ieee80211_sub_if_data *sdata = rx->sdata; struct sk_buff *skb = rx->skb; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); u8 *bssid = ieee80211_get_bssid(hdr, skb->len, sdata->vif.type); @@ -2656,7 +2656,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, rx.sdata = prev_sta->sdata; rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(rx.sdata, &rx, hdr); + prepares = prepare_for_handlers(&rx, hdr); if (!prepares) goto next_sta; @@ -2690,7 +2690,7 @@ next_sta: rx.sdata = prev_sta->sdata; rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(rx.sdata, &rx, hdr); + prepares = prepare_for_handlers(&rx, hdr); if (!prepares) prev_sta = NULL; @@ -2733,15 +2733,15 @@ next_sta: } rx.sta = sta_info_get_bss(prev, hdr->addr2); + rx.sdata = prev; rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(prev, &rx, hdr); + prepares = prepare_for_handlers(&rx, hdr); if (!prepares) goto next; if (status->flag & RX_FLAG_MMIC_ERROR) { - rx.sdata = prev; if (rx.flags & IEEE80211_RX_RA_MATCH) ieee80211_rx_michael_mic_report(hdr, &rx); @@ -2768,15 +2768,15 @@ next: if (prev) { rx.sta = sta_info_get_bss(prev, hdr->addr2); + rx.sdata = prev; rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(prev, &rx, hdr); + prepares = prepare_for_handlers(&rx, hdr); if (!prepares) prev = NULL; if (prev && status->flag & RX_FLAG_MMIC_ERROR) { - rx.sdata = prev; if (rx.flags & IEEE80211_RX_RA_MATCH) ieee80211_rx_michael_mic_report(hdr, &rx); -- cgit v1.1 From 4406c376895608375105013bf405ecac720ef558 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Sep 2010 11:21:06 +0200 Subject: mac80211: consolidate packet processing There are now four instances of vaguely the same code that does packet preparation, checking for MMIC errors and reporting them, and then invoking packet processing. Consolidate all of these. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 165 ++++++++++++++++++++---------------------------------- 1 file changed, 62 insertions(+), 103 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index f59f6f5..13311f8 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2443,18 +2443,13 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, } } -static void ieee80211_invoke_rx_handlers(struct ieee80211_sub_if_data *sdata, - struct ieee80211_rx_data *rx, - struct sk_buff *skb) +static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) { struct sk_buff_head reorder_release; ieee80211_rx_result res = RX_DROP_MONITOR; __skb_queue_head_init(&reorder_release); - rx->skb = skb; - rx->sdata = sdata; - #define CALL_RXH(rxh) \ do { \ res = rxh(rx); \ @@ -2598,21 +2593,63 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, } /* + * This function returns whether or not the SKB + * was destined for RX processing or not, which, + * if consume is true, is equivalent to whether + * or not the skb was consumed. + */ +static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, + struct sk_buff *skb, bool consume) +{ + struct ieee80211_local *local = rx->local; + struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); + struct ieee80211_hdr *hdr = (void *)skb->data; + int prepares; + + rx->skb = skb; + rx->flags |= IEEE80211_RX_RA_MATCH; + prepares = prepare_for_handlers(rx, hdr); + + if (!prepares) + return false; + + if (status->flag & RX_FLAG_MMIC_ERROR) { + if (rx->flags & IEEE80211_RX_RA_MATCH) + ieee80211_rx_michael_mic_report(hdr, rx); + return false; + } + + if (!consume) { + skb = skb_copy(skb, GFP_ATOMIC); + if (!skb) { + if (net_ratelimit()) + wiphy_debug(local->hw.wiphy, + "failed to copy multicast frame for %s\n", + sdata->name); + return true; + } + + rx->skb = skb; + } + + ieee80211_invoke_rx_handlers(rx); + return true; +} + +/* * This is the actual Rx frames handler. as it blongs to Rx path it must * be called with rcu_read_lock protection. */ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, struct sk_buff *skb) { - struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr; __le16 fc; struct ieee80211_rx_data rx; - int prepares; - struct ieee80211_sub_if_data *prev = NULL; - struct sk_buff *skb_new; + struct ieee80211_sub_if_data *prev; struct sta_info *sta, *tmp, *prev_sta; bool found_sta = false; int err = 0; @@ -2645,8 +2682,10 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (ieee80211_is_data(fc)) { prev_sta = NULL; + for_each_sta_info(local, hdr->addr2, sta, tmp) { found_sta = true; + if (!prev_sta) { prev_sta = sta; continue; @@ -2654,65 +2693,23 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, rx.sta = prev_sta; rx.sdata = prev_sta->sdata; + ieee80211_prepare_and_rx_handle(&rx, skb, false); - rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(&rx, hdr); - if (!prepares) - goto next_sta; - - if (status->flag & RX_FLAG_MMIC_ERROR) { - if (rx.flags & IEEE80211_RX_RA_MATCH) - ieee80211_rx_michael_mic_report(hdr, &rx); - goto next_sta; - } - - /* - * frame was destined for the previous interface - * so invoke RX handlers for it - */ - skb_new = skb_copy(skb, GFP_ATOMIC); - if (!skb_new) { - if (net_ratelimit()) - wiphy_debug(local->hw.wiphy, - "failed to copy multicast" - " frame for %s\n", - prev_sta->sdata->name); - goto next_sta; - } - ieee80211_invoke_rx_handlers(prev_sta->sdata, &rx, - skb_new); -next_sta: prev_sta = sta; - } /* for all STA info */ + } if (prev_sta) { rx.sta = prev_sta; rx.sdata = prev_sta->sdata; - rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(&rx, hdr); - if (!prepares) - prev_sta = NULL; - - if (prev_sta && status->flag & RX_FLAG_MMIC_ERROR) { - if (rx.flags & IEEE80211_RX_RA_MATCH) - ieee80211_rx_michael_mic_report(hdr, &rx); - prev_sta = NULL; - } - } - - - if (prev_sta) { - ieee80211_invoke_rx_handlers(prev_sta->sdata, &rx, skb); - return; - } else { - if (found_sta) { - dev_kfree_skb(skb); + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) return; - } } - } /* if data frame */ + } + if (!found_sta) { + prev = NULL; + list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(sdata)) continue; @@ -2734,35 +2731,8 @@ next_sta: rx.sta = sta_info_get_bss(prev, hdr->addr2); rx.sdata = prev; + ieee80211_prepare_and_rx_handle(&rx, skb, false); - rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(&rx, hdr); - - if (!prepares) - goto next; - - if (status->flag & RX_FLAG_MMIC_ERROR) { - if (rx.flags & IEEE80211_RX_RA_MATCH) - ieee80211_rx_michael_mic_report(hdr, - &rx); - goto next; - } - - /* - * frame was destined for the previous interface - * so invoke RX handlers for it - */ - - skb_new = skb_copy(skb, GFP_ATOMIC); - if (!skb_new) { - if (net_ratelimit()) - wiphy_debug(local->hw.wiphy, - "failed to copy multicast frame for %s\n", - prev->name); - goto next; - } - ieee80211_invoke_rx_handlers(prev, &rx, skb_new); -next: prev = sdata; } @@ -2770,24 +2740,13 @@ next: rx.sta = sta_info_get_bss(prev, hdr->addr2); rx.sdata = prev; - rx.flags |= IEEE80211_RX_RA_MATCH; - prepares = prepare_for_handlers(&rx, hdr); - - if (!prepares) - prev = NULL; - - if (prev && status->flag & RX_FLAG_MMIC_ERROR) { - if (rx.flags & IEEE80211_RX_RA_MATCH) - ieee80211_rx_michael_mic_report(hdr, - &rx); - prev = NULL; - } + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; } + } - if (prev) - ieee80211_invoke_rx_handlers(prev, &rx, skb); - else - dev_kfree_skb(skb); + + dev_kfree_skb(skb); } /* -- cgit v1.1 From 4b0dd98e70b6516c2c26f28091c2fb09f0ecf215 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Sep 2010 11:21:07 +0200 Subject: mac80211: clean up rx handling wrt. found_sta If a station was found, then we'll have exited the function already, so it is not necessary to have a variable keeping track of it. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 56 +++++++++++++++++++++++++------------------------------ 1 file changed, 25 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 13311f8..2b6b4ea 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -2651,7 +2651,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, struct ieee80211_rx_data rx; struct ieee80211_sub_if_data *prev; struct sta_info *sta, *tmp, *prev_sta; - bool found_sta = false; int err = 0; fc = ((struct ieee80211_hdr *)skb->data)->frame_control; @@ -2684,8 +2683,6 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, prev_sta = NULL; for_each_sta_info(local, hdr->addr2, sta, tmp) { - found_sta = true; - if (!prev_sta) { prev_sta = sta; continue; @@ -2707,43 +2704,40 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, } } - if (!found_sta) { - prev = NULL; - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - if (!ieee80211_sdata_running(sdata)) - continue; - - if (sdata->vif.type == NL80211_IFTYPE_MONITOR || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN) - continue; + prev = NULL; - /* - * frame is destined for this interface, but if it's - * not also for the previous one we handle that after - * the loop to avoid copying the SKB once too much - */ + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!ieee80211_sdata_running(sdata)) + continue; - if (!prev) { - prev = sdata; - continue; - } + if (sdata->vif.type == NL80211_IFTYPE_MONITOR || + sdata->vif.type == NL80211_IFTYPE_AP_VLAN) + continue; - rx.sta = sta_info_get_bss(prev, hdr->addr2); - rx.sdata = prev; - ieee80211_prepare_and_rx_handle(&rx, skb, false); + /* + * frame is destined for this interface, but if it's + * not also for the previous one we handle that after + * the loop to avoid copying the SKB once too much + */ + if (!prev) { prev = sdata; + continue; } - if (prev) { - rx.sta = sta_info_get_bss(prev, hdr->addr2); - rx.sdata = prev; + rx.sta = sta_info_get_bss(prev, hdr->addr2); + rx.sdata = prev; + ieee80211_prepare_and_rx_handle(&rx, skb, false); - if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) - return; - } + prev = sdata; + } + + if (prev) { + rx.sta = sta_info_get_bss(prev, hdr->addr2); + rx.sdata = prev; + if (ieee80211_prepare_and_rx_handle(&rx, skb, true)) + return; } dev_kfree_skb(skb); -- cgit v1.1 From 4080c7cdc23f26c6e6166a70f50fa43814552d81 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Sep 2010 11:21:08 +0200 Subject: mac80211: fix release_reorder_timeout in scan Even if the reorder timeout timer fires while scanning, the frames weren't received during scanning and therefore shouldn't be dropped. To implement this, changes to the passive scan RX handler simplify understanding it, because it currently checks HW_SCANNING independently of a packet's in-scan receive status (which doesn't make a big difference, since scan_rx() will only pick up probe responses and beacons, which can't be aggregated.) Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/rx.c | 22 ++++++++-------------- 1 file changed, 8 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 2b6b4ea..8c666e9 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -389,24 +389,22 @@ ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx) struct ieee80211_local *local = rx->local; struct sk_buff *skb = rx->skb; - if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning))) + if (likely(!(rx->flags & IEEE80211_RX_IN_SCAN))) + return RX_CONTINUE; + + if (test_bit(SCAN_HW_SCANNING, &local->scanning)) return ieee80211_scan_rx(rx->sdata, skb); - if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning) && - (rx->flags & IEEE80211_RX_IN_SCAN))) { + if (test_bit(SCAN_SW_SCANNING, &local->scanning)) { /* drop all the other packets during a software scan anyway */ if (ieee80211_scan_rx(rx->sdata, skb) != RX_QUEUED) dev_kfree_skb(skb); return RX_QUEUED; } - if (unlikely(rx->flags & IEEE80211_RX_IN_SCAN)) { - /* scanning finished during invoking of handlers */ - I802_DEBUG_INC(local->rx_handlers_drop_passive_scan); - return RX_DROP_UNUSABLE; - } - - return RX_CONTINUE; + /* scanning finished during invoking of handlers */ + I802_DEBUG_INC(local->rx_handlers_drop_passive_scan); + return RX_DROP_UNUSABLE; } @@ -2495,10 +2493,6 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) rx.queue = tid; rx.flags |= IEEE80211_RX_RA_MATCH; - if (unlikely(test_bit(SCAN_HW_SCANNING, &sta->local->scanning) || - test_bit(SCAN_OFF_CHANNEL, &sta->local->scanning))) - rx.flags |= IEEE80211_RX_IN_SCAN; - spin_lock(&tid_agg_rx->reorder_lock); ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames); spin_unlock(&tid_agg_rx->reorder_lock); -- cgit v1.1 From 554891e63a29af35cc6bb403ef34e319518114d0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Fri, 24 Sep 2010 12:38:25 +0200 Subject: mac80211: move packet flags into packet commit 8c0c709eea5cbab97fb464cd68b06f24acc58ee1 Author: Johannes Berg Date: Wed Nov 25 17:46:15 2009 +0100 mac80211: move cmntr flag out of rx flags moved the CMNTR flag into the skb RX flags for some aggregation cleanups, but this was wrong since the optimisation this flag tried to make requires that it is kept across the processing of multiple interfaces -- which isn't true for flags in the skb. The patch not only broke the optimisation, it also introduced a bug: under some (common!) circumstances the flag will be set on an already freed skb! However, investigating this in more detail, I found that most of the flags that we set should be per packet, _except_ for this one, due to a-MPDU processing. Additionally, the flags used for processing (currently just this one) need to be reset before processing a new packet. Since we haven't actually seen bugs reported as a result of the wrong flags handling (which is not too surprising -- the only real bug case I can come up with is an a-MSDU contained in an a-MPDU), I'll make a different fix for rc. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 38 +++++++++++++---- net/mac80211/rx.c | 102 +++++++++++++++++++++++++-------------------- net/mac80211/wpa.c | 2 +- 3 files changed, 89 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 40f7472..945fbf2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -159,13 +159,37 @@ typedef unsigned __bitwise__ ieee80211_rx_result; #define RX_DROP_MONITOR ((__force ieee80211_rx_result) 2u) #define RX_QUEUED ((__force ieee80211_rx_result) 3u) -#define IEEE80211_RX_IN_SCAN BIT(0) -/* frame is destined to interface currently processed (incl. multicast frames) */ -#define IEEE80211_RX_RA_MATCH BIT(1) -#define IEEE80211_RX_AMSDU BIT(2) -#define IEEE80211_RX_FRAGMENTED BIT(3) -#define IEEE80211_MALFORMED_ACTION_FRM BIT(4) -/* only add flags here that do not change with subframes of an aMPDU */ +/** + * enum ieee80211_packet_rx_flags - packet RX flags + * @IEEE80211_RX_RA_MATCH: frame is destined to interface currently processed + * (incl. multicast frames) + * @IEEE80211_RX_IN_SCAN: received while scanning + * @IEEE80211_RX_FRAGMENTED: fragmented frame + * @IEEE80211_RX_AMSDU: a-MSDU packet + * @IEEE80211_RX_MALFORMED_ACTION_FRM: action frame is malformed + * + * These are per-frame flags that are attached to a frame in the + * @rx_flags field of &struct ieee80211_rx_status. + */ +enum ieee80211_packet_rx_flags { + IEEE80211_RX_IN_SCAN = BIT(0), + IEEE80211_RX_RA_MATCH = BIT(1), + IEEE80211_RX_FRAGMENTED = BIT(2), + IEEE80211_RX_AMSDU = BIT(3), + IEEE80211_RX_MALFORMED_ACTION_FRM = BIT(4), +}; + +/** + * enum ieee80211_rx_flags - RX data flags + * + * @IEEE80211_RX_CMNTR: received on cooked monitor already + * + * These flags are used across handling multiple interfaces + * for a single frame. + */ +enum ieee80211_rx_flags { + IEEE80211_RX_CMNTR = BIT(0), +}; struct ieee80211_rx_data { struct sk_buff *skb; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 8c666e9..0b0e83e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -315,6 +315,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb, static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); int tid; /* does the frame have a qos control field? */ @@ -323,9 +324,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx) /* frame has qos control */ tid = *qc & IEEE80211_QOS_CTL_TID_MASK; if (*qc & IEEE80211_QOS_CONTROL_A_MSDU_PRESENT) - rx->flags |= IEEE80211_RX_AMSDU; - else - rx->flags &= ~IEEE80211_RX_AMSDU; + status->rx_flags |= IEEE80211_RX_AMSDU; } else { /* * IEEE 802.11-2007, 7.1.3.4.1 ("Sequence Number field"): @@ -387,9 +386,10 @@ static ieee80211_rx_result debug_noinline ieee80211_rx_h_passive_scan(struct ieee80211_rx_data *rx) { struct ieee80211_local *local = rx->local; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); struct sk_buff *skb = rx->skb; - if (likely(!(rx->flags & IEEE80211_RX_IN_SCAN))) + if (likely(!(status->rx_flags & IEEE80211_RX_IN_SCAN))) return RX_CONTINUE; if (test_bit(SCAN_HW_SCANNING, &local->scanning)) @@ -783,13 +783,14 @@ static ieee80211_rx_result debug_noinline ieee80211_rx_h_check(struct ieee80211_rx_data *rx) { struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); /* Drop duplicate 802.11 retransmissions (IEEE 802.11 Chap. 9.2.9) */ if (rx->sta && !is_multicast_ether_addr(hdr->addr1)) { if (unlikely(ieee80211_has_retry(hdr->frame_control) && rx->sta->last_seq_ctrl[rx->queue] == hdr->seq_ctrl)) { - if (rx->flags & IEEE80211_RX_RA_MATCH) { + if (status->rx_flags & IEEE80211_RX_RA_MATCH) { rx->local->dot11FrameDuplicateCount++; rx->sta->num_duplicates++; } @@ -822,7 +823,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) if ((!ieee80211_has_fromds(hdr->frame_control) && !ieee80211_has_tods(hdr->frame_control) && ieee80211_is_data(hdr->frame_control)) || - !(rx->flags & IEEE80211_RX_RA_MATCH)) { + !(status->rx_flags & IEEE80211_RX_RA_MATCH)) { /* Drop IBSS frames and frames for other hosts * silently. */ return RX_DROP_MONITOR; @@ -879,7 +880,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) * No point in finding a key and decrypting if the frame is neither * addressed to us nor a multicast frame. */ - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) return RX_CONTINUE; /* start without a key */ @@ -1112,7 +1113,7 @@ ieee80211_rx_h_sta_process(struct ieee80211_rx_data *rx) sta->last_rx = jiffies; } - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) return RX_CONTINUE; if (rx->sdata->vif.type == NL80211_IFTYPE_STATION) @@ -1269,6 +1270,7 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) unsigned int frag, seq; struct ieee80211_fragment_entry *entry; struct sk_buff *skb; + struct ieee80211_rx_status *status; hdr = (struct ieee80211_hdr *)rx->skb->data; fc = hdr->frame_control; @@ -1368,7 +1370,8 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx) } /* Complete frame has been reassembled - process it now */ - rx->flags |= IEEE80211_RX_FRAGMENTED; + status = IEEE80211_SKB_RXCB(rx->skb); + status->rx_flags |= IEEE80211_RX_FRAGMENTED; out: if (rx->sta) @@ -1385,9 +1388,10 @@ ieee80211_rx_h_ps_poll(struct ieee80211_rx_data *rx) { struct ieee80211_sub_if_data *sdata = rx->sdata; __le16 fc = ((struct ieee80211_hdr *)rx->skb->data)->frame_control; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); if (likely(!rx->sta || !ieee80211_is_pspoll(fc) || - !(rx->flags & IEEE80211_RX_RA_MATCH))) + !(status->rx_flags & IEEE80211_RX_RA_MATCH))) return RX_CONTINUE; if ((sdata->vif.type != NL80211_IFTYPE_AP) && @@ -1548,6 +1552,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) struct sk_buff *skb, *xmit_skb; struct ethhdr *ehdr = (struct ethhdr *) rx->skb->data; struct sta_info *dsta; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); skb = rx->skb; xmit_skb = NULL; @@ -1555,7 +1560,7 @@ ieee80211_deliver_skb(struct ieee80211_rx_data *rx) if ((sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN) && !(sdata->flags & IEEE80211_SDATA_DONT_BRIDGE_PACKETS) && - (rx->flags & IEEE80211_RX_RA_MATCH) && + (status->rx_flags & IEEE80211_RX_RA_MATCH) && (sdata->vif.type != NL80211_IFTYPE_AP_VLAN || !sdata->u.vlan.sta)) { if (is_multicast_ether_addr(ehdr->h_dest)) { /* @@ -1632,6 +1637,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; __le16 fc = hdr->frame_control; struct sk_buff_head frame_list; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); if (unlikely(!ieee80211_is_data(fc))) return RX_CONTINUE; @@ -1639,7 +1645,7 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx) if (unlikely(!ieee80211_is_data_present(fc))) return RX_DROP_MONITOR; - if (!(rx->flags & IEEE80211_RX_AMSDU)) + if (!(status->rx_flags & IEEE80211_RX_AMSDU)) return RX_CONTINUE; if (ieee80211_has_a4(hdr->frame_control) && @@ -1690,6 +1696,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) struct sk_buff *skb = rx->skb, *fwd_skb; struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); hdr = (struct ieee80211_hdr *) skb->data; hdrlen = ieee80211_hdrlen(hdr->frame_control); @@ -1735,7 +1742,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx) mesh_hdr->ttl--; - if (rx->flags & IEEE80211_RX_RA_MATCH) { + if (status->rx_flags & IEEE80211_RX_RA_MATCH) { if (!mesh_hdr->ttl) IEEE80211_IFSTA_MESH_CTR_INC(&rx->sdata->u.mesh, dropped_frames_ttl); @@ -1945,6 +1952,7 @@ static ieee80211_rx_result debug_noinline ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) { struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); /* * From here on, look only at management frames. @@ -1957,7 +1965,7 @@ ieee80211_rx_h_mgmt_check(struct ieee80211_rx_data *rx) if (!ieee80211_is_mgmt(mgmt->frame_control)) return RX_DROP_MONITOR; - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_MONITOR; if (ieee80211_drop_unencrypted_mgmt(rx)) @@ -1972,6 +1980,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) struct ieee80211_local *local = rx->local; struct ieee80211_sub_if_data *sdata = rx->sdata; struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); int len = rx->skb->len; if (!ieee80211_is_action(mgmt->frame_control)) @@ -1984,7 +1993,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) if (!rx->sta && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) return RX_DROP_UNUSABLE; - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_UNUSABLE; switch (mgmt->u.action.category) { @@ -2080,7 +2089,7 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) return RX_CONTINUE; invalid: - rx->flags |= IEEE80211_MALFORMED_ACTION_FRM; + status->rx_flags |= IEEE80211_RX_MALFORMED_ACTION_FRM; /* will return in the next handlers */ return RX_CONTINUE; @@ -2102,10 +2111,10 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx) static ieee80211_rx_result debug_noinline ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) { - struct ieee80211_rx_status *status; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); /* skip known-bad action frames and return them in the next handler */ - if (rx->flags & IEEE80211_MALFORMED_ACTION_FRM) + if (status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) return RX_CONTINUE; /* @@ -2114,7 +2123,6 @@ ieee80211_rx_h_userspace_mgmt(struct ieee80211_rx_data *rx) * so userspace can register for those to know whether ones * it transmitted were processed or returned. */ - status = IEEE80211_SKB_RXCB(rx->skb); if (cfg80211_rx_mgmt(rx->sdata->dev, status->freq, rx->skb->data, rx->skb->len, @@ -2136,6 +2144,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) struct ieee80211_mgmt *mgmt = (struct ieee80211_mgmt *) rx->skb->data; struct sk_buff *nskb; struct ieee80211_sub_if_data *sdata = rx->sdata; + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb); if (!ieee80211_is_action(mgmt->frame_control)) return RX_CONTINUE; @@ -2150,7 +2159,7 @@ ieee80211_rx_h_action_return(struct ieee80211_rx_data *rx) * registration mechanisms, but older ones still use cooked * monitor interfaces so push all frames there. */ - if (!(rx->flags & IEEE80211_MALFORMED_ACTION_FRM) && + if (!(status->rx_flags & IEEE80211_RX_MALFORMED_ACTION_FRM) && (sdata->vif.type == NL80211_IFTYPE_AP || sdata->vif.type == NL80211_IFTYPE_AP_VLAN)) return RX_DROP_MONITOR; @@ -2284,8 +2293,13 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, struct net_device *prev_dev = NULL; struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); - if (status->flag & RX_FLAG_INTERNAL_CMTR) + /* + * If cooked monitor has been processed already, then + * don't do it again. If not, set the flag. + */ + if (rx->flags & IEEE80211_RX_CMNTR) goto out_free_skb; + rx->flags |= IEEE80211_RX_CMNTR; if (skb_headroom(skb) < sizeof(*rthdr) && pskb_expand_head(skb, sizeof(*rthdr), 0, GFP_ATOMIC)) @@ -2341,12 +2355,8 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx, if (prev_dev) { skb->dev = prev_dev; netif_receive_skb(skb); - skb = NULL; - } else - goto out_free_skb; - - status->flag |= RX_FLAG_INTERNAL_CMTR; - return; + return; + } out_free_skb: dev_kfree_skb(skb); @@ -2407,6 +2417,7 @@ static void ieee80211_rx_handlers(struct ieee80211_rx_data *rx, * same TID from the same station */ rx->skb = skb; + rx->flags = 0; CALL_RXH(ieee80211_rx_h_decrypt) CALL_RXH(ieee80211_rx_h_check_more_data) @@ -2477,7 +2488,12 @@ static void ieee80211_invoke_rx_handlers(struct ieee80211_rx_data *rx) void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) { struct sk_buff_head frames; - struct ieee80211_rx_data rx = { }; + struct ieee80211_rx_data rx = { + .sta = sta, + .sdata = sta->sdata, + .local = sta->local, + .queue = tid, + }; struct tid_ampdu_rx *tid_agg_rx; tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]); @@ -2486,13 +2502,6 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid) __skb_queue_head_init(&frames); - /* construct rx struct */ - rx.sta = sta; - rx.sdata = sta->sdata; - rx.local = sta->local; - rx.queue = tid; - rx.flags |= IEEE80211_RX_RA_MATCH; - spin_lock(&tid_agg_rx->reorder_lock); ieee80211_sta_reorder_release(&sta->local->hw, tid_agg_rx, &frames); spin_unlock(&tid_agg_rx->reorder_lock); @@ -2519,7 +2528,7 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) { if (!(sdata->dev->flags & IFF_PROMISC)) return 0; - rx->flags &= ~IEEE80211_RX_RA_MATCH; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } break; case NL80211_IFTYPE_ADHOC: @@ -2529,15 +2538,15 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, return 1; } else if (!ieee80211_bssid_match(bssid, sdata->u.ibss.bssid)) { - if (!(rx->flags & IEEE80211_RX_IN_SCAN)) + if (!(status->rx_flags & IEEE80211_RX_IN_SCAN)) return 0; - rx->flags &= ~IEEE80211_RX_RA_MATCH; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } else if (!multicast && compare_ether_addr(sdata->vif.addr, hdr->addr1) != 0) { if (!(sdata->dev->flags & IFF_PROMISC)) return 0; - rx->flags &= ~IEEE80211_RX_RA_MATCH; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } else if (!rx->sta) { int rate_idx; if (status->flag & RX_FLAG_HT) @@ -2555,7 +2564,7 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, if (!(sdata->dev->flags & IFF_PROMISC)) return 0; - rx->flags &= ~IEEE80211_RX_RA_MATCH; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } break; case NL80211_IFTYPE_AP_VLAN: @@ -2566,9 +2575,9 @@ static int prepare_for_handlers(struct ieee80211_rx_data *rx, return 0; } else if (!ieee80211_bssid_match(bssid, sdata->vif.addr)) { - if (!(rx->flags & IEEE80211_RX_IN_SCAN)) + if (!(status->rx_flags & IEEE80211_RX_IN_SCAN)) return 0; - rx->flags &= ~IEEE80211_RX_RA_MATCH; + status->rx_flags &= ~IEEE80211_RX_RA_MATCH; } break; case NL80211_IFTYPE_WDS: @@ -2602,14 +2611,14 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, int prepares; rx->skb = skb; - rx->flags |= IEEE80211_RX_RA_MATCH; + status->rx_flags |= IEEE80211_RX_RA_MATCH; prepares = prepare_for_handlers(rx, hdr); if (!prepares) return false; if (status->flag & RX_FLAG_MMIC_ERROR) { - if (rx->flags & IEEE80211_RX_RA_MATCH) + if (status->rx_flags & IEEE80211_RX_RA_MATCH) ieee80211_rx_michael_mic_report(hdr, rx); return false; } @@ -2638,6 +2647,7 @@ static bool ieee80211_prepare_and_rx_handle(struct ieee80211_rx_data *rx, static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, struct sk_buff *skb) { + struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(skb); struct ieee80211_local *local = hw_to_local(hw); struct ieee80211_sub_if_data *sdata; struct ieee80211_hdr *hdr; @@ -2657,7 +2667,7 @@ static void __ieee80211_rx_handle_packet(struct ieee80211_hw *hw, if (unlikely(test_bit(SCAN_HW_SCANNING, &local->scanning) || test_bit(SCAN_OFF_CHANNEL, &local->scanning))) - rx.flags |= IEEE80211_RX_IN_SCAN; + status->rx_flags |= IEEE80211_RX_IN_SCAN; if (ieee80211_is_mgmt(fc)) err = skb_linearize(skb); @@ -2808,6 +2818,8 @@ void ieee80211_rx(struct ieee80211_hw *hw, struct sk_buff *skb) } } + status->rx_flags = 0; + /* * key references and virtual interfaces are protected using RCU * and this requires that we are in a read-side RCU section during diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c index 43882b3..bee230d 100644 --- a/net/mac80211/wpa.c +++ b/net/mac80211/wpa.c @@ -117,7 +117,7 @@ ieee80211_rx_h_michael_mic_verify(struct ieee80211_rx_data *rx) key = &rx->key->conf.key[key_offset]; michael_mic(key, hdr, data, data_len, mic); if (memcmp(mic, data + data_len, MICHAEL_MIC_LEN) != 0 || wpa_test) { - if (!(rx->flags & IEEE80211_RX_RA_MATCH)) + if (!(status->rx_flags & IEEE80211_RX_RA_MATCH)) return RX_DROP_UNUSABLE; mac80211_ev_michael_mic_failure(rx->sdata, rx->key->conf.keyidx, -- cgit v1.1 From 8d98efa84b790bdd62248eb0dfff17e9baf5c844 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Sun, 26 Sep 2010 19:07:59 +0000 Subject: Phonet: Implement Pipe Controller to support Nokia Slim Modems Phonet stack assumes the presence of Pipe Controller, either in Modem or on Application Processing Engine user-space for the Pipe data. Nokia Slim Modems like WG2.5 used in ST-Ericsson U8500 platform do not implement Pipe controller in them. This patch adds Pipe Controller implemenation to Phonet stack to support Pipe data over Phonet stack for Nokia Slim Modems. Signed-off-by: Kumar Sanghvi Acked-by: Linus Walleij Signed-off-by: David S. Miller --- net/phonet/Kconfig | 11 ++ net/phonet/pep.c | 448 ++++++++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 453 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig index 6ec7d55..901956a 100644 --- a/net/phonet/Kconfig +++ b/net/phonet/Kconfig @@ -14,3 +14,14 @@ config PHONET To compile this driver as a module, choose M here: the module will be called phonet. If unsure, say N. + +config PHONET_PIPECTRLR + bool "Phonet Pipe Controller" + depends on PHONET + default N + help + The Pipe Controller implementation in Phonet stack to support Pipe + data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500 + platform. + + If unsure, say N. diff --git a/net/phonet/pep.c b/net/phonet/pep.c index d0e7eb2..7bf23cf 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -88,6 +88,15 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb, const struct pnpipehdr *oph = pnp_hdr(oskb); struct pnpipehdr *ph; struct sk_buff *skb; +#ifdef CONFIG_PHONET_PIPECTRLR + const struct phonethdr *hdr = pn_hdr(oskb); + struct sockaddr_pn spn = { + .spn_family = AF_PHONET, + .spn_resource = 0xD9, + .spn_dev = hdr->pn_sdev, + .spn_obj = hdr->pn_sobj, + }; +#endif skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority); if (!skb) @@ -105,10 +114,271 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb, ph->pipe_handle = oph->pipe_handle; ph->error_code = code; +#ifdef CONFIG_PHONET_PIPECTRLR + return pn_skb_send(sk, skb, &spn); +#else return pn_skb_send(sk, skb, &pipe_srv); +#endif } #define PAD 0x00 + +#ifdef CONFIG_PHONET_PIPECTRLR +static u8 pipe_negotiate_fc(u8 *host_fc, u8 *remote_fc, int len) +{ + int i, j; + u8 base_fc, final_fc; + + for (i = 0; i < len; i++) { + base_fc = host_fc[i]; + for (j = 0; j < len; j++) { + if (remote_fc[j] == base_fc) { + final_fc = base_fc; + goto done; + } + } + } + return -EINVAL; + +done: + return final_fc; + +} + +static int pipe_get_flow_info(struct sock *sk, struct sk_buff *skb, + u8 *pref_rx_fc, u8 *req_tx_fc) +{ + struct pnpipehdr *hdr; + u8 n_sb; + + if (!pskb_may_pull(skb, sizeof(*hdr) + 4)) + return -EINVAL; + + hdr = pnp_hdr(skb); + n_sb = hdr->data[4]; + + __skb_pull(skb, sizeof(*hdr) + 4); + while (n_sb > 0) { + u8 type, buf[3], len = sizeof(buf); + u8 *data = pep_get_sb(skb, &type, &len, buf); + + if (data == NULL) + return -EINVAL; + + switch (type) { + case PN_PIPE_SB_REQUIRED_FC_TX: + if (len < 3 || (data[2] | data[3] | data[4]) > 3) + break; + req_tx_fc[0] = data[2]; + req_tx_fc[1] = data[3]; + req_tx_fc[2] = data[4]; + break; + + case PN_PIPE_SB_PREFERRED_FC_RX: + if (len < 3 || (data[2] | data[3] | data[4]) > 3) + break; + pref_rx_fc[0] = data[2]; + pref_rx_fc[1] = data[3]; + pref_rx_fc[2] = data[4]; + break; + + } + n_sb--; + } + return 0; +} + +static int pipe_handler_send_req(struct sock *sk, u16 dobj, u8 utid, + u8 msg_id, u8 p_handle, gfp_t priority) +{ + int len; + struct pnpipehdr *ph; + struct sk_buff *skb; + struct sockaddr_pn spn = { + .spn_family = AF_PHONET, + .spn_resource = 0xD9, + .spn_dev = pn_dev(dobj), + .spn_obj = pn_obj(dobj), + }; + + static const u8 data[4] = { + PAD, PAD, PAD, PAD, + }; + + switch (msg_id) { + case PNS_PEP_CONNECT_REQ: + len = sizeof(data); + break; + + case PNS_PEP_DISCONNECT_REQ: + case PNS_PEP_ENABLE_REQ: + case PNS_PEP_DISABLE_REQ: + len = 0; + break; + + default: + return -EINVAL; + } + + skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority); + if (!skb) + return -ENOMEM; + skb_set_owner_w(skb, sk); + + skb_reserve(skb, MAX_PNPIPE_HEADER); + if (len) { + __skb_put(skb, len); + skb_copy_to_linear_data(skb, data, len); + } + __skb_push(skb, sizeof(*ph)); + skb_reset_transport_header(skb); + ph = pnp_hdr(skb); + ph->utid = utid; + ph->message_id = msg_id; + ph->pipe_handle = p_handle; + ph->error_code = PN_PIPE_NO_ERROR; + + return pn_skb_send(sk, skb, &spn); +} + +static int pipe_handler_send_created_ind(struct sock *sk, u16 dobj, + u8 utid, u8 p_handle, u8 msg_id, u8 tx_fc, u8 rx_fc) +{ + int err_code; + struct pnpipehdr *ph; + struct sk_buff *skb; + struct sockaddr_pn spn = { + .spn_family = AF_PHONET, + .spn_resource = 0xD9, + .spn_dev = pn_dev(dobj), + .spn_obj = pn_obj(dobj), + }; + + static u8 data[4] = { + 0x03, 0x04, + }; + data[2] = tx_fc; + data[3] = rx_fc; + + /* + * actually, below is number of sub-blocks and not error code. + * Pipe_created_ind message format does not have any + * error code field. However, the Phonet stack will always send + * an error code as part of pnpipehdr. So, use that err_code to + * specify the number of sub-blocks. + */ + err_code = 0x01; + + skb = alloc_skb(MAX_PNPIPE_HEADER + sizeof(data), GFP_ATOMIC); + if (!skb) + return -ENOMEM; + skb_set_owner_w(skb, sk); + + skb_reserve(skb, MAX_PNPIPE_HEADER); + __skb_put(skb, sizeof(data)); + skb_copy_to_linear_data(skb, data, sizeof(data)); + __skb_push(skb, sizeof(*ph)); + skb_reset_transport_header(skb); + ph = pnp_hdr(skb); + ph->utid = utid; + ph->message_id = msg_id; + ph->pipe_handle = p_handle; + ph->error_code = err_code; + + return pn_skb_send(sk, skb, &spn); +} + +static int pipe_handler_send_ind(struct sock *sk, u16 dobj, u8 utid, + u8 p_handle, u8 msg_id) +{ + int err_code; + struct pnpipehdr *ph; + struct sk_buff *skb; + struct sockaddr_pn spn = { + .spn_family = AF_PHONET, + .spn_resource = 0xD9, + .spn_dev = pn_dev(dobj), + .spn_obj = pn_obj(dobj), + }; + + /* + * actually, below is a filler. + * Pipe_enabled/disabled_ind message format does not have any + * error code field. However, the Phonet stack will always send + * an error code as part of pnpipehdr. So, use that err_code to + * specify the filler value. + */ + err_code = 0x0; + + skb = alloc_skb(MAX_PNPIPE_HEADER, GFP_ATOMIC); + if (!skb) + return -ENOMEM; + skb_set_owner_w(skb, sk); + + skb_reserve(skb, MAX_PNPIPE_HEADER); + __skb_push(skb, sizeof(*ph)); + skb_reset_transport_header(skb); + ph = pnp_hdr(skb); + ph->utid = utid; + ph->message_id = msg_id; + ph->pipe_handle = p_handle; + ph->error_code = err_code; + + return pn_skb_send(sk, skb, &spn); +} + +static int pipe_handler_enable_pipe(struct sock *sk, int cmd) +{ + int ret; + struct pep_sock *pn = pep_sk(sk); + + switch (cmd) { + case PNPIPE_ENABLE: + ret = pipe_handler_send_req(sk, pn->pn_sk.sobject, + PNS_PIPE_ENABLE_UTID, PNS_PEP_ENABLE_REQ, + pn->pipe_handle, GFP_ATOMIC); + break; + + case PNPIPE_DISABLE: + ret = pipe_handler_send_req(sk, pn->pn_sk.sobject, + PNS_PIPE_DISABLE_UTID, PNS_PEP_DISABLE_REQ, + pn->pipe_handle, GFP_ATOMIC); + break; + + default: + ret = -EINVAL; + } + + return ret; +} + +static int pipe_handler_create_pipe(struct sock *sk, int pipe_handle, int cmd) +{ + int ret; + struct pep_sock *pn = pep_sk(sk); + + switch (cmd) { + case PNPIPE_CREATE: + ret = pipe_handler_send_req(sk, pn->pn_sk.sobject, + PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ, + pipe_handle, GFP_ATOMIC); + break; + + case PNPIPE_DESTROY: + ret = pipe_handler_send_req(sk, pn->remote_pep, + PNS_PEP_DISCONNECT_UTID, + PNS_PEP_DISCONNECT_REQ, + pn->pipe_handle, GFP_ATOMIC); + break; + + default: + ret = -EINVAL; + } + + return ret; +} +#endif + static int pep_accept_conn(struct sock *sk, struct sk_buff *skb) { static const u8 data[20] = { @@ -173,6 +443,14 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; +#ifdef CONFIG_PHONET_PIPECTRLR + struct sockaddr_pn spn = { + .spn_family = AF_PHONET, + .spn_resource = 0xD9, + .spn_dev = pn_dev(pn->remote_pep), + .spn_obj = pn_obj(pn->remote_pep), + }; +#endif skb = alloc_skb(MAX_PNPIPE_HEADER + 4, priority); if (!skb) @@ -192,7 +470,11 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) ph->data[3] = PAD; ph->data[4] = status; +#ifdef CONFIG_PHONET_PIPECTRLR + return pn_skb_send(sk, skb, &spn); +#else return pn_skb_send(sk, skb, &pipe_srv); +#endif } /* Send our RX flow control information to the sender. @@ -308,6 +590,12 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) struct pnpipehdr *hdr = pnp_hdr(skb); struct sk_buff_head *queue; int err = 0; +#ifdef CONFIG_PHONET_PIPECTRLR + struct phonethdr *ph = pn_hdr(skb); + static u8 host_pref_rx_fc[3], host_req_tx_fc[3]; + u8 remote_pref_rx_fc[3], remote_req_tx_fc[3]; + u8 negotiated_rx_fc, negotiated_tx_fc; +#endif BUG_ON(sk->sk_state == TCP_CLOSE_WAIT); @@ -316,6 +604,40 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE); break; +#ifdef CONFIG_PHONET_PIPECTRLR + case PNS_PEP_CONNECT_RESP: + if ((ph->pn_sdev == pn_dev(pn->remote_pep)) && + (ph->pn_sobj == pn_obj(pn->remote_pep))) { + pipe_get_flow_info(sk, skb, remote_pref_rx_fc, + remote_req_tx_fc); + + negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc, + host_pref_rx_fc, + sizeof(host_pref_rx_fc)); + negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc, + remote_pref_rx_fc, + sizeof(host_pref_rx_fc)); + + pn->pipe_state = PIPE_DISABLED; + pipe_handler_send_created_ind(sk, pn->remote_pep, + PNS_PIPE_CREATED_IND_UTID, + pn->pipe_handle, PNS_PIPE_CREATED_IND, + negotiated_tx_fc, negotiated_rx_fc); + pipe_handler_send_created_ind(sk, pn->pn_sk.sobject, + PNS_PIPE_CREATED_IND_UTID, + pn->pipe_handle, PNS_PIPE_CREATED_IND, + negotiated_tx_fc, negotiated_rx_fc); + } else { + pipe_handler_send_req(sk, pn->remote_pep, + PNS_PEP_CONNECT_UTID, + PNS_PEP_CONNECT_REQ, pn->pipe_handle, + GFP_ATOMIC); + pipe_get_flow_info(sk, skb, host_pref_rx_fc, + host_req_tx_fc); + } + break; +#endif + case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); sk->sk_state = TCP_CLOSE_WAIT; @@ -323,11 +645,41 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) sk->sk_state_change(sk); break; +#ifdef CONFIG_PHONET_PIPECTRLR + case PNS_PEP_DISCONNECT_RESP: + pn->pipe_state = PIPE_IDLE; + pipe_handler_send_req(sk, pn->pn_sk.sobject, + PNS_PEP_DISCONNECT_UTID, + PNS_PEP_DISCONNECT_REQ, pn->pipe_handle, + GFP_KERNEL); + break; +#endif + case PNS_PEP_ENABLE_REQ: /* Wait for PNS_PIPE_(ENABLED|REDIRECTED)_IND */ pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; +#ifdef CONFIG_PHONET_PIPECTRLR + case PNS_PEP_ENABLE_RESP: + if ((ph->pn_sdev == pn_dev(pn->remote_pep)) && + (ph->pn_sobj == pn_obj(pn->remote_pep))) { + pn->pipe_state = PIPE_ENABLED; + pipe_handler_send_ind(sk, pn->remote_pep, + PNS_PIPE_ENABLED_IND_UTID, + pn->pipe_handle, PNS_PIPE_ENABLED_IND); + pipe_handler_send_ind(sk, pn->pn_sk.sobject, + PNS_PIPE_ENABLED_IND_UTID, + pn->pipe_handle, PNS_PIPE_ENABLED_IND); + } else + pipe_handler_send_req(sk, pn->remote_pep, + PNS_PIPE_ENABLE_UTID, + PNS_PEP_ENABLE_REQ, pn->pipe_handle, + GFP_KERNEL); + + break; +#endif + case PNS_PEP_RESET_REQ: switch (hdr->state_after_reset) { case PN_PIPE_DISABLE: @@ -346,6 +698,27 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; +#ifdef CONFIG_PHONET_PIPECTRLR + case PNS_PEP_DISABLE_RESP: + if ((ph->pn_sdev == pn_dev(pn->remote_pep)) && + (ph->pn_sobj == pn_obj(pn->remote_pep))) { + pn->pipe_state = PIPE_DISABLED; + pipe_handler_send_ind(sk, pn->remote_pep, + PNS_PIPE_DISABLED_IND_UTID, + pn->pipe_handle, + PNS_PIPE_DISABLED_IND); + pipe_handler_send_ind(sk, pn->pn_sk.sobject, + PNS_PIPE_DISABLED_IND_UTID, + pn->pipe_handle, + PNS_PIPE_DISABLED_IND); + } else + pipe_handler_send_req(sk, pn->remote_pep, + PNS_PIPE_DISABLE_UTID, + PNS_PEP_DISABLE_REQ, pn->pipe_handle, + GFP_KERNEL); + break; +#endif + case PNS_PEP_CTRL_REQ: if (skb_queue_len(&pn->ctrlreq_queue) >= PNPIPE_CTRLREQ_MAX) { atomic_inc(&sk->sk_drops); @@ -519,6 +892,9 @@ static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb) newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL; newpn->init_enable = enabled; newpn->aligned = aligned; +#ifdef CONFIG_PHONET_PIPECTRLR + newpn->remote_pep = pn->remote_pep; +#endif BUG_ON(!skb_queue_empty(&newsk->sk_receive_queue)); skb_queue_head(&newsk->sk_receive_queue, skb); @@ -781,6 +1157,10 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; +#ifdef CONFIG_PHONET_PIPECTRLR + int remote_pep; + int pipe_handle; +#endif if (level != SOL_PNPIPE) return -ENOPROTOOPT; @@ -791,6 +1171,48 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { +#ifdef CONFIG_PHONET_PIPECTRLR + case PNPIPE_CREATE: + if (val) { + if (pn->pipe_state > PIPE_IDLE) { + err = -EFAULT; + break; + } + remote_pep = val & 0xFFFF; + pipe_handle = (val >> 16) & 0xFF; + pn->remote_pep = remote_pep; + err = pipe_handler_create_pipe(sk, pipe_handle, + PNPIPE_CREATE); + break; + } + + case PNPIPE_ENABLE: + if (pn->pipe_state != PIPE_DISABLED) { + err = -EFAULT; + break; + } + err = pipe_handler_enable_pipe(sk, PNPIPE_ENABLE); + break; + + case PNPIPE_DISABLE: + if (pn->pipe_state != PIPE_ENABLED) { + err = -EFAULT; + break; + } + + err = pipe_handler_enable_pipe(sk, PNPIPE_DISABLE); + break; + + case PNPIPE_DESTROY: + if (pn->pipe_state < PIPE_DISABLED) { + err = -EFAULT; + break; + } + + err = pipe_handler_create_pipe(sk, 0x0, PNPIPE_DESTROY); + break; +#endif + case PNPIPE_ENCAP: if (val && val != PNPIPE_ENCAP_IP) { err = -EINVAL; @@ -840,6 +1262,13 @@ static int pep_getsockopt(struct sock *sk, int level, int optname, case PNPIPE_ENCAP: val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE; break; + +#ifdef CONFIG_PHONET_PIPECTRLR + case PNPIPE_INQ: + val = pn->pipe_state; + break; +#endif + case PNPIPE_IFINDEX: val = pn->ifindex; break; @@ -859,7 +1288,14 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; - int err; +#ifdef CONFIG_PHONET_PIPECTRLR + struct sockaddr_pn spn = { + .spn_family = AF_PHONET, + .spn_resource = 0xD9, + .spn_dev = pn_dev(pn->remote_pep), + .spn_obj = pn_obj(pn->remote_pep), + }; +#endif if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { @@ -877,11 +1313,11 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) } else ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; - - err = pn_skb_send(sk, skb, &pipe_srv); - if (err && pn_flow_safe(pn->tx_fc)) - atomic_inc(&pn->tx_credits); - return err; +#ifdef CONFIG_PHONET_PIPECTRLR + return pn_skb_send(sk, skb, &spn); +#else + return pn_skb_send(sk, skb, &pipe_srv); +#endif } static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, -- cgit v1.1 From af5ef241133b602a77b682009f112e7c3f7604e5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 26 Sep 2010 22:47:09 +0000 Subject: vlan: use this_cpu_ptr() in vlan_skb_recv() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan_dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index 94a1fed..f6fbcc0f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -177,8 +177,8 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, } else { skb->dev = vlan_dev; - rx_stats = per_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats, - smp_processor_id()); + rx_stats = this_cpu_ptr(vlan_dev_info(skb->dev)->vlan_rx_stats); + u64_stats_update_begin(&rx_stats->syncp); rx_stats->rx_packets++; rx_stats->rx_bytes += skb->len; -- cgit v1.1 From e985aad723d7709e6bee566bacb100d33d9b791b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 03:57:11 +0000 Subject: ip_gre: percpu stats accounting MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Le lundi 27 septembre 2010 à 14:29 +0100, Ben Hutchings a écrit : > > diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c > > index 5d6ddcb..de39b22 100644 > > --- a/net/ipv4/ip_gre.c > > +++ b/net/ipv4/ip_gre.c > [...] > > @@ -377,7 +405,7 @@ static struct ip_tunnel *ipgre_tunnel_locate(struct net *net, > > if (parms->name[0]) > > strlcpy(name, parms->name, IFNAMSIZ); > > else > > - sprintf(name, "gre%%d"); > > + strcpy(name, "gre%d"); > > > > dev = alloc_netdev(sizeof(*t), name, ipgre_tunnel_setup); > > if (!dev) > [...] > > This is a valid fix, but doesn't belong in this patch! > Sorry ? It was not a fix, but at most a cleanup ;) Anyway I forgot the gretap case... [PATCH 2/4 v2] ip_gre: percpu stats accounting Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets. Other seldom used fields are kept in netdev->stats structure, possibly unsafe. This is a preliminary work to support lockless transmit path, and correct RX stats, that are already unsafe. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 143 +++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 104 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 5d6ddcb..a1b5d5e 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -165,6 +165,34 @@ struct ipgre_net { #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipgre_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + /* Given src, dst and key, find appropriate for input tunnel. */ static struct ip_tunnel * ipgre_tunnel_lookup(struct net_device *dev, @@ -584,7 +612,7 @@ static int ipgre_rcv(struct sk_buff *skb) if ((tunnel = ipgre_tunnel_lookup(skb->dev, iph->saddr, iph->daddr, key, gre_proto))) { - struct net_device_stats *stats = &tunnel->dev->stats; + struct pcpu_tstats *tstats; secpath_reset(skb); @@ -608,22 +636,22 @@ static int ipgre_rcv(struct sk_buff *skb) /* Looped back packet, drop it! */ if (skb_rtable(skb)->fl.iif == 0) goto drop; - stats->multicast++; + tunnel->dev->stats.multicast++; skb->pkt_type = PACKET_BROADCAST; } #endif if (((flags&GRE_CSUM) && csum) || (!(flags&GRE_CSUM) && tunnel->parms.i_flags&GRE_CSUM)) { - stats->rx_crc_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_crc_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } if (tunnel->parms.i_flags&GRE_SEQ) { if (!(flags&GRE_SEQ) || (tunnel->i_seqno && (s32)(seqno - tunnel->i_seqno) < 0)) { - stats->rx_fifo_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_fifo_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } tunnel->i_seqno = seqno + 1; @@ -632,8 +660,8 @@ static int ipgre_rcv(struct sk_buff *skb) /* Warning: All skb pointers will be invalidated! */ if (tunnel->dev->type == ARPHRD_ETHER) { if (!pskb_may_pull(skb, ETH_HLEN)) { - stats->rx_length_errors++; - stats->rx_errors++; + tunnel->dev->stats.rx_length_errors++; + tunnel->dev->stats.rx_errors++; goto drop; } @@ -642,13 +670,17 @@ static int ipgre_rcv(struct sk_buff *skb) skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN); } - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); if (netif_rx(skb) == NET_RX_DROP) - stats->rx_dropped++; + tunnel->dev->stats.rx_dropped++; rcu_read_unlock(); return 0; @@ -665,8 +697,7 @@ drop_nolock: static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *old_iph = ip_hdr(skb); struct iphdr *tiph; u8 tos; @@ -694,7 +725,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev /* NBMA tunnel */ if (skb_dst(skb) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } @@ -740,14 +771,20 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_GRE + } +; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error; } } @@ -755,7 +792,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } @@ -818,7 +855,7 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev dev->needed_headroom = max_headroom; if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -885,15 +922,15 @@ static netdev_tx_t ipgre_tunnel_xmit(struct sk_buff *skb, struct net_device *dev } nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -913,13 +950,19 @@ static int ipgre_tunnel_bind_dev(struct net_device *dev) /* Guess output device to choose reasonable mtu and needed_headroom */ if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_GRE + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->dst.dev; ip_rt_put(rt); @@ -1171,13 +1214,19 @@ static int ipgre_open(struct net_device *dev) struct ip_tunnel *t = netdev_priv(dev); if (ipv4_is_multicast(t->parms.iph.daddr)) { - struct flowi fl = { .oif = t->parms.link, - .nl_u = { .ip4_u = - { .daddr = t->parms.iph.daddr, - .saddr = t->parms.iph.saddr, - .tos = RT_TOS(t->parms.iph.tos) } }, - .proto = IPPROTO_GRE }; + struct flowi fl = { + .oif = t->parms.link, + .nl_u = { + .ip4_u = { + .daddr = t->parms.iph.daddr, + .saddr = t->parms.iph.saddr, + .tos = RT_TOS(t->parms.iph.tos) + } + }, + .proto = IPPROTO_GRE + }; struct rtable *rt; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) return -EADDRNOTAVAIL; dev = rt->dst.dev; @@ -1217,12 +1266,19 @@ static const struct net_device_ops ipgre_netdev_ops = { .ndo_start_xmit = ipgre_tunnel_xmit, .ndo_do_ioctl = ipgre_tunnel_ioctl, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, }; +static void ipgre_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipgre_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipgre_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipgre_dev_free; dev->type = ARPHRD_IPGRE; dev->needed_headroom = LL_MAX_HEADER + sizeof(struct iphdr) + 4; @@ -1260,6 +1316,10 @@ static int ipgre_tunnel_init(struct net_device *dev) } else dev->header_ops = &ipgre_header_ops; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } @@ -1446,6 +1506,10 @@ static int ipgre_tap_init(struct net_device *dev) ipgre_tunnel_bind_dev(dev); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } @@ -1456,6 +1520,7 @@ static const struct net_device_ops ipgre_tap_netdev_ops = { .ndo_set_mac_address = eth_mac_addr, .ndo_validate_addr = eth_validate_addr, .ndo_change_mtu = ipgre_tunnel_change_mtu, + .ndo_get_stats = ipgre_get_stats, }; static void ipgre_tap_setup(struct net_device *dev) @@ -1464,7 +1529,7 @@ static void ipgre_tap_setup(struct net_device *dev) ether_setup(dev); dev->netdev_ops = &ipgre_tap_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipgre_dev_free; dev->iflink = 0; dev->features |= NETIF_F_NETNS_LOCAL; -- cgit v1.1 From 3c97af99a5aa17feaebb4eb0f85f51ab6c055797 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 00:35:50 +0000 Subject: ipip: percpu stats accounting Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets. Other seldom used fields are kept in netdev->stats structure, possibly unsafe. This is a preliminary work to support lockless transmit path, and correct RX stats, that are already unsafe. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 127 +++++++++++++++++++++++++++++++++++++++++--------------- 1 file changed, 93 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index babd252..12b6fde 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -131,8 +131,9 @@ struct ipip_net { struct net_device *fb_tunnel_dev; }; -static void ipip_tunnel_init(struct net_device *dev); +static int ipip_tunnel_init(struct net_device *dev); static void ipip_tunnel_setup(struct net_device *dev); +static void ipip_dev_free(struct net_device *dev); /* * Locking : hash tables are protected by RCU and RTNL @@ -141,6 +142,34 @@ static void ipip_tunnel_setup(struct net_device *dev); #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + static struct ip_tunnel * ipip_tunnel_lookup(struct net *net, __be32 remote, __be32 local) { @@ -239,7 +268,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "tunl%%d"); + strcpy(name, "tunl%d"); dev = alloc_netdev(sizeof(*t), name, ipip_tunnel_setup); if (dev == NULL) @@ -255,7 +284,8 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, nt = netdev_priv(dev); nt->parms = *parms; - ipip_tunnel_init(dev); + if (ipip_tunnel_init(dev) < 0) + goto failed_free; if (register_netdevice(dev) < 0) goto failed_free; @@ -265,7 +295,7 @@ static struct ip_tunnel * ipip_tunnel_locate(struct net *net, return nt; failed_free: - free_netdev(dev); + ipip_dev_free(dev); return NULL; } @@ -359,8 +389,10 @@ static int ipip_rcv(struct sk_buff *skb) const struct iphdr *iph = ip_hdr(skb); rcu_read_lock(); - if ((tunnel = ipip_tunnel_lookup(dev_net(skb->dev), - iph->saddr, iph->daddr)) != NULL) { + tunnel = ipip_tunnel_lookup(dev_net(skb->dev), iph->saddr, iph->daddr); + if (tunnel != NULL) { + struct pcpu_tstats *tstats; + if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) { rcu_read_unlock(); kfree_skb(skb); @@ -374,7 +406,11 @@ static int ipip_rcv(struct sk_buff *skb) skb->protocol = htons(ETH_P_IP); skb->pkt_type = PACKET_HOST; - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); ipip_ecn_decapsulate(iph, skb); @@ -397,13 +433,12 @@ static int ipip_rcv(struct sk_buff *skb) static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *old_iph = ip_hdr(skb); struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ @@ -413,13 +448,13 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (skb->protocol != htons(ETH_P_IP)) goto tx_error; - if (tos&1) + if (tos & 1) tos = old_iph->tos; if (!dst) { /* NBMA tunnel */ if ((rt = skb_rtable(skb)) == NULL) { - stats->tx_fifo_errors++; + dev->stats.tx_fifo_errors++; goto tx_error; } if ((dst = rt->rt_gateway) == 0) @@ -427,14 +462,20 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) } { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = dst, - .saddr = tiph->saddr, - .tos = RT_TOS(tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = dst, + .saddr = tiph->saddr, + .tos = RT_TOS(tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } } @@ -442,7 +483,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } @@ -452,7 +493,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - stats->collisions++; + dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } @@ -488,7 +529,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -525,14 +566,14 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) iph->ttl = old_iph->ttl; nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -547,13 +588,19 @@ static void ipip_tunnel_bind_dev(struct net_device *dev) iph = &tunnel->parms.iph; if (iph->daddr) { - struct flowi fl = { .oif = tunnel->parms.link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .saddr = iph->saddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = tunnel->parms.link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; struct rtable *rt; + if (!ip_route_output_key(dev_net(dev), &rt, &fl)) { tdev = rt->dst.dev; ip_rt_put(rt); @@ -699,13 +746,19 @@ static const struct net_device_ops ipip_netdev_ops = { .ndo_start_xmit = ipip_tunnel_xmit, .ndo_do_ioctl = ipip_tunnel_ioctl, .ndo_change_mtu = ipip_tunnel_change_mtu, - + .ndo_get_stats = ipip_get_stats, }; +static void ipip_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipip_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipip_dev_free; dev->type = ARPHRD_TUNNEL; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -717,7 +770,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } -static void ipip_tunnel_init(struct net_device *dev) +static int ipip_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); @@ -728,6 +781,12 @@ static void ipip_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip_tunnel_bind_dev(dev); + + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; } static void __net_init ipip_fb_tunnel_init(struct net_device *dev) -- cgit v1.1 From 15fc1f7056ebdc57e23f99077fec89e63e6fa941 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 00:38:18 +0000 Subject: sit: percpu stats accounting Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets. Other seldom used fields are kept in netdev->stats structure, possibly unsafe. This is a preliminary work to support lockless transmit path, and correct RX stats, that are already unsafe. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 82 +++++++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 64 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 8a03998..011ecf5 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -63,8 +63,9 @@ #define HASH_SIZE 16 #define HASH(addr) (((__force u32)addr^((__force u32)addr>>4))&0xF) -static void ipip6_tunnel_init(struct net_device *dev); +static int ipip6_tunnel_init(struct net_device *dev); static void ipip6_tunnel_setup(struct net_device *dev); +static void ipip6_dev_free(struct net_device *dev); static int sit_net_id __read_mostly; struct sit_net { @@ -84,6 +85,33 @@ struct sit_net { #define for_each_ip_tunnel_rcu(start) \ for (t = rcu_dereference(start); t; t = rcu_dereference(t->next)) +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ipip6_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} /* * Must be invoked with rcu_read_lock */ @@ -214,7 +242,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, if (parms->name[0]) strlcpy(name, parms->name, IFNAMSIZ); else - sprintf(name, "sit%%d"); + strcpy(name, "sit%d"); dev = alloc_netdev(sizeof(*t), name, ipip6_tunnel_setup); if (dev == NULL) @@ -230,7 +258,8 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, nt = netdev_priv(dev); nt->parms = *parms; - ipip6_tunnel_init(dev); + if (ipip6_tunnel_init(dev) < 0) + goto failed_free; ipip6_tunnel_clone_6rd(dev, sitn); if (parms->i_flags & SIT_ISATAP) @@ -245,7 +274,7 @@ static struct ip_tunnel *ipip6_tunnel_locate(struct net *net, return nt; failed_free: - free_netdev(dev); + ipip6_dev_free(dev); failed: return NULL; } @@ -546,6 +575,8 @@ static int ipip6_rcv(struct sk_buff *skb) tunnel = ipip6_tunnel_lookup(dev_net(skb->dev), skb->dev, iph->saddr, iph->daddr); if (tunnel != NULL) { + struct pcpu_tstats *tstats; + secpath_reset(skb); skb->mac_header = skb->network_header; skb_reset_network_header(skb); @@ -561,7 +592,11 @@ static int ipip6_rcv(struct sk_buff *skb) return 0; } - skb_tunnel_rx(skb, tunnel->dev); + tstats = this_cpu_ptr(tunnel->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, tunnel->dev); ipip6_ecn_decapsulate(iph, skb); @@ -626,14 +661,13 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); - struct net_device_stats *stats = &dev->stats; - struct netdev_queue *txq = netdev_get_tx_queue(dev, 0); + struct pcpu_tstats *tstats; struct iphdr *tiph = &tunnel->parms.iph; struct ipv6hdr *iph6 = ipv6_hdr(skb); u8 tos = tunnel->parms.iph.tos; __be16 df = tiph->frag_off; struct rtable *rt; /* Route to the other host */ - struct net_device *tdev; /* Device to other host */ + struct net_device *tdev; /* Device to other host */ struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ __be32 dst = tiph->daddr; @@ -704,20 +738,20 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, .oif = tunnel->parms.link, .proto = IPPROTO_IPV6 }; if (ip_route_output_key(dev_net(dev), &rt, &fl)) { - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } } if (rt->rt_type != RTN_UNICAST) { ip_rt_put(rt); - stats->tx_carrier_errors++; + dev->stats.tx_carrier_errors++; goto tx_error_icmp; } tdev = rt->dst.dev; if (tdev == dev) { ip_rt_put(rt); - stats->collisions++; + dev->stats.collisions++; goto tx_error; } @@ -725,7 +759,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - stats->collisions++; + dev->stats.collisions++; ip_rt_put(rt); goto tx_error; } @@ -764,7 +798,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, struct sk_buff *new_skb = skb_realloc_headroom(skb, max_headroom); if (!new_skb) { ip_rt_put(rt); - txq->tx_dropped++; + dev->stats.tx_dropped++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -800,14 +834,14 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb, iph->ttl = iph6->hop_limit; nf_reset(skb); - - IPTUNNEL_XMIT(); + tstats = this_cpu_ptr(dev->tstats); + __IPTUNNEL_XMIT(tstats, &dev->stats); return NETDEV_TX_OK; tx_error_icmp: dst_link_failure(skb); tx_error: - stats->tx_errors++; + dev->stats.tx_errors++; dev_kfree_skb(skb); return NETDEV_TX_OK; } @@ -1084,12 +1118,19 @@ static const struct net_device_ops ipip6_netdev_ops = { .ndo_start_xmit = ipip6_tunnel_xmit, .ndo_do_ioctl = ipip6_tunnel_ioctl, .ndo_change_mtu = ipip6_tunnel_change_mtu, + .ndo_get_stats = ipip6_get_stats, }; +static void ipip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + static void ipip6_tunnel_setup(struct net_device *dev) { dev->netdev_ops = &ipip6_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ipip6_dev_free; dev->type = ARPHRD_SIT; dev->hard_header_len = LL_MAX_HEADER + sizeof(struct iphdr); @@ -1101,7 +1142,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->features |= NETIF_F_NETNS_LOCAL; } -static void ipip6_tunnel_init(struct net_device *dev) +static int ipip6_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); @@ -1112,6 +1153,11 @@ static void ipip6_tunnel_init(struct net_device *dev) memcpy(dev->broadcast, &tunnel->parms.iph.daddr, 4); ipip6_tunnel_bind_dev(dev); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + + return 0; } static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) -- cgit v1.1 From 7fa7cb7109d07c29ab28bb877bc7049a0150dbe5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 04:18:27 +0000 Subject: fib: use atomic_inc_not_zero() in fib_rules_lookup It seems we dont use appropriate refcount increment in an rcu_read_lock() protected section. fib_rule_get() might increment a null refcount and bad things could happen. While fib_nl_delrule() respects an rcu grace period before calling fib_rule_put(), fib_rules_cleanup_ops() calls fib_rule_put() without a grace period. Note : after this patch, we might avoid the synchronize_rcu() call done in fib_nl_delrule() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 42e84e0..d078728 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -225,9 +225,11 @@ jumped: err = ops->action(rule, fl, flags, arg); if (err != -EAGAIN) { - fib_rule_get(rule); - arg->rule = rule; - goto out; + if (likely(atomic_inc_not_zero(&rule->refcnt))) { + arg->rule = rule; + goto out; + } + break; } } -- cgit v1.1 From f91ff5b9ff529be8aac2039af63b2c8ea6cd6ebe Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 06:07:30 +0000 Subject: net: sk_{detach|attach}_filter() rcu fixes sk_attach_filter() and sk_detach_filter() are run with socket locked. Use the appropriate rcu_dereference_protected() instead of blocking BH, and rcu_dereference_bh(). There is no point adding BH prevention and memory barrier. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/filter.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/filter.c b/net/core/filter.c index 52b051f..7adf503 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -638,10 +638,9 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk) return err; } - rcu_read_lock_bh(); - old_fp = rcu_dereference_bh(sk->sk_filter); + old_fp = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); rcu_assign_pointer(sk->sk_filter, fp); - rcu_read_unlock_bh(); if (old_fp) sk_filter_delayed_uncharge(sk, old_fp); @@ -654,14 +653,13 @@ int sk_detach_filter(struct sock *sk) int ret = -ENOENT; struct sk_filter *filter; - rcu_read_lock_bh(); - filter = rcu_dereference_bh(sk->sk_filter); + filter = rcu_dereference_protected(sk->sk_filter, + sock_owned_by_user(sk)); if (filter) { rcu_assign_pointer(sk->sk_filter, NULL); sk_filter_delayed_uncharge(sk, filter); ret = 0; } - rcu_read_unlock_bh(); return ret; } EXPORT_SYMBOL_GPL(sk_detach_filter); -- cgit v1.1 From 62fe0b40abb3484413800edaef9b087a20059acf Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 27 Sep 2010 08:24:33 +0000 Subject: net: Allow changing number of RX queues after device allocation For RPS, we create a kobject for each RX queue based on the number of queues passed to alloc_netdev_mq(). However, drivers generally do not determine the numbers of hardware queues to use until much later, so this usually represents the maximum number the driver may use and not the actual number in use. For TX queues, drivers can update the actual number using netif_set_real_num_tx_queues(). Add a corresponding function for RX queues, netif_set_real_num_rx_queues(). Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 45 +++++++++++++++++++++++++++++++++++++++++---- net/core/net-sysfs.c | 32 ++++++++++++++++++-------------- net/core/net-sysfs.h | 4 ++++ 3 files changed, 63 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 42b200f..48ad47f 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1567,6 +1567,41 @@ void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) } EXPORT_SYMBOL(netif_set_real_num_tx_queues); +#ifdef CONFIG_RPS +/** + * netif_set_real_num_rx_queues - set actual number of RX queues used + * @dev: Network device + * @rxq: Actual number of RX queues + * + * This must be called either with the rtnl_lock held or before + * registration of the net device. Returns 0 on success, or a + * negative error code. If called before registration, it also + * sets the maximum number of queues, and always succeeds. + */ +int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) +{ + int rc; + + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + if (rxq > dev->num_rx_queues) + return -EINVAL; + + rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, + rxq); + if (rc) + return rc; + } else { + dev->num_rx_queues = rxq; + } + + dev->real_num_rx_queues = rxq; + return 0; +} +EXPORT_SYMBOL(netif_set_real_num_rx_queues); +#endif + static inline void __netif_reschedule(struct Qdisc *q) { struct softnet_data *sd; @@ -2352,10 +2387,11 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb, if (skb_rx_queue_recorded(skb)) { u16 index = skb_get_rx_queue(skb); - if (unlikely(index >= dev->num_rx_queues)) { - WARN_ONCE(dev->num_rx_queues > 1, "%s received packet " - "on queue %u, but number of RX queues is %u\n", - dev->name, index, dev->num_rx_queues); + if (unlikely(index >= dev->real_num_rx_queues)) { + WARN_ONCE(dev->real_num_rx_queues > 1, + "%s received packet on queue %u, but number " + "of RX queues is %u\n", + dev->name, index, dev->real_num_rx_queues); goto done; } rxqueue = dev->_rx + index; @@ -5472,6 +5508,7 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, #ifdef CONFIG_RPS dev->num_rx_queues = queue_count; + dev->real_num_rx_queues = queue_count; #endif dev->gso_max_size = GSO_MAX_SIZE; diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index 76485a3..fa81fd0 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -742,34 +742,38 @@ static int rx_queue_add_kobject(struct net_device *net, int index) return error; } -static int rx_queue_register_kobjects(struct net_device *net) +int +net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num) { int i; int error = 0; - net->queues_kset = kset_create_and_add("queues", - NULL, &net->dev.kobj); - if (!net->queues_kset) - return -ENOMEM; - for (i = 0; i < net->num_rx_queues; i++) { + for (i = old_num; i < new_num; i++) { error = rx_queue_add_kobject(net, i); - if (error) + if (error) { + new_num = old_num; break; + } } - if (error) - while (--i >= 0) - kobject_put(&net->_rx[i].kobj); + while (--i >= new_num) + kobject_put(&net->_rx[i].kobj); return error; } -static void rx_queue_remove_kobjects(struct net_device *net) +static int rx_queue_register_kobjects(struct net_device *net) { - int i; + net->queues_kset = kset_create_and_add("queues", + NULL, &net->dev.kobj); + if (!net->queues_kset) + return -ENOMEM; + return net_rx_queue_update_kobjects(net, 0, net->real_num_rx_queues); +} - for (i = 0; i < net->num_rx_queues; i++) - kobject_put(&net->_rx[i].kobj); +static void rx_queue_remove_kobjects(struct net_device *net) +{ + net_rx_queue_update_kobjects(net, net->real_num_rx_queues, 0); kset_unregister(net->queues_kset); } #endif /* CONFIG_RPS */ diff --git a/net/core/net-sysfs.h b/net/core/net-sysfs.h index 805555e..778e157 100644 --- a/net/core/net-sysfs.h +++ b/net/core/net-sysfs.h @@ -4,4 +4,8 @@ int netdev_kobject_init(void); int netdev_register_kobject(struct net_device *); void netdev_unregister_kobject(struct net_device *); +#ifdef CONFIG_RPS +int net_rx_queue_update_kobjects(struct net_device *, int old_num, int new_num); +#endif + #endif -- cgit v1.1 From 5df8dbd7e517b581e0b8cbd72a99da17914a20b0 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Mon, 27 Sep 2010 08:32:59 +0000 Subject: 8021q: Use netif_copy_real_num_queues() to set queue counts This covers RX if necessary, as well as TX. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/8021q/vlan.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 2c6c2bd..25c2133 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -321,7 +321,7 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id) if (new_dev == NULL) return -ENOBUFS; - new_dev->real_num_tx_queues = real_dev->real_num_tx_queues; + netif_copy_real_num_queues(new_dev, real_dev); dev_net_set(new_dev, net); /* need 4 bytes for extra VLAN header info, * hope the underlying device can handle it. -- cgit v1.1 From bc01befdcf3e40979eb518085a075cbf0aacede0 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 28 Sep 2010 21:06:34 +0200 Subject: netfilter: ctnetlink: add support for user-space expectation helpers This patch adds the basic infrastructure to support user-space expectation helpers via ctnetlink and the netfilter queuing infrastructure NFQUEUE. Basically, this patch: * adds NF_CT_EXPECT_USERSPACE flag to identify user-space created expectations. I have also added a sanity check in __nf_ct_expect_check() to avoid that kernel-space helpers may create an expectation if the master conntrack has no helper assigned. * adds some branches to check if the master conntrack helper exists, otherwise we skip the code that refers to kernel-space helper such as the local expectation list and the expectation policy. * allows to set the timeout for user-space expectations with no helper assigned. * a list of expectations created from user-space that depends on ctnetlink (if this module is removed, they are deleted). * includes USERSPACE in the /proc output for expectations that have been created by a user-space helper. This patch also modifies ctnetlink to skip including the helper name in the Netlink messages if no kernel-space helper is set (since no user-space expectation has not kernel-space kernel assigned). You can access an example user-space FTP conntrack helper at: http://people.netfilter.org/pablo/userspace-conntrack-helpers/nf-ftp-helper-userspace-POC.tar.bz Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_expect.c | 62 +++++++++++++++++++++++++++--------- net/netfilter/nf_conntrack_netlink.c | 46 ++++++++++++++++---------- 2 files changed, 77 insertions(+), 31 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index acb29cc..b30a1f2 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -38,20 +38,23 @@ static int nf_ct_expect_hash_rnd_initted __read_mostly; static struct kmem_cache *nf_ct_expect_cachep __read_mostly; +static HLIST_HEAD(nf_ct_userspace_expect_list); + /* nf_conntrack_expect helper functions */ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); - NF_CT_ASSERT(master_help); NF_CT_ASSERT(!timer_pending(&exp->timeout)); hlist_del_rcu(&exp->hnode); net->ct.expect_count--; hlist_del(&exp->lnode); - master_help->expecting[exp->class]--; + if (!(exp->flags & NF_CT_EXPECT_USERSPACE)) + master_help->expecting[exp->class]--; + nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); @@ -320,16 +323,21 @@ static void nf_ct_expect_insert(struct nf_conntrack_expect *exp) atomic_inc(&exp->use); - hlist_add_head(&exp->lnode, &master_help->expectations); - master_help->expecting[exp->class]++; + if (master_help) { + hlist_add_head(&exp->lnode, &master_help->expectations); + master_help->expecting[exp->class]++; + } else if (exp->flags & NF_CT_EXPECT_USERSPACE) + hlist_add_head(&exp->lnode, &nf_ct_userspace_expect_list); hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]); net->ct.expect_count++; setup_timer(&exp->timeout, nf_ct_expectation_timed_out, (unsigned long)exp); - p = &master_help->helper->expect_policy[exp->class]; - exp->timeout.expires = jiffies + p->timeout * HZ; + if (master_help) { + p = &master_help->helper->expect_policy[exp->class]; + exp->timeout.expires = jiffies + p->timeout * HZ; + } add_timer(&exp->timeout); atomic_inc(&exp->use); @@ -380,7 +388,9 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) unsigned int h; int ret = 1; - if (!master_help->helper) { + /* Don't allow expectations created from kernel-space with no helper */ + if (!(expect->flags & NF_CT_EXPECT_USERSPACE) && + (!master_help || (master_help && !master_help->helper))) { ret = -ESHUTDOWN; goto out; } @@ -398,13 +408,16 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect) } } /* Will be over limit? */ - p = &master_help->helper->expect_policy[expect->class]; - if (p->max_expected && - master_help->expecting[expect->class] >= p->max_expected) { - evict_oldest_expect(master, expect); - if (master_help->expecting[expect->class] >= p->max_expected) { - ret = -EMFILE; - goto out; + if (master_help) { + p = &master_help->helper->expect_policy[expect->class]; + if (p->max_expected && + master_help->expecting[expect->class] >= p->max_expected) { + evict_oldest_expect(master, expect); + if (master_help->expecting[expect->class] + >= p->max_expected) { + ret = -EMFILE; + goto out; + } } } @@ -439,6 +452,21 @@ out: } EXPORT_SYMBOL_GPL(nf_ct_expect_related_report); +void nf_ct_remove_userspace_expectations(void) +{ + struct nf_conntrack_expect *exp; + struct hlist_node *n, *next; + + hlist_for_each_entry_safe(exp, n, next, + &nf_ct_userspace_expect_list, lnode) { + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect(exp); + nf_ct_expect_put(exp); + } + } +} +EXPORT_SYMBOL_GPL(nf_ct_remove_userspace_expectations); + #ifdef CONFIG_PROC_FS struct ct_expect_iter_state { struct seq_net_private p; @@ -529,8 +557,12 @@ static int exp_seq_show(struct seq_file *s, void *v) seq_printf(s, "PERMANENT"); delim = ","; } - if (expect->flags & NF_CT_EXPECT_INACTIVE) + if (expect->flags & NF_CT_EXPECT_INACTIVE) { seq_printf(s, "%sINACTIVE", delim); + delim = ","; + } + if (expect->flags & NF_CT_EXPECT_USERSPACE) + seq_printf(s, "%sUSERSPACE", delim); helper = rcu_dereference(nfct_help(expect->master)->helper); if (helper) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 0804e0e..b4077be 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1560,8 +1560,8 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, const struct nf_conntrack_expect *exp) { struct nf_conn *master = exp->master; - struct nf_conntrack_helper *helper; long timeout = (exp->timeout.expires - jiffies) / HZ; + struct nf_conn_help *help; if (timeout < 0) timeout = 0; @@ -1578,9 +1578,14 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, NLA_PUT_BE32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)); NLA_PUT_BE32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)); NLA_PUT_BE32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)); - helper = rcu_dereference(nfct_help(master)->helper); - if (helper) - NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + help = nfct_help(master); + if (help) { + struct nf_conntrack_helper *helper; + + helper = rcu_dereference(help->helper); + if (helper) + NLA_PUT_STRING(skb, CTA_EXPECT_HELP_NAME, helper->name); + } return 0; @@ -1921,24 +1926,32 @@ ctnetlink_create_expect(struct net *net, u16 zone, if (!h) return -ENOENT; ct = nf_ct_tuplehash_to_ctrack(h); - help = nfct_help(ct); - - if (!help || !help->helper) { - /* such conntrack hasn't got any helper, abort */ - err = -EOPNOTSUPP; - goto out; - } - exp = nf_ct_expect_alloc(ct); if (!exp) { err = -ENOMEM; goto out; } + help = nfct_help(ct); + if (!help) { + if (!cda[CTA_EXPECT_TIMEOUT]) { + err = -EINVAL; + goto out; + } + exp->timeout.expires = + jiffies + ntohl(nla_get_be32(cda[CTA_EXPECT_TIMEOUT])) * HZ; - if (cda[CTA_EXPECT_FLAGS]) - exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); - else - exp->flags = 0; + exp->flags = NF_CT_EXPECT_USERSPACE; + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags |= + ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + } + } else { + if (cda[CTA_EXPECT_FLAGS]) { + exp->flags = ntohl(nla_get_be32(cda[CTA_EXPECT_FLAGS])); + exp->flags &= ~NF_CT_EXPECT_USERSPACE; + } else + exp->flags = 0; + } exp->class = 0; exp->expectfn = NULL; @@ -2109,6 +2122,7 @@ static void __exit ctnetlink_exit(void) { pr_info("ctnetlink: unregistering from nfnetlink.\n"); + nf_ct_remove_userspace_expectations(); #ifdef CONFIG_NF_CONNTRACK_EVENTS nf_ct_expect_unregister_notifier(&ctnl_notifier_exp); nf_conntrack_unregister_notifier(&ctnl_notifier); -- cgit v1.1 From 8d4780eb1ece4e8109b4f6b2e5e61f7fc593c3f4 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 24 Sep 2010 21:59:57 -0400 Subject: mac80211: fix offchannel assumption upon association Association is dealt with as an atomic offchannel operation, we do this because we don't know we are associated until we get the associatin response from the AP. When we do get the associatin response though we were never clearing the offchannel state. This has a few implications, we told drivers we were still offchannel, and the first configured TX power for the channel does not take into account any power constraints. For ath9k this meant ANI calibration would not start upon association, and we'd have to wait until the first bgscan to be triggered. There may be other issues this resolves but I'm too lazy to comb the code to check. Cc: stable@kernel.org Cc: Amod Bodas Cc: Vasanth Thiagarajan Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/main.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 7c85426..e24fa5b 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -110,7 +110,8 @@ int ieee80211_hw_config(struct ieee80211_local *local, u32 changed) chan = scan_chan; channel_type = NL80211_CHAN_NO_HT; local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; - } else if (local->tmp_channel) { + } else if (local->tmp_channel && + local->oper_channel != local->tmp_channel) { chan = scan_chan = local->tmp_channel; channel_type = local->tmp_channel_type; local->hw.conf.flags |= IEEE80211_CONF_OFFCHANNEL; -- cgit v1.1 From 93b05238027420978d785569f8c1aa4a6867bc13 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 28 Sep 2010 12:53:14 +0200 Subject: cfg80211: always set IBSS basic rates IBSS started from wireless extensions is currently missing basic rate configuration, fix this by moving the code to generate the default to the common code that gets invoked for both nl80211 and wext. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/ibss.c | 19 +++++++++++++++++++ net/wireless/nl80211.c | 17 ----------------- 2 files changed, 19 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 27a8ce9..8cb6e08 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -88,6 +88,25 @@ int __cfg80211_join_ibss(struct cfg80211_registered_device *rdev, if (wdev->ssid_len) return -EALREADY; + if (!params->basic_rates) { + /* + * If no rates were explicitly configured, + * use the mandatory rate set for 11b or + * 11a for maximum compatibility. + */ + struct ieee80211_supported_band *sband = + rdev->wiphy.bands[params->channel->band]; + int j; + u32 flag = params->channel->band == IEEE80211_BAND_5GHZ ? + IEEE80211_RATE_MANDATORY_A : + IEEE80211_RATE_MANDATORY_B; + + for (j = 0; j < sband->n_bitrates; j++) { + if (sband->bitrates[j].flags & flag) + params->basic_rates |= BIT(j); + } + } + if (WARN_ON(wdev->connect_keys)) kfree(wdev->connect_keys); wdev->connect_keys = connkeys; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4ff827e..9c84825 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4119,23 +4119,6 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) goto out; } } - } else { - /* - * If no rates were explicitly configured, - * use the mandatory rate set for 11b or - * 11a for maximum compatibility. - */ - struct ieee80211_supported_band *sband = - wiphy->bands[ibss.channel->band]; - int j; - u32 flag = ibss.channel->band == IEEE80211_BAND_5GHZ ? - IEEE80211_RATE_MANDATORY_A : - IEEE80211_RATE_MANDATORY_B; - - for (j = 0; j < sband->n_bitrates; j++) { - if (sband->bitrates[j].flags & flag) - ibss.basic_rates |= BIT(j); - } } err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); -- cgit v1.1 From f2176d7240e4f455a6e007703c7512fbde926dc8 Mon Sep 17 00:00:00 2001 From: Juuso Oikarinen Date: Tue, 28 Sep 2010 14:39:32 +0300 Subject: mac80211: Fix WMM driver queue configuration The WMM parameter configuration function (ieee80211_sta_wmm_params) only configures the WMM parameters to the driver is the wmm_last_param_set counter value is changed by the AP. The wmm_last_param_set is initialized to -1 on association in order to ensure the configuration is made to the driver at least once on association, but currently this initialization is done *after* the WMM parameter configuration function was called. This leads to unreliability in the driver getting properly configured on first association (depending on what counter value the AP happens to use.) When disassociating (the wmm default parameters are configured to the driver) and then reassociating, due to the above the WMM configuration is not set to the driver at all. On drivers without beacon filtering the problem is corrected by later beacons, but on drivers with beacon filtering the WMM will remain permanently incorrectly configured. Fix this by moving the initialization of wmm_last_param_set to -1 before ieee80211_sta_wmm_params is called on association. Signed-off-by: Juuso Oikarinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 8b733cf..77913a1 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -880,14 +880,6 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, sdata->u.mgd.flags &= ~(IEEE80211_STA_CONNECTION_POLL | IEEE80211_STA_BEACON_POLL); - /* - * Always handle WMM once after association regardless - * of the first value the AP uses. Setting -1 here has - * that effect because the AP values is an unsigned - * 4-bit value. - */ - sdata->u.mgd.wmm_last_param_set = -1; - ieee80211_led_assoc(local, 1); if (local->hw.flags & IEEE80211_HW_NEED_DTIM_PERIOD) @@ -1367,6 +1359,14 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk, return false; } + /* + * Always handle WMM once after association regardless + * of the first value the AP uses. Setting -1 here has + * that effect because the AP values is an unsigned + * 4-bit value. + */ + ifmgd->wmm_last_param_set = -1; + if (elems.wmm_param) ieee80211_sta_wmm_params(local, sdata, elems.wmm_param, elems.wmm_param_len); -- cgit v1.1 From 4465b469008bc03b98a1b8df4e9ae501b6c69d4b Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Sun, 23 May 2010 19:54:12 +0000 Subject: ipv4: Allow configuring subnets as local addresses This patch allows a host to be configured to respond to any address in a specified range as if it were local, without actually needing to configure the address on an interface. This is done through routing table configuration. For instance, to configure a host to respond to any address in 10.1/16 received on eth0 as a local address we can do: ip rule add from all iif eth0 lookup 200 ip route add local 10.1/16 dev lo proto kernel scope host src 127.0.0.1 table 200 This host is now reachable by any 10.1/16 address (route lookup on input for packets received on eth0 can find the route). On output, the rule will not be matched so that this host can still send packets to 10.1/16 (not sent on loopback). Presumably, external routing can be configured to make sense out of this. To make this work, we needed to modify the logic in finding the interface which is assigned a given source address for output (dev_ip_find). We perform a normal fib_lookup instead of just a lookup on the local table, and in the lookup we ignore the input interface for matching. This patch is useful to implement IP-anycast for subnets of virtual addresses. Signed-off-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 ++- net/ipv4/fib_frontend.c | 7 +++---- 2 files changed, 5 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index d078728..332c2e3 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -182,7 +182,8 @@ static int fib_rule_match(struct fib_rule *rule, struct fib_rules_ops *ops, { int ret = 0; - if (rule->iifindex && (rule->iifindex != fl->iif)) + if (rule->iifindex && (rule->iifindex != fl->iif) && + !(fl->flags & FLOWI_FLAG_MATCH_ANY_IIF)) goto out; if (rule->oifindex && (rule->oifindex != fl->oif)) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 7d02a9f..981f3c5 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -153,17 +153,16 @@ static void fib_flush(struct net *net) struct net_device * ip_dev_find(struct net *net, __be32 addr) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } }, + .flags = FLOWI_FLAG_MATCH_ANY_IIF }; struct fib_result res; struct net_device *dev = NULL; - struct fib_table *local_table; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (!local_table || fib_table_lookup(local_table, &fl, &res)) + if (fib_lookup(net, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) goto out; -- cgit v1.1 From ab79ad14a2d51e95f0ac3cef7cd116a57089ba82 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Maciej=20=C5=BBenczykowski?= Date: Mon, 27 Sep 2010 00:07:02 +0000 Subject: ipv6: Implement Any-IP support for IPv6. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit AnyIP is the capability to receive packets and establish incoming connections on IPs we have not explicitly configured on the machine. An example use case is to configure a machine to accept all incoming traffic on eth0, and leave the policy of whether traffic for a given IP should be delivered to the machine up to the load balancer. Can be setup as follows: ip -6 rule from all iif eth0 lookup 200 ip -6 route add local default dev lo table 200 (in this case for all IPv6 addresses) Signed-off-by: Maciej Å»enczykowski Signed-off-by: David S. Miller --- net/ipv6/route.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 25b0bed..25476e7 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1169,6 +1169,8 @@ int ip6_route_add(struct fib6_config *cfg) if (addr_type & IPV6_ADDR_MULTICAST) rt->dst.input = ip6_mc_input; + else if (cfg->fc_flags & RTF_LOCAL) + rt->dst.input = ip6_input; else rt->dst.input = ip6_forward; @@ -1190,7 +1192,8 @@ int ip6_route_add(struct fib6_config *cfg) they would result in kernel looping; promote them to reject routes */ if ((cfg->fc_flags & RTF_REJECT) || - (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK))) { + (dev && (dev->flags&IFF_LOOPBACK) && !(addr_type&IPV6_ADDR_LOOPBACK) + && !(cfg->fc_flags&RTF_LOCAL))) { /* hold loopback dev/idev if we haven't done so. */ if (dev != net->loopback_dev) { if (dev) { @@ -2082,6 +2085,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh, if (rtm->rtm_type == RTN_UNREACHABLE) cfg->fc_flags |= RTF_REJECT; + if (rtm->rtm_type == RTN_LOCAL) + cfg->fc_flags |= RTF_LOCAL; + cfg->fc_nlinfo.pid = NETLINK_CB(skb).pid; cfg->fc_nlinfo.nlh = nlh; cfg->fc_nlinfo.nl_net = sock_net(skb->sk); @@ -2202,6 +2208,8 @@ static int rt6_fill_node(struct net *net, NLA_PUT_U32(skb, RTA_TABLE, table); if (rt->rt6i_flags&RTF_REJECT) rtm->rtm_type = RTN_UNREACHABLE; + else if (rt->rt6i_flags&RTF_LOCAL) + rtm->rtm_type = RTN_LOCAL; else if (rt->rt6i_dev && (rt->rt6i_dev->flags&IFF_LOOPBACK)) rtm->rtm_type = RTN_LOCAL; else -- cgit v1.1 From 745e20f1b626b1be4b100af5d4bf7b3439392f8f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2010 13:23:09 -0700 Subject: net: add a recursion limit in xmit path As tunnel devices are going to be lockless, we need to make sure a misconfigured machine wont enter an infinite loop. Add a percpu variable, and limit to three the number of stacked xmits. Reported-by: Jesse Gross Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 48ad47f..50dacca 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2177,6 +2177,9 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q, return rc; } +static DEFINE_PER_CPU(int, xmit_recursion); +#define RECURSION_LIMIT 3 + /** * dev_queue_xmit - transmit a buffer * @skb: buffer to transmit @@ -2242,10 +2245,15 @@ int dev_queue_xmit(struct sk_buff *skb) if (txq->xmit_lock_owner != cpu) { + if (__this_cpu_read(xmit_recursion) > RECURSION_LIMIT) + goto recursion_alert; + HARD_TX_LOCK(dev, txq, cpu); if (!netif_tx_queue_stopped(txq)) { + __this_cpu_inc(xmit_recursion); rc = dev_hard_start_xmit(skb, dev, txq); + __this_cpu_dec(xmit_recursion); if (dev_xmit_complete(rc)) { HARD_TX_UNLOCK(dev, txq); goto out; @@ -2257,7 +2265,9 @@ int dev_queue_xmit(struct sk_buff *skb) "queue packet!\n", dev->name); } else { /* Recursion is detected! It is possible, - * unfortunately */ + * unfortunately + */ +recursion_alert: if (net_ratelimit()) printk(KERN_CRIT "Dead loop on virtual device " "%s, fix it urgently!\n", dev->name); -- cgit v1.1 From fada5636fe41fd1423fe4e6af7b9f609378acde6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 23:56:46 +0000 Subject: ipip: fix percpu stats accounting commit 3c97af99a5aa1 (ipip: percpu stats accounting) forgot the fallback tunnel case (tunl0), and can crash pretty fast. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 12b6fde..9e78f11 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -789,7 +789,7 @@ static int ipip_tunnel_init(struct net_device *dev) return 0; } -static void __net_init ipip_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -802,8 +802,13 @@ static void __net_init ipip_fb_tunnel_init(struct net_device *dev) iph->protocol = IPPROTO_IPIP; iph->ihl = 5; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + dev_hold(dev); rcu_assign_pointer(ipn->tunnels_wc[0], tunnel); + return 0; } static struct xfrm_tunnel ipip_handler __read_mostly = { @@ -852,7 +857,9 @@ static int __net_init ipip_init_net(struct net *net) } dev_net_set(ipn->fb_tunnel_dev, net); - ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + err = ipip_fb_tunnel_init(ipn->fb_tunnel_dev); + if (err) + goto err_reg_dev; if ((err = register_netdev(ipn->fb_tunnel_dev))) goto err_reg_dev; @@ -860,7 +867,7 @@ static int __net_init ipip_init_net(struct net *net) return 0; err_reg_dev: - free_netdev(ipn->fb_tunnel_dev); + ipip_dev_free(ipn->fb_tunnel_dev); err_alloc_dev: /* nothing */ return err; -- cgit v1.1 From dd4080ee575db1a2d0f40538aed5aa7662a06c54 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 02:17:58 +0000 Subject: sit: fix percpu stats accounting commit 15fc1f7056ebd (sit: percpu stats accounting) forgot the fallback tunnel case (sit0), and can crash pretty fast. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 011ecf5..2cb6460 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1160,7 +1160,7 @@ static int ipip6_tunnel_init(struct net_device *dev) return 0; } -static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) +static int __net_init ipip6_fb_tunnel_init(struct net_device *dev) { struct ip_tunnel *tunnel = netdev_priv(dev); struct iphdr *iph = &tunnel->parms.iph; @@ -1175,8 +1175,12 @@ static void __net_init ipip6_fb_tunnel_init(struct net_device *dev) iph->ihl = 5; iph->ttl = 64; + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; dev_hold(dev); sitn->tunnels_wc[0] = tunnel; + return 0; } static struct xfrm_tunnel sit_handler __read_mostly = { @@ -1220,7 +1224,10 @@ static int __net_init sit_init_net(struct net *net) } dev_net_set(sitn->fb_tunnel_dev, net); - ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + err = ipip6_fb_tunnel_init(sitn->fb_tunnel_dev); + if (err) + goto err_dev_free; + ipip6_tunnel_clone_6rd(sitn->fb_tunnel_dev, sitn); if ((err = register_netdev(sitn->fb_tunnel_dev))) @@ -1230,7 +1237,8 @@ static int __net_init sit_init_net(struct net *net) err_reg_dev: dev_put(sitn->fb_tunnel_dev); - free_netdev(sitn->fb_tunnel_dev); +err_dev_free: + ipip6_dev_free(sitn->fb_tunnel_dev); err_alloc_dev: return err; } -- cgit v1.1 From 8df40d1033d64597dcf1efd4f7547e817f7a953b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 02:53:41 +0000 Subject: sit: enable lockless xmits SIT tunnels can benefit from lockless xmits, using NETIF_F_LLTX Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending 10000000 UDP frames via one sit tunnel (size:220 bytes per frame) Before patch : real 3m15.399s user 0m9.185s sys 51m55.403s 75029.00 87.5% _raw_spin_lock vmlinux 1090.00 1.3% dst_release vmlinux 902.00 1.1% dev_queue_xmit vmlinux 627.00 0.7% sock_wfree vmlinux 613.00 0.7% ip6_push_pending_frames ipv6.ko 505.00 0.6% __ip_route_output_key vmlinux After patch: real 1m1.387s user 0m12.489s sys 15m58.868s 28239.00 23.3% dst_release vmlinux 13570.00 11.2% ip6_push_pending_frames ipv6.ko 13118.00 10.8% ip6_append_data ipv6.ko 7995.00 6.6% __ip_route_output_key vmlinux 7924.00 6.5% sk_dst_check vmlinux 5015.00 4.1% udpv6_sendmsg ipv6.ko 3594.00 3.0% sock_alloc_send_pskb vmlinux 3135.00 2.6% sock_wfree vmlinux 3055.00 2.5% ip6_sk_dst_lookup ipv6.ko 2473.00 2.0% ip_finish_output vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/sit.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 2cb6460..d770178 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1140,6 +1140,7 @@ static void ipip6_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; } static int ipip6_tunnel_init(struct net_device *dev) -- cgit v1.1 From b790e01aee74c23a5d92576177934f13aa51f718 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 27 Sep 2010 23:05:47 +0000 Subject: ip_gre: lockless xmit GRE tunnels can benefit from lockless xmits, using NETIF_F_LLTX Note: If tunnels are created with the "oseq" option, LLTX is not enabled : Even using an atomic_t o_seq, we would increase chance for packets being out of order at receiver. Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending 10000000 UDP frames via one gre tunnel (size:200 bytes per frame) Before patch : real 3m0.094s user 0m9.365s sys 47m50.103s After patch: real 0m29.756s user 0m11.097s sys 7m33.012s Last problem to solve is the contention on dst : 38660.00 21.4% __ip_route_output_key vmlinux 20786.00 11.5% dst_release vmlinux 14191.00 7.8% __xfrm_lookup vmlinux 12410.00 6.9% ip_finish_output vmlinux 4540.00 2.5% ip_push_pending_frames vmlinux 4427.00 2.4% ip_append_data vmlinux 4265.00 2.4% __alloc_skb vmlinux 4140.00 2.3% __ip_local_out vmlinux 3991.00 2.2% dev_queue_xmit vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index a1b5d5e..035db63 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1557,6 +1557,10 @@ static int ipgre_newlink(struct net *src_net, struct net_device *dev, struct nla if (!tb[IFLA_MTU]) dev->mtu = mtu; + /* Can use a lockless transmit, unless we generate output sequences */ + if (!(nt->parms.o_flags & GRE_SEQ)) + dev->features |= NETIF_F_LLTX; + err = register_netdevice(dev); if (err) goto out; -- cgit v1.1 From 153f0943382e9ae0bff7caa110a1a4656088d0d4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 00:17:17 +0000 Subject: ipip: enable lockless xmits IPIP tunnels can benefit from lockless xmits, using NETIF_F_LLTX Bench on a 16 cpus machine (dual E5540 cpus), 16 threads sending 10000000 UDP frames via one ipip tunnel (size:200 bytes per frame) Before patch : real 2m53.321s user 0m10.277s sys 46m0.597s After patch: real 0m32.063s user 0m9.237s sys 8m16.255s Last problem to solve is the contention on dst : 16118.00 28.3% __ip_route_output_key vmlinux 6135.00 10.8% dst_release vmlinux 3220.00 5.6% ip_finish_output vmlinux 2149.00 3.8% ip_route_output_flow vmlinux 1575.00 2.8% ip_append_data vmlinux 1481.00 2.6% ip_push_pending_frames vmlinux 1349.00 2.4% __xfrm_lookup vmlinux 1216.00 2.1% csum_partial_copy_generic vmlinux 1208.00 2.1% udp_sendmsg vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipip.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 9e78f11..6ad46c2 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -767,6 +767,7 @@ static void ipip_tunnel_setup(struct net_device *dev) dev->iflink = 0; dev->addr_len = 4; dev->features |= NETIF_F_NETNS_LOCAL; + dev->features |= NETIF_F_LLTX; dev->priv_flags &= ~IFF_XMIT_DST_RELEASE; } -- cgit v1.1 From 8560f2266b36adb43238f1f9fd13958dd031901c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 03:23:34 +0000 Subject: ip6tnl: percpu stats accounting Maintain per_cpu tx_bytes, tx_packets, rx_bytes, rx_packets. Other seldom used fields are kept in netdev->stats structure, possibly unsafe. This is a preliminary work to support lockless transmit path, and correct RX stats, that are already unsafe. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/ip6_tunnel.c | 93 ++++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 77 insertions(+), 16 deletions(-) (limited to 'net') diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index f6d9f68..8be3c45 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -75,7 +75,7 @@ MODULE_LICENSE("GPL"); (addr)->s6_addr32[2] ^ (addr)->s6_addr32[3]) & \ (HASH_SIZE - 1)) -static void ip6_tnl_dev_init(struct net_device *dev); +static int ip6_tnl_dev_init(struct net_device *dev); static void ip6_tnl_dev_setup(struct net_device *dev); static int ip6_tnl_net_id __read_mostly; @@ -88,6 +88,34 @@ struct ip6_tnl_net { struct ip6_tnl __rcu **tnls[2]; }; +/* often modified stats are per cpu, other are shared (netdev->stats) */ +struct pcpu_tstats { + unsigned long rx_packets; + unsigned long rx_bytes; + unsigned long tx_packets; + unsigned long tx_bytes; +}; + +static struct net_device_stats *ip6_get_stats(struct net_device *dev) +{ + struct pcpu_tstats sum = { 0 }; + int i; + + for_each_possible_cpu(i) { + const struct pcpu_tstats *tstats = per_cpu_ptr(dev->tstats, i); + + sum.rx_packets += tstats->rx_packets; + sum.rx_bytes += tstats->rx_bytes; + sum.tx_packets += tstats->tx_packets; + sum.tx_bytes += tstats->tx_bytes; + } + dev->stats.rx_packets = sum.rx_packets; + dev->stats.rx_bytes = sum.rx_bytes; + dev->stats.tx_packets = sum.tx_packets; + dev->stats.tx_bytes = sum.tx_bytes; + return &dev->stats; +} + /* * Locking : hash tables are protected by RCU and RTNL */ @@ -216,6 +244,12 @@ ip6_tnl_unlink(struct ip6_tnl_net *ip6n, struct ip6_tnl *t) } } +static void ip6_dev_free(struct net_device *dev) +{ + free_percpu(dev->tstats); + free_netdev(dev); +} + /** * ip6_tnl_create() - create a new tunnel * @p: tunnel parameters @@ -254,7 +288,9 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) t = netdev_priv(dev); t->parms = *p; - ip6_tnl_dev_init(dev); + err = ip6_tnl_dev_init(dev); + if (err < 0) + goto failed_free; if ((err = register_netdevice(dev)) < 0) goto failed_free; @@ -264,7 +300,7 @@ static struct ip6_tnl *ip6_tnl_create(struct net *net, struct ip6_tnl_parm *p) return t; failed_free: - free_netdev(dev); + ip6_dev_free(dev); failed: return NULL; } @@ -700,6 +736,8 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, if ((t = ip6_tnl_lookup(dev_net(skb->dev), &ipv6h->saddr, &ipv6h->daddr)) != NULL) { + struct pcpu_tstats *tstats; + if (t->parms.proto != ipproto && t->parms.proto != 0) { rcu_read_unlock(); goto discard; @@ -722,7 +760,11 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, skb->pkt_type = PACKET_HOST; memset(skb->cb, 0, sizeof(struct inet6_skb_parm)); - skb_tunnel_rx(skb, t->dev); + tstats = this_cpu_ptr(t->dev->tstats); + tstats->rx_packets++; + tstats->rx_bytes += skb->len; + + __skb_tunnel_rx(skb, t->dev); dscp_ecn_decapsulate(t, ipv6h, skb); @@ -935,8 +977,10 @@ static int ip6_tnl_xmit2(struct sk_buff *skb, err = ip6_local_out(skb); if (net_xmit_eval(err) == 0) { - stats->tx_bytes += pkt_len; - stats->tx_packets++; + struct pcpu_tstats *tstats = this_cpu_ptr(t->dev->tstats); + + tstats->tx_bytes += pkt_len; + tstats->tx_packets++; } else { stats->tx_errors++; stats->tx_aborted_errors++; @@ -1301,12 +1345,14 @@ ip6_tnl_change_mtu(struct net_device *dev, int new_mtu) static const struct net_device_ops ip6_tnl_netdev_ops = { - .ndo_uninit = ip6_tnl_dev_uninit, + .ndo_uninit = ip6_tnl_dev_uninit, .ndo_start_xmit = ip6_tnl_xmit, - .ndo_do_ioctl = ip6_tnl_ioctl, + .ndo_do_ioctl = ip6_tnl_ioctl, .ndo_change_mtu = ip6_tnl_change_mtu, + .ndo_get_stats = ip6_get_stats, }; + /** * ip6_tnl_dev_setup - setup virtual tunnel device * @dev: virtual device associated with tunnel @@ -1318,7 +1364,7 @@ static const struct net_device_ops ip6_tnl_netdev_ops = { static void ip6_tnl_dev_setup(struct net_device *dev) { dev->netdev_ops = &ip6_tnl_netdev_ops; - dev->destructor = free_netdev; + dev->destructor = ip6_dev_free; dev->type = ARPHRD_TUNNEL6; dev->hard_header_len = LL_MAX_HEADER + sizeof (struct ipv6hdr); @@ -1334,12 +1380,17 @@ static void ip6_tnl_dev_setup(struct net_device *dev) * @dev: virtual device associated with tunnel **/ -static inline void +static inline int ip6_tnl_dev_init_gen(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); + t->dev = dev; strcpy(t->parms.name, dev->name); + dev->tstats = alloc_percpu(struct pcpu_tstats); + if (!dev->tstats) + return -ENOMEM; + return 0; } /** @@ -1347,11 +1398,15 @@ ip6_tnl_dev_init_gen(struct net_device *dev) * @dev: virtual device associated with tunnel **/ -static void ip6_tnl_dev_init(struct net_device *dev) +static int ip6_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); - ip6_tnl_dev_init_gen(dev); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; ip6_tnl_link_config(t); + return 0; } /** @@ -1361,16 +1416,20 @@ static void ip6_tnl_dev_init(struct net_device *dev) * Return: 0 **/ -static void __net_init ip6_fb_tnl_dev_init(struct net_device *dev) +static int __net_init ip6_fb_tnl_dev_init(struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); struct net *net = dev_net(dev); struct ip6_tnl_net *ip6n = net_generic(net, ip6_tnl_net_id); + int err = ip6_tnl_dev_init_gen(dev); + + if (err) + return err; - ip6_tnl_dev_init_gen(dev); t->parms.proto = IPPROTO_IPV6; dev_hold(dev); rcu_assign_pointer(ip6n->tnls_wc[0], t); + return 0; } static struct xfrm6_tunnel ip4ip6_handler __read_mostly = { @@ -1420,7 +1479,9 @@ static int __net_init ip6_tnl_init_net(struct net *net) goto err_alloc_dev; dev_net_set(ip6n->fb_tnl_dev, net); - ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + err = ip6_fb_tnl_dev_init(ip6n->fb_tnl_dev); + if (err < 0) + goto err_register; err = register_netdev(ip6n->fb_tnl_dev); if (err < 0) @@ -1428,7 +1489,7 @@ static int __net_init ip6_tnl_init_net(struct net *net) return 0; err_register: - free_netdev(ip6n->fb_tnl_dev); + ip6_dev_free(ip6n->fb_tnl_dev); err_alloc_dev: return err; } -- cgit v1.1 From bfa5ae63b823f4ffd3483a05f60a93a4a7b7d680 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 28 Sep 2010 05:58:37 +0000 Subject: net: rename netdev rx_queue to ingress_queue There is some confusion with rx_queue name after RPS, and net drivers private rx_queue fields. I suggest to rename "struct net_device"->rx_queue to ingress_queue. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 8 ++++---- net/sched/sch_api.c | 14 +++++++------- net/sched/sch_generic.c | 8 ++++---- 3 files changed, 15 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 50dacca..a313bab 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2720,7 +2720,7 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->rx_queue; + rxq = &dev->ingress_queue; q = rxq->qdisc; if (q != &noop_qdisc) { @@ -2737,7 +2737,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->rx_queue.qdisc == &noop_qdisc) + if (skb->dev->ingress_queue.qdisc == &noop_qdisc) goto out; if (*pt_prev) { @@ -4940,7 +4940,7 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, static void netdev_init_queue_locks(struct net_device *dev) { netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->rx_queue, NULL); + __netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -5452,7 +5452,7 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->rx_queue, NULL); + netdev_init_one_queue(dev, &dev->ingress_queue, NULL); netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); } diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 6fb3d41..b802078 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -240,7 +240,7 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (q) goto out; - q = qdisc_match_from_root(dev->rx_queue.qdisc_sleeping, handle); + q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle); out: return q; } @@ -701,7 +701,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, } for (i = 0; i < num_q; i++) { - struct netdev_queue *dev_queue = &dev->rx_queue; + struct netdev_queue *dev_queue = &dev->ingress_queue; if (!ingress) dev_queue = netdev_get_tx_queue(dev, i); @@ -979,7 +979,7 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -ENOENT; q = qdisc_leaf(p, clid); } else { /* ingress */ - q = dev->rx_queue.qdisc_sleeping; + q = dev->ingress_queue.qdisc_sleeping; } } else { q = dev->qdisc; @@ -1044,7 +1044,7 @@ replay: return -ENOENT; q = qdisc_leaf(p, clid); } else { /*ingress */ - q = dev->rx_queue.qdisc_sleeping; + q = dev->ingress_queue.qdisc_sleeping; } } else { q = dev->qdisc; @@ -1124,7 +1124,7 @@ create_n_graft: if (!(n->nlmsg_flags&NLM_F_CREATE)) return -ENOENT; if (clid == TC_H_INGRESS) - q = qdisc_create(dev, &dev->rx_queue, p, + q = qdisc_create(dev, &dev->ingress_queue, p, tcm->tcm_parent, tcm->tcm_parent, tca, &err); else { @@ -1304,7 +1304,7 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) goto done; - dev_queue = &dev->rx_queue; + dev_queue = &dev->ingress_queue; if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0) goto done; @@ -1595,7 +1595,7 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) goto done; - dev_queue = &dev->rx_queue; + dev_queue = &dev->ingress_queue; if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0) goto done; diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 2aeb3a4..545278a 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -753,7 +753,7 @@ void dev_activate(struct net_device *dev) need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); - transition_one_qdisc(dev, &dev->rx_queue, NULL); + transition_one_qdisc(dev, &dev->ingress_queue, NULL); if (need_watchdog) { dev->trans_start = jiffies; @@ -812,7 +812,7 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate(struct net_device *dev) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); - dev_deactivate_queue(dev, &dev->rx_queue, &noop_qdisc); + dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc); dev_watchdog_down(dev); @@ -838,7 +838,7 @@ void dev_init_scheduler(struct net_device *dev) { dev->qdisc = &noop_qdisc; netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); - dev_init_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); + dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc); setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); } @@ -861,7 +861,7 @@ static void shutdown_scheduler_queue(struct net_device *dev, void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); - shutdown_scheduler_queue(dev, &dev->rx_queue, &noop_qdisc); + shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc); qdisc_destroy(dev->qdisc); dev->qdisc = &noop_qdisc; -- cgit v1.1 From a64de47c091e4a337fa9763315cb6f2fbf0c583b Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 28 Sep 2010 17:08:02 +0000 Subject: arp: remove unnecessary export of arp_broken_ops arp_broken_ops is only used in arp.c Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/arp.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 4083c18..d9031ad 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -161,7 +161,7 @@ static const struct neigh_ops arp_direct_ops = { .queue_xmit = dev_queue_xmit, }; -const struct neigh_ops arp_broken_ops = { +static const struct neigh_ops arp_broken_ops = { .family = AF_INET, .solicit = arp_solicit, .error_report = arp_error_report, @@ -170,7 +170,6 @@ const struct neigh_ops arp_broken_ops = { .hh_output = dev_queue_xmit, .queue_xmit = dev_queue_xmit, }; -EXPORT_SYMBOL(arp_broken_ops); struct neigh_table arp_tbl = { .family = AF_INET, -- cgit v1.1 From 1b9f409293529da4630bfc5d6d8e7d7451a6ccb5 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 28 Sep 2010 19:30:14 +0000 Subject: tcp: tcp_enter_quickack_mode can be static Function only used in tcp_input.c Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index fabc09a..eaf20e7 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -182,7 +182,7 @@ static void tcp_incr_quickack(struct sock *sk) icsk->icsk_ack.quick = min(quickacks, TCP_MAX_QUICKACKS); } -void tcp_enter_quickack_mode(struct sock *sk) +static void tcp_enter_quickack_mode(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); tcp_incr_quickack(sk); -- cgit v1.1 From 6d0722a2cec2c23db3b0855ff8bb433175a16b44 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2010 23:35:10 -0700 Subject: ip_gre: comments change HARD_TX_LOCK no longer protects tunnels from dead loops, but xmit_recursion percpu counter. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ip_gre.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 035db63..fbe2c47 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -64,13 +64,13 @@ We cannot track such dead loops during route installation, it is infeasible task. The most general solutions would be to keep skb->encapsulation counter (sort of local ttl), - and silently drop packet when it expires. It is the best + and silently drop packet when it expires. It is a good solution, but it supposes maintaing new variable in ALL skb, even if no tunneling is used. - Current solution: HARD_TX_LOCK lock breaks dead loops. - - + Current solution: xmit_recursion breaks dead loops. This is a percpu + counter, since when we enter the first ndo_xmit(), cpu migration is + forbidden. We force an exit if this counter reaches RECURSION_LIMIT 2. Networking dead loops would not kill routers, but would really kill network. IP hop limit plays role of "t->recursion" in this case, -- cgit v1.1 From e1a5964f0c32a75b17360cfc565d25aaedbff747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Wed, 29 Sep 2010 22:33:50 +0000 Subject: Phonet: restore flow control credits when sending fails MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch restores the below flow control patch submitted by Rémi Denis-Courmont, which accidentaly got lost due to Pipe controller patch on Phonet. commit 1a98214feef2221cd7c24b17cd688a5a9d85b2ea Author: Rémi Denis-Courmont Date: Mon Aug 30 12:57:03 2010 +0000 Phonet: restore flow control credits when sending fails Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller Signed-off-by: Kumar Sanghvi Acked-by: Linus Walleij Signed-off-by: David S. Miller --- net/phonet/pep.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 7bf23cf..552fb66 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -1288,6 +1288,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) { struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; + int err; #ifdef CONFIG_PHONET_PIPECTRLR struct sockaddr_pn spn = { .spn_family = AF_PHONET, @@ -1314,10 +1315,15 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; #ifdef CONFIG_PHONET_PIPECTRLR - return pn_skb_send(sk, skb, &spn); + err = pn_skb_send(sk, skb, &spn); #else - return pn_skb_send(sk, skb, &pipe_srv); + err = pn_skb_send(sk, skb, &pipe_srv); #endif + + if (err && pn_flow_safe(pn->tx_fc)) + atomic_inc(&pn->tx_credits); + return err; + } static int pep_sendmsg(struct kiocb *iocb, struct sock *sk, -- cgit v1.1 From dd28d1a0b5ecc0f5512f658b1a8fd38bc4f4c98c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2010 11:53:50 +0000 Subject: ipv4: __mkroute_output() speedup While doing stress tests with a disabled IP route cache, I found __mkroute_output() was touching three times in_device atomic refcount. Use RCU to touch it once to reduce cache line ping pongs. Before patch time to perform the test real 1m42.009s user 0m12.545s sys 25m0.726s Profile : 16109.00 26.4% ip_route_output_slow vmlinux 7434.00 12.2% dst_destroy vmlinux 3280.00 5.4% fib_rules_lookup vmlinux 3252.00 5.3% fib_semantic_match vmlinux 2622.00 4.3% fib_table_lookup vmlinux 2535.00 4.1% dst_alloc vmlinux 1750.00 2.9% _raw_read_lock vmlinux 1532.00 2.5% rt_set_nexthop vmlinux After patch real 1m36.503s user 0m12.977s sys 23m25.608s 14234.00 22.4% ip_route_output_slow vmlinux 8717.00 13.7% dst_destroy vmlinux 4052.00 6.4% fib_rules_lookup vmlinux 3951.00 6.2% fib_semantic_match vmlinux 3191.00 5.0% dst_alloc vmlinux 1764.00 2.8% fib_table_lookup vmlinux 1692.00 2.7% _raw_read_lock vmlinux 1605.00 2.5% rt_set_nexthop vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 33 +++++++++++++++------------------ 1 file changed, 15 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 98beda4..ea89500 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2358,9 +2358,8 @@ static int __mkroute_output(struct rtable **result, struct rtable *rth; struct in_device *in_dev; u32 tos = RT_FL_TOS(oldflp); - int err = 0; - if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) + if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) return -EINVAL; if (fl->fl4_dst == htonl(0xFFFFFFFF)) @@ -2373,11 +2372,12 @@ static int __mkroute_output(struct rtable **result, if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - /* get work reference to inet device */ - in_dev = in_dev_get(dev_out); - if (!in_dev) + rcu_read_lock(); + in_dev = __in_dev_get_rcu(dev_out); + if (!in_dev) { + rcu_read_unlock(); return -EINVAL; - + } if (res->type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; if (res->fi) { @@ -2385,13 +2385,13 @@ static int __mkroute_output(struct rtable **result, res->fi = NULL; } } else if (res->type == RTN_MULTICAST) { - flags |= RTCF_MULTICAST|RTCF_LOCAL; + flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, oldflp->proto)) flags &= ~RTCF_LOCAL; /* If multicast route do not exist use - default one, but do not gateway in this case. - Yes, it is hack. + * default one, but do not gateway in this case. + * Yes, it is hack. */ if (res->fi && res->prefixlen < 4) { fib_info_put(res->fi); @@ -2402,9 +2402,12 @@ static int __mkroute_output(struct rtable **result, rth = dst_alloc(&ipv4_dst_ops); if (!rth) { - err = -ENOBUFS; - goto cleanup; + rcu_read_unlock(); + return -ENOBUFS; } + in_dev_hold(in_dev); + rcu_read_unlock(); + rth->idev = in_dev; atomic_set(&rth->dst.__refcnt, 1); rth->dst.flags= DST_HOST; @@ -2425,7 +2428,6 @@ static int __mkroute_output(struct rtable **result, cache entry */ rth->dst.dev = dev_out; dev_hold(dev_out); - rth->idev = in_dev_get(dev_out); rth->rt_gateway = fl->fl4_dst; rth->rt_spec_dst= fl->fl4_src; @@ -2460,13 +2462,8 @@ static int __mkroute_output(struct rtable **result, rt_set_nexthop(rth, res, 0); rth->rt_flags = flags; - *result = rth; - cleanup: - /* release work reference to inet device */ - in_dev_put(in_dev); - - return err; + return 0; } static int ip_mkroute_output(struct rtable **rp, -- cgit v1.1 From 82efee1499a27c06f5afb11b07db384fdb3f7004 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 03:31:56 +0000 Subject: ipv4: introduce __ip_dev_find() ip_dev_find(net, addr) finds a device given an IPv4 source address and takes a reference on it. Introduce __ip_dev_find(), taking a third argument, to optionally take the device reference. Callers not asking the reference to be taken should be in an rcu_read_lock() protected section. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 981f3c5..4a69a95 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -147,34 +147,40 @@ static void fib_flush(struct net *net) rt_cache_flush(net, -1); } -/* - * Find the first device with a given source address. +/** + * __ip_dev_find - find the first device with a given source address. + * @net: the net namespace + * @addr: the source address + * @devref: if true, take a reference on the found device + * + * If a caller uses devref=false, it should be protected by RCU */ - -struct net_device * ip_dev_find(struct net *net, __be32 addr) +struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } }, - .flags = FLOWI_FLAG_MATCH_ANY_IIF }; - struct fib_result res; + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = addr + } + }, + .flags = FLOWI_FLAG_MATCH_ANY_IIF + }; + struct fib_result res = { 0 }; struct net_device *dev = NULL; -#ifdef CONFIG_IP_MULTIPLE_TABLES - res.r = NULL; -#endif - if (fib_lookup(net, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) goto out; dev = FIB_RES_DEV(res); - if (dev) + if (dev && devref) dev_hold(dev); out: fib_res_put(&res); return dev; } -EXPORT_SYMBOL(ip_dev_find); +EXPORT_SYMBOL(__ip_dev_find); /* * Find address type as if only "dev" was present in the system. If -- cgit v1.1 From 0197aa38df2ce550c0bfc96194b07ce6b68af814 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 03:33:58 +0000 Subject: ipv4: rcu conversion in ip_route_output_slow ip_route_output_slow() is enclosed in an rcu_read_lock() protected section, so that no references are taken/released on device, thanks to __ip_dev_find() & dev_get_by_index_rcu() Tested with ip route cache disabled, and a stress test : Before patch: elapsed time : real 1m38.347s user 0m11.909s sys 23m51.501s Profile: 13788.00 22.7% ip_route_output_slow [kernel] 7875.00 13.0% dst_destroy [kernel] 3925.00 6.5% fib_semantic_match [kernel] 3144.00 5.2% fib_rules_lookup [kernel] 3061.00 5.0% dst_alloc [kernel] 2276.00 3.7% rt_set_nexthop [kernel] 1762.00 2.9% fib_table_lookup [kernel] 1538.00 2.5% _raw_read_lock [kernel] 1358.00 2.2% ip_output [kernel] After patch: real 1m28.808s user 0m13.245s sys 20m37.293s 10950.00 17.2% ip_route_output_slow [kernel] 10726.00 16.9% dst_destroy [kernel] 5170.00 8.1% fib_semantic_match [kernel] 3937.00 6.2% dst_alloc [kernel] 3635.00 5.7% rt_set_nexthop [kernel] 2900.00 4.6% fib_rules_lookup [kernel] 2240.00 3.5% fib_table_lookup [kernel] 1427.00 2.2% _raw_read_lock [kernel] 1157.00 1.8% kmem_cache_alloc [kernel] Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 38 ++++++++++++-------------------------- 1 file changed, 12 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ea89500..a61acea 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2487,6 +2487,7 @@ static int ip_mkroute_output(struct rtable **rp, /* * Major route resolver routine. + * called with rcu_read_lock(); */ static int ip_route_output_slow(struct net *net, struct rtable **rp, @@ -2505,7 +2506,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, .iif = net->loopback_dev->ifindex, .oif = oldflp->oif }; struct fib_result res; - unsigned flags = 0; + unsigned int flags = 0; struct net_device *dev_out = NULL; int free_res = 0; int err; @@ -2535,7 +2536,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, (ipv4_is_multicast(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); + dev_out = __ip_dev_find(net, oldflp->fl4_src, false); if (dev_out == NULL) goto out; @@ -2560,26 +2561,21 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (!(oldflp->flags & FLOWI_FLAG_ANYSRC)) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(net, oldflp->fl4_src); - if (dev_out == NULL) + if (!__ip_dev_find(net, oldflp->fl4_src, false)) goto out; - dev_put(dev_out); - dev_out = NULL; } } if (oldflp->oif) { - dev_out = dev_get_by_index(net, oldflp->oif); + dev_out = dev_get_by_index_rcu(net, oldflp->oif); err = -ENODEV; if (dev_out == NULL) goto out; /* RACE: Check return value of inet_select_addr instead. */ - if (rcu_dereference_raw(dev_out->ip_ptr) == NULL) { - dev_put(dev_out); + if (rcu_dereference(dev_out->ip_ptr) == NULL) goto out; /* Wrong error code */ - } if (ipv4_is_local_multicast(oldflp->fl4_dst) || oldflp->fl4_dst == htonl(0xFFFFFFFF)) { @@ -2602,10 +2598,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); - if (dev_out) - dev_put(dev_out); dev_out = net->loopback_dev; - dev_hold(dev_out); fl.oif = net->loopback_dev->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; @@ -2639,8 +2632,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, res.type = RTN_UNICAST; goto make_route; } - if (dev_out) - dev_put(dev_out); err = -ENETUNREACH; goto out; } @@ -2649,10 +2640,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; - if (dev_out) - dev_put(dev_out); dev_out = net->loopback_dev; - dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) fib_info_put(res.fi); @@ -2672,28 +2660,23 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (!fl.fl4_src) fl.fl4_src = FIB_RES_PREFSRC(res); - if (dev_out) - dev_put(dev_out); dev_out = FIB_RES_DEV(res); - dev_hold(dev_out); fl.oif = dev_out->ifindex; make_route: err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); - if (free_res) fib_res_put(&res); - if (dev_out) - dev_put(dev_out); out: return err; } int __ip_route_output_key(struct net *net, struct rtable **rp, const struct flowi *flp) { - unsigned hash; + unsigned int hash; + int res; struct rtable *rth; if (!rt_caching(net)) @@ -2724,7 +2707,10 @@ int __ip_route_output_key(struct net *net, struct rtable **rp, rcu_read_unlock_bh(); slow_output: - return ip_route_output_slow(net, rp, flp); + rcu_read_lock(); + res = ip_route_output_slow(net, rp, flp); + rcu_read_unlock(); + return res; } EXPORT_SYMBOL_GPL(__ip_route_output_key); -- cgit v1.1 From ddcb4541e917780ef7ccc68dd8df18ca0bc055d0 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 1 Oct 2010 13:58:00 +0000 Subject: gre: protocol table can be static This table is only used in gre.c Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/gre.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/gre.c b/net/ipv4/gre.c index b546736d..caea688 100644 --- a/net/ipv4/gre.c +++ b/net/ipv4/gre.c @@ -22,7 +22,7 @@ #include -const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; +static const struct gre_protocol *gre_proto[GREPROTO_MAX] __read_mostly; static DEFINE_SPINLOCK(gre_proto_lock); int gre_add_protocol(const struct gre_protocol *proto, u8 version) -- cgit v1.1 From 55747a0a73ea74a25fcebb0731e8d3f13fe8c09d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Oct 2010 16:14:55 +0000 Subject: ipmr: __pim_rcv() is called under rcu_read_lock No need to get a reference on reg_dev and release it, we are in a rcu_read_lock() protected section. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 10b24c02..1a92ebd 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1805,6 +1805,7 @@ dont_forward: } #ifdef CONFIG_IP_PIMSM +/* called with rcu_read_lock() */ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, unsigned int pimlen) { @@ -1826,26 +1827,23 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, read_lock(&mrt_lock); if (mrt->mroute_reg_vif_num >= 0) reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev; - if (reg_dev) - dev_hold(reg_dev); read_unlock(&mrt_lock); if (reg_dev == NULL) return 1; skb->mac_header = skb->network_header; - skb_pull(skb, (u8*)encap - skb->data); + skb_pull(skb, (u8 *)encap - skb->data); skb_reset_network_header(skb); skb->protocol = htons(ETH_P_IP); - skb->ip_summed = 0; + skb->ip_summed = CHECKSUM_NONE; skb->pkt_type = PACKET_HOST; skb_tunnel_rx(skb, reg_dev); netif_rx(skb); - dev_put(reg_dev); - return 0; + return NET_RX_SUCCESS; } #endif -- cgit v1.1 From 4c9687098f245601e9d94178715ee03afbcc6f80 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Oct 2010 16:15:01 +0000 Subject: ipmr: RCU conversion of mroute_sk Use RCU and RTNL to protect (struct mr_table)->mroute_sk Readers use RCU, writers use RTNL. ip_ra_control() already use an RCU grace period before ip_ra_destroy_rcu(), so we dont need synchronize_rcu() in mrtsock_destruct() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 91 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 49 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 1a92ebd..e2db2ea 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -75,7 +75,7 @@ struct mr_table { struct net *net; #endif u32 id; - struct sock *mroute_sk; + struct sock __rcu *mroute_sk; struct timer_list ipmr_expire_timer; struct list_head mfc_unres_queue; struct list_head mfc_cache_array[MFC_LINES]; @@ -867,6 +867,7 @@ static int ipmr_cache_report(struct mr_table *mrt, const int ihl = ip_hdrlen(pkt); struct igmphdr *igmp; struct igmpmsg *msg; + struct sock *mroute_sk; int ret; #ifdef CONFIG_IP_PIMSM @@ -925,7 +926,10 @@ static int ipmr_cache_report(struct mr_table *mrt, skb->transport_header = skb->network_header; } - if (mrt->mroute_sk == NULL) { + rcu_read_lock(); + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk == NULL) { + rcu_read_unlock(); kfree_skb(skb); return -EINVAL; } @@ -933,7 +937,8 @@ static int ipmr_cache_report(struct mr_table *mrt, /* * Deliver to mrouted */ - ret = sock_queue_rcv_skb(mrt->mroute_sk, skb); + ret = sock_queue_rcv_skb(mroute_sk, skb); + rcu_read_unlock(); if (ret < 0) { if (net_ratelimit()) printk(KERN_WARNING "mroute: pending queue full, dropping entries.\n"); @@ -1164,6 +1169,9 @@ static void mroute_clean_tables(struct mr_table *mrt) } } +/* called from ip_ra_control(), before an RCU grace period, + * we dont need to call synchronize_rcu() here + */ static void mrtsock_destruct(struct sock *sk) { struct net *net = sock_net(sk); @@ -1171,13 +1179,9 @@ static void mrtsock_destruct(struct sock *sk) rtnl_lock(); ipmr_for_each_table(mrt, net) { - if (sk == mrt->mroute_sk) { + if (sk == rtnl_dereference(mrt->mroute_sk)) { IPV4_DEVCONF_ALL(net, MC_FORWARDING)--; - - write_lock_bh(&mrt_lock); - mrt->mroute_sk = NULL; - write_unlock_bh(&mrt_lock); - + rcu_assign_pointer(mrt->mroute_sk, NULL); mroute_clean_tables(mrt); } } @@ -1204,7 +1208,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOENT; if (optname != MRT_INIT) { - if (sk != mrt->mroute_sk && !capable(CAP_NET_ADMIN)) + if (sk != rcu_dereference_raw(mrt->mroute_sk) && + !capable(CAP_NET_ADMIN)) return -EACCES; } @@ -1217,23 +1222,20 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENOPROTOOPT; rtnl_lock(); - if (mrt->mroute_sk) { + if (rtnl_dereference(mrt->mroute_sk)) { rtnl_unlock(); return -EADDRINUSE; } ret = ip_ra_control(sk, 1, mrtsock_destruct); if (ret == 0) { - write_lock_bh(&mrt_lock); - mrt->mroute_sk = sk; - write_unlock_bh(&mrt_lock); - + rcu_assign_pointer(mrt->mroute_sk, sk); IPV4_DEVCONF_ALL(net, MC_FORWARDING)++; } rtnl_unlock(); return ret; case MRT_DONE: - if (sk != mrt->mroute_sk) + if (sk != rcu_dereference_raw(mrt->mroute_sk)) return -EACCES; return ip_ra_control(sk, 0, NULL); case MRT_ADD_VIF: @@ -1246,7 +1248,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -ENFILE; rtnl_lock(); if (optname == MRT_ADD_VIF) { - ret = vif_add(net, mrt, &vif, sk == mrt->mroute_sk); + ret = vif_add(net, mrt, &vif, + sk == rtnl_dereference(mrt->mroute_sk)); } else { ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL); } @@ -1267,7 +1270,8 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi if (optname == MRT_DEL_MFC) ret = ipmr_mfc_delete(mrt, &mfc); else - ret = ipmr_mfc_add(net, mrt, &mfc, sk == mrt->mroute_sk); + ret = ipmr_mfc_add(net, mrt, &mfc, + sk == rtnl_dereference(mrt->mroute_sk)); rtnl_unlock(); return ret; /* @@ -1309,14 +1313,16 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi return -EINVAL; if (get_user(v, (u32 __user *)optval)) return -EFAULT; - if (sk == mrt->mroute_sk) - return -EBUSY; rtnl_lock(); ret = 0; - if (!ipmr_new_table(net, v)) - ret = -ENOMEM; - raw_sk(sk)->ipmr_table = v; + if (sk == rtnl_dereference(mrt->mroute_sk)) { + ret = -EBUSY; + } else { + if (!ipmr_new_table(net, v)) + ret = -ENOMEM; + raw_sk(sk)->ipmr_table = v; + } rtnl_unlock(); return ret; } @@ -1713,6 +1719,7 @@ dont_forward: /* * Multicast packets for forwarding arrive here + * Called with rcu_read_lock(); */ int ip_mr_input(struct sk_buff *skb) @@ -1726,7 +1733,7 @@ int ip_mr_input(struct sk_buff *skb) /* Packet is looped back after forward, it should not be forwarded second time, but still can be delivered locally. */ - if (IPCB(skb)->flags&IPSKB_FORWARDED) + if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; err = ipmr_fib_lookup(net, &skb_rtable(skb)->fl, &mrt); @@ -1736,24 +1743,24 @@ int ip_mr_input(struct sk_buff *skb) } if (!local) { - if (IPCB(skb)->opt.router_alert) { - if (ip_call_ra_chain(skb)) - return 0; - } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP){ - /* IGMPv1 (and broken IGMPv2 implementations sort of - Cisco IOS <= 11.2(8)) do not put router alert - option to IGMP packets destined to routable - groups. It is very bad, because it means - that we can forward NO IGMP messages. - */ - read_lock(&mrt_lock); - if (mrt->mroute_sk) { - nf_reset(skb); - raw_rcv(mrt->mroute_sk, skb); - read_unlock(&mrt_lock); - return 0; - } - read_unlock(&mrt_lock); + if (IPCB(skb)->opt.router_alert) { + if (ip_call_ra_chain(skb)) + return 0; + } else if (ip_hdr(skb)->protocol == IPPROTO_IGMP) { + /* IGMPv1 (and broken IGMPv2 implementations sort of + * Cisco IOS <= 11.2(8)) do not put router alert + * option to IGMP packets destined to routable + * groups. It is very bad, because it means + * that we can forward NO IGMP messages. + */ + struct sock *mroute_sk; + + mroute_sk = rcu_dereference(mrt->mroute_sk); + if (mroute_sk) { + nf_reset(skb); + raw_rcv(mroute_sk, skb); + return 0; + } } } -- cgit v1.1 From a8c9486b816f74d4645144db9e8fa2f711c1fc4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Oct 2010 16:15:08 +0000 Subject: ipmr: RCU protection for mfc_cache_array Use RCU & RTNL protection for mfc_cache_array[] ipmr_cache_find() is called under rcu_read_lock(); Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 87 +++++++++++++++++++++++++++++++-------------------------- 1 file changed, 47 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index e2db2ea..cbb6dabe 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -577,11 +577,18 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, return 0; } -static inline void ipmr_cache_free(struct mfc_cache *c) +static void ipmr_cache_free_rcu(struct rcu_head *head) { + struct mfc_cache *c = container_of(head, struct mfc_cache, rcu); + kmem_cache_free(mrt_cachep, c); } +static inline void ipmr_cache_free(struct mfc_cache *c) +{ + call_rcu(&c->rcu, ipmr_cache_free_rcu); +} + /* Destroy an unresolved cache entry, killing queued skbs and reporting error to netlink readers. */ @@ -781,6 +788,7 @@ static int vif_add(struct net *net, struct mr_table *mrt, return 0; } +/* called with rcu_read_lock() */ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, __be32 origin, __be32 mcastgrp) @@ -788,7 +796,7 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, int line = MFC_HASH(mcastgrp, origin); struct mfc_cache *c; - list_for_each_entry(c, &mrt->mfc_cache_array[line], list) { + list_for_each_entry_rcu(c, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == origin && c->mfc_mcastgrp == mcastgrp) return c; } @@ -801,19 +809,20 @@ static struct mfc_cache *ipmr_cache_find(struct mr_table *mrt, static struct mfc_cache *ipmr_cache_alloc(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL); - if (c == NULL) - return NULL; - c->mfc_un.res.minvif = MAXVIFS; + + if (c) + c->mfc_un.res.minvif = MAXVIFS; return c; } static struct mfc_cache *ipmr_cache_alloc_unres(void) { struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_ATOMIC); - if (c == NULL) - return NULL; - skb_queue_head_init(&c->mfc_un.unres.unresolved); - c->mfc_un.unres.expires = jiffies + 10*HZ; + + if (c) { + skb_queue_head_init(&c->mfc_un.unres.unresolved); + c->mfc_un.unres.expires = jiffies + 10*HZ; + } return c; } @@ -1040,9 +1049,7 @@ static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc) list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[line], list) { if (c->mfc_origin == mfc->mfcc_origin.s_addr && c->mfc_mcastgrp == mfc->mfcc_mcastgrp.s_addr) { - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); + list_del_rcu(&c->list); ipmr_cache_free(c); return 0; @@ -1095,9 +1102,7 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt, if (!mrtsock) c->mfc_flags |= MFC_STATIC; - write_lock_bh(&mrt_lock); - list_add(&c->list, &mrt->mfc_cache_array[line]); - write_unlock_bh(&mrt_lock); + list_add_rcu(&c->list, &mrt->mfc_cache_array[line]); /* * Check to see if we resolved a queued list. If so we @@ -1149,12 +1154,9 @@ static void mroute_clean_tables(struct mr_table *mrt) */ for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { - if (c->mfc_flags&MFC_STATIC) + if (c->mfc_flags & MFC_STATIC) continue; - write_lock_bh(&mrt_lock); - list_del(&c->list); - write_unlock_bh(&mrt_lock); - + list_del_rcu(&c->list); ipmr_cache_free(c); } } @@ -1422,19 +1424,19 @@ int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg) if (copy_from_user(&sr, arg, sizeof(sr))) return -EFAULT; - read_lock(&mrt_lock); + rcu_read_lock(); c = ipmr_cache_find(mrt, sr.src.s_addr, sr.grp.s_addr); if (c) { sr.pktcnt = c->mfc_un.res.pkt; sr.bytecnt = c->mfc_un.res.bytes; sr.wrong_if = c->mfc_un.res.wrong_if; - read_unlock(&mrt_lock); + rcu_read_unlock(); if (copy_to_user(arg, &sr, sizeof(sr))) return -EFAULT; return 0; } - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EADDRNOTAVAIL; default: return -ENOIOCTLCMD; @@ -1764,7 +1766,7 @@ int ip_mr_input(struct sk_buff *skb) } } - read_lock(&mrt_lock); + /* already under rcu_read_lock() */ cache = ipmr_cache_find(mrt, ip_hdr(skb)->saddr, ip_hdr(skb)->daddr); /* @@ -1776,13 +1778,12 @@ int ip_mr_input(struct sk_buff *skb) if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); ip_local_deliver(skb); - if (skb2 == NULL) { - read_unlock(&mrt_lock); + if (skb2 == NULL) return -ENOBUFS; - } skb = skb2; } + read_lock(&mrt_lock); vif = ipmr_find_vif(mrt, skb->dev); if (vif >= 0) { int err2 = ipmr_cache_unresolved(mrt, vif, skb); @@ -1795,8 +1796,8 @@ int ip_mr_input(struct sk_buff *skb) return -ENODEV; } + read_lock(&mrt_lock); ip_mr_forward(net, mrt, skb, cache, local); - read_unlock(&mrt_lock); if (local) @@ -1963,7 +1964,7 @@ int ipmr_get_route(struct net *net, if (mrt == NULL) return -ENOENT; - read_lock(&mrt_lock); + rcu_read_lock(); cache = ipmr_cache_find(mrt, rt->rt_src, rt->rt_dst); if (cache == NULL) { @@ -1973,18 +1974,21 @@ int ipmr_get_route(struct net *net, int vif; if (nowait) { - read_unlock(&mrt_lock); + rcu_read_unlock(); return -EAGAIN; } dev = skb->dev; + read_lock(&mrt_lock); if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENODEV; } skb2 = skb_clone(skb, GFP_ATOMIC); if (!skb2) { read_unlock(&mrt_lock); + rcu_read_unlock(); return -ENOMEM; } @@ -1997,13 +2001,16 @@ int ipmr_get_route(struct net *net, iph->version = 0; err = ipmr_cache_unresolved(mrt, vif, skb2); read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } - if (!nowait && (rtm->rtm_flags&RTM_F_NOTIFY)) + read_lock(&mrt_lock); + if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY)) cache->mfc_flags |= MFC_NOTIFY; err = __ipmr_fill_mroute(mrt, skb, cache, rtm); read_unlock(&mrt_lock); + rcu_read_unlock(); return err; } @@ -2055,14 +2062,14 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) s_h = cb->args[1]; s_e = cb->args[2]; - read_lock(&mrt_lock); + rcu_read_lock(); ipmr_for_each_table(mrt, net) { if (t < s_t) goto next_table; if (t > s_t) s_h = 0; for (h = s_h; h < MFC_LINES; h++) { - list_for_each_entry(mfc, &mrt->mfc_cache_array[h], list) { + list_for_each_entry_rcu(mfc, &mrt->mfc_cache_array[h], list) { if (e < s_e) goto next_entry; if (ipmr_fill_mroute(mrt, skb, @@ -2080,7 +2087,7 @@ next_table: t++; } done: - read_unlock(&mrt_lock); + rcu_read_unlock(); cb->args[2] = e; cb->args[1] = h; @@ -2213,14 +2220,14 @@ static struct mfc_cache *ipmr_mfc_seq_idx(struct net *net, struct mr_table *mrt = it->mrt; struct mfc_cache *mfc; - read_lock(&mrt_lock); + rcu_read_lock(); for (it->ct = 0; it->ct < MFC_LINES; it->ct++) { it->cache = &mrt->mfc_cache_array[it->ct]; - list_for_each_entry(mfc, it->cache, list) + list_for_each_entry_rcu(mfc, it->cache, list) if (pos-- == 0) return mfc; } - read_unlock(&mrt_lock); + rcu_read_unlock(); spin_lock_bh(&mfc_unres_lock); it->cache = &mrt->mfc_unres_queue; @@ -2279,7 +2286,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) } /* exhausted cache_array, show unresolved */ - read_unlock(&mrt_lock); + rcu_read_unlock(); it->cache = &mrt->mfc_unres_queue; it->ct = 0; @@ -2302,7 +2309,7 @@ static void ipmr_mfc_seq_stop(struct seq_file *seq, void *v) if (it->cache == &mrt->mfc_unres_queue) spin_unlock_bh(&mfc_unres_lock); else if (it->cache == &mrt->mfc_cache_array[it->ct]) - read_unlock(&mrt_lock); + rcu_read_unlock(); } static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) @@ -2426,7 +2433,7 @@ int __init ip_mr_init(void) mrt_cachep = kmem_cache_create("ip_mrt_cache", sizeof(struct mfc_cache), - 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, + 0, SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); if (!mrt_cachep) return -ENOMEM; -- cgit v1.1 From a8cb16dd9cb571c45bb479a1e4721ce11220a216 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 1 Oct 2010 16:15:29 +0000 Subject: ipmr: cleanups Various code style cleanups Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/ipmr.c | 238 +++++++++++++++++++++++++++++--------------------------- 1 file changed, 124 insertions(+), 114 deletions(-) (limited to 'net') diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index cbb6dabe..86dd569 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -98,7 +98,7 @@ struct ipmr_result { }; /* Big lock, protecting vif table, mrt cache and mroute socket state. - Note that the changes are semaphored via rtnl_lock. + * Note that the changes are semaphored via rtnl_lock. */ static DEFINE_RWLOCK(mrt_lock); @@ -113,11 +113,11 @@ static DEFINE_RWLOCK(mrt_lock); static DEFINE_SPINLOCK(mfc_unres_lock); /* We return to original Alan's scheme. Hash table of resolved - entries is changed only in process context and protected - with weak lock mrt_lock. Queue of unresolved entries is protected - with strong spinlock mfc_unres_lock. - - In this case data path is free of exclusive locks at all. + * entries is changed only in process context and protected + * with weak lock mrt_lock. Queue of unresolved entries is protected + * with strong spinlock mfc_unres_lock. + * + * In this case data path is free of exclusive locks at all. */ static struct kmem_cache *mrt_cachep __read_mostly; @@ -396,9 +396,9 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v) set_fs(KERNEL_DS); err = ops->ndo_do_ioctl(dev, &ifr, SIOCADDTUNNEL); set_fs(oldfs); - } else + } else { err = -EOPNOTSUPP; - + } dev = NULL; if (err == 0 && @@ -495,7 +495,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt) dev->iflink = 0; rcu_read_lock(); - if ((in_dev = __in_dev_get_rcu(dev)) == NULL) { + in_dev = __in_dev_get_rcu(dev); + if (!in_dev) { rcu_read_unlock(); goto failure; } @@ -552,9 +553,10 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, mrt->mroute_reg_vif_num = -1; #endif - if (vifi+1 == mrt->maxvif) { + if (vifi + 1 == mrt->maxvif) { int tmp; - for (tmp=vifi-1; tmp>=0; tmp--) { + + for (tmp = vifi - 1; tmp >= 0; tmp--) { if (VIF_EXISTS(mrt, tmp)) break; } @@ -565,12 +567,13 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify, dev_set_allmulti(dev, -1); - if ((in_dev = __in_dev_get_rtnl(dev)) != NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (in_dev) { IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)--; ip_rt_multicast_event(in_dev); } - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER) && !notify) + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER) && !notify) unregister_netdevice_queue(dev, head); dev_put(dev); @@ -590,7 +593,7 @@ static inline void ipmr_cache_free(struct mfc_cache *c) } /* Destroy an unresolved cache entry, killing queued skbs - and reporting error to netlink readers. + * and reporting error to netlink readers. */ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) @@ -612,8 +615,9 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c) memset(&e->msg, 0, sizeof(e->msg)); rtnl_unicast(skb, net, NETLINK_CB(skb).pid); - } else + } else { kfree_skb(skb); + } } ipmr_cache_free(c); @@ -735,9 +739,9 @@ static int vif_add(struct net *net, struct mr_table *mrt, dev_put(dev); return -EADDRNOTAVAIL; } - } else + } else { dev = ip_dev_find(net, vifc->vifc_lcl_addr.s_addr); - + } if (!dev) return -EADDRNOTAVAIL; err = dev_set_allmulti(dev, 1); @@ -750,16 +754,16 @@ static int vif_add(struct net *net, struct mr_table *mrt, return -EINVAL; } - if ((in_dev = __in_dev_get_rtnl(dev)) == NULL) { + in_dev = __in_dev_get_rtnl(dev); + if (!in_dev) { dev_put(dev); return -EADDRNOTAVAIL; } IPV4_DEVCONF(in_dev->cnf, MC_FORWARDING)++; ip_rt_multicast_event(in_dev); - /* - * Fill in the VIF structures - */ + /* Fill in the VIF structures */ + v->rate_limit = vifc->vifc_rate_limit; v->local = vifc->vifc_lcl_addr.s_addr; v->remote = vifc->vifc_rmt_addr.s_addr; @@ -772,14 +776,14 @@ static int vif_add(struct net *net, struct mr_table *mrt, v->pkt_in = 0; v->pkt_out = 0; v->link = dev->ifindex; - if (v->flags&(VIFF_TUNNEL|VIFF_REGISTER)) + if (v->flags & (VIFF_TUNNEL | VIFF_REGISTER)) v->link = dev->iflink; /* And finish update writing critical data */ write_lock_bh(&mrt_lock); v->dev = dev; #ifdef CONFIG_IP_PIMSM - if (v->flags&VIFF_REGISTER) + if (v->flags & VIFF_REGISTER) mrt->mroute_reg_vif_num = vifi; #endif if (vifi+1 > mrt->maxvif) @@ -836,17 +840,15 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, struct sk_buff *skb; struct nlmsgerr *e; - /* - * Play the pending entries through our router - */ + /* Play the pending entries through our router */ while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) { if (ip_hdr(skb)->version == 0) { struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr)); if (__ipmr_fill_mroute(mrt, skb, c, NLMSG_DATA(nlh)) > 0) { - nlh->nlmsg_len = (skb_tail_pointer(skb) - - (u8 *)nlh); + nlh->nlmsg_len = skb_tail_pointer(skb) - + (u8 *)nlh; } else { nlh->nlmsg_type = NLMSG_ERROR; nlh->nlmsg_len = NLMSG_LENGTH(sizeof(struct nlmsgerr)); @@ -857,8 +859,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, } rtnl_unicast(skb, net, NETLINK_CB(skb).pid); - } else + } else { ip_mr_forward(net, mrt, skb, c, 0); + } } } @@ -892,9 +895,9 @@ static int ipmr_cache_report(struct mr_table *mrt, #ifdef CONFIG_IP_PIMSM if (assert == IGMPMSG_WHOLEPKT) { /* Ugly, but we have no choice with this interface. - Duplicate old header, fix ihl, length etc. - And all this only to mangle msg->im_msgtype and - to set msg->im_mbz to "mbz" :-) + * Duplicate old header, fix ihl, length etc. + * And all this only to mangle msg->im_msgtype and + * to set msg->im_mbz to "mbz" :-) */ skb_push(skb, sizeof(struct iphdr)); skb_reset_network_header(skb); @@ -911,27 +914,23 @@ static int ipmr_cache_report(struct mr_table *mrt, #endif { - /* - * Copy the IP header - */ + /* Copy the IP header */ skb->network_header = skb->tail; skb_put(skb, ihl); skb_copy_to_linear_data(skb, pkt->data, ihl); - ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ + ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */ msg = (struct igmpmsg *)skb_network_header(skb); msg->im_vif = vifi; skb_dst_set(skb, dst_clone(skb_dst(pkt))); - /* - * Add our header - */ + /* Add our header */ - igmp=(struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); + igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr)); igmp->type = msg->im_msgtype = assert; - igmp->code = 0; - ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ + igmp->code = 0; + ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */ skb->transport_header = skb->network_header; } @@ -943,9 +942,8 @@ static int ipmr_cache_report(struct mr_table *mrt, return -EINVAL; } - /* - * Deliver to mrouted - */ + /* Deliver to mrouted */ + ret = sock_queue_rcv_skb(mroute_sk, skb); rcu_read_unlock(); if (ret < 0) { @@ -979,9 +977,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) } if (!found) { - /* - * Create a new entry if allowable - */ + /* Create a new entry if allowable */ if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 || (c = ipmr_cache_alloc_unres()) == NULL) { @@ -991,16 +987,14 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) return -ENOBUFS; } - /* - * Fill in the new cache entry - */ + /* Fill in the new cache entry */ + c->mfc_parent = -1; c->mfc_origin = iph->saddr; c->mfc_mcastgrp = iph->daddr; - /* - * Reflect first query at mrouted. - */ + /* Reflect first query at mrouted. */ + err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE); if (err < 0) { /* If the report failed throw the cache entry @@ -1020,10 +1014,9 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb) mod_timer(&mrt->ipmr_expire_timer, c->mfc_un.unres.expires); } - /* - * See if we can append the packet - */ - if (c->mfc_un.unres.unresolved.qlen>3) { + /* See if we can append the packet */ + + if (c->mfc_un.unres.unresolved.qlen > 3) { kfree_skb(skb); err = -ENOBUFS; } else { @@ -1140,18 +1133,16 @@ static void mroute_clean_tables(struct mr_table *mrt) LIST_HEAD(list); struct mfc_cache *c, *next; - /* - * Shut down all active vif entries - */ + /* Shut down all active vif entries */ + for (i = 0; i < mrt->maxvif; i++) { - if (!(mrt->vif_table[i].flags&VIFF_STATIC)) + if (!(mrt->vif_table[i].flags & VIFF_STATIC)) vif_delete(mrt, i, 0, &list); } unregister_netdevice_many(&list); - /* - * Wipe the cache - */ + /* Wipe the cache */ + for (i = 0; i < MFC_LINES; i++) { list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) { if (c->mfc_flags & MFC_STATIC) @@ -1282,7 +1273,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi case MRT_ASSERT: { int v; - if (get_user(v,(int __user *)optval)) + if (get_user(v, (int __user *)optval)) return -EFAULT; mrt->mroute_do_assert = (v) ? 1 : 0; return 0; @@ -1292,7 +1283,7 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi { int v; - if (get_user(v,(int __user *)optval)) + if (get_user(v, (int __user *)optval)) return -EFAULT; v = (v) ? 1 : 0; @@ -1355,9 +1346,9 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int if (optname != MRT_VERSION && #ifdef CONFIG_IP_PIMSM - optname!=MRT_PIM && + optname != MRT_PIM && #endif - optname!=MRT_ASSERT) + optname != MRT_ASSERT) return -ENOPROTOOPT; if (get_user(olr, optlen)) @@ -1473,7 +1464,7 @@ static struct notifier_block ip_mr_notifier = { }; /* - * Encapsulate a packet by attaching a valid IPIP header to it. + * Encapsulate a packet by attaching a valid IPIP header to it. * This avoids tunnel drivers and other mess and gives us the speed so * important for multicast video. */ @@ -1488,7 +1479,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) skb_reset_network_header(skb); iph = ip_hdr(skb); - iph->version = 4; + iph->version = 4; iph->tos = old_iph->tos; iph->ttl = old_iph->ttl; iph->frag_off = 0; @@ -1506,7 +1497,7 @@ static void ip_encap(struct sk_buff *skb, __be32 saddr, __be32 daddr) static inline int ipmr_forward_finish(struct sk_buff *skb) { - struct ip_options * opt = &(IPCB(skb)->opt); + struct ip_options *opt = &(IPCB(skb)->opt); IP_INC_STATS_BH(dev_net(skb_dst(skb)->dev), IPSTATS_MIB_OUTFORWDATAGRAMS); @@ -1543,22 +1534,34 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, } #endif - if (vif->flags&VIFF_TUNNEL) { - struct flowi fl = { .oif = vif->link, - .nl_u = { .ip4_u = - { .daddr = vif->remote, - .saddr = vif->local, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + if (vif->flags & VIFF_TUNNEL) { + struct flowi fl = { + .oif = vif->link, + .nl_u = { + .ip4_u = { + .daddr = vif->remote, + .saddr = vif->local, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(net, &rt, &fl)) goto out_free; encap = sizeof(struct iphdr); } else { - struct flowi fl = { .oif = vif->link, - .nl_u = { .ip4_u = - { .daddr = iph->daddr, - .tos = RT_TOS(iph->tos) } }, - .proto = IPPROTO_IPIP }; + struct flowi fl = { + .oif = vif->link, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .tos = RT_TOS(iph->tos) + } + }, + .proto = IPPROTO_IPIP + }; + if (ip_route_output_key(net, &rt, &fl)) goto out_free; } @@ -1567,8 +1570,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, if (skb->len+encap > dst_mtu(&rt->dst) && (ntohs(iph->frag_off) & IP_DF)) { /* Do not fragment multicasts. Alas, IPv4 does not - allow to send ICMP, so that packets will disappear - to blackhole. + * allow to send ICMP, so that packets will disappear + * to blackhole. */ IP_INC_STATS_BH(dev_net(dev), IPSTATS_MIB_FRAGFAILS); @@ -1591,7 +1594,8 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt, ip_decrease_ttl(ip_hdr(skb)); /* FIXME: forward and output firewalls used to be called here. - * What do we do with netfilter? -- RR */ + * What do we do with netfilter? -- RR + */ if (vif->flags & VIFF_TUNNEL) { ip_encap(skb, vif->local, vif->remote); /* FIXME: extra output firewall step used to be here. --RR */ @@ -1652,15 +1656,15 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (skb_rtable(skb)->fl.iif == 0) { /* It is our own packet, looped back. - Very complicated situation... - - The best workaround until routing daemons will be - fixed is not to redistribute packet, if it was - send through wrong interface. It means, that - multicast applications WILL NOT work for - (S,G), which have default multicast route pointing - to wrong oif. In any case, it is not a good - idea to use multicasting applications on router. + * Very complicated situation... + * + * The best workaround until routing daemons will be + * fixed is not to redistribute packet, if it was + * send through wrong interface. It means, that + * multicast applications WILL NOT work for + * (S,G), which have default multicast route pointing + * to wrong oif. In any case, it is not a good + * idea to use multicasting applications on router. */ goto dont_forward; } @@ -1670,9 +1674,9 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (true_vifi >= 0 && mrt->mroute_do_assert && /* pimsm uses asserts, when switching from RPT to SPT, - so that we cannot check that packet arrived on an oif. - It is bad, but otherwise we would need to move pretty - large chunk of pimd to kernel. Ough... --ANK + * so that we cannot check that packet arrived on an oif. + * It is bad, but otherwise we would need to move pretty + * large chunk of pimd to kernel. Ough... --ANK */ (mrt->mroute_do_pim || cache->mfc_un.res.ttls[true_vifi] < 255) && @@ -1690,10 +1694,12 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, /* * Forward the frame */ - for (ct = cache->mfc_un.res.maxvif-1; ct >= cache->mfc_un.res.minvif; ct--) { + for (ct = cache->mfc_un.res.maxvif - 1; + ct >= cache->mfc_un.res.minvif; ct--) { if (ip_hdr(skb)->ttl > cache->mfc_un.res.ttls[ct]) { if (psend != -1) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) ipmr_queue_xmit(net, mrt, skb2, cache, psend); @@ -1704,6 +1710,7 @@ static int ip_mr_forward(struct net *net, struct mr_table *mrt, if (psend != -1) { if (local) { struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC); + if (skb2) ipmr_queue_xmit(net, mrt, skb2, cache, psend); } else { @@ -1733,7 +1740,7 @@ int ip_mr_input(struct sk_buff *skb) int err; /* Packet is looped back after forward, it should not be - forwarded second time, but still can be delivered locally. + * forwarded second time, but still can be delivered locally. */ if (IPCB(skb)->flags & IPSKB_FORWARDED) goto dont_forward; @@ -1822,10 +1829,10 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, encap = (struct iphdr *)(skb_transport_header(skb) + pimlen); /* - Check that: - a. packet is really destinted to a multicast group - b. packet is not a NULL-REGISTER - c. packet is not truncated + * Check that: + * a. packet is really sent to a multicast group + * b. packet is not a NULL-REGISTER + * c. packet is not truncated */ if (!ipv4_is_multicast(encap->daddr) || encap->tot_len == 0 || @@ -1860,7 +1867,7 @@ static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb, * Handle IGMP messages of PIMv1 */ -int pim_rcv_v1(struct sk_buff * skb) +int pim_rcv_v1(struct sk_buff *skb) { struct igmphdr *pim; struct net *net = dev_net(skb->dev); @@ -1887,7 +1894,7 @@ drop: #endif #ifdef CONFIG_IP_PIMSM_V2 -static int pim_rcv(struct sk_buff * skb) +static int pim_rcv(struct sk_buff *skb) { struct pimreghdr *pim; struct net *net = dev_net(skb->dev); @@ -1897,8 +1904,8 @@ static int pim_rcv(struct sk_buff * skb) goto drop; pim = (struct pimreghdr *)skb_transport_header(skb); - if (pim->type != ((PIM_VERSION<<4)|(PIM_REGISTER)) || - (pim->flags&PIM_NULL_REGISTER) || + if (pim->type != ((PIM_VERSION << 4) | (PIM_REGISTER)) || + (pim->flags & PIM_NULL_REGISTER) || (ip_compute_csum((void *)pim, sizeof(*pim)) != 0 && csum_fold(skb_checksum(skb, 0, skb->len, 0)))) goto drop; @@ -1971,7 +1978,7 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb2; struct iphdr *iph; struct net_device *dev; - int vif; + int vif = -1; if (nowait) { rcu_read_unlock(); @@ -1980,7 +1987,9 @@ int ipmr_get_route(struct net *net, dev = skb->dev; read_lock(&mrt_lock); - if (dev == NULL || (vif = ipmr_find_vif(mrt, dev)) < 0) { + if (dev) + vif = ipmr_find_vif(mrt, dev); + if (vif < 0) { read_unlock(&mrt_lock); rcu_read_unlock(); return -ENODEV; @@ -2098,7 +2107,8 @@ done: #ifdef CONFIG_PROC_FS /* - * The /proc interfaces to multicast routing /proc/ip_mr_cache /proc/ip_mr_vif + * The /proc interfaces to multicast routing : + * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif */ struct ipmr_vif_iter { struct seq_net_private p; @@ -2294,7 +2304,7 @@ static void *ipmr_mfc_seq_next(struct seq_file *seq, void *v, loff_t *pos) if (!list_empty(it->cache)) return list_first_entry(it->cache, struct mfc_cache, list); - end_of_list: +end_of_list: spin_unlock_bh(&mfc_unres_lock); it->cache = NULL; @@ -2335,7 +2345,7 @@ static int ipmr_mfc_seq_show(struct seq_file *seq, void *v) mfc->mfc_un.res.bytes, mfc->mfc_un.res.wrong_if); for (n = mfc->mfc_un.res.minvif; - n < mfc->mfc_un.res.maxvif; n++ ) { + n < mfc->mfc_un.res.maxvif; n++) { if (VIF_EXISTS(mrt, n) && mfc->mfc_un.res.ttls[n] < 255) seq_printf(seq, -- cgit v1.1 From 9a7241c21b06c3a3f8ebcf3e347bd68556369da7 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Sun, 3 Oct 2010 22:14:37 -0700 Subject: sctp: Fix break indentation in sctp_ioctl(). Signed-off-by: David S. Miller --- net/sctp/socket.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 535659f..d4bf2a7 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -3617,8 +3617,8 @@ SCTP_STATIC int sctp_ioctl(struct sock *sk, int cmd, unsigned long arg) amount = skb->len; } rc = put_user(amount, (int __user *)arg); - } break; + } default: rc = -ENOIOCTLCMD; break; -- cgit v1.1 From c7d4426a98a5f6654cd0b4b33d9dab2e77192c18 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sun, 3 Oct 2010 22:17:54 -0700 Subject: net: introduce DST_NOCACHE flag While doing stress tests with IP route cache disabled, and multi queue devices, I noticed a very high contention on one rwlock used in neighbour code. When many cpus are trying to send frames (possibly using a high performance multiqueue device) to the same neighbour, they fight for the neigh->lock rwlock in order to call neigh_hh_init(), and fight on hh->hh_refcnt (a pair of atomic_inc/atomic_dec_and_test()) But we dont need to call neigh_hh_init() for dst that are used only once. It costs four atomic operations at least, on two contended cache lines, plus the high contention on neigh->lock rwlock. Introduce a new dst flag, DST_NOCACHE, that is set when dst was not inserted in route cache. With the stress test bench, sending 160000000 frames on one neighbour, results are : Before patch: real 2m28.406s user 0m11.781s sys 36m17.964s After patch: real 1m26.532s user 0m12.185s sys 20m3.903s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 4 +++- net/ipv4/route.c | 1 + 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 96b1a749a..b142a0d 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1210,7 +1210,9 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; - if (dev->header_ops->cache && !dst->hh) { + if (dev->header_ops->cache && + !dst->hh && + !(dst->flags & DST_NOCACHE)) { write_lock_bh(&neigh->lock); if (!dst->hh) neigh_hh_init(neigh, dst, dst->ops->protocol); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index a61acea..c3cb8bd 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1107,6 +1107,7 @@ restart: * on the route gc list. */ + rt->dst.flags |= DST_NOCACHE; if (rt->rt_type == RTN_UNICAST || rt->fl.iif == 0) { int err = arp_bind_neighbour(&rt->dst); if (err) { -- cgit v1.1 From 5adbb9fb0c35c38022f79e09fecf15ba8f65f069 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:51 +0900 Subject: netfilter: nf_conntrack_sip: Allow ct_sip_get_header() to be called with a null ct argument Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/nf_conntrack_sip.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 53d8922..2fd1ea2 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -152,6 +152,9 @@ static int parse_addr(const struct nf_conn *ct, const char *cp, const char *end; int ret = 0; + if (!ct) + return 0; + memset(addr, 0, sizeof(*addr)); switch (nf_ct_l3num(ct)) { case AF_INET: -- cgit v1.1 From 001985b2c0cfad48e1dec8e30f4d432eac240dd2 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:51 +0900 Subject: netfilter: nf_conntrack_sip: Add callid parser Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/nf_conntrack_sip.c | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2fd1ea2..715ce54 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -130,6 +130,44 @@ static int digits_len(const struct nf_conn *ct, const char *dptr, return len; } +static int iswordc(const char c) +{ + if (isalnum(c) || c == '!' || c == '"' || c == '%' || + (c >= '(' && c <= '/') || c == ':' || c == '<' || c == '>' || + c == '?' || (c >= '[' && c <= ']') || c == '_' || c == '`' || + c == '{' || c == '}' || c == '~') + return 1; + return 0; +} + +static int word_len(const char *dptr, const char *limit) +{ + int len = 0; + while (dptr < limit && iswordc(*dptr)) { + dptr++; + len++; + } + return len; +} + +static int callid_len(const struct nf_conn *ct, const char *dptr, + const char *limit, int *shift) +{ + int len, domain_len; + + len = word_len(dptr, limit); + dptr += len; + if (!len || dptr == limit || *dptr != '@') + return len; + dptr++; + len++; + + domain_len = word_len(dptr, limit); + if (!domain_len) + return 0; + return len + domain_len; +} + /* get media type + port length */ static int media_len(const struct nf_conn *ct, const char *dptr, const char *limit, int *shift) @@ -299,6 +337,7 @@ static const struct sip_header ct_sip_hdrs[] = { [SIP_HDR_VIA_TCP] = SIP_HDR("Via", "v", "TCP ", epaddr_len), [SIP_HDR_EXPIRES] = SIP_HDR("Expires", NULL, NULL, digits_len), [SIP_HDR_CONTENT_LENGTH] = SIP_HDR("Content-Length", "l", NULL, digits_len), + [SIP_HDR_CALL_ID] = SIP_HDR("Call-Id", "i", NULL, callid_len), }; static const char *sip_follow_continuation(const char *dptr, const char *limit) -- cgit v1.1 From 5b57a98c1f0d78a4c238d83c4ac70de3bd237b2f Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:51 +0900 Subject: IPVS: compact ip_vs_sched_persist() Compact ip_vs_sched_persist() by setting up parameters and calling functions once. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_core.c | 156 +++++++++++++--------------------------- 1 file changed, 51 insertions(+), 105 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 06c388b..70a5cac 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -193,10 +193,14 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; - __be16 dport; /* destination port to forward */ + int protocol = iph.protocol; + __be16 dport = 0; /* destination port to forward */ + __be16 vport = 0; /* virtual service port */ unsigned int flags; union nf_inet_addr snet; /* source network of the client, after masking */ + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + const union nf_inet_addr *vaddr = &iph.daddr; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); @@ -227,119 +231,61 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * service, and a template like * is created for other persistent services. */ - if (ports[1] == svc->port) { - /* Check if a template already exists */ - if (svc->port != FTPPORT) - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, ports[1]); - else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); - - if (!ct || !ip_vs_check_template(ct)) { - /* - * No template found or the dest of the connection - * template is not available. - */ - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } - - /* - * Create a template like for non-ftp service, - * and - * for ftp service. + { + if (ports[1] == svc->port) { + /* non-FTP template: + * + * FTP template: + * */ if (svc->port != FTPPORT) - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, - ports[1], - &dest->addr, dest->port, - IP_VS_CONN_F_TEMPLATE, - dest); - else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; + vport = ports[1]; } else { - /* set destination with the found template */ - dest = ct->dest; + /* Note: persistent fwmark-based services and + * persistent port zero service are handled here. + * fwmark template: + * + * port zero template: + * + */ + if (svc->fwmark) { + protocol = IPPROTO_IP; + vaddr = &fwmark; + } } - dport = dest->port; - } else { - /* - * Note: persistent fwmark-based services and persistent - * port zero service are handled here. - * fwmark template: - * port zero template: + } + + /* Check if a template already exists */ + ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport); + + if (!ct || !ip_vs_check_template(ct)) { + /* No template found or the dest of the connection + * template is not available. */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .ip = htonl(svc->fwmark) - }; + dest = svc->scheduler->schedule(svc, skb); + if (!dest) { + IP_VS_DBG(1, "p-schedule: no dest found.\n"); + return NULL; + } - ct = ip_vs_ct_in_get(svc->af, IPPROTO_IP, &snet, 0, - &fwmark, 0); - } else - ct = ip_vs_ct_in_get(svc->af, iph.protocol, &snet, 0, - &iph.daddr, 0); + if (ports[1] == svc->port && svc->port != FTPPORT) + dport = dest->port; - if (!ct || !ip_vs_check_template(ct)) { - /* - * If it is not persistent port zero, return NULL, - * otherwise create a connection template. - */ - if (svc->port) - return NULL; + /* Create a template */ + ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport, + &dest->addr, dport, + IP_VS_CONN_F_TEMPLATE, dest); + if (ct == NULL) + return NULL; - dest = svc->scheduler->schedule(svc, skb); - if (dest == NULL) { - IP_VS_DBG(1, "p-schedule: no dest found.\n"); - return NULL; - } + ct->timeout = svc->timeout; + } else + /* set destination with the found template */ + dest = ct->dest; - /* - * Create a template according to the service - */ - if (svc->fwmark) { - union nf_inet_addr fwmark = { - .ip = htonl(svc->fwmark) - }; - - ct = ip_vs_conn_new(svc->af, IPPROTO_IP, - &snet, 0, - &fwmark, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - } else - ct = ip_vs_conn_new(svc->af, iph.protocol, - &snet, 0, - &iph.daddr, 0, - &dest->addr, 0, - IP_VS_CONN_F_TEMPLATE, - dest); - if (ct == NULL) - return NULL; - - ct->timeout = svc->timeout; - } else { - /* set destination with the found template */ - dest = ct->dest; - } - dport = ports[1]; - } + dport = ports[1]; + if (dport == svc->port && dest->port) + dport = dest->port; flags = (svc->flags & IP_VS_SVC_F_ONEPACKET && iph.protocol == IPPROTO_UDP)? -- cgit v1.1 From f11017ec2d1859c661f4e2b12c4a8d250e1f47cf Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:52 +0900 Subject: IPVS: Add struct ip_vs_conn_param Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_conn.c | 171 ++++++++++++++++---------------- net/netfilter/ipvs/ip_vs_core.c | 64 ++++++------ net/netfilter/ipvs/ip_vs_ftp.c | 43 ++++---- net/netfilter/ipvs/ip_vs_nfct.c | 12 +-- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 47 ++++----- net/netfilter/ipvs/ip_vs_sync.c | 33 +++--- 6 files changed, 181 insertions(+), 189 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index a970d96..deeb906 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -218,27 +218,26 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) /* * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. * Called for pkts coming from OUTside-to-INside. - * s_addr, s_port: pkt source address (foreign host) - * d_addr, d_port: pkt dest address (load balancer) + * p->caddr, p->cport: pkt source address (foreign host) + * p->vaddr, p->vport: pkt dest address (load balancer) */ -static inline struct ip_vs_conn *__ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +static inline struct ip_vs_conn * +__ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && - ip_vs_addr_equal(af, d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && - ((!s_port) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && - protocol == cp->protocol) { + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->vaddr, &cp->vaddr) && + p->cport == cp->cport && p->vport == cp->vport && + ((!p->cport) ^ (!(cp->flags & IP_VS_CONN_F_NO_CPORT))) && + p->protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); ct_read_unlock(hash); @@ -251,71 +250,82 @@ static inline struct ip_vs_conn *__ip_vs_conn_in_get return NULL; } -struct ip_vs_conn *ip_vs_conn_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +struct ip_vs_conn *ip_vs_conn_in_get(const struct ip_vs_conn_param *p) { struct ip_vs_conn *cp; - cp = __ip_vs_conn_in_get(af, protocol, s_addr, s_port, d_addr, d_port); - if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) - cp = __ip_vs_conn_in_get(af, protocol, s_addr, 0, d_addr, - d_port); + cp = __ip_vs_conn_in_get(p); + if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) { + struct ip_vs_conn_param cport_zero_p = *p; + cport_zero_p.cport = 0; + cp = __ip_vs_conn_in_get(&cport_zero_p); + } IP_VS_DBG_BUF(9, "lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), cp ? "hit" : "not hit"); return cp; } +static int +ip_vs_conn_fill_param_proto(int af, const struct sk_buff *skb, + const struct ip_vs_iphdr *iph, + unsigned int proto_off, int inverse, + struct ip_vs_conn_param *p) +{ + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); + if (pptr == NULL) + return 1; + + if (likely(!inverse)) + ip_vs_conn_fill_param(af, iph->protocol, &iph->saddr, pptr[0], + &iph->daddr, pptr[1], p); + else + ip_vs_conn_fill_param(af, iph->protocol, &iph->daddr, pptr[1], + &iph->saddr, pptr[0], p); + return 0; +} + struct ip_vs_conn * ip_vs_conn_in_get_proto(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { - __be16 _ports[2], *pptr; + struct ip_vs_conn_param p; - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) return NULL; - if (likely(!inverse)) - return ip_vs_conn_in_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_in_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); + return ip_vs_conn_in_get(&p); } EXPORT_SYMBOL_GPL(ip_vs_conn_in_get_proto); /* Get reference to connection template */ -struct ip_vs_conn *ip_vs_ct_in_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(af, protocol, s_addr, s_port); + hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, s_addr, &cp->caddr) && + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && /* protocol should only be IPPROTO_IP if - * d_addr is a fwmark */ - ip_vs_addr_equal(protocol == IPPROTO_IP ? AF_UNSPEC : af, - d_addr, &cp->vaddr) && - s_port == cp->cport && d_port == cp->vport && + * p->vaddr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : + p->af, p->vaddr, &cp->vaddr) && + p->cport == cp->cport && p->vport == cp->vport && cp->flags & IP_VS_CONN_F_TEMPLATE && - protocol == cp->protocol) { + p->protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); goto out; @@ -327,23 +337,19 @@ struct ip_vs_conn *ip_vs_ct_in_get ct_read_unlock(hash); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), cp ? "hit" : "not hit"); return cp; } -/* - * Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. - * Called for pkts coming from inside-to-OUTside. - * s_addr, s_port: pkt source address (inside host) - * d_addr, d_port: pkt dest address (foreign host) - */ -struct ip_vs_conn *ip_vs_conn_out_get -(int af, int protocol, const union nf_inet_addr *s_addr, __be16 s_port, - const union nf_inet_addr *d_addr, __be16 d_port) +/* Gets ip_vs_conn associated with supplied parameters in the ip_vs_conn_tab. + * Called for pkts coming from inside-to-OUTside. + * p->caddr, p->cport: pkt source address (inside host) + * p->vaddr, p->vport: pkt dest address (foreign host) */ +struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) { unsigned hash; struct ip_vs_conn *cp, *ret=NULL; @@ -351,16 +357,16 @@ struct ip_vs_conn *ip_vs_conn_out_get /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(af, protocol, d_addr, d_port); + hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (cp->af == af && - ip_vs_addr_equal(af, d_addr, &cp->caddr) && - ip_vs_addr_equal(af, s_addr, &cp->daddr) && - d_port == cp->cport && s_port == cp->dport && - protocol == cp->protocol) { + if (cp->af == p->af && + ip_vs_addr_equal(p->af, p->vaddr, &cp->caddr) && + ip_vs_addr_equal(p->af, p->caddr, &cp->daddr) && + p->vport == cp->cport && p->cport == cp->dport && + p->protocol == cp->protocol) { /* HIT */ atomic_inc(&cp->refcnt); ret = cp; @@ -371,9 +377,9 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); IP_VS_DBG_BUF(9, "lookup/out %s %s:%d->%s:%d %s\n", - ip_vs_proto_name(protocol), - IP_VS_DBG_ADDR(af, s_addr), ntohs(s_port), - IP_VS_DBG_ADDR(af, d_addr), ntohs(d_port), + ip_vs_proto_name(p->protocol), + IP_VS_DBG_ADDR(p->af, p->caddr), ntohs(p->cport), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), ret ? "hit" : "not hit"); return ret; @@ -385,20 +391,12 @@ ip_vs_conn_out_get_proto(int af, const struct sk_buff *skb, const struct ip_vs_iphdr *iph, unsigned int proto_off, int inverse) { - __be16 _ports[2], *pptr; + struct ip_vs_conn_param p; - pptr = skb_header_pointer(skb, proto_off, sizeof(_ports), _ports); - if (pptr == NULL) + if (ip_vs_conn_fill_param_proto(af, skb, iph, proto_off, inverse, &p)) return NULL; - if (likely(!inverse)) - return ip_vs_conn_out_get(af, iph->protocol, - &iph->saddr, pptr[0], - &iph->daddr, pptr[1]); - else - return ip_vs_conn_out_get(af, iph->protocol, - &iph->daddr, pptr[1], - &iph->saddr, pptr[0]); + return ip_vs_conn_out_get(&p); } EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto); @@ -758,13 +756,12 @@ void ip_vs_conn_expire_now(struct ip_vs_conn *cp) * Create a new connection entry and hash it into the ip_vs_conn_tab */ struct ip_vs_conn * -ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, - const union nf_inet_addr *vaddr, __be16 vport, +ip_vs_conn_new(const struct ip_vs_conn_param *p, const union nf_inet_addr *daddr, __be16 dport, unsigned flags, struct ip_vs_dest *dest) { struct ip_vs_conn *cp; - struct ip_vs_protocol *pp = ip_vs_proto_get(proto); + struct ip_vs_protocol *pp = ip_vs_proto_get(p->protocol); cp = kmem_cache_zalloc(ip_vs_conn_cachep, GFP_ATOMIC); if (cp == NULL) { @@ -774,14 +771,14 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, INIT_LIST_HEAD(&cp->c_list); setup_timer(&cp->timer, ip_vs_conn_expire, (unsigned long)cp); - cp->af = af; - cp->protocol = proto; - ip_vs_addr_copy(af, &cp->caddr, caddr); - cp->cport = cport; - ip_vs_addr_copy(af, &cp->vaddr, vaddr); - cp->vport = vport; + cp->af = p->af; + cp->protocol = p->protocol; + ip_vs_addr_copy(p->af, &cp->caddr, p->caddr); + cp->cport = p->cport; + ip_vs_addr_copy(p->af, &cp->vaddr, p->vaddr); + cp->vport = p->vport; /* proto should only be IPPROTO_IP if d_addr is a fwmark */ - ip_vs_addr_copy(proto == IPPROTO_IP ? AF_UNSPEC : af, + ip_vs_addr_copy(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; @@ -810,7 +807,7 @@ ip_vs_conn_new(int af, int proto, const union nf_inet_addr *caddr, __be16 cport, /* Bind its packet transmitter */ #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) + if (p->af == AF_INET6) ip_vs_bind_xmit_v6(cp); else #endif diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 70a5cac..87602a6 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -193,14 +193,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, struct ip_vs_iphdr iph; struct ip_vs_dest *dest; struct ip_vs_conn *ct; - int protocol = iph.protocol; __be16 dport = 0; /* destination port to forward */ - __be16 vport = 0; /* virtual service port */ unsigned int flags; + struct ip_vs_conn_param param; union nf_inet_addr snet; /* source network of the client, after masking */ - const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; - const union nf_inet_addr *vaddr = &iph.daddr; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); @@ -232,6 +229,11 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * is created for other persistent services. */ { + int protocol = iph.protocol; + const union nf_inet_addr *vaddr = &iph.daddr; + const union nf_inet_addr fwmark = { .ip = htonl(svc->fwmark) }; + __be16 vport = 0; + if (ports[1] == svc->port) { /* non-FTP template: * @@ -253,11 +255,12 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } + ip_vs_conn_fill_param(svc->af, protocol, &snet, 0, + vaddr, vport, ¶m); } /* Check if a template already exists */ - ct = ip_vs_ct_in_get(svc->af, protocol, &snet, 0, vaddr, vport); - + ct = ip_vs_ct_in_get(¶m); if (!ct || !ip_vs_check_template(ct)) { /* No template found or the dest of the connection * template is not available. @@ -272,8 +275,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dport = dest->port; /* Create a template */ - ct = ip_vs_conn_new(svc->af, protocol, &snet, 0,vaddr, vport, - &dest->addr, dport, + ct = ip_vs_conn_new(¶m, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest); if (ct == NULL) return NULL; @@ -294,12 +296,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc, /* * Create a new connection according to the template */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, ports[0], - &iph.daddr, ports[1], - &dest->addr, dport, - flags, - dest); + ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, ports[0], + &iph.daddr, ports[1], ¶m); + cp = ip_vs_conn_new(¶m, &dest->addr, dport, flags, dest); if (cp == NULL) { ip_vs_conn_put(ct); return NULL; @@ -366,14 +365,16 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) /* * Create a connection entry. */ - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &dest->addr, dest->port ? dest->port : pptr[1], - flags, - dest); - if (cp == NULL) - return NULL; + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->af, iph.protocol, &iph.saddr, + pptr[0], &iph.daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, &dest->addr, + dest->port ? dest->port : pptr[1], + flags, dest); + if (!cp) + return NULL; + } IP_VS_DBG_BUF(6, "Schedule fwd:%c c:%s:%u v:%s:%u " "d:%s:%u conn->flags:%X conn->refcnt:%d\n", @@ -429,14 +430,17 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, /* create a new connection entry */ IP_VS_DBG(6, "%s(): create a cache_bypass entry\n", __func__); - cp = ip_vs_conn_new(svc->af, iph.protocol, - &iph.saddr, pptr[0], - &iph.daddr, pptr[1], - &daddr, 0, - IP_VS_CONN_F_BYPASS | flags, - NULL); - if (cp == NULL) - return NF_DROP; + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(svc->af, iph.protocol, + &iph.saddr, pptr[0], + &iph.daddr, pptr[1], &p); + cp = ip_vs_conn_new(&p, &daddr, 0, + IP_VS_CONN_F_BYPASS | flags, + NULL); + if (!cp) + return NF_DROP; + } /* statistics */ ip_vs_in_stats(cp, skb); diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 9cd375f..090889a 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -195,13 +195,17 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, /* * Now update or create an connection entry for it */ - n_cp = ip_vs_conn_out_get(AF_INET, iph->protocol, &from, port, - &cp->caddr, 0); + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(AF_INET, iph->protocol, + &from, port, &cp->caddr, 0, &p); + n_cp = ip_vs_conn_out_get(&p); + } if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &cp->caddr, 0, - &cp->vaddr, port, - &from, port, + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(AF_INET, IPPROTO_TCP, &cp->caddr, + 0, &cp->vaddr, port, &p); + n_cp = ip_vs_conn_new(&p, &from, port, IP_VS_CONN_F_NO_CPORT | IP_VS_CONN_F_NFCT, cp->dest); @@ -347,21 +351,22 @@ static int ip_vs_ftp_in(struct ip_vs_app *app, struct ip_vs_conn *cp, ip_vs_proto_name(iph->protocol), &to.ip, ntohs(port), &cp->vaddr.ip, 0); - n_cp = ip_vs_conn_in_get(AF_INET, iph->protocol, - &to, port, - &cp->vaddr, htons(ntohs(cp->vport)-1)); - if (!n_cp) { - n_cp = ip_vs_conn_new(AF_INET, IPPROTO_TCP, - &to, port, + { + struct ip_vs_conn_param p; + ip_vs_conn_fill_param(AF_INET, iph->protocol, &to, port, &cp->vaddr, htons(ntohs(cp->vport)-1), - &cp->daddr, htons(ntohs(cp->dport)-1), - IP_VS_CONN_F_NFCT, - cp->dest); - if (!n_cp) - return 0; + &p); + n_cp = ip_vs_conn_in_get(&p); + if (!n_cp) { + n_cp = ip_vs_conn_new(&p, &cp->daddr, + htons(ntohs(cp->dport)-1), + IP_VS_CONN_F_NFCT, cp->dest); + if (!n_cp) + return 0; - /* add its controller */ - ip_vs_control_add(n_cp, cp); + /* add its controller */ + ip_vs_control_add(n_cp, cp); + } } /* diff --git a/net/netfilter/ipvs/ip_vs_nfct.c b/net/netfilter/ipvs/ip_vs_nfct.c index c038458..4680647 100644 --- a/net/netfilter/ipvs/ip_vs_nfct.c +++ b/net/netfilter/ipvs/ip_vs_nfct.c @@ -140,6 +140,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, { struct nf_conntrack_tuple *orig, new_reply; struct ip_vs_conn *cp; + struct ip_vs_conn_param p; if (exp->tuple.src.l3num != PF_INET) return; @@ -154,9 +155,10 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, /* RS->CLIENT */ orig = &ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; - cp = ip_vs_conn_out_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); + ip_vs_conn_fill_param(exp->tuple.src.l3num, orig->dst.protonum, + &orig->src.u3, orig->src.u.tcp.port, + &orig->dst.u3, orig->dst.u.tcp.port, &p); + cp = ip_vs_conn_out_get(&p); if (cp) { /* Change reply CLIENT->RS to CLIENT->VS */ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; @@ -176,9 +178,7 @@ static void ip_vs_nfct_expect_callback(struct nf_conn *ct, } /* CLIENT->VS */ - cp = ip_vs_conn_in_get(exp->tuple.src.l3num, orig->dst.protonum, - &orig->src.u3, orig->src.u.tcp.port, - &orig->dst.u3, orig->dst.u.tcp.port); + cp = ip_vs_conn_in_get(&p); if (cp) { /* Change reply VS->CLIENT to RS->CLIENT */ new_reply = ct->tuplehash[IP_CT_DIR_REPLY].tuple; diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 1892dfc..8956ef3 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -40,6 +40,19 @@ struct isakmp_hdr { #define PORT_ISAKMP 500 +static void +ah_esp_conn_fill_param_proto(int af, const struct ip_vs_iphdr *iph, + int inverse, struct ip_vs_conn_param *p) +{ + if (likely(!inverse)) + ip_vs_conn_fill_param(af, IPPROTO_UDP, + &iph->saddr, htons(PORT_ISAKMP), + &iph->daddr, htons(PORT_ISAKMP), p); + else + ip_vs_conn_fill_param(af, IPPROTO_UDP, + &iph->daddr, htons(PORT_ISAKMP), + &iph->saddr, htons(PORT_ISAKMP), p); +} static struct ip_vs_conn * ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, @@ -47,21 +60,10 @@ ah_esp_conn_in_get(int af, const struct sk_buff *skb, struct ip_vs_protocol *pp, int inverse) { struct ip_vs_conn *cp; + struct ip_vs_conn_param p; - if (likely(!inverse)) { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_in_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - + ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + cp = ip_vs_conn_in_get(&p); if (!cp) { /* * We are not sure if the packet is from our @@ -87,21 +89,10 @@ ah_esp_conn_out_get(int af, const struct sk_buff *skb, int inverse) { struct ip_vs_conn *cp; + struct ip_vs_conn_param p; - if (likely(!inverse)) { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->saddr, - htons(PORT_ISAKMP), - &iph->daddr, - htons(PORT_ISAKMP)); - } else { - cp = ip_vs_conn_out_get(af, IPPROTO_UDP, - &iph->daddr, - htons(PORT_ISAKMP), - &iph->saddr, - htons(PORT_ISAKMP)); - } - + ah_esp_conn_fill_param_proto(af, iph, inverse, &p); + cp = ip_vs_conn_out_get(&p); if (!cp) { IP_VS_DBG_BUF(12, "Unknown ISAKMP entry for inout packet " "%s%s %s->%s\n", diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index 7ba0693..f68631f 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -301,6 +301,7 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) struct ip_vs_conn *cp; struct ip_vs_protocol *pp; struct ip_vs_dest *dest; + struct ip_vs_conn_param param; char *p; int i; @@ -370,18 +371,17 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } } - if (!(flags & IP_VS_CONN_F_TEMPLATE)) - cp = ip_vs_conn_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); - else - cp = ip_vs_ct_in_get(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport); + { + ip_vs_conn_fill_param(AF_INET, s->protocol, + (union nf_inet_addr *)&s->caddr, + s->cport, + (union nf_inet_addr *)&s->vaddr, + s->vport, ¶m); + if (!(flags & IP_VS_CONN_F_TEMPLATE)) + cp = ip_vs_conn_in_get(¶m); + else + cp = ip_vs_ct_in_get(¶m); + } if (!cp) { /* * Find the appropriate destination for the connection. @@ -406,14 +406,9 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) else flags &= ~IP_VS_CONN_F_INACTIVE; } - cp = ip_vs_conn_new(AF_INET, s->protocol, - (union nf_inet_addr *)&s->caddr, - s->cport, - (union nf_inet_addr *)&s->vaddr, - s->vport, + cp = ip_vs_conn_new(¶m, (union nf_inet_addr *)&s->daddr, - s->dport, - flags, dest); + s->dport, flags, dest); if (dest) atomic_dec(&dest->refcnt); if (!cp) { -- cgit v1.1 From 6e08bfb879574524cc9a67be960c684989fd986c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:52 +0900 Subject: IPVS: Allow null argument to ip_vs_scheduler_put() This simplifies caller logic sightly. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 13 ++++--------- net/netfilter/ipvs/ip_vs_sched.c | 2 +- 2 files changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index e4ec8f3..f7afcfe 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1144,7 +1144,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, if (sched == NULL) { pr_info("Scheduler module ip_vs_%s not found\n", u->sched_name); ret = -ENOENT; - goto out_mod_dec; + goto out_err; } #ifdef CONFIG_IP_VS_IPV6 @@ -1204,7 +1204,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, *svc_p = svc; return 0; - out_err: + out_err: if (svc != NULL) { if (svc->scheduler) ip_vs_unbind_scheduler(svc); @@ -1217,7 +1217,6 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, } ip_vs_scheduler_put(sched); - out_mod_dec: /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1300,10 +1299,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) #ifdef CONFIG_IP_VS_IPV6 out: #endif - - if (old_sched) - ip_vs_scheduler_put(old_sched); - + ip_vs_scheduler_put(old_sched); return ret; } @@ -1327,8 +1323,7 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) /* Unbind scheduler */ old_sched = svc->scheduler; ip_vs_unbind_scheduler(svc); - if (old_sched) - ip_vs_scheduler_put(old_sched); + ip_vs_scheduler_put(old_sched); /* Unbind app inc */ if (svc->inc) { diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 727e45b..9f94e32 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -159,7 +159,7 @@ struct ip_vs_scheduler *ip_vs_scheduler_get(const char *sched_name) void ip_vs_scheduler_put(struct ip_vs_scheduler *scheduler) { - if (scheduler->module) + if (scheduler && scheduler->module) module_put(scheduler->module); } -- cgit v1.1 From 2fabf35bfcd89445c54cf1e6a5437dd3cf924a92 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:52 +0900 Subject: IPVS: ip_vs_{un,}bind_scheduler NULL arguments In general NULL arguments aren't passed by the few callers that exist, so don't test for them. The exception is to make passing NULL to ip_vs_unbind_scheduler() a noop. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 3 +-- net/netfilter/ipvs/ip_vs_sched.c | 23 +++-------------------- 2 files changed, 4 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index f7afcfe..5f20caf 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1206,8 +1206,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, out_err: if (svc != NULL) { - if (svc->scheduler) - ip_vs_unbind_scheduler(svc); + ip_vs_unbind_scheduler(svc); if (svc->inc) { local_bh_disable(); ip_vs_app_inc_put(svc->inc); diff --git a/net/netfilter/ipvs/ip_vs_sched.c b/net/netfilter/ipvs/ip_vs_sched.c index 9f94e32..076ebe0 100644 --- a/net/netfilter/ipvs/ip_vs_sched.c +++ b/net/netfilter/ipvs/ip_vs_sched.c @@ -46,15 +46,6 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, { int ret; - if (svc == NULL) { - pr_err("%s(): svc arg NULL\n", __func__); - return -EINVAL; - } - if (scheduler == NULL) { - pr_err("%s(): scheduler arg NULL\n", __func__); - return -EINVAL; - } - svc->scheduler = scheduler; if (scheduler->init_service) { @@ -74,18 +65,10 @@ int ip_vs_bind_scheduler(struct ip_vs_service *svc, */ int ip_vs_unbind_scheduler(struct ip_vs_service *svc) { - struct ip_vs_scheduler *sched; + struct ip_vs_scheduler *sched = svc->scheduler; - if (svc == NULL) { - pr_err("%s(): svc arg NULL\n", __func__); - return -EINVAL; - } - - sched = svc->scheduler; - if (sched == NULL) { - pr_err("%s(): svc isn't bound\n", __func__); - return -EINVAL; - } + if (!sched) + return 0; if (sched->done_service) { if (sched->done_service(svc) != 0) { -- cgit v1.1 From 85999283a21ab2dd37427fdd8c8e8af57223977c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:53 +0900 Subject: IPVS: Add struct ip_vs_pe Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_conn.c | 67 ++++++++++++++++++++++++++++++++++------- net/netfilter/ipvs/ip_vs_core.c | 36 +++++++++++++++++----- net/netfilter/ipvs/ip_vs_sync.c | 17 +++++++++-- 3 files changed, 100 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index deeb906..06da21e 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -148,6 +148,42 @@ static unsigned int ip_vs_conn_hashkey(int af, unsigned proto, & ip_vs_conn_tab_mask; } +static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, + bool inverse) +{ + const union nf_inet_addr *addr; + __be16 port; + + if (p->pe && p->pe->hashkey_raw) + return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & + ip_vs_conn_tab_mask; + + if (likely(!inverse)) { + addr = p->caddr; + port = p->cport; + } else { + addr = p->vaddr; + port = p->vport; + } + + return ip_vs_conn_hashkey(p->af, p->protocol, addr, port); +} + +static unsigned int ip_vs_conn_hashkey_conn(const struct ip_vs_conn *cp) +{ + struct ip_vs_conn_param p; + + ip_vs_conn_fill_param(cp->af, cp->protocol, &cp->caddr, cp->cport, + NULL, 0, &p); + + if (cp->dest && cp->dest->svc->pe) { + p.pe = cp->dest->svc->pe; + p.pe_data = cp->pe_data; + p.pe_data_len = cp->pe_data_len; + } + + return ip_vs_conn_hashkey_param(&p, false); +} /* * Hashes ip_vs_conn in ip_vs_conn_tab by proto,addr,port. @@ -162,7 +198,7 @@ static inline int ip_vs_conn_hash(struct ip_vs_conn *cp) return 0; /* Hash by protocol, client address and port */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey_conn(cp); ct_write_lock(hash); spin_lock(&cp->lock); @@ -195,7 +231,7 @@ static inline int ip_vs_conn_unhash(struct ip_vs_conn *cp) int ret; /* unhash it and decrease its reference counter */ - hash = ip_vs_conn_hashkey(cp->af, cp->protocol, &cp->caddr, cp->cport); + hash = ip_vs_conn_hashkey_conn(cp); ct_write_lock(hash); spin_lock(&cp->lock); @@ -227,7 +263,7 @@ __ip_vs_conn_in_get(const struct ip_vs_conn_param *p) unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); + hash = ip_vs_conn_hashkey_param(p, false); ct_read_lock(hash); @@ -312,11 +348,17 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) unsigned hash; struct ip_vs_conn *cp; - hash = ip_vs_conn_hashkey(p->af, p->protocol, p->caddr, p->cport); + hash = ip_vs_conn_hashkey_param(p, false); ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { + if (p->pe && p->pe->ct_match) { + if (p->pe->ct_match(p, cp)) + goto out; + continue; + } + if (cp->af == p->af && ip_vs_addr_equal(p->af, p->caddr, &cp->caddr) && /* protocol should only be IPPROTO_IP if @@ -325,15 +367,14 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) p->af, p->vaddr, &cp->vaddr) && p->cport == cp->cport && p->vport == cp->vport && cp->flags & IP_VS_CONN_F_TEMPLATE && - p->protocol == cp->protocol) { - /* HIT */ - atomic_inc(&cp->refcnt); + p->protocol == cp->protocol) goto out; - } } cp = NULL; out: + if (cp) + atomic_inc(&cp->refcnt); ct_read_unlock(hash); IP_VS_DBG_BUF(9, "template lookup/in %s %s:%d->%s:%d %s\n", @@ -357,7 +398,7 @@ struct ip_vs_conn *ip_vs_conn_out_get(const struct ip_vs_conn_param *p) /* * Check for "full" addressed entries */ - hash = ip_vs_conn_hashkey(p->af, p->protocol, p->vaddr, p->vport); + hash = ip_vs_conn_hashkey_param(p, true); ct_read_lock(hash); @@ -722,6 +763,7 @@ static void ip_vs_conn_expire(unsigned long data) if (cp->flags & IP_VS_CONN_F_NFCT) ip_vs_conn_drop_conntrack(cp); + kfree(cp->pe_data); if (unlikely(cp->app != NULL)) ip_vs_unbind_app(cp); ip_vs_unbind_dest(cp); @@ -782,6 +824,10 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, &cp->daddr, daddr); cp->dport = dport; cp->flags = flags; + if (flags & IP_VS_CONN_F_TEMPLATE && p->pe_data) { + cp->pe_data = p->pe_data; + cp->pe_data_len = p->pe_data_len; + } spin_lock_init(&cp->lock); /* @@ -832,7 +878,6 @@ ip_vs_conn_new(const struct ip_vs_conn_param *p, return cp; } - /* * /proc/net/ip_vs_conn entries */ @@ -848,7 +893,7 @@ static void *ip_vs_conn_array(struct seq_file *seq, loff_t pos) list_for_each_entry(cp, &ip_vs_conn_tab[idx], c_list) { if (pos-- == 0) { seq->private = &ip_vs_conn_tab[idx]; - return cp; + return cp; } } ct_read_unlock_bh(idx); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 87602a6..ab98893 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -176,6 +176,19 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, return pp->state_transition(cp, direction, skb, pp); } +static inline int +ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, + struct sk_buff *skb, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); + p->pe = svc->pe; + if (p->pe && p->pe->fill_param) + return p->pe->fill_param(p, skb); + return 0; +} /* * IPVS persistent scheduling function @@ -186,7 +199,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, */ static struct ip_vs_conn * ip_vs_sched_persist(struct ip_vs_service *svc, - const struct sk_buff *skb, + struct sk_buff *skb, __be16 ports[2]) { struct ip_vs_conn *cp = NULL; @@ -255,8 +268,9 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } - ip_vs_conn_fill_param(svc->af, protocol, &snet, 0, - vaddr, vport, ¶m); + if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m)) + return NULL; } /* Check if a template already exists */ @@ -268,22 +282,30 @@ ip_vs_sched_persist(struct ip_vs_service *svc, dest = svc->scheduler->schedule(svc, skb); if (!dest) { IP_VS_DBG(1, "p-schedule: no dest found.\n"); + kfree(param.pe_data); return NULL; } if (ports[1] == svc->port && svc->port != FTPPORT) dport = dest->port; - /* Create a template */ + /* Create a template + * This adds param.pe_data to the template, + * and thus param.pe_data will be destroyed + * when the template expires */ ct = ip_vs_conn_new(¶m, &dest->addr, dport, IP_VS_CONN_F_TEMPLATE, dest); - if (ct == NULL) + if (ct == NULL) { + kfree(param.pe_data); return NULL; + } ct->timeout = svc->timeout; - } else + } else { /* set destination with the found template */ dest = ct->dest; + kfree(param.pe_data); + } dport = ports[1]; if (dport == svc->port && dest->port) @@ -322,7 +344,7 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * Protocols supported: TCP, UDP */ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; diff --git a/net/netfilter/ipvs/ip_vs_sync.c b/net/netfilter/ipvs/ip_vs_sync.c index f68631f..ab85aed 100644 --- a/net/netfilter/ipvs/ip_vs_sync.c +++ b/net/netfilter/ipvs/ip_vs_sync.c @@ -288,6 +288,16 @@ void ip_vs_sync_conn(struct ip_vs_conn *cp) ip_vs_sync_conn(cp->control); } +static inline int +ip_vs_conn_fill_param_sync(int af, int protocol, + const union nf_inet_addr *caddr, __be16 cport, + const union nf_inet_addr *vaddr, __be16 vport, + struct ip_vs_conn_param *p) +{ + /* XXX: Need to take into account persistence engine */ + ip_vs_conn_fill_param(af, protocol, caddr, cport, vaddr, vport, p); + return 0; +} /* * Process received multicast message and create the corresponding @@ -372,11 +382,14 @@ static void ip_vs_process_message(const char *buffer, const size_t buflen) } { - ip_vs_conn_fill_param(AF_INET, s->protocol, + if (ip_vs_conn_fill_param_sync(AF_INET, s->protocol, (union nf_inet_addr *)&s->caddr, s->cport, (union nf_inet_addr *)&s->vaddr, - s->vport, ¶m); + s->vport, ¶m)) { + pr_err("ip_vs_conn_fill_param_sync failed"); + return; + } if (!(flags & IP_VS_CONN_F_TEMPLATE)) cp = ip_vs_conn_in_get(¶m); else -- cgit v1.1 From a3c918acd29a96aba3b46bf50136e7953a480d17 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:53 +0900 Subject: IPVS: Add persistence engine data to /proc/net/ip_vs_conn This shouldn't break compatibility with userspace as the new data is at the end of the line. I have confirmed that this doesn't break ipvsadm, the main (only?) user-space user of this data. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_conn.c | 25 ++++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 06da21e..4adedef 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -950,30 +950,45 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) if (v == SEQ_START_TOKEN) seq_puts(seq, - "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires\n"); + "Pro FromIP FPrt ToIP TPrt DestIP DPrt State Expires PEName PEData\n"); else { const struct ip_vs_conn *cp = v; + char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; + size_t len = 0; + + if (cp->dest && cp->dest->svc->pe && + cp->dest->svc->pe->show_pe_data) { + pe_data[0] = ' '; + len = strlen(cp->dest->svc->pe->name); + memcpy(pe_data + 1, cp->dest->svc->pe->name, len); + pe_data[len + 1] = ' '; + len += 2; + len += cp->dest->svc->pe->show_pe_data(cp, + pe_data + len); + } + pe_data[len] = '\0'; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) - seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X %pI6 %04X %-11s %7lu\n", + seq_printf(seq, "%-3s %pI6 %04X %pI6 %04X " + "%pI6 %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), &cp->caddr.in6, ntohs(cp->cport), &cp->vaddr.in6, ntohs(cp->vport), &cp->daddr.in6, ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); + (cp->timer.expires-jiffies)/HZ, pe_data); else #endif seq_printf(seq, "%-3s %08X %04X %08X %04X" - " %08X %04X %-11s %7lu\n", + " %08X %04X %-11s %7lu%s\n", ip_vs_proto_name(cp->protocol), ntohl(cp->caddr.ip), ntohs(cp->cport), ntohl(cp->vaddr.ip), ntohs(cp->vport), ntohl(cp->daddr.ip), ntohs(cp->dport), ip_vs_state_name(cp->protocol, cp->state), - (cp->timer.expires-jiffies)/HZ); + (cp->timer.expires-jiffies)/HZ, pe_data); } return 0; } -- cgit v1.1 From 8be67a6617b3403551fccb67b1c624c659419515 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:54 +0900 Subject: IPVS: management of persistence engine modules This is based heavily on the scheduler management code Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/Makefile | 2 +- net/netfilter/ipvs/ip_vs_pe.c | 147 ++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 148 insertions(+), 1 deletion(-) create mode 100644 net/netfilter/ipvs/ip_vs_pe.c (limited to 'net') diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 349fe88..4a87bf3 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -14,7 +14,7 @@ ip_vs-extra_objs-$(CONFIG_IP_VS_NFCT) += ip_vs_nfct.o ip_vs-objs := ip_vs_conn.o ip_vs_core.o ip_vs_ctl.o ip_vs_sched.o \ ip_vs_xmit.o ip_vs_app.o ip_vs_sync.o \ - ip_vs_est.o ip_vs_proto.o \ + ip_vs_est.o ip_vs_proto.o ip_vs_pe.o \ $(ip_vs_proto-objs-y) $(ip_vs-extra_objs-y) diff --git a/net/netfilter/ipvs/ip_vs_pe.c b/net/netfilter/ipvs/ip_vs_pe.c new file mode 100644 index 0000000..3414af7 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe.c @@ -0,0 +1,147 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include +#include +#include +#include +#include + +#include + +/* IPVS pe list */ +static LIST_HEAD(ip_vs_pe); + +/* lock for service table */ +static DEFINE_SPINLOCK(ip_vs_pe_lock); + +/* Bind a service with a pe */ +void ip_vs_bind_pe(struct ip_vs_service *svc, struct ip_vs_pe *pe) +{ + svc->pe = pe; +} + +/* Unbind a service from its pe */ +void ip_vs_unbind_pe(struct ip_vs_service *svc) +{ + svc->pe = NULL; +} + +/* Get pe in the pe list by name */ +static struct ip_vs_pe * +ip_vs_pe_getbyname(const char *pe_name) +{ + struct ip_vs_pe *pe; + + IP_VS_DBG(2, "%s(): pe_name \"%s\"\n", __func__, + pe_name); + + spin_lock_bh(&ip_vs_pe_lock); + + list_for_each_entry(pe, &ip_vs_pe, n_list) { + /* Test and get the modules atomically */ + if (pe->module && + !try_module_get(pe->module)) { + /* This pe is just deleted */ + continue; + } + if (strcmp(pe_name, pe->name)==0) { + /* HIT */ + spin_unlock_bh(&ip_vs_pe_lock); + return pe; + } + if (pe->module) + module_put(pe->module); + } + + spin_unlock_bh(&ip_vs_pe_lock); + return NULL; +} + +/* Lookup pe and try to load it if it doesn't exist */ +struct ip_vs_pe *ip_vs_pe_get(const char *name) +{ + struct ip_vs_pe *pe; + + /* Search for the pe by name */ + pe = ip_vs_pe_getbyname(name); + + /* If pe not found, load the module and search again */ + if (!pe) { + request_module("ip_vs_pe_%s", name); + pe = ip_vs_pe_getbyname(name); + } + + return pe; +} + +void ip_vs_pe_put(struct ip_vs_pe *pe) +{ + if (pe && pe->module) + module_put(pe->module); +} + +/* Register a pe in the pe list */ +int register_ip_vs_pe(struct ip_vs_pe *pe) +{ + struct ip_vs_pe *tmp; + + /* increase the module use count */ + ip_vs_use_count_inc(); + + spin_lock_bh(&ip_vs_pe_lock); + + if (!list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already linked\n", + __func__, pe->name); + return -EINVAL; + } + + /* Make sure that the pe with this name doesn't exist + * in the pe list. + */ + list_for_each_entry(tmp, &ip_vs_pe, n_list) { + if (strcmp(tmp->name, pe->name) == 0) { + spin_unlock_bh(&ip_vs_pe_lock); + ip_vs_use_count_dec(); + pr_err("%s(): [%s] pe already existed " + "in the system\n", __func__, pe->name); + return -EINVAL; + } + } + /* Add it into the d-linked pe list */ + list_add(&pe->n_list, &ip_vs_pe); + spin_unlock_bh(&ip_vs_pe_lock); + + pr_info("[%s] pe registered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(register_ip_vs_pe); + +/* Unregister a pe from the pe list */ +int unregister_ip_vs_pe(struct ip_vs_pe *pe) +{ + spin_lock_bh(&ip_vs_pe_lock); + if (list_empty(&pe->n_list)) { + spin_unlock_bh(&ip_vs_pe_lock); + pr_err("%s(): [%s] pe is not in the list. failed\n", + __func__, pe->name); + return -EINVAL; + } + + /* Remove it from the d-linked pe list */ + list_del(&pe->n_list); + spin_unlock_bh(&ip_vs_pe_lock); + + /* decrease the module use count */ + ip_vs_use_count_dec(); + + pr_info("[%s] pe unregistered.\n", pe->name); + + return 0; +} +EXPORT_SYMBOL_GPL(unregister_ip_vs_pe); -- cgit v1.1 From 0d1e71b04a04b6912e50926b9987c1e72facb1f3 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:54 +0900 Subject: IPVS: Allow configuration of persistence engines Allow the persistence engine of a virtual service to be set, edited and unset. This feature only works with the netlink user-space interface. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_ctl.c | 57 +++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 5f20caf..a697591 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -1134,6 +1134,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, { int ret = 0; struct ip_vs_scheduler *sched = NULL; + struct ip_vs_pe *pe = NULL; struct ip_vs_service *svc = NULL; /* increase the module use count */ @@ -1147,6 +1148,16 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_err; } + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_get(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out_err; + } + } + #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { ret = -EINVAL; @@ -1184,6 +1195,10 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, goto out_err; sched = NULL; + /* Bind the ct retriever */ + ip_vs_bind_pe(svc, pe); + pe = NULL; + /* Update the virtual service counters */ if (svc->port == FTPPORT) atomic_inc(&ip_vs_ftpsvc_counter); @@ -1215,6 +1230,7 @@ ip_vs_add_service(struct ip_vs_service_user_kern *u, kfree(svc); } ip_vs_scheduler_put(sched); + ip_vs_pe_put(pe); /* decrease the module use count */ ip_vs_use_count_dec(); @@ -1230,6 +1246,7 @@ static int ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) { struct ip_vs_scheduler *sched, *old_sched; + struct ip_vs_pe *pe = NULL, *old_pe = NULL; int ret = 0; /* @@ -1242,6 +1259,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } old_sched = sched; + if (u->pe_name && *u->pe_name) { + pe = ip_vs_pe_get(u->pe_name); + if (pe == NULL) { + pr_info("persistence engine module ip_vs_pe_%s " + "not found\n", u->pe_name); + ret = -ENOENT; + goto out; + } + old_pe = pe; + } + #ifdef CONFIG_IP_VS_IPV6 if (u->af == AF_INET6 && (u->netmask < 1 || u->netmask > 128)) { ret = -EINVAL; @@ -1293,12 +1321,17 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u) } } + old_pe = svc->pe; + if (pe != old_pe) { + ip_vs_unbind_pe(svc); + ip_vs_bind_pe(svc, pe); + } + out_unlock: write_unlock_bh(&__ip_vs_svc_lock); -#ifdef CONFIG_IP_VS_IPV6 out: -#endif ip_vs_scheduler_put(old_sched); + ip_vs_pe_put(old_pe); return ret; } @@ -1312,6 +1345,9 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) { struct ip_vs_dest *dest, *nxt; struct ip_vs_scheduler *old_sched; + struct ip_vs_pe *old_pe; + + pr_info("%s: enter\n", __func__); /* Count only IPv4 services for old get/setsockopt interface */ if (svc->af == AF_INET) @@ -1324,6 +1360,11 @@ static void __ip_vs_del_service(struct ip_vs_service *svc) ip_vs_unbind_scheduler(svc); ip_vs_scheduler_put(old_sched); + /* Unbind persistence engine */ + old_pe = svc->pe; + ip_vs_unbind_pe(svc); + ip_vs_pe_put(old_pe); + /* Unbind app inc */ if (svc->inc) { ip_vs_app_inc_put(svc->inc); @@ -2026,6 +2067,8 @@ static const unsigned char set_arglen[SET_CMDID(IP_VS_SO_SET_MAX)+1] = { static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, struct ip_vs_service_user *usvc_compat) { + memset(usvc, 0, sizeof(*usvc)); + usvc->af = AF_INET; usvc->protocol = usvc_compat->protocol; usvc->addr.ip = usvc_compat->addr; @@ -2043,6 +2086,8 @@ static void ip_vs_copy_usvc_compat(struct ip_vs_service_user_kern *usvc, static void ip_vs_copy_udest_compat(struct ip_vs_dest_user_kern *udest, struct ip_vs_dest_user *udest_compat) { + memset(udest, 0, sizeof(*udest)); + udest->addr.ip = udest_compat->addr; udest->port = udest_compat->port; udest->conn_flags = udest_compat->conn_flags; @@ -2539,6 +2584,8 @@ static const struct nla_policy ip_vs_svc_policy[IPVS_SVC_ATTR_MAX + 1] = { [IPVS_SVC_ATTR_FWMARK] = { .type = NLA_U32 }, [IPVS_SVC_ATTR_SCHED_NAME] = { .type = NLA_NUL_STRING, .len = IP_VS_SCHEDNAME_MAXLEN }, + [IPVS_SVC_ATTR_PE_NAME] = { .type = NLA_NUL_STRING, + .len = IP_VS_PENAME_MAXLEN }, [IPVS_SVC_ATTR_FLAGS] = { .type = NLA_BINARY, .len = sizeof(struct ip_vs_flags) }, [IPVS_SVC_ATTR_TIMEOUT] = { .type = NLA_U32 }, @@ -2615,6 +2662,8 @@ static int ip_vs_genl_fill_service(struct sk_buff *skb, } NLA_PUT_STRING(skb, IPVS_SVC_ATTR_SCHED_NAME, svc->scheduler->name); + if (svc->pe) + NLA_PUT_STRING(skb, IPVS_SVC_ATTR_PE_NAME, svc->pe->name); NLA_PUT(skb, IPVS_SVC_ATTR_FLAGS, sizeof(flags), &flags); NLA_PUT_U32(skb, IPVS_SVC_ATTR_TIMEOUT, svc->timeout / HZ); NLA_PUT_U32(skb, IPVS_SVC_ATTR_NETMASK, svc->netmask); @@ -2741,11 +2790,12 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, /* If a full entry was requested, check for the additional fields */ if (full_entry) { - struct nlattr *nla_sched, *nla_flags, *nla_timeout, + struct nlattr *nla_sched, *nla_flags, *nla_pe, *nla_timeout, *nla_netmask; struct ip_vs_flags flags; nla_sched = attrs[IPVS_SVC_ATTR_SCHED_NAME]; + nla_pe = attrs[IPVS_SVC_ATTR_PE_NAME]; nla_flags = attrs[IPVS_SVC_ATTR_FLAGS]; nla_timeout = attrs[IPVS_SVC_ATTR_TIMEOUT]; nla_netmask = attrs[IPVS_SVC_ATTR_NETMASK]; @@ -2763,6 +2813,7 @@ static int ip_vs_genl_parse_service(struct ip_vs_service_user_kern *usvc, usvc->flags = (usvc->flags & ~flags.mask) | (flags.flags & flags.mask); usvc->sched_name = nla_data(nla_sched); + usvc->pe_name = nla_pe ? nla_data(nla_pe) : NULL; usvc->timeout = nla_get_u32(nla_timeout); usvc->netmask = nla_get_u32(nla_netmask); } -- cgit v1.1 From f71499aa11f884255b69ce6c3b3c398c821591a1 Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:54 +0900 Subject: IPVS: Fallback if persistence engine fails Fall back to normal persistence handling if the persistence engine fails to recognise a packet. This way, at least the packet will go somewhere. It is envisaged that iptables could be used to block packets such if this is not desired although nf_conntrack_sip would likely need to be enhanced first. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/ip_vs_conn.c | 6 +++--- net/netfilter/ipvs/ip_vs_core.c | 10 ++++------ 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 4adedef..1d1a529 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -154,7 +154,7 @@ static unsigned int ip_vs_conn_hashkey_param(const struct ip_vs_conn_param *p, const union nf_inet_addr *addr; __be16 port; - if (p->pe && p->pe->hashkey_raw) + if (p->pe_data && p->pe->hashkey_raw) return p->pe->hashkey_raw(p, ip_vs_conn_rnd, inverse) & ip_vs_conn_tab_mask; @@ -353,7 +353,7 @@ struct ip_vs_conn *ip_vs_ct_in_get(const struct ip_vs_conn_param *p) ct_read_lock(hash); list_for_each_entry(cp, &ip_vs_conn_tab[hash], c_list) { - if (p->pe && p->pe->ct_match) { + if (p->pe_data && p->pe->ct_match) { if (p->pe->ct_match(p, cp)) goto out; continue; @@ -956,7 +956,7 @@ static int ip_vs_conn_seq_show(struct seq_file *seq, void *v) char pe_data[IP_VS_PENAME_MAXLEN + IP_VS_PEDATA_MAXLEN + 3]; size_t len = 0; - if (cp->dest && cp->dest->svc->pe && + if (cp->dest && cp->pe_data && cp->dest->svc->pe->show_pe_data) { pe_data[0] = ' '; len = strlen(cp->dest->svc->pe->name); diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index ab98893..e5fef7a 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -176,7 +176,7 @@ ip_vs_set_state(struct ip_vs_conn *cp, int direction, return pp->state_transition(cp, direction, skb, pp); } -static inline int +static inline void ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, struct sk_buff *skb, int protocol, const union nf_inet_addr *caddr, __be16 cport, @@ -186,8 +186,7 @@ ip_vs_conn_fill_param_persist(const struct ip_vs_service *svc, ip_vs_conn_fill_param(svc->af, protocol, caddr, cport, vaddr, vport, p); p->pe = svc->pe; if (p->pe && p->pe->fill_param) - return p->pe->fill_param(p, skb); - return 0; + p->pe->fill_param(p, skb); } /* @@ -268,9 +267,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, vaddr = &fwmark; } } - if (ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, - vaddr, vport, ¶m)) - return NULL; + ip_vs_conn_fill_param_persist(svc, skb, protocol, &snet, 0, + vaddr, vport, ¶m); } /* Check if a template already exists */ -- cgit v1.1 From 758ff03387228824617cef9507e5682488bf9e0c Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Sun, 22 Aug 2010 21:37:55 +0900 Subject: IPVS: sip persistence engine Add the SIP callid as a key for persistence. This allows multiple connections from the same IP address to be differentiated on the basis of the callid. When used in conjunction with the persistence mask, it allows connections from different IP addresses to be aggregated on the basis of the callid. It is envisaged that a persistence mask of 0.0.0.0 will be a useful setting. That is, ignore the source IP address when checking for persistence. It is envisaged that this option will be used in conjunction with one-packet scheduling. This only works with UDP and cannot be made to work with TCP within the current framework. Signed-off-by: Simon Horman Acked-by: Julian Anastasov --- net/netfilter/ipvs/Kconfig | 7 ++ net/netfilter/ipvs/Makefile | 3 + net/netfilter/ipvs/ip_vs_pe_sip.c | 167 ++++++++++++++++++++++++++++++++++++++ 3 files changed, 177 insertions(+) create mode 100644 net/netfilter/ipvs/ip_vs_pe_sip.c (limited to 'net') diff --git a/net/netfilter/ipvs/Kconfig b/net/netfilter/ipvs/Kconfig index af3c9f4..a22dac2 100644 --- a/net/netfilter/ipvs/Kconfig +++ b/net/netfilter/ipvs/Kconfig @@ -256,4 +256,11 @@ config IP_VS_NFCT connection state to be exported to the Netfilter framework for filtering purposes. +config IP_VS_PE_SIP + tristate "SIP persistence engine" + depends on IP_VS_PROTO_UDP + depends on NF_CONNTRACK_SIP + ---help--- + Allow persistence based on the SIP Call-ID + endif # IP_VS diff --git a/net/netfilter/ipvs/Makefile b/net/netfilter/ipvs/Makefile index 4a87bf3..34ee602 100644 --- a/net/netfilter/ipvs/Makefile +++ b/net/netfilter/ipvs/Makefile @@ -35,3 +35,6 @@ obj-$(CONFIG_IP_VS_NQ) += ip_vs_nq.o # IPVS application helpers obj-$(CONFIG_IP_VS_FTP) += ip_vs_ftp.o + +# IPVS connection template retrievers +obj-$(CONFIG_IP_VS_PE_SIP) += ip_vs_pe_sip.o diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c new file mode 100644 index 0000000..a0539f1 --- /dev/null +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -0,0 +1,167 @@ +#define KMSG_COMPONENT "IPVS" +#define pr_fmt(fmt) KMSG_COMPONENT ": " fmt + +#include +#include + +#include +#include +#include + +static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, + const char *callid, size_t callid_len, + int *idx) +{ + size_t len = min(min(callid_len, (size_t)64), buf_len - *idx - 1); + memcpy(buf + *idx, callid, len); + buf[*idx+len] = '\0'; + *idx += len + 1; + return buf + *idx - len; +} + +#define IP_VS_DEBUG_CALLID(callid, len) \ + ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ + callid, len, &ip_vs_dbg_idx) + +static int get_callid(const char *dptr, unsigned int dataoff, + unsigned int datalen, + unsigned int *matchoff, unsigned int *matchlen) +{ + /* Find callid */ + while (1) { + int ret = ct_sip_get_header(NULL, dptr, dataoff, datalen, + SIP_HDR_CALL_ID, matchoff, + matchlen); + if (ret > 0) + break; + if (!ret) + return 0; + dataoff += *matchoff; + } + + /* Empty callid is useless */ + if (!*matchlen) + return -EINVAL; + + /* Too large is useless */ + if (*matchlen > IP_VS_PEDATA_MAXLEN) + return -EINVAL; + + /* SIP headers are always followed by a line terminator */ + if (*matchoff + *matchlen == datalen) + return -EINVAL; + + /* RFC 2543 allows lines to be terminated with CR, LF or CRLF, + * RFC 3261 allows only CRLF, we support both. */ + if (*(dptr + *matchoff + *matchlen) != '\r' && + *(dptr + *matchoff + *matchlen) != '\n') + return -EINVAL; + + IP_VS_DBG_BUF(9, "SIP callid %s (%d bytes)\n", + IP_VS_DEBUG_CALLID(dptr + *matchoff, *matchlen), + *matchlen); + return 0; +} + +static int +ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb) +{ + struct ip_vs_iphdr iph; + unsigned int dataoff, datalen, matchoff, matchlen; + const char *dptr; + + ip_vs_fill_iphdr(p->af, skb_network_header(skb), &iph); + + /* Only useful with UDP */ + if (iph.protocol != IPPROTO_UDP) + return -EINVAL; + + /* No Data ? */ + dataoff = iph.len + sizeof(struct udphdr); + if (dataoff >= skb->len) + return -EINVAL; + + dptr = skb->data + dataoff; + datalen = skb->len - dataoff; + + if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen)) + return -EINVAL; + + p->pe_data = kmalloc(matchlen, GFP_ATOMIC); + if (!p->pe_data) + return -ENOMEM; + + /* N.B: pe_data is only set on success, + * this allows fallback to the default persistence logic on failure + */ + memcpy(p->pe_data, dptr + matchoff, matchlen); + p->pe_data_len = matchlen; + + return 0; +} + +static bool ip_vs_sip_ct_match(const struct ip_vs_conn_param *p, + struct ip_vs_conn *ct) + +{ + bool ret = 0; + + if (ct->af == p->af && + ip_vs_addr_equal(p->af, p->caddr, &ct->caddr) && + /* protocol should only be IPPROTO_IP if + * d_addr is a fwmark */ + ip_vs_addr_equal(p->protocol == IPPROTO_IP ? AF_UNSPEC : p->af, + p->vaddr, &ct->vaddr) && + ct->vport == p->vport && + ct->flags & IP_VS_CONN_F_TEMPLATE && + ct->protocol == p->protocol && + ct->pe_data && ct->pe_data_len == p->pe_data_len && + !memcmp(ct->pe_data, p->pe_data, p->pe_data_len)) + ret = 1; + + IP_VS_DBG_BUF(9, "SIP template match %s %s->%s:%d %s\n", + ip_vs_proto_name(p->protocol), + IP_VS_DEBUG_CALLID(p->pe_data, p->pe_data_len), + IP_VS_DBG_ADDR(p->af, p->vaddr), ntohs(p->vport), + ret ? "hit" : "not hit"); + + return ret; +} + +static u32 ip_vs_sip_hashkey_raw(const struct ip_vs_conn_param *p, + u32 initval, bool inverse) +{ + return jhash(p->pe_data, p->pe_data_len, initval); +} + +static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf) +{ + memcpy(buf, cp->pe_data, cp->pe_data_len); + return cp->pe_data_len; +} + +static struct ip_vs_pe ip_vs_sip_pe = +{ + .name = "sip", + .refcnt = ATOMIC_INIT(0), + .module = THIS_MODULE, + .n_list = LIST_HEAD_INIT(ip_vs_sip_pe.n_list), + .fill_param = ip_vs_sip_fill_param, + .ct_match = ip_vs_sip_ct_match, + .hashkey_raw = ip_vs_sip_hashkey_raw, + .show_pe_data = ip_vs_sip_show_pe_data, +}; + +static int __init ip_vs_sip_init(void) +{ + return register_ip_vs_pe(&ip_vs_sip_pe); +} + +static void __exit ip_vs_sip_cleanup(void) +{ + unregister_ip_vs_pe(&ip_vs_sip_pe); +} + +module_init(ip_vs_sip_init); +module_exit(ip_vs_sip_cleanup); +MODULE_LICENSE("GPL"); -- cgit v1.1 From 0c200d935346fe0ebde9b6dffbb683dddd166fb9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 4 Oct 2010 20:53:18 +0200 Subject: netfilter: nf_nat: make find/put static The functions nf_nat_proto_find_get and nf_nat_proto_put are only used internally in nf_nat_core. This might break some out of tree NAT module. Signed-off-by: Stephen Hemminger Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/nf_nat_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index 2c084b3..e2e00c4 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -47,7 +47,7 @@ __nf_nat_proto_find(u_int8_t protonum) return rcu_dereference(nf_nat_protos[protonum]); } -const struct nf_nat_protocol * +static const struct nf_nat_protocol * nf_nat_proto_find_get(u_int8_t protonum) { const struct nf_nat_protocol *p; @@ -60,14 +60,12 @@ nf_nat_proto_find_get(u_int8_t protonum) return p; } -EXPORT_SYMBOL_GPL(nf_nat_proto_find_get); -void +static void nf_nat_proto_put(const struct nf_nat_protocol *p) { module_put(p->me); } -EXPORT_SYMBOL_GPL(nf_nat_proto_put); /* We keep an extra hash for each conntrack, for fast searching. */ static inline unsigned int -- cgit v1.1 From a8defca048fd11eb2d1a17ab61a60a856292dd4e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 20:56:05 +0200 Subject: netfilter: ipt_LOG: add bufferisation to call printk() once ipt_LOG & ip6t_LOG use lot of calls to printk() and use a lock in a hope several cpus wont mix their output in syslog. printk() being very expensive [1], its better to call it once, on a prebuilt and complete line. Also, with mixed IPv4 and IPv6 trafic, separate IPv4/IPv6 locks dont avoid garbage. I used an allocation of a 1024 bytes structure, sort of seq_printf() but with a fixed size limit. Use a static buffer if dynamic allocation failed. Emit a once time alert if buffer size happens to be too short. [1]: printk() has various features like printk_delay()... Signed-off-by: Eric Dumazet Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/ipt_LOG.c | 145 +++++++++++++++++++------------------- net/ipv6/netfilter/ip6t_LOG.c | 157 +++++++++++++++++++++--------------------- 2 files changed, 152 insertions(+), 150 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/ipt_LOG.c b/net/ipv4/netfilter/ipt_LOG.c index 915fc17..72ffc8f 100644 --- a/net/ipv4/netfilter/ipt_LOG.c +++ b/net/ipv4/netfilter/ipt_LOG.c @@ -24,16 +24,15 @@ #include #include #include +#include MODULE_LICENSE("GPL"); MODULE_AUTHOR("Netfilter Core Team "); MODULE_DESCRIPTION("Xtables: IPv4 packet logging to syslog"); -/* Use lock to serialize, so printks don't overlap */ -static DEFINE_SPINLOCK(log_lock); - /* One level of recursion won't kill us */ -static void dump_packet(const struct nf_loginfo *info, +static void dump_packet(struct sbuff *m, + const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int iphoff) { @@ -48,32 +47,32 @@ static void dump_packet(const struct nf_loginfo *info, ih = skb_header_pointer(skb, iphoff, sizeof(_iph), &_iph); if (ih == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Important fields: * TOS, len, DF/MF, fragment offset, TTL, src, dst, options. */ /* Max length: 40 "SRC=255.255.255.255 DST=255.255.255.255 " */ - printk("SRC=%pI4 DST=%pI4 ", + sb_add(m, "SRC=%pI4 DST=%pI4 ", &ih->saddr, &ih->daddr); /* Max length: 46 "LEN=65535 TOS=0xFF PREC=0xFF TTL=255 ID=65535 " */ - printk("LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", + sb_add(m, "LEN=%u TOS=0x%02X PREC=0x%02X TTL=%u ID=%u ", ntohs(ih->tot_len), ih->tos & IPTOS_TOS_MASK, ih->tos & IPTOS_PREC_MASK, ih->ttl, ntohs(ih->id)); /* Max length: 6 "CE DF MF " */ if (ntohs(ih->frag_off) & IP_CE) - printk("CE "); + sb_add(m, "CE "); if (ntohs(ih->frag_off) & IP_DF) - printk("DF "); + sb_add(m, "DF "); if (ntohs(ih->frag_off) & IP_MF) - printk("MF "); + sb_add(m, "MF "); /* Max length: 11 "FRAG:65535 " */ if (ntohs(ih->frag_off) & IP_OFFSET) - printk("FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); + sb_add(m, "FRAG:%u ", ntohs(ih->frag_off) & IP_OFFSET); if ((logflags & IPT_LOG_IPOPT) && ih->ihl * 4 > sizeof(struct iphdr)) { @@ -85,15 +84,15 @@ static void dump_packet(const struct nf_loginfo *info, op = skb_header_pointer(skb, iphoff+sizeof(_iph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + sb_add(m, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); } switch (ih->protocol) { @@ -102,7 +101,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + sb_add(m, "PROTO=TCP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -111,41 +110,41 @@ static void dump_packet(const struct nf_loginfo *info, th = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IPT_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + sb_add(m, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3F " */ - printk("RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + sb_add(m, "RES=0x%02x ", (u8)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + sb_add(m, "CWR "); if (th->ece) - printk("ECE "); + sb_add(m, "ECE "); if (th->urg) - printk("URG "); + sb_add(m, "URG "); if (th->ack) - printk("ACK "); + sb_add(m, "ACK "); if (th->psh) - printk("PSH "); + sb_add(m, "PSH "); if (th->rst) - printk("RST "); + sb_add(m, "RST "); if (th->syn) - printk("SYN "); + sb_add(m, "SYN "); if (th->fin) - printk("FIN "); + sb_add(m, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IPT_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -158,15 +157,15 @@ static void dump_packet(const struct nf_loginfo *info, iphoff+ih->ihl*4+sizeof(_tcph), optsize, _opt); if (op == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + sb_add(m, "OPT ("); for (i = 0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); } break; } @@ -177,9 +176,9 @@ static void dump_packet(const struct nf_loginfo *info, if (ih->protocol == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + sb_add(m, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + sb_add(m, "PROTO=UDPLITE "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -188,13 +187,13 @@ static void dump_packet(const struct nf_loginfo *info, uh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -221,7 +220,7 @@ static void dump_packet(const struct nf_loginfo *info, [ICMP_ADDRESSREPLY] = 12 }; /* Max length: 11 "PROTO=ICMP " */ - printk("PROTO=ICMP "); + sb_add(m, "PROTO=ICMP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -230,19 +229,19 @@ static void dump_packet(const struct nf_loginfo *info, ich = skb_header_pointer(skb, iphoff + ih->ihl * 4, sizeof(_icmph), &_icmph); if (ich == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ich->type, ich->code); + sb_add(m, "TYPE=%u CODE=%u ", ich->type, ich->code); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ if (ich->type <= NR_ICMP_TYPES && required_len[ich->type] && skb->len-iphoff-ih->ihl*4 < required_len[ich->type]) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } @@ -251,35 +250,35 @@ static void dump_packet(const struct nf_loginfo *info, case ICMP_ECHOREPLY: case ICMP_ECHO: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + sb_add(m, "ID=%u SEQ=%u ", ntohs(ich->un.echo.id), ntohs(ich->un.echo.sequence)); break; case ICMP_PARAMETERPROB: /* Max length: 14 "PARAMETER=255 " */ - printk("PARAMETER=%u ", + sb_add(m, "PARAMETER=%u ", ntohl(ich->un.gateway) >> 24); break; case ICMP_REDIRECT: /* Max length: 24 "GATEWAY=255.255.255.255 " */ - printk("GATEWAY=%pI4 ", &ich->un.gateway); + sb_add(m, "GATEWAY=%pI4 ", &ich->un.gateway); /* Fall through */ case ICMP_DEST_UNREACH: case ICMP_SOURCE_QUENCH: case ICMP_TIME_EXCEEDED: /* Max length: 3+maxlen */ if (!iphoff) { /* Only recurse once. */ - printk("["); - dump_packet(info, skb, + sb_add(m, "["); + dump_packet(m, info, skb, iphoff + ih->ihl*4+sizeof(_icmph)); - printk("] "); + sb_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ich->type == ICMP_DEST_UNREACH && ich->code == ICMP_FRAG_NEEDED) - printk("MTU=%u ", ntohs(ich->un.frag.mtu)); + sb_add(m, "MTU=%u ", ntohs(ich->un.frag.mtu)); } break; } @@ -292,19 +291,19 @@ static void dump_packet(const struct nf_loginfo *info, break; /* Max length: 9 "PROTO=AH " */ - printk("PROTO=AH "); + sb_add(m, "PROTO=AH "); /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ah = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_ahdr), &_ahdr); if (ah == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(ah->spi)); + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); break; } case IPPROTO_ESP: { @@ -312,7 +311,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct ip_esp_hdr *eh; /* Max length: 10 "PROTO=ESP " */ - printk("PROTO=ESP "); + sb_add(m, "PROTO=ESP "); if (ntohs(ih->frag_off) & IP_OFFSET) break; @@ -321,25 +320,25 @@ static void dump_packet(const struct nf_loginfo *info, eh = skb_header_pointer(skb, iphoff+ih->ihl*4, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] ", + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - iphoff - ih->ihl*4); break; } /* Length: 15 "SPI=0xF1234567 " */ - printk("SPI=0x%x ", ntohl(eh->spi)); + sb_add(m, "SPI=0x%x ", ntohl(eh->spi)); break; } /* Max length: 10 "PROTO 255 " */ default: - printk("PROTO=%u ", ih->protocol); + sb_add(m, "PROTO=%u ", ih->protocol); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IPT_LOG_UID) && !iphoff && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", + sb_add(m, "UID=%u GID=%u ", skb->sk->sk_socket->file->f_cred->fsuid, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); @@ -347,7 +346,7 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!iphoff && skb->mark) - printk("MARK=0x%x ", skb->mark); + sb_add(m, "MARK=0x%x ", skb->mark); /* Proto Max log string length */ /* IP: 40+46+6+11+127 = 230 */ @@ -364,7 +363,8 @@ static void dump_packet(const struct nf_loginfo *info, /* maxlen = 230+ 91 + 230 + 252 = 803 */ } -static void dump_mac_header(const struct nf_loginfo *info, +static void dump_mac_header(struct sbuff *m, + const struct nf_loginfo *info, const struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -378,7 +378,7 @@ static void dump_mac_header(const struct nf_loginfo *info, switch (dev->type) { case ARPHRD_ETHER: - printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, ntohs(eth_hdr(skb)->h_proto)); return; @@ -387,17 +387,17 @@ static void dump_mac_header(const struct nf_loginfo *info, } fallback: - printk("MAC="); + sb_add(m, "MAC="); if (dev->hard_header_len && skb->mac_header != skb->network_header) { const unsigned char *p = skb_mac_header(skb); unsigned int i; - printk("%02x", *p++); + sb_add(m, "%02x", *p++); for (i = 1; i < dev->hard_header_len; i++, p++) - printk(":%02x", *p); + sb_add(m, ":%02x", *p); } - printk(" "); + sb_add(m, " "); } static struct nf_loginfo default_loginfo = { @@ -419,11 +419,12 @@ ipt_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { + struct sbuff *m = sb_open(); + if (!loginfo) loginfo = &default_loginfo; - spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, prefix, in ? in->name : "", out ? out->name : ""); @@ -434,20 +435,20 @@ ipt_log_packet(u_int8_t pf, physindev = skb->nf_bridge->physindev; if (physindev && in != physindev) - printk("PHYSIN=%s ", physindev->name); + sb_add(m, "PHYSIN=%s ", physindev->name); physoutdev = skb->nf_bridge->physoutdev; if (physoutdev && out != physoutdev) - printk("PHYSOUT=%s ", physoutdev->name); + sb_add(m, "PHYSOUT=%s ", physoutdev->name); } #endif /* MAC logging for input path only. */ if (in && !out) - dump_mac_header(loginfo, skb); + dump_mac_header(m, loginfo, skb); + + dump_packet(m, loginfo, skb, 0); - dump_packet(loginfo, skb, 0); - printk("\n"); - spin_unlock_bh(&log_lock); + sb_close(m); } static unsigned int diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0a07ae7..09c8889 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -23,6 +23,7 @@ #include #include #include +#include MODULE_AUTHOR("Jan Rekorajski "); MODULE_DESCRIPTION("Xtables: IPv6 packet logging to syslog"); @@ -32,11 +33,9 @@ struct in_device; #include #include -/* Use lock to serialize, so printks don't overlap */ -static DEFINE_SPINLOCK(log_lock); - /* One level of recursion won't kill us */ -static void dump_packet(const struct nf_loginfo *info, +static void dump_packet(struct sbuff *m, + const struct nf_loginfo *info, const struct sk_buff *skb, unsigned int ip6hoff, int recurse) { @@ -55,15 +54,15 @@ static void dump_packet(const struct nf_loginfo *info, ih = skb_header_pointer(skb, ip6hoff, sizeof(_ip6h), &_ip6h); if (ih == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Max length: 88 "SRC=0000.0000.0000.0000.0000.0000.0000.0000 DST=0000.0000.0000.0000.0000.0000.0000.0000 " */ - printk("SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); + sb_add(m, "SRC=%pI6 DST=%pI6 ", &ih->saddr, &ih->daddr); /* Max length: 44 "LEN=65535 TC=255 HOPLIMIT=255 FLOWLBL=FFFFF " */ - printk("LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", + sb_add(m, "LEN=%Zu TC=%u HOPLIMIT=%u FLOWLBL=%u ", ntohs(ih->payload_len) + sizeof(struct ipv6hdr), (ntohl(*(__be32 *)ih) & 0x0ff00000) >> 20, ih->hop_limit, @@ -78,35 +77,35 @@ static void dump_packet(const struct nf_loginfo *info, hp = skb_header_pointer(skb, ptr, sizeof(_hdr), &_hdr); if (hp == NULL) { - printk("TRUNCATED"); + sb_add(m, "TRUNCATED"); return; } /* Max length: 48 "OPT (...) " */ if (logflags & IP6T_LOG_IPOPT) - printk("OPT ( "); + sb_add(m, "OPT ( "); switch (currenthdr) { case IPPROTO_FRAGMENT: { struct frag_hdr _fhdr; const struct frag_hdr *fh; - printk("FRAG:"); + sb_add(m, "FRAG:"); fh = skb_header_pointer(skb, ptr, sizeof(_fhdr), &_fhdr); if (fh == NULL) { - printk("TRUNCATED "); + sb_add(m, "TRUNCATED "); return; } /* Max length: 6 "65535 " */ - printk("%u ", ntohs(fh->frag_off) & 0xFFF8); + sb_add(m, "%u ", ntohs(fh->frag_off) & 0xFFF8); /* Max length: 11 "INCOMPLETE " */ if (fh->frag_off & htons(0x0001)) - printk("INCOMPLETE "); + sb_add(m, "INCOMPLETE "); - printk("ID:%08x ", ntohl(fh->identification)); + sb_add(m, "ID:%08x ", ntohl(fh->identification)); if (ntohs(fh->frag_off) & 0xFFF8) fragment = 1; @@ -120,7 +119,7 @@ static void dump_packet(const struct nf_loginfo *info, case IPPROTO_HOPOPTS: if (fragment) { if (logflags & IP6T_LOG_IPOPT) - printk(")"); + sb_add(m, ")"); return; } hdrlen = ipv6_optlen(hp); @@ -132,10 +131,10 @@ static void dump_packet(const struct nf_loginfo *info, const struct ip_auth_hdr *ah; /* Max length: 3 "AH " */ - printk("AH "); + sb_add(m, "AH "); if (fragment) { - printk(")"); + sb_add(m, ")"); return; } @@ -146,13 +145,13 @@ static void dump_packet(const struct nf_loginfo *info, * Max length: 26 "INCOMPLETE [65535 * bytes] )" */ - printk("INCOMPLETE [%u bytes] )", + sb_add(m, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 15 "SPI=0xF1234567 */ - printk("SPI=0x%x ", ntohl(ah->spi)); + sb_add(m, "SPI=0x%x ", ntohl(ah->spi)); } @@ -164,10 +163,10 @@ static void dump_packet(const struct nf_loginfo *info, const struct ip_esp_hdr *eh; /* Max length: 4 "ESP " */ - printk("ESP "); + sb_add(m, "ESP "); if (fragment) { - printk(")"); + sb_add(m, ")"); return; } @@ -177,23 +176,23 @@ static void dump_packet(const struct nf_loginfo *info, eh = skb_header_pointer(skb, ptr, sizeof(_esph), &_esph); if (eh == NULL) { - printk("INCOMPLETE [%u bytes] )", + sb_add(m, "INCOMPLETE [%u bytes] )", skb->len - ptr); return; } /* Length: 16 "SPI=0xF1234567 )" */ - printk("SPI=0x%x )", ntohl(eh->spi) ); + sb_add(m, "SPI=0x%x )", ntohl(eh->spi) ); } return; default: /* Max length: 20 "Unknown Ext Hdr 255" */ - printk("Unknown Ext Hdr %u", currenthdr); + sb_add(m, "Unknown Ext Hdr %u", currenthdr); return; } if (logflags & IP6T_LOG_IPOPT) - printk(") "); + sb_add(m, ") "); currenthdr = hp->nexthdr; ptr += hdrlen; @@ -205,7 +204,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct tcphdr *th; /* Max length: 10 "PROTO=TCP " */ - printk("PROTO=TCP "); + sb_add(m, "PROTO=TCP "); if (fragment) break; @@ -213,40 +212,40 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 25 "INCOMPLETE [65535 bytes] " */ th = skb_header_pointer(skb, ptr, sizeof(_tcph), &_tcph); if (th == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u ", + sb_add(m, "SPT=%u DPT=%u ", ntohs(th->source), ntohs(th->dest)); /* Max length: 30 "SEQ=4294967295 ACK=4294967295 " */ if (logflags & IP6T_LOG_TCPSEQ) - printk("SEQ=%u ACK=%u ", + sb_add(m, "SEQ=%u ACK=%u ", ntohl(th->seq), ntohl(th->ack_seq)); /* Max length: 13 "WINDOW=65535 " */ - printk("WINDOW=%u ", ntohs(th->window)); + sb_add(m, "WINDOW=%u ", ntohs(th->window)); /* Max length: 9 "RES=0x3C " */ - printk("RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); + sb_add(m, "RES=0x%02x ", (u_int8_t)(ntohl(tcp_flag_word(th) & TCP_RESERVED_BITS) >> 22)); /* Max length: 32 "CWR ECE URG ACK PSH RST SYN FIN " */ if (th->cwr) - printk("CWR "); + sb_add(m, "CWR "); if (th->ece) - printk("ECE "); + sb_add(m, "ECE "); if (th->urg) - printk("URG "); + sb_add(m, "URG "); if (th->ack) - printk("ACK "); + sb_add(m, "ACK "); if (th->psh) - printk("PSH "); + sb_add(m, "PSH "); if (th->rst) - printk("RST "); + sb_add(m, "RST "); if (th->syn) - printk("SYN "); + sb_add(m, "SYN "); if (th->fin) - printk("FIN "); + sb_add(m, "FIN "); /* Max length: 11 "URGP=65535 " */ - printk("URGP=%u ", ntohs(th->urg_ptr)); + sb_add(m, "URGP=%u ", ntohs(th->urg_ptr)); if ((logflags & IP6T_LOG_TCPOPT) && th->doff * 4 > sizeof(struct tcphdr)) { @@ -260,15 +259,15 @@ static void dump_packet(const struct nf_loginfo *info, ptr + sizeof(struct tcphdr), optsize, _opt); if (op == NULL) { - printk("OPT (TRUNCATED)"); + sb_add(m, "OPT (TRUNCATED)"); return; } /* Max length: 127 "OPT (" 15*4*2chars ") " */ - printk("OPT ("); + sb_add(m, "OPT ("); for (i =0; i < optsize; i++) - printk("%02X", op[i]); - printk(") "); + sb_add(m, "%02X", op[i]); + sb_add(m, ") "); } break; } @@ -279,9 +278,9 @@ static void dump_packet(const struct nf_loginfo *info, if (currenthdr == IPPROTO_UDP) /* Max length: 10 "PROTO=UDP " */ - printk("PROTO=UDP " ); + sb_add(m, "PROTO=UDP " ); else /* Max length: 14 "PROTO=UDPLITE " */ - printk("PROTO=UDPLITE "); + sb_add(m, "PROTO=UDPLITE "); if (fragment) break; @@ -289,12 +288,12 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 25 "INCOMPLETE [65535 bytes] " */ uh = skb_header_pointer(skb, ptr, sizeof(_udph), &_udph); if (uh == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 20 "SPT=65535 DPT=65535 " */ - printk("SPT=%u DPT=%u LEN=%u ", + sb_add(m, "SPT=%u DPT=%u LEN=%u ", ntohs(uh->source), ntohs(uh->dest), ntohs(uh->len)); break; @@ -304,7 +303,7 @@ static void dump_packet(const struct nf_loginfo *info, const struct icmp6hdr *ic; /* Max length: 13 "PROTO=ICMPv6 " */ - printk("PROTO=ICMPv6 "); + sb_add(m, "PROTO=ICMPv6 "); if (fragment) break; @@ -312,18 +311,18 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 25 "INCOMPLETE [65535 bytes] " */ ic = skb_header_pointer(skb, ptr, sizeof(_icmp6h), &_icmp6h); if (ic == NULL) { - printk("INCOMPLETE [%u bytes] ", skb->len - ptr); + sb_add(m, "INCOMPLETE [%u bytes] ", skb->len - ptr); return; } /* Max length: 18 "TYPE=255 CODE=255 " */ - printk("TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); + sb_add(m, "TYPE=%u CODE=%u ", ic->icmp6_type, ic->icmp6_code); switch (ic->icmp6_type) { case ICMPV6_ECHO_REQUEST: case ICMPV6_ECHO_REPLY: /* Max length: 19 "ID=65535 SEQ=65535 " */ - printk("ID=%u SEQ=%u ", + sb_add(m, "ID=%u SEQ=%u ", ntohs(ic->icmp6_identifier), ntohs(ic->icmp6_sequence)); break; @@ -334,35 +333,35 @@ static void dump_packet(const struct nf_loginfo *info, case ICMPV6_PARAMPROB: /* Max length: 17 "POINTER=ffffffff " */ - printk("POINTER=%08x ", ntohl(ic->icmp6_pointer)); + sb_add(m, "POINTER=%08x ", ntohl(ic->icmp6_pointer)); /* Fall through */ case ICMPV6_DEST_UNREACH: case ICMPV6_PKT_TOOBIG: case ICMPV6_TIME_EXCEED: /* Max length: 3+maxlen */ if (recurse) { - printk("["); - dump_packet(info, skb, ptr + sizeof(_icmp6h), - 0); - printk("] "); + sb_add(m, "["); + dump_packet(m, info, skb, + ptr + sizeof(_icmp6h), 0); + sb_add(m, "] "); } /* Max length: 10 "MTU=65535 " */ if (ic->icmp6_type == ICMPV6_PKT_TOOBIG) - printk("MTU=%u ", ntohl(ic->icmp6_mtu)); + sb_add(m, "MTU=%u ", ntohl(ic->icmp6_mtu)); } break; } /* Max length: 10 "PROTO=255 " */ default: - printk("PROTO=%u ", currenthdr); + sb_add(m, "PROTO=%u ", currenthdr); } /* Max length: 15 "UID=4294967295 " */ if ((logflags & IP6T_LOG_UID) && recurse && skb->sk) { read_lock_bh(&skb->sk->sk_callback_lock); if (skb->sk->sk_socket && skb->sk->sk_socket->file) - printk("UID=%u GID=%u ", + sb_add(m, "UID=%u GID=%u ", skb->sk->sk_socket->file->f_cred->fsuid, skb->sk->sk_socket->file->f_cred->fsgid); read_unlock_bh(&skb->sk->sk_callback_lock); @@ -370,10 +369,11 @@ static void dump_packet(const struct nf_loginfo *info, /* Max length: 16 "MARK=0xFFFFFFFF " */ if (!recurse && skb->mark) - printk("MARK=0x%x ", skb->mark); + sb_add(m, "MARK=0x%x ", skb->mark); } -static void dump_mac_header(const struct nf_loginfo *info, +static void dump_mac_header(struct sbuff *m, + const struct nf_loginfo *info, const struct sk_buff *skb) { struct net_device *dev = skb->dev; @@ -387,7 +387,7 @@ static void dump_mac_header(const struct nf_loginfo *info, switch (dev->type) { case ARPHRD_ETHER: - printk("MACSRC=%pM MACDST=%pM MACPROTO=%04x ", + sb_add(m, "MACSRC=%pM MACDST=%pM MACPROTO=%04x ", eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest, ntohs(eth_hdr(skb)->h_proto)); return; @@ -396,7 +396,7 @@ static void dump_mac_header(const struct nf_loginfo *info, } fallback: - printk("MAC="); + sb_add(m, "MAC="); if (dev->hard_header_len && skb->mac_header != skb->network_header) { const unsigned char *p = skb_mac_header(skb); @@ -408,19 +408,19 @@ fallback: p = NULL; if (p != NULL) { - printk("%02x", *p++); + sb_add(m, "%02x", *p++); for (i = 1; i < len; i++) - printk(":%02x", p[i]); + sb_add(m, ":%02x", p[i]); } - printk(" "); + sb_add(m, " "); if (dev->type == ARPHRD_SIT) { const struct iphdr *iph = (struct iphdr *)skb_mac_header(skb); - printk("TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr); + sb_add(m, "TUNNEL=%pI4->%pI4 ", &iph->saddr, &iph->daddr); } } else - printk(" "); + sb_add(m, " "); } static struct nf_loginfo default_loginfo = { @@ -442,22 +442,23 @@ ip6t_log_packet(u_int8_t pf, const struct nf_loginfo *loginfo, const char *prefix) { + struct sbuff *m = sb_open(); + if (!loginfo) loginfo = &default_loginfo; - spin_lock_bh(&log_lock); - printk("<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, - prefix, - in ? in->name : "", - out ? out->name : ""); + sb_add(m, "<%d>%sIN=%s OUT=%s ", loginfo->u.log.level, + prefix, + in ? in->name : "", + out ? out->name : ""); /* MAC logging for input path only. */ if (in && !out) - dump_mac_header(loginfo, skb); + dump_mac_header(m, loginfo, skb); + + dump_packet(m, loginfo, skb, skb_network_offset(skb), 1); - dump_packet(loginfo, skb, skb_network_offset(skb), 1); - printk("\n"); - spin_unlock_bh(&log_lock); + sb_close(m); } static unsigned int -- cgit v1.1 From e55df53dd62c73185af46fb6ffa7074b05ceefc4 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Mon, 4 Oct 2010 21:00:42 +0200 Subject: netfilter: remove duplicated include Remove duplicated include. Signed-off-by: Nicolas Kaiser Signed-off-by: Patrick McHardy --- net/netfilter/xt_ipvs.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/xt_ipvs.c b/net/netfilter/xt_ipvs.c index 7a4d66db..9127a3d 100644 --- a/net/netfilter/xt_ipvs.c +++ b/net/netfilter/xt_ipvs.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include -- cgit v1.1 From f68c53015c5b9aa98ffd87a34009f89bdbbd7160 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Mon, 4 Oct 2010 22:24:12 +0200 Subject: netfilter: unregister nf hooks, matches and targets in the reverse order Since we register nf hooks, matches and targets in order, we'd better unregister them in the reverse order. Signed-off-by: Changli Gao Signed-off-by: Patrick McHardy --- net/netfilter/core.c | 6 ++---- net/netfilter/x_tables.c | 12 ++++-------- 2 files changed, 6 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/netfilter/core.c b/net/netfilter/core.c index 78b505d..8f014f2 100644 --- a/net/netfilter/core.c +++ b/net/netfilter/core.c @@ -105,10 +105,8 @@ EXPORT_SYMBOL(nf_register_hooks); void nf_unregister_hooks(struct nf_hook_ops *reg, unsigned int n) { - unsigned int i; - - for (i = 0; i < n; i++) - nf_unregister_hook(®[i]); + while (n-- > 0) + nf_unregister_hook(®[n]); } EXPORT_SYMBOL(nf_unregister_hooks); diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index e34622f..80463507 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -116,10 +116,8 @@ EXPORT_SYMBOL(xt_register_targets); void xt_unregister_targets(struct xt_target *target, unsigned int n) { - unsigned int i; - - for (i = 0; i < n; i++) - xt_unregister_target(&target[i]); + while (n-- > 0) + xt_unregister_target(&target[n]); } EXPORT_SYMBOL(xt_unregister_targets); @@ -174,10 +172,8 @@ EXPORT_SYMBOL(xt_register_matches); void xt_unregister_matches(struct xt_match *match, unsigned int n) { - unsigned int i; - - for (i = 0; i < n; i++) - xt_unregister_match(&match[i]); + while (n-- > 0) + xt_unregister_match(&match[n]); } EXPORT_SYMBOL(xt_unregister_matches); -- cgit v1.1 From 24824a09e35402b8d58dcc5be803a5ad3937bdba Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Sat, 2 Oct 2010 06:11:55 +0000 Subject: net: dynamic ingress_queue allocation ingress being not used very much, and net_device->ingress_queue being quite a big object (128 or 256 bytes), use a dynamic allocation if needed (tc qdisc add dev eth0 ingress ...) dev_ingress_queue(dev) helper should be used only with RTNL taken. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 34 ++++++++++++++++++++++++++-------- net/sched/sch_api.c | 42 ++++++++++++++++++++++++++++-------------- net/sched/sch_generic.c | 12 ++++++++---- 3 files changed, 62 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index a313bab..ce6ad88 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2702,11 +2702,10 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook); * the ingress scheduler, you just cant add policies on ingress. * */ -static int ing_filter(struct sk_buff *skb) +static int ing_filter(struct sk_buff *skb, struct netdev_queue *rxq) { struct net_device *dev = skb->dev; u32 ttl = G_TC_RTTL(skb->tc_verd); - struct netdev_queue *rxq; int result = TC_ACT_OK; struct Qdisc *q; @@ -2720,8 +2719,6 @@ static int ing_filter(struct sk_buff *skb) skb->tc_verd = SET_TC_RTTL(skb->tc_verd, ttl); skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_INGRESS); - rxq = &dev->ingress_queue; - q = rxq->qdisc; if (q != &noop_qdisc) { spin_lock(qdisc_lock(q)); @@ -2737,7 +2734,9 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, struct net_device *orig_dev) { - if (skb->dev->ingress_queue.qdisc == &noop_qdisc) + struct netdev_queue *rxq = rcu_dereference(skb->dev->ingress_queue); + + if (!rxq || rxq->qdisc == &noop_qdisc) goto out; if (*pt_prev) { @@ -2745,7 +2744,7 @@ static inline struct sk_buff *handle_ing(struct sk_buff *skb, *pt_prev = NULL; } - switch (ing_filter(skb)) { + switch (ing_filter(skb, rxq)) { case TC_ACT_SHOT: case TC_ACT_STOLEN: kfree_skb(skb); @@ -4940,7 +4939,6 @@ static void __netdev_init_queue_locks_one(struct net_device *dev, static void netdev_init_queue_locks(struct net_device *dev) { netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); - __netdev_init_queue_locks_one(dev, &dev->ingress_queue, NULL); } unsigned long netdev_fix_features(unsigned long features, const char *name) @@ -5452,11 +5450,29 @@ static void netdev_init_one_queue(struct net_device *dev, static void netdev_init_queues(struct net_device *dev) { - netdev_init_one_queue(dev, &dev->ingress_queue, NULL); netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); spin_lock_init(&dev->tx_global_lock); } +struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) +{ + struct netdev_queue *queue = dev_ingress_queue(dev); + +#ifdef CONFIG_NET_CLS_ACT + if (queue) + return queue; + queue = kzalloc(sizeof(*queue), GFP_KERNEL); + if (!queue) + return NULL; + netdev_init_one_queue(dev, queue, NULL); + __netdev_init_queue_locks_one(dev, queue, NULL); + queue->qdisc = &noop_qdisc; + queue->qdisc_sleeping = &noop_qdisc; + rcu_assign_pointer(dev->ingress_queue, queue); +#endif + return queue; +} + /** * alloc_netdev_mq - allocate network device * @sizeof_priv: size of private data to allocate space for @@ -5559,6 +5575,8 @@ void free_netdev(struct net_device *dev) kfree(dev->_tx); + kfree(rcu_dereference_raw(dev->ingress_queue)); + /* Flush device addresses */ dev_addr_flush(dev); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index b802078..b22ca2d 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -240,7 +240,10 @@ struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle) if (q) goto out; - q = qdisc_match_from_root(dev->ingress_queue.qdisc_sleeping, handle); + if (dev_ingress_queue(dev)) + q = qdisc_match_from_root( + dev_ingress_queue(dev)->qdisc_sleeping, + handle); out: return q; } @@ -690,6 +693,8 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, (new && new->flags & TCQ_F_INGRESS)) { num_q = 1; ingress = 1; + if (!dev_ingress_queue(dev)) + return -ENOENT; } if (dev->flags & IFF_UP) @@ -701,7 +706,7 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, } for (i = 0; i < num_q; i++) { - struct netdev_queue *dev_queue = &dev->ingress_queue; + struct netdev_queue *dev_queue = dev_ingress_queue(dev); if (!ingress) dev_queue = netdev_get_tx_queue(dev, i); @@ -979,7 +984,8 @@ static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n, void *arg) return -ENOENT; q = qdisc_leaf(p, clid); } else { /* ingress */ - q = dev->ingress_queue.qdisc_sleeping; + if (dev_ingress_queue(dev)) + q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { q = dev->qdisc; @@ -1043,8 +1049,9 @@ replay: if ((p = qdisc_lookup(dev, TC_H_MAJ(clid))) == NULL) return -ENOENT; q = qdisc_leaf(p, clid); - } else { /*ingress */ - q = dev->ingress_queue.qdisc_sleeping; + } else { /* ingress */ + if (dev_ingress_queue_create(dev)) + q = dev_ingress_queue(dev)->qdisc_sleeping; } } else { q = dev->qdisc; @@ -1123,11 +1130,14 @@ replay: create_n_graft: if (!(n->nlmsg_flags&NLM_F_CREATE)) return -ENOENT; - if (clid == TC_H_INGRESS) - q = qdisc_create(dev, &dev->ingress_queue, p, - tcm->tcm_parent, tcm->tcm_parent, - tca, &err); - else { + if (clid == TC_H_INGRESS) { + if (dev_ingress_queue(dev)) + q = qdisc_create(dev, dev_ingress_queue(dev), p, + tcm->tcm_parent, tcm->tcm_parent, + tca, &err); + else + err = -ENOENT; + } else { struct netdev_queue *dev_queue; if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue) @@ -1304,8 +1314,10 @@ static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb) if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx) < 0) goto done; - dev_queue = &dev->ingress_queue; - if (tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, &q_idx, s_q_idx) < 0) + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb, + &q_idx, s_q_idx) < 0) goto done; cont: @@ -1595,8 +1607,10 @@ static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb) if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t) < 0) goto done; - dev_queue = &dev->ingress_queue; - if (tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, &t, s_t) < 0) + dev_queue = dev_ingress_queue(dev); + if (dev_queue && + tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb, + &t, s_t) < 0) goto done; done: diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 545278a..3d57681 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -753,7 +753,8 @@ void dev_activate(struct net_device *dev) need_watchdog = 0; netdev_for_each_tx_queue(dev, transition_one_qdisc, &need_watchdog); - transition_one_qdisc(dev, &dev->ingress_queue, NULL); + if (dev_ingress_queue(dev)) + transition_one_qdisc(dev, dev_ingress_queue(dev), NULL); if (need_watchdog) { dev->trans_start = jiffies; @@ -812,7 +813,8 @@ static bool some_qdisc_is_busy(struct net_device *dev) void dev_deactivate(struct net_device *dev) { netdev_for_each_tx_queue(dev, dev_deactivate_queue, &noop_qdisc); - dev_deactivate_queue(dev, &dev->ingress_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_deactivate_queue(dev, dev_ingress_queue(dev), &noop_qdisc); dev_watchdog_down(dev); @@ -838,7 +840,8 @@ void dev_init_scheduler(struct net_device *dev) { dev->qdisc = &noop_qdisc; netdev_for_each_tx_queue(dev, dev_init_scheduler_queue, &noop_qdisc); - dev_init_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + dev_init_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); setup_timer(&dev->watchdog_timer, dev_watchdog, (unsigned long)dev); } @@ -861,7 +864,8 @@ static void shutdown_scheduler_queue(struct net_device *dev, void dev_shutdown(struct net_device *dev) { netdev_for_each_tx_queue(dev, shutdown_scheduler_queue, &noop_qdisc); - shutdown_scheduler_queue(dev, &dev->ingress_queue, &noop_qdisc); + if (dev_ingress_queue(dev)) + shutdown_scheduler_queue(dev, dev_ingress_queue(dev), &noop_qdisc); qdisc_destroy(dev->qdisc); dev->qdisc = &noop_qdisc; -- cgit v1.1 From 13f5bf18ba657d2d17c8fcf584e50359c718dd4b Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 5 Oct 2010 00:27:05 -0700 Subject: ipvs: Use frag walker helper in SCTP proto support. Signed-off-by: David S. Miller Acked-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_sctp.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 4c0855c..2f982a4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -61,6 +61,7 @@ sctp_snat_handler(struct sk_buff *skb, { sctp_sctphdr_t *sctph; unsigned int sctphoff; + struct sk_buff *iter; __be32 crc32; #ifdef CONFIG_IP_VS_IPV6 @@ -89,8 +90,8 @@ sctp_snat_handler(struct sk_buff *skb, /* Calculate the checksum */ crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); - for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) - crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb), + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), crc32); crc32 = sctp_end_cksum(crc32); sctph->checksum = crc32; @@ -102,9 +103,9 @@ static int sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp) { - sctp_sctphdr_t *sctph; unsigned int sctphoff; + struct sk_buff *iter; __be32 crc32; #ifdef CONFIG_IP_VS_IPV6 @@ -133,8 +134,8 @@ sctp_dnat_handler(struct sk_buff *skb, /* Calculate the checksum */ crc32 = sctp_start_cksum((u8 *) sctph, skb_headlen(skb) - sctphoff); - for (skb = skb_shinfo(skb)->frag_list; skb; skb = skb->next) - crc32 = sctp_update_cksum((u8 *) skb->data, skb_headlen(skb), + skb_walk_frags(skb, iter) + crc32 = sctp_update_cksum((u8 *) iter->data, skb_headlen(iter), crc32); crc32 = sctp_end_cksum(crc32); sctph->checksum = crc32; @@ -145,9 +146,9 @@ sctp_dnat_handler(struct sk_buff *skb, static int sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) { - struct sk_buff *list = skb_shinfo(skb)->frag_list; unsigned int sctphoff; struct sctphdr *sh, _sctph; + struct sk_buff *iter; __le32 cmp; __le32 val; __u32 tmp; @@ -166,9 +167,9 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) cmp = sh->checksum; tmp = sctp_start_cksum((__u8 *) sh, skb_headlen(skb)); - for (; list; list = list->next) - tmp = sctp_update_cksum((__u8 *) list->data, - skb_headlen(list), tmp); + skb_walk_frags(skb, iter) + tmp = sctp_update_cksum((__u8 *) iter->data, + skb_headlen(iter), tmp); val = sctp_end_cksum(tmp); -- cgit v1.1 From ee624599d32bc698212d3c04faf908dc01a40457 Mon Sep 17 00:00:00 2001 From: Nicolas Kaiser Date: Mon, 4 Oct 2010 04:35:39 +0000 Subject: caif: remove duplicated include Remove duplicated include. Signed-off-by: Nicolas Kaiser Acked-by: Sjur Braendeland Signed-off-by: David S. Miller --- net/caif/caif_socket.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c index 4d918f8..abcba53 100644 --- a/net/caif/caif_socket.c +++ b/net/caif/caif_socket.c @@ -17,7 +17,6 @@ #include #include #include -#include #include #include #include -- cgit v1.1 From 6a31d2a97c04ffe9b161ec0177a2296366ff9249 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 20:00:18 +0000 Subject: fib: cleanups Code style cleanups before upcoming functional changes. C99 initializer for fib_props array. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 121 ++++++++++++---------- net/ipv4/fib_rules.c | 10 +- net/ipv4/fib_semantics.c | 257 +++++++++++++++++++++++++---------------------- 3 files changed, 206 insertions(+), 182 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 4a69a95..b05c23b 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -225,30 +225,33 @@ EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + return __inet_dev_addr_type(net, dev, addr); } EXPORT_SYMBOL(inet_dev_addr_type); /* Given (packet source, input interface) and optional (dst, oif, tos): - - (main) check, that source is valid i.e. not broadcast or our local - address. - - figure out what "logical" interface this packet arrived - and calculate "specific destination" address. - - check, that packet arrived from expected physical interface. + * - (main) check, that source is valid i.e. not broadcast or our local + * address. + * - figure out what "logical" interface this packet arrived + * and calculate "specific destination" address. + * - check, that packet arrived from expected physical interface. */ - int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, __be32 *spec_dst, u32 *itag, u32 mark) { struct in_device *in_dev; - struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = src, - .saddr = dst, - .tos = tos } }, - .mark = mark, - .iif = oif }; - + struct flowi fl = { + .nl_u = { + .ip4_u = { + .daddr = src, + .saddr = dst, + .tos = tos + } + }, + .mark = mark, + .iif = oif + }; struct fib_result res; int no_addr, rpf, accept_local; bool dev_match; @@ -477,9 +480,9 @@ static int rtentry_to_fib_config(struct net *net, int cmd, struct rtentry *rt, } /* - * Handle IP routing ioctl calls. These are used to manipulate the routing tables + * Handle IP routing ioctl calls. + * These are used to manipulate the routing tables */ - int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) { struct fib_config cfg; @@ -523,7 +526,7 @@ int ip_rt_ioctl(struct net *net, unsigned int cmd, void __user *arg) return -EINVAL; } -const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { +const struct nla_policy rtm_ipv4_policy[RTA_MAX + 1] = { [RTA_DST] = { .type = NLA_U32 }, [RTA_SRC] = { .type = NLA_U32 }, [RTA_IIF] = { .type = NLA_U32 }, @@ -537,7 +540,7 @@ const struct nla_policy rtm_ipv4_policy[RTA_MAX+1] = { }; static int rtm_to_fib_config(struct net *net, struct sk_buff *skb, - struct nlmsghdr *nlh, struct fib_config *cfg) + struct nlmsghdr *nlh, struct fib_config *cfg) { struct nlattr *attr; int err, remaining; @@ -692,12 +695,11 @@ out: } /* Prepare and feed intra-kernel routing request. - Really, it should be netlink message, but :-( netlink - can be not configured, so that we feed it directly - to fib engine. It is legal, because all events occur - only when netlink is already locked. + * Really, it should be netlink message, but :-( netlink + * can be not configured, so that we feed it directly + * to fib engine. It is legal, because all events occur + * only when netlink is already locked. */ - static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa->ifa_dev->dev); @@ -743,9 +745,9 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) struct in_ifaddr *prim = ifa; __be32 mask = ifa->ifa_mask; __be32 addr = ifa->ifa_local; - __be32 prefix = ifa->ifa_address&mask; + __be32 prefix = ifa->ifa_address & mask; - if (ifa->ifa_flags&IFA_F_SECONDARY) { + if (ifa->ifa_flags & IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); if (prim == NULL) { printk(KERN_WARNING "fib_add_ifaddr: bug: prim == NULL\n"); @@ -755,22 +757,24 @@ void fib_add_ifaddr(struct in_ifaddr *ifa) fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); - if (!(dev->flags&IFF_UP)) + if (!(dev->flags & IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. */ if (ifa->ifa_broadcast && ifa->ifa_broadcast != htonl(0xFFFFFFFF)) fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && + if (!ipv4_is_zeronet(prefix) && !(ifa->ifa_flags & IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { - fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : - RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + fib_magic(RTM_NEWROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + prefix, ifa->ifa_prefixlen, prim); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix | ~mask, + 32, prim); } } } @@ -781,17 +785,18 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) struct net_device *dev = in_dev->dev; struct in_ifaddr *ifa1; struct in_ifaddr *prim = ifa; - __be32 brd = ifa->ifa_address|~ifa->ifa_mask; - __be32 any = ifa->ifa_address&ifa->ifa_mask; + __be32 brd = ifa->ifa_address | ~ifa->ifa_mask; + __be32 any = ifa->ifa_address & ifa->ifa_mask; #define LOCAL_OK 1 #define BRD_OK 2 #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; - if (!(ifa->ifa_flags&IFA_F_SECONDARY)) - fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : - RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + if (!(ifa->ifa_flags & IFA_F_SECONDARY)) + fib_magic(RTM_DELROUTE, + dev->flags & IFF_LOOPBACK ? RTN_LOCAL : RTN_UNICAST, + any, ifa->ifa_prefixlen, prim); else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { @@ -801,9 +806,9 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) } /* Deletion is more complicated than add. - We should take care of not to delete too much :-) - - Scan address list to be sure that addresses are really gone. + * We should take care of not to delete too much :-) + * + * Scan address list to be sure that addresses are really gone. */ for (ifa1 = in_dev->ifa_list; ifa1; ifa1 = ifa1->ifa_next) { @@ -817,23 +822,23 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) ok |= BRD0_OK; } - if (!(ok&BRD_OK)) + if (!(ok & BRD_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); - if (!(ok&BRD1_OK)) + if (!(ok & BRD1_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); - if (!(ok&BRD0_OK)) + if (!(ok & BRD0_OK)) fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); - if (!(ok&LOCAL_OK)) { + if (!(ok & LOCAL_OK)) { fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); /* Check, that this local address finally disappeared. */ if (inet_addr_type(dev_net(dev), ifa->ifa_local) != RTN_LOCAL) { /* And the last, but not the least thing. - We must flush stray FIB entries. - - First of all, we scan fib_info list searching - for stray nexthop entries, then ignite fib_flush. - */ + * We must flush stray FIB entries. + * + * First of all, we scan fib_info list searching + * for stray nexthop entries, then ignite fib_flush. + */ if (fib_sync_down_addr(dev_net(dev), ifa->ifa_local)) fib_flush(dev_net(dev)); } @@ -844,14 +849,20 @@ static void fib_del_ifaddr(struct in_ifaddr *ifa) #undef BRD1_OK } -static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb ) +static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) { struct fib_result res; - struct flowi fl = { .mark = frn->fl_mark, - .nl_u = { .ip4_u = { .daddr = frn->fl_addr, - .tos = frn->fl_tos, - .scope = frn->fl_scope } } }; + struct flowi fl = { + .mark = frn->fl_mark, + .nl_u = { + .ip4_u = { + .daddr = frn->fl_addr, + .tos = frn->fl_tos, + .scope = frn->fl_scope + } + } + }; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; @@ -899,8 +910,8 @@ static void nl_fib_input(struct sk_buff *skb) nl_fib_lookup(frn, tb); - pid = NETLINK_CB(skb).pid; /* pid of sending process */ - NETLINK_CB(skb).pid = 0; /* from kernel */ + pid = NETLINK_CB(skb).pid; /* pid of sending process */ + NETLINK_CB(skb).pid = 0; /* from kernel */ NETLINK_CB(skb).dst_group = 0; /* unicast */ netlink_unicast(net->ipv4.fibnl, skb, pid, MSG_DONTWAIT); } @@ -947,7 +958,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event, fib_del_ifaddr(ifa); if (ifa->ifa_dev->ifa_list == NULL) { /* Last address was deleted from this interface. - Disable IP. + * Disable IP. */ fib_disable_ip(dev, 1, 0); } else { diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 76daeb5..3230052 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -6,7 +6,7 @@ * IPv4 Forwarding Information Base: policy rules. * * Authors: Alexey Kuznetsov, - * Thomas Graf + * Thomas Graf * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,7 +14,7 @@ * 2 of the License, or (at your option) any later version. * * Fixes: - * Rani Assaf : local_rule cannot be deleted + * Rani Assaf : local_rule cannot be deleted * Marc Boucher : routing by fwmark */ @@ -32,8 +32,7 @@ #include #include -struct fib4_rule -{ +struct fib4_rule { struct fib_rule common; u8 dst_len; u8 src_len; @@ -91,7 +90,8 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, goto errout; } - if ((tbl = fib_get_table(rule->fr_net, rule->table)) == NULL) + tbl = fib_get_table(rule->fr_net, rule->table); + if (!tbl) goto errout; err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 20f09c5..ba52f39 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -60,21 +60,30 @@ static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE]; static DEFINE_SPINLOCK(fib_multipath_lock); -#define for_nexthops(fi) { int nhsel; const struct fib_nh * nh; \ -for (nhsel=0, nh = (fi)->fib_nh; nhsel < (fi)->fib_nhs; nh++, nhsel++) - -#define change_nexthops(fi) { int nhsel; struct fib_nh *nexthop_nh; \ -for (nhsel=0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); nhsel < (fi)->fib_nhs; nexthop_nh++, nhsel++) +#define for_nexthops(fi) { \ + int nhsel; const struct fib_nh *nh; \ + for (nhsel = 0, nh = (fi)->fib_nh; \ + nhsel < (fi)->fib_nhs; \ + nh++, nhsel++) + +#define change_nexthops(fi) { \ + int nhsel; struct fib_nh *nexthop_nh; \ + for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ + nhsel < (fi)->fib_nhs; \ + nexthop_nh++, nhsel++) #else /* CONFIG_IP_ROUTE_MULTIPATH */ /* Hope, that gcc will optimize it to get rid of dummy loop */ -#define for_nexthops(fi) { int nhsel = 0; const struct fib_nh * nh = (fi)->fib_nh; \ -for (nhsel=0; nhsel < 1; nhsel++) +#define for_nexthops(fi) { \ + int nhsel; const struct fib_nh *nh = (fi)->fib_nh; \ + for (nhsel = 0; nhsel < 1; nhsel++) -#define change_nexthops(fi) { int nhsel = 0; struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ -for (nhsel=0; nhsel < 1; nhsel++) +#define change_nexthops(fi) { \ + int nhsel; \ + struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh); \ + for (nhsel = 0; nhsel < 1; nhsel++) #endif /* CONFIG_IP_ROUTE_MULTIPATH */ @@ -86,54 +95,54 @@ static const struct int error; u8 scope; } fib_props[RTN_MAX + 1] = { - { + [RTN_UNSPEC] = { .error = 0, .scope = RT_SCOPE_NOWHERE, - }, /* RTN_UNSPEC */ - { + }, + [RTN_UNICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_UNICAST */ - { + }, + [RTN_LOCAL] = { .error = 0, .scope = RT_SCOPE_HOST, - }, /* RTN_LOCAL */ - { + }, + [RTN_BROADCAST] = { .error = 0, .scope = RT_SCOPE_LINK, - }, /* RTN_BROADCAST */ - { + }, + [RTN_ANYCAST] = { .error = 0, .scope = RT_SCOPE_LINK, - }, /* RTN_ANYCAST */ - { + }, + [RTN_MULTICAST] = { .error = 0, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_MULTICAST */ - { + }, + [RTN_BLACKHOLE] = { .error = -EINVAL, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_BLACKHOLE */ - { + }, + [RTN_UNREACHABLE] = { .error = -EHOSTUNREACH, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_UNREACHABLE */ - { + }, + [RTN_PROHIBIT] = { .error = -EACCES, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_PROHIBIT */ - { + }, + [RTN_THROW] = { .error = -EAGAIN, .scope = RT_SCOPE_UNIVERSE, - }, /* RTN_THROW */ - { + }, + [RTN_NAT] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, - }, /* RTN_NAT */ - { + }, + [RTN_XRESOLVE] = { .error = -EINVAL, .scope = RT_SCOPE_NOWHERE, - }, /* RTN_XRESOLVE */ + }, }; @@ -142,7 +151,7 @@ static const struct void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { - printk(KERN_WARNING "Freeing alive fib_info %p\n", fi); + pr_warning("Freeing alive fib_info %p\n", fi); return; } change_nexthops(fi) { @@ -173,7 +182,7 @@ void fib_release_info(struct fib_info *fi) spin_unlock_bh(&fib_info_lock); } -static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) +static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi) { const struct fib_nh *onh = ofi->fib_nh; @@ -187,7 +196,7 @@ static __inline__ int nh_comp(const struct fib_info *fi, const struct fib_info * #ifdef CONFIG_NET_CLS_ROUTE nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_F_DEAD)) return -1; onh++; } endfor_nexthops(fi); @@ -238,7 +247,7 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) nfi->fib_priority == fi->fib_priority && memcmp(nfi->fib_metrics, fi->fib_metrics, sizeof(fi->fib_metrics)) == 0 && - ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + ((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_F_DEAD) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } @@ -247,9 +256,8 @@ static struct fib_info *fib_find_info(const struct fib_info *nfi) } /* Check, that the gateway is already configured. - Used only by redirect accept routine. + * Used only by redirect accept routine. */ - int ip_fib_check_default(__be32 gw, struct net_device *dev) { struct hlist_head *head; @@ -264,7 +272,7 @@ int ip_fib_check_default(__be32 gw, struct net_device *dev) hlist_for_each_entry(nh, node, head, nh_hash) { if (nh->nh_dev == dev && nh->nh_gw == gw && - !(nh->nh_flags&RTNH_F_DEAD)) { + !(nh->nh_flags & RTNH_F_DEAD)) { spin_unlock(&fib_info_lock); return 0; } @@ -362,10 +370,10 @@ int fib_detect_death(struct fib_info *fi, int order, } if (state == NUD_REACHABLE) return 0; - if ((state&NUD_VALID) && order != dflt) + if ((state & NUD_VALID) && order != dflt) return 0; - if ((state&NUD_VALID) || - (*last_idx<0 && order > dflt)) { + if ((state & NUD_VALID) || + (*last_idx < 0 && order > dflt)) { *last_resort = fi; *last_idx = order; } @@ -476,69 +484,69 @@ int fib_nh_match(struct fib_config *cfg, struct fib_info *fi) /* - Picture - ------- - - Semantics of nexthop is very messy by historical reasons. - We have to take into account, that: - a) gateway can be actually local interface address, - so that gatewayed route is direct. - b) gateway must be on-link address, possibly - described not by an ifaddr, but also by a direct route. - c) If both gateway and interface are specified, they should not - contradict. - d) If we use tunnel routes, gateway could be not on-link. - - Attempt to reconcile all of these (alas, self-contradictory) conditions - results in pretty ugly and hairy code with obscure logic. - - I chose to generalized it instead, so that the size - of code does not increase practically, but it becomes - much more general. - Every prefix is assigned a "scope" value: "host" is local address, - "link" is direct route, - [ ... "site" ... "interior" ... ] - and "universe" is true gateway route with global meaning. - - Every prefix refers to a set of "nexthop"s (gw, oif), - where gw must have narrower scope. This recursion stops - when gw has LOCAL scope or if "nexthop" is declared ONLINK, - which means that gw is forced to be on link. - - Code is still hairy, but now it is apparently logically - consistent and very flexible. F.e. as by-product it allows - to co-exists in peace independent exterior and interior - routing processes. - - Normally it looks as following. - - {universe prefix} -> (gw, oif) [scope link] - | - |-> {link prefix} -> (gw, oif) [scope local] - | - |-> {local prefix} (terminal node) + * Picture + * ------- + * + * Semantics of nexthop is very messy by historical reasons. + * We have to take into account, that: + * a) gateway can be actually local interface address, + * so that gatewayed route is direct. + * b) gateway must be on-link address, possibly + * described not by an ifaddr, but also by a direct route. + * c) If both gateway and interface are specified, they should not + * contradict. + * d) If we use tunnel routes, gateway could be not on-link. + * + * Attempt to reconcile all of these (alas, self-contradictory) conditions + * results in pretty ugly and hairy code with obscure logic. + * + * I chose to generalized it instead, so that the size + * of code does not increase practically, but it becomes + * much more general. + * Every prefix is assigned a "scope" value: "host" is local address, + * "link" is direct route, + * [ ... "site" ... "interior" ... ] + * and "universe" is true gateway route with global meaning. + * + * Every prefix refers to a set of "nexthop"s (gw, oif), + * where gw must have narrower scope. This recursion stops + * when gw has LOCAL scope or if "nexthop" is declared ONLINK, + * which means that gw is forced to be on link. + * + * Code is still hairy, but now it is apparently logically + * consistent and very flexible. F.e. as by-product it allows + * to co-exists in peace independent exterior and interior + * routing processes. + * + * Normally it looks as following. + * + * {universe prefix} -> (gw, oif) [scope link] + * | + * |-> {link prefix} -> (gw, oif) [scope local] + * | + * |-> {local prefix} (terminal node) */ - static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, struct fib_nh *nh) { int err; struct net *net; + struct net_device *dev; net = cfg->fc_nlinfo.nl_net; if (nh->nh_gw) { struct fib_result res; - if (nh->nh_flags&RTNH_F_ONLINK) { - struct net_device *dev; + if (nh->nh_flags & RTNH_F_ONLINK) { if (cfg->fc_scope >= RT_SCOPE_LINK) return -EINVAL; if (inet_addr_type(net, nh->nh_gw) != RTN_UNICAST) return -EINVAL; - if ((dev = __dev_get_by_index(net, nh->nh_oif)) == NULL) + dev = __dev_get_by_index(net, nh->nh_oif); + if (!dev) return -ENODEV; - if (!(dev->flags&IFF_UP)) + if (!(dev->flags & IFF_UP)) return -ENETDOWN; nh->nh_dev = dev; dev_hold(dev); @@ -559,7 +567,8 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl.fl4_scope < RT_SCOPE_LINK) fl.fl4_scope = RT_SCOPE_LINK; - if ((err = fib_lookup(net, &fl, &res)) != 0) + err = fib_lookup(net, &fl, &res); + if (err) return err; } err = -EINVAL; @@ -567,11 +576,12 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, goto out; nh->nh_scope = res.scope; nh->nh_oif = FIB_RES_OIF(res); - if ((nh->nh_dev = FIB_RES_DEV(res)) == NULL) + nh->nh_dev = dev = FIB_RES_DEV(res); + if (!dev) goto out; - dev_hold(nh->nh_dev); + dev_hold(dev); err = -ENETDOWN; - if (!(nh->nh_dev->flags & IFF_UP)) + if (!(dev->flags & IFF_UP)) goto out; err = 0; out: @@ -580,13 +590,13 @@ out: } else { struct in_device *in_dev; - if (nh->nh_flags&(RTNH_F_PERVASIVE|RTNH_F_ONLINK)) + if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) return -EINVAL; in_dev = inetdev_by_index(net, nh->nh_oif); if (in_dev == NULL) return -ENODEV; - if (!(in_dev->dev->flags&IFF_UP)) { + if (!(in_dev->dev->flags & IFF_UP)) { in_dev_put(in_dev); return -ENETDOWN; } @@ -602,7 +612,9 @@ static inline unsigned int fib_laddr_hashfn(__be32 val) { unsigned int mask = (fib_hash_size - 1); - return ((__force u32)val ^ ((__force u32)val >> 7) ^ ((__force u32)val >> 14)) & mask; + return ((__force u32)val ^ + ((__force u32)val >> 7) ^ + ((__force u32)val >> 14)) & mask; } static struct hlist_head *fib_hash_alloc(int bytes) @@ -611,7 +623,8 @@ static struct hlist_head *fib_hash_alloc(int bytes) return kzalloc(bytes, GFP_KERNEL); else return (struct hlist_head *) - __get_free_pages(GFP_KERNEL | __GFP_ZERO, get_order(bytes)); + __get_free_pages(GFP_KERNEL | __GFP_ZERO, + get_order(bytes)); } static void fib_hash_free(struct hlist_head *hash, int bytes) @@ -806,7 +819,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) goto failure; } else { change_nexthops(fi) { - if ((err = fib_check_nh(cfg, fi, nexthop_nh)) != 0) + err = fib_check_nh(cfg, fi, nexthop_nh); + if (err != 0) goto failure; } endfor_nexthops(fi) } @@ -819,7 +833,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) } link_it: - if ((ofi = fib_find_info(fi)) != NULL) { + ofi = fib_find_info(fi); + if (ofi) { fi->fib_dead = 1; free_fib_info(fi); ofi->fib_treeref++; @@ -895,7 +910,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, case RTN_ANYCAST: case RTN_MULTICAST: for_nexthops(fi) { - if (nh->nh_flags&RTNH_F_DEAD) + if (nh->nh_flags & RTNH_F_DEAD) continue; if (!flp->oif || flp->oif == nh->nh_oif) break; @@ -906,16 +921,15 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, goto out_fill_res; } #else - if (nhsel < 1) { + if (nhsel < 1) goto out_fill_res; - } #endif endfor_nexthops(fi); continue; default: - printk(KERN_WARNING "fib_semantic_match bad type %#x\n", - fa->fa_type); + pr_warning("fib_semantic_match bad type %#x\n", + fa->fa_type); return -EINVAL; } } @@ -1028,10 +1042,10 @@ nla_put_failure: } /* - Update FIB if: - - local address disappeared -> we must delete all the entries - referring to it. - - device went down -> we must shutdown all nexthops going via it. + * Update FIB if: + * - local address disappeared -> we must delete all the entries + * referring to it. + * - device went down -> we must shutdown all nexthops going via it. */ int fib_sync_down_addr(struct net *net, __be32 local) { @@ -1078,7 +1092,7 @@ int fib_sync_down_dev(struct net_device *dev, int force) prev_fi = fi; dead = 0; change_nexthops(fi) { - if (nexthop_nh->nh_flags&RTNH_F_DEAD) + if (nexthop_nh->nh_flags & RTNH_F_DEAD) dead++; else if (nexthop_nh->nh_dev == dev && nexthop_nh->nh_scope != scope) { @@ -1110,10 +1124,9 @@ int fib_sync_down_dev(struct net_device *dev, int force) #ifdef CONFIG_IP_ROUTE_MULTIPATH /* - Dead device goes up. We wake up dead nexthops. - It takes sense only on multipath routes. + * Dead device goes up. We wake up dead nexthops. + * It takes sense only on multipath routes. */ - int fib_sync_up(struct net_device *dev) { struct fib_info *prev_fi; @@ -1123,7 +1136,7 @@ int fib_sync_up(struct net_device *dev) struct fib_nh *nh; int ret; - if (!(dev->flags&IFF_UP)) + if (!(dev->flags & IFF_UP)) return 0; prev_fi = NULL; @@ -1142,12 +1155,12 @@ int fib_sync_up(struct net_device *dev) prev_fi = fi; alive = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { alive++; continue; } if (nexthop_nh->nh_dev == NULL || - !(nexthop_nh->nh_dev->flags&IFF_UP)) + !(nexthop_nh->nh_dev->flags & IFF_UP)) continue; if (nexthop_nh->nh_dev != dev || !__in_dev_get_rtnl(dev)) @@ -1169,10 +1182,9 @@ int fib_sync_up(struct net_device *dev) } /* - The algorithm is suboptimal, but it provides really - fair weighted route distribution. + * The algorithm is suboptimal, but it provides really + * fair weighted route distribution. */ - void fib_select_multipath(const struct flowi *flp, struct fib_result *res) { struct fib_info *fi = res->fi; @@ -1182,7 +1194,7 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags&RTNH_F_DEAD)) { + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD)) { power += nexthop_nh->nh_weight; nexthop_nh->nh_power = nexthop_nh->nh_weight; } @@ -1198,15 +1210,16 @@ void fib_select_multipath(const struct flowi *flp, struct fib_result *res) /* w should be random number [0..fi->fib_power-1], - it is pretty bad approximation. + * it is pretty bad approximation. */ w = jiffies % fi->fib_power; change_nexthops(fi) { - if (!(nexthop_nh->nh_flags&RTNH_F_DEAD) && + if (!(nexthop_nh->nh_flags & RTNH_F_DEAD) && nexthop_nh->nh_power) { - if ((w -= nexthop_nh->nh_power) <= 0) { + w -= nexthop_nh->nh_power; + if (w <= 0) { nexthop_nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; -- cgit v1.1 From 1df9916e46451533463f227e6be57cc2cfca4c5f Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 4 Oct 2010 20:14:17 +0000 Subject: fib: fib_rules_cleanup can be static fib_rules_cleanup_ups is only defined and used in one place. Signed-off-by: Stephen Hemminger Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 332c2e3..cfb7d25 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -144,7 +144,7 @@ fib_rules_register(const struct fib_rules_ops *tmpl, struct net *net) } EXPORT_SYMBOL_GPL(fib_rules_register); -void fib_rules_cleanup_ops(struct fib_rules_ops *ops) +static void fib_rules_cleanup_ops(struct fib_rules_ops *ops) { struct fib_rule *rule, *tmp; @@ -153,7 +153,6 @@ void fib_rules_cleanup_ops(struct fib_rules_ops *ops) fib_rule_put(rule); } } -EXPORT_SYMBOL_GPL(fib_rules_cleanup_ops); static void fib_rules_put_rcu(struct rcu_head *head) { -- cgit v1.1 From c61393ea83573ff422af505b6fd49ef9ec9b91ca Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 4 Oct 2010 20:17:53 +0000 Subject: ipv6: make __ipv6_isatap_ifid static Another exported symbol only used in one file Signed-off-by: Stephen Hemminger Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/addrconf.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 8c88340..ec7a91d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1544,7 +1544,7 @@ static int addrconf_ifid_infiniband(u8 *eui, struct net_device *dev) return 0; } -int __ipv6_isatap_ifid(u8 *eui, __be32 addr) +static int __ipv6_isatap_ifid(u8 *eui, __be32 addr) { if (addr == 0) return -1; @@ -1560,7 +1560,6 @@ int __ipv6_isatap_ifid(u8 *eui, __be32 addr) memcpy(eui + 4, &addr, 4); return 0; } -EXPORT_SYMBOL(__ipv6_isatap_ifid); static int addrconf_ifid_sit(u8 *eui, struct net_device *dev) { -- cgit v1.1 From 5a254ffe3ffdfa84fe076009bd8e88da412180d2 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Mon, 27 Sep 2010 09:07:26 -0700 Subject: wireless: Use first phyX name available when registering phy devices. Choose first available phyX name when creating phy devices. This means that reloading a wifi driver will not cause a change in the name of it's phy device. Also, allow users to rename a phy to any un-used name, including phy%d. Signed-off-by: Ben Greear Signed-off-by: John W. Linville --- net/wireless/core.c | 54 +++++++++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 9c21ebf..1684ad9 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -178,26 +178,10 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { struct cfg80211_registered_device *rdev2; - int wiphy_idx, taken = -1, result, digits; + int result; assert_cfg80211_lock(); - /* prohibit calling the thing phy%d when %d is not its number */ - sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); - if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { - /* count number of places needed to print wiphy_idx */ - digits = 1; - while (wiphy_idx /= 10) - digits++; - /* - * deny the name if it is phy where is printed - * without leading zeroes. taken == strlen(newname) here - */ - if (taken == strlen(PHY_NAME) + digits) - return -EINVAL; - } - - /* Ignore nop renames */ if (strcmp(newname, dev_name(&rdev->wiphy.dev)) == 0) return 0; @@ -205,7 +189,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, /* Ensure another device does not already have this name. */ list_for_each_entry(rdev2, &cfg80211_rdev_list, list) if (strcmp(newname, dev_name(&rdev2->wiphy.dev)) == 0) - return -EINVAL; + return -EEXIST; result = device_rename(&rdev->wiphy.dev, newname); if (result) @@ -320,9 +304,11 @@ static void cfg80211_event_work(struct work_struct *work) struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) { static int wiphy_counter; - - struct cfg80211_registered_device *rdev; + int i; + struct cfg80211_registered_device *rdev, *rdev2; int alloc_size; + char nname[IFNAMSIZ + 1]; + bool found = false; WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key)); WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc)); @@ -346,16 +332,36 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) if (unlikely(!wiphy_idx_valid(rdev->wiphy_idx))) { wiphy_counter--; + goto too_many_devs; + } + + /* 64k wiphy devices is enough for anyone! */ + for (i = 0; i < 0xFFFF; i++) { + found = false; + snprintf(nname, sizeof(nname)-1, PHY_NAME "%d", i); + nname[sizeof(nname)-1] = 0; + list_for_each_entry(rdev2, &cfg80211_rdev_list, list) + if (strcmp(nname, dev_name(&rdev2->wiphy.dev)) == 0) { + found = true; + break; + } + + if (!found) + break; + } + + if (unlikely(found)) { +too_many_devs: mutex_unlock(&cfg80211_mutex); - /* ugh, wrapped! */ + /* ugh, too many devices already! */ kfree(rdev); return NULL; } - mutex_unlock(&cfg80211_mutex); - /* give it a proper name */ - dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + dev_set_name(&rdev->wiphy.dev, "%s", nname); + + mutex_unlock(&cfg80211_mutex); mutex_init(&rdev->mtx); mutex_init(&rdev->devlist_mtx); -- cgit v1.1 From 85416a4fa193754ef36e12b20bb02fe661cb7f17 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Sat, 2 Oct 2010 13:17:07 +0200 Subject: mac80211: fix rx monitor filter refcounters This patch fixes an refcounting bug. Previously it was possible to corrupt the per-device recv. filter and monitor management counters when: iw dev wlanX set monitor [new flags] was issued on an active monitor interface. Acked-by: Johannes Berg Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 32 ++++++++++++++++++++++++++++++-- net/mac80211/ieee80211_i.h | 2 ++ net/mac80211/iface.c | 44 ++++++++++++++++++++++---------------------- 3 files changed, 54 insertions(+), 24 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index c981604..9e63fc2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -68,8 +68,36 @@ static int ieee80211_change_iface(struct wiphy *wiphy, params && params->use_4addr >= 0) sdata->u.mgd.use_4addr = params->use_4addr; - if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) - sdata->u.mntr_flags = *flags; + if (sdata->vif.type == NL80211_IFTYPE_MONITOR && flags) { + struct ieee80211_local *local = sdata->local; + + if (ieee80211_sdata_running(sdata)) { + /* + * Prohibit MONITOR_FLAG_COOK_FRAMES to be + * changed while the interface is up. + * Else we would need to add a lot of cruft + * to update everything: + * cooked_mntrs, monitor and all fif_* counters + * reconfigure hardware + */ + if ((*flags & MONITOR_FLAG_COOK_FRAMES) != + (sdata->u.mntr_flags & MONITOR_FLAG_COOK_FRAMES)) + return -EBUSY; + + ieee80211_adjust_monitor_flags(sdata, -1); + sdata->u.mntr_flags = *flags; + ieee80211_adjust_monitor_flags(sdata, 1); + + ieee80211_configure_filter(local); + } else { + /* + * Because the interface is down, ieee80211_do_stop + * and ieee80211_do_open take care of "everything" + * mentioned in the comment above. + */ + sdata->u.mntr_flags = *flags; + } + } return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 945fbf2..f6a6d78 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1132,6 +1132,8 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata); void ieee80211_remove_interfaces(struct ieee80211_local *local); u32 __ieee80211_recalc_idle(struct ieee80211_local *local); void ieee80211_recalc_idle(struct ieee80211_local *local); +void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, + const int offset); static inline bool ieee80211_sdata_running(struct ieee80211_sub_if_data *sdata) { diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 6678573..1300e88 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -148,6 +148,26 @@ static int ieee80211_check_concurrent_iface(struct ieee80211_sub_if_data *sdata, return 0; } +void ieee80211_adjust_monitor_flags(struct ieee80211_sub_if_data *sdata, + const int offset) +{ + struct ieee80211_local *local = sdata->local; + u32 flags = sdata->u.mntr_flags; + +#define ADJUST(_f, _s) do { \ + if (flags & MONITOR_FLAG_##_f) \ + local->fif_##_s += offset; \ + } while (0) + + ADJUST(FCSFAIL, fcsfail); + ADJUST(PLCPFAIL, plcpfail); + ADJUST(CONTROL, control); + ADJUST(CONTROL, pspoll); + ADJUST(OTHER_BSS, other_bss); + +#undef ADJUST +} + /* * NOTE: Be very careful when changing this function, it must NOT return * an error on interface type changes that have been pre-checked, so most @@ -240,17 +260,7 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; } - if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL) - local->fif_fcsfail++; - if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL) - local->fif_plcpfail++; - if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) { - local->fif_control++; - local->fif_pspoll++; - } - if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS) - local->fif_other_bss++; - + ieee80211_adjust_monitor_flags(sdata, 1); ieee80211_configure_filter(local); netif_carrier_on(dev); @@ -477,17 +487,7 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, hw_reconf_flags |= IEEE80211_CONF_CHANGE_MONITOR; } - if (sdata->u.mntr_flags & MONITOR_FLAG_FCSFAIL) - local->fif_fcsfail--; - if (sdata->u.mntr_flags & MONITOR_FLAG_PLCPFAIL) - local->fif_plcpfail--; - if (sdata->u.mntr_flags & MONITOR_FLAG_CONTROL) { - local->fif_pspoll--; - local->fif_control--; - } - if (sdata->u.mntr_flags & MONITOR_FLAG_OTHER_BSS) - local->fif_other_bss--; - + ieee80211_adjust_monitor_flags(sdata, -1); ieee80211_configure_filter(local); break; case NL80211_IFTYPE_MESH_POINT: -- cgit v1.1 From 17e5a8082894a4b66cb69e7ec16074f0f01281e1 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Wed, 29 Sep 2010 17:15:30 +0200 Subject: nl80211: allow drivers to indicate whether the survey data channel is in use Some user space applications only want to display survey data for the operating channel, however there is no API to get that yet. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9c84825..0087c43 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3489,6 +3489,8 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq, if (survey->filled & SURVEY_INFO_NOISE_DBM) NLA_PUT_U8(msg, NL80211_SURVEY_INFO_NOISE, survey->noise); + if (survey->filled & SURVEY_INFO_IN_USE) + NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE); nla_nest_end(msg, infoattr); -- cgit v1.1 From 663fcafd977f13e6483f7d4cf2ccdbc4fae81ed0 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Sep 2010 21:06:09 +0200 Subject: cfg80211/mac80211: allow management frame TX in AP mode Enable management frame transmission and subscribing to management frames through nl80211 in both cfg80211 and mac80211. Also update a few places that I forgot to update for P2P-client mode previously, and fix a small bug with non-action frames in this API. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 7 ++++++- net/wireless/mlme.c | 54 ++++++++++++++++++++++++++++++++++++++++---------- net/wireless/nl80211.c | 10 ++++++++-- 3 files changed, 57 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 9e63fc2..a7a78f2 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1549,7 +1549,11 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev, switch (sdata->vif.type) { case NL80211_IFTYPE_ADHOC: - if (mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_AP_VLAN: + case NL80211_IFTYPE_P2P_GO: + if (!ieee80211_is_action(mgmt->frame_control) || + mgmt->u.action.category == WLAN_CATEGORY_PUBLIC) break; rcu_read_lock(); sta = sta_info_get(sdata, mgmt->da); @@ -1558,6 +1562,7 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev, return -ENOLINK; break; case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: break; default: return -EOPNOTSUPP; diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 46f3711..caf11a427 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -876,21 +876,53 @@ int cfg80211_mlme_mgmt_tx(struct cfg80211_registered_device *rdev, if (ieee80211_is_action(mgmt->frame_control) && mgmt->u.action.category != WLAN_CATEGORY_PUBLIC) { - /* Verify that we are associated with the destination AP */ + int err = 0; + wdev_lock(wdev); - if (!wdev->current_bss || - memcmp(wdev->current_bss->pub.bssid, mgmt->bssid, - ETH_ALEN) != 0 || - ((wdev->iftype == NL80211_IFTYPE_STATION || - wdev->iftype == NL80211_IFTYPE_P2P_CLIENT) && - memcmp(wdev->current_bss->pub.bssid, mgmt->da, - ETH_ALEN) != 0)) { - wdev_unlock(wdev); - return -ENOTCONN; - } + switch (wdev->iftype) { + case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_STATION: + case NL80211_IFTYPE_P2P_CLIENT: + if (!wdev->current_bss) { + err = -ENOTCONN; + break; + } + + if (memcmp(wdev->current_bss->pub.bssid, + mgmt->bssid, ETH_ALEN)) { + err = -ENOTCONN; + break; + } + + /* + * check for IBSS DA must be done by driver as + * cfg80211 doesn't track the stations + */ + if (wdev->iftype == NL80211_IFTYPE_ADHOC) + break; + /* for station, check that DA is the AP */ + if (memcmp(wdev->current_bss->pub.bssid, + mgmt->da, ETH_ALEN)) { + err = -ENOTCONN; + break; + } + break; + case NL80211_IFTYPE_AP: + case NL80211_IFTYPE_P2P_GO: + case NL80211_IFTYPE_AP_VLAN: + if (memcmp(mgmt->bssid, dev->dev_addr, ETH_ALEN)) + err = -EINVAL; + break; + default: + err = -EOPNOTSUPP; + break; + } wdev_unlock(wdev); + + if (err) + return err; } if (memcmp(mgmt->sa, dev->dev_addr, ETH_ALEN) != 0) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0087c43..cbbbe9a 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4828,7 +4828,10 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EOPNOTSUPP; goto out; } @@ -4881,7 +4884,10 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { err = -EOPNOTSUPP; goto out; } -- cgit v1.1 From 2234362c427e2ef667595b9b81c0125003ac5607 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 30 Sep 2010 22:17:43 +0200 Subject: cfg80211: fix locking Add missing unlocking of the wiphy in set_channel, and don't try to unlock a non-existing wiphy in set_cqm. Cc: stable@kernel.org [2.6.35+] Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index cbbbe9a..21061cc 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -821,11 +821,13 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) result = get_rdev_dev_by_info_ifindex(info, &rdev, &netdev); if (result) - goto unlock; + goto unlock_rtnl; result = __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info); - unlock: + dev_put(netdev); + cfg80211_unlock_rdev(rdev); + unlock_rtnl: rtnl_unlock(); return result; @@ -5097,7 +5099,7 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); if (err) - goto unlock_rdev; + goto unlock_rtnl; wdev = dev->ieee80211_ptr; @@ -5115,9 +5117,10 @@ static int nl80211_set_cqm_rssi(struct genl_info *info, err = rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev, threshold, hysteresis); -unlock_rdev: + unlock_rdev: cfg80211_unlock_rdev(rdev); dev_put(dev); + unlock_rtnl: rtnl_unlock(); return err; -- cgit v1.1 From bc86863de63e6ae7ec6f9f524604631608c6cb02 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Fri, 1 Oct 2010 14:05:27 +0200 Subject: mac80211: perform scan cancel in hw reset work Move ieee80211_scan_cancel() and all other related code to ieee80211_restart_work() as ieee80211_restart_hw() is intended to be callable from any context. Fix a bug that RTNL lock is not taken during ieee80211_cancel_scan(). Take local->mtx before WARN(test_bit(SCAN_HW_SCANNING, &local->scanning) to prevent the race condition with __ieee80211_start_scan() described here: http://marc.info/?l=linux-wireless&m=128516716810537&w=2 Signed-off-by: Stanislaw Gruszka Signed-off-by: John W. Linville --- net/mac80211/main.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index e24fa5b..494dba1 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -295,7 +295,17 @@ static void ieee80211_restart_work(struct work_struct *work) struct ieee80211_local *local = container_of(work, struct ieee80211_local, restart_work); + /* wait for scan work complete */ + flush_workqueue(local->workqueue); + + mutex_lock(&local->mtx); + WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), + "%s called with hardware scan in progress\n", __func__); + mutex_unlock(&local->mtx); + rtnl_lock(); + if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) + ieee80211_scan_cancel(local); ieee80211_reconfig(local); rtnl_unlock(); } @@ -306,15 +316,6 @@ void ieee80211_restart_hw(struct ieee80211_hw *hw) trace_api_restart_hw(local); - /* wait for scan work complete */ - flush_workqueue(local->workqueue); - - WARN(test_bit(SCAN_HW_SCANNING, &local->scanning), - "%s called with hardware scan in progress\n", __func__); - - if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) - ieee80211_scan_cancel(local); - /* use this reason, ieee80211_reconfig will unblock it */ ieee80211_stop_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_SUSPEND); -- cgit v1.1 From d8ec44335c974cc8bf67ce70c63071d4e0702509 Mon Sep 17 00:00:00 2001 From: Juuso Oikarinen Date: Fri, 1 Oct 2010 16:02:31 +0300 Subject: mac80211: Add validity check for beacon_crc value On association to an AP, after receiving beacons, the beacon_crc value is set. The beacon_crc value is not reset in disassociation, but the BSS data may be expired at a later point. When associating again, it's possible that a beacon for the AP is not received, resulting in the beacon_ies to remain NULL. After association, further beacons will not update the beacon data, as the crc value of the beacon has not changed, and the beacon_crc still holds a value matching the beacon. The beacon_ies will remain forever null. One of the results of this is that WLAN power save cannot be entered, the STA will remain foreven in active mode. Fix this by adding a validation flag for the beacon_crc, which is cleared on association. Signed-off-by: Juuso Oikarinen Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ieee80211_i.h | 1 + net/mac80211/mlme.c | 7 +++++-- 2 files changed, 6 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f6a6d78..55d79db 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -369,6 +369,7 @@ struct ieee80211_if_managed { unsigned int flags; + bool beacon_crc_valid; u32 beacon_crc; enum { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 77913a1..c37086a 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1639,7 +1639,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, directed_tim = ieee80211_check_tim(elems.tim, elems.tim_len, ifmgd->aid); - if (ncrc != ifmgd->beacon_crc) { + if (ncrc != ifmgd->beacon_crc || !ifmgd->beacon_crc_valid) { ieee80211_rx_bss_info(sdata, mgmt, len, rx_status, &elems, true); @@ -1670,9 +1670,10 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata, } } - if (ncrc == ifmgd->beacon_crc) + if (ncrc == ifmgd->beacon_crc && ifmgd->beacon_crc_valid) return; ifmgd->beacon_crc = ncrc; + ifmgd->beacon_crc_valid = true; if (elems.erp_info && elems.erp_info_len >= 1) { erp_valid = true; @@ -2214,6 +2215,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, ifmgd->flags &= ~IEEE80211_STA_DISABLE_11N; ifmgd->flags &= ~IEEE80211_STA_NULLFUNC_ACKED; + ifmgd->beacon_crc_valid = false; + for (i = 0; i < req->crypto.n_ciphers_pairwise; i++) if (req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_WEP40 || req->crypto.ciphers_pairwise[i] == WLAN_CIPHER_SUITE_TKIP || -- cgit v1.1 From e8347ebad2f1b15bddb6ed3ed5f767531eb52dc3 Mon Sep 17 00:00:00 2001 From: Bill Jordan Date: Fri, 1 Oct 2010 13:54:28 -0400 Subject: cfg80211: patches to allow setting the WDS peer Added a nl interface to set the peer bssid of a WDS interface. Signed-off-by: Bill Jordan Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 21061cc..fd92b6b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -603,6 +603,7 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U32(msg, i, NL80211_CMD_SET_WIPHY_NETNS); } CMD(set_channel, SET_CHANNEL); + CMD(set_wds_peer, SET_WDS_PEER); #undef CMD @@ -833,6 +834,53 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) return result; } +static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct wireless_dev *wdev; + struct net_device *dev; + u8 *bssid; + int err; + + if (!info->attrs[NL80211_ATTR_MAC]) + return -EINVAL; + + rtnl_lock(); + + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) + goto unlock_rtnl; + + wdev = dev->ieee80211_ptr; + + if (netif_running(dev)) { + err = -EBUSY; + goto out; + } + + if (!rdev->ops->set_wds_peer) { + err = -EOPNOTSUPP; + goto out; + } + + if (wdev->iftype != NL80211_IFTYPE_WDS) { + err = -EOPNOTSUPP; + goto out; + } + + bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); + err = rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid); + +out: + cfg80211_unlock_rdev(rdev); + dev_put(dev); +unlock_rtnl: + rtnl_unlock(); + + return err; +} + + static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) { struct cfg80211_registered_device *rdev; @@ -5473,6 +5521,12 @@ static struct genl_ops nl80211_ops[] = { .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, }, + { + .cmd = NL80211_CMD_SET_WDS_PEER, + .doit = nl80211_set_wds_peer, + .policy = nl80211_policy, + .flags = GENL_ADMIN_PERM, + }, }; static struct genl_multicast_group nl80211_mlme_mcgrp = { -- cgit v1.1 From 1be7fe8de9f25e173282f8f989f83bc5b5decfe9 Mon Sep 17 00:00:00 2001 From: Bill Jordan Date: Fri, 1 Oct 2010 11:20:41 -0400 Subject: mac80211: fix for WDS interfaces Initialize the rate table for WDS interfaces, and add cases to allow WDS packets to pass the xmit and receive tests. Signed-off-by: Bill Jordan Signed-off-by: John W. Linville --- net/mac80211/iface.c | 3 +++ net/mac80211/main.c | 3 +++ net/mac80211/rx.c | 1 + net/mac80211/tx.c | 3 +++ 4 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 1300e88..438a2f5 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -24,6 +24,7 @@ #include "led.h" #include "driver-ops.h" #include "wme.h" +#include "rate.h" /** * DOC: Interface list locking @@ -311,6 +312,8 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) /* STA has been freed */ goto err_del_interface; } + + rate_control_rate_init(sta); } /* diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 494dba1..e127fbb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -201,6 +201,8 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, sdata->vif.bss_conf.bssid = sdata->u.ibss.bssid; else if (sdata->vif.type == NL80211_IFTYPE_AP) sdata->vif.bss_conf.bssid = sdata->vif.addr; + else if (sdata->vif.type == NL80211_IFTYPE_WDS) + sdata->vif.bss_conf.bssid = NULL; else if (ieee80211_vif_is_mesh(&sdata->vif)) { sdata->vif.bss_conf.bssid = zero; } else { @@ -211,6 +213,7 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata, switch (sdata->vif.type) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_ADHOC: + case NL80211_IFTYPE_WDS: case NL80211_IFTYPE_MESH_POINT: break; default: diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index 0b0e83e..b3e161f 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -819,6 +819,7 @@ ieee80211_rx_h_check(struct ieee80211_rx_data *rx) if (unlikely((ieee80211_is_data(hdr->frame_control) || ieee80211_is_pspoll(hdr->frame_control)) && rx->sdata->vif.type != NL80211_IFTYPE_ADHOC && + rx->sdata->vif.type != NL80211_IFTYPE_WDS && (!rx->sta || !test_sta_flags(rx->sta, WLAN_STA_ASSOC)))) { if ((!ieee80211_has_fromds(hdr->frame_control) && !ieee80211_has_tods(hdr->frame_control) && diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index e1733dc..258fbdb 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -273,6 +273,9 @@ ieee80211_tx_h_check_assoc(struct ieee80211_tx_data *tx) */ return TX_DROP; + if (tx->sdata->vif.type == NL80211_IFTYPE_WDS) + return TX_CONTINUE; + if (tx->sdata->vif.type == NL80211_IFTYPE_MESH_POINT) return TX_CONTINUE; -- cgit v1.1 From 78be49ec2a0df34de9441930fdced20311fd709f Mon Sep 17 00:00:00 2001 From: Helmut Schaa Date: Sat, 2 Oct 2010 11:31:55 +0200 Subject: mac80211: distinct between max rates and the number of rates the hw can report Some drivers cannot handle multiple retry rates specified by the rc algorithm but instead use their own retry table (for example rt2800). However, if such a device registers itself with a max_rates value of 1 the rc algorithm cannot make use of the extended information the device can provide about retried rates. On the other hand, if a device registers itself with a max_rates value > 1 the rc algorithm assumes that the device can handle multi rate retries. Fix this issue by introducing another hw parameter max_report_rates that can be set to a different value then max_rates to indicate if a device is capable of reporting more rates then specified in max_rates. Signed-off-by: Helmut Schaa Signed-off-by: Ivo van Doorn Signed-off-by: John W. Linville --- net/mac80211/main.c | 4 ++++ net/mac80211/status.c | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index e127fbb..9c2f3f9 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -537,6 +537,7 @@ struct ieee80211_hw *ieee80211_alloc_hw(size_t priv_data_len, /* set up some defaults */ local->hw.queues = 1; local->hw.max_rates = 1; + local->hw.max_report_rates = 0; local->hw.conf.long_frame_max_tx_count = wiphy->retry_long; local->hw.conf.short_frame_max_tx_count = wiphy->retry_short; local->user_power_level = -1; @@ -612,6 +613,9 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) WLAN_CIPHER_SUITE_AES_CMAC }; + if (hw->max_report_rates == 0) + hw->max_report_rates = hw->max_rates; + /* * generic code guarantees at least one band, * set this very early because much code assumes diff --git a/net/mac80211/status.c b/net/mac80211/status.c index dd85006..95763e0 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -176,7 +176,7 @@ void ieee80211_tx_status(struct ieee80211_hw *hw, struct sk_buff *skb) for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) { /* the HW cannot have attempted that rate */ - if (i >= hw->max_rates) { + if (i >= hw->max_report_rates) { info->status.rates[i].idx = -1; info->status.rates[i].count = 0; } else if (info->status.rates[i].idx >= 0) { -- cgit v1.1 From 9eba612549f575d7dccda672ce932e15e7392d83 Mon Sep 17 00:00:00 2001 From: Bruno Randolf Date: Mon, 4 Oct 2010 11:17:30 +0900 Subject: mac80211: Add WME information element for IBSS Enable WME QoS in IBSS mode by adding a WME information element to beacons and probe respones and by checking for it and marking stations as WME capable if it is present. Signed-off-by: Bruno Randolf Signed-off-by: John W. Linville --- net/mac80211/ibss.c | 65 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c index 1a3aae5..ff60c02 100644 --- a/net/mac80211/ibss.c +++ b/net/mac80211/ibss.c @@ -173,6 +173,19 @@ static void __ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata, memcpy(skb_put(skb, ifibss->ie_len), ifibss->ie, ifibss->ie_len); + if (local->hw.queues >= 4) { + pos = skb_put(skb, 9); + *pos++ = WLAN_EID_VENDOR_SPECIFIC; + *pos++ = 7; /* len */ + *pos++ = 0x00; /* Microsoft OUI 00:50:F2 */ + *pos++ = 0x50; + *pos++ = 0xf2; + *pos++ = 2; /* WME */ + *pos++ = 0; /* WME info */ + *pos++ = 1; /* WME ver */ + *pos++ = 0; /* U-APSD no in use */ + } + rcu_assign_pointer(ifibss->presp, skb); sdata->vif.bss_conf.beacon_int = beacon_int; @@ -266,37 +279,45 @@ static void ieee80211_rx_bss_info(struct ieee80211_sub_if_data *sdata, if (!channel || channel->flags & IEEE80211_CHAN_DISABLED) return; - if (sdata->vif.type == NL80211_IFTYPE_ADHOC && elems->supp_rates && + if (sdata->vif.type == NL80211_IFTYPE_ADHOC && memcmp(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN) == 0) { - supp_rates = ieee80211_sta_get_rates(local, elems, band); rcu_read_lock(); - sta = sta_info_get(sdata, mgmt->sa); - if (sta) { - u32 prev_rates; - prev_rates = sta->sta.supp_rates[band]; - /* make sure mandatory rates are always added */ - sta->sta.supp_rates[band] = supp_rates | - ieee80211_mandatory_rates(local, band); + if (elems->supp_rates) { + supp_rates = ieee80211_sta_get_rates(local, elems, + band); + if (sta) { + u32 prev_rates; - if (sta->sta.supp_rates[band] != prev_rates) { + prev_rates = sta->sta.supp_rates[band]; + /* make sure mandatory rates are always added */ + sta->sta.supp_rates[band] = supp_rates | + ieee80211_mandatory_rates(local, band); + + if (sta->sta.supp_rates[band] != prev_rates) { #ifdef CONFIG_MAC80211_IBSS_DEBUG - printk(KERN_DEBUG "%s: updated supp_rates set " - "for %pM based on beacon/probe_response " - "(0x%x -> 0x%x)\n", - sdata->name, sta->sta.addr, - prev_rates, sta->sta.supp_rates[band]); + printk(KERN_DEBUG + "%s: updated supp_rates set " + "for %pM based on beacon" + "/probe_resp (0x%x -> 0x%x)\n", + sdata->name, sta->sta.addr, + prev_rates, + sta->sta.supp_rates[band]); #endif - rate_control_rate_init(sta); - } - rcu_read_unlock(); - } else { - rcu_read_unlock(); - ieee80211_ibss_add_sta(sdata, mgmt->bssid, mgmt->sa, - supp_rates, GFP_KERNEL); + rate_control_rate_init(sta); + } + } else + sta = ieee80211_ibss_add_sta(sdata, mgmt->bssid, + mgmt->sa, supp_rates, + GFP_ATOMIC); } + + if (sta && elems->wmm_info) + set_sta_flags(sta, WLAN_STA_WME); + + rcu_read_unlock(); } bss = ieee80211_bss_info_update(local, rx_status, mgmt, len, elems, -- cgit v1.1 From ff4c92d85c6f2777d2067f8552e7fefb4d1754ae Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Oct 2010 21:14:03 +0200 Subject: genetlink: introduce pre_doit/post_doit hooks Each family may have some amount of boilerplate locking code that applies to most, or even all, commands. This allows a family to handle such things in a more generic way, by allowing it to a) include private flags in each operation b) specify a pre_doit hook that is called, before an operation's doit() callback and may return an error directly, c) specify a post_doit hook that can undo locking or similar things done by pre_doit, and finally d) include two private pointers in each info struct passed between all these operations including doit(). (It's two because I'll need two in nl80211 -- can be extended.) Signed-off-by: Johannes Berg Acked-by: David S. Miller Signed-off-by: John W. Linville --- net/netlink/genetlink.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 26ed3e8..1781d99 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -547,8 +547,20 @@ static int genl_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh) info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN; info.attrs = family->attrbuf; genl_info_net_set(&info, net); + memset(&info.user_ptr, 0, sizeof(info.user_ptr)); - return ops->doit(skb, &info); + if (family->pre_doit) { + err = family->pre_doit(ops, skb, &info); + if (err) + return err; + } + + err = ops->doit(skb, &info); + + if (family->post_doit) + family->post_doit(ops, skb, &info); + + return err; } static void genl_rcv(struct sk_buff *skb) -- cgit v1.1 From 4c476991062a0a59523c2bf193f09087f469686a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Oct 2010 21:36:35 +0200 Subject: nl80211: use the new genetlink pre/post_doit hooks This makes nl80211 use the new genetlink pre_doit/post_doit hooks for locking and checking the interface/wiphy index. This significantly reduces the code size and the likelihood of locking errors. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 1737 +++++++++++++++--------------------------------- 1 file changed, 533 insertions(+), 1204 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index fd92b6b..a96da47 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -23,6 +23,11 @@ #include "nl80211.h" #include "reg.h" +static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info); +static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info); + /* the netlink family */ static struct genl_family nl80211_fam = { .id = GENL_ID_GENERATE, /* don't bother with a hardcoded ID */ @@ -31,6 +36,8 @@ static struct genl_family nl80211_fam = { .version = 1, /* no particular meaning now */ .maxattr = NL80211_ATTR_MAX, .netnsok = true, + .pre_doit = nl80211_pre_doit, + .post_doit = nl80211_post_doit, }; /* internal helper: get rdev and dev */ @@ -704,28 +711,18 @@ static int nl80211_dump_wiphy(struct sk_buff *skb, struct netlink_callback *cb) static int nl80211_get_wiphy(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; - struct cfg80211_registered_device *dev; - - dev = cfg80211_get_dev_from_info(info); - if (IS_ERR(dev)) - return PTR_ERR(dev); + struct cfg80211_registered_device *dev = info->user_ptr[0]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - goto out_err; - - if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0) - goto out_free; + return -ENOMEM; - cfg80211_unlock_rdev(dev); + if (nl80211_send_wiphy(msg, info->snd_pid, info->snd_seq, 0, dev) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } return genlmsg_reply(msg, info); - - out_free: - nlmsg_free(msg); - out_err: - cfg80211_unlock_rdev(dev); - return -ENOBUFS; } static const struct nla_policy txq_params_policy[NL80211_TXQ_ATTR_MAX + 1] = { @@ -814,24 +811,10 @@ static int __nl80211_set_channel(struct cfg80211_registered_device *rdev, static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *netdev; - int result; - - rtnl_lock(); - - result = get_rdev_dev_by_info_ifindex(info, &rdev, &netdev); - if (result) - goto unlock_rtnl; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *netdev = info->user_ptr[1]; - result = __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info); - - dev_put(netdev); - cfg80211_unlock_rdev(rdev); - unlock_rtnl: - rtnl_unlock(); - - return result; + return __nl80211_set_channel(rdev, netdev->ieee80211_ptr, info); } static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) @@ -893,8 +876,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) u32 frag_threshold = 0, rts_threshold = 0; u8 coverage_class = 0; - rtnl_lock(); - /* * Try to find the wiphy and netdev. Normally this * function shouldn't need the netdev, but this is @@ -921,8 +902,7 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) rdev = __cfg80211_rdev_from_info(info); if (IS_ERR(rdev)) { mutex_unlock(&cfg80211_mutex); - result = PTR_ERR(rdev); - goto unlock; + return PTR_ERR(rdev); } wdev = NULL; netdev = NULL; @@ -1104,8 +1084,6 @@ static int nl80211_set_wiphy(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&rdev->mtx); if (netdev) dev_put(netdev); - unlock: - rtnl_unlock(); return result; } @@ -1185,33 +1163,20 @@ static int nl80211_dump_interface(struct sk_buff *skb, struct netlink_callback * static int nl80211_get_interface(struct sk_buff *skb, struct genl_info *info) { struct sk_buff *msg; - struct cfg80211_registered_device *dev; - struct net_device *netdev; - int err; - - err = get_rdev_dev_by_info_ifindex(info, &dev, &netdev); - if (err) - return err; + struct cfg80211_registered_device *dev = info->user_ptr[0]; + struct net_device *netdev = info->user_ptr[1]; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - goto out_err; + return -ENOMEM; if (nl80211_send_iface(msg, info->snd_pid, info->snd_seq, 0, - dev, netdev) < 0) - goto out_free; - - dev_put(netdev); - cfg80211_unlock_rdev(dev); + dev, netdev) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } return genlmsg_reply(msg, info); - - out_free: - nlmsg_free(msg); - out_err: - dev_put(netdev); - cfg80211_unlock_rdev(dev); - return -ENOBUFS; } static const struct nla_policy mntr_flags_policy[NL80211_MNTR_FLAG_MAX + 1] = { @@ -1271,39 +1236,29 @@ static int nl80211_valid_4addr(struct cfg80211_registered_device *rdev, static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct vif_params params; int err; enum nl80211_iftype otype, ntype; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; u32 _flags, *flags = NULL; bool change = false; memset(¶ms, 0, sizeof(params)); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - otype = ntype = dev->ieee80211_ptr->iftype; if (info->attrs[NL80211_ATTR_IFTYPE]) { ntype = nla_get_u32(info->attrs[NL80211_ATTR_IFTYPE]); if (otype != ntype) change = true; - if (ntype > NL80211_IFTYPE_MAX) { - err = -EINVAL; - goto unlock; - } + if (ntype > NL80211_IFTYPE_MAX) + return -EINVAL; } if (info->attrs[NL80211_ATTR_MESH_ID]) { - if (ntype != NL80211_IFTYPE_MESH_POINT) { - err = -EINVAL; - goto unlock; - } + if (ntype != NL80211_IFTYPE_MESH_POINT) + return -EINVAL; params.mesh_id = nla_data(info->attrs[NL80211_ATTR_MESH_ID]); params.mesh_id_len = nla_len(info->attrs[NL80211_ATTR_MESH_ID]); change = true; @@ -1314,20 +1269,18 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) change = true; err = nl80211_valid_4addr(rdev, dev, params.use_4addr, ntype); if (err) - goto unlock; + return err; } else { params.use_4addr = -1; } if (info->attrs[NL80211_ATTR_MNTR_FLAGS]) { - if (ntype != NL80211_IFTYPE_MONITOR) { - err = -EINVAL; - goto unlock; - } + if (ntype != NL80211_IFTYPE_MONITOR) + return -EINVAL; err = parse_monitor_flags(info->attrs[NL80211_ATTR_MNTR_FLAGS], &_flags); if (err) - goto unlock; + return err; flags = &_flags; change = true; @@ -1341,17 +1294,12 @@ static int nl80211_set_interface(struct sk_buff *skb, struct genl_info *info) if (!err && params.use_4addr != -1) dev->ieee80211_ptr->use_4addr = params.use_4addr; - unlock: - dev_put(dev); - cfg80211_unlock_rdev(rdev); - unlock_rtnl: - rtnl_unlock(); return err; } static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct vif_params params; int err; enum nl80211_iftype type = NL80211_IFTYPE_UNSPECIFIED; @@ -1368,19 +1316,9 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - rtnl_lock(); - - rdev = cfg80211_get_dev_from_info(info); - if (IS_ERR(rdev)) { - err = PTR_ERR(rdev); - goto unlock_rtnl; - } - if (!rdev->ops->add_virtual_intf || - !(rdev->wiphy.interface_modes & (1 << type))) { - err = -EOPNOTSUPP; - goto unlock; - } + !(rdev->wiphy.interface_modes & (1 << type))) + return -EOPNOTSUPP; if (type == NL80211_IFTYPE_MESH_POINT && info->attrs[NL80211_ATTR_MESH_ID]) { @@ -1392,7 +1330,7 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) params.use_4addr = !!nla_get_u8(info->attrs[NL80211_ATTR_4ADDR]); err = nl80211_valid_4addr(rdev, NULL, params.use_4addr, type); if (err) - goto unlock; + return err; } err = parse_monitor_flags(type == NL80211_IFTYPE_MONITOR ? @@ -1402,38 +1340,18 @@ static int nl80211_new_interface(struct sk_buff *skb, struct genl_info *info) nla_data(info->attrs[NL80211_ATTR_IFNAME]), type, err ? NULL : &flags, ¶ms); - unlock: - cfg80211_unlock_rdev(rdev); - unlock_rtnl: - rtnl_unlock(); return err; } static int nl80211_del_interface(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; - - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->del_virtual_intf) { - err = -EOPNOTSUPP; - goto out; - } + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; - err = rdev->ops->del_virtual_intf(&rdev->wiphy, dev); + if (!rdev->ops->del_virtual_intf) + return -EOPNOTSUPP; - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); - return err; + return rdev->ops->del_virtual_intf(&rdev->wiphy, dev); } struct get_key_cookie { @@ -1486,9 +1404,9 @@ static void get_key_callback(void *c, struct key_params *params) static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; u8 key_idx = 0; u8 *mac_addr = NULL; struct get_key_cookie cookie = { @@ -1506,30 +1424,17 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->get_key) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->get_key) + return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } + if (!msg) + return -ENOMEM; hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, NL80211_CMD_NEW_KEY); - - if (IS_ERR(hdr)) { - err = PTR_ERR(hdr); - goto free_msg; - } + if (IS_ERR(hdr)) + return PTR_ERR(hdr); cookie.msg = msg; cookie.idx = key_idx; @@ -1549,28 +1454,21 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) goto nla_put_failure; genlmsg_end(msg, hdr); - err = genlmsg_reply(msg, info); - goto out; + return genlmsg_reply(msg, info); nla_put_failure: err = -ENOBUFS; free_msg: nlmsg_free(msg); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); - return err; } static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct key_parse key; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; int (*func)(struct wiphy *wiphy, struct net_device *netdev, u8 key_index); @@ -1585,21 +1483,13 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) if (!key.def && !key.defmgmt) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - if (key.def) func = rdev->ops->set_default_key; else func = rdev->ops->set_default_mgmt_key; - if (!func) { - err = -EOPNOTSUPP; - goto out; - } + if (!func) + return -EOPNOTSUPP; wdev_lock(dev->ieee80211_ptr); err = nl80211_key_allowed(dev->ieee80211_ptr); @@ -1616,21 +1506,14 @@ static int nl80211_set_key(struct sk_buff *skb, struct genl_info *info) #endif wdev_unlock(dev->ieee80211_ptr); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - - unlock_rtnl: - rtnl_unlock(); - return err; } static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; struct key_parse key; u8 *mac_addr = NULL; @@ -1644,21 +1527,11 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->add_key) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->add_key) + return -EOPNOTSUPP; - if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, mac_addr)) { - err = -EINVAL; - goto out; - } + if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, mac_addr)) + return -EINVAL; wdev_lock(dev->ieee80211_ptr); err = nl80211_key_allowed(dev->ieee80211_ptr); @@ -1667,20 +1540,14 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) mac_addr, &key.p); wdev_unlock(dev->ieee80211_ptr); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); - return err; } static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; u8 *mac_addr = NULL; struct key_parse key; @@ -1691,16 +1558,8 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->del_key) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->del_key) + return -EOPNOTSUPP; wdev_lock(dev->ieee80211_ptr); err = nl80211_key_allowed(dev->ieee80211_ptr); @@ -1717,13 +1576,6 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) #endif wdev_unlock(dev->ieee80211_ptr); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - - unlock_rtnl: - rtnl_unlock(); - return err; } @@ -1731,36 +1583,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info) { int (*call)(struct wiphy *wiphy, struct net_device *dev, struct beacon_parameters *info); - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct beacon_parameters params; int haveinfo = 0; if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_BEACON_TAIL])) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; switch (info->genlhdr->cmd) { case NL80211_CMD_NEW_BEACON: /* these are required for NEW_BEACON */ if (!info->attrs[NL80211_ATTR_BEACON_INTERVAL] || !info->attrs[NL80211_ATTR_DTIM_PERIOD] || - !info->attrs[NL80211_ATTR_BEACON_HEAD]) { - err = -EINVAL; - goto out; - } + !info->attrs[NL80211_ATTR_BEACON_HEAD]) + return -EINVAL; call = rdev->ops->add_beacon; break; @@ -1769,14 +1610,11 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info) break; default: WARN_ON(1); - err = -EOPNOTSUPP; - goto out; + return -EOPNOTSUPP; } - if (!call) { - err = -EOPNOTSUPP; - goto out; - } + if (!call) + return -EOPNOTSUPP; memset(¶ms, 0, sizeof(params)); @@ -1806,53 +1644,25 @@ static int nl80211_addset_beacon(struct sk_buff *skb, struct genl_info *info) haveinfo = 1; } - if (!haveinfo) { - err = -EINVAL; - goto out; - } - - err = call(&rdev->wiphy, dev, ¶ms); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); + if (!haveinfo) + return -EINVAL; - return err; + return call(&rdev->wiphy, dev, ¶ms); } static int nl80211_del_beacon(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; - - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; - if (!rdev->ops->del_beacon) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->del_beacon) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EOPNOTSUPP; - goto out; - } - err = rdev->ops->del_beacon(&rdev->wiphy, dev); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; - return err; + return rdev->ops->del_beacon(&rdev->wiphy, dev); } static const struct nla_policy sta_flags_policy[NL80211_STA_FLAG_MAX + 1] = { @@ -2049,12 +1859,12 @@ static int nl80211_dump_station(struct sk_buff *skb, static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct station_info sinfo; struct sk_buff *msg; u8 *mac_addr = NULL; + int err; memset(&sinfo, 0, sizeof(sinfo)); @@ -2063,41 +1873,24 @@ static int nl80211_get_station(struct sk_buff *skb, struct genl_info *info) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); + if (!rdev->ops->get_station) + return -EOPNOTSUPP; - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo); if (err) - goto out_rtnl; - - if (!rdev->ops->get_station) { - err = -EOPNOTSUPP; - goto out; - } - - err = rdev->ops->get_station(&rdev->wiphy, dev, mac_addr, &sinfo); - if (err) - goto out; + return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - goto out; + return -ENOMEM; if (nl80211_send_station(msg, info->snd_pid, info->snd_seq, 0, - dev, mac_addr, &sinfo) < 0) - goto out_free; - - err = genlmsg_reply(msg, info); - goto out; - - out_free: - nlmsg_free(msg); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + dev, mac_addr, &sinfo) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } - return err; + return genlmsg_reply(msg, info); } /* @@ -2127,9 +1920,9 @@ static int get_vlan(struct genl_info *info, static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; struct station_parameters params; u8 *mac_addr = NULL; @@ -2167,12 +1960,6 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) params.plink_action = nla_get_u8(info->attrs[NL80211_ATTR_STA_PLINK_ACTION]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - err = get_vlan(info, rdev, ¶ms.vlan); if (err) goto out; @@ -2234,19 +2021,15 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info) out: if (params.vlan) dev_put(params.vlan); - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); return err; } static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; struct station_parameters params; u8 *mac_addr = NULL; @@ -2283,18 +2066,10 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) if (parse_station_flags(info, ¶ms)) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EINVAL; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EINVAL; err = get_vlan(info, rdev, ¶ms.vlan); if (err) @@ -2318,52 +2093,28 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) out: if (params.vlan) dev_put(params.vlan); - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); - return err; } static int nl80211_del_station(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u8 *mac_addr = NULL; if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EINVAL; - goto out; - } - - if (!rdev->ops->del_station) { - err = -EOPNOTSUPP; - goto out; - } - - err = rdev->ops->del_station(&rdev->wiphy, dev, mac_addr); + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EINVAL; - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + if (!rdev->ops->del_station) + return -EOPNOTSUPP; - return err; + return rdev->ops->del_station(&rdev->wiphy, dev, mac_addr); } static int nl80211_send_mpath(struct sk_buff *msg, u32 pid, u32 seq, @@ -2490,9 +2241,9 @@ static int nl80211_dump_mpath(struct sk_buff *skb, static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; struct mpath_info pinfo; struct sk_buff *msg; u8 *dst = NULL; @@ -2505,53 +2256,33 @@ static int nl80211_get_mpath(struct sk_buff *skb, struct genl_info *info) dst = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->get_mpath) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->get_mpath) + return -EOPNOTSUPP; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { - err = -EOPNOTSUPP; - goto out; - } + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; err = rdev->ops->get_mpath(&rdev->wiphy, dev, dst, next_hop, &pinfo); if (err) - goto out; + return err; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!msg) - goto out; + return -ENOMEM; if (nl80211_send_mpath(msg, info->snd_pid, info->snd_seq, 0, - dev, dst, next_hop, &pinfo) < 0) - goto out_free; - - err = genlmsg_reply(msg, info); - goto out; - - out_free: - nlmsg_free(msg); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + dev, dst, next_hop, &pinfo) < 0) { + nlmsg_free(msg); + return -ENOBUFS; + } - return err; + return genlmsg_reply(msg, info); } static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u8 *dst = NULL; u8 *next_hop = NULL; @@ -2564,42 +2295,22 @@ static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info) dst = nla_data(info->attrs[NL80211_ATTR_MAC]); next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->change_mpath) { - err = -EOPNOTSUPP; - goto out; - } - - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { - err = -EOPNOTSUPP; - goto out; - } - - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!rdev->ops->change_mpath) + return -EOPNOTSUPP; - err = rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + if (!netif_running(dev)) + return -ENETDOWN; - return err; + return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); } + static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u8 *dst = NULL; u8 *next_hop = NULL; @@ -2612,75 +2323,37 @@ static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info) dst = nla_data(info->attrs[NL80211_ATTR_MAC]); next_hop = nla_data(info->attrs[NL80211_ATTR_MPATH_NEXT_HOP]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->add_mpath) { - err = -EOPNOTSUPP; - goto out; - } - - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) { - err = -EOPNOTSUPP; - goto out; - } - - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!rdev->ops->add_mpath) + return -EOPNOTSUPP; - err = rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) + return -EOPNOTSUPP; - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + if (!netif_running(dev)) + return -ENETDOWN; - return err; + return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); } static int nl80211_del_mpath(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u8 *dst = NULL; if (info->attrs[NL80211_ATTR_MAC]) dst = nla_data(info->attrs[NL80211_ATTR_MAC]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->del_mpath) { - err = -EOPNOTSUPP; - goto out; - } - - err = rdev->ops->del_mpath(&rdev->wiphy, dev, dst); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + if (!rdev->ops->del_mpath) + return -EOPNOTSUPP; - return err; + return rdev->ops->del_mpath(&rdev->wiphy, dev, dst); } static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct bss_parameters params; memset(¶ms, 0, sizeof(params)); @@ -2708,32 +2381,14 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_AP_ISOLATE]) params.ap_isolate = !!nla_get_u8(info->attrs[NL80211_ATTR_AP_ISOLATE]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->change_bss) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->change_bss) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EOPNOTSUPP; - goto out; - } - - err = rdev->ops->change_bss(&rdev->wiphy, dev, ¶ms); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; - return err; + return rdev->ops->change_bss(&rdev->wiphy, dev, ¶ms); } static const struct nla_policy reg_rule_policy[NL80211_REG_RULE_ATTR_MAX + 1] = { @@ -2812,37 +2467,26 @@ static int nl80211_req_set_reg(struct sk_buff *skb, struct genl_info *info) static int nl80211_get_mesh_params(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct mesh_config cur_params; int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; void *hdr; struct nlattr *pinfoattr; struct sk_buff *msg; - rtnl_lock(); - - /* Look up our device */ - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->get_mesh_params) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->get_mesh_params) + return -EOPNOTSUPP; /* Get the mesh params */ err = rdev->ops->get_mesh_params(&rdev->wiphy, dev, &cur_params); if (err) - goto out; + return err; /* Draw up a netlink message to send back */ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOBUFS; - goto out; - } + if (!msg) + return -ENOMEM; hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, NL80211_CMD_GET_MESH_PARAMS); if (!hdr) @@ -2881,22 +2525,13 @@ static int nl80211_get_mesh_params(struct sk_buff *skb, cur_params.dot11MeshHWMPRootMode); nla_nest_end(msg, pinfoattr); genlmsg_end(msg, hdr); - err = genlmsg_reply(msg, info); - goto out; + return genlmsg_reply(msg, info); nla_put_failure: genlmsg_cancel(msg, hdr); nlmsg_free(msg); - err = -EMSGSIZE; - out: - /* Cleanup */ - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); - - return err; -} + return -ENOBUFS; +} #define FILL_IN_MESH_PARAM_IF_SET(table, cfg, param, mask, attr_num, nla_fn) \ do {\ @@ -2925,10 +2560,9 @@ static const struct nla_policy nl80211_meshconf_params_policy[NL80211_MESHCONF_A static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) { - int err; u32 mask; - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct mesh_config cfg; struct nlattr *tb[NL80211_MESHCONF_ATTR_MAX + 1]; struct nlattr *parent_attr; @@ -2940,16 +2574,8 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) parent_attr, nl80211_meshconf_params_policy)) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - - if (!rdev->ops->set_mesh_params) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->set_mesh_params) + return -EOPNOTSUPP; /* This makes sure that there aren't more than 32 mesh config * parameters (otherwise our bitfield scheme would not work.) */ @@ -2995,16 +2621,7 @@ static int nl80211_set_mesh_params(struct sk_buff *skb, struct genl_info *info) nla_get_u8); /* Apply changes */ - err = rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask); - - out: - /* cleanup */ - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); - - return err; + return rdev->ops->set_mesh_params(&rdev->wiphy, dev, &cfg, mask); } #undef FILL_IN_MESH_PARAM_IF_SET @@ -3187,8 +2804,8 @@ static int validate_scan_freqs(struct nlattr *freqs) static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct cfg80211_scan_request *request; struct cfg80211_ssid *ssid; struct ieee80211_channel *channel; @@ -3201,36 +2818,22 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!is_valid_ie_attr(info->attrs[NL80211_ATTR_IE])) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - wiphy = &rdev->wiphy; - if (!rdev->ops->scan) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->scan) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; - if (rdev->scan_req) { - err = -EBUSY; - goto out; - } + if (rdev->scan_req) + return -EBUSY; if (info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]) { n_channels = validate_scan_freqs( info->attrs[NL80211_ATTR_SCAN_FREQUENCIES]); - if (!n_channels) { - err = -EINVAL; - goto out; - } + if (!n_channels) + return -EINVAL; } else { n_channels = 0; @@ -3243,29 +2846,23 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) nla_for_each_nested(attr, info->attrs[NL80211_ATTR_SCAN_SSIDS], tmp) n_ssids++; - if (n_ssids > wiphy->max_scan_ssids) { - err = -EINVAL; - goto out; - } + if (n_ssids > wiphy->max_scan_ssids) + return -EINVAL; if (info->attrs[NL80211_ATTR_IE]) ie_len = nla_len(info->attrs[NL80211_ATTR_IE]); else ie_len = 0; - if (ie_len > wiphy->max_scan_ie_len) { - err = -EINVAL; - goto out; - } + if (ie_len > wiphy->max_scan_ie_len) + return -EINVAL; request = kzalloc(sizeof(*request) + sizeof(*ssid) * n_ssids + sizeof(channel) * n_channels + ie_len, GFP_KERNEL); - if (!request) { - err = -ENOMEM; - goto out; - } + if (!request) + return -ENOMEM; if (n_ssids) request->ssids = (void *)&request->channels[n_channels]; @@ -3353,18 +2950,11 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!err) { nl80211_send_scan_start(rdev, dev); dev_hold(dev); - } - + } else { out_free: - if (err) { rdev->scan_req = NULL; kfree(request); } - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); return err; } @@ -3643,8 +3233,8 @@ static bool nl80211_valid_cipher_suite(u32 cipher) static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct ieee80211_channel *chan; const u8 *bssid, *ssid, *ie = NULL; int err, ssid_len, ie_len = 0; @@ -3686,12 +3276,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) key.p.key = NULL; } - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - if (key.idx >= 0) { int i; bool ok = false; @@ -3701,35 +3285,25 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) break; } } - if (!ok) { - err = -EINVAL; - goto out; - } + if (!ok) + return -EINVAL; } - if (!rdev->ops->auth) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->auth) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); chan = ieee80211_get_channel(&rdev->wiphy, nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); - if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) { - err = -EINVAL; - goto out; - } + if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) + return -EINVAL; ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); @@ -3740,24 +3314,15 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) } auth_type = nla_get_u32(info->attrs[NL80211_ATTR_AUTH_TYPE]); - if (!nl80211_valid_auth_type(auth_type)) { - err = -EINVAL; - goto out; - } + if (!nl80211_valid_auth_type(auth_type)) + return -EINVAL; local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; - err = cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, - ssid, ssid_len, ie, ie_len, - key.p.key, key.p.key_len, key.idx, - local_state_change); - -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); - return err; + return cfg80211_mlme_auth(rdev, dev, chan, auth_type, bssid, + ssid, ssid_len, ie, ie_len, + key.p.key, key.p.key_len, key.idx, + local_state_change); } static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, @@ -3841,8 +3406,8 @@ static int nl80211_crypto_settings(struct cfg80211_registered_device *rdev, static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct cfg80211_crypto_settings crypto; struct ieee80211_channel *chan; const u8 *bssid, *ssid, *ie = NULL, *prev_bssid = NULL; @@ -3857,36 +3422,22 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) !info->attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->assoc) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->assoc) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); chan = ieee80211_get_channel(&rdev->wiphy, nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); - if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) { - err = -EINVAL; - goto out; - } + if (!chan || (chan->flags & IEEE80211_CHAN_DISABLED)) + return -EINVAL; ssid = nla_data(info->attrs[NL80211_ATTR_SSID]); ssid_len = nla_len(info->attrs[NL80211_ATTR_SSID]); @@ -3901,10 +3452,8 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) nla_get_u32(info->attrs[NL80211_ATTR_USE_MFP]); if (mfp == NL80211_MFP_REQUIRED) use_mfp = true; - else if (mfp != NL80211_MFP_NO) { - err = -EINVAL; - goto out; - } + else if (mfp != NL80211_MFP_NO) + return -EINVAL; } if (info->attrs[NL80211_ATTR_PREV_BSSID]) @@ -3916,20 +3465,15 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) ssid, ssid_len, ie, ie_len, use_mfp, &crypto); -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); return err; } static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; const u8 *ie = NULL, *bssid; - int err, ie_len = 0; + int ie_len = 0; u16 reason_code; bool local_state_change; @@ -3942,35 +3486,22 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_REASON_CODE]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->deauth) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->deauth) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); if (reason_code == 0) { /* Reason Code 0 is reserved */ - err = -EINVAL; - goto out; + return -EINVAL; } if (info->attrs[NL80211_ATTR_IE]) { @@ -3980,23 +3511,16 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; - err = cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code, - local_state_change); - -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); - return err; + return cfg80211_mlme_deauth(rdev, dev, bssid, ie, ie_len, reason_code, + local_state_change); } static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; const u8 *ie = NULL, *bssid; - int err, ie_len = 0; + int ie_len = 0; u16 reason_code; bool local_state_change; @@ -4009,35 +3533,22 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_REASON_CODE]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->disassoc) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->disassoc) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); if (reason_code == 0) { /* Reason Code 0 is reserved */ - err = -EINVAL; - goto out; + return -EINVAL; } if (info->attrs[NL80211_ATTR_IE]) { @@ -4047,21 +3558,14 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) local_state_change = !!info->attrs[NL80211_ATTR_LOCAL_STATE_CHANGE]; - err = cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code, - local_state_change); - -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); - return err; + return cfg80211_mlme_disassoc(rdev, dev, bssid, ie, ie_len, reason_code, + local_state_change); } static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct cfg80211_ibss_params ibss; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; @@ -4086,26 +3590,14 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->join_ibss) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->join_ibss) + return -EOPNOTSUPP; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { - err = -EOPNOTSUPP; - goto out; - } + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; wiphy = &rdev->wiphy; @@ -4123,24 +3615,12 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); if (!ibss.channel || ibss.channel->flags & IEEE80211_CHAN_NO_IBSS || - ibss.channel->flags & IEEE80211_CHAN_DISABLED) { - err = -EINVAL; - goto out; - } + ibss.channel->flags & IEEE80211_CHAN_DISABLED) + return -EINVAL; ibss.channel_fixed = !!info->attrs[NL80211_ATTR_FREQ_FIXED]; ibss.privacy = !!info->attrs[NL80211_ATTR_PRIVACY]; - if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) { - connkeys = nl80211_parse_connkeys(rdev, - info->attrs[NL80211_ATTR_KEYS]); - if (IS_ERR(connkeys)) { - err = PTR_ERR(connkeys); - connkeys = NULL; - goto out; - } - } - if (info->attrs[NL80211_ATTR_BSS_BASIC_RATES]) { u8 *rates = nla_data(info->attrs[NL80211_ATTR_BSS_BASIC_RATES]); @@ -4150,10 +3630,8 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) wiphy->bands[ibss.channel->band]; int i, j; - if (n_rates == 0) { - err = -EINVAL; - goto out; - } + if (n_rates == 0) + return -EINVAL; for (i = 0; i < n_rates; i++) { int rate = (rates[i] & 0x7f) * 5; @@ -4166,60 +3644,39 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) break; } } - if (!found) { - err = -EINVAL; - goto out; - } + if (!found) + return -EINVAL; } } - err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); + if (ibss.privacy && info->attrs[NL80211_ATTR_KEYS]) { + connkeys = nl80211_parse_connkeys(rdev, + info->attrs[NL80211_ATTR_KEYS]); + if (IS_ERR(connkeys)) + return PTR_ERR(connkeys); + } -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: + err = cfg80211_join_ibss(rdev, dev, &ibss, connkeys); if (err) kfree(connkeys); - rtnl_unlock(); return err; } static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; - int err; - - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; - if (!rdev->ops->leave_ibss) { - err = -EOPNOTSUPP; - goto out; - } - - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->leave_ibss) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) + return -EOPNOTSUPP; - err = cfg80211_leave_ibss(rdev, dev, false); + if (!netif_running(dev)) + return -ENETDOWN; -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); - return err; + return cfg80211_leave_ibss(rdev, dev, false); } #ifdef CONFIG_NL80211_TESTMODE @@ -4229,20 +3686,12 @@ static struct genl_multicast_group nl80211_testmode_mcgrp = { static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int err; if (!info->attrs[NL80211_ATTR_TESTDATA]) return -EINVAL; - rtnl_lock(); - - rdev = cfg80211_get_dev_from_info(info); - if (IS_ERR(rdev)) { - err = PTR_ERR(rdev); - goto unlock_rtnl; - } - err = -EOPNOTSUPP; if (rdev->ops->testmode_cmd) { rdev->testmode_info = info; @@ -4252,10 +3701,6 @@ static int nl80211_testmode_do(struct sk_buff *skb, struct genl_info *info) rdev->testmode_info = NULL; } - cfg80211_unlock_rdev(rdev); - - unlock_rtnl: - rtnl_unlock(); return err; } @@ -4346,8 +3791,8 @@ EXPORT_SYMBOL(cfg80211_testmode_event); static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct cfg80211_connect_params connect; struct wiphy *wiphy; struct cfg80211_cached_keys *connkeys = NULL; @@ -4376,22 +3821,13 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) NL80211_MAX_NR_CIPHER_SUITES); if (err) return err; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; wiphy = &rdev->wiphy; @@ -4410,39 +3846,27 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) ieee80211_get_channel(wiphy, nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); if (!connect.channel || - connect.channel->flags & IEEE80211_CHAN_DISABLED) { - err = -EINVAL; - goto out; - } + connect.channel->flags & IEEE80211_CHAN_DISABLED) + return -EINVAL; } if (connect.privacy && info->attrs[NL80211_ATTR_KEYS]) { connkeys = nl80211_parse_connkeys(rdev, info->attrs[NL80211_ATTR_KEYS]); - if (IS_ERR(connkeys)) { - err = PTR_ERR(connkeys); - connkeys = NULL; - goto out; - } + if (IS_ERR(connkeys)) + return PTR_ERR(connkeys); } err = cfg80211_connect(rdev, dev, &connect, connkeys); - -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: if (err) kfree(connkeys); - rtnl_unlock(); return err; } static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; - int err; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u16 reason; if (!info->attrs[NL80211_ATTR_REASON_CODE]) @@ -4453,36 +3877,19 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) if (reason == 0) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } - - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - err = cfg80211_disconnect(rdev, dev, reason, true); + if (!netif_running(dev)) + return -ENETDOWN; -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); - return err; + return cfg80211_disconnect(rdev, dev, reason, true); } static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net *net; int err; u32 pid; @@ -4492,43 +3899,26 @@ static int nl80211_wiphy_netns(struct sk_buff *skb, struct genl_info *info) pid = nla_get_u32(info->attrs[NL80211_ATTR_PID]); - rtnl_lock(); - - rdev = cfg80211_get_dev_from_info(info); - if (IS_ERR(rdev)) { - err = PTR_ERR(rdev); - goto out_rtnl; - } - net = get_net_ns_by_pid(pid); - if (IS_ERR(net)) { - err = PTR_ERR(net); - goto out; - } + if (IS_ERR(net)) + return PTR_ERR(net); err = 0; /* check if anything to do */ - if (net_eq(wiphy_net(&rdev->wiphy), net)) - goto out_put_net; + if (!net_eq(wiphy_net(&rdev->wiphy), net)) + err = cfg80211_switch_netns(rdev, net); - err = cfg80211_switch_netns(rdev, net); - out_put_net: put_net(net); - out: - cfg80211_unlock_rdev(rdev); - out_rtnl: - rtnl_unlock(); return err; } static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; int (*rdev_ops)(struct wiphy *wiphy, struct net_device *dev, struct cfg80211_pmksa *pmksa) = NULL; - int err; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; struct cfg80211_pmksa pmksa; memset(&pmksa, 0, sizeof(struct cfg80211_pmksa)); @@ -4539,20 +3929,12 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) if (!info->attrs[NL80211_ATTR_PMKID]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; - pmksa.pmkid = nla_data(info->attrs[NL80211_ATTR_PMKID]); pmksa.bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; switch (info->genlhdr->cmd) { case NL80211_CMD_SET_PMKSA: @@ -4566,62 +3948,32 @@ static int nl80211_setdel_pmksa(struct sk_buff *skb, struct genl_info *info) break; } - if (!rdev_ops) { - err = -EOPNOTSUPP; - goto out; - } - - err = rdev_ops(&rdev->wiphy, dev, &pmksa); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + if (!rdev_ops) + return -EOPNOTSUPP; - return err; + return rdev_ops(&rdev->wiphy, dev, &pmksa); } static int nl80211_flush_pmksa(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - int err; - struct net_device *dev; - - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto out_rtnl; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto out; - } - - if (!rdev->ops->flush_pmksa) { - err = -EOPNOTSUPP; - goto out; - } - - err = rdev->ops->flush_pmksa(&rdev->wiphy, dev); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - out_rtnl: - rtnl_unlock(); + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - return err; + if (!rdev->ops->flush_pmksa) + return -EOPNOTSUPP; + return rdev->ops->flush_pmksa(&rdev->wiphy, dev); } static int nl80211_remain_on_channel(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct ieee80211_channel *chan; struct sk_buff *msg; void *hdr; @@ -4643,21 +3995,11 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, if (!duration || !msecs_to_jiffies(duration) || duration > 5000) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->remain_on_channel) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->remain_on_channel) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { channel_type = nla_get_u32( @@ -4665,24 +4007,18 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, if (channel_type != NL80211_CHAN_NO_HT && channel_type != NL80211_CHAN_HT20 && channel_type != NL80211_CHAN_HT40PLUS && - channel_type != NL80211_CHAN_HT40MINUS) { - err = -EINVAL; - goto out; - } + channel_type != NL80211_CHAN_HT40MINUS) + return -EINVAL; } freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); chan = rdev_freq_to_chan(rdev, freq, channel_type); - if (chan == NULL) { - err = -EINVAL; - goto out; - } + if (chan == NULL) + return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } + if (!msg) + return -ENOMEM; hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, NL80211_CMD_REMAIN_ON_CHANNEL); @@ -4701,58 +4037,35 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); genlmsg_end(msg, hdr); - err = genlmsg_reply(msg, info); - goto out; + + return genlmsg_reply(msg, info); nla_put_failure: err = -ENOBUFS; free_msg: nlmsg_free(msg); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); return err; } static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u64 cookie; - int err; if (!info->attrs[NL80211_ATTR_COOKIE]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->cancel_remain_on_channel) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->cancel_remain_on_channel) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); - err = rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); - - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); - return err; + return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); } static u32 rateset_to_mask(struct ieee80211_supported_band *sband, @@ -4788,26 +4101,18 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, struct genl_info *info) { struct nlattr *tb[NL80211_TXRATE_MAX + 1]; - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct cfg80211_bitrate_mask mask; - int err, rem, i; - struct net_device *dev; + int rem, i; + struct net_device *dev = info->user_ptr[1]; struct nlattr *tx_rates; struct ieee80211_supported_band *sband; if (info->attrs[NL80211_ATTR_TX_RATES] == NULL) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->set_bitrate_mask) { - err = -EOPNOTSUPP; - goto unlock; - } + if (!rdev->ops->set_bitrate_mask) + return -EOPNOTSUPP; memset(&mask, 0, sizeof(mask)); /* Default to all rates enabled */ @@ -4824,15 +4129,11 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, nla_for_each_nested(tx_rates, info->attrs[NL80211_ATTR_TX_RATES], rem) { enum ieee80211_band band = nla_type(tx_rates); - if (band < 0 || band >= IEEE80211_NUM_BANDS) { - err = -EINVAL; - goto unlock; - } + if (band < 0 || band >= IEEE80211_NUM_BANDS) + return -EINVAL; sband = rdev->wiphy.bands[band]; - if (sband == NULL) { - err = -EINVAL; - goto unlock; - } + if (sband == NULL) + return -EINVAL; nla_parse(tb, NL80211_TXRATE_MAX, nla_data(tx_rates), nla_len(tx_rates), nl80211_txattr_policy); if (tb[NL80211_TXRATE_LEGACY]) { @@ -4840,29 +4141,19 @@ static int nl80211_set_tx_bitrate_mask(struct sk_buff *skb, sband, nla_data(tb[NL80211_TXRATE_LEGACY]), nla_len(tb[NL80211_TXRATE_LEGACY])); - if (mask.control[band].legacy == 0) { - err = -EINVAL; - goto unlock; - } + if (mask.control[band].legacy == 0) + return -EINVAL; } } - err = rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask); - - unlock: - dev_put(dev); - cfg80211_unlock_rdev(rdev); - unlock_rtnl: - rtnl_unlock(); - return err; + return rdev->ops->set_bitrate_mask(&rdev->wiphy, dev, NULL, &mask); } static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; u16 frame_type = IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_ACTION; - int err; if (!info->attrs[NL80211_ATTR_FRAME_MATCH]) return -EINVAL; @@ -4870,44 +4161,28 @@ static int nl80211_register_mgmt(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_FRAME_TYPE]) frame_type = nla_get_u16(info->attrs[NL80211_ATTR_FRAME_TYPE]); - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; /* not much point in registering if we can't reply */ - if (!rdev->ops->mgmt_tx) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->mgmt_tx) + return -EOPNOTSUPP; - err = cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid, + return cfg80211_mlme_register_mgmt(dev->ieee80211_ptr, info->snd_pid, frame_type, nla_data(info->attrs[NL80211_ATTR_FRAME_MATCH]), nla_len(info->attrs[NL80211_ATTR_FRAME_MATCH])); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); - return err; } static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; struct ieee80211_channel *chan; enum nl80211_channel_type channel_type = NL80211_CHAN_NO_HT; bool channel_type_valid = false; @@ -4921,31 +4196,19 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) !info->attrs[NL80211_ATTR_WIPHY_FREQ]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - if (!rdev->ops->mgmt_tx) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->mgmt_tx) + return -EOPNOTSUPP; if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_STATION && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP && dev->ieee80211_ptr->iftype != NL80211_IFTYPE_AP_VLAN && - dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) { - err = -EOPNOTSUPP; - goto out; - } + dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) + return -EOPNOTSUPP; - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } + if (!netif_running(dev)) + return -ENETDOWN; if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { channel_type = nla_get_u32( @@ -4953,25 +4216,19 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) if (channel_type != NL80211_CHAN_NO_HT && channel_type != NL80211_CHAN_HT20 && channel_type != NL80211_CHAN_HT40PLUS && - channel_type != NL80211_CHAN_HT40MINUS) { - err = -EINVAL; - goto out; - } + channel_type != NL80211_CHAN_HT40MINUS) + return -EINVAL; channel_type_valid = true; } freq = nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ]); chan = rdev_freq_to_chan(rdev, freq, channel_type); - if (chan == NULL) { - err = -EINVAL; - goto out; - } + if (chan == NULL) + return -EINVAL; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } + if (!msg) + return -ENOMEM; hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, NL80211_CMD_FRAME); @@ -4991,110 +4248,72 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) NLA_PUT_U64(msg, NL80211_ATTR_COOKIE, cookie); genlmsg_end(msg, hdr); - err = genlmsg_reply(msg, info); - goto out; + return genlmsg_reply(msg, info); nla_put_failure: err = -ENOBUFS; free_msg: nlmsg_free(msg); - out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); return err; } static int nl80211_set_power_save(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; u8 ps_state; bool state; int err; - if (!info->attrs[NL80211_ATTR_PS_STATE]) { - err = -EINVAL; - goto out; - } + if (!info->attrs[NL80211_ATTR_PS_STATE]) + return -EINVAL; ps_state = nla_get_u32(info->attrs[NL80211_ATTR_PS_STATE]); - if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) { - err = -EINVAL; - goto out; - } - - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; + if (ps_state != NL80211_PS_DISABLED && ps_state != NL80211_PS_ENABLED) + return -EINVAL; wdev = dev->ieee80211_ptr; - if (!rdev->ops->set_power_mgmt) { - err = -EOPNOTSUPP; - goto unlock_rdev; - } + if (!rdev->ops->set_power_mgmt) + return -EOPNOTSUPP; state = (ps_state == NL80211_PS_ENABLED) ? true : false; if (state == wdev->ps) - goto unlock_rdev; - - wdev->ps = state; - - if (rdev->ops->set_power_mgmt(wdev->wiphy, dev, wdev->ps, - wdev->ps_timeout)) - /* assume this means it's off */ - wdev->ps = false; - -unlock_rdev: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); + return 0; -out: + err = rdev->ops->set_power_mgmt(wdev->wiphy, dev, state, + wdev->ps_timeout); + if (!err) + wdev->ps = state; return err; } static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; enum nl80211_ps_state ps_state; struct wireless_dev *wdev; - struct net_device *dev; + struct net_device *dev = info->user_ptr[1]; struct sk_buff *msg; void *hdr; int err; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - wdev = dev->ieee80211_ptr; - if (!rdev->ops->set_power_mgmt) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->set_power_mgmt) + return -EOPNOTSUPP; msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (!msg) { - err = -ENOMEM; - goto out; - } + if (!msg) + return -ENOMEM; hdr = nl80211hdr_put(msg, info->snd_pid, info->snd_seq, 0, NL80211_CMD_GET_POWER_SAVE); if (!hdr) { - err = -ENOMEM; + err = -ENOBUFS; goto free_msg; } @@ -5106,22 +4325,12 @@ static int nl80211_get_power_save(struct sk_buff *skb, struct genl_info *info) NLA_PUT_U32(msg, NL80211_ATTR_PS_STATE, ps_state); genlmsg_end(msg, hdr); - err = genlmsg_reply(msg, info); - goto out; + return genlmsg_reply(msg, info); -nla_put_failure: + nla_put_failure: err = -ENOBUFS; - -free_msg: + free_msg: nlmsg_free(msg); - -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - -unlock_rtnl: - rtnl_unlock(); - return err; } @@ -5135,43 +4344,24 @@ nl80211_attr_cqm_policy[NL80211_ATTR_CQM_MAX + 1] __read_mostly = { static int nl80211_set_cqm_rssi(struct genl_info *info, s32 threshold, u32 hysteresis) { - struct cfg80211_registered_device *rdev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct wireless_dev *wdev; - struct net_device *dev; - int err; + struct net_device *dev = info->user_ptr[1]; if (threshold > 0) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - wdev = dev->ieee80211_ptr; - if (!rdev->ops->set_cqm_rssi_config) { - err = -EOPNOTSUPP; - goto unlock_rdev; - } + if (!rdev->ops->set_cqm_rssi_config) + return -EOPNOTSUPP; if (wdev->iftype != NL80211_IFTYPE_STATION && - wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) { - err = -EOPNOTSUPP; - goto unlock_rdev; - } - - err = rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev, - threshold, hysteresis); - - unlock_rdev: - cfg80211_unlock_rdev(rdev); - dev_put(dev); - unlock_rtnl: - rtnl_unlock(); + wdev->iftype != NL80211_IFTYPE_P2P_CLIENT) + return -EOPNOTSUPP; - return err; + return rdev->ops->set_cqm_rssi_config(wdev->wiphy, dev, + threshold, hysteresis); } static int nl80211_set_cqm(struct sk_buff *skb, struct genl_info *info) @@ -5205,6 +4395,54 @@ out: return err; } +#define NL80211_FLAG_NEED_WIPHY 0x01 +#define NL80211_FLAG_NEED_NETDEV 0x02 +#define NL80211_FLAG_NEED_RTNL 0x04 + +static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + struct cfg80211_registered_device *rdev; + struct net_device *dev; + int err; + bool rtnl = ops->internal_flags & NL80211_FLAG_NEED_RTNL; + + if (rtnl) + rtnl_lock(); + + if (ops->internal_flags & NL80211_FLAG_NEED_WIPHY) { + rdev = cfg80211_get_dev_from_info(info); + if (IS_ERR(rdev)) { + if (rtnl) + rtnl_unlock(); + return PTR_ERR(rdev); + } + info->user_ptr[0] = rdev; + } else if (ops->internal_flags & NL80211_FLAG_NEED_NETDEV) { + err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); + if (err) { + if (rtnl) + rtnl_unlock(); + return err; + } + info->user_ptr[0] = rdev; + info->user_ptr[1] = dev; + } + + return 0; +} + +static void nl80211_post_doit(struct genl_ops *ops, struct sk_buff *skb, + struct genl_info *info) +{ + if (info->user_ptr[0]) + cfg80211_unlock_rdev(info->user_ptr[0]); + if (info->user_ptr[1]) + dev_put(info->user_ptr[1]); + if (ops->internal_flags & NL80211_FLAG_NEED_RTNL) + rtnl_unlock(); +} + static struct genl_ops nl80211_ops[] = { { .cmd = NL80211_CMD_GET_WIPHY, @@ -5212,12 +4450,14 @@ static struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_wiphy, .policy = nl80211_policy, /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_WIPHY, }, { .cmd = NL80211_CMD_SET_WIPHY, .doit = nl80211_set_wiphy, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_INTERFACE, @@ -5225,90 +4465,119 @@ static struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_interface, .policy = nl80211_policy, /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_NETDEV, }, { .cmd = NL80211_CMD_SET_INTERFACE, .doit = nl80211_set_interface, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_INTERFACE, .doit = nl80211_new_interface, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_INTERFACE, .doit = nl80211_del_interface, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_KEY, .doit = nl80211_get_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_KEY, .doit = nl80211_set_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_KEY, .doit = nl80211_new_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_KEY, .doit = nl80211_del_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_BEACON, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .doit = nl80211_addset_beacon, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_BEACON, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .doit = nl80211_addset_beacon, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_BEACON, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, .doit = nl80211_del_beacon, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_STATION, .doit = nl80211_get_station, .dumpit = nl80211_dump_station, .policy = nl80211_policy, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_STATION, .doit = nl80211_set_station, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_STATION, .doit = nl80211_new_station, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_STATION, .doit = nl80211_del_station, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_MPATH, @@ -5316,30 +4585,40 @@ static struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MPATH, .doit = nl80211_set_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_NEW_MPATH, .doit = nl80211_new_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_MPATH, .doit = nl80211_del_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_BSS, .doit = nl80211_set_bss, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_REG, @@ -5364,18 +4643,24 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_get_mesh_params, .policy = nl80211_policy, /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_MESH_PARAMS, .doit = nl80211_set_mesh_params, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_TRIGGER_SCAN, .doit = nl80211_trigger_scan, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_SCAN, @@ -5387,36 +4672,48 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_authenticate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_ASSOCIATE, .doit = nl80211_associate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEAUTHENTICATE, .doit = nl80211_deauthenticate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DISASSOCIATE, .doit = nl80211_disassociate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_JOIN_IBSS, .doit = nl80211_join_ibss, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_LEAVE_IBSS, .doit = nl80211_leave_ibss, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, #ifdef CONFIG_NL80211_TESTMODE { @@ -5424,6 +4721,8 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_testmode_do, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, }, #endif { @@ -5431,18 +4730,24 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_connect, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DISCONNECT, .doit = nl80211_disconnect, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_WIPHY_NETNS, .doit = nl80211_wiphy_netns, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_WIPHY | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_SURVEY, @@ -5454,72 +4759,96 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_DEL_PMKSA, .doit = nl80211_setdel_pmksa, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_FLUSH_PMKSA, .doit = nl80211_flush_pmksa, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_REMAIN_ON_CHANNEL, .doit = nl80211_remain_on_channel, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_CANCEL_REMAIN_ON_CHANNEL, .doit = nl80211_cancel_remain_on_channel, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_TX_BITRATE_MASK, .doit = nl80211_set_tx_bitrate_mask, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_REGISTER_FRAME, .doit = nl80211_register_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_FRAME, .doit = nl80211_tx_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_POWER_SAVE, .doit = nl80211_set_power_save, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_GET_POWER_SAVE, .doit = nl80211_get_power_save, .policy = nl80211_policy, /* can be retrieved by unprivileged users */ + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_CQM, .doit = nl80211_set_cqm, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_CHANNEL, .doit = nl80211_set_channel, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, { .cmd = NL80211_CMD_SET_WDS_PEER, -- cgit v1.1 From 41265714810e20d91ddd7aaca30043b0f12190ad Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Oct 2010 21:14:05 +0200 Subject: nl80211: use generic check for netif_running Use a new flag that requires the netdev to be UP and use it to check instead of coding the check into all functions that require it. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 97 +++++++++++++++----------------------------------- 1 file changed, 28 insertions(+), 69 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index a96da47..ca8d04c 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -411,9 +411,6 @@ static int nl80211_key_allowed(struct wireless_dev *wdev) { ASSERT_WDEV_LOCK(wdev); - if (!netif_running(wdev->netdev)) - return -ENETDOWN; - switch (wdev->iftype) { case NL80211_IFTYPE_AP: case NL80211_IFTYPE_AP_VLAN: @@ -2083,11 +2080,6 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info) goto out; } - if (!netif_running(dev)) { - err = -ENETDOWN; - goto out; - } - err = rdev->ops->add_station(&rdev->wiphy, dev, mac_addr, ¶ms); out: @@ -2301,9 +2293,6 @@ static int nl80211_set_mpath(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - return rdev->ops->change_mpath(&rdev->wiphy, dev, dst, next_hop); } @@ -2329,9 +2318,6 @@ static int nl80211_new_mpath(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_MESH_POINT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - return rdev->ops->add_mpath(&rdev->wiphy, dev, dst, next_hop); } @@ -2823,9 +2809,6 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info) if (!rdev->ops->scan) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - if (rdev->scan_req) return -EBUSY; @@ -3296,9 +3279,6 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); chan = ieee80211_get_channel(&rdev->wiphy, nla_get_u32(info->attrs[NL80211_ATTR_WIPHY_FREQ])); @@ -3429,9 +3409,6 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); chan = ieee80211_get_channel(&rdev->wiphy, @@ -3493,9 +3470,6 @@ static int nl80211_deauthenticate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); @@ -3540,9 +3514,6 @@ static int nl80211_disassociate(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); reason_code = nla_get_u16(info->attrs[NL80211_ATTR_REASON_CODE]); @@ -3596,9 +3567,6 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - wiphy = &rdev->wiphy; if (info->attrs[NL80211_ATTR_MAC]) @@ -3673,9 +3641,6 @@ static int nl80211_leave_ibss(struct sk_buff *skb, struct genl_info *info) if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_ADHOC) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - return cfg80211_leave_ibss(rdev, dev, false); } @@ -3826,9 +3791,6 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - wiphy = &rdev->wiphy; if (info->attrs[NL80211_ATTR_MAC]) @@ -3881,9 +3843,6 @@ static int nl80211_disconnect(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_CLIENT) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - return cfg80211_disconnect(rdev, dev, reason, true); } @@ -3998,9 +3957,6 @@ static int nl80211_remain_on_channel(struct sk_buff *skb, if (!rdev->ops->remain_on_channel) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { channel_type = nla_get_u32( info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); @@ -4060,9 +4016,6 @@ static int nl80211_cancel_remain_on_channel(struct sk_buff *skb, if (!rdev->ops->cancel_remain_on_channel) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - cookie = nla_get_u64(info->attrs[NL80211_ATTR_COOKIE]); return rdev->ops->cancel_remain_on_channel(&rdev->wiphy, dev, cookie); @@ -4207,9 +4160,6 @@ static int nl80211_tx_mgmt(struct sk_buff *skb, struct genl_info *info) dev->ieee80211_ptr->iftype != NL80211_IFTYPE_P2P_GO) return -EOPNOTSUPP; - if (!netif_running(dev)) - return -ENETDOWN; - if (info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]) { channel_type = nla_get_u32( info->attrs[NL80211_ATTR_WIPHY_CHANNEL_TYPE]); @@ -4398,6 +4348,9 @@ out: #define NL80211_FLAG_NEED_WIPHY 0x01 #define NL80211_FLAG_NEED_NETDEV 0x02 #define NL80211_FLAG_NEED_RTNL 0x04 +#define NL80211_FLAG_CHECK_NETDEV_UP 0x08 +#define NL80211_FLAG_NEED_NETDEV_UP (NL80211_FLAG_NEED_NETDEV |\ + NL80211_FLAG_CHECK_NETDEV_UP) static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, struct genl_info *info) @@ -4425,6 +4378,12 @@ static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, rtnl_unlock(); return err; } + if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && + !netif_running(dev)) { + if (rtnl) + rtnl_unlock(); + return -ENETDOWN; + } info->user_ptr[0] = rdev; info->user_ptr[1] = dev; } @@ -4504,7 +4463,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_set_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4512,7 +4471,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_new_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4520,7 +4479,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_del_key, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4568,7 +4527,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_new_station, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4585,7 +4544,7 @@ static struct genl_ops nl80211_ops[] = { .dumpit = nl80211_dump_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4593,7 +4552,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_set_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4601,7 +4560,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_new_mpath, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4659,7 +4618,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_trigger_scan, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4672,7 +4631,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_authenticate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4680,7 +4639,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_associate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4688,7 +4647,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_deauthenticate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4696,7 +4655,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_disassociate, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4704,7 +4663,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_join_ibss, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4712,7 +4671,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_leave_ibss, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, #ifdef CONFIG_NL80211_TESTMODE @@ -4730,7 +4689,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_connect, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4738,7 +4697,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_disconnect, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4783,7 +4742,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_remain_on_channel, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4791,7 +4750,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_cancel_remain_on_channel, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { @@ -4815,7 +4774,7 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_tx_mgmt, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, - .internal_flags = NL80211_FLAG_NEED_NETDEV | + .internal_flags = NL80211_FLAG_NEED_NETDEV_UP | NL80211_FLAG_NEED_RTNL, }, { -- cgit v1.1 From 6774889314ba507483e63c014fcb81adfc127202 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 4 Oct 2010 21:14:06 +0200 Subject: nl80211: reduce dumping boilerplate Consolidate boilerplate code needed for .dumpit calls operating on netdevs. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 151 +++++++++++++++++++------------------------------ 1 file changed, 59 insertions(+), 92 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index ca8d04c..0c94971 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -198,6 +198,47 @@ static int nl80211_get_ifidx(struct netlink_callback *cb) return res; } +static int nl80211_prepare_netdev_dump(struct sk_buff *skb, + struct netlink_callback *cb, + struct cfg80211_registered_device **rdev, + struct net_device **dev) +{ + int ifidx = cb->args[0]; + int err; + + if (!ifidx) + ifidx = nl80211_get_ifidx(cb); + if (ifidx < 0) + return ifidx; + + cb->args[0] = ifidx; + + rtnl_lock(); + + *dev = __dev_get_by_index(sock_net(skb->sk), ifidx); + if (!*dev) { + err = -ENODEV; + goto out_rtnl; + } + + *rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); + if (IS_ERR(dev)) { + err = PTR_ERR(dev); + goto out_rtnl; + } + + return 0; + out_rtnl: + rtnl_unlock(); + return err; +} + +static void nl80211_finish_netdev_dump(struct cfg80211_registered_device *rdev) +{ + cfg80211_unlock_rdev(rdev); + rtnl_unlock(); +} + /* IE validation */ static bool is_valid_ie_attr(const struct nlattr *attr) { @@ -1796,28 +1837,12 @@ static int nl80211_dump_station(struct sk_buff *skb, struct cfg80211_registered_device *dev; struct net_device *netdev; u8 mac_addr[ETH_ALEN]; - int ifidx = cb->args[0]; int sta_idx = cb->args[1]; int err; - if (!ifidx) - ifidx = nl80211_get_ifidx(cb); - if (ifidx < 0) - return ifidx; - - rtnl_lock(); - - netdev = __dev_get_by_index(sock_net(skb->sk), ifidx); - if (!netdev) { - err = -ENODEV; - goto out_rtnl; - } - - dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); - goto out_rtnl; - } + err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + if (err) + return err; if (!dev->ops->dump_station) { err = -EOPNOTSUPP; @@ -1847,9 +1872,7 @@ static int nl80211_dump_station(struct sk_buff *skb, cb->args[1] = sta_idx; err = skb->len; out_err: - cfg80211_unlock_rdev(dev); - out_rtnl: - rtnl_unlock(); + nl80211_finish_netdev_dump(dev); return err; } @@ -2169,28 +2192,12 @@ static int nl80211_dump_mpath(struct sk_buff *skb, struct net_device *netdev; u8 dst[ETH_ALEN]; u8 next_hop[ETH_ALEN]; - int ifidx = cb->args[0]; int path_idx = cb->args[1]; int err; - if (!ifidx) - ifidx = nl80211_get_ifidx(cb); - if (ifidx < 0) - return ifidx; - - rtnl_lock(); - - netdev = __dev_get_by_index(sock_net(skb->sk), ifidx); - if (!netdev) { - err = -ENODEV; - goto out_rtnl; - } - - dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); - if (IS_ERR(dev)) { - err = PTR_ERR(dev); - goto out_rtnl; - } + err = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + if (err) + return err; if (!dev->ops->dump_mpath) { err = -EOPNOTSUPP; @@ -2224,10 +2231,7 @@ static int nl80211_dump_mpath(struct sk_buff *skb, cb->args[1] = path_idx; err = skb->len; out_err: - cfg80211_unlock_rdev(dev); - out_rtnl: - rtnl_unlock(); - + nl80211_finish_netdev_dump(dev); return err; } @@ -3034,25 +3038,12 @@ static int nl80211_dump_scan(struct sk_buff *skb, struct net_device *dev; struct cfg80211_internal_bss *scan; struct wireless_dev *wdev; - int ifidx = cb->args[0]; int start = cb->args[1], idx = 0; int err; - if (!ifidx) - ifidx = nl80211_get_ifidx(cb); - if (ifidx < 0) - return ifidx; - cb->args[0] = ifidx; - - dev = dev_get_by_index(sock_net(skb->sk), ifidx); - if (!dev) - return -ENODEV; - - rdev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); - if (IS_ERR(rdev)) { - err = PTR_ERR(rdev); - goto out_put_netdev; - } + err = nl80211_prepare_netdev_dump(skb, cb, &rdev, &dev); + if (err) + return err; wdev = dev->ieee80211_ptr; @@ -3068,21 +3059,17 @@ static int nl80211_dump_scan(struct sk_buff *skb, cb->nlh->nlmsg_seq, NLM_F_MULTI, rdev, wdev, scan) < 0) { idx--; - goto out; + break; } } - out: spin_unlock_bh(&rdev->bss_lock); wdev_unlock(wdev); cb->args[1] = idx; - err = skb->len; - cfg80211_unlock_rdev(rdev); - out_put_netdev: - dev_put(dev); + nl80211_finish_netdev_dump(rdev); - return err; + return skb->len; } static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq, @@ -3130,29 +3117,12 @@ static int nl80211_dump_survey(struct sk_buff *skb, struct survey_info survey; struct cfg80211_registered_device *dev; struct net_device *netdev; - int ifidx = cb->args[0]; int survey_idx = cb->args[1]; int res; - if (!ifidx) - ifidx = nl80211_get_ifidx(cb); - if (ifidx < 0) - return ifidx; - cb->args[0] = ifidx; - - rtnl_lock(); - - netdev = __dev_get_by_index(sock_net(skb->sk), ifidx); - if (!netdev) { - res = -ENODEV; - goto out_rtnl; - } - - dev = cfg80211_get_dev_from_ifindex(sock_net(skb->sk), ifidx); - if (IS_ERR(dev)) { - res = PTR_ERR(dev); - goto out_rtnl; - } + res = nl80211_prepare_netdev_dump(skb, cb, &dev, &netdev); + if (res) + return res; if (!dev->ops->dump_survey) { res = -EOPNOTSUPP; @@ -3180,10 +3150,7 @@ static int nl80211_dump_survey(struct sk_buff *skb, cb->args[1] = survey_idx; res = skb->len; out_err: - cfg80211_unlock_rdev(dev); - out_rtnl: - rtnl_unlock(); - + nl80211_finish_netdev_dump(dev); return res; } -- cgit v1.1 From 025e6be220e448c02045e8499c7db8ce4bc8eea2 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Oct 2010 10:41:47 +0200 Subject: mac80211: fix deadlock with multiple interfaces The locking around ieee80211_recalc_smps is buggy -- it cannot acquire another interface's mutex while the iflist mutex is held because another code path could be holding the iface mutex and trying to acquire the iflist mutex. But the locking is also unnecessary, we only check "ifmgd->associated" as a bool, and don't use the pointer (in check_mgd_smps). Reported-by: Ben Greear Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 2 +- net/mac80211/ieee80211_i.h | 3 +-- net/mac80211/main.c | 2 +- net/mac80211/mlme.c | 2 +- net/mac80211/util.c | 20 +++----------------- 5 files changed, 7 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index a7a78f2..94bf550 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1394,7 +1394,7 @@ int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, if (!sdata->u.mgd.associated || sdata->vif.bss_conf.channel_type == NL80211_CHAN_NO_HT) { mutex_lock(&sdata->local->iflist_mtx); - ieee80211_recalc_smps(sdata->local, sdata); + ieee80211_recalc_smps(sdata->local); mutex_unlock(&sdata->local->iflist_mtx); return 0; } diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 55d79db..08509e2 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1297,8 +1297,7 @@ u32 ieee80211_sta_get_rates(struct ieee80211_local *local, enum ieee80211_band band); int __ieee80211_request_smps(struct ieee80211_sub_if_data *sdata, enum ieee80211_smps_mode smps_mode); -void ieee80211_recalc_smps(struct ieee80211_local *local, - struct ieee80211_sub_if_data *forsdata); +void ieee80211_recalc_smps(struct ieee80211_local *local); size_t ieee80211_ie_split(const u8 *ies, size_t ielen, const u8 *ids, int n_ids, size_t offset); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 9c2f3f9..e371709 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -333,7 +333,7 @@ static void ieee80211_recalc_smps_work(struct work_struct *work) container_of(work, struct ieee80211_local, recalc_smps); mutex_lock(&local->iflist_mtx); - ieee80211_recalc_smps(local, NULL); + ieee80211_recalc_smps(local); mutex_unlock(&local->iflist_mtx); } diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index c37086a..2b29827 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -913,7 +913,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, mutex_lock(&local->iflist_mtx); ieee80211_recalc_ps(local, -1); - ieee80211_recalc_smps(local, sdata); + ieee80211_recalc_smps(local); mutex_unlock(&local->iflist_mtx); netif_tx_start_all_queues(sdata->dev); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index aba025d..4ee8f2b 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1297,16 +1297,12 @@ static int check_mgd_smps(struct ieee80211_if_managed *ifmgd, } /* must hold iflist_mtx */ -void ieee80211_recalc_smps(struct ieee80211_local *local, - struct ieee80211_sub_if_data *forsdata) +void ieee80211_recalc_smps(struct ieee80211_local *local) { struct ieee80211_sub_if_data *sdata; enum ieee80211_smps_mode smps_mode = IEEE80211_SMPS_OFF; int count = 0; - if (forsdata) - lockdep_assert_held(&forsdata->u.mgd.mtx); - lockdep_assert_held(&local->iflist_mtx); /* @@ -1324,18 +1320,8 @@ void ieee80211_recalc_smps(struct ieee80211_local *local, continue; if (sdata->vif.type != NL80211_IFTYPE_STATION) goto set; - if (sdata != forsdata) { - /* - * This nested is ok -- we are holding the iflist_mtx - * so can't get here twice or so. But it's required - * since normally we acquire it first and then the - * iflist_mtx. - */ - mutex_lock_nested(&sdata->u.mgd.mtx, SINGLE_DEPTH_NESTING); - count += check_mgd_smps(&sdata->u.mgd, &smps_mode); - mutex_unlock(&sdata->u.mgd.mtx); - } else - count += check_mgd_smps(&sdata->u.mgd, &smps_mode); + + count += check_mgd_smps(&sdata->u.mgd, &smps_mode); if (count > 1) { smps_mode = IEEE80211_SMPS_OFF; -- cgit v1.1 From e7480bbb926c5816e4fbfca70748096bbe0e4978 Mon Sep 17 00:00:00 2001 From: "Luis R. Rodriguez" Date: Fri, 1 Oct 2010 17:05:19 -0400 Subject: mac80211: fix channel assumption for association done work Be consistent and use the wk->chan instead of the local->hw.conf.channel for the association done work. This prevents any possible races against channel changes while we run this work. In the case that the race did happen we would be initializing the bit rates for the new AP under the assumption of a wrong channel and in the worst case, wrong band. This could lead to trying to assuming we could use CCK frames on 5 GHz, for example. This patch has a fix for kernels >= v2.6.34 Cc: stable@kernel.org Signed-off-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 2b29827..cd13aa8 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1291,7 +1291,7 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk, rates = 0; basic_rates = 0; - sband = local->hw.wiphy->bands[local->hw.conf.channel->band]; + sband = local->hw.wiphy->bands[wk->chan->band]; for (i = 0; i < elems.supp_rates_len; i++) { int rate = (elems.supp_rates[i] & 0x7f) * 5; @@ -1327,11 +1327,11 @@ static bool ieee80211_assoc_success(struct ieee80211_work *wk, } } - sta->sta.supp_rates[local->hw.conf.channel->band] = rates; + sta->sta.supp_rates[wk->chan->band] = rates; sdata->vif.bss_conf.basic_rates = basic_rates; /* cf. IEEE 802.11 9.2.12 */ - if (local->hw.conf.channel->band == IEEE80211_BAND_2GHZ && + if (wk->chan->band == IEEE80211_BAND_2GHZ && have_higher_than_11mbit) sdata->flags |= IEEE80211_SDATA_OPERATING_GMODE; else -- cgit v1.1 From caf586e5f23cebb2a68cbaf288d59dbbf2d74052 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 30 Sep 2010 21:06:55 +0000 Subject: net: add a core netdev->rx_dropped counter In various situations, a device provides a packet to our stack and we drop it before it enters protocol stack : - softnet backlog full (accounted in /proc/net/softnet_stat) - bad vlan tag (not accounted) - unknown/unregistered protocol (not accounted) We can handle a per-device counter of such dropped frames at core level, and automatically adds it to the device provided stats (rx_dropped), so that standard tools can be used (ifconfig, ip link, cat /proc/net/dev) This is a generalization of commit 8990f468a (net: rx_dropped accounting), thus reverting it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/8021q/vlan.h | 2 -- net/8021q/vlan_core.c | 2 ++ net/8021q/vlan_dev.c | 11 ++++------- net/core/dev.c | 19 +++++++++++-------- net/ipv4/ip_gre.c | 3 +-- net/ipv4/ipip.c | 3 +-- net/ipv6/ip6_tunnel.c | 3 +-- net/ipv6/ip6mr.c | 3 +-- net/ipv6/sit.c | 3 +-- 9 files changed, 22 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index b26ce34..8d9503a 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -25,7 +25,6 @@ struct vlan_priority_tci_mapping { * @rx_multicast: number of received multicast packets * @syncp: synchronization point for 64bit counters * @rx_errors: number of errors - * @rx_dropped: number of dropped packets */ struct vlan_rx_stats { u64 rx_packets; @@ -33,7 +32,6 @@ struct vlan_rx_stats { u64 rx_multicast; struct u64_stats_sync syncp; unsigned long rx_errors; - unsigned long rx_dropped; }; /** diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index b6d55a9..dee727ce 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -33,6 +33,7 @@ int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, return polling ? netif_receive_skb(skb) : netif_rx(skb); drop: + atomic_long_inc(&skb->dev->rx_dropped); dev_kfree_skb_any(skb); return NET_RX_DROP; } @@ -123,6 +124,7 @@ vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, return dev_gro_receive(napi, skb); drop: + atomic_long_inc(&skb->dev->rx_dropped); return GRO_DROP; } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f6fbcc0f..f54251e 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -225,16 +225,15 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, } } - if (unlikely(netif_rx(skb) == NET_RX_DROP)) { - if (rx_stats) - rx_stats->rx_dropped++; - } + netif_rx(skb); + rcu_read_unlock(); return NET_RX_SUCCESS; err_unlock: rcu_read_unlock(); err_free: + atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -846,15 +845,13 @@ static struct rtnl_link_stats64 *vlan_dev_get_stats64(struct net_device *dev, st accum.rx_packets += rxpackets; accum.rx_bytes += rxbytes; accum.rx_multicast += rxmulticast; - /* rx_errors, rx_dropped are ulong, not protected by syncp */ + /* rx_errors is ulong, not protected by syncp */ accum.rx_errors += p->rx_errors; - accum.rx_dropped += p->rx_dropped; } stats->rx_packets = accum.rx_packets; stats->rx_bytes = accum.rx_bytes; stats->rx_errors = accum.rx_errors; stats->multicast = accum.rx_multicast; - stats->rx_dropped = accum.rx_dropped; } return stats; } diff --git a/net/core/dev.c b/net/core/dev.c index ce6ad88..7d14955 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1483,8 +1483,9 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) skb_orphan(skb); nf_reset(skb); - if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + if (unlikely(!(dev->flags & IFF_UP) || + (skb->len > (dev->mtu + dev->hard_header_len)))) { + atomic_long_inc(&dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2548,6 +2549,7 @@ enqueue: local_irq_restore(flags); + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); return NET_RX_DROP; } @@ -2995,6 +2997,7 @@ ncls: if (pt_prev) { ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev); } else { + atomic_long_inc(&skb->dev->rx_dropped); kfree_skb(skb); /* Jamal, now you will not able to escape explaining * me how you were going to use this. :-) @@ -5429,14 +5432,14 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, if (ops->ndo_get_stats64) { memset(storage, 0, sizeof(*storage)); - return ops->ndo_get_stats64(dev, storage); - } - if (ops->ndo_get_stats) { + ops->ndo_get_stats64(dev, storage); + } else if (ops->ndo_get_stats) { netdev_stats_to_stats64(storage, ops->ndo_get_stats(dev)); - return storage; + } else { + netdev_stats_to_stats64(storage, &dev->stats); + dev_txq_stats_fold(dev, storage); } - netdev_stats_to_stats64(storage, &dev->stats); - dev_txq_stats_fold(dev, storage); + storage->rx_dropped += atomic_long_read(&dev->rx_dropped); return storage; } EXPORT_SYMBOL(dev_get_stats); diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index fbe2c47..9d421f4 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -679,8 +679,7 @@ static int ipgre_rcv(struct sk_buff *skb) skb_reset_network_header(skb); ipgre_ecn_decapsulate(iph, skb); - if (netif_rx(skb) == NET_RX_DROP) - tunnel->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 6ad46c2..e9b816e 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -414,8 +414,7 @@ static int ipip_rcv(struct sk_buff *skb) ipip_ecn_decapsulate(iph, skb); - if (netif_rx(skb) == NET_RX_DROP) - tunnel->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 8be3c45..c2c0f89 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -768,8 +768,7 @@ static int ip6_tnl_rcv(struct sk_buff *skb, __u16 protocol, dscp_ecn_decapsulate(t, ipv6h, skb); - if (netif_rx(skb) == NET_RX_DROP) - t->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index 2640c9b..6f32ffc 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -666,8 +666,7 @@ static int pim6_rcv(struct sk_buff *skb) skb_tunnel_rx(skb, reg_dev); - if (netif_rx(skb) == NET_RX_DROP) - reg_dev->stats.rx_dropped++; + netif_rx(skb); dev_put(reg_dev); return 0; diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index d770178..367a6cc 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -600,8 +600,7 @@ static int ipip6_rcv(struct sk_buff *skb) ipip6_ecn_decapsulate(iph, skb); - if (netif_rx(skb) == NET_RX_DROP) - tunnel->dev->stats.rx_dropped++; + netif_rx(skb); rcu_read_unlock(); return 0; -- cgit v1.1 From 110b2499370c401cdcc7c63e481084467291d556 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 04:27:36 +0000 Subject: net neigh: neigh_delete() and neigh_add() changes neigh_delete() and neigh_add() dont need to touch device refcount, we hold RTNL when calling them, so device cannot disappear under us. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 39 +++++++++++++++++---------------------- 1 file changed, 17 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b142a0d..d6996e0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1531,6 +1531,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err = -EINVAL; + ASSERT_RTNL(); if (nlmsg_len(nlh) < sizeof(*ndm)) goto out; @@ -1540,7 +1541,7 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; @@ -1556,34 +1557,31 @@ static int neigh_delete(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(dst_attr) < tbl->key_len) - goto out_dev_put; + goto out; if (ndm->ndm_flags & NTF_PROXY) { err = pneigh_delete(tbl, net, nla_data(dst_attr), dev); - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, nla_data(dst_attr), dev); if (neigh == NULL) { err = -ENOENT; - goto out_dev_put; + goto out; } err = neigh_update(neigh, NULL, NUD_FAILED, NEIGH_UPDATE_F_OVERRIDE | NEIGH_UPDATE_F_ADMIN); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; -out_dev_put: - if (dev) - dev_put(dev); out: return err; } @@ -1597,6 +1595,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) struct net_device *dev = NULL; int err; + ASSERT_RTNL(); err = nlmsg_parse(nlh, sizeof(*ndm), tb, NDA_MAX, NULL); if (err < 0) goto out; @@ -1607,14 +1606,14 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) ndm = nlmsg_data(nlh); if (ndm->ndm_ifindex) { - dev = dev_get_by_index(net, ndm->ndm_ifindex); + dev = __dev_get_by_index(net, ndm->ndm_ifindex); if (dev == NULL) { err = -ENODEV; goto out; } if (tb[NDA_LLADDR] && nla_len(tb[NDA_LLADDR]) < dev->addr_len) - goto out_dev_put; + goto out; } read_lock(&neigh_tbl_lock); @@ -1628,7 +1627,7 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) read_unlock(&neigh_tbl_lock); if (nla_len(tb[NDA_DST]) < tbl->key_len) - goto out_dev_put; + goto out; dst = nla_data(tb[NDA_DST]); lladdr = tb[NDA_LLADDR] ? nla_data(tb[NDA_LLADDR]) : NULL; @@ -1641,29 +1640,29 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) pn->flags = ndm->ndm_flags; err = 0; } - goto out_dev_put; + goto out; } if (dev == NULL) - goto out_dev_put; + goto out; neigh = neigh_lookup(tbl, dst, dev); if (neigh == NULL) { if (!(nlh->nlmsg_flags & NLM_F_CREATE)) { err = -ENOENT; - goto out_dev_put; + goto out; } neigh = __neigh_lookup_errno(tbl, dst, dev); if (IS_ERR(neigh)) { err = PTR_ERR(neigh); - goto out_dev_put; + goto out; } } else { if (nlh->nlmsg_flags & NLM_F_EXCL) { err = -EEXIST; neigh_release(neigh); - goto out_dev_put; + goto out; } if (!(nlh->nlmsg_flags & NLM_F_REPLACE)) @@ -1676,15 +1675,11 @@ static int neigh_add(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg) } else err = neigh_update(neigh, lladdr, ndm->ndm_state, flags); neigh_release(neigh); - goto out_dev_put; + goto out; } read_unlock(&neigh_tbl_lock); err = -EAFNOSUPPORT; - -out_dev_put: - if (dev) - dev_put(dev); out: return err; } -- cgit v1.1 From d6bf781712a1d25cc8987036b3a48535b331eb91 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 4 Oct 2010 06:15:44 +0000 Subject: net neigh: RCU conversion of neigh hash table David This is the first step for RCU conversion of neigh code. Next patches will convert hash_buckets[] and "struct neighbour" to RCU protected objects. Thanks [PATCH net-next] net neigh: RCU conversion of neigh hash table Instead of storing hash_buckets, hash_mask and hash_rnd in "struct neigh_table", a new structure is defined : struct neigh_hash_table { struct neighbour **hash_buckets; unsigned int hash_mask; __u32 hash_rnd; struct rcu_head rcu; }; And "struct neigh_table" has an RCU protected pointer to such a neigh_hash_table. This means the signature of (*hash)() function changed: We need to add a third parameter with the actual hash_rnd value, since this is not anymore a neigh_table field. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/atm/clip.c | 4 +- net/core/neighbour.c | 219 +++++++++++++++++++++++++++++++------------------- net/decnet/dn_neigh.c | 13 +-- net/ipv4/arp.c | 8 +- net/ipv6/ndisc.c | 10 ++- 5 files changed, 158 insertions(+), 96 deletions(-) (limited to 'net') diff --git a/net/atm/clip.c b/net/atm/clip.c index 95fdd11..ff956d1 100644 --- a/net/atm/clip.c +++ b/net/atm/clip.c @@ -310,9 +310,9 @@ static int clip_constructor(struct neighbour *neigh) return 0; } -static u32 clip_hash(const void *pkey, const struct net_device *dev) +static u32 clip_hash(const void *pkey, const struct net_device *dev, __u32 rnd) { - return jhash_2words(*(u32 *) pkey, dev->ifindex, clip_tbl.hash_rnd); + return jhash_2words(*(u32 *) pkey, dev->ifindex, rnd); } static struct neigh_table clip_tbl = { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index d6996e0..dd8920e 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -131,14 +131,17 @@ static int neigh_forced_gc(struct neigh_table *tbl) { int shrunk = 0; int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, forced_gc_runs); write_lock_bh(&tbl->lock); - for (i = 0; i <= tbl->hash_mask; i++) { + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (i = 0; i <= nht->hash_mask; i++) { struct neighbour *n, **np; - np = &tbl->hash_buckets[i]; + np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. @@ -199,9 +202,13 @@ static void pneigh_queue_purge(struct sk_buff_head *list) static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) { int i; + struct neigh_hash_table *nht; - for (i = 0; i <= tbl->hash_mask; i++) { - struct neighbour *n, **np = &tbl->hash_buckets[i]; + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + + for (i = 0; i <= nht->hash_mask; i++) { + struct neighbour *n, **np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { if (dev && n->dev != dev) { @@ -297,64 +304,81 @@ out_entries: goto out; } -static struct neighbour **neigh_hash_alloc(unsigned int entries) +static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) { - unsigned long size = entries * sizeof(struct neighbour *); - struct neighbour **ret; + size_t size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *ret; + struct neighbour **buckets; - if (size <= PAGE_SIZE) { - ret = kzalloc(size, GFP_ATOMIC); - } else { - ret = (struct neighbour **) - __get_free_pages(GFP_ATOMIC|__GFP_ZERO, get_order(size)); + ret = kmalloc(sizeof(*ret), GFP_ATOMIC); + if (!ret) + return NULL; + if (size <= PAGE_SIZE) + buckets = kzalloc(size, GFP_ATOMIC); + else + buckets = (struct neighbour **) + __get_free_pages(GFP_ATOMIC | __GFP_ZERO, + get_order(size)); + if (!buckets) { + kfree(ret); + return NULL; } + ret->hash_buckets = buckets; + ret->hash_mask = entries - 1; + get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; } -static void neigh_hash_free(struct neighbour **hash, unsigned int entries) +static void neigh_hash_free_rcu(struct rcu_head *head) { - unsigned long size = entries * sizeof(struct neighbour *); + struct neigh_hash_table *nht = container_of(head, + struct neigh_hash_table, + rcu); + size_t size = (nht->hash_mask + 1) * sizeof(struct neighbour *); + struct neighbour **buckets = nht->hash_buckets; if (size <= PAGE_SIZE) - kfree(hash); + kfree(buckets); else - free_pages((unsigned long)hash, get_order(size)); + free_pages((unsigned long)buckets, get_order(size)); + kfree(nht); } -static void neigh_hash_grow(struct neigh_table *tbl, unsigned long new_entries) +static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, + unsigned long new_entries) { - struct neighbour **new_hash, **old_hash; - unsigned int i, new_hash_mask, old_entries; + unsigned int i, hash; + struct neigh_hash_table *new_nht, *old_nht; NEIGH_CACHE_STAT_INC(tbl, hash_grows); BUG_ON(!is_power_of_2(new_entries)); - new_hash = neigh_hash_alloc(new_entries); - if (!new_hash) - return; + old_nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + new_nht = neigh_hash_alloc(new_entries); + if (!new_nht) + return old_nht; - old_entries = tbl->hash_mask + 1; - new_hash_mask = new_entries - 1; - old_hash = tbl->hash_buckets; - - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - for (i = 0; i < old_entries; i++) { + for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_hash[i]; n; n = next) { - unsigned int hash_val = tbl->hash(n->primary_key, n->dev); + for (n = old_nht->hash_buckets[i]; + n != NULL; + n = next) { + hash = tbl->hash(n->primary_key, n->dev, + new_nht->hash_rnd); - hash_val &= new_hash_mask; + hash &= new_nht->hash_mask; next = n->next; - n->next = new_hash[hash_val]; - new_hash[hash_val] = n; + n->next = new_nht->hash_buckets[hash]; + new_nht->hash_buckets[hash] = n; } } - tbl->hash_buckets = new_hash; - tbl->hash_mask = new_hash_mask; - neigh_hash_free(old_hash, old_entries); + rcu_assign_pointer(tbl->nht, new_nht); + call_rcu(&old_nht->rcu, neigh_hash_free_rcu); + return new_nht; } struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, @@ -363,19 +387,23 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, dev); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; + read_lock(&tbl->lock); + for (n = nht->hash_buckets[hash_val]; n; n = n->next) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { neigh_hold(n); NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup); @@ -386,12 +414,15 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, struct neighbour *n; int key_len = tbl->key_len; u32 hash_val; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, lookups); - read_lock_bh(&tbl->lock); - hash_val = tbl->hash(pkey, NULL); - for (n = tbl->hash_buckets[hash_val & tbl->hash_mask]; n; n = n->next) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; + read_lock(&tbl->lock); + for (n = nht->hash_buckets[hash_val]; n; n = n->next) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { neigh_hold(n); @@ -399,7 +430,8 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, break; } } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } EXPORT_SYMBOL(neigh_lookup_nodev); @@ -411,6 +443,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, int key_len = tbl->key_len; int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + struct neigh_hash_table *nht; if (!n) { rc = ERR_PTR(-ENOBUFS); @@ -437,18 +470,20 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, n->confirmed = jiffies - (n->parms->base_reachable_time << 1); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); - if (atomic_read(&tbl->entries) > (tbl->hash_mask + 1)) - neigh_hash_grow(tbl, (tbl->hash_mask + 1) << 1); + if (atomic_read(&tbl->entries) > (nht->hash_mask + 1)) + nht = neigh_hash_grow(tbl, (nht->hash_mask + 1) << 1); - hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; + hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; if (n->parms->dead) { rc = ERR_PTR(-EINVAL); goto out_tbl_unlock; } - for (n1 = tbl->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = nht->hash_buckets[hash_val]; n1; n1 = n1->next) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -456,8 +491,8 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = tbl->hash_buckets[hash_val]; - tbl->hash_buckets[hash_val] = n; + n->next = nht->hash_buckets[hash_val]; + nht->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); write_unlock_bh(&tbl->lock); @@ -698,10 +733,13 @@ static void neigh_periodic_work(struct work_struct *work) struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); struct neighbour *n, **np; unsigned int i; + struct neigh_hash_table *nht; NEIGH_CACHE_STAT_INC(tbl, periodic_gc_runs); write_lock_bh(&tbl->lock); + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); /* * periodically recompute ReachableTime from random function @@ -715,8 +753,8 @@ static void neigh_periodic_work(struct work_struct *work) neigh_rand_reach_time(p->base_reachable_time); } - for (i = 0 ; i <= tbl->hash_mask; i++) { - np = &tbl->hash_buckets[i]; + for (i = 0 ; i <= nht->hash_mask; i++) { + np = &nht->hash_buckets[i]; while ((n = *np) != NULL) { unsigned int state; @@ -1438,17 +1476,14 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl) panic("cannot create neighbour proc dir entry"); #endif - tbl->hash_mask = 1; - tbl->hash_buckets = neigh_hash_alloc(tbl->hash_mask + 1); + tbl->nht = neigh_hash_alloc(8); phsize = (PNEIGH_HASHMASK + 1) * sizeof(struct pneigh_entry *); tbl->phash_buckets = kzalloc(phsize, GFP_KERNEL); - if (!tbl->hash_buckets || !tbl->phash_buckets) + if (!tbl->nht || !tbl->phash_buckets) panic("cannot allocate neighbour cache hashes"); - get_random_bytes(&tbl->hash_rnd, sizeof(tbl->hash_rnd)); - rwlock_init(&tbl->lock); INIT_DELAYED_WORK_DEFERRABLE(&tbl->gc_work, neigh_periodic_work); schedule_delayed_work(&tbl->gc_work, tbl->parms.reachable_time); @@ -1504,8 +1539,8 @@ int neigh_table_clear(struct neigh_table *tbl) } write_unlock(&neigh_tbl_lock); - neigh_hash_free(tbl->hash_buckets, tbl->hash_mask + 1); - tbl->hash_buckets = NULL; + call_rcu(&tbl->nht->rcu, neigh_hash_free_rcu); + tbl->nht = NULL; kfree(tbl->phash_buckets); tbl->phash_buckets = NULL; @@ -1745,18 +1780,22 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl, unsigned long now = jiffies; unsigned int flush_delta = now - tbl->last_flush; unsigned int rand_delta = now - tbl->last_rand; - + struct neigh_hash_table *nht; struct ndt_config ndc = { .ndtc_key_len = tbl->key_len, .ndtc_entry_size = tbl->entry_size, .ndtc_entries = atomic_read(&tbl->entries), .ndtc_last_flush = jiffies_to_msecs(flush_delta), .ndtc_last_rand = jiffies_to_msecs(rand_delta), - .ndtc_hash_rnd = tbl->hash_rnd, - .ndtc_hash_mask = tbl->hash_mask, .ndtc_proxy_qlen = tbl->proxy_queue.qlen, }; + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + ndc.ndtc_hash_rnd = nht->hash_rnd; + ndc.ndtc_hash_mask = nht->hash_mask; + rcu_read_unlock_bh(); + NLA_PUT(skb, NDTA_CONFIG, sizeof(ndc), &ndc); } @@ -2088,14 +2127,18 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (h = 0; h <= tbl->hash_mask; h++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); + for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = tbl->hash_buckets[h], idx = 0; n; n = n->next) { + for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) { if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) @@ -2104,7 +2147,6 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, cb->nlh->nlmsg_seq, RTM_NEWNEIGH, NLM_F_MULTI) <= 0) { - read_unlock_bh(&tbl->lock); rc = -1; goto out; } @@ -2112,9 +2154,10 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, idx++; } } - read_unlock_bh(&tbl->lock); rc = skb->len; out: + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; return rc; @@ -2147,15 +2190,20 @@ static int neigh_dump_info(struct sk_buff *skb, struct netlink_callback *cb) void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void *), void *cookie) { int chain; + struct neigh_hash_table *nht; - read_lock_bh(&tbl->lock); - for (chain = 0; chain <= tbl->hash_mask; chain++) { + rcu_read_lock_bh(); + nht = rcu_dereference_bh(tbl->nht); + + read_lock(&tbl->lock); + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = tbl->hash_buckets[chain]; n; n = n->next) + for (n = nht->hash_buckets[chain]; n; n = n->next) cb(n, cookie); } - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_for_each); @@ -2164,11 +2212,14 @@ void __neigh_for_each_release(struct neigh_table *tbl, int (*cb)(struct neighbour *)) { int chain; + struct neigh_hash_table *nht; - for (chain = 0; chain <= tbl->hash_mask; chain++) { + nht = rcu_dereference_protected(tbl->nht, + lockdep_is_held(&tbl->lock)); + for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n, **np; - np = &tbl->hash_buckets[chain]; + np = &nht->hash_buckets[chain]; while ((n = *np) != NULL) { int release; @@ -2193,13 +2244,13 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; struct neighbour *n = NULL; int bucket = state->bucket; state->flags &= ~NEIGH_SEQ_IS_PNEIGH; - for (bucket = 0; bucket <= tbl->hash_mask; bucket++) { - n = tbl->hash_buckets[bucket]; + for (bucket = 0; bucket <= nht->hash_mask; bucket++) { + n = nht->hash_buckets[bucket]; while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2234,7 +2285,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, { struct neigh_seq_state *state = seq->private; struct net *net = seq_file_net(seq); - struct neigh_table *tbl = state->tbl; + struct neigh_hash_table *nht = state->nht; if (state->neigh_sub_iter) { void *v = state->neigh_sub_iter(state, n, pos); @@ -2265,10 +2316,10 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n) break; - if (++state->bucket > tbl->hash_mask) + if (++state->bucket > nht->hash_mask) break; - n = tbl->hash_buckets[state->bucket]; + n = nht->hash_buckets[state->bucket]; } if (n && pos) @@ -2367,6 +2418,7 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) __acquires(tbl->lock) + __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2374,8 +2426,9 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl state->bucket = 0; state->flags = (neigh_seq_flags & ~NEIGH_SEQ_IS_PNEIGH); - read_lock_bh(&tbl->lock); - + rcu_read_lock_bh(); + state->nht = rcu_dereference_bh(tbl->nht); + read_lock(&tbl->lock); return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2409,11 +2462,13 @@ EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) __releases(tbl->lock) + __releases(rcu_bh) { struct neigh_seq_state *state = seq->private; struct neigh_table *tbl = state->tbl; - read_unlock_bh(&tbl->lock); + read_unlock(&tbl->lock); + rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 0363bb95..a085dbc 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -48,7 +48,6 @@ #include #include -static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev); static int dn_neigh_construct(struct neighbour *); static void dn_long_error_report(struct neighbour *, struct sk_buff *); static void dn_short_error_report(struct neighbour *, struct sk_buff *); @@ -93,6 +92,13 @@ static const struct neigh_ops dn_phase3_ops = { .queue_xmit = dev_queue_xmit }; +static u32 dn_neigh_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) +{ + return jhash_2words(*(__u16 *)pkey, 0, hash_rnd); +} + struct neigh_table dn_neigh_table = { .family = PF_DECnet, .entry_size = sizeof(struct dn_neigh), @@ -122,11 +128,6 @@ struct neigh_table dn_neigh_table = { .gc_thresh3 = 1024, }; -static u32 dn_neigh_hash(const void *pkey, const struct net_device *dev) -{ - return jhash_2words(*(__u16 *)pkey, 0, dn_neigh_table.hash_rnd); -} - static int dn_neigh_construct(struct neighbour *neigh) { struct net_device *dev = neigh->dev; diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index d9031ad..f353095 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -127,7 +127,7 @@ EXPORT_SYMBOL(clip_tbl_hook); /* * Interface to generic neighbour cache. */ -static u32 arp_hash(const void *pkey, const struct net_device *dev); +static u32 arp_hash(const void *pkey, const struct net_device *dev, __u32 rnd); static int arp_constructor(struct neighbour *neigh); static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb); static void arp_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -225,9 +225,11 @@ int arp_mc_map(__be32 addr, u8 *haddr, struct net_device *dev, int dir) } -static u32 arp_hash(const void *pkey, const struct net_device *dev) +static u32 arp_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) { - return jhash_2words(*(u32 *)pkey, dev->ifindex, arp_tbl.hash_rnd); + return jhash_2words(*(u32 *)pkey, dev->ifindex, hash_rnd); } static int arp_constructor(struct neighbour *neigh) diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index b3dd844c..998d6d2 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -91,7 +91,9 @@ #include #include -static u32 ndisc_hash(const void *pkey, const struct net_device *dev); +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 rnd); static int ndisc_constructor(struct neighbour *neigh); static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb); static void ndisc_error_report(struct neighbour *neigh, struct sk_buff *skb); @@ -350,7 +352,9 @@ int ndisc_mc_map(struct in6_addr *addr, char *buf, struct net_device *dev, int d EXPORT_SYMBOL(ndisc_mc_map); -static u32 ndisc_hash(const void *pkey, const struct net_device *dev) +static u32 ndisc_hash(const void *pkey, + const struct net_device *dev, + __u32 hash_rnd) { const u32 *p32 = pkey; u32 addr_hash, i; @@ -359,7 +363,7 @@ static u32 ndisc_hash(const void *pkey, const struct net_device *dev) for (i = 0; i < (sizeof(struct in6_addr) / sizeof(u32)); i++) addr_hash ^= *p32++; - return jhash_2words(addr_hash, dev->ifindex, nd_tbl.hash_rnd); + return jhash_2words(addr_hash, dev->ifindex, hash_rnd); } static int ndisc_constructor(struct neighbour *neigh) -- cgit v1.1 From 3f66116e89521ef71ab0d63dc07a639def88a577 Mon Sep 17 00:00:00 2001 From: Alban Crequy Date: Mon, 4 Oct 2010 08:48:28 +0000 Subject: AF_UNIX: Implement SO_TIMESTAMP and SO_TIMETAMPNS on Unix sockets Userspace applications can already request to receive timestamps with: setsockopt(sockfd, SOL_SOCKET, SO_TIMESTAMP, ...) Although setsockopt() returns zero (success), timestamps are not added to the ancillary data. This patch fixes that on SOCK_DGRAM and SOCK_SEQPACKET Unix sockets. Signed-off-by: Alban Crequy Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 +++++ 1 file changed, 5 insertions(+) (limited to 'net') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c586da3..0ebc777 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1511,6 +1511,8 @@ restart: goto restart; } + if (sock_flag(other, SOCK_RCVTSTAMP)) + __net_timestamp(skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); other->sk_data_ready(other, len); @@ -1722,6 +1724,9 @@ static int unix_dgram_recvmsg(struct kiocb *iocb, struct socket *sock, if (err) goto out_free; + if (sock_flag(sk, SOCK_RCVTSTAMP)) + __sock_recv_timestamp(msg, sk, skb); + if (!siocb->scm) { siocb->scm = &tmp_scm; memset(&tmp_scm, 0, sizeof(tmp_scm)); -- cgit v1.1 From e12b453904c54bbdc515778ff664d87a7f9473af Mon Sep 17 00:00:00 2001 From: Flavio Leitner Date: Tue, 5 Oct 2010 14:23:58 +0000 Subject: bonding: fix to rejoin multicast groups immediately The IGMP specs states that if the system receives a membership report, it shouldn't send another for the next minute. However, if a link failure happens right after that, the backup slave and the switch connected to this slave will not know about the multicast and the traffic will hang for about a minute. This patch fixes it to rejoin multicast groups immediately after a failover restoring the multicast traffic. Signed-off-by: Flavio Leitner Signed-off-by: David S. Miller --- net/ipv4/igmp.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 2a4bb76..25f3396 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1269,14 +1269,14 @@ void ip_mc_rejoin_group(struct ip_mc_list *im) if (im->multiaddr == IGMP_ALL_HOSTS) return; - if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev)) { - igmp_mod_timer(im, IGMP_Initial_Report_Delay); - return; - } - /* else, v3 */ - im->crcount = in_dev->mr_qrv ? in_dev->mr_qrv : - IGMP_Unsolicited_Report_Count; - igmp_ifc_event(in_dev); + /* a failover is happening and switches + * must be notified immediately */ + if (IGMP_V1_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMP_HOST_MEMBERSHIP_REPORT); + else if (IGMP_V2_SEEN(in_dev)) + igmp_send_report(in_dev, im, IGMPV2_HOST_MEMBERSHIP_REPORT); + else + igmp_send_report(in_dev, im, IGMPV3_HOST_MEMBERSHIP_REPORT); #endif } EXPORT_SYMBOL(ip_mc_rejoin_group); -- cgit v1.1 From ebc0ffae5dfb4447e0a431ffe7fe1d467c48bbb9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 5 Oct 2010 10:41:36 +0000 Subject: fib: RCU conversion of fib_lookup() fib_lookup() converted to be called in RCU protected context, no reference taken and released on a contended cache line (fib_clntref) fib_table_lookup() and fib_semantic_match() get an additional parameter. struct fib_info gets an rcu_head field, and is freed after an rcu grace period. Stress test : (Sending 160.000.000 UDP frames on same neighbour, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_HASH) (about same results for FIB_TRIE) Before patch : real 1m31.199s user 0m13.761s sys 23m24.780s After patch: real 1m5.375s user 0m14.997s sys 15m50.115s Before patch Profile : 13044.00 15.4% __ip_route_output_key vmlinux 8438.00 10.0% dst_destroy vmlinux 5983.00 7.1% fib_semantic_match vmlinux 5410.00 6.4% fib_rules_lookup vmlinux 4803.00 5.7% neigh_lookup vmlinux 4420.00 5.2% _raw_spin_lock vmlinux 3883.00 4.6% rt_set_nexthop vmlinux 3261.00 3.9% _raw_read_lock vmlinux 2794.00 3.3% fib_table_lookup vmlinux 2374.00 2.8% neigh_resolve_output vmlinux 2153.00 2.5% dst_alloc vmlinux 1502.00 1.8% _raw_read_lock_bh vmlinux 1484.00 1.8% kmem_cache_alloc vmlinux 1407.00 1.7% eth_header vmlinux 1406.00 1.7% ipv4_dst_destroy vmlinux 1298.00 1.5% __copy_from_user_ll vmlinux 1174.00 1.4% dev_queue_xmit vmlinux 1000.00 1.2% ip_output vmlinux After patch Profile : 13712.00 15.8% dst_destroy vmlinux 8548.00 9.9% __ip_route_output_key vmlinux 7017.00 8.1% neigh_lookup vmlinux 4554.00 5.3% fib_semantic_match vmlinux 4067.00 4.7% _raw_read_lock vmlinux 3491.00 4.0% dst_alloc vmlinux 3186.00 3.7% neigh_resolve_output vmlinux 3103.00 3.6% fib_table_lookup vmlinux 2098.00 2.4% _raw_read_lock_bh vmlinux 2081.00 2.4% kmem_cache_alloc vmlinux 2013.00 2.3% _raw_spin_lock vmlinux 1763.00 2.0% __copy_from_user_ll vmlinux 1763.00 2.0% ip_output vmlinux 1761.00 2.0% ipv4_dst_destroy vmlinux 1631.00 1.9% eth_header vmlinux 1440.00 1.7% _raw_read_unlock_bh vmlinux Reference results, if IP route cache is enabled : real 0m29.718s user 0m10.845s sys 7m37.341s 25213.00 29.5% __ip_route_output_key vmlinux 9011.00 10.5% dst_release vmlinux 4817.00 5.6% ip_push_pending_frames vmlinux 4232.00 5.0% ip_finish_output vmlinux 3940.00 4.6% udp_sendmsg vmlinux 3730.00 4.4% __copy_from_user_ll vmlinux 3716.00 4.4% ip_route_output_flow vmlinux 2451.00 2.9% __xfrm_lookup vmlinux 2221.00 2.6% ip_append_data vmlinux 1718.00 2.0% _raw_spin_lock_bh vmlinux 1655.00 1.9% __alloc_skb vmlinux 1572.00 1.8% sock_wfree vmlinux 1345.00 1.6% kfree vmlinux Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 3 ++- net/ipv4/fib_frontend.c | 27 +++++++++++----------- net/ipv4/fib_hash.c | 5 ++-- net/ipv4/fib_lookup.h | 2 +- net/ipv4/fib_rules.c | 3 ++- net/ipv4/fib_semantics.c | 21 +++++++++++++---- net/ipv4/fib_trie.c | 10 ++++---- net/ipv4/route.c | 59 ++++++++++++++++++++---------------------------- 8 files changed, 66 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index cfb7d25..21698f8 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -225,7 +225,8 @@ jumped: err = ops->action(rule, fl, flags, arg); if (err != -EAGAIN) { - if (likely(atomic_inc_not_zero(&rule->refcnt))) { + if ((arg->flags & FIB_LOOKUP_NOREF) || + likely(atomic_inc_not_zero(&rule->refcnt))) { arg->rule = rule; goto out; } diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index b05c23b..919f2ad 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -168,8 +168,11 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) struct fib_result res = { 0 }; struct net_device *dev = NULL; - if (fib_lookup(net, &fl, &res)) + rcu_read_lock(); + if (fib_lookup(net, &fl, &res)) { + rcu_read_unlock(); return NULL; + } if (res.type != RTN_LOCAL) goto out; dev = FIB_RES_DEV(res); @@ -177,7 +180,7 @@ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) if (dev && devref) dev_hold(dev); out: - fib_res_put(&res); + rcu_read_unlock(); return dev; } EXPORT_SYMBOL(__ip_dev_find); @@ -207,11 +210,12 @@ static inline unsigned __inet_dev_addr_type(struct net *net, local_table = fib_get_table(net, RT_TABLE_LOCAL); if (local_table) { ret = RTN_UNICAST; - if (!fib_table_lookup(local_table, &fl, &res)) { + rcu_read_lock(); + if (!fib_table_lookup(local_table, &fl, &res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi->fib_dev) ret = res.type; - fib_res_put(&res); } + rcu_read_unlock(); } return ret; } @@ -235,6 +239,7 @@ EXPORT_SYMBOL(inet_dev_addr_type); * - figure out what "logical" interface this packet arrived * and calculate "specific destination" address. * - check, that packet arrived from expected physical interface. + * called with rcu_read_lock() */ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct net_device *dev, __be32 *spec_dst, @@ -259,7 +264,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, struct net *net; no_addr = rpf = accept_local = 0; - rcu_read_lock(); in_dev = __in_dev_get_rcu(dev); if (in_dev) { no_addr = in_dev->ifa_list == NULL; @@ -268,7 +272,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, if (mark && !IN_DEV_SRC_VMARK(in_dev)) fl.mark = 0; } - rcu_read_unlock(); if (in_dev == NULL) goto e_inval; @@ -278,7 +281,7 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, goto last_resort; if (res.type != RTN_UNICAST) { if (res.type != RTN_LOCAL || !accept_local) - goto e_inval_res; + goto e_inval; } *spec_dst = FIB_RES_PREFSRC(res); fib_combine_itag(itag, &res); @@ -299,10 +302,8 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, #endif if (dev_match) { ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; - fib_res_put(&res); return ret; } - fib_res_put(&res); if (no_addr) goto last_resort; if (rpf == 1) @@ -315,7 +316,6 @@ int fib_validate_source(__be32 src, __be32 dst, u8 tos, int oif, *spec_dst = FIB_RES_PREFSRC(res); ret = FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } - fib_res_put(&res); } return ret; @@ -326,8 +326,6 @@ last_resort: *itag = 0; return 0; -e_inval_res: - fib_res_put(&res); e_inval: return -EINVAL; e_rpf: @@ -873,15 +871,16 @@ static void nl_fib_lookup(struct fib_result_nl *frn, struct fib_table *tb) local_bh_disable(); frn->tb_id = tb->tb_id; - frn->err = fib_table_lookup(tb, &fl, &res); + rcu_read_lock(); + frn->err = fib_table_lookup(tb, &fl, &res, FIB_LOOKUP_NOREF); if (!frn->err) { frn->prefixlen = res.prefixlen; frn->nh_sel = res.nh_sel; frn->type = res.type; frn->scope = res.scope; - fib_res_put(&res); } + rcu_read_unlock(); local_bh_enable(); } } diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 4ed7e0d..83cca68 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -244,7 +244,8 @@ fn_new_zone(struct fn_hash *table, int z) } int fib_table_lookup(struct fib_table *tb, - const struct flowi *flp, struct fib_result *res) + const struct flowi *flp, struct fib_result *res, + int fib_flags) { int err; struct fn_zone *fz; @@ -264,7 +265,7 @@ int fib_table_lookup(struct fib_table *tb, err = fib_semantic_match(&f->fn_alias, flp, res, - fz->fz_order); + fz->fz_order, fib_flags); if (err <= 0) goto out; } diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 637b133..b9c9a9f 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -22,7 +22,7 @@ struct fib_alias { /* Exported by fib_semantics.c */ extern int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen); + struct fib_result *res, int prefixlen, int fib_flags); extern void fib_release_info(struct fib_info *); extern struct fib_info *fib_create_info(struct fib_config *cfg); extern int fib_nh_match(struct fib_config *cfg, struct fib_info *fi); diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 3230052..7981a24 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -57,6 +57,7 @@ int fib_lookup(struct net *net, struct flowi *flp, struct fib_result *res) { struct fib_lookup_arg arg = { .result = res, + .flags = FIB_LOOKUP_NOREF, }; int err; @@ -94,7 +95,7 @@ static int fib4_rule_action(struct fib_rule *rule, struct flowi *flp, if (!tbl) goto errout; - err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result); + err = fib_table_lookup(tbl, flp, (struct fib_result *) arg->result, arg->flags); if (err > 0) err = -EAGAIN; errout: diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index ba52f39..0f80dfc 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -148,6 +148,13 @@ static const struct /* Release a nexthop info record */ +static void free_fib_info_rcu(struct rcu_head *head) +{ + struct fib_info *fi = container_of(head, struct fib_info, rcu); + + kfree(fi); +} + void free_fib_info(struct fib_info *fi) { if (fi->fib_dead == 0) { @@ -161,7 +168,7 @@ void free_fib_info(struct fib_info *fi) } endfor_nexthops(fi); fib_info_cnt--; release_net(fi->fib_net); - kfree(fi); + call_rcu(&fi->rcu, free_fib_info_rcu); } void fib_release_info(struct fib_info *fi) @@ -553,6 +560,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, nh->nh_scope = RT_SCOPE_LINK; return 0; } + rcu_read_lock(); { struct flowi fl = { .nl_u = { @@ -568,8 +576,10 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (fl.fl4_scope < RT_SCOPE_LINK) fl.fl4_scope = RT_SCOPE_LINK; err = fib_lookup(net, &fl, &res); - if (err) + if (err) { + rcu_read_unlock(); return err; + } } err = -EINVAL; if (res.type != RTN_UNICAST && res.type != RTN_LOCAL) @@ -585,7 +595,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, goto out; err = 0; out: - fib_res_put(&res); + rcu_read_unlock(); return err; } else { struct in_device *in_dev; @@ -879,7 +889,7 @@ failure: /* Note! fib_semantic_match intentionally uses RCU list functions. */ int fib_semantic_match(struct list_head *head, const struct flowi *flp, - struct fib_result *res, int prefixlen) + struct fib_result *res, int prefixlen, int fib_flags) { struct fib_alias *fa; int nh_sel = 0; @@ -943,7 +953,8 @@ out_fill_res: res->type = fa->fa_type; res->scope = fa->fa_scope; res->fi = fa->fa_info; - atomic_inc(&res->fi->fib_clntref); + if (!(fib_flags & FIB_LOOKUP_NOREF)) + atomic_inc(&res->fi->fib_clntref); return 0; } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a96e5ec..271c89b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1342,7 +1342,7 @@ err: /* should be called with rcu_read_lock */ static int check_leaf(struct trie *t, struct leaf *l, t_key key, const struct flowi *flp, - struct fib_result *res) + struct fib_result *res, int fib_flags) { struct leaf_info *li; struct hlist_head *hhead = &l->list; @@ -1356,7 +1356,7 @@ static int check_leaf(struct trie *t, struct leaf *l, if (l->key != (key & ntohl(mask))) continue; - err = fib_semantic_match(&li->falh, flp, res, plen); + err = fib_semantic_match(&li->falh, flp, res, plen, fib_flags); #ifdef CONFIG_IP_FIB_TRIE_STATS if (err <= 0) @@ -1372,7 +1372,7 @@ static int check_leaf(struct trie *t, struct leaf *l, } int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, - struct fib_result *res) + struct fib_result *res, int fib_flags) { struct trie *t = (struct trie *) tb->tb_data; int ret; @@ -1399,7 +1399,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, /* Just a leaf? */ if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res); + ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); goto found; } @@ -1424,7 +1424,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, } if (IS_LEAF(n)) { - ret = check_leaf(t, (struct leaf *)n, key, flp, res); + ret = check_leaf(t, (struct leaf *)n, key, flp, res, fib_flags); if (ret > 0) goto backtrace; goto found; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 04e0df8..7864d0c 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1773,12 +1773,15 @@ void ip_rt_get_source(u8 *addr, struct rtable *rt) if (rt->fl.iif == 0) src = rt->rt_src; - else if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) { - src = FIB_RES_PREFSRC(res); - fib_res_put(&res); - } else - src = inet_select_addr(rt->dst.dev, rt->rt_gateway, + else { + rcu_read_lock(); + if (fib_lookup(dev_net(rt->dst.dev), &rt->fl, &res) == 0) + src = FIB_RES_PREFSRC(res); + else + src = inet_select_addr(rt->dst.dev, rt->rt_gateway, RT_SCOPE_UNIVERSE); + rcu_read_unlock(); + } memcpy(addr, &src, 4); } @@ -2081,6 +2084,7 @@ static int ip_mkroute_input(struct sk_buff *skb, * Such approach solves two big problems: * 1. Not simplex devices are handled properly. * 2. IP spoofing attempts are filtered with 100% of guarantee. + * called with rcu_read_lock() */ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2102,7 +2106,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, unsigned hash; __be32 spec_dst; int err = -EINVAL; - int free_res = 0; struct net * net = dev_net(dev); /* IP on this device is disabled. */ @@ -2134,12 +2137,12 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, /* * Now we are ready to route packet. */ - if ((err = fib_lookup(net, &fl, &res)) != 0) { + err = fib_lookup(net, &fl, &res); + if (err != 0) { if (!IN_DEV_FORWARD(in_dev)) goto e_hostunreach; goto no_route; } - free_res = 1; RT_CACHE_STAT_INC(in_slow_tot); @@ -2148,8 +2151,8 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (res.type == RTN_LOCAL) { err = fib_validate_source(saddr, daddr, tos, - net->loopback_dev->ifindex, - dev, &spec_dst, &itag, skb->mark); + net->loopback_dev->ifindex, + dev, &spec_dst, &itag, skb->mark); if (err < 0) goto martian_source_keep_err; if (err) @@ -2164,9 +2167,6 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, goto martian_destination; err = ip_mkroute_input(skb, &res, &fl, in_dev, daddr, saddr, tos); -done: - if (free_res) - fib_res_put(&res); out: return err; brd_input: @@ -2226,7 +2226,7 @@ local_input: rth->rt_type = res.type; hash = rt_hash(daddr, saddr, fl.iif, rt_genid(net)); err = rt_intern_hash(hash, rth, NULL, skb, fl.iif); - goto done; + goto out; no_route: RT_CACHE_STAT_INC(in_no_route); @@ -2249,21 +2249,21 @@ martian_destination: e_hostunreach: err = -EHOSTUNREACH; - goto done; + goto out; e_inval: err = -EINVAL; - goto done; + goto out; e_nobufs: err = -ENOBUFS; - goto done; + goto out; martian_source: err = -EINVAL; martian_source_keep_err: ip_handle_martian_source(dev, in_dev, skb, daddr, saddr); - goto done; + goto out; } int ip_route_input_common(struct sk_buff *skb, __be32 daddr, __be32 saddr, @@ -2349,6 +2349,7 @@ skip_cache: } EXPORT_SYMBOL(ip_route_input_common); +/* called with rcu_read_lock() */ static int __mkroute_output(struct rtable **result, struct fib_result *res, const struct flowi *fl, @@ -2373,18 +2374,13 @@ static int __mkroute_output(struct rtable **result, if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; - rcu_read_lock(); in_dev = __in_dev_get_rcu(dev_out); - if (!in_dev) { - rcu_read_unlock(); + if (!in_dev) return -EINVAL; - } + if (res->type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; - if (res->fi) { - fib_info_put(res->fi); - res->fi = NULL; - } + res->fi = NULL; } else if (res->type == RTN_MULTICAST) { flags |= RTCF_MULTICAST | RTCF_LOCAL; if (!ip_check_mc(in_dev, oldflp->fl4_dst, oldflp->fl4_src, @@ -2394,10 +2390,8 @@ static int __mkroute_output(struct rtable **result, * default one, but do not gateway in this case. * Yes, it is hack. */ - if (res->fi && res->prefixlen < 4) { - fib_info_put(res->fi); + if (res->fi && res->prefixlen < 4) res->fi = NULL; - } } @@ -2467,6 +2461,7 @@ static int __mkroute_output(struct rtable **result, return 0; } +/* called with rcu_read_lock() */ static int ip_mkroute_output(struct rtable **rp, struct fib_result *res, const struct flowi *fl, @@ -2509,7 +2504,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, struct fib_result res; unsigned int flags = 0; struct net_device *dev_out = NULL; - int free_res = 0; int err; @@ -2636,15 +2630,12 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, err = -ENETUNREACH; goto out; } - free_res = 1; if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; dev_out = net->loopback_dev; fl.oif = dev_out->ifindex; - if (res.fi) - fib_info_put(res.fi); res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; @@ -2668,8 +2659,6 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, make_route: err = ip_mkroute_output(rp, &res, &fl, oldflp, dev_out, flags); - if (free_res) - fib_res_put(&res); out: return err; } -- cgit v1.1 From 53f73c09d64f1fa7d7e6e8b6bb7468d42eddc92d Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Oct 2010 19:37:40 +0200 Subject: mac80211: avoid transmitting delBA to old AP When roaming while we have active BA session, we can end up transmitting delBA frames to the old AP while we're already on the new AP's channel, which can cause warnings. Simply avoid sending those frames, but still tear down the internal session state, since they are not really necessary anyway as we will implicitly disassociate when sending the association to the new AP. Signed-off-by: Johannes Berg Acked-by: Luis R. Rodriguez Signed-off-by: John W. Linville --- net/mac80211/agg-rx.c | 8 ++++---- net/mac80211/agg-tx.c | 14 +++++++++----- net/mac80211/debugfs_sta.c | 3 ++- net/mac80211/ht.c | 17 ++++++++++------- net/mac80211/ieee80211_i.h | 12 +++++++----- net/mac80211/iface.c | 3 ++- net/mac80211/mlme.c | 18 +++++++++--------- net/mac80211/pm.c | 2 +- net/mac80211/sta_info.c | 2 +- net/mac80211/sta_info.h | 2 ++ net/mac80211/util.c | 2 +- 11 files changed, 48 insertions(+), 35 deletions(-) (limited to 'net') diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 58eab9e..720b7a8 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -56,7 +56,7 @@ static void ieee80211_free_tid_rx(struct rcu_head *h) } void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, - u16 initiator, u16 reason) + u16 initiator, u16 reason, bool tx) { struct ieee80211_local *local = sta->local; struct tid_ampdu_rx *tid_rx; @@ -81,7 +81,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, "aggregation for tid %d\n", tid); /* check if this is a self generated aggregation halt */ - if (initiator == WLAN_BACK_RECIPIENT) + if (initiator == WLAN_BACK_RECIPIENT && tx) ieee80211_send_delba(sta->sdata, sta->sta.addr, tid, 0, reason); @@ -92,10 +92,10 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, } void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, - u16 initiator, u16 reason) + u16 initiator, u16 reason, bool tx) { mutex_lock(&sta->ampdu_mlme.mtx); - ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason); + ___ieee80211_stop_rx_ba_session(sta, tid, initiator, reason, tx); mutex_unlock(&sta->ampdu_mlme.mtx); } diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 8f23401..d4679b2 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -145,7 +145,8 @@ static void kfree_tid_tx(struct rcu_head *rcu_head) } int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator) + enum ieee80211_back_parties initiator, + bool tx) { struct ieee80211_local *local = sta->local; struct tid_ampdu_tx *tid_tx = sta->ampdu_mlme.tid_tx[tid]; @@ -185,6 +186,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, clear_bit(HT_AGG_STATE_OPERATIONAL, &tid_tx->state); tid_tx->stop_initiator = initiator; + tid_tx->tx_stop = tx; ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_TX_STOP, @@ -577,13 +579,14 @@ void ieee80211_start_tx_ba_cb_irqsafe(struct ieee80211_vif *vif, EXPORT_SYMBOL(ieee80211_start_tx_ba_cb_irqsafe); int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator) + enum ieee80211_back_parties initiator, + bool tx) { int ret; mutex_lock(&sta->ampdu_mlme.mtx); - ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator); + ret = ___ieee80211_stop_tx_ba_session(sta, tid, initiator, tx); mutex_unlock(&sta->ampdu_mlme.mtx); @@ -672,7 +675,7 @@ void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid) goto unlock_sta; } - if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR) + if (tid_tx->stop_initiator == WLAN_BACK_INITIATOR && tid_tx->tx_stop) ieee80211_send_delba(sta->sdata, ra, tid, WLAN_BACK_INITIATOR, WLAN_REASON_QSTA_NOT_USE); @@ -772,7 +775,8 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local, sta->ampdu_mlme.addba_req_num[tid] = 0; } else { - ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR); + ___ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_INITIATOR, + true); } out: diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index 6b7ff9f..50c40ea 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -196,7 +196,8 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu else ret = ieee80211_stop_tx_ba_session(&sta->sta, tid); } else { - __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, 3); + __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, + 3, true); ret = 0; } diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 11f74f5..4214bb6 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -101,16 +101,16 @@ void ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_supported_band *sband, ht_cap->mcs.rx_mask[32/8] |= 1; } -void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta) +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx) { int i; cancel_work_sync(&sta->ampdu_mlme.work); for (i = 0; i < STA_TID_NUM; i++) { - __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR); + __ieee80211_stop_tx_ba_session(sta, i, WLAN_BACK_INITIATOR, tx); __ieee80211_stop_rx_ba_session(sta, i, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_LEAVE_QBSS); + WLAN_REASON_QSTA_LEAVE_QBSS, tx); } } @@ -135,7 +135,7 @@ void ieee80211_ba_session_work(struct work_struct *work) if (test_and_clear_bit(tid, sta->ampdu_mlme.tid_rx_timer_expired)) ___ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_TIMEOUT); + WLAN_REASON_QSTA_TIMEOUT, true); tid_tx = sta->ampdu_mlme.tid_tx[tid]; if (!tid_tx) @@ -146,7 +146,8 @@ void ieee80211_ba_session_work(struct work_struct *work) else if (test_and_clear_bit(HT_AGG_STATE_WANT_STOP, &tid_tx->state)) ___ieee80211_stop_tx_ba_session(sta, tid, - WLAN_BACK_INITIATOR); + WLAN_BACK_INITIATOR, + true); } mutex_unlock(&sta->ampdu_mlme.mtx); } @@ -214,9 +215,11 @@ void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, #endif /* CONFIG_MAC80211_HT_DEBUG */ if (initiator == WLAN_BACK_INITIATOR) - __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0); + __ieee80211_stop_rx_ba_session(sta, tid, WLAN_BACK_INITIATOR, 0, + true); else - __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT); + __ieee80211_stop_tx_ba_session(sta, tid, WLAN_BACK_RECIPIENT, + true); } int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 08509e2..76c2b50 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -1175,10 +1175,10 @@ int ieee80211_send_smps_action(struct ieee80211_sub_if_data *sdata, void ieee80211_request_smps_work(struct work_struct *work); void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, - u16 initiator, u16 reason); + u16 initiator, u16 reason, bool stop); void __ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid, - u16 initiator, u16 reason); -void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta); + u16 initiator, u16 reason, bool stop); +void ieee80211_sta_tear_down_BA_sessions(struct sta_info *sta, bool tx); void ieee80211_process_delba(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, struct ieee80211_mgmt *mgmt, size_t len); @@ -1192,9 +1192,11 @@ void ieee80211_process_addba_request(struct ieee80211_local *local, size_t len); int __ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator); + enum ieee80211_back_parties initiator, + bool tx); int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid, - enum ieee80211_back_parties initiator); + enum ieee80211_back_parties initiator, + bool tx); void ieee80211_start_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u16 tid); void ieee80211_stop_tx_ba_cb(struct ieee80211_vif *vif, u8 *ra, u8 tid); void ieee80211_ba_session_work(struct work_struct *work); diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 438a2f5..e99d1b6 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -796,7 +796,8 @@ static void ieee80211_iface_work(struct work_struct *work) __ieee80211_stop_rx_ba_session( sta, tid, WLAN_BACK_RECIPIENT, - WLAN_REASON_QSTA_REQUIRE_SETUP); + WLAN_REASON_QSTA_REQUIRE_SETUP, + true); } mutex_unlock(&local->sta_mtx); } else switch (sdata->vif.type) { diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index cd13aa8..5695c94 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -921,7 +921,7 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, } static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, - bool remove_sta) + bool remove_sta, bool tx) { struct ieee80211_if_managed *ifmgd = &sdata->u.mgd; struct ieee80211_local *local = sdata->local; @@ -960,7 +960,7 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata, sta = sta_info_get(sdata, bssid); if (sta) { set_sta_flags(sta, WLAN_STA_BLOCK_BA); - ieee80211_sta_tear_down_BA_sessions(sta); + ieee80211_sta_tear_down_BA_sessions(sta, tx); } mutex_unlock(&local->sta_mtx); @@ -1124,7 +1124,7 @@ static void __ieee80211_connection_loss(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "Connection to AP %pM lost.\n", bssid); - ieee80211_set_disassoc(sdata, true); + ieee80211_set_disassoc(sdata, true, true); mutex_unlock(&ifmgd->mtx); mutex_lock(&local->mtx); @@ -1197,7 +1197,7 @@ ieee80211_rx_mgmt_deauth(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: deauthenticated from %pM (Reason: %u)\n", sdata->name, bssid, reason_code); - ieee80211_set_disassoc(sdata, true); + ieee80211_set_disassoc(sdata, true, false); mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); mutex_unlock(&sdata->local->mtx); @@ -1229,7 +1229,7 @@ ieee80211_rx_mgmt_disassoc(struct ieee80211_sub_if_data *sdata, printk(KERN_DEBUG "%s: disassociated from %pM (Reason: %u)\n", sdata->name, mgmt->sa, reason_code); - ieee80211_set_disassoc(sdata, true); + ieee80211_set_disassoc(sdata, true, false); mutex_lock(&sdata->local->mtx); ieee80211_recalc_idle(sdata->local); mutex_unlock(&sdata->local->mtx); @@ -1880,7 +1880,7 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) printk(KERN_DEBUG "No probe response from AP %pM" " after %dms, disconnecting.\n", bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ); - ieee80211_set_disassoc(sdata, true); + ieee80211_set_disassoc(sdata, true, true); mutex_unlock(&ifmgd->mtx); mutex_lock(&local->mtx); ieee80211_recalc_idle(local); @@ -2204,7 +2204,7 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, } /* Trying to reassociate - clear previous association state */ - ieee80211_set_disassoc(sdata, true); + ieee80211_set_disassoc(sdata, true, false); } mutex_unlock(&ifmgd->mtx); @@ -2318,7 +2318,7 @@ int ieee80211_mgd_deauth(struct ieee80211_sub_if_data *sdata, memcpy(bssid, req->bss->bssid, ETH_ALEN); if (ifmgd->associated == req->bss) { - ieee80211_set_disassoc(sdata, false); + ieee80211_set_disassoc(sdata, false, true); mutex_unlock(&ifmgd->mtx); assoc_bss = true; } else { @@ -2401,7 +2401,7 @@ int ieee80211_mgd_disassoc(struct ieee80211_sub_if_data *sdata, sdata->name, req->bss->bssid, req->reason_code); memcpy(bssid, req->bss->bssid, ETH_ALEN); - ieee80211_set_disassoc(sdata, false); + ieee80211_set_disassoc(sdata, false, true); mutex_unlock(&ifmgd->mtx); diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index ce671df..e3e2bce3 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -46,7 +46,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw) list_for_each_entry(sta, &local->sta_list, list) { if (hw->flags & IEEE80211_HW_AMPDU_AGGREGATION) { set_sta_flags(sta, WLAN_STA_BLOCK_BA); - ieee80211_sta_tear_down_BA_sessions(sta); + ieee80211_sta_tear_down_BA_sessions(sta, true); } if (sta->uploaded) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index ca2cba9..aeaf2d6 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -633,7 +633,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta) * will be sufficient. */ set_sta_flags(sta, WLAN_STA_BLOCK_BA); - ieee80211_sta_tear_down_BA_sessions(sta); + ieee80211_sta_tear_down_BA_sessions(sta, true); spin_lock_irqsave(&local->sta_lock, flags); ret = sta_info_hash_del(local, sta); diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 810c5ce..cf21a2e 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -79,6 +79,7 @@ enum ieee80211_sta_info_flags { * @dialog_token: dialog token for aggregation session * @state: session state (see above) * @stop_initiator: initiator of a session stop + * @tx_stop: TX DelBA frame when stopping * * This structure is protected by RCU and the per-station * spinlock. Assignments to the array holding it must hold @@ -95,6 +96,7 @@ struct tid_ampdu_tx { unsigned long state; u8 dialog_token; u8 stop_initiator; + bool tx_stop; }; /** diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 4ee8f2b..0b6fc92 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -1221,7 +1221,7 @@ int ieee80211_reconfig(struct ieee80211_local *local) mutex_lock(&local->sta_mtx); list_for_each_entry(sta, &local->sta_list, list) { - ieee80211_sta_tear_down_BA_sessions(sta); + ieee80211_sta_tear_down_BA_sessions(sta, true); clear_sta_flags(sta, WLAN_STA_BLOCK_BA); } -- cgit v1.1 From e31b82136d1adc7a599b6e99d3321e5831841f5a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Oct 2010 19:39:30 +0200 Subject: cfg80211/mac80211: allow per-station GTKs This adds API to allow adding per-station GTKs, updates mac80211 to support it, and also allows drivers to remove a key from hwaccel again when this may be necessary due to multiple GTKs. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 32 +++++++++++----- net/mac80211/ieee80211_i.h | 2 - net/mac80211/key.c | 95 +++++++++++++++++++++++++++++----------------- net/mac80211/key.h | 3 ++ net/mac80211/rx.c | 41 ++++++++++++-------- net/mac80211/sta_info.c | 10 ++--- net/mac80211/sta_info.h | 6 ++- net/mac80211/tx.c | 2 +- net/wireless/core.h | 2 +- net/wireless/ibss.c | 2 +- net/wireless/nl80211.c | 87 ++++++++++++++++++++++++++++++++++++++---- net/wireless/sme.c | 2 +- net/wireless/util.c | 12 ++++-- net/wireless/wext-compat.c | 38 ++++++++++++------- 14 files changed, 239 insertions(+), 95 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 94bf550..8b0e874 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -103,7 +103,7 @@ static int ieee80211_change_iface(struct wiphy *wiphy, } static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, const u8 *mac_addr, + u8 key_idx, bool pairwise, const u8 *mac_addr, struct key_params *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); @@ -131,6 +131,9 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, if (IS_ERR(key)) return PTR_ERR(key); + if (pairwise) + key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; + mutex_lock(&sdata->local->sta_mtx); if (mac_addr) { @@ -153,7 +156,7 @@ static int ieee80211_add_key(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, const u8 *mac_addr) + u8 key_idx, bool pairwise, const u8 *mac_addr) { struct ieee80211_sub_if_data *sdata; struct sta_info *sta; @@ -170,10 +173,17 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, if (!sta) goto out_unlock; - if (sta->key) { - ieee80211_key_free(sdata->local, sta->key); - WARN_ON(sta->key); - ret = 0; + if (pairwise) { + if (sta->ptk) { + ieee80211_key_free(sdata->local, sta->ptk); + ret = 0; + } + } else { + if (sta->gtk[key_idx]) { + ieee80211_key_free(sdata->local, + sta->gtk[key_idx]); + ret = 0; + } } goto out_unlock; @@ -195,7 +205,8 @@ static int ieee80211_del_key(struct wiphy *wiphy, struct net_device *dev, } static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, - u8 key_idx, const u8 *mac_addr, void *cookie, + u8 key_idx, bool pairwise, const u8 *mac_addr, + void *cookie, void (*callback)(void *cookie, struct key_params *params)) { @@ -203,7 +214,7 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, struct sta_info *sta = NULL; u8 seq[6] = {0}; struct key_params params; - struct ieee80211_key *key; + struct ieee80211_key *key = NULL; u32 iv32; u16 iv16; int err = -ENOENT; @@ -217,7 +228,10 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev, if (!sta) goto out; - key = sta->key; + if (pairwise) + key = sta->ptk; + else if (key_idx < NUM_DEFAULT_KEYS) + key = sta->gtk[key_idx]; } else key = sdata->keys[key_idx]; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 76c2b50..f0610fa 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -549,8 +549,6 @@ struct ieee80211_sub_if_data { struct ieee80211_fragment_entry fragments[IEEE80211_FRAGMENT_MAX]; unsigned int fragment_next; -#define NUM_DEFAULT_KEYS 4 -#define NUM_DEFAULT_MGMT_KEYS 2 struct ieee80211_key *keys[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; struct ieee80211_key *default_key; struct ieee80211_key *default_mgmt_key; diff --git a/net/mac80211/key.c b/net/mac80211/key.c index 6a63d1a..ccd676b 100644 --- a/net/mac80211/key.c +++ b/net/mac80211/key.c @@ -68,15 +68,21 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) might_sleep(); - if (!key->local->ops->set_key) { - ret = -EOPNOTSUPP; + if (!key->local->ops->set_key) goto out_unsupported; - } assert_key_lock(key->local); sta = get_sta_for_key(key); + /* + * If this is a per-STA GTK, check if it + * is supported; if not, return. + */ + if (sta && !(key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE) && + !(key->local->hw.flags & IEEE80211_HW_SUPPORTS_PER_STA_GTK)) + goto out_unsupported; + sdata = key->sdata; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) sdata = container_of(sdata->bss, @@ -85,31 +91,28 @@ static int ieee80211_key_enable_hw_accel(struct ieee80211_key *key) ret = drv_set_key(key->local, SET_KEY, sdata, sta, &key->conf); - if (!ret) + if (!ret) { key->flags |= KEY_FLAG_UPLOADED_TO_HARDWARE; + return 0; + } - if (ret && ret != -ENOSPC && ret != -EOPNOTSUPP) + if (ret != -ENOSPC && ret != -EOPNOTSUPP) wiphy_err(key->local->hw.wiphy, "failed to set key (%d, %pM) to hardware (%d)\n", key->conf.keyidx, sta ? sta->addr : bcast_addr, ret); -out_unsupported: - if (ret) { - switch (key->conf.cipher) { - case WLAN_CIPHER_SUITE_WEP40: - case WLAN_CIPHER_SUITE_WEP104: - case WLAN_CIPHER_SUITE_TKIP: - case WLAN_CIPHER_SUITE_CCMP: - case WLAN_CIPHER_SUITE_AES_CMAC: - /* all of these we can do in software */ - ret = 0; - break; - default: - ret = -EINVAL; - } + out_unsupported: + switch (key->conf.cipher) { + case WLAN_CIPHER_SUITE_WEP40: + case WLAN_CIPHER_SUITE_WEP104: + case WLAN_CIPHER_SUITE_TKIP: + case WLAN_CIPHER_SUITE_CCMP: + case WLAN_CIPHER_SUITE_AES_CMAC: + /* all of these we can do in software */ + return 0; + default: + return -EINVAL; } - - return ret; } static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) @@ -147,6 +150,26 @@ static void ieee80211_key_disable_hw_accel(struct ieee80211_key *key) key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; } +void ieee80211_key_removed(struct ieee80211_key_conf *key_conf) +{ + struct ieee80211_key *key; + + key = container_of(key_conf, struct ieee80211_key, conf); + + might_sleep(); + assert_key_lock(key->local); + + key->flags &= ~KEY_FLAG_UPLOADED_TO_HARDWARE; + + /* + * Flush TX path to avoid attempts to use this key + * after this function returns. Until then, drivers + * must be prepared to handle the key. + */ + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(ieee80211_key_removed); + static void __ieee80211_set_default_key(struct ieee80211_sub_if_data *sdata, int idx) { @@ -202,6 +225,7 @@ void ieee80211_set_default_mgmt_key(struct ieee80211_sub_if_data *sdata, static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, struct sta_info *sta, + bool pairwise, struct ieee80211_key *old, struct ieee80211_key *new) { @@ -210,8 +234,14 @@ static void __ieee80211_key_replace(struct ieee80211_sub_if_data *sdata, if (new) list_add(&new->list, &sdata->key_list); - if (sta) { - rcu_assign_pointer(sta->key, new); + if (sta && pairwise) { + rcu_assign_pointer(sta->ptk, new); + } else if (sta) { + if (old) + idx = old->conf.keyidx; + else + idx = new->conf.keyidx; + rcu_assign_pointer(sta->gtk[idx], new); } else { WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx); @@ -355,6 +385,7 @@ int ieee80211_key_link(struct ieee80211_key *key, { struct ieee80211_key *old_key; int idx, ret; + bool pairwise = key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE; BUG_ON(!sdata); BUG_ON(!key); @@ -371,13 +402,6 @@ int ieee80211_key_link(struct ieee80211_key *key, */ if (test_sta_flags(sta, WLAN_STA_WME)) key->conf.flags |= IEEE80211_KEY_FLAG_WMM_STA; - - /* - * This key is for a specific sta interface, - * inform the driver that it should try to store - * this key as pairwise key. - */ - key->conf.flags |= IEEE80211_KEY_FLAG_PAIRWISE; } else { if (sdata->vif.type == NL80211_IFTYPE_STATION) { struct sta_info *ap; @@ -399,12 +423,14 @@ int ieee80211_key_link(struct ieee80211_key *key, mutex_lock(&sdata->local->key_mtx); - if (sta) - old_key = sta->key; + if (sta && pairwise) + old_key = sta->ptk; + else if (sta) + old_key = sta->gtk[idx]; else old_key = sdata->keys[idx]; - __ieee80211_key_replace(sdata, sta, old_key, key); + __ieee80211_key_replace(sdata, sta, pairwise, old_key, key); __ieee80211_key_destroy(old_key); ieee80211_debugfs_key_add(key); @@ -423,7 +449,8 @@ static void __ieee80211_key_free(struct ieee80211_key *key) */ if (key->sdata) __ieee80211_key_replace(key->sdata, key->sta, - key, NULL); + key->conf.flags & IEEE80211_KEY_FLAG_PAIRWISE, + key, NULL); __ieee80211_key_destroy(key); } diff --git a/net/mac80211/key.h b/net/mac80211/key.h index cb9a4a6..0db1c0f 100644 --- a/net/mac80211/key.h +++ b/net/mac80211/key.h @@ -16,6 +16,9 @@ #include #include +#define NUM_DEFAULT_KEYS 4 +#define NUM_DEFAULT_MGMT_KEYS 2 + #define WEP_IV_LEN 4 #define WEP_ICV_LEN 4 #define ALG_TKIP_KEY_LEN 32 diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index b3e161f..b67221d 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -846,7 +846,7 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) int keyidx; int hdrlen; ieee80211_rx_result result = RX_DROP_UNUSABLE; - struct ieee80211_key *stakey = NULL; + struct ieee80211_key *sta_ptk = NULL; int mmie_keyidx = -1; __le16 fc; @@ -888,15 +888,15 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) rx->key = NULL; if (rx->sta) - stakey = rcu_dereference(rx->sta->key); + sta_ptk = rcu_dereference(rx->sta->ptk); fc = hdr->frame_control; if (!ieee80211_has_protected(fc)) mmie_keyidx = ieee80211_get_mmie_keyidx(rx->skb); - if (!is_multicast_ether_addr(hdr->addr1) && stakey) { - rx->key = stakey; + if (!is_multicast_ether_addr(hdr->addr1) && sta_ptk) { + rx->key = sta_ptk; if ((status->flag & RX_FLAG_DECRYPTED) && (status->flag & RX_FLAG_IV_STRIPPED)) return RX_CONTINUE; @@ -912,7 +912,10 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS) return RX_DROP_MONITOR; /* unexpected BIP keyidx */ - rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); + if (rx->sta) + rx->key = rcu_dereference(rx->sta->gtk[mmie_keyidx]); + if (!rx->key) + rx->key = rcu_dereference(rx->sdata->keys[mmie_keyidx]); } else if (!ieee80211_has_protected(fc)) { /* * The frame was not protected, so skip decryption. However, we @@ -955,17 +958,25 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) skb_copy_bits(rx->skb, hdrlen + 3, &keyid, 1); keyidx = keyid >> 6; - rx->key = rcu_dereference(rx->sdata->keys[keyidx]); + /* check per-station GTK first, if multicast packet */ + if (is_multicast_ether_addr(hdr->addr1) && rx->sta) + rx->key = rcu_dereference(rx->sta->gtk[keyidx]); - /* - * RSNA-protected unicast frames should always be sent with - * pairwise or station-to-station keys, but for WEP we allow - * using a key index as well. - */ - if (rx->key && rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 && - rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 && - !is_multicast_ether_addr(hdr->addr1)) - rx->key = NULL; + /* if not found, try default key */ + if (!rx->key) { + rx->key = rcu_dereference(rx->sdata->keys[keyidx]); + + /* + * RSNA-protected unicast frames should always be + * sent with pairwise or station-to-station keys, + * but for WEP we allow using a key index as well. + */ + if (rx->key && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP40 && + rx->key->conf.cipher != WLAN_CIPHER_SUITE_WEP104 && + !is_multicast_ether_addr(hdr->addr1)) + rx->key = NULL; + } } if (rx->key) { diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index aeaf2d6..6d8f897 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -616,7 +616,7 @@ static int __must_check __sta_info_destroy(struct sta_info *sta) struct ieee80211_sub_if_data *sdata; struct sk_buff *skb; unsigned long flags; - int ret; + int ret, i; might_sleep(); @@ -644,10 +644,10 @@ static int __must_check __sta_info_destroy(struct sta_info *sta) if (ret) return ret; - if (sta->key) { - ieee80211_key_free(local, sta->key); - WARN_ON(sta->key); - } + for (i = 0; i < NUM_DEFAULT_KEYS; i++) + ieee80211_key_free(local, sta->gtk[i]); + if (sta->ptk) + ieee80211_key_free(local, sta->ptk); sta->dead = true; diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index cf21a2e..9265aca 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -199,7 +199,8 @@ enum plink_state { * @hnext: hash table linked list pointer * @local: pointer to the global information * @sdata: virtual interface this station belongs to - * @key: peer key negotiated with this station, if any + * @ptk: peer key negotiated with this station, if any + * @gtk: group keys negotiated with this station, if any * @rate_ctrl: rate control algorithm reference * @rate_ctrl_priv: rate control private per-STA pointer * @last_tx_rate: rate used for last transmit, to report to userspace as @@ -254,7 +255,8 @@ struct sta_info { struct sta_info *hnext; struct ieee80211_local *local; struct ieee80211_sub_if_data *sdata; - struct ieee80211_key *key; + struct ieee80211_key *gtk[NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS]; + struct ieee80211_key *ptk; struct rate_control_ref *rate_ctrl; void *rate_ctrl_priv; spinlock_t lock; diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 258fbdb..96c5943 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -532,7 +532,7 @@ ieee80211_tx_h_select_key(struct ieee80211_tx_data *tx) if (unlikely(info->flags & IEEE80211_TX_INTFL_DONT_ENCRYPT)) tx->key = NULL; - else if (tx->sta && (key = rcu_dereference(tx->sta->key))) + else if (tx->sta && (key = rcu_dereference(tx->sta->ptk))) tx->key = key; else if (ieee80211_is_mgmt(hdr->frame_control) && is_multicast_ether_addr(hdr->addr1) && diff --git a/net/wireless/core.h b/net/wireless/core.h index 37580e0..2d1d4c7 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -375,7 +375,7 @@ bool cfg80211_sme_failed_reassoc(struct wireless_dev *wdev); /* internal helpers */ int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, - const u8 *mac_addr); + bool pairwise, const u8 *mac_addr); void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, size_t ie_len, u16 reason, bool from_ap); void cfg80211_sme_scan_done(struct net_device *dev); diff --git a/net/wireless/ibss.c b/net/wireless/ibss.c index 8cb6e08..f33fbb7 100644 --- a/net/wireless/ibss.c +++ b/net/wireless/ibss.c @@ -160,7 +160,7 @@ static void __cfg80211_clear_ibss(struct net_device *dev, bool nowext) */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) - rdev->ops->del_key(wdev->wiphy, dev, i, NULL); + rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL); if (wdev->current_bss) { cfg80211_unhold_bss(wdev->current_bss); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 0c94971..8826888 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -93,6 +93,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_KEY_CIPHER] = { .type = NLA_U32 }, [NL80211_ATTR_KEY_DEFAULT] = { .type = NLA_FLAG }, [NL80211_ATTR_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 }, + [NL80211_ATTR_KEY_TYPE] = { .type = NLA_U32 }, [NL80211_ATTR_BEACON_INTERVAL] = { .type = NLA_U32 }, [NL80211_ATTR_DTIM_PERIOD] = { .type = NLA_U32 }, @@ -168,7 +169,7 @@ static const struct nla_policy nl80211_policy[NL80211_ATTR_MAX+1] = { [NL80211_ATTR_FRAME_TYPE] = { .type = NLA_U16 }, }; -/* policy for the attributes */ +/* policy for the key attributes */ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { [NL80211_KEY_DATA] = { .type = NLA_BINARY, .len = WLAN_MAX_KEY_LEN }, [NL80211_KEY_IDX] = { .type = NLA_U8 }, @@ -176,6 +177,7 @@ static const struct nla_policy nl80211_key_policy[NL80211_KEY_MAX + 1] = { [NL80211_KEY_SEQ] = { .type = NLA_BINARY, .len = 8 }, [NL80211_KEY_DEFAULT] = { .type = NLA_FLAG }, [NL80211_KEY_DEFAULT_MGMT] = { .type = NLA_FLAG }, + [NL80211_KEY_TYPE] = { .type = NLA_U32 }, }; /* ifidx get helper */ @@ -306,6 +308,7 @@ static int nl80211_msg_put_channel(struct sk_buff *msg, struct key_parse { struct key_params p; int idx; + int type; bool def, defmgmt; }; @@ -336,6 +339,12 @@ static int nl80211_parse_key_new(struct nlattr *key, struct key_parse *k) if (tb[NL80211_KEY_CIPHER]) k->p.cipher = nla_get_u32(tb[NL80211_KEY_CIPHER]); + if (tb[NL80211_KEY_TYPE]) { + k->type = nla_get_u32(tb[NL80211_KEY_TYPE]); + if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) + return -EINVAL; + } + return 0; } @@ -360,6 +369,12 @@ static int nl80211_parse_key_old(struct genl_info *info, struct key_parse *k) k->def = !!info->attrs[NL80211_ATTR_KEY_DEFAULT]; k->defmgmt = !!info->attrs[NL80211_ATTR_KEY_DEFAULT_MGMT]; + if (info->attrs[NL80211_ATTR_KEY_TYPE]) { + k->type = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); + if (k->type < 0 || k->type >= NUM_NL80211_KEYTYPES) + return -EINVAL; + } + return 0; } @@ -369,6 +384,7 @@ static int nl80211_parse_key(struct genl_info *info, struct key_parse *k) memset(k, 0, sizeof(*k)); k->idx = -1; + k->type = -1; if (info->attrs[NL80211_ATTR_KEY]) err = nl80211_parse_key_new(info->attrs[NL80211_ATTR_KEY], k); @@ -433,7 +449,7 @@ nl80211_parse_connkeys(struct cfg80211_registered_device *rdev, } else if (parse.defmgmt) goto error; err = cfg80211_validate_key_settings(rdev, &parse.p, - parse.idx, NULL); + parse.idx, false, NULL); if (err) goto error; result->params[parse.idx].cipher = parse.p.cipher; @@ -516,6 +532,9 @@ static int nl80211_send_wiphy(struct sk_buff *msg, u32 pid, u32 seq, int flags, NLA_PUT_U16(msg, NL80211_ATTR_MAX_SCAN_IE_LEN, dev->wiphy.max_scan_ie_len); + if (dev->wiphy.flags & WIPHY_FLAG_IBSS_RSN) + NLA_PUT_FLAG(msg, NL80211_ATTR_SUPPORT_IBSS_RSN); + NLA_PUT(msg, NL80211_ATTR_CIPHER_SUITES, sizeof(u32) * dev->wiphy.n_cipher_suites, dev->wiphy.cipher_suites); @@ -1446,7 +1465,8 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) int err; struct net_device *dev = info->user_ptr[1]; u8 key_idx = 0; - u8 *mac_addr = NULL; + const u8 *mac_addr = NULL; + bool pairwise; struct get_key_cookie cookie = { .error = 0, }; @@ -1462,6 +1482,17 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + pairwise = !!mac_addr; + if (info->attrs[NL80211_ATTR_KEY_TYPE]) { + u32 kt = nla_get_u32(info->attrs[NL80211_ATTR_KEY_TYPE]); + if (kt >= NUM_NL80211_KEYTYPES) + return -EINVAL; + if (kt != NL80211_KEYTYPE_GROUP && + kt != NL80211_KEYTYPE_PAIRWISE) + return -EINVAL; + pairwise = kt == NL80211_KEYTYPE_PAIRWISE; + } + if (!rdev->ops->get_key) return -EOPNOTSUPP; @@ -1482,8 +1513,12 @@ static int nl80211_get_key(struct sk_buff *skb, struct genl_info *info) if (mac_addr) NLA_PUT(msg, NL80211_ATTR_MAC, ETH_ALEN, mac_addr); - err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, mac_addr, - &cookie, get_key_callback); + if (pairwise && mac_addr && + !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + return -ENOENT; + + err = rdev->ops->get_key(&rdev->wiphy, dev, key_idx, pairwise, + mac_addr, &cookie, get_key_callback); if (err) goto free_msg; @@ -1553,7 +1588,7 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) int err; struct net_device *dev = info->user_ptr[1]; struct key_parse key; - u8 *mac_addr = NULL; + const u8 *mac_addr = NULL; err = nl80211_parse_key(info, &key); if (err) @@ -1565,16 +1600,31 @@ static int nl80211_new_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + if (key.type == -1) { + if (mac_addr) + key.type = NL80211_KEYTYPE_PAIRWISE; + else + key.type = NL80211_KEYTYPE_GROUP; + } + + /* for now */ + if (key.type != NL80211_KEYTYPE_PAIRWISE && + key.type != NL80211_KEYTYPE_GROUP) + return -EINVAL; + if (!rdev->ops->add_key) return -EOPNOTSUPP; - if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, mac_addr)) + if (cfg80211_validate_key_settings(rdev, &key.p, key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE, + mac_addr)) return -EINVAL; wdev_lock(dev->ieee80211_ptr); err = nl80211_key_allowed(dev->ieee80211_ptr); if (!err) err = rdev->ops->add_key(&rdev->wiphy, dev, key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE, mac_addr, &key.p); wdev_unlock(dev->ieee80211_ptr); @@ -1596,13 +1646,32 @@ static int nl80211_del_key(struct sk_buff *skb, struct genl_info *info) if (info->attrs[NL80211_ATTR_MAC]) mac_addr = nla_data(info->attrs[NL80211_ATTR_MAC]); + if (key.type == -1) { + if (mac_addr) + key.type = NL80211_KEYTYPE_PAIRWISE; + else + key.type = NL80211_KEYTYPE_GROUP; + } + + /* for now */ + if (key.type != NL80211_KEYTYPE_PAIRWISE && + key.type != NL80211_KEYTYPE_GROUP) + return -EINVAL; + if (!rdev->ops->del_key) return -EOPNOTSUPP; wdev_lock(dev->ieee80211_ptr); err = nl80211_key_allowed(dev->ieee80211_ptr); + + if (key.type == NL80211_KEYTYPE_PAIRWISE && mac_addr && + !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + err = -ENOENT; + if (!err) - err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx, mac_addr); + err = rdev->ops->del_key(&rdev->wiphy, dev, key.idx, + key.type == NL80211_KEYTYPE_PAIRWISE, + mac_addr); #ifdef CONFIG_CFG80211_WEXT if (!err) { @@ -3212,6 +3281,8 @@ static int nl80211_authenticate(struct sk_buff *skb, struct genl_info *info) return err; if (key.idx >= 0) { + if (key.type != -1 && key.type != NL80211_KEYTYPE_GROUP) + return -EINVAL; if (!key.p.key || !key.p.key_len) return -EINVAL; if ((key.p.cipher != WLAN_CIPHER_SUITE_WEP40 || diff --git a/net/wireless/sme.c b/net/wireless/sme.c index f161b98..e17b0be 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -698,7 +698,7 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie, */ if (rdev->ops->del_key) for (i = 0; i < 6; i++) - rdev->ops->del_key(wdev->wiphy, dev, i, NULL); + rdev->ops->del_key(wdev->wiphy, dev, i, false, NULL); #ifdef CONFIG_CFG80211_WEXT memset(&wrqu, 0, sizeof(wrqu)); diff --git a/net/wireless/util.c b/net/wireless/util.c index fb5448f..76120ae 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -144,19 +144,25 @@ void ieee80211_set_bitrate_flags(struct wiphy *wiphy) int cfg80211_validate_key_settings(struct cfg80211_registered_device *rdev, struct key_params *params, int key_idx, - const u8 *mac_addr) + bool pairwise, const u8 *mac_addr) { int i; if (key_idx > 5) return -EINVAL; + if (!pairwise && mac_addr && !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + return -EINVAL; + + if (pairwise && !mac_addr) + return -EINVAL; + /* * Disallow pairwise keys with non-zero index unless it's WEP * (because current deployments use pairwise WEP keys with * non-zero indizes but 802.11i clearly specifies to use zero) */ - if (mac_addr && key_idx && + if (pairwise && key_idx && params->cipher != WLAN_CIPHER_SUITE_WEP40 && params->cipher != WLAN_CIPHER_SUITE_WEP104) return -EINVAL; @@ -677,7 +683,7 @@ void cfg80211_upload_connect_keys(struct wireless_dev *wdev) for (i = 0; i < 6; i++) { if (!wdev->connect_keys->params[i].cipher) continue; - if (rdev->ops->add_key(wdev->wiphy, dev, i, NULL, + if (rdev->ops->add_key(wdev->wiphy, dev, i, false, NULL, &wdev->connect_keys->params[i])) { printk(KERN_ERR "%s: failed to set key %d\n", dev->name, i); diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 7e5c3a4..6002265 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -432,14 +432,17 @@ int cfg80211_wext_giwretry(struct net_device *dev, EXPORT_SYMBOL_GPL(cfg80211_wext_giwretry); static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *addr, - bool remove, bool tx_key, int idx, - struct key_params *params) + struct net_device *dev, bool pairwise, + const u8 *addr, bool remove, bool tx_key, + int idx, struct key_params *params) { struct wireless_dev *wdev = dev->ieee80211_ptr; int err, i; bool rejoin = false; + if (pairwise && !addr) + return -EINVAL; + if (!wdev->wext.keys) { wdev->wext.keys = kzalloc(sizeof(*wdev->wext.keys), GFP_KERNEL); @@ -478,7 +481,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, __cfg80211_leave_ibss(rdev, wdev->netdev, true); rejoin = true; } - err = rdev->ops->del_key(&rdev->wiphy, dev, idx, addr); + + if (!pairwise && addr && + !(rdev->wiphy.flags & WIPHY_FLAG_IBSS_RSN)) + err = -ENOENT; + else + err = rdev->ops->del_key(&rdev->wiphy, dev, idx, + pairwise, addr); } wdev->wext.connect.privacy = false; /* @@ -507,12 +516,13 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, if (addr) tx_key = false; - if (cfg80211_validate_key_settings(rdev, params, idx, addr)) + if (cfg80211_validate_key_settings(rdev, params, idx, pairwise, addr)) return -EINVAL; err = 0; if (wdev->current_bss) - err = rdev->ops->add_key(&rdev->wiphy, dev, idx, addr, params); + err = rdev->ops->add_key(&rdev->wiphy, dev, idx, + pairwise, addr, params); if (err) return err; @@ -563,17 +573,17 @@ static int __cfg80211_set_encryption(struct cfg80211_registered_device *rdev, } static int cfg80211_set_encryption(struct cfg80211_registered_device *rdev, - struct net_device *dev, const u8 *addr, - bool remove, bool tx_key, int idx, - struct key_params *params) + struct net_device *dev, bool pairwise, + const u8 *addr, bool remove, bool tx_key, + int idx, struct key_params *params) { int err; /* devlist mutex needed for possible IBSS re-join */ mutex_lock(&rdev->devlist_mtx); wdev_lock(dev->ieee80211_ptr); - err = __cfg80211_set_encryption(rdev, dev, addr, remove, - tx_key, idx, params); + err = __cfg80211_set_encryption(rdev, dev, pairwise, addr, + remove, tx_key, idx, params); wdev_unlock(dev->ieee80211_ptr); mutex_unlock(&rdev->devlist_mtx); @@ -635,7 +645,7 @@ int cfg80211_wext_siwencode(struct net_device *dev, else if (!remove) return -EINVAL; - return cfg80211_set_encryption(rdev, dev, NULL, remove, + return cfg80211_set_encryption(rdev, dev, false, NULL, remove, wdev->wext.default_key == -1, idx, ¶ms); } @@ -725,7 +735,9 @@ int cfg80211_wext_siwencodeext(struct net_device *dev, } return cfg80211_set_encryption( - rdev, dev, addr, remove, + rdev, dev, + !(ext->ext_flags & IW_ENCODE_EXT_GROUP_KEY), + addr, remove, ext->ext_flags & IW_ENCODE_EXT_SET_TX_KEY, idx, ¶ms); } -- cgit v1.1 From d537f5fdfcc9c57c1e3b9fb316a4bed2bf7597e7 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Tue, 5 Oct 2010 21:34:11 +0200 Subject: nl80211: fix error in generic netif_running check Yikes! The error return keeps a netdev reference and the rdev mutex locked, fix that! Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8826888..9942f0b 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -4418,6 +4418,8 @@ static int nl80211_pre_doit(struct genl_ops *ops, struct sk_buff *skb, } if (ops->internal_flags & NL80211_FLAG_CHECK_NETDEV_UP && !netif_running(dev)) { + cfg80211_unlock_rdev(rdev); + dev_put(dev); if (rtnl) rtnl_unlock(); return -ENETDOWN; -- cgit v1.1 From 259b62e35bf44a97983f275de569929a7d2bd5dd Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 6 Oct 2010 11:22:08 +0200 Subject: mac80211: reduce number of __ieee80211_scan_completed calls Use goto instruction to call __ieee80211_scan_completed only ones in ieee80211_scan_work. This is prepare for the next patch. Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/scan.c | 51 +++++++++++++++++++++++++++++---------------------- 1 file changed, 29 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 5171a95..830c02b 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -651,27 +651,25 @@ void ieee80211_scan_work(struct work_struct *work) container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata = local->scan_sdata; unsigned long next_delay = 0; + bool aborted; - if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { - bool aborted; + mutex_lock(&local->mtx); + if (test_and_clear_bit(SCAN_COMPLETED, &local->scanning)) { aborted = test_and_clear_bit(SCAN_ABORTED, &local->scanning); - __ieee80211_scan_completed(&local->hw, aborted); - return; + goto out_complete; } - mutex_lock(&local->mtx); - if (!sdata || !local->scan_req) { - mutex_unlock(&local->mtx); - return; - } + if (!sdata || !local->scan_req) + goto out; if (local->hw_scan_req) { int rc = drv_hw_scan(local, sdata, local->hw_scan_req); - mutex_unlock(&local->mtx); - if (rc) - __ieee80211_scan_completed(&local->hw, true); - return; + if (rc) { + aborted = true; + goto out_complete; + } else + goto out; } if (local->scan_req && !local->scanning) { @@ -682,23 +680,23 @@ void ieee80211_scan_work(struct work_struct *work) local->scan_sdata = NULL; rc = __ieee80211_start_scan(sdata, req); - mutex_unlock(&local->mtx); - - if (rc) - __ieee80211_scan_completed(&local->hw, true); - return; + if (rc) { + aborted = true; + goto out_complete; + } else + goto out; } - mutex_unlock(&local->mtx); - /* * Avoid re-scheduling when the sdata is going away. */ if (!ieee80211_sdata_running(sdata)) { - __ieee80211_scan_completed(&local->hw, true); - return; + aborted = true; + goto out_complete; } + mutex_unlock(&local->mtx); + /* * as long as no delay is required advance immediately * without scheduling a new work @@ -725,6 +723,15 @@ void ieee80211_scan_work(struct work_struct *work) } while (next_delay == 0); ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay); + return; + +out_complete: + mutex_unlock(&local->mtx); + __ieee80211_scan_completed(&local->hw, aborted); + return; + +out: + mutex_unlock(&local->mtx); } int ieee80211_request_scan(struct ieee80211_sub_if_data *sdata, -- cgit v1.1 From e229f844d7223b7063bea1e649203ac521a58fe1 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 6 Oct 2010 11:22:09 +0200 Subject: mac80211: keep lock when calling __ieee80211_scan_completed() We are taking local->mtx inside __ieee80211_scan_completed(), but just before call to that function we drop the lock. Dropping/taking lock is not good, because can lead to hard to understand race conditions. Patch split scan_completed() code into two functions, first must be called with local->mtx taken and second without it. Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/scan.c | 75 ++++++++++++++++++++++++++++------------------------- 1 file changed, 39 insertions(+), 36 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 830c02b..6964a45 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -249,12 +249,12 @@ static bool ieee80211_prep_hw_scan(struct ieee80211_local *local) return true; } -static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) +static bool __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, + bool was_hw_scan) { struct ieee80211_local *local = hw_to_local(hw); - bool was_hw_scan; - mutex_lock(&local->mtx); + lockdep_assert_held(&local->mtx); /* * It's ok to abort a not-yet-running scan (that @@ -265,17 +265,13 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) if (WARN_ON(!local->scanning && !aborted)) aborted = true; - if (WARN_ON(!local->scan_req)) { - mutex_unlock(&local->mtx); - return; - } + if (WARN_ON(!local->scan_req)) + return false; - was_hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0); - mutex_unlock(&local->mtx); - return; + return false; } kfree(local->hw_scan_req); @@ -289,23 +285,25 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted) local->scanning = 0; local->scan_channel = NULL; - /* we only have to protect scan_req and hw/sw scan */ - mutex_unlock(&local->mtx); - - ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); - if (was_hw_scan) - goto done; - - ieee80211_configure_filter(local); + return true; +} - drv_sw_scan_complete(local); +static void __ieee80211_scan_completed_finish(struct ieee80211_hw *hw, + bool was_hw_scan) +{ + struct ieee80211_local *local = hw_to_local(hw); - ieee80211_offchannel_return(local, true); + ieee80211_hw_config(local, IEEE80211_CONF_CHANGE_CHANNEL); + if (!was_hw_scan) { + ieee80211_configure_filter(local); + drv_sw_scan_complete(local); + ieee80211_offchannel_return(local, true); + } - done: mutex_lock(&local->mtx); ieee80211_recalc_idle(local); mutex_unlock(&local->mtx); + ieee80211_mlme_notify_scan_completed(local); ieee80211_ibss_notify_scan_completed(local); ieee80211_mesh_notify_scan_completed(local); @@ -366,6 +364,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; int rc; + lockdep_assert_held(&local->mtx); + if (local->scan_req) return -EBUSY; @@ -447,8 +447,8 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan) return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME; } -static int ieee80211_scan_state_decision(struct ieee80211_local *local, - unsigned long *next_delay) +static void ieee80211_scan_state_decision(struct ieee80211_local *local, + unsigned long *next_delay) { bool associated = false; bool tx_empty = true; @@ -458,12 +458,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata; struct ieee80211_channel *next_chan; - /* if no more bands/channels left, complete scan and advance to the idle state */ - if (local->scan_channel_idx >= local->scan_req->n_channels) { - __ieee80211_scan_completed(&local->hw, false); - return 1; - } - /* * check if at least one STA interface is associated, * check if at least one STA interface has pending tx frames @@ -535,7 +529,6 @@ static int ieee80211_scan_state_decision(struct ieee80211_local *local, } *next_delay = 0; - return 0; } static void ieee80211_scan_state_leave_oper_channel(struct ieee80211_local *local, @@ -651,7 +644,7 @@ void ieee80211_scan_work(struct work_struct *work) container_of(work, struct ieee80211_local, scan_work.work); struct ieee80211_sub_if_data *sdata = local->scan_sdata; unsigned long next_delay = 0; - bool aborted; + bool aborted, hw_scan, finish; mutex_lock(&local->mtx); @@ -704,8 +697,12 @@ void ieee80211_scan_work(struct work_struct *work) do { switch (local->next_scan_state) { case SCAN_DECISION: - if (ieee80211_scan_state_decision(local, &next_delay)) - return; + /* if no more bands/channels left, complete scan */ + if (local->scan_channel_idx >= local->scan_req->n_channels) { + aborted = false; + goto out_complete; + } + ieee80211_scan_state_decision(local, &next_delay); break; case SCAN_SET_CHANNEL: ieee80211_scan_state_set_channel(local, &next_delay); @@ -726,8 +723,11 @@ void ieee80211_scan_work(struct work_struct *work) return; out_complete: + hw_scan = test_bit(SCAN_HW_SCANNING, &local->scanning); + finish = __ieee80211_scan_completed(&local->hw, aborted, hw_scan); mutex_unlock(&local->mtx); - __ieee80211_scan_completed(&local->hw, aborted); + if (finish) + __ieee80211_scan_completed_finish(&local->hw, hw_scan); return; out: @@ -796,6 +796,7 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, void ieee80211_scan_cancel(struct ieee80211_local *local) { bool abortscan; + bool finish = false; cancel_delayed_work_sync(&local->scan_work); @@ -806,8 +807,10 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) mutex_lock(&local->mtx); abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) || (!local->scanning && local->scan_req); + if (abortscan) + finish = __ieee80211_scan_completed(&local->hw, true, false); mutex_unlock(&local->mtx); - if (abortscan) - __ieee80211_scan_completed(&local->hw, true); + if (finish) + __ieee80211_scan_completed_finish(&local->hw, false); } -- cgit v1.1 From 4136c4224ccf1907d309e1cdfaefef9da97dbc5e Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 6 Oct 2010 11:22:10 +0200 Subject: mac80211: assure we also cancel deferred scan request This is partial revert and fix for commit 85f72bc839705294b32b6c16b491c0422f0a71b3 "mac80211: only cancel software-based scans on suspend" When cfg80211 request the scan and mac80211 perform some management work, we defer the scan request. We do not canceling such requests when calling ieee80211_scan_cancel(), because of SCAN_SW_SCANNING bit check just before the call. So fix that problem. Another problem, which commit 85f72bc839705294b32b6c16b491c0422f0a71b3 tries to solve, is we can not cancel HW scan. Hence patch make ieee80211_scan_cancel() ignore HW scan (see code comments). Keeping local->mtx lock assures that the deferred scan will not become "working" HW scan. Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/main.c | 3 +-- net/mac80211/pm.c | 3 +-- net/mac80211/scan.c | 35 +++++++++++++++++++++++++---------- 3 files changed, 27 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/mac80211/main.c b/net/mac80211/main.c index e371709..915ecf8 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -307,8 +307,7 @@ static void ieee80211_restart_work(struct work_struct *work) mutex_unlock(&local->mtx); rtnl_lock(); - if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) - ieee80211_scan_cancel(local); + ieee80211_scan_cancel(local); ieee80211_reconfig(local); rtnl_unlock(); } diff --git a/net/mac80211/pm.c b/net/mac80211/pm.c index e3e2bce3..e373551 100644 --- a/net/mac80211/pm.c +++ b/net/mac80211/pm.c @@ -12,8 +12,7 @@ int __ieee80211_suspend(struct ieee80211_hw *hw) struct ieee80211_sub_if_data *sdata; struct sta_info *sta; - if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning))) - ieee80211_scan_cancel(local); + ieee80211_scan_cancel(local); ieee80211_stop_queues_by_reason(hw, IEEE80211_QUEUE_STOP_REASON_SUSPEND); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 6964a45..4dbef71 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -793,24 +793,39 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, return ret; } +/* + * Only call this function when a scan can't be queued -- under RTNL. + */ void ieee80211_scan_cancel(struct ieee80211_local *local) { - bool abortscan; - bool finish = false; - - cancel_delayed_work_sync(&local->scan_work); + bool abortscan, finish; /* - * Only call this function when a scan can't be - * queued -- mostly at suspend under RTNL. + * We are only canceling software scan, or deferred scan that was not + * yet really started (see __ieee80211_start_scan ). + * + * Regarding hardware scan: + * - we can not call __ieee80211_scan_completed() as when + * SCAN_HW_SCANNING bit is set this function change + * local->hw_scan_req to operate on 5G band, what race with + * driver which can use local->hw_scan_req + * + * - we can not cancel scan_work since driver can schedule it + * by ieee80211_scan_completed(..., true) to finish scan + * + * Hence low lever driver is responsible for canceling HW scan. */ + mutex_lock(&local->mtx); - abortscan = test_bit(SCAN_SW_SCANNING, &local->scanning) || - (!local->scanning && local->scan_req); + abortscan = local->scan_req && !test_bit(SCAN_HW_SCANNING, &local->scanning); if (abortscan) finish = __ieee80211_scan_completed(&local->hw, true, false); mutex_unlock(&local->mtx); - if (finish) - __ieee80211_scan_completed_finish(&local->hw, false); + if (abortscan) { + /* The scan is canceled, but stop work from being pending */ + cancel_delayed_work_sync(&local->scan_work); + if (finish) + __ieee80211_scan_completed_finish(&local->hw, false); + } } -- cgit v1.1 From 6eb11a9a311a0f7e5b9b66c18f7498a26c9ec206 Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 6 Oct 2010 11:22:11 +0200 Subject: mac80211: do not requeue scan work when not needed When performing hw scan and not abort it, __ieee80211_scan_completed() is currently called from scan work, so does not need to reschedule work to call drv_hw_scan(). Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/scan.c | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 4dbef71..9aab921 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -269,9 +269,9 @@ static bool __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted, return false; if (was_hw_scan && !aborted && ieee80211_prep_hw_scan(local)) { - ieee80211_queue_delayed_work(&local->hw, - &local->scan_work, 0); - return false; + int rc = drv_hw_scan(local, local->scan_sdata, local->hw_scan_req); + if (rc == 0) + return false; } kfree(local->hw_scan_req); @@ -656,15 +656,6 @@ void ieee80211_scan_work(struct work_struct *work) if (!sdata || !local->scan_req) goto out; - if (local->hw_scan_req) { - int rc = drv_hw_scan(local, sdata, local->hw_scan_req); - if (rc) { - aborted = true; - goto out_complete; - } else - goto out; - } - if (local->scan_req && !local->scanning) { struct cfg80211_scan_request *req = local->scan_req; int rc; -- cgit v1.1 From 3aed49ef17c7bc8397420529ac976fe058818e3d Mon Sep 17 00:00:00 2001 From: Stanislaw Gruszka Date: Wed, 6 Oct 2010 11:22:12 +0200 Subject: mac80211: compete scan to cfg80211 if deferred scan fail to start We nulify local->scan_req on failure in __ieee80211_start_scan, so __ieee80211_scan_completed will not call cfg80211_scan_done. Fix that. Signed-off-by: Stanislaw Gruszka Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/scan.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 9aab921..80e017d 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -665,6 +665,8 @@ void ieee80211_scan_work(struct work_struct *work) rc = __ieee80211_start_scan(sdata, req); if (rc) { + /* need to complete scan in cfg80211 */ + local->scan_req = req; aborted = true; goto out_complete; } else -- cgit v1.1 From b206b4ef062d83c0875a085672ed50e8c8b01521 Mon Sep 17 00:00:00 2001 From: Bruno Randolf Date: Wed, 6 Oct 2010 18:34:12 +0900 Subject: nl80211/mac80211: Add retry and failed transmission count to station info This information is already available in mac80211, we just need to export it via cfg80211 and nl80211. Signed-off-by: Bruno Randolf Acked-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 4 ++++ net/wireless/nl80211.c | 6 ++++++ 2 files changed, 10 insertions(+) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 8b0e874..2e5a3fb 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -327,6 +327,8 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) STATION_INFO_TX_BYTES | STATION_INFO_RX_PACKETS | STATION_INFO_TX_PACKETS | + STATION_INFO_TX_RETRIES | + STATION_INFO_TX_FAILED | STATION_INFO_TX_BITRATE; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); @@ -334,6 +336,8 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->tx_bytes = sta->tx_bytes; sinfo->rx_packets = sta->rx_packets; sinfo->tx_packets = sta->tx_packets; + sinfo->tx_retries = sta->tx_retry_count; + sinfo->tx_failed = sta->tx_retry_failed; if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 9942f0b..524f554 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -1890,6 +1890,12 @@ static int nl80211_send_station(struct sk_buff *msg, u32 pid, u32 seq, if (sinfo->filled & STATION_INFO_TX_PACKETS) NLA_PUT_U32(msg, NL80211_STA_INFO_TX_PACKETS, sinfo->tx_packets); + if (sinfo->filled & STATION_INFO_TX_RETRIES) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_RETRIES, + sinfo->tx_retries); + if (sinfo->filled & STATION_INFO_TX_FAILED) + NLA_PUT_U32(msg, NL80211_STA_INFO_TX_FAILED, + sinfo->tx_failed); nla_nest_end(msg, sinfoattr); return genlmsg_end(msg, hdr); -- cgit v1.1 From 3207390a8b58bfc1335750f91cf6783c48ca19ca Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 6 Oct 2010 21:18:04 +0200 Subject: cfg80211: fix BSS double-unlinking When multiple interfaces are actively trying to associate with the same BSS, they may both find that the BSS isn't there and then try to unlink it. This can cause errors since the unlinking code can't currently deal with items that have already been unlinked. Normally this doesn't happen as most people don't try to use multiple station interfaces that associate at the same time too. Fix this by using the list entry as a flag to see if the item is still on a list. Cc: stable@kernel.org Reported-by: Ben Greear Tested-by: Hun-Kyi Wynn Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/scan.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 5ca8c71..503ebb8 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -650,14 +650,14 @@ void cfg80211_unlink_bss(struct wiphy *wiphy, struct cfg80211_bss *pub) bss = container_of(pub, struct cfg80211_internal_bss, pub); spin_lock_bh(&dev->bss_lock); + if (!list_empty(&bss->list)) { + list_del_init(&bss->list); + dev->bss_generation++; + rb_erase(&bss->rbn, &dev->bss_tree); - list_del(&bss->list); - dev->bss_generation++; - rb_erase(&bss->rbn, &dev->bss_tree); - + kref_put(&bss->ref, bss_release); + } spin_unlock_bh(&dev->bss_lock); - - kref_put(&bss->ref, bss_release); } EXPORT_SYMBOL(cfg80211_unlink_bss); -- cgit v1.1 From 494486f8fd0eec956c5df823581df5dcf5409a6f Mon Sep 17 00:00:00 2001 From: "John W. Linville" Date: Wed, 6 Oct 2010 16:40:40 -0400 Subject: mac80211: avoid uninitialized var warning in ieee80211_scan_cancel MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit net/mac80211/scan.c: In function ‘ieee80211_scan_cancel’: net/mac80211/scan.c:794: warning: ‘finish’ may be used uninitialized in this function Signed-off-by: John W. Linville --- net/mac80211/scan.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 80e017d..523db93 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -791,7 +791,8 @@ int ieee80211_request_internal_scan(struct ieee80211_sub_if_data *sdata, */ void ieee80211_scan_cancel(struct ieee80211_local *local) { - bool abortscan, finish; + bool abortscan; + bool finish = false; /* * We are only canceling software scan, or deferred scan that was not @@ -818,7 +819,7 @@ void ieee80211_scan_cancel(struct ieee80211_local *local) if (abortscan) { /* The scan is canceled, but stop work from being pending */ cancel_delayed_work_sync(&local->scan_work); - if (finish) - __ieee80211_scan_completed_finish(&local->hw, false); } + if (finish) + __ieee80211_scan_completed_finish(&local->hw, false); } -- cgit v1.1 From 767e97e1e0db0d0f3152cd2f3bd3403596aedbad Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 6 Oct 2010 17:49:21 -0700 Subject: neigh: RCU conversion of struct neighbour This is the second step for neighbour RCU conversion. (first was commit d6bf7817 : RCU conversion of neigh hash table) neigh_lookup() becomes lockless, but still take a reference on found neighbour. (no more read_lock()/read_unlock() on tbl->lock) struct neighbour gets an additional rcu_head field and is freed after an RCU grace period. Future work would need to eventually not take a reference on neighbour for temporary dst (DST_NOCACHE), but this would need dst->_neighbour to use a noref bit like we did for skb->_dst. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 137 ++++++++++++++++++++++++++++++++------------------- 1 file changed, 85 insertions(+), 52 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index dd8920e..3ffafaa0 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -139,10 +139,12 @@ static int neigh_forced_gc(struct neigh_table *tbl) nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (i = 0; i <= nht->hash_mask; i++) { - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { /* Neighbour record may be discarded if: * - nobody refers to it. * - it is not permanent @@ -150,7 +152,9 @@ static int neigh_forced_gc(struct neigh_table *tbl) write_lock(&n->lock); if (atomic_read(&n->refcnt) == 1 && !(n->nud_state & NUD_PERMANENT)) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; shrunk = 1; write_unlock(&n->lock); @@ -208,14 +212,18 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev) lockdep_is_held(&tbl->lock)); for (i = 0; i <= nht->hash_mask; i++) { - struct neighbour *n, **np = &nht->hash_buckets[i]; + struct neighbour *n; + struct neighbour __rcu **np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { if (dev && n->dev != dev) { np = &n->next; continue; } - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); write_lock(&n->lock); neigh_del_timer(n); n->dead = 1; @@ -323,7 +331,7 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int entries) kfree(ret); return NULL; } - ret->hash_buckets = buckets; + rcu_assign_pointer(ret->hash_buckets, buckets); ret->hash_mask = entries - 1; get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd)); return ret; @@ -362,17 +370,22 @@ static struct neigh_hash_table *neigh_hash_grow(struct neigh_table *tbl, for (i = 0; i <= old_nht->hash_mask; i++) { struct neighbour *n, *next; - for (n = old_nht->hash_buckets[i]; + for (n = rcu_dereference_protected(old_nht->hash_buckets[i], + lockdep_is_held(&tbl->lock)); n != NULL; n = next) { hash = tbl->hash(n->primary_key, n->dev, new_nht->hash_rnd); hash &= new_nht->hash_mask; - next = n->next; - - n->next = new_nht->hash_buckets[hash]; - new_nht->hash_buckets[hash] = n; + next = rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock)); + + rcu_assign_pointer(n->next, + rcu_dereference_protected( + new_nht->hash_buckets[hash], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(new_nht->hash_buckets[hash], n); } } @@ -394,15 +407,18 @@ struct neighbour *neigh_lookup(struct neigh_table *tbl, const void *pkey, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, dev, nht->hash_rnd) & nht->hash_mask; - read_lock(&tbl->lock); - for (n = nht->hash_buckets[hash_val]; n; n = n->next) { + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (dev == n->dev && !memcmp(n->primary_key, pkey, key_len)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } @@ -421,16 +437,19 @@ struct neighbour *neigh_lookup_nodev(struct neigh_table *tbl, struct net *net, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); hash_val = tbl->hash(pkey, NULL, nht->hash_rnd) & nht->hash_mask; - read_lock(&tbl->lock); - for (n = nht->hash_buckets[hash_val]; n; n = n->next) { + + for (n = rcu_dereference_bh(nht->hash_buckets[hash_val]); + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!memcmp(n->primary_key, pkey, key_len) && net_eq(dev_net(n->dev), net)) { - neigh_hold(n); + if (!atomic_inc_not_zero(&n->refcnt)) + n = NULL; NEIGH_CACHE_STAT_INC(tbl, hits); break; } } - read_unlock(&tbl->lock); + rcu_read_unlock_bh(); return n; } @@ -483,7 +502,11 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, goto out_tbl_unlock; } - for (n1 = nht->hash_buckets[hash_val]; n1; n1 = n1->next) { + for (n1 = rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock)); + n1 != NULL; + n1 = rcu_dereference_protected(n1->next, + lockdep_is_held(&tbl->lock))) { if (dev == n1->dev && !memcmp(n1->primary_key, pkey, key_len)) { neigh_hold(n1); rc = n1; @@ -491,10 +514,12 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey, } } - n->next = nht->hash_buckets[hash_val]; - nht->hash_buckets[hash_val] = n; n->dead = 0; neigh_hold(n); + rcu_assign_pointer(n->next, + rcu_dereference_protected(nht->hash_buckets[hash_val], + lockdep_is_held(&tbl->lock))); + rcu_assign_pointer(nht->hash_buckets[hash_val], n); write_unlock_bh(&tbl->lock); NEIGH_PRINTK2("neigh %p is created.\n", n); rc = n; @@ -651,6 +676,12 @@ static inline void neigh_parms_put(struct neigh_parms *parms) neigh_parms_destroy(parms); } +static void neigh_destroy_rcu(struct rcu_head *head) +{ + struct neighbour *neigh = container_of(head, struct neighbour, rcu); + + kmem_cache_free(neigh->tbl->kmem_cachep, neigh); +} /* * neighbour must already be out of the table; * @@ -690,7 +721,7 @@ void neigh_destroy(struct neighbour *neigh) NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh); atomic_dec(&neigh->tbl->entries); - kmem_cache_free(neigh->tbl->kmem_cachep, neigh); + call_rcu(&neigh->rcu, neigh_destroy_rcu); } EXPORT_SYMBOL(neigh_destroy); @@ -731,7 +762,8 @@ static void neigh_connect(struct neighbour *neigh) static void neigh_periodic_work(struct work_struct *work) { struct neigh_table *tbl = container_of(work, struct neigh_table, gc_work.work); - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; unsigned int i; struct neigh_hash_table *nht; @@ -756,7 +788,8 @@ static void neigh_periodic_work(struct work_struct *work) for (i = 0 ; i <= nht->hash_mask; i++) { np = &nht->hash_buckets[i]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { unsigned int state; write_lock(&n->lock); @@ -1213,8 +1246,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, } /* This function can be used in contexts, where only old dev_queue_xmit - worked, f.e. if you want to override normal output path (eql, shaper), - but resolution is not made yet. + * worked, f.e. if you want to override normal output path (eql, shaper), + * but resolution is not made yet. */ int neigh_compat_output(struct sk_buff *skb) @@ -2123,7 +2156,7 @@ static void neigh_update_notify(struct neighbour *neigh) static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, struct netlink_callback *cb) { - struct net * net = sock_net(skb->sk); + struct net *net = sock_net(skb->sk); struct neighbour *n; int rc, h, s_h = cb->args[1]; int idx, s_idx = idx = cb->args[2]; @@ -2132,13 +2165,14 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); for (h = 0; h <= nht->hash_mask; h++) { if (h < s_h) continue; if (h > s_h) s_idx = 0; - for (n = nht->hash_buckets[h], idx = 0; n; n = n->next) { + for (n = rcu_dereference_bh(nht->hash_buckets[h]), idx = 0; + n != NULL; + n = rcu_dereference_bh(n->next)) { if (!net_eq(dev_net(n->dev), net)) continue; if (idx < s_idx) @@ -2150,13 +2184,12 @@ static int neigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb, rc = -1; goto out; } - next: +next: idx++; } } rc = skb->len; out: - read_unlock(&tbl->lock); rcu_read_unlock_bh(); cb->args[1] = h; cb->args[2] = idx; @@ -2195,11 +2228,13 @@ void neigh_for_each(struct neigh_table *tbl, void (*cb)(struct neighbour *, void rcu_read_lock_bh(); nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + read_lock(&tbl->lock); /* avoid resizes */ for (chain = 0; chain <= nht->hash_mask; chain++) { struct neighbour *n; - for (n = nht->hash_buckets[chain]; n; n = n->next) + for (n = rcu_dereference_bh(nht->hash_buckets[chain]); + n != NULL; + n = rcu_dereference_bh(n->next)) cb(n, cookie); } read_unlock(&tbl->lock); @@ -2217,16 +2252,20 @@ void __neigh_for_each_release(struct neigh_table *tbl, nht = rcu_dereference_protected(tbl->nht, lockdep_is_held(&tbl->lock)); for (chain = 0; chain <= nht->hash_mask; chain++) { - struct neighbour *n, **np; + struct neighbour *n; + struct neighbour __rcu **np; np = &nht->hash_buckets[chain]; - while ((n = *np) != NULL) { + while ((n = rcu_dereference_protected(*np, + lockdep_is_held(&tbl->lock))) != NULL) { int release; write_lock(&n->lock); release = cb(n); if (release) { - *np = n->next; + rcu_assign_pointer(*np, + rcu_dereference_protected(n->next, + lockdep_is_held(&tbl->lock))); n->dead = 1; } else np = &n->next; @@ -2250,7 +2289,7 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) state->flags &= ~NEIGH_SEQ_IS_PNEIGH; for (bucket = 0; bucket <= nht->hash_mask; bucket++) { - n = nht->hash_buckets[bucket]; + n = rcu_dereference_bh(nht->hash_buckets[bucket]); while (n) { if (!net_eq(dev_net(n->dev), net)) @@ -2267,8 +2306,8 @@ static struct neighbour *neigh_get_first(struct seq_file *seq) break; if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2292,7 +2331,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (v) return n; } - n = n->next; + n = rcu_dereference_bh(n->next); while (1) { while (n) { @@ -2309,8 +2348,8 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (n->nud_state & ~NUD_NOARP) break; - next: - n = n->next; +next: + n = rcu_dereference_bh(n->next); } if (n) @@ -2319,7 +2358,7 @@ static struct neighbour *neigh_get_next(struct seq_file *seq, if (++state->bucket > nht->hash_mask) break; - n = nht->hash_buckets[state->bucket]; + n = rcu_dereference_bh(nht->hash_buckets[state->bucket]); } if (n && pos) @@ -2417,7 +2456,6 @@ static void *neigh_get_idx_any(struct seq_file *seq, loff_t *pos) } void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl, unsigned int neigh_seq_flags) - __acquires(tbl->lock) __acquires(rcu_bh) { struct neigh_seq_state *state = seq->private; @@ -2428,7 +2466,7 @@ void *neigh_seq_start(struct seq_file *seq, loff_t *pos, struct neigh_table *tbl rcu_read_lock_bh(); state->nht = rcu_dereference_bh(tbl->nht); - read_lock(&tbl->lock); + return *pos ? neigh_get_idx_any(seq, pos) : SEQ_START_TOKEN; } EXPORT_SYMBOL(neigh_seq_start); @@ -2461,13 +2499,8 @@ out: EXPORT_SYMBOL(neigh_seq_next); void neigh_seq_stop(struct seq_file *seq, void *v) - __releases(tbl->lock) __releases(rcu_bh) { - struct neigh_seq_state *state = seq->private; - struct neigh_table *tbl = state->tbl; - - read_unlock(&tbl->lock); rcu_read_unlock_bh(); } EXPORT_SYMBOL(neigh_seq_stop); -- cgit v1.1 From 9c6d5e5537e3997c47b65925a235392c1968fb51 Mon Sep 17 00:00:00 2001 From: John Heffner Date: Wed, 6 Oct 2010 21:18:02 -0700 Subject: TCP: Fix setting of snd_ssthresh in tcp_mtu_probe_success This looks like a simple typo that has gone unnoticed for some time. The impact is relatively low but it's clearly wrong. Signed-off-by: John Heffner Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f6fdd72..e4fbdae 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2874,7 +2874,7 @@ static void tcp_mtup_probe_success(struct sock *sk) icsk->icsk_mtup.probe_size; tp->snd_cwnd_cnt = 0; tp->snd_cwnd_stamp = tcp_time_stamp; - tp->rcv_ssthresh = tcp_current_ssthresh(sk); + tp->snd_ssthresh = tcp_current_ssthresh(sk); icsk->icsk_mtup.search_low = icsk->icsk_mtup.probe_size; icsk->icsk_mtup.probe_size = 0; -- cgit v1.1 From 1f4f0f645cc1d7f1187fcdb0ac22c2e69bd68050 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 5 Oct 2010 04:24:09 +0000 Subject: dccp: Kill dead code and add static markers. Remove dead code and make some functions static. Compile tested only. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/dccp/dccp.h | 2 -- net/dccp/feat.c | 10 ---------- net/dccp/feat.h | 1 - net/dccp/options.c | 4 +--- net/dccp/proto.c | 48 ++++++++++++++++++++++++------------------------ 5 files changed, 25 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 3ccef1b..019d6ff 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -246,7 +246,6 @@ static inline void dccp_clear_xmit_timers(struct sock *sk) extern unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu); extern const char *dccp_packet_name(const int type); -extern const char *dccp_state_name(const int state); extern void dccp_set_state(struct sock *sk, const int state); extern void dccp_done(struct sock *sk); @@ -449,7 +448,6 @@ extern int dccp_insert_options_rsk(struct dccp_request_sock*, struct sk_buff*); extern int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed); extern u32 dccp_timestamp(void); extern void dccp_timestamping_init(void); -extern int dccp_insert_option_timestamp(struct sk_buff *skb); extern int dccp_insert_option(struct sk_buff *skb, unsigned char option, const void *value, unsigned char len); diff --git a/net/dccp/feat.c b/net/dccp/feat.c index df7dd26..568def9 100644 --- a/net/dccp/feat.c +++ b/net/dccp/feat.c @@ -730,16 +730,6 @@ int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, 0, list, len); } -/* Analogous to dccp_feat_register_sp(), but for non-negotiable values */ -int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val) -{ - /* any changes must be registered before establishing the connection */ - if (sk->sk_state != DCCP_CLOSED) - return -EISCONN; - if (dccp_feat_type(feat) != FEAT_NN) - return -EINVAL; - return __feat_register_nn(&dccp_sk(sk)->dccps_featneg, feat, 0, val); -} /* * Tracking features whose value depend on the choice of CCID diff --git a/net/dccp/feat.h b/net/dccp/feat.h index f967216..e56a4e5 100644 --- a/net/dccp/feat.h +++ b/net/dccp/feat.h @@ -111,7 +111,6 @@ extern int dccp_feat_init(struct sock *sk); extern void dccp_feat_initialise_sysctls(void); extern int dccp_feat_register_sp(struct sock *sk, u8 feat, u8 is_local, u8 const *list, u8 len); -extern int dccp_feat_register_nn(struct sock *sk, u8 feat, u64 val); extern int dccp_feat_parse_options(struct sock *, struct dccp_request_sock *, u8 mand, u8 opt, u8 feat, u8 *val, u8 len); extern int dccp_feat_clone_list(struct list_head const *, struct list_head *); diff --git a/net/dccp/options.c b/net/dccp/options.c index 9271851..d4b1ae0 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -369,7 +369,7 @@ int dccp_insert_option_elapsed_time(struct sk_buff *skb, u32 elapsed_time) EXPORT_SYMBOL_GPL(dccp_insert_option_elapsed_time); -int dccp_insert_option_timestamp(struct sk_buff *skb) +static int dccp_insert_option_timestamp(struct sk_buff *skb) { __be32 now = htonl(dccp_timestamp()); /* yes this will overflow but that is the point as we want a @@ -378,8 +378,6 @@ int dccp_insert_option_timestamp(struct sk_buff *skb) return dccp_insert_option(skb, DCCPO_TIMESTAMP, &now, sizeof(now)); } -EXPORT_SYMBOL_GPL(dccp_insert_option_timestamp); - static int dccp_insert_option_timestamp_echo(struct dccp_sock *dp, struct dccp_request_sock *dreq, struct sk_buff *skb) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 096250d..b054ba1 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -50,6 +50,30 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); /* the maximum queue length for tx in packets. 0 is no limit */ int sysctl_dccp_tx_qlen __read_mostly = 5; +#ifdef CONFIG_IP_DCCP_DEBUG +static const char *dccp_state_name(const int state) +{ + static const char *const dccp_state_names[] = { + [DCCP_OPEN] = "OPEN", + [DCCP_REQUESTING] = "REQUESTING", + [DCCP_PARTOPEN] = "PARTOPEN", + [DCCP_LISTEN] = "LISTEN", + [DCCP_RESPOND] = "RESPOND", + [DCCP_CLOSING] = "CLOSING", + [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", + [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", + [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", + [DCCP_TIME_WAIT] = "TIME_WAIT", + [DCCP_CLOSED] = "CLOSED", + }; + + if (state >= DCCP_MAX_STATES) + return "INVALID STATE!"; + else + return dccp_state_names[state]; +} +#endif + void dccp_set_state(struct sock *sk, const int state) { const int oldstate = sk->sk_state; @@ -146,30 +170,6 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); -const char *dccp_state_name(const int state) -{ - static const char *const dccp_state_names[] = { - [DCCP_OPEN] = "OPEN", - [DCCP_REQUESTING] = "REQUESTING", - [DCCP_PARTOPEN] = "PARTOPEN", - [DCCP_LISTEN] = "LISTEN", - [DCCP_RESPOND] = "RESPOND", - [DCCP_CLOSING] = "CLOSING", - [DCCP_ACTIVE_CLOSEREQ] = "CLOSEREQ", - [DCCP_PASSIVE_CLOSE] = "PASSIVE_CLOSE", - [DCCP_PASSIVE_CLOSEREQ] = "PASSIVE_CLOSEREQ", - [DCCP_TIME_WAIT] = "TIME_WAIT", - [DCCP_CLOSED] = "CLOSED", - }; - - if (state >= DCCP_MAX_STATES) - return "INVALID STATE!"; - else - return dccp_state_names[state]; -} - -EXPORT_SYMBOL_GPL(dccp_state_name); - int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); -- cgit v1.1 From 3d3211ef5cf7558d289d1a1efbef8c45595639aa Mon Sep 17 00:00:00 2001 From: John Fastabend Date: Wed, 6 Oct 2010 23:35:15 -0700 Subject: net: netif_set_real_num_rx_queues may cap num_rx_queues at init time Do not set num_rx_queues in netif_set_real_num_rx_queues() some drivers will increase the real_num_rx_queues later due to a feature changes or available interrupts increasing. By setting num_rx_queues here this ends up creating a cap on the number of rx queues available. For example the ixgbe driver sets the max number of queues it intends to use ever then sets the current number in use with the netif_set_num_{rx|tx}_queues calls. With the current implementation the number of rx queues gets limited so when a feature such as DCB or FCoE is enabled the queues are no longer available. kobjects will only be allocated for real_num_rx_queues so the waste in memory is minimal. Signed-off-by: John Fastabend Signed-off-by: Jeff Kirsher Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 7d14955..fd1b75a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1593,8 +1593,6 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) rxq); if (rc) return rc; - } else { - dev->num_rx_queues = rxq; } dev->real_num_rx_queues = rxq; -- cgit v1.1 From 7b99a7c2dab7efe7c265b66fedbf3444958ebfe3 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 7 Oct 2010 12:55:24 +0200 Subject: mac80211: fix sw scan locking The recent scan overhaul broke locking because now we can jump to code that attempts to unlock, while we don't have the mutex held. Fix this by holding the mutex around all the relevant code. Reported-by: Ben Greear Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/scan.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 523db93..fb274db 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -681,8 +681,6 @@ void ieee80211_scan_work(struct work_struct *work) goto out_complete; } - mutex_unlock(&local->mtx); - /* * as long as no delay is required advance immediately * without scheduling a new work @@ -713,6 +711,7 @@ void ieee80211_scan_work(struct work_struct *work) } while (next_delay == 0); ieee80211_queue_delayed_work(&local->hw, &local->scan_work, next_delay); + mutex_unlock(&local->mtx); return; out_complete: -- cgit v1.1 From 43b19952de54b0fccfcdc5968891ebe550367fe8 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 7 Oct 2010 13:10:30 +0200 Subject: nl80211: use new genl helpers for WDS Bill Jordan's patch to allow setting the WDS peer crossed with my patch removing all the boilerplate code in nl80211, and consequently he didn't make use of it yet. Fix that. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 45 ++++++++++++--------------------------------- 1 file changed, 12 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 524f554..4fed166 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -876,48 +876,25 @@ static int nl80211_set_channel(struct sk_buff *skb, struct genl_info *info) static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) { - struct cfg80211_registered_device *rdev; - struct wireless_dev *wdev; - struct net_device *dev; + struct cfg80211_registered_device *rdev = info->user_ptr[0]; + struct net_device *dev = info->user_ptr[1]; + struct wireless_dev *wdev = dev->ieee80211_ptr; u8 *bssid; - int err; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; - rtnl_lock(); - - err = get_rdev_dev_by_info_ifindex(info, &rdev, &dev); - if (err) - goto unlock_rtnl; - - wdev = dev->ieee80211_ptr; - - if (netif_running(dev)) { - err = -EBUSY; - goto out; - } + if (netif_running(dev)) + return -EBUSY; - if (!rdev->ops->set_wds_peer) { - err = -EOPNOTSUPP; - goto out; - } + if (!rdev->ops->set_wds_peer) + return -EOPNOTSUPP; - if (wdev->iftype != NL80211_IFTYPE_WDS) { - err = -EOPNOTSUPP; - goto out; - } + if (wdev->iftype != NL80211_IFTYPE_WDS) + return -EOPNOTSUPP; bssid = nla_data(info->attrs[NL80211_ATTR_MAC]); - err = rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid); - -out: - cfg80211_unlock_rdev(rdev); - dev_put(dev); -unlock_rtnl: - rtnl_unlock(); - - return err; + return rdev->ops->set_wds_peer(wdev->wiphy, dev, bssid); } @@ -4860,6 +4837,8 @@ static struct genl_ops nl80211_ops[] = { .doit = nl80211_set_wds_peer, .policy = nl80211_policy, .flags = GENL_ADMIN_PERM, + .internal_flags = NL80211_FLAG_NEED_NETDEV | + NL80211_FLAG_NEED_RTNL, }, }; -- cgit v1.1 From 388ac775be95e510c2095ed6cd59422a5183a9fb Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 7 Oct 2010 13:11:09 +0200 Subject: cfg80211: constify WDS address There's no need for the WDS peer address to not be const, so make it const. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 2 +- net/wireless/nl80211.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 2e5a3fb..ecf9b71 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1363,7 +1363,7 @@ static int ieee80211_get_tx_power(struct wiphy *wiphy, int *dbm) } static int ieee80211_set_wds_peer(struct wiphy *wiphy, struct net_device *dev, - u8 *addr) + const u8 *addr) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 4fed166..882dc92 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -879,7 +879,7 @@ static int nl80211_set_wds_peer(struct sk_buff *skb, struct genl_info *info) struct cfg80211_registered_device *rdev = info->user_ptr[0]; struct net_device *dev = info->user_ptr[1]; struct wireless_dev *wdev = dev->ieee80211_ptr; - u8 *bssid; + const u8 *bssid; if (!info->attrs[NL80211_ATTR_MAC]) return -EINVAL; -- cgit v1.1 From 4e7f79511e7332ae4056eda9156a0299511ea41e Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Fri, 8 Oct 2010 10:33:39 -0700 Subject: net: Update kernel-doc for netif_set_real_num_rx_queues() Synchronise the comment with the preceding implementation change. Signed-off-by: Ben Hutchings Signed-off-by: David S. Miller --- net/core/dev.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index fd1b75a4..4962c8a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1576,8 +1576,8 @@ EXPORT_SYMBOL(netif_set_real_num_tx_queues); * * This must be called either with the rtnl_lock held or before * registration of the net device. Returns 0 on success, or a - * negative error code. If called before registration, it also - * sets the maximum number of queues, and always succeeds. + * negative error code. If called before registration, it always + * succeeds. */ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { -- cgit v1.1 From 8391d07b80e8da957cd888870e23f8e218438622 Mon Sep 17 00:00:00 2001 From: Dimitris Michailidis Date: Thu, 7 Oct 2010 14:48:38 +0000 Subject: ipv4: Remove leftover rcu_read_unlock calls from __mkroute_output() Commit "fib: RCU conversion of fib_lookup()" removed rcu_read_lock() from __mkroute_output but left a couple of calls to rcu_read_unlock() in there. This causes lockdep to complain that the rcu_read_unlock() call in __ip_route_output_key causes a lock inbalance and quickly crashes the kernel. The below fixes this for me. Signed-off-by: Dimitris Michailidis Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/route.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 7864d0c..3888f6b 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2396,12 +2396,10 @@ static int __mkroute_output(struct rtable **result, rth = dst_alloc(&ipv4_dst_ops); - if (!rth) { - rcu_read_unlock(); + if (!rth) return -ENOBUFS; - } + in_dev_hold(in_dev); - rcu_read_unlock(); rth->idev = in_dev; atomic_set(&rth->dst.__refcnt, 1); -- cgit v1.1 From 6d8e74ed377dd4cbad7ccc69300f734090e15c05 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Fri, 8 Oct 2010 04:02:01 +0000 Subject: Phonet: advise against enabling the pipe controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As it currently is, the new code path is not compatible with existing Nokia modems. This would break existing userspace for Nokia modem, such as the existing oFono ISI driver. Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/Kconfig | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig index 901956a..a4fceb8 100644 --- a/net/phonet/Kconfig +++ b/net/phonet/Kconfig @@ -24,4 +24,5 @@ config PHONET_PIPECTRLR data with Nokia Slim modems like WG2.5 used on ST-Ericsson U8500 platform. - If unsure, say N. + This option is incompatible with older Nokia modems. + Say N here unless you really know what you are doing. -- cgit v1.1 From 03789f26722a15ccfe6f191e9fb3d356f2f18a1e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Fri, 8 Oct 2010 04:02:02 +0000 Subject: Phonet: cleanup pipe enable socket option MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The current code works like this: int garbage, status; socklen_t len = sizeof(status); /* enable pipe */ setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &garbage, sizeof(garbage)); /* disable pipe */ setsockopt(fd, SOL_PNPIPE, PNPIPE_DISABLE, &garbage, sizeof(garbage)); /* get status */ getsockopt(fd, SOL_PNPIPE, PNPIPE_INQ, &status, &len); ...which does not follow the usual socket option pattern. This patch merges all three "options" into a single gettable&settable option, before Linux 2.6.37 gets out: int status; socklen_t len = sizeof(status); /* enable pipe */ status = 1; setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &status, sizeof(status)); /* disable pipe */ status = 0; setsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &status, sizeof(status)); /* get status */ getsockopt(fd, SOL_PNPIPE, PNPIPE_ENABLE, &status, &len); This also fixes the error code from EFAULT to ENOTCONN. Signed-off-by: Rémi Denis-Courmont Cc: Kumar Sanghvi Signed-off-by: David S. Miller --- net/phonet/pep.c | 72 +++++++++++++++++++++++--------------------------------- 1 file changed, 30 insertions(+), 42 deletions(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index aa3d870..f818f76 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -327,29 +327,20 @@ static int pipe_handler_send_ind(struct sock *sk, u16 dobj, u8 utid, return pn_skb_send(sk, skb, &spn); } -static int pipe_handler_enable_pipe(struct sock *sk, int cmd) +static int pipe_handler_enable_pipe(struct sock *sk, int enable) { - int ret; struct pep_sock *pn = pep_sk(sk); - - switch (cmd) { - case PNPIPE_ENABLE: - ret = pipe_handler_send_req(sk, pn->pn_sk.sobject, - PNS_PIPE_ENABLE_UTID, PNS_PEP_ENABLE_REQ, - pn->pipe_handle, GFP_ATOMIC); - break; - - case PNPIPE_DISABLE: - ret = pipe_handler_send_req(sk, pn->pn_sk.sobject, - PNS_PIPE_DISABLE_UTID, PNS_PEP_DISABLE_REQ, - pn->pipe_handle, GFP_ATOMIC); - break; - - default: - ret = -EINVAL; + int utid, req; + + if (enable) { + utid = PNS_PIPE_ENABLE_UTID; + req = PNS_PEP_ENABLE_REQ; + } else { + utid = PNS_PIPE_DISABLE_UTID; + req = PNS_PEP_DISABLE_REQ; } - - return ret; + return pipe_handler_send_req(sk, pn->pn_sk.sobject, utid, req, + pn->pipe_handle, GFP_ATOMIC); } static int pipe_handler_create_pipe(struct sock *sk, int pipe_handle, int cmd) @@ -1187,23 +1178,6 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, break; } - case PNPIPE_ENABLE: - if (pn->pipe_state != PIPE_DISABLED) { - err = -EFAULT; - break; - } - err = pipe_handler_enable_pipe(sk, PNPIPE_ENABLE); - break; - - case PNPIPE_DISABLE: - if (pn->pipe_state != PIPE_ENABLED) { - err = -EFAULT; - break; - } - - err = pipe_handler_enable_pipe(sk, PNPIPE_DISABLE); - break; - case PNPIPE_DESTROY: if (pn->pipe_state < PIPE_DISABLED) { err = -EFAULT; @@ -1239,6 +1213,17 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, err = 0; } goto out_norel; + +#ifdef CONFIG_PHONET_PIPECTRLR + case PNPIPE_ENABLE: + if (pn->pipe_state <= PIPE_IDLE) { + err = -ENOTCONN; + break; + } + err = pipe_handler_enable_pipe(sk, val); + break; +#endif + default: err = -ENOPROTOOPT; } @@ -1264,15 +1249,18 @@ static int pep_getsockopt(struct sock *sk, int level, int optname, val = pn->ifindex ? PNPIPE_ENCAP_IP : PNPIPE_ENCAP_NONE; break; + case PNPIPE_IFINDEX: + val = pn->ifindex; + break; + #ifdef CONFIG_PHONET_PIPECTRLR - case PNPIPE_INQ: - val = pn->pipe_state; + case PNPIPE_ENABLE: + if (pn->pipe_state <= PIPE_IDLE) + return -ENOTCONN; + val = pn->pipe_state != PIPE_DISABLED; break; #endif - case PNPIPE_IFINDEX: - val = pn->ifindex; - break; default: return -ENOPROTOOPT; } -- cgit v1.1 From a131d82266e77b8eb8a2dab930a94ed0de9e91b0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?R=C3=A9mi=20Denis-Courmont?= Date: Fri, 8 Oct 2010 04:02:03 +0000 Subject: Phonet: mark the pipe controller as EXPERIMENTAL MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit There are a bunch of issues that need to be fixed, including: - GFP_KERNEL allocations from atomic context (and GFP_ATOMIC in process context), - abuse of the setsockopt() call convention, - unprotected/unlocked static variables... IMHO, we will need to alter the userspace ABI when we fix it. So mark the configuration option as EXPERIMENTAL for the time being (or should it be BROKEN instead?). Signed-off-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/phonet/Kconfig b/net/phonet/Kconfig index a4fceb8..0d9b8a2 100644 --- a/net/phonet/Kconfig +++ b/net/phonet/Kconfig @@ -16,8 +16,8 @@ config PHONET will be called phonet. If unsure, say N. config PHONET_PIPECTRLR - bool "Phonet Pipe Controller" - depends on PHONET + bool "Phonet Pipe Controller (EXPERIMENTAL)" + depends on PHONET && EXPERIMENTAL default N help The Pipe Controller implementation in Phonet stack to support Pipe -- cgit v1.1 From 4315d834c1496ddca977e9e22002b77c85bfec2c Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Thu, 7 Oct 2010 10:09:10 +0000 Subject: net: Fix rxq ref counting The rx->count reference is used to track reference counts to the number of rx-queue kobjects created for the device. This patch eliminates initialization of the counter in netif_alloc_rx_queues and instead increments the counter each time a kobject is created. This is now symmetric with the decrement that is done when an object is released. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 1 - net/core/net-sysfs.c | 2 ++ 2 files changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4962c8a..193eafa 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5024,7 +5024,6 @@ static int netif_alloc_rx_queues(struct net_device *dev) return -ENOMEM; } dev->_rx = rx; - atomic_set(&rx->count, count); /* * Set a pointer to first element in the array which holds the diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index fa81fd0..b143173 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -726,6 +726,7 @@ static struct kobj_type rx_queue_ktype = { static int rx_queue_add_kobject(struct net_device *net, int index) { struct netdev_rx_queue *queue = net->_rx + index; + struct netdev_rx_queue *first = queue->first; struct kobject *kobj = &queue->kobj; int error = 0; @@ -738,6 +739,7 @@ static int rx_queue_add_kobject(struct net_device *net, int index) } kobject_uevent(kobj, KOBJ_ADD); + atomic_inc(&first->count); return error; } -- cgit v1.1 From e18434c45765cfc4a74b5c61da9e9fbc6ddd3d5f Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Thu, 30 Sep 2010 06:17:44 +0000 Subject: net_sched: use __TCA_HTB_MAX and TCA_HTB_MAX Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/sched/sch_htb.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 4be8d04..87d9449 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1302,14 +1302,14 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, struct htb_class *cl = (struct htb_class *)*arg, *parent; struct nlattr *opt = tca[TCA_OPTIONS]; struct qdisc_rate_table *rtab = NULL, *ctab = NULL; - struct nlattr *tb[TCA_HTB_RTAB + 1]; + struct nlattr *tb[__TCA_HTB_MAX]; struct tc_htb_opt *hopt; /* extract all subattrs from opt attr */ if (!opt) goto failure; - err = nla_parse_nested(tb, TCA_HTB_RTAB, opt, htb_policy); + err = nla_parse_nested(tb, TCA_HTB_MAX, opt, htb_policy); if (err < 0) goto failure; -- cgit v1.1 From 5b40964eadea40509d353318d2c82e8b7bf5e8a5 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Mon, 11 Oct 2010 00:46:51 +0200 Subject: irda: Remove BKL instances from af_irda.c Most of the times, lock_kernel() was pointless or could simply be replaced by lock_sock(). Signed-off-by: Samuel Ortiz --- net/irda/af_irda.c | 370 ++++++++++++++++++++++++++++------------------------- 1 file changed, 196 insertions(+), 174 deletions(-) (limited to 'net') diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index bf36351..7f097989 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -715,14 +715,11 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_irda saddr; struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); - int err; - lock_kernel(); memset(&saddr, 0, sizeof(saddr)); if (peer) { - err = -ENOTCONN; if (sk->sk_state != TCP_ESTABLISHED) - goto out; + return -ENOTCONN; saddr.sir_family = AF_IRDA; saddr.sir_lsap_sel = self->dtsap_sel; @@ -739,10 +736,8 @@ static int irda_getname(struct socket *sock, struct sockaddr *uaddr, /* uaddr_len come to us uninitialised */ *uaddr_len = sizeof (struct sockaddr_irda); memcpy(uaddr, &saddr, *uaddr_len); - err = 0; -out: - unlock_kernel(); - return err; + + return 0; } /* @@ -758,7 +753,8 @@ static int irda_listen(struct socket *sock, int backlog) IRDA_DEBUG(2, "%s()\n", __func__); - lock_kernel(); + lock_sock(sk); + if ((sk->sk_type != SOCK_STREAM) && (sk->sk_type != SOCK_SEQPACKET) && (sk->sk_type != SOCK_DGRAM)) goto out; @@ -770,7 +766,7 @@ static int irda_listen(struct socket *sock, int backlog) err = 0; } out: - unlock_kernel(); + release_sock(sk); return err; } @@ -793,7 +789,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) if (addr_len != sizeof(struct sockaddr_irda)) return -EINVAL; - lock_kernel(); + lock_sock(sk); #ifdef CONFIG_IRDA_ULTRA /* Special care for Ultra sockets */ if ((sk->sk_type == SOCK_DGRAM) && @@ -836,7 +832,7 @@ static int irda_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) err = 0; out: - unlock_kernel(); + release_sock(sk); return err; } @@ -856,12 +852,13 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) IRDA_DEBUG(2, "%s()\n", __func__); - lock_kernel(); err = irda_create(sock_net(sk), newsock, sk->sk_protocol, 0); if (err) - goto out; + return err; err = -EINVAL; + + lock_sock(sk); if (sock->state != SS_UNCONNECTED) goto out; @@ -947,7 +944,7 @@ static int irda_accept(struct socket *sock, struct socket *newsock, int flags) irda_connect_response(new); err = 0; out: - unlock_kernel(); + release_sock(sk); return err; } @@ -981,7 +978,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, IRDA_DEBUG(2, "%s(%p)\n", __func__, self); - lock_kernel(); + lock_sock(sk); /* Don't allow connect for Ultra sockets */ err = -ESOCKTNOSUPPORT; if ((sk->sk_type == SOCK_DGRAM) && (sk->sk_protocol == IRDAPROTO_ULTRA)) @@ -1072,6 +1069,8 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, if (sk->sk_state != TCP_ESTABLISHED) { sock->state = SS_UNCONNECTED; + if (sk->sk_prot->disconnect(sk, flags)) + sock->state = SS_DISCONNECTING; err = sock_error(sk); if (!err) err = -ECONNRESET; @@ -1084,7 +1083,7 @@ static int irda_connect(struct socket *sock, struct sockaddr *uaddr, self->saddr = irttp_get_saddr(self->tsap); err = 0; out: - unlock_kernel(); + release_sock(sk); return err; } @@ -1231,7 +1230,6 @@ static int irda_release(struct socket *sock) if (sk == NULL) return 0; - lock_kernel(); lock_sock(sk); sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; @@ -1250,7 +1248,6 @@ static int irda_release(struct socket *sock) /* Destroy networking socket if we are the last reference on it, * i.e. if(sk->sk_refcnt == 0) -> sk_free(sk) */ sock_put(sk); - unlock_kernel(); /* Notes on socket locking and deallocation... - Jean II * In theory we should put pairs of sock_hold() / sock_put() to @@ -1298,7 +1295,6 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); - lock_kernel(); /* Note : socket.c set MSG_EOR on SEQPACKET sockets */ if (msg->msg_flags & ~(MSG_DONTWAIT | MSG_EOR | MSG_CMSG_COMPAT | MSG_NOSIGNAL)) { @@ -1306,6 +1302,8 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, goto out; } + lock_sock(sk); + if (sk->sk_shutdown & SEND_SHUTDOWN) goto out_err; @@ -1361,14 +1359,14 @@ static int irda_sendmsg(struct kiocb *iocb, struct socket *sock, goto out_err; } - unlock_kernel(); + release_sock(sk); /* Tell client how much data we actually sent */ return len; out_err: err = sk_stream_error(sk, msg->msg_flags, err); out: - unlock_kernel(); + release_sock(sk); return err; } @@ -1390,14 +1388,10 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(4, "%s()\n", __func__); - lock_kernel(); - if ((err = sock_error(sk)) < 0) - goto out; - skb = skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &err); if (!skb) - goto out; + return err; skb_reset_transport_header(skb); copied = skb->len; @@ -1425,12 +1419,8 @@ static int irda_recvmsg_dgram(struct kiocb *iocb, struct socket *sock, irttp_flow_request(self->tsap, FLOW_START); } } - unlock_kernel(); - return copied; -out: - unlock_kernel(); - return err; + return copied; } /* @@ -1448,17 +1438,15 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(3, "%s()\n", __func__); - lock_kernel(); if ((err = sock_error(sk)) < 0) - goto out; + return err; - err = -EINVAL; if (sock->flags & __SO_ACCEPTCON) - goto out; + return -EINVAL; err =-EOPNOTSUPP; if (flags & MSG_OOB) - goto out; + return -EOPNOTSUPP; err = 0; target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); @@ -1500,7 +1488,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, finish_wait(sk_sleep(sk), &wait); if (err) - goto out; + return err; if (sk->sk_shutdown & RCV_SHUTDOWN) break; @@ -1553,9 +1541,7 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, } } -out: - unlock_kernel(); - return err ? : copied; + return copied; } /* @@ -1573,13 +1559,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, struct sk_buff *skb; int err; - lock_kernel(); - IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); - err = -EINVAL; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) - goto out; + return -EINVAL; + + lock_sock(sk); if (sk->sk_shutdown & SEND_SHUTDOWN) { send_sig(SIGPIPE, current, 0); @@ -1630,10 +1615,12 @@ static int irda_sendmsg_dgram(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); goto out; } - unlock_kernel(); + + release_sock(sk); return len; + out: - unlock_kernel(); + release_sock(sk); return err; } @@ -1656,10 +1643,11 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, IRDA_DEBUG(4, "%s(), len=%zd\n", __func__, len); - lock_kernel(); err = -EINVAL; if (msg->msg_flags & ~(MSG_DONTWAIT|MSG_CMSG_COMPAT)) - goto out; + return -EINVAL; + + lock_sock(sk); err = -EPIPE; if (sk->sk_shutdown & SEND_SHUTDOWN) { @@ -1732,7 +1720,7 @@ static int irda_sendmsg_ultra(struct kiocb *iocb, struct socket *sock, if (err) IRDA_DEBUG(0, "%s(), err=%d\n", __func__, err); out: - unlock_kernel(); + release_sock(sk); return err ? : len; } #endif /* CONFIG_IRDA_ULTRA */ @@ -1747,7 +1735,7 @@ static int irda_shutdown(struct socket *sock, int how) IRDA_DEBUG(1, "%s(%p)\n", __func__, self); - lock_kernel(); + lock_sock(sk); sk->sk_state = TCP_CLOSE; sk->sk_shutdown |= SEND_SHUTDOWN; @@ -1769,7 +1757,7 @@ static int irda_shutdown(struct socket *sock, int how) self->daddr = DEV_ADDR_ANY; /* Until we get re-connected */ self->saddr = 0x0; /* so IrLMP assign us any link */ - unlock_kernel(); + release_sock(sk); return 0; } @@ -1786,7 +1774,6 @@ static unsigned int irda_poll(struct file * file, struct socket *sock, IRDA_DEBUG(4, "%s()\n", __func__); - lock_kernel(); poll_wait(file, sk_sleep(sk), wait); mask = 0; @@ -1834,20 +1821,8 @@ static unsigned int irda_poll(struct file * file, struct socket *sock, default: break; } - unlock_kernel(); - return mask; -} -static unsigned int irda_datagram_poll(struct file *file, struct socket *sock, - poll_table *wait) -{ - int err; - - lock_kernel(); - err = datagram_poll(file, sock, wait); - unlock_kernel(); - - return err; + return mask; } /* @@ -1860,7 +1835,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) IRDA_DEBUG(4, "%s(), cmd=%#x\n", __func__, cmd); - lock_kernel(); err = -EINVAL; switch (cmd) { case TIOCOUTQ: { @@ -1903,7 +1877,6 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __func__); err = -ENOIOCTLCMD; } - unlock_kernel(); return err; } @@ -1927,7 +1900,7 @@ static int irda_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned lon * Set some options for the socket * */ -static int __irda_setsockopt(struct socket *sock, int level, int optname, +static int irda_setsockopt(struct socket *sock, int level, int optname, char __user *optval, unsigned int optlen) { struct sock *sk = sock->sk; @@ -1935,13 +1908,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, struct irda_ias_set *ias_opt; struct ias_object *ias_obj; struct ias_attrib * ias_attr; /* Attribute in IAS object */ - int opt, free_ias = 0; + int opt, free_ias = 0, err = 0; IRDA_DEBUG(2, "%s(%p)\n", __func__, self); if (level != SOL_IRLMP) return -ENOPROTOOPT; + lock_sock(sk); + switch (optname) { case IRLMP_IAS_SET: /* The user want to add an attribute to an existing IAS object @@ -1951,17 +1926,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, * create the right attribute... */ - if (optlen != sizeof(struct irda_ias_set)) - return -EINVAL; + if (optlen != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); - if (ias_opt == NULL) - return -ENOMEM; + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } /* Copy query to the driver. */ if (copy_from_user(ias_opt, optval, optlen)) { kfree(ias_opt); - return -EFAULT; + err = -EFAULT; + goto out; } /* Find the object we target. @@ -1971,7 +1951,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, if(ias_opt->irda_class_name[0] == '\0') { if(self->ias_obj == NULL) { kfree(ias_opt); - return -EINVAL; + err = -EINVAL; + goto out; } ias_obj = self->ias_obj; } else @@ -1983,7 +1964,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, if((!capable(CAP_NET_ADMIN)) && ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { kfree(ias_opt); - return -EPERM; + err = -EPERM; + goto out; } /* If the object doesn't exist, create it */ @@ -1993,7 +1975,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, jiffies); if (ias_obj == NULL) { kfree(ias_opt); - return -ENOMEM; + err = -ENOMEM; + goto out; } free_ias = 1; } @@ -2005,7 +1988,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, kfree(ias_obj->name); kfree(ias_obj); } - return -EINVAL; + err = -EINVAL; + goto out; } /* Look at the type */ @@ -2028,7 +2012,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, kfree(ias_obj); } - return -EINVAL; + err = -EINVAL; + goto out; } /* Add an octet sequence attribute */ irias_add_octseq_attrib( @@ -2060,7 +2045,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, kfree(ias_obj->name); kfree(ias_obj); } - return -EINVAL; + err = -EINVAL; + goto out; } irias_insert_object(ias_obj); kfree(ias_opt); @@ -2071,17 +2057,22 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, * object is not owned by the kernel and delete it. */ - if (optlen != sizeof(struct irda_ias_set)) - return -EINVAL; + if (optlen != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); - if (ias_opt == NULL) - return -ENOMEM; + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } /* Copy query to the driver. */ if (copy_from_user(ias_opt, optval, optlen)) { kfree(ias_opt); - return -EFAULT; + err = -EFAULT; + goto out; } /* Find the object we target. @@ -2094,7 +2085,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, ias_obj = irias_find_object(ias_opt->irda_class_name); if(ias_obj == (struct ias_object *) NULL) { kfree(ias_opt); - return -EINVAL; + err = -EINVAL; + goto out; } /* Only ROOT can mess with the global IAS database. @@ -2103,7 +2095,8 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, if((!capable(CAP_NET_ADMIN)) && ((ias_obj == NULL) || (ias_obj != self->ias_obj))) { kfree(ias_opt); - return -EPERM; + err = -EPERM; + goto out; } /* Find the attribute (in the object) we target */ @@ -2111,14 +2104,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, ias_opt->irda_attrib_name); if(ias_attr == (struct ias_attrib *) NULL) { kfree(ias_opt); - return -EINVAL; + err = -EINVAL; + goto out; } /* Check is the user space own the object */ if(ias_attr->value->owner != IAS_USER_ATTR) { IRDA_DEBUG(1, "%s(), attempting to delete a kernel attribute\n", __func__); kfree(ias_opt); - return -EPERM; + err = -EPERM; + goto out; } /* Remove the attribute (and maybe the object) */ @@ -2126,11 +2121,15 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, kfree(ias_opt); break; case IRLMP_MAX_SDU_SIZE: - if (optlen < sizeof(int)) - return -EINVAL; + if (optlen < sizeof(int)) { + err = -EINVAL; + goto out; + } - if (get_user(opt, (int __user *)optval)) - return -EFAULT; + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + goto out; + } /* Only possible for a seqpacket service (TTP with SAR) */ if (sk->sk_type != SOCK_SEQPACKET) { @@ -2140,16 +2139,21 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, } else { IRDA_WARNING("%s: not allowed to set MAXSDUSIZE for this socket type!\n", __func__); - return -ENOPROTOOPT; + err = -ENOPROTOOPT; + goto out; } break; case IRLMP_HINTS_SET: - if (optlen < sizeof(int)) - return -EINVAL; + if (optlen < sizeof(int)) { + err = -EINVAL; + goto out; + } /* The input is really a (__u8 hints[2]), easier as an int */ - if (get_user(opt, (int __user *)optval)) - return -EFAULT; + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + goto out; + } /* Unregister any old registration */ if (self->skey) @@ -2163,12 +2167,16 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, * making a discovery (nodes which don't match any hint * bit in the mask are not reported). */ - if (optlen < sizeof(int)) - return -EINVAL; + if (optlen < sizeof(int)) { + err = -EINVAL; + goto out; + } /* The input is really a (__u8 hints[2]), easier as an int */ - if (get_user(opt, (int __user *)optval)) - return -EFAULT; + if (get_user(opt, (int __user *)optval)) { + err = -EFAULT; + goto out; + } /* Set the new hint mask */ self->mask.word = (__u16) opt; @@ -2180,19 +2188,12 @@ static int __irda_setsockopt(struct socket *sock, int level, int optname, break; default: - return -ENOPROTOOPT; + err = -ENOPROTOOPT; + break; } - return 0; -} -static int irda_setsockopt(struct socket *sock, int level, int optname, - char __user *optval, unsigned int optlen) -{ - int err; - - lock_kernel(); - err = __irda_setsockopt(sock, level, optname, optval, optlen); - unlock_kernel(); +out: + release_sock(sk); return err; } @@ -2249,7 +2250,7 @@ static int irda_extract_ias_value(struct irda_ias_set *ias_opt, /* * Function irda_getsockopt (sock, level, optname, optval, optlen) */ -static int __irda_getsockopt(struct socket *sock, int level, int optname, +static int irda_getsockopt(struct socket *sock, int level, int optname, char __user *optval, int __user *optlen) { struct sock *sk = sock->sk; @@ -2262,7 +2263,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname, int daddr = DEV_ADDR_ANY; /* Dest address for IAS queries */ int val = 0; int len = 0; - int err; + int err = 0; int offset, total; IRDA_DEBUG(2, "%s(%p)\n", __func__, self); @@ -2276,15 +2277,18 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname, if(len < 0) return -EINVAL; + lock_sock(sk); + switch (optname) { case IRLMP_ENUMDEVICES: /* Ask lmp for the current discovery log */ discoveries = irlmp_get_discoveries(&list.len, self->mask.word, self->nslots); /* Check if the we got some results */ - if (discoveries == NULL) - return -EAGAIN; /* Didn't find any devices */ - err = 0; + if (discoveries == NULL) { + err = -EAGAIN; + goto out; /* Didn't find any devices */ + } /* Write total list length back to client */ if (copy_to_user(optval, &list, @@ -2297,8 +2301,7 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname, sizeof(struct irda_device_info); /* Copy the list itself - watch for overflow */ - if(list.len > 2048) - { + if (list.len > 2048) { err = -EINVAL; goto bed; } @@ -2314,17 +2317,20 @@ static int __irda_getsockopt(struct socket *sock, int level, int optname, bed: /* Free up our buffer */ kfree(discoveries); - if (err) - return err; break; case IRLMP_MAX_SDU_SIZE: val = self->max_data_size; len = sizeof(int); - if (put_user(len, optlen)) - return -EFAULT; + if (put_user(len, optlen)) { + err = -EFAULT; + goto out; + } + + if (copy_to_user(optval, &val, len)) { + err = -EFAULT; + goto out; + } - if (copy_to_user(optval, &val, len)) - return -EFAULT; break; case IRLMP_IAS_GET: /* The user want an object from our local IAS database. @@ -2332,17 +2338,22 @@ bed: * that we found */ /* Check that the user has allocated the right space for us */ - if (len != sizeof(struct irda_ias_set)) - return -EINVAL; + if (len != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); - if (ias_opt == NULL) - return -ENOMEM; + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } /* Copy query to the driver. */ if (copy_from_user(ias_opt, optval, len)) { kfree(ias_opt); - return -EFAULT; + err = -EFAULT; + goto out; } /* Find the object we target. @@ -2355,7 +2366,8 @@ bed: ias_obj = irias_find_object(ias_opt->irda_class_name); if(ias_obj == (struct ias_object *) NULL) { kfree(ias_opt); - return -EINVAL; + err = -EINVAL; + goto out; } /* Find the attribute (in the object) we target */ @@ -2363,21 +2375,23 @@ bed: ias_opt->irda_attrib_name); if(ias_attr == (struct ias_attrib *) NULL) { kfree(ias_opt); - return -EINVAL; + err = -EINVAL; + goto out; } /* Translate from internal to user structure */ err = irda_extract_ias_value(ias_opt, ias_attr->value); if(err) { kfree(ias_opt); - return err; + goto out; } /* Copy reply to the user */ if (copy_to_user(optval, ias_opt, sizeof(struct irda_ias_set))) { kfree(ias_opt); - return -EFAULT; + err = -EFAULT; + goto out; } /* Note : don't need to put optlen, we checked it */ kfree(ias_opt); @@ -2388,17 +2402,22 @@ bed: * then wait for the answer to come back. */ /* Check that the user has allocated the right space for us */ - if (len != sizeof(struct irda_ias_set)) - return -EINVAL; + if (len != sizeof(struct irda_ias_set)) { + err = -EINVAL; + goto out; + } ias_opt = kmalloc(sizeof(struct irda_ias_set), GFP_ATOMIC); - if (ias_opt == NULL) - return -ENOMEM; + if (ias_opt == NULL) { + err = -ENOMEM; + goto out; + } /* Copy query to the driver. */ if (copy_from_user(ias_opt, optval, len)) { kfree(ias_opt); - return -EFAULT; + err = -EFAULT; + goto out; } /* At this point, there are two cases... @@ -2419,7 +2438,8 @@ bed: daddr = ias_opt->daddr; if((!daddr) || (daddr == DEV_ADDR_ANY)) { kfree(ias_opt); - return -EINVAL; + err = -EINVAL; + goto out; } } @@ -2428,7 +2448,8 @@ bed: IRDA_WARNING("%s: busy with a previous query\n", __func__); kfree(ias_opt); - return -EBUSY; + err = -EBUSY; + goto out; } self->iriap = iriap_open(LSAP_ANY, IAS_CLIENT, self, @@ -2436,7 +2457,8 @@ bed: if (self->iriap == NULL) { kfree(ias_opt); - return -ENOMEM; + err = -ENOMEM; + goto out; } /* Treat unexpected wakeup as disconnect */ @@ -2455,7 +2477,8 @@ bed: * we can free it regardless! */ kfree(ias_opt); /* Treat signals as disconnect */ - return -EHOSTUNREACH; + err = -EHOSTUNREACH; + goto out; } /* Check what happened */ @@ -2465,9 +2488,11 @@ bed: /* Requested object/attribute doesn't exist */ if((self->errno == IAS_CLASS_UNKNOWN) || (self->errno == IAS_ATTRIB_UNKNOWN)) - return -EADDRNOTAVAIL; + err = -EADDRNOTAVAIL; else - return -EHOSTUNREACH; + err = -EHOSTUNREACH; + + goto out; } /* Translate from internal to user structure */ @@ -2476,14 +2501,15 @@ bed: irias_delete_value(self->ias_result); if (err) { kfree(ias_opt); - return err; + goto out; } /* Copy reply to the user */ if (copy_to_user(optval, ias_opt, sizeof(struct irda_ias_set))) { kfree(ias_opt); - return -EFAULT; + err = -EFAULT; + goto out; } /* Note : don't need to put optlen, we checked it */ kfree(ias_opt); @@ -2504,11 +2530,15 @@ bed: */ /* Check that the user is passing us an int */ - if (len != sizeof(int)) - return -EINVAL; + if (len != sizeof(int)) { + err = -EINVAL; + goto out; + } /* Get timeout in ms (max time we block the caller) */ - if (get_user(val, (int __user *)optval)) - return -EFAULT; + if (get_user(val, (int __user *)optval)) { + err = -EFAULT; + goto out; + } /* Tell IrLMP we want to be notified */ irlmp_update_client(self->ckey, self->mask.word, @@ -2520,8 +2550,6 @@ bed: /* Wait until a node is discovered */ if (!self->cachedaddr) { - int ret = 0; - IRDA_DEBUG(1, "%s(), nothing discovered yet, going to sleep...\n", __func__); /* Set watchdog timer to expire in ms. */ @@ -2534,7 +2562,7 @@ bed: /* Wait for IR-LMP to call us back */ __wait_event_interruptible(self->query_wait, (self->cachedaddr != 0 || self->errno == -ETIME), - ret); + err); /* If watchdog is still activated, kill it! */ if(timer_pending(&(self->watchdog))) @@ -2542,8 +2570,8 @@ bed: IRDA_DEBUG(1, "%s(), ...waking up !\n", __func__); - if (ret != 0) - return ret; + if (err != 0) + goto out; } else IRDA_DEBUG(1, "%s(), found immediately !\n", @@ -2566,25 +2594,19 @@ bed: * If the user want more details, he should query * the whole discovery log and pick one device... */ - if (put_user(daddr, (int __user *)optval)) - return -EFAULT; + if (put_user(daddr, (int __user *)optval)) { + err = -EFAULT; + goto out; + } break; default: - return -ENOPROTOOPT; + err = -ENOPROTOOPT; } - return 0; -} - -static int irda_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) -{ - int err; +out: - lock_kernel(); - err = __irda_getsockopt(sock, level, optname, optval, optlen); - unlock_kernel(); + release_sock(sk); return err; } @@ -2628,7 +2650,7 @@ static const struct proto_ops irda_seqpacket_ops = { .socketpair = sock_no_socketpair, .accept = irda_accept, .getname = irda_getname, - .poll = irda_datagram_poll, + .poll = datagram_poll, .ioctl = irda_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = irda_compat_ioctl, @@ -2652,7 +2674,7 @@ static const struct proto_ops irda_dgram_ops = { .socketpair = sock_no_socketpair, .accept = irda_accept, .getname = irda_getname, - .poll = irda_datagram_poll, + .poll = datagram_poll, .ioctl = irda_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = irda_compat_ioctl, @@ -2677,7 +2699,7 @@ static const struct proto_ops irda_ultra_ops = { .socketpair = sock_no_socketpair, .accept = sock_no_accept, .getname = irda_getname, - .poll = irda_datagram_poll, + .poll = datagram_poll, .ioctl = irda_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = irda_compat_ioctl, -- cgit v1.1 From f8cba16cad68c9b9ee7fecae48a1b91708e8e482 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Tue, 5 Oct 2010 01:24:20 +0200 Subject: irda: Remove BKL instances from irnet The code intends to lock the irnet_socket, so adding a mutex to it allows for a complet BKL removal. Signed-off-by: Samuel Ortiz --- net/irda/irnet/irnet.h | 2 ++ net/irda/irnet/irnet_ppp.c | 61 +++++++++++++++++++++++++++++++--------------- 2 files changed, 44 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/irda/irnet/irnet.h b/net/irda/irnet/irnet.h index 4300df3..0d82ff5 100644 --- a/net/irda/irnet/irnet.h +++ b/net/irda/irnet/irnet.h @@ -458,6 +458,8 @@ typedef struct irnet_socket int disco_index; /* Last read in the discovery log */ int disco_number; /* Size of the discovery log */ + struct mutex lock; + } irnet_socket; /* diff --git a/net/irda/irnet/irnet_ppp.c b/net/irda/irnet/irnet_ppp.c index 69f1fa6..0993bd4 100644 --- a/net/irda/irnet/irnet_ppp.c +++ b/net/irda/irnet/irnet_ppp.c @@ -480,7 +480,6 @@ dev_irnet_open(struct inode * inode, ap = kzalloc(sizeof(*ap), GFP_KERNEL); DABORT(ap == NULL, -ENOMEM, FS_ERROR, "Can't allocate struct irnet...\n"); - lock_kernel(); /* initialize the irnet structure */ ap->file = file; @@ -502,18 +501,20 @@ dev_irnet_open(struct inode * inode, { DERROR(FS_ERROR, "Can't setup IrDA link...\n"); kfree(ap); - unlock_kernel(); + return err; } /* For the control channel */ ap->event_index = irnet_events.index; /* Cancel all past events */ + mutex_init(&ap->lock); + /* Put our stuff where we will be able to find it later */ file->private_data = ap; DEXIT(FS_TRACE, " - ap=0x%p\n", ap); - unlock_kernel(); + return 0; } @@ -664,7 +665,9 @@ dev_irnet_ioctl( { DEBUG(FS_INFO, "Entering PPP discipline.\n"); /* PPP channel setup (ap->chan in configued in dev_irnet_open())*/ - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + err = ppp_register_channel(&ap->chan); if(err == 0) { @@ -677,14 +680,17 @@ dev_irnet_ioctl( } else DERROR(FS_ERROR, "Can't setup PPP channel...\n"); - unlock_kernel(); + + mutex_unlock(&ap->lock); } else { /* In theory, should be N_TTY */ DEBUG(FS_INFO, "Exiting PPP discipline.\n"); /* Disconnect from the generic PPP layer */ - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + if(ap->ppp_open) { ap->ppp_open = 0; @@ -693,24 +699,31 @@ dev_irnet_ioctl( else DERROR(FS_ERROR, "Channel not registered !\n"); err = 0; - unlock_kernel(); + + mutex_unlock(&ap->lock); } break; /* Query PPP channel and unit number */ case PPPIOCGCHAN: - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + if(ap->ppp_open && !put_user(ppp_channel_index(&ap->chan), (int __user *)argp)) err = 0; - unlock_kernel(); + + mutex_unlock(&ap->lock); break; case PPPIOCGUNIT: - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + if(ap->ppp_open && !put_user(ppp_unit_number(&ap->chan), (int __user *)argp)) err = 0; - unlock_kernel(); + + mutex_unlock(&ap->lock); break; /* All these ioctls can be passed both directly and from ppp_generic, @@ -730,9 +743,12 @@ dev_irnet_ioctl( if(!capable(CAP_NET_ADMIN)) err = -EPERM; else { - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + err = ppp_irnet_ioctl(&ap->chan, cmd, arg); - unlock_kernel(); + + mutex_unlock(&ap->lock); } break; @@ -740,7 +756,9 @@ dev_irnet_ioctl( /* Get termios */ case TCGETS: DEBUG(FS_INFO, "Get termios.\n"); - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + #ifndef TCGETS2 if(!kernel_termios_to_user_termios((struct termios __user *)argp, &ap->termios)) err = 0; @@ -748,12 +766,15 @@ dev_irnet_ioctl( if(kernel_termios_to_user_termios_1((struct termios __user *)argp, &ap->termios)) err = 0; #endif - unlock_kernel(); + + mutex_unlock(&ap->lock); break; /* Set termios */ case TCSETSF: DEBUG(FS_INFO, "Set termios.\n"); - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; + #ifndef TCGETS2 if(!user_termios_to_kernel_termios(&ap->termios, (struct termios __user *)argp)) err = 0; @@ -761,7 +782,8 @@ dev_irnet_ioctl( if(!user_termios_to_kernel_termios_1(&ap->termios, (struct termios __user *)argp)) err = 0; #endif - unlock_kernel(); + + mutex_unlock(&ap->lock); break; /* Set DTR/RTS */ @@ -784,9 +806,10 @@ dev_irnet_ioctl( * We should also worry that we don't accept junk here and that * we get rid of our own buffers */ #ifdef FLUSH_TO_PPP - lock_kernel(); + if (mutex_lock_interruptible(&ap->lock)) + return -EINTR; ppp_output_wakeup(&ap->chan); - unlock_kernel(); + mutex_unlock(&ap->lock); #endif /* FLUSH_TO_PPP */ err = 0; break; -- cgit v1.1 From efc463eb508798da4243625b08c7396462cabf9f Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Mon, 11 Oct 2010 01:17:56 +0200 Subject: irda: Fix parameter extraction stack overflow Cc: stable@kernel.org Reported-by: Ilja Van Sprundel Signed-off-by: Samuel Ortiz --- net/irda/parameters.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/parameters.c b/net/irda/parameters.c index fc1a205..71cd38c 100644 --- a/net/irda/parameters.c +++ b/net/irda/parameters.c @@ -298,6 +298,8 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, p.pi = pi; /* In case handler needs to know */ p.pl = buf[1]; /* Extract length of value */ + if (p.pl > 32) + p.pl = 32; IRDA_DEBUG(2, "%s(), pi=%#x, pl=%d\n", __func__, p.pi, p.pl); @@ -318,7 +320,7 @@ static int irda_extract_string(void *self, __u8 *buf, int len, __u8 pi, (__u8) str[0], (__u8) str[1]); /* Null terminate string */ - str[p.pl+1] = '\0'; + str[p.pl] = '\0'; p.pv.c = str; /* Handler will need to take a copy */ -- cgit v1.1 From 37f9fc452d138dfc4da2ee1ce5ae85094efc3606 Mon Sep 17 00:00:00 2001 From: Samuel Ortiz Date: Wed, 6 Oct 2010 01:03:12 +0200 Subject: irda: Fix heap memory corruption in iriap.c While parsing the GetValuebyClass command frame, we could potentially write passed the skb->data pointer. Cc: stable@kernel.org Reported-by: Ilja Van Sprundel Signed-off-by: Samuel Ortiz --- net/irda/iriap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/irda/iriap.c b/net/irda/iriap.c index fce364c..5b743bd 100644 --- a/net/irda/iriap.c +++ b/net/irda/iriap.c @@ -502,7 +502,8 @@ static void iriap_getvaluebyclass_confirm(struct iriap_cb *self, IRDA_DEBUG(4, "%s(), strlen=%d\n", __func__, value_len); /* Make sure the string is null-terminated */ - fp[n+value_len] = 0x00; + if (n + value_len < skb->len) + fp[n + value_len] = 0x00; IRDA_DEBUG(4, "Got string %s\n", fp+n); /* Will truncate to IAS_MAX_STRING bytes */ -- cgit v1.1 From 34d101dd6204bd100fc2e6f7b5f9a10f959ce2c9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 09:16:57 -0700 Subject: neigh: speedup neigh_hh_init() When a new dst is used to send a frame, neigh_resolve_output() tries to associate an struct hh_cache to this dst, calling neigh_hh_init() with the neigh rwlock write locked. Most of the time, hh_cache is already known and linked into neighbour, so we find it and increment its refcount. This patch changes the logic so that we call neigh_hh_init() with neighbour lock read locked only, so that fast path can be run in parallel by concurrent cpus. This brings part of the speedup we got with commit c7d4426a98a5f (introduce DST_NOCACHE flag) for non cached dsts, even for cached ones, removing one of the contention point that routers hit on multiqueue enabled machines. Further improvements would need to use a seqlock instead of an rwlock to protect neigh->ha[], to not dirty neigh too often and remove two atomic ops. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dst.c | 4 +-- net/core/neighbour.c | 99 ++++++++++++++++++++++++++++++++-------------------- 2 files changed, 63 insertions(+), 40 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 6c41b1f..978a1ee 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -228,8 +228,8 @@ again: child = dst->child; dst->hh = NULL; - if (hh && atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + if (hh) + hh_cache_put(hh); if (neigh) { dst->neighbour = NULL; diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 3ffafaa0..2044906 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -709,8 +709,7 @@ void neigh_destroy(struct neighbour *neigh) write_seqlock_bh(&hh->hh_lock); hh->hh_output = neigh_blackhole; write_sequnlock_bh(&hh->hh_lock); - if (atomic_dec_and_test(&hh->hh_refcnt)) - kfree(hh); + hh_cache_put(hh); } skb_queue_purge(&neigh->arp_queue); @@ -1210,39 +1209,67 @@ struct neighbour *neigh_event_ns(struct neigh_table *tbl, } EXPORT_SYMBOL(neigh_event_ns); +static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, + __be16 protocol) +{ + struct hh_cache *hh; + + for (hh = n->hh; hh; hh = hh->hh_next) { + if (hh->hh_type == protocol) { + atomic_inc(&hh->hh_refcnt); + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); + return true; + } + } + return false; +} + +/* called with read_lock_bh(&n->lock); */ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, __be16 protocol) { struct hh_cache *hh; struct net_device *dev = dst->dev; - for (hh = n->hh; hh; hh = hh->hh_next) - if (hh->hh_type == protocol) - break; + if (likely(neigh_hh_lookup(n, dst, protocol))) + return; - if (!hh && (hh = kzalloc(sizeof(*hh), GFP_ATOMIC)) != NULL) { - seqlock_init(&hh->hh_lock); - hh->hh_type = protocol; - atomic_set(&hh->hh_refcnt, 0); - hh->hh_next = NULL; + /* slow path */ + hh = kzalloc(sizeof(*hh), GFP_ATOMIC); + if (!hh) + return; - if (dev->header_ops->cache(n, hh)) { - kfree(hh); - hh = NULL; - } else { - atomic_inc(&hh->hh_refcnt); - hh->hh_next = n->hh; - n->hh = hh; - if (n->nud_state & NUD_CONNECTED) - hh->hh_output = n->ops->hh_output; - else - hh->hh_output = n->ops->output; - } + seqlock_init(&hh->hh_lock); + hh->hh_type = protocol; + atomic_set(&hh->hh_refcnt, 2); + + if (dev->header_ops->cache(n, hh)) { + kfree(hh); + return; } - if (hh) { - atomic_inc(&hh->hh_refcnt); - dst->hh = hh; + read_unlock(&n->lock); + write_lock(&n->lock); + + /* must check if another thread already did the insert */ + if (neigh_hh_lookup(n, dst, protocol)) { + kfree(hh); + goto end; } + + if (n->nud_state & NUD_CONNECTED) + hh->hh_output = n->ops->hh_output; + else + hh->hh_output = n->ops->output; + + hh->hh_next = n->hh; + n->hh = hh; + + if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) + hh_cache_put(hh); +end: + write_unlock(&n->lock); + read_lock(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit @@ -1281,21 +1308,17 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; + + read_lock_bh(&neigh->lock); if (dev->header_ops->cache && !dst->hh && - !(dst->flags & DST_NOCACHE)) { - write_lock_bh(&neigh->lock); - if (!dst->hh) - neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - write_unlock_bh(&neigh->lock); - } else { - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); - } + !(dst->flags & DST_NOCACHE)) + neigh_hh_init(neigh, dst, dst->ops->protocol); + + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + read_unlock_bh(&neigh->lock); + if (err >= 0) rc = neigh->ops->queue_xmit(skb); else -- cgit v1.1 From 7623225f905263424c7254dc6a07bff083a498dd Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Mon, 11 Oct 2010 14:46:52 -0400 Subject: Revert "wireless: Use first phyX name available when registering phy devices." This reverts commit 5a254ffe3ffdfa84fe076009bd8e88da412180d2. The commit failed to take into account that allocated wireless devices (wiphys) are not added into the device list upon allocation, but only when they are registered. Therefore, it opened up a race between allocating and registering a name, so that if two processes allocate and register concurrently ("alloc, alloc, register, register" rather than "alloc, register, alloc, register") the code will attempt to use the same name twice. Signed-off-by: Johannes Berg --- net/wireless/core.c | 54 ++++++++++++++++++++++++----------------------------- 1 file changed, 24 insertions(+), 30 deletions(-) (limited to 'net') diff --git a/net/wireless/core.c b/net/wireless/core.c index 1684ad9..9c21ebf 100644 --- a/net/wireless/core.c +++ b/net/wireless/core.c @@ -178,10 +178,26 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, char *newname) { struct cfg80211_registered_device *rdev2; - int result; + int wiphy_idx, taken = -1, result, digits; assert_cfg80211_lock(); + /* prohibit calling the thing phy%d when %d is not its number */ + sscanf(newname, PHY_NAME "%d%n", &wiphy_idx, &taken); + if (taken == strlen(newname) && wiphy_idx != rdev->wiphy_idx) { + /* count number of places needed to print wiphy_idx */ + digits = 1; + while (wiphy_idx /= 10) + digits++; + /* + * deny the name if it is phy where is printed + * without leading zeroes. taken == strlen(newname) here + */ + if (taken == strlen(PHY_NAME) + digits) + return -EINVAL; + } + + /* Ignore nop renames */ if (strcmp(newname, dev_name(&rdev->wiphy.dev)) == 0) return 0; @@ -189,7 +205,7 @@ int cfg80211_dev_rename(struct cfg80211_registered_device *rdev, /* Ensure another device does not already have this name. */ list_for_each_entry(rdev2, &cfg80211_rdev_list, list) if (strcmp(newname, dev_name(&rdev2->wiphy.dev)) == 0) - return -EEXIST; + return -EINVAL; result = device_rename(&rdev->wiphy.dev, newname); if (result) @@ -304,11 +320,9 @@ static void cfg80211_event_work(struct work_struct *work) struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) { static int wiphy_counter; - int i; - struct cfg80211_registered_device *rdev, *rdev2; + + struct cfg80211_registered_device *rdev; int alloc_size; - char nname[IFNAMSIZ + 1]; - bool found = false; WARN_ON(ops->add_key && (!ops->del_key || !ops->set_default_key)); WARN_ON(ops->auth && (!ops->assoc || !ops->deauth || !ops->disassoc)); @@ -332,37 +346,17 @@ struct wiphy *wiphy_new(const struct cfg80211_ops *ops, int sizeof_priv) if (unlikely(!wiphy_idx_valid(rdev->wiphy_idx))) { wiphy_counter--; - goto too_many_devs; - } - - /* 64k wiphy devices is enough for anyone! */ - for (i = 0; i < 0xFFFF; i++) { - found = false; - snprintf(nname, sizeof(nname)-1, PHY_NAME "%d", i); - nname[sizeof(nname)-1] = 0; - list_for_each_entry(rdev2, &cfg80211_rdev_list, list) - if (strcmp(nname, dev_name(&rdev2->wiphy.dev)) == 0) { - found = true; - break; - } - - if (!found) - break; - } - - if (unlikely(found)) { -too_many_devs: mutex_unlock(&cfg80211_mutex); - /* ugh, too many devices already! */ + /* ugh, wrapped! */ kfree(rdev); return NULL; } - /* give it a proper name */ - dev_set_name(&rdev->wiphy.dev, "%s", nname); - mutex_unlock(&cfg80211_mutex); + /* give it a proper name */ + dev_set_name(&rdev->wiphy.dev, PHY_NAME "%d", rdev->wiphy_idx); + mutex_init(&rdev->mtx); mutex_init(&rdev->devlist_mtx); INIT_LIST_HEAD(&rdev->netdev_list); -- cgit v1.1 From b38afa87698375179026224522c2e48dcbf17e65 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 7 Oct 2010 16:12:06 -0700 Subject: mac80211: Improve mlme probe response log messages. Old messages didn't mention the device in question. Signed-off-by: Ben Greear Signed-off-by: John W. Linville --- net/mac80211/mlme.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index 5695c94..a3a9421 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -1864,10 +1864,12 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) else if (ifmgd->probe_send_count < IEEE80211_MAX_PROBE_TRIES) { #ifdef CONFIG_MAC80211_VERBOSE_DEBUG - printk(KERN_DEBUG "No probe response from AP %pM" - " after %dms, try %d\n", bssid, - (1000 * IEEE80211_PROBE_WAIT)/HZ, - ifmgd->probe_send_count); + wiphy_debug(local->hw.wiphy, + "%s: No probe response from AP %pM" + " after %dms, try %d\n", + sdata->name, + bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ, + ifmgd->probe_send_count); #endif ieee80211_mgd_probe_ap_send(sdata); } else { @@ -1877,9 +1879,11 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata) */ ifmgd->flags &= ~(IEEE80211_STA_CONNECTION_POLL | IEEE80211_STA_BEACON_POLL); - printk(KERN_DEBUG "No probe response from AP %pM" - " after %dms, disconnecting.\n", - bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ); + wiphy_debug(local->hw.wiphy, + "%s: No probe response from AP %pM" + " after %dms, disconnecting.\n", + sdata->name, + bssid, (1000 * IEEE80211_PROBE_WAIT)/HZ); ieee80211_set_disassoc(sdata, true, true); mutex_unlock(&ifmgd->mtx); mutex_lock(&local->mtx); -- cgit v1.1 From 5a5c731aa59cc2c44ca20f45b1a577cd4f5435e2 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 7 Oct 2010 16:39:20 -0700 Subject: wireless: Set some stats used by /proc/net/wireless (wext) Some stats for /proc/net/wireless (and wext in general) are not being set. This patch addresses a few of those with values easily obtained from mac80211 core. Signed-off-by: Ben Greear Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 4 +++- net/wireless/wext-compat.c | 4 ++++ 2 files changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index ecf9b71..25fb351 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -329,7 +329,8 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) STATION_INFO_TX_PACKETS | STATION_INFO_TX_RETRIES | STATION_INFO_TX_FAILED | - STATION_INFO_TX_BITRATE; + STATION_INFO_TX_BITRATE | + STATION_INFO_RX_DROP_MISC; sinfo->inactive_time = jiffies_to_msecs(jiffies - sta->last_rx); sinfo->rx_bytes = sta->rx_bytes; @@ -338,6 +339,7 @@ static void sta_set_sinfo(struct sta_info *sta, struct station_info *sinfo) sinfo->tx_packets = sta->tx_packets; sinfo->tx_retries = sta->tx_retry_count; sinfo->tx_failed = sta->tx_retry_failed; + sinfo->rx_dropped_misc = sta->rx_dropped; if ((sta->local->hw.flags & IEEE80211_HW_SIGNAL_DBM) || (sta->local->hw.flags & IEEE80211_HW_SIGNAL_UNSPEC)) { diff --git a/net/wireless/wext-compat.c b/net/wireless/wext-compat.c index 6002265..12222ee 100644 --- a/net/wireless/wext-compat.c +++ b/net/wireless/wext-compat.c @@ -1366,6 +1366,10 @@ struct iw_statistics *cfg80211_wireless_stats(struct net_device *dev) } wstats.qual.updated |= IW_QUAL_NOISE_INVALID; + if (sinfo.filled & STATION_INFO_RX_DROP_MISC) + wstats.discard.misc = sinfo.rx_dropped_misc; + if (sinfo.filled & STATION_INFO_TX_FAILED) + wstats.discard.retries = sinfo.tx_failed; return &wstats; } -- cgit v1.1 From d12c74528e3065c90df70fbc06ec6ffd6e804738 Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 8 Oct 2010 22:27:07 +0200 Subject: mac80211: fix possible null-pointer de-reference This patch not only fixes a null-pointer de-reference that would be triggered by a PLINK_OPEN frame with mis- matching/incompatible mesh configuration, but also responds correctly to non-compatible PLINK_OPEN frames by generating a PLINK_CLOSE with the right reason code. The original bug was detected by smatch. ( http://repo.or.cz/w/smatch.git ) net/mac80211/mesh_plink.c +574 mesh_rx_plink_frame(168) error: we previously assumed 'sta' could be null. Cc: Reviewed-and-Tested-by: Steve deRosier Reviewed-and-Tested-by: Javier Cardona Acked-by: Johannes Berg Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/mesh_plink.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c index ea13a80..1c91f0f 100644 --- a/net/mac80211/mesh_plink.c +++ b/net/mac80211/mesh_plink.c @@ -412,7 +412,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m enum plink_event event; enum plink_frame_type ftype; size_t baselen; - bool deactivated; + bool deactivated, matches_local = true; u8 ie_len; u8 *baseaddr; __le16 plid, llid, reason; @@ -487,6 +487,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m /* Now we will figure out the appropriate event... */ event = PLINK_UNDEFINED; if (ftype != PLINK_CLOSE && (!mesh_matches_local(&elems, sdata))) { + matches_local = false; switch (ftype) { case PLINK_OPEN: event = OPN_RJCT; @@ -498,7 +499,15 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m /* avoid warning */ break; } - spin_lock_bh(&sta->lock); + } + + if (!sta && !matches_local) { + rcu_read_unlock(); + reason = cpu_to_le16(MESH_CAPABILITY_POLICY_VIOLATION); + llid = 0; + mesh_plink_frame_tx(sdata, PLINK_CLOSE, mgmt->sa, llid, + plid, reason); + return; } else if (!sta) { /* ftype == PLINK_OPEN */ u32 rates; @@ -522,7 +531,7 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m } event = OPN_ACPT; spin_lock_bh(&sta->lock); - } else { + } else if (matches_local) { spin_lock_bh(&sta->lock); switch (ftype) { case PLINK_OPEN: @@ -564,6 +573,8 @@ void mesh_rx_plink_frame(struct ieee80211_sub_if_data *sdata, struct ieee80211_m rcu_read_unlock(); return; } + } else { + spin_lock_bh(&sta->lock); } mpl_dbg("Mesh plink (peer, state, llid, plid, event): %pM %s %d %d %d\n", -- cgit v1.1 From 15943a72c7d2031c9150917ca9161a9f891d455a Mon Sep 17 00:00:00 2001 From: Christian Lamparter Date: Fri, 8 Oct 2010 22:35:09 +0200 Subject: mac80211: temporarily disable reorder release timer Several serve threading problems in the current release reorder timer implementation have been discovered. A lengthy discussion - which lists some of the pitfalls and possible solutions - can be found at: http://marc.info/?t=128635927000001 But due to the complicated nature of the subject and the imminent advent of a new -rc cycle, it was decided to disable the feature for the time being. Signed-off-by: Christian Lamparter Signed-off-by: John W. Linville --- net/mac80211/rx.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) (limited to 'net') diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index b67221d..902b03ee 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -622,6 +622,26 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, tid_agg_rx->buf_size; } + /* + * Disable the reorder release timer for now. + * + * The current implementation lacks a proper locking scheme + * which would protect vital statistic and debug counters + * from being updated by two different but concurrent BHs. + * + * More information about the topic is available from: + * - thread: http://marc.info/?t=128635927000001 + * + * What was wrong: + * => http://marc.info/?l=linux-wireless&m=128636170811964 + * "Basically the thing is that until your patch, the data + * in the struct didn't actually need locking because it + * was accessed by the RX path only which is not concurrent." + * + * List of what needs to be fixed: + * => http://marc.info/?l=linux-wireless&m=128656352920957 + * + if (tid_agg_rx->stored_mpdu_num) { j = index = seq_sub(tid_agg_rx->head_seq_num, tid_agg_rx->ssn) % tid_agg_rx->buf_size; @@ -640,6 +660,10 @@ static void ieee80211_sta_reorder_release(struct ieee80211_hw *hw, } else { del_timer(&tid_agg_rx->reorder_timer); } + */ + +set_release_timer: + return; } /* -- cgit v1.1 From 8610c29a2c9f273886b1c31ae4d92c69d4326262 Mon Sep 17 00:00:00 2001 From: Felix Fietkau Date: Sat, 9 Oct 2010 02:39:29 +0200 Subject: cfg80211: add channel utilization stats to the survey command Using these, user space can calculate a relative channel utilization with arbitrary intervals by regularly taking snapshots of the survey results. Signed-off-by: Felix Fietkau Signed-off-by: John W. Linville --- net/wireless/nl80211.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net') diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 882dc92..c506241 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -3153,6 +3153,21 @@ static int nl80211_send_survey(struct sk_buff *msg, u32 pid, u32 seq, survey->noise); if (survey->filled & SURVEY_INFO_IN_USE) NLA_PUT_FLAG(msg, NL80211_SURVEY_INFO_IN_USE); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME, + survey->channel_time); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_BUSY) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_BUSY, + survey->channel_time_busy); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_EXT_BUSY) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_EXT_BUSY, + survey->channel_time_ext_busy); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_RX) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_RX, + survey->channel_time_rx); + if (survey->filled & SURVEY_INFO_CHANNEL_TIME_TX) + NLA_PUT_U64(msg, NL80211_SURVEY_INFO_CHANNEL_TIME_TX, + survey->channel_time_tx); nla_nest_end(msg, infoattr); -- cgit v1.1 From 730bd83b036e72b0134352ca27e76ea08475fbf1 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Sun, 10 Oct 2010 18:52:10 +0200 Subject: mac80211: don't kmalloc 16 bytes Since this small buffer isn't used for DMA, we can simply allocate it on the stack, it just needs to be 16 bytes of which only 8 will be used for WEP40 keys. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/wep.c | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) (limited to 'net') diff --git a/net/mac80211/wep.c b/net/mac80211/wep.c index f27484c..2ff6d1e 100644 --- a/net/mac80211/wep.c +++ b/net/mac80211/wep.c @@ -222,7 +222,7 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local, struct ieee80211_key *key) { u32 klen; - u8 *rc4key; + u8 rc4key[3 + WLAN_KEY_LEN_WEP104]; u8 keyidx; struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data; unsigned int hdrlen; @@ -245,10 +245,6 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local, klen = 3 + key->conf.keylen; - rc4key = kmalloc(klen, GFP_ATOMIC); - if (!rc4key) - return -1; - /* Prepend 24-bit IV to RC4 key */ memcpy(rc4key, skb->data + hdrlen, 3); @@ -260,8 +256,6 @@ static int ieee80211_wep_decrypt(struct ieee80211_local *local, len)) ret = -1; - kfree(rc4key); - /* Trim ICV */ skb_trim(skb, skb->len - WEP_ICV_LEN); -- cgit v1.1 From 15d46f38df87f89242e470f5797120fa384c1fc3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bj=C3=B6rn=20Smedman?= Date: Sun, 10 Oct 2010 22:14:25 +0200 Subject: mac80211: minstrel_ht A-MPDU fix MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This patch fixes two problems with the minstrel_ht rate control algorithms handling of A-MPDU frames: 1. The ampdu_len field of the tx status is not always initialized for non-HT frames (and it would probably be unreasonable to require all drivers to do so). This could cause rate control statistics to be corrupted. We now trust the ampdu_len and ampdu_ack_len fields only when the frame is marked with the IEEE80211_TX_STAT_AMPDU flag. 2. Successful transmission attempts where only recognized when the A-MPDU subframe carrying the rate control status information was marked with the IEEE80211_TX_STAT_ACK flag. If this information happed to be carried on a frame that failed to be ACKed then the other subframes (which may have succeeded) where not correctly registered. We now update rate control statistics regardless of whether the subframe carrying the information was ACKed or not. Cc: Signed-off-by: Björn Smedman Acked-by: Felix Fietkau Signed-off-by: John W. Linville --- net/mac80211/rc80211_minstrel_ht.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index c5b4659..2a18d66 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -397,8 +397,9 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, !(info->flags & IEEE80211_TX_STAT_AMPDU)) return; - if (!info->status.ampdu_len) { - info->status.ampdu_ack_len = 1; + if (!(info->flags & IEEE80211_TX_STAT_AMPDU)) { + info->status.ampdu_ack_len = + (info->flags & IEEE80211_TX_STAT_ACK ? 1 : 0); info->status.ampdu_len = 1; } @@ -426,7 +427,7 @@ minstrel_ht_tx_status(void *priv, struct ieee80211_supported_band *sband, group = minstrel_ht_get_group_idx(&ar[i]); rate = &mi->groups[group].rates[ar[i].idx % 8]; - if (last && (info->flags & IEEE80211_TX_STAT_ACK)) + if (last) rate->success += info->status.ampdu_ack_len; rate->attempts += ar[i].count * info->status.ampdu_len; -- cgit v1.1 From 0ed8ddf4045fcfcac36bad753dc4046118c603ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 7 Oct 2010 10:44:07 +0000 Subject: neigh: Protect neigh->ha[] with a seqlock Add a seqlock in struct neighbour to protect neigh->ha[], and avoid dirtying neighbour in stress situation (many different flows / dsts) Dirtying takes place because of read_lock(&n->lock) and n->used writes. Switching to a seqlock, and writing n->used only on jiffies changes permits less dirtying. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/neighbour.c | 47 ++++++++++++++++++++++++++++++----------------- net/ipv4/arp.c | 6 ++---- net/sched/sch_teql.c | 8 ++++---- 3 files changed, 36 insertions(+), 25 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index 2044906..b165b96 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -294,6 +294,7 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl) skb_queue_head_init(&n->arp_queue); rwlock_init(&n->lock); + seqlock_init(&n->ha_lock); n->updated = n->used = now; n->nud_state = NUD_NONE; n->output = neigh_blackhole; @@ -1015,7 +1016,7 @@ out_unlock_bh: } EXPORT_SYMBOL(__neigh_event_send); -static void neigh_update_hhs(struct neighbour *neigh) +static void neigh_update_hhs(const struct neighbour *neigh) { struct hh_cache *hh; void (*update)(struct hh_cache*, const struct net_device*, const unsigned char *) @@ -1151,7 +1152,9 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new, } if (lladdr != neigh->ha) { + write_seqlock(&neigh->ha_lock); memcpy(&neigh->ha, lladdr, dev->addr_len); + write_sequnlock(&neigh->ha_lock); neigh_update_hhs(neigh); if (!(new & NUD_CONNECTED)) neigh->confirmed = jiffies - @@ -1214,6 +1217,7 @@ static inline bool neigh_hh_lookup(struct neighbour *n, struct dst_entry *dst, { struct hh_cache *hh; + smp_rmb(); /* paired with smp_wmb() in neigh_hh_init() */ for (hh = n->hh; hh; hh = hh->hh_next) { if (hh->hh_type == protocol) { atomic_inc(&hh->hh_refcnt); @@ -1248,8 +1252,8 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, kfree(hh); return; } - read_unlock(&n->lock); - write_lock(&n->lock); + + write_lock_bh(&n->lock); /* must check if another thread already did the insert */ if (neigh_hh_lookup(n, dst, protocol)) { @@ -1263,13 +1267,13 @@ static void neigh_hh_init(struct neighbour *n, struct dst_entry *dst, hh->hh_output = n->ops->output; hh->hh_next = n->hh; + smp_wmb(); /* paired with smp_rmb() in neigh_hh_lookup() */ n->hh = hh; if (unlikely(cmpxchg(&dst->hh, NULL, hh) != NULL)) hh_cache_put(hh); end: - write_unlock(&n->lock); - read_lock(&n->lock); + write_unlock_bh(&n->lock); } /* This function can be used in contexts, where only old dev_queue_xmit @@ -1308,16 +1312,18 @@ int neigh_resolve_output(struct sk_buff *skb) if (!neigh_event_send(neigh, skb)) { int err; struct net_device *dev = neigh->dev; + unsigned int seq; - read_lock_bh(&neigh->lock); if (dev->header_ops->cache && !dst->hh && !(dst->flags & DST_NOCACHE)) neigh_hh_init(neigh, dst, dst->ops->protocol); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); if (err >= 0) rc = neigh->ops->queue_xmit(skb); @@ -1344,13 +1350,16 @@ int neigh_connected_output(struct sk_buff *skb) struct dst_entry *dst = skb_dst(skb); struct neighbour *neigh = dst->neighbour; struct net_device *dev = neigh->dev; + unsigned int seq; __skb_pull(skb, skb_network_offset(skb)); - read_lock_bh(&neigh->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - neigh->ha, NULL, skb->len); - read_unlock_bh(&neigh->lock); + do { + seq = read_seqbegin(&neigh->ha_lock); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), + neigh->ha, NULL, skb->len); + } while (read_seqretry(&neigh->ha_lock, seq)); + if (err >= 0) err = neigh->ops->queue_xmit(skb); else { @@ -2148,10 +2157,14 @@ static int neigh_fill_info(struct sk_buff *skb, struct neighbour *neigh, read_lock_bh(&neigh->lock); ndm->ndm_state = neigh->nud_state; - if ((neigh->nud_state & NUD_VALID) && - nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, neigh->ha) < 0) { - read_unlock_bh(&neigh->lock); - goto nla_put_failure; + if (neigh->nud_state & NUD_VALID) { + char haddr[MAX_ADDR_LEN]; + + neigh_ha_snapshot(haddr, neigh, neigh->dev); + if (nla_put(skb, NDA_LLADDR, neigh->dev->addr_len, haddr) < 0) { + read_unlock_bh(&neigh->lock); + goto nla_put_failure; + } } ci.ndm_used = jiffies_to_clock_t(now - neigh->used); diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index f353095..d8e540c 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -502,10 +502,8 @@ int arp_find(unsigned char *haddr, struct sk_buff *skb) if (n) { n->used = jiffies; - if (n->nud_state&NUD_VALID || neigh_event_send(n, skb) == 0) { - read_lock_bh(&n->lock); - memcpy(haddr, n->ha, dev->addr_len); - read_unlock_bh(&n->lock); + if (n->nud_state & NUD_VALID || neigh_event_send(n, skb) == 0) { + neigh_ha_snapshot(haddr, n, dev); neigh_release(n); return 0; } diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index feaabc1..401af95 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -241,11 +241,11 @@ __teql_resolve(struct sk_buff *skb, struct sk_buff *skb_res, struct net_device * } if (neigh_event_send(n, skb_res) == 0) { int err; + char haddr[MAX_ADDR_LEN]; - read_lock(&n->lock); - err = dev_hard_header(skb, dev, ntohs(skb->protocol), - n->ha, NULL, skb->len); - read_unlock(&n->lock); + neigh_ha_snapshot(haddr, n, dev); + err = dev_hard_header(skb, dev, ntohs(skb->protocol), haddr, + NULL, skb->len); if (err < 0) { neigh_release(n); -- cgit v1.1 From fc66f95c68b6d4535a0ea2ea15d5cf626e310956 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Oct 2010 06:37:34 +0000 Subject: net dst: use a percpu_counter to track entries struct dst_ops tracks number of allocated dst in an atomic_t field, subject to high cache line contention in stress workload. Switch to a percpu_counter, to reduce number of time we need to dirty a central location. Place it on a separate cache line to avoid dirtying read only fields. Stress test : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE, SLUB/NUMA) Before: real 0m51.179s user 0m15.329s sys 10m15.942s After: real 0m45.570s user 0m15.525s sys 9m56.669s With a small reordering of struct neighbour fields, subject of a following patch, (to separate refcnt from other read mostly fields) real 0m41.841s user 0m15.261s sys 8m45.949s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 11 +++++++++-- net/core/dst.c | 6 +++--- net/decnet/dn_route.c | 3 ++- net/ipv4/route.c | 36 ++++++++++++++++++++++-------------- net/ipv4/xfrm4_policy.c | 4 ++-- net/ipv6/route.c | 28 ++++++++++++++++++++-------- net/ipv6/xfrm6_policy.c | 10 ++++++---- 7 files changed, 64 insertions(+), 34 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 77f7b5f..7f9ce96 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -106,7 +106,6 @@ static struct dst_ops fake_dst_ops = { .family = AF_INET, .protocol = cpu_to_be16(ETH_P_IP), .update_pmtu = fake_update_pmtu, - .entries = ATOMIC_INIT(0), }; /* @@ -1003,15 +1002,22 @@ int __init br_netfilter_init(void) { int ret; - ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + ret = dst_entries_init(&fake_dst_ops); if (ret < 0) return ret; + + ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + if (ret < 0) { + dst_entries_destroy(&fake_dst_ops); + return ret; + } #ifdef CONFIG_SYSCTL brnf_sysctl_header = register_sysctl_paths(brnf_path, brnf_table); if (brnf_sysctl_header == NULL) { printk(KERN_WARNING "br_netfilter: can't register to sysctl.\n"); nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops)); + dst_entries_destroy(&fake_dst_ops); return -ENOMEM; } #endif @@ -1025,4 +1031,5 @@ void br_netfilter_fini(void) #ifdef CONFIG_SYSCTL unregister_sysctl_table(brnf_sysctl_header); #endif + dst_entries_destroy(&fake_dst_ops); } diff --git a/net/core/dst.c b/net/core/dst.c index 978a1ee..32e542d 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -168,7 +168,7 @@ void *dst_alloc(struct dst_ops *ops) { struct dst_entry *dst; - if (ops->gc && atomic_read(&ops->entries) > ops->gc_thresh) { + if (ops->gc && dst_entries_get_fast(ops) > ops->gc_thresh) { if (ops->gc(ops)) return NULL; } @@ -183,7 +183,7 @@ void *dst_alloc(struct dst_ops *ops) #if RT_CACHE_DEBUG >= 2 atomic_inc(&dst_total); #endif - atomic_inc(&ops->entries); + dst_entries_add(ops, 1); return dst; } EXPORT_SYMBOL(dst_alloc); @@ -236,7 +236,7 @@ again: neigh_release(neigh); } - atomic_dec(&dst->ops->entries); + dst_entries_add(dst->ops, -1); if (dst->ops->destroy) dst->ops->destroy(dst); diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c index 6585ea6..df0f3e5 100644 --- a/net/decnet/dn_route.c +++ b/net/decnet/dn_route.c @@ -132,7 +132,6 @@ static struct dst_ops dn_dst_ops = { .negative_advice = dn_dst_negative_advice, .link_failure = dn_dst_link_failure, .update_pmtu = dn_dst_update_pmtu, - .entries = ATOMIC_INIT(0), }; static __inline__ unsigned dn_hash(__le16 src, __le16 dst) @@ -1758,6 +1757,7 @@ void __init dn_route_init(void) dn_dst_ops.kmem_cachep = kmem_cache_create("dn_dst_cache", sizeof(struct dn_route), 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + dst_entries_init(&dn_dst_ops); setup_timer(&dn_route_timer, dn_dst_check_expire, 0); dn_route_timer.expires = jiffies + decnet_dst_gc_interval * HZ; add_timer(&dn_route_timer); @@ -1816,5 +1816,6 @@ void __exit dn_route_cleanup(void) dn_run_flush(0); proc_net_remove(&init_net, "decnet_cache"); + dst_entries_destroy(&dn_dst_ops); } diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 3888f6b..0755aa4 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -159,7 +159,6 @@ static struct dst_ops ipv4_dst_ops = { .link_failure = ipv4_link_failure, .update_pmtu = ip_rt_update_pmtu, .local_out = __ip_local_out, - .entries = ATOMIC_INIT(0), }; #define ECN_OR_COST(class) TC_PRIO_##class @@ -466,7 +465,7 @@ static int rt_cpu_seq_show(struct seq_file *seq, void *v) seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x " " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n", - atomic_read(&ipv4_dst_ops.entries), + dst_entries_get_slow(&ipv4_dst_ops), st->in_hit, st->in_slow_tot, st->in_slow_mc, @@ -945,6 +944,7 @@ static int rt_garbage_collect(struct dst_ops *ops) struct rtable *rth, **rthp; unsigned long now = jiffies; int goal; + int entries = dst_entries_get_fast(&ipv4_dst_ops); /* * Garbage collection is pretty expensive, @@ -954,28 +954,28 @@ static int rt_garbage_collect(struct dst_ops *ops) RT_CACHE_STAT_INC(gc_total); if (now - last_gc < ip_rt_gc_min_interval && - atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) { + entries < ip_rt_max_size) { RT_CACHE_STAT_INC(gc_ignored); goto out; } + entries = dst_entries_get_slow(&ipv4_dst_ops); /* Calculate number of entries, which we want to expire now. */ - goal = atomic_read(&ipv4_dst_ops.entries) - - (ip_rt_gc_elasticity << rt_hash_log); + goal = entries - (ip_rt_gc_elasticity << rt_hash_log); if (goal <= 0) { if (equilibrium < ipv4_dst_ops.gc_thresh) equilibrium = ipv4_dst_ops.gc_thresh; - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + goal = entries - equilibrium; if (goal > 0) { equilibrium += min_t(unsigned int, goal >> 1, rt_hash_mask + 1); - goal = atomic_read(&ipv4_dst_ops.entries) - equilibrium; + goal = entries - equilibrium; } } else { /* We are in dangerous area. Try to reduce cache really * aggressively. */ goal = max_t(unsigned int, goal >> 1, rt_hash_mask + 1); - equilibrium = atomic_read(&ipv4_dst_ops.entries) - goal; + equilibrium = entries - goal; } if (now - last_gc >= ip_rt_gc_min_interval) @@ -1032,14 +1032,16 @@ static int rt_garbage_collect(struct dst_ops *ops) expire >>= 1; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire>> %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, i); + dst_entries_get_fast(&ipv4_dst_ops), goal, i); #endif - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) goto out; } while (!in_softirq() && time_before_eq(jiffies, now)); - if (atomic_read(&ipv4_dst_ops.entries) < ip_rt_max_size) + if (dst_entries_get_fast(&ipv4_dst_ops) < ip_rt_max_size) + goto out; + if (dst_entries_get_slow(&ipv4_dst_ops) < ip_rt_max_size) goto out; if (net_ratelimit()) printk(KERN_WARNING "dst cache overflow\n"); @@ -1049,11 +1051,12 @@ static int rt_garbage_collect(struct dst_ops *ops) work_done: expire += ip_rt_gc_min_interval; if (expire > ip_rt_gc_timeout || - atomic_read(&ipv4_dst_ops.entries) < ipv4_dst_ops.gc_thresh) + dst_entries_get_fast(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh || + dst_entries_get_slow(&ipv4_dst_ops) < ipv4_dst_ops.gc_thresh) expire = ip_rt_gc_timeout; #if RT_CACHE_DEBUG >= 2 printk(KERN_DEBUG "expire++ %u %d %d %d\n", expire, - atomic_read(&ipv4_dst_ops.entries), goal, rover); + dst_entries_get_fast(&ipv4_dst_ops), goal, rover); #endif out: return 0; } @@ -2717,7 +2720,6 @@ static struct dst_ops ipv4_dst_blackhole_ops = { .destroy = ipv4_dst_destroy, .check = ipv4_blackhole_dst_check, .update_pmtu = ipv4_rt_blackhole_update_pmtu, - .entries = ATOMIC_INIT(0), }; @@ -3287,6 +3289,12 @@ int __init ip_rt_init(void) ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep; + if (dst_entries_init(&ipv4_dst_ops) < 0) + panic("IP: failed to allocate ipv4_dst_ops counter\n"); + + if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0) + panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n"); + rt_hash_table = (struct rt_hash_bucket *) alloc_large_system_hash("IP route cache", sizeof(struct rt_hash_bucket), diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index a580349..4464f3b 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -174,7 +174,7 @@ static inline int xfrm4_garbage_collect(struct dst_ops *ops) struct net *net = container_of(ops, struct net, xfrm.xfrm4_dst_ops); xfrm4_policy_afinfo.garbage_collect(net); - return (atomic_read(&ops->entries) > ops->gc_thresh * 2); + return (dst_entries_get_slow(ops) > ops->gc_thresh * 2); } static void xfrm4_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -232,7 +232,6 @@ static struct dst_ops xfrm4_dst_ops = { .ifdown = xfrm4_dst_ifdown, .local_out = __ip_local_out, .gc_thresh = 1024, - .entries = ATOMIC_INIT(0), }; static struct xfrm_policy_afinfo xfrm4_policy_afinfo = { @@ -288,6 +287,7 @@ void __init xfrm4_init(int rt_max_size) * and start cleaning when were 1/2 full */ xfrm4_dst_ops.gc_thresh = rt_max_size/2; + dst_entries_init(&xfrm4_dst_ops); xfrm4_state_init(); xfrm4_policy_init(); diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 17e2179..25661f9 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -109,7 +109,6 @@ static struct dst_ops ip6_dst_ops_template = { .link_failure = ip6_link_failure, .update_pmtu = ip6_rt_update_pmtu, .local_out = __ip6_local_out, - .entries = ATOMIC_INIT(0), }; static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -122,7 +121,6 @@ static struct dst_ops ip6_dst_blackhole_ops = { .destroy = ip6_dst_destroy, .check = ip6_dst_check, .update_pmtu = ip6_rt_blackhole_update_pmtu, - .entries = ATOMIC_INIT(0), }; static struct rt6_info ip6_null_entry_template = { @@ -1058,19 +1056,22 @@ static int ip6_dst_gc(struct dst_ops *ops) int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity; int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout; unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc; + int entries; + entries = dst_entries_get_fast(ops); if (time_after(rt_last_gc + rt_min_interval, now) && - atomic_read(&ops->entries) <= rt_max_size) + entries <= rt_max_size) goto out; net->ipv6.ip6_rt_gc_expire++; fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net); net->ipv6.ip6_rt_last_gc = now; - if (atomic_read(&ops->entries) < ops->gc_thresh) + entries = dst_entries_get_slow(ops); + if (entries < ops->gc_thresh) net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1; out: net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity; - return atomic_read(&ops->entries) > rt_max_size; + return entries > rt_max_size; } /* Clean host part of a prefix. Not necessary in radix tree, @@ -2524,7 +2525,7 @@ static int rt6_stats_seq_show(struct seq_file *seq, void *v) net->ipv6.rt6_stats->fib_rt_alloc, net->ipv6.rt6_stats->fib_rt_entries, net->ipv6.rt6_stats->fib_rt_cache, - atomic_read(&net->ipv6.ip6_dst_ops.entries), + dst_entries_get_slow(&net->ipv6.ip6_dst_ops), net->ipv6.rt6_stats->fib_discarded_routes); return 0; @@ -2666,11 +2667,14 @@ static int __net_init ip6_route_net_init(struct net *net) memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template, sizeof(net->ipv6.ip6_dst_ops)); + if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0) + goto out_ip6_dst_ops; + net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template, sizeof(*net->ipv6.ip6_null_entry), GFP_KERNEL); if (!net->ipv6.ip6_null_entry) - goto out_ip6_dst_ops; + goto out_ip6_dst_entries; net->ipv6.ip6_null_entry->dst.path = (struct dst_entry *)net->ipv6.ip6_null_entry; net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops; @@ -2720,6 +2724,8 @@ out_ip6_prohibit_entry: out_ip6_null_entry: kfree(net->ipv6.ip6_null_entry); #endif +out_ip6_dst_entries: + dst_entries_destroy(&net->ipv6.ip6_dst_ops); out_ip6_dst_ops: goto out; } @@ -2758,10 +2764,14 @@ int __init ip6_route_init(void) if (!ip6_dst_ops_template.kmem_cachep) goto out; - ret = register_pernet_subsys(&ip6_route_net_ops); + ret = dst_entries_init(&ip6_dst_blackhole_ops); if (ret) goto out_kmem_cache; + ret = register_pernet_subsys(&ip6_route_net_ops); + if (ret) + goto out_dst_entries; + ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep; /* Registering of the loopback is done before this portion of code, @@ -2808,6 +2818,8 @@ out_fib6_init: fib6_gc_cleanup(); out_register_subsys: unregister_pernet_subsys(&ip6_route_net_ops); +out_dst_entries: + dst_entries_destroy(&ip6_dst_blackhole_ops); out_kmem_cache: kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep); goto out; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index 39676ea..7e74023 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -199,7 +199,7 @@ static inline int xfrm6_garbage_collect(struct dst_ops *ops) struct net *net = container_of(ops, struct net, xfrm.xfrm6_dst_ops); xfrm6_policy_afinfo.garbage_collect(net); - return atomic_read(&ops->entries) > ops->gc_thresh * 2; + return dst_entries_get_fast(ops) > ops->gc_thresh * 2; } static void xfrm6_update_pmtu(struct dst_entry *dst, u32 mtu) @@ -255,7 +255,6 @@ static struct dst_ops xfrm6_dst_ops = { .ifdown = xfrm6_dst_ifdown, .local_out = __ip6_local_out, .gc_thresh = 1024, - .entries = ATOMIC_INIT(0), }; static struct xfrm_policy_afinfo xfrm6_policy_afinfo = { @@ -312,11 +311,13 @@ int __init xfrm6_init(void) */ gc_thresh = FIB6_TABLE_HASHSZ * 8; xfrm6_dst_ops.gc_thresh = (gc_thresh < 1024) ? 1024 : gc_thresh; + dst_entries_init(&xfrm6_dst_ops); ret = xfrm6_policy_init(); - if (ret) + if (ret) { + dst_entries_destroy(&xfrm6_dst_ops); goto out; - + } ret = xfrm6_state_init(); if (ret) goto out_policy; @@ -341,4 +342,5 @@ void xfrm6_fini(void) //xfrm6_input_fini(); xfrm6_policy_fini(); xfrm6_state_fini(); + dst_entries_destroy(&xfrm6_dst_ops); } -- cgit v1.1 From 0b53d4604ac2b4f2faa9a62a04ea9b383ad2efe0 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:35:40 +0200 Subject: dccp: fix the adjustments to AWL and SWL This fixes a problem and a potential loophole with regard to seqno/ackno validity: currently the initial adjustments to AWL/SWL are only performed once at the begin of the connection, during the handshake. Since the Sequence Window feature is always greater than Wmin=32 (7.5.2), it is however necessary to perform these adjustments at least for the first W/W' (variables as per 7.5.1) packets in the lifetime of a connection. This requirement is complicated by the fact that W/W' can change at any time during the lifetime of a connection. Therefore it is better to perform that safety check each time SWL/AWL are updated, as implemented by the patch. A second problem solved by this patch is that the remote/local Sequence Window feature values (which set the bounds for AWL/SWL/SWH) are undefined until the feature negotiation has completed. During the initial handshake we have more stringent sequence number protection; the changes added by this patch effect that {A,S}W{L,H} are within the correct bounds at the instant that feature negotiation completes (since the SeqWin feature activation handlers call dccp_update_gsr/gss()). Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 20 ++++++++++++++++++++ net/dccp/input.c | 18 ++++++------------ net/dccp/minisocks.c | 30 +++++++++--------------------- 3 files changed, 35 insertions(+), 33 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 019d6ff..e051c77 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -414,6 +414,23 @@ static inline void dccp_update_gsr(struct sock *sk, u64 seq) dp->dccps_gsr = seq; /* Sequence validity window depends on remote Sequence Window (7.5.1) */ dp->dccps_swl = SUB48(ADD48(dp->dccps_gsr, 1), dp->dccps_r_seq_win / 4); + /* + * Adjust SWL so that it is not below ISR. In contrast to RFC 4340, + * 7.5.1 we perform this check beyond the initial handshake: W/W' are + * always > 32, so for the first W/W' packets in the lifetime of a + * connection we always have to adjust SWL. + * A second reason why we are doing this is that the window depends on + * the feature-remote value of Sequence Window: nothing stops the peer + * from updating this value while we are busy adjusting SWL for the + * first W packets (we would have to count from scratch again then). + * Therefore it is safer to always make sure that the Sequence Window + * is not artificially extended by a peer who grows SWL downwards by + * continually updating the feature-remote Sequence-Window. + * If sequence numbers wrap it is bad luck. But that will take a while + * (48 bit), and this measure prevents Sequence-number attacks. + */ + if (before48(dp->dccps_swl, dp->dccps_isr)) + dp->dccps_swl = dp->dccps_isr; dp->dccps_swh = ADD48(dp->dccps_gsr, (3 * dp->dccps_r_seq_win) / 4); } @@ -424,6 +441,9 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) dp->dccps_gss = seq; /* Ack validity window depends on local Sequence Window value (7.5.1) */ dp->dccps_awl = SUB48(ADD48(dp->dccps_gss, 1), dp->dccps_l_seq_win); + /* Adjust AWL so that it is not below ISS - see comment above for SWL */ + if (before48(dp->dccps_awl, dp->dccps_iss)) + dp->dccps_awl = dp->dccps_iss; dp->dccps_awh = dp->dccps_gss; } diff --git a/net/dccp/input.c b/net/dccp/input.c index 10c957a..aecc8c74 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -441,20 +441,14 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; - dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; - dccp_update_gsr(sk, dp->dccps_isr); /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - * - * AWL was adjusted in dccp_v4_connect -acme + * Set ISR, GSR from packet. ISS was set in dccp_v{4,6}_connect + * and GSS in dccp_transmit_skb(). Setting AWL/AWH and SWL/SWH + * is done as part of activating the feature values below, since + * these settings depend on the local/remote Sequence Window + * features, which were undefined or not confirmed until now. */ - dccp_set_seqno(&dp->dccps_swl, - max48(dp->dccps_swl, dp->dccps_isr)); + dp->dccps_gsr = dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 128b089..d7041a0 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -121,30 +121,18 @@ struct sock *dccp_create_openreq_child(struct sock *sk, * * Choose S.ISS (initial seqno) or set from Init Cookies * Initialize S.GAR := S.ISS - * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookies - */ - newdp->dccps_gar = newdp->dccps_iss = dreq->dreq_iss; - dccp_update_gss(newsk, dreq->dreq_iss); - - newdp->dccps_isr = dreq->dreq_isr; - dccp_update_gsr(newsk, dreq->dreq_isr); - - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. + * Set S.ISR, S.GSR from packet (or Init Cookies) + * + * Setting AWL/AWH and SWL/SWH happens as part of the feature + * activation below, as these windows all depend on the local + * and remote Sequence Window feature values (7.5.2). */ - dccp_set_seqno(&newdp->dccps_swl, - max48(newdp->dccps_swl, newdp->dccps_isr)); - dccp_set_seqno(&newdp->dccps_awl, - max48(newdp->dccps_awl, newdp->dccps_iss)); + newdp->dccps_gss = newdp->dccps_iss = dreq->dreq_iss; + newdp->dccps_gar = newdp->dccps_iss; + newdp->dccps_gsr = newdp->dccps_isr = dreq->dreq_isr; /* - * Activate features after initialising the sequence numbers, - * since CCID initialisation may depend on GSS, ISR, ISS etc. + * Activate features: initialise CCIDs, sequence windows etc. */ if (dccp_feat_activate_values(newsk, &dreq->dreq_featneg)) { /* It is still raw copy of parent, so invalidate -- cgit v1.1 From 93344af44c0f649582bf1e3b5ecc45b3d19e98c2 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:36:33 +0200 Subject: dccp: merge now-reduced connect_init() function After moving the assignment of GAR/ISS from dccp_connect_init() to dccp_transmit_skb(), the former function becomes very small, so that a merger with dccp_connect() suggests itself. Signed-off-by: Gerrit Renker --- net/dccp/output.c | 18 +++++------------- 1 file changed, 5 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/dccp/output.c b/net/dccp/output.c index aadbdb5..6993a93 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -474,8 +474,9 @@ int dccp_send_reset(struct sock *sk, enum dccp_reset_codes code) /* * Do all connect socket setups that can be done AF independent. */ -static inline void dccp_connect_init(struct sock *sk) +int dccp_connect(struct sock *sk) { + struct sk_buff *skb; struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -485,22 +486,12 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ - dp->dccps_gar = dp->dccps_iss; - - icsk->icsk_retransmits = 0; -} - -int dccp_connect(struct sock *sk) -{ - struct sk_buff *skb; - struct inet_connection_sock *icsk = inet_csk(sk); - /* do not connect if feature negotiation setup fails */ if (dccp_feat_finalise_settings(dccp_sk(sk))) return -EPROTO; - dccp_connect_init(sk); + /* Initialise GAR as per 8.5; AWL/AWH are set in dccp_transmit_skb() */ + dp->dccps_gar = dp->dccps_iss; skb = alloc_skb(sk->sk_prot->max_header, sk->sk_allocation); if (unlikely(skb == NULL)) @@ -516,6 +507,7 @@ int dccp_connect(struct sock *sk) DCCP_INC_STATS(DCCP_MIB_ACTIVEOPENS); /* Timer for repeating the REQUEST until an answer. */ + icsk->icsk_retransmits = 0; inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, DCCP_RTO_MAX); return 0; -- cgit v1.1 From baf9e782e1dc4991edecfa3b8700cf8739c40259 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:37:38 +0200 Subject: dccp: remove unused argument in CCID tx function This removes the argument `more' from ccid_hc_tx_packet_sent, since it was nowhere used in the entire code. (Btw, this argument was not even used in the original KAME code where the function initially came from; compare the variable moreToSend in the freebsd61-dccp-kame-28.08.2006.patch kept by Emmanuel Lochin.) Signed-off-by: Gerrit Renker --- net/dccp/ccid.h | 6 +++--- net/dccp/ccids/ccid2.c | 2 +- net/dccp/ccids/ccid3.c | 3 +-- net/dccp/output.c | 2 +- 4 files changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index 6d16a90..117fb09 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -73,7 +73,7 @@ struct ccid_operations { int (*ccid_hc_tx_send_packet)(struct sock *sk, struct sk_buff *skb); void (*ccid_hc_tx_packet_sent)(struct sock *sk, - int more, unsigned int len); + unsigned int len); void (*ccid_hc_rx_get_info)(struct sock *sk, struct tcp_info *info); void (*ccid_hc_tx_get_info)(struct sock *sk, @@ -144,10 +144,10 @@ static inline int ccid_hc_tx_send_packet(struct ccid *ccid, struct sock *sk, } static inline void ccid_hc_tx_packet_sent(struct ccid *ccid, struct sock *sk, - int more, unsigned int len) + unsigned int len) { if (ccid->ccid_ops->ccid_hc_tx_packet_sent != NULL) - ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, more, len); + ccid->ccid_ops->ccid_hc_tx_packet_sent(sk, len); } static inline void ccid_hc_rx_packet_recv(struct ccid *ccid, struct sock *sk, diff --git a/net/dccp/ccids/ccid2.c b/net/dccp/ccids/ccid2.c index dc18172..d850e29 100644 --- a/net/dccp/ccids/ccid2.c +++ b/net/dccp/ccids/ccid2.c @@ -151,7 +151,7 @@ out: sock_put(sk); } -static void ccid2_hc_tx_packet_sent(struct sock *sk, int more, unsigned int len) +static void ccid2_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct dccp_sock *dp = dccp_sk(sk); struct ccid2_hc_tx_sock *hc = ccid2_hc_tx_sk(sk); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index c3f3a25..3060a60 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -351,8 +351,7 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, struct sk_buff *skb) return 0; } -static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, - unsigned int len) +static void ccid3_hc_tx_packet_sent(struct sock *sk, unsigned int len) { struct ccid3_hc_tx_sock *hc = ccid3_hc_tx_sk(sk); diff --git a/net/dccp/output.c b/net/dccp/output.c index 6993a93..a988fe9 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -304,7 +304,7 @@ void dccp_write_xmit(struct sock *sk, int block) dcb->dccpd_type = DCCP_PKT_DATA; err = dccp_transmit_skb(sk, skb); - ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, 0, len); + ccid_hc_tx_packet_sent(dp->dccps_hc_tx_ccid, sk, len); if (err) DCCP_BUG("err=%d after ccid_hc_tx_packet_sent", err); -- cgit v1.1 From d196c9a5d4e150cdff675662214c80c69b906958 Mon Sep 17 00:00:00 2001 From: Ivo Calado Date: Mon, 11 Oct 2010 20:40:04 +0200 Subject: dccp: generalise data-loss condition This patch generalises the task of determining data loss from RFC 4340, 7.7.1. Let S_A, S_B be sequence numbers such that S_B is "after" S_A, and let N_B be the NDP count of packet S_B. Then, using modulo-2^48 arithmetic, D = S_B - S_A - 1 is an upper bound of the number of lost data packets, D - N_B is an approximation of the number of lost data packets (there are cases where this is not exact). The patch implements this as dccp_loss_count(S_A, S_B, N_B) := max(S_B - S_A - 1 - N_B, 0) Signed-off-by: Ivo Calado Signed-off-by: Erivaldo Xavier Signed-off-by: Leandro Sales Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e051c77..60f4f96 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -153,18 +153,27 @@ static inline u64 max48(const u64 seq1, const u64 seq2) } /** - * dccp_loss_free - Evaluates condition for data loss from RFC 4340, 7.7.1 - * @s1: start sequence number - * @s2: end sequence number + * dccp_loss_count - Approximate the number of lost data packets in a burst loss + * @s1: last known sequence number before the loss ('hole') + * @s2: first sequence number seen after the 'hole' * @ndp: NDP count on packet with sequence number @s2 - * Returns true if the sequence range s1...s2 has no data loss. */ -static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) +static inline u64 dccp_loss_count(const u64 s1, const u64 s2, const u64 ndp) { s64 delta = dccp_delta_seqno(s1, s2); WARN_ON(delta < 0); - return (u64)delta <= ndp + 1; + delta -= ndp + 1; + + return delta > 0 ? delta : 0; +} + +/** + * dccp_loss_free - Evaluate condition for data loss from RFC 4340, 7.7.1 + */ +static inline bool dccp_loss_free(const u64 s1, const u64 s2, const u64 ndp) +{ + return dccp_loss_count(s1, s2, ndp) == 0; } enum { -- cgit v1.1 From ecdfbdabbe4e0cf0443cbbea2df1bf51bf67f3f3 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:41:13 +0200 Subject: dccp: schedule an Ack when receiving timestamps This schedules an Ack when receiving a timestamp, exploiting the existing inet_csk_schedule_ack() function, saving one case in the `dccp_ack_pending()' function. Signed-off-by: Gerrit Renker --- net/dccp/dccp.h | 3 +-- net/dccp/options.c | 2 ++ 2 files changed, 3 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 60f4f96..3eb264b 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -459,8 +459,7 @@ static inline void dccp_update_gss(struct sock *sk, u64 seq) static inline int dccp_ack_pending(const struct sock *sk) { const struct dccp_sock *dp = dccp_sk(sk); - return dp->dccps_timestamp_echo != 0 || - (dp->dccps_hc_rx_ackvec != NULL && + return (dp->dccps_hc_rx_ackvec != NULL && dccp_ackvec_pending(dp->dccps_hc_rx_ackvec)) || inet_csk_ack_scheduled(sk); } diff --git a/net/dccp/options.c b/net/dccp/options.c index d4b1ae0..cd30618 100644 --- a/net/dccp/options.c +++ b/net/dccp/options.c @@ -163,6 +163,8 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq, dccp_role(sk), ntohl(opt_val), (unsigned long long) DCCP_SKB_CB(skb)->dccpd_ack_seq); + /* schedule an Ack in case this sender is quiescent */ + inet_csk_schedule_ack(sk); break; case DCCPO_TIMESTAMP_ECHO: if (len != 4 && len != 6 && len != 8) -- cgit v1.1 From 2f34b32977ade4249601f35f7eb0cdd56b4e0f89 Mon Sep 17 00:00:00 2001 From: Gerrit Renker Date: Mon, 11 Oct 2010 20:44:42 +0200 Subject: dccp: cosmetics - warning format This omits the redundant "DCCP:" in warning messages, since DCCP_WARN() already echoes the function name, avoiding messages like kernel: [10988.766503] dccp_close: DCCP: ABORT -- 209 bytes unread Signed-off-by: Gerrit Renker --- net/dccp/input.c | 2 +- net/dccp/proto.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/dccp/input.c b/net/dccp/input.c index aecc8c74..2659853 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -259,7 +259,7 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) sysctl_dccp_sync_ratelimit))) return 0; - DCCP_WARN("DCCP: Step 6 failed for %s packet, " + DCCP_WARN("Step 6 failed for %s packet, " "(LSWL(%llu) <= P.seqno(%llu) <= S.SWH(%llu)) and " "(P.ackno %s or LAWL(%llu) <= P.ackno(%llu) <= S.AWH(%llu), " "sending SYNC...\n", dccp_packet_name(dh->dccph_type), diff --git a/net/dccp/proto.c b/net/dccp/proto.c index b054ba1..7e5fc04 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -944,7 +944,7 @@ void dccp_close(struct sock *sk, long timeout) if (data_was_unread) { /* Unread data was tossed, send an appropriate Reset Code */ - DCCP_WARN("DCCP: ABORT -- %u bytes unread\n", data_was_unread); + DCCP_WARN("ABORT with %u bytes unread\n", data_was_unread); dccp_send_reset(sk, DCCP_RESET_CODE_ABORTED); dccp_set_state(sk, DCCP_CLOSED); } else if (sock_flag(sk, SOCK_LINGER) && !sk->sk_lingertime) { -- cgit v1.1 From 8f1e1742233cd1c3444dfc6c945a2efb2814e157 Mon Sep 17 00:00:00 2001 From: David Vrabel Date: Mon, 9 Aug 2010 17:38:10 -0400 Subject: Bluetooth: HCI devices are either BR/EDR or AMP radios HCI transport drivers may not know what type of radio an AMP device has so only say whether they're BR/EDR or AMP devices. Signed-off-by: David Vrabel Signed-off-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/hci_sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 8fb967b..1a9f0db 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -196,8 +196,8 @@ static inline char *host_typetostr(int type) switch (type) { case HCI_BREDR: return "BR/EDR"; - case HCI_80211: - return "802.11"; + case HCI_AMP: + return "AMP"; default: return "UNKNOWN"; } -- cgit v1.1 From 08601469a5aac8b3453d8aca0b8f9bd8dfdae12c Mon Sep 17 00:00:00 2001 From: Yuri Kululin Date: Fri, 23 Jul 2010 13:57:12 +0400 Subject: Bluetooth: Fix RFCOMM RPN negotiation According to the ETSI 3GPP TS 07.10 the default bit rate value for RFCOMM is 9600 bit/s. Return this bit rate in case of RPN request and accept other sane bit rates proposed by the sender in RPM command. Signed-off-by: Yuri Kululin Signed-off-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/rfcomm/core.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 15ea84b..b464843 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1415,8 +1415,8 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ return 0; if (len == 1) { - /* This is a request, return default settings */ - bit_rate = RFCOMM_RPN_BR_115200; + /* This is a request, return default (according to ETSI TS 07.10) settings */ + bit_rate = RFCOMM_RPN_BR_9600; data_bits = RFCOMM_RPN_DATA_8; stop_bits = RFCOMM_RPN_STOP_1; parity = RFCOMM_RPN_PARITY_NONE; @@ -1431,9 +1431,9 @@ static int rfcomm_recv_rpn(struct rfcomm_session *s, int cr, int len, struct sk_ if (rpn->param_mask & cpu_to_le16(RFCOMM_RPN_PM_BITRATE)) { bit_rate = rpn->bit_rate; - if (bit_rate != RFCOMM_RPN_BR_115200) { + if (bit_rate > RFCOMM_RPN_BR_230400) { BT_DBG("RPN bit rate mismatch 0x%x", bit_rate); - bit_rate = RFCOMM_RPN_BR_115200; + bit_rate = RFCOMM_RPN_BR_9600; rpn_mask ^= RFCOMM_RPN_PM_BITRATE; } } -- cgit v1.1 From 0fba2558cbb3fae3a09147b7d6e18a6204204e0e Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Wed, 8 Sep 2010 10:05:26 -0700 Subject: Bluetooth: Validate PSM values in calls to connect() and bind() Valid L2CAP PSMs are odd numbers, and the least significant bit of the most significant byte must be 0. Signed-off-by: Mat Martineau Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 25 +++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 0b54b7d..2f625b8 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -1008,10 +1008,20 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen) goto done; } - if (la.l2_psm && __le16_to_cpu(la.l2_psm) < 0x1001 && - !capable(CAP_NET_BIND_SERVICE)) { - err = -EACCES; - goto done; + if (la.l2_psm) { + __u16 psm = __le16_to_cpu(la.l2_psm); + + /* PSM must be odd and lsb of upper byte must be 0 */ + if ((psm & 0x0101) != 0x0001) { + err = -EINVAL; + goto done; + } + + /* Restrict usage of well-known PSMs */ + if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE)) { + err = -EACCES; + goto done; + } } write_lock_bh(&l2cap_sk_list.lock); @@ -1190,6 +1200,13 @@ static int l2cap_sock_connect(struct socket *sock, struct sockaddr *addr, int al goto done; } + /* PSM must be odd and lsb of upper byte must be 0 */ + if ((__le16_to_cpu(la.l2_psm) & 0x0101) != 0x0001 && + sk->sk_type != SOCK_RAW) { + err = -EINVAL; + goto done; + } + /* Set destination address and psm */ bacpy(&bt_sk(sk)->dst, &la.l2_bdaddr); l2cap_pi(sk)->psm = la.l2_psm; -- cgit v1.1 From 796c86eec84ddfd02281c5071838ed1fefda6b90 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Wed, 8 Sep 2010 10:05:27 -0700 Subject: Bluetooth: Add common code for stream-oriented recvmsg() This commit adds a bt_sock_stream_recvmsg() function for use by any Bluetooth code that uses SOCK_STREAM sockets. This code is copied from rfcomm_sock_recvmsg() with minimal modifications to remove RFCOMM-specific functionality and improve readability. L2CAP (with the SOCK_STREAM socket type) and RFCOMM have common needs when it comes to reading data. Proper stream read semantics require that applications can read from a stream one byte at a time and not lose any data. The RFCOMM code already operated on and pulled data from the underlying L2CAP socket, so very few changes were required to make the code more generic for use with non-RFCOMM data over L2CAP. Applications that need more awareness of L2CAP frame boundaries are still free to use SOCK_SEQPACKET sockets, and may verify that they connection did not fall back to basic mode by calling getsockopt(). Signed-off-by: Mat Martineau Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/af_bluetooth.c | 109 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 109 insertions(+) (limited to 'net') diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ed0f22f..c4cf3f5 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -265,6 +265,115 @@ int bt_sock_recvmsg(struct kiocb *iocb, struct socket *sock, } EXPORT_SYMBOL(bt_sock_recvmsg); +static long bt_sock_data_wait(struct sock *sk, long timeo) +{ + DECLARE_WAITQUEUE(wait, current); + + add_wait_queue(sk_sleep(sk), &wait); + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); + + if (!skb_queue_empty(&sk->sk_receive_queue)) + break; + + if (sk->sk_err || (sk->sk_shutdown & RCV_SHUTDOWN)) + break; + + if (signal_pending(current) || !timeo) + break; + + set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + release_sock(sk); + timeo = schedule_timeout(timeo); + lock_sock(sk); + clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); + } + + __set_current_state(TASK_RUNNING); + remove_wait_queue(sk_sleep(sk), &wait); + return timeo; +} + +int bt_sock_stream_recvmsg(struct kiocb *iocb, struct socket *sock, + struct msghdr *msg, size_t size, int flags) +{ + struct sock *sk = sock->sk; + int err = 0; + size_t target, copied = 0; + long timeo; + + if (flags & MSG_OOB) + return -EOPNOTSUPP; + + msg->msg_namelen = 0; + + BT_DBG("sk %p size %zu", sk, size); + + lock_sock(sk); + + target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); + timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); + + do { + struct sk_buff *skb; + int chunk; + + skb = skb_dequeue(&sk->sk_receive_queue); + if (!skb) { + if (copied >= target) + break; + + if ((err = sock_error(sk)) != 0) + break; + if (sk->sk_shutdown & RCV_SHUTDOWN) + break; + + err = -EAGAIN; + if (!timeo) + break; + + timeo = bt_sock_data_wait(sk, timeo); + + if (signal_pending(current)) { + err = sock_intr_errno(timeo); + goto out; + } + continue; + } + + chunk = min_t(unsigned int, skb->len, size); + if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { + skb_queue_head(&sk->sk_receive_queue, skb); + if (!copied) + copied = -EFAULT; + break; + } + copied += chunk; + size -= chunk; + + sock_recv_ts_and_drops(msg, sk, skb); + + if (!(flags & MSG_PEEK)) { + skb_pull(skb, chunk); + if (skb->len) { + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + kfree_skb(skb); + + } else { + /* put message back and return */ + skb_queue_head(&sk->sk_receive_queue, skb); + break; + } + } while (size); + +out: + release_sock(sk); + return copied ? : err; +} +EXPORT_SYMBOL(bt_sock_stream_recvmsg); + static inline unsigned int bt_accept_poll(struct sock *parent) { struct list_head *p, *n; -- cgit v1.1 From 3d7d01dffec4a6757ed1e3182f01c7ef5caa2539 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Wed, 8 Sep 2010 10:05:28 -0700 Subject: Bluetooth: Use common SOCK_STREAM receive code in RFCOMM To reduce code duplication, have rfcomm_sock_recvmsg() call bt_sock_stream_recvmsg(). The common bt_sock_stream_recvmsg() code is nearly identical, with the RFCOMM-specific functionality for deferred setup and connection unthrottling left in rfcomm_sock_recvmsg(). Signed-off-by: Mat Martineau Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/rfcomm/sock.c | 104 +++----------------------------------------- 1 file changed, 6 insertions(+), 98 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 194b3a0..aec505f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -621,121 +621,29 @@ static int rfcomm_sock_sendmsg(struct kiocb *iocb, struct socket *sock, return sent; } -static long rfcomm_sock_data_wait(struct sock *sk, long timeo) -{ - DECLARE_WAITQUEUE(wait, current); - - add_wait_queue(sk_sleep(sk), &wait); - for (;;) { - set_current_state(TASK_INTERRUPTIBLE); - - if (!skb_queue_empty(&sk->sk_receive_queue) || - sk->sk_err || - (sk->sk_shutdown & RCV_SHUTDOWN) || - signal_pending(current) || - !timeo) - break; - - set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - release_sock(sk); - timeo = schedule_timeout(timeo); - lock_sock(sk); - clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags); - } - - __set_current_state(TASK_RUNNING); - remove_wait_queue(sk_sleep(sk), &wait); - return timeo; -} - static int rfcomm_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr *msg, size_t size, int flags) { struct sock *sk = sock->sk; struct rfcomm_dlc *d = rfcomm_pi(sk)->dlc; - int err = 0; - size_t target, copied = 0; - long timeo; + int len; if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { rfcomm_dlc_accept(d); return 0; } - if (flags & MSG_OOB) - return -EOPNOTSUPP; - - msg->msg_namelen = 0; - - BT_DBG("sk %p size %zu", sk, size); + len = bt_sock_stream_recvmsg(iocb, sock, msg, size, flags); lock_sock(sk); + if (!(flags & MSG_PEEK) && len > 0) + atomic_sub(len, &sk->sk_rmem_alloc); - target = sock_rcvlowat(sk, flags & MSG_WAITALL, size); - timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT); - - do { - struct sk_buff *skb; - int chunk; - - skb = skb_dequeue(&sk->sk_receive_queue); - if (!skb) { - if (copied >= target) - break; - - if ((err = sock_error(sk)) != 0) - break; - if (sk->sk_shutdown & RCV_SHUTDOWN) - break; - - err = -EAGAIN; - if (!timeo) - break; - - timeo = rfcomm_sock_data_wait(sk, timeo); - - if (signal_pending(current)) { - err = sock_intr_errno(timeo); - goto out; - } - continue; - } - - chunk = min_t(unsigned int, skb->len, size); - if (memcpy_toiovec(msg->msg_iov, skb->data, chunk)) { - skb_queue_head(&sk->sk_receive_queue, skb); - if (!copied) - copied = -EFAULT; - break; - } - copied += chunk; - size -= chunk; - - sock_recv_ts_and_drops(msg, sk, skb); - - if (!(flags & MSG_PEEK)) { - atomic_sub(chunk, &sk->sk_rmem_alloc); - - skb_pull(skb, chunk); - if (skb->len) { - skb_queue_head(&sk->sk_receive_queue, skb); - break; - } - kfree_skb(skb); - - } else { - /* put message back and return */ - skb_queue_head(&sk->sk_receive_queue, skb); - break; - } - } while (size); - -out: if (atomic_read(&sk->sk_rmem_alloc) <= (sk->sk_rcvbuf >> 2)) rfcomm_dlc_unthrottle(rfcomm_pi(sk)->dlc); - release_sock(sk); - return copied ? : err; + + return len; } static int rfcomm_sock_setsockopt_old(struct socket *sock, int optname, char __user *optval, unsigned int optlen) -- cgit v1.1 From 6fdf482bb3298267816be59d9a3dce29632382d3 Mon Sep 17 00:00:00 2001 From: Mat Martineau Date: Wed, 8 Sep 2010 10:05:29 -0700 Subject: Bluetooth: Use a stream-oriented recvmsg with SOCK_STREAM L2CAP sockets. L2CAP ERTM sockets can be opened with the SOCK_STREAM socket type, which is a mandatory request for ERTM mode. However, these sockets still have SOCK_SEQPACKET read semantics when bt_sock_recvmsg() is used to pull data from the receive queue. If the application is only reading part of a frame, then the unread portion of the frame is discarded. If the application requests more bytes than are in the current frame, only the current frame's data is returned. This patch utilizes common code derived from RFCOMM's recvmsg() function to make L2CAP SOCK_STREAM reads behave like RFCOMM reads (and other SOCK_STREAM sockets in general). The application may read one byte at a time from the input stream and not lose any data, and may also read across L2CAP frame boundaries. Signed-off-by: Mat Martineau Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 2f625b8..a9e7470 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -1951,6 +1951,9 @@ static int l2cap_sock_recvmsg(struct kiocb *iocb, struct socket *sock, struct ms release_sock(sk); + if (sock->type == SOCK_STREAM) + return bt_sock_stream_recvmsg(iocb, sock, msg, len, flags); + return bt_sock_recvmsg(iocb, sock, msg, len, flags); } -- cgit v1.1 From 5017d8dde13313cab1e607379536a7930ed682b0 Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Wed, 8 Sep 2010 16:26:53 +0300 Subject: Bluetooth: remove extra newline from debug output Signed-off-by: Andrei Emeltchenko Acked-by: Ville Tervo Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/rfcomm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index b464843..4a22b14 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -1698,7 +1698,7 @@ static int rfcomm_recv_frame(struct rfcomm_session *s, struct sk_buff *skb) break; default: - BT_ERR("Unknown packet type 0x%02x\n", type); + BT_ERR("Unknown packet type 0x%02x", type); break; } kfree_skb(skb); -- cgit v1.1 From 80e2c88803c72747cb19febe2ad708bf0ad557bf Mon Sep 17 00:00:00 2001 From: Johan Hedberg Date: Thu, 9 Sep 2010 10:32:39 +0300 Subject: Bluetooth: Don't clear the blacklist when closing the HCI device Clearing the blacklist in hci_dev_do_close() would mean that user space needs to do extra work to re-block devices after a DEVDOWN-DEVUP cycle. This patch removes the clearing of the blacklist in this case and thereby saves user space from the extra work. Signed-off-by: Johan Hedberg Acked-by: Ville Tervo Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/hci_core.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index c52f091..bc2a052 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -562,7 +562,6 @@ static int hci_dev_do_close(struct hci_dev *hdev) hci_dev_lock_bh(hdev); inquiry_cache_flush(hdev); hci_conn_hash_flush(hdev); - hci_blacklist_clear(hdev); hci_dev_unlock_bh(hdev); hci_notify(hdev, HCI_DEV_DOWN); -- cgit v1.1 From 8979481328dc2e14cea9f99b3562ffcf8655998e Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Wed, 15 Sep 2010 14:28:44 +0300 Subject: Bluetooth: check L2CAP length in first ACL fragment Current Bluetooth code assembles fragments of big L2CAP packets in l2cap_recv_acldata and then checks allowed L2CAP size in assemled L2CAP packet (pi->imtu < skb->len). The patch moves allowed L2CAP size check to the early stage when we receive the first fragment of L2CAP packet. We do not need to reserve and keep L2CAP fragments for bad packets. Updated version after comments from Mat Martineau and Gustavo Padovan . Trace below is received when using stress tools sending big fragmented L2CAP packets. ... [ 1712.798492] swapper: page allocation failure. order:4, mode:0x4020 [ 1712.804809] [] (unwind_backtrace+0x0/0xdc) from [] (__alloc_pages_nodemask+0x4) [ 1712.814666] [] (__alloc_pages_nodemask+0x47c/0x4d4) from [] (__get_free_pages+) [ 1712.824645] [] (__get_free_pages+0x10/0x3c) from [] (__alloc_skb+0x4c/0xfc) [ 1712.833465] [] (__alloc_skb+0x4c/0xfc) from [] (l2cap_recv_acldata+0xf0/0x1f8 ) [ 1712.843322] [] (l2cap_recv_acldata+0xf0/0x1f8 [l2cap]) from [] (hci_rx_task+0x) ... Signed-off-by: Andrei Emeltchenko Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index a9e7470..5441083 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -4663,6 +4663,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl if (flags & ACL_START) { struct l2cap_hdr *hdr; + struct sock *sk; + u16 cid; int len; if (conn->rx_len) { @@ -4681,6 +4683,7 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl hdr = (struct l2cap_hdr *) skb->data; len = __le16_to_cpu(hdr->len) + L2CAP_HDR_SIZE; + cid = __le16_to_cpu(hdr->cid); if (len == skb->len) { /* Complete frame received */ @@ -4697,6 +4700,19 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl goto drop; } + sk = l2cap_get_chan_by_scid(&conn->chan_list, cid); + + if (sk && l2cap_pi(sk)->imtu < len - L2CAP_HDR_SIZE) { + BT_ERR("Frame exceeding recv MTU (len %d, MTU %d)", + len, l2cap_pi(sk)->imtu); + bh_unlock_sock(sk); + l2cap_conn_unreliable(conn, ECOMM); + goto drop; + } + + if (sk) + bh_unlock_sock(sk); + /* Allocate skb for the complete frame (with header) */ conn->rx_skb = bt_skb_alloc(len, GFP_ATOMIC); if (!conn->rx_skb) -- cgit v1.1 From aae7fe22a875a84e328469e228cba033ebbf20cb Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Wed, 15 Sep 2010 14:28:43 +0300 Subject: Bluetooth: check for l2cap header in start fragment BLUETOOTH SPECIFICATION Version 4.0 [Vol 3] page 36 mentioned "Note: Start Fragments always begin with the Basic L2CAP header of a PDU." Signed-off-by: Andrei Emeltchenko Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 5441083..7da6432 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -4675,7 +4675,8 @@ static int l2cap_recv_acldata(struct hci_conn *hcon, struct sk_buff *skb, u16 fl l2cap_conn_unreliable(conn, ECOMM); } - if (skb->len < 2) { + /* Start fragment always begin with Basic L2CAP header */ + if (skb->len < L2CAP_HDR_SIZE) { BT_ERR("Frame is too short (len %d)", skb->len); l2cap_conn_unreliable(conn, ECOMM); goto drop; -- cgit v1.1 From cb810a189da312d4074cb3fc8b0267b7115e7be1 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Fri, 3 Sep 2010 17:51:57 -0300 Subject: Bluetooth: remove unused variable from cmtp A value was attributed to 'src', but no one was using. Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/cmtp/core.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index d4c6af0..d461606 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -321,12 +321,11 @@ static int cmtp_session(void *arg) int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) { struct cmtp_session *session, *s; - bdaddr_t src, dst; + bdaddr_t dst; int i, err; BT_DBG(""); - baswap(&src, &bt_sk(sock->sk)->src); baswap(&dst, &bt_sk(sock->sk)->dst); session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); -- cgit v1.1 From d6b2eb2f893547d7488d31a7088d78dd77ab5995 Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Fri, 3 Sep 2010 18:29:46 -0300 Subject: Bluetooth: make batostr() print in the right order The Bluetooth core uses the the BD_ADDR in the opposite order from the human readable order. So we are changing batostr() to print in the correct order and then removing some baswap(), as they are not needed anymore. Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/cmtp/core.c | 5 +---- net/bluetooth/hci_sysfs.c | 17 ++++------------- net/bluetooth/hidp/core.c | 8 ++------ net/bluetooth/lib.c | 4 ++-- net/bluetooth/rfcomm/tty.c | 4 +--- 5 files changed, 10 insertions(+), 28 deletions(-) (limited to 'net') diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index d461606..ec0a134 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -321,13 +321,10 @@ static int cmtp_session(void *arg) int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) { struct cmtp_session *session, *s; - bdaddr_t dst; int i, err; BT_DBG(""); - baswap(&dst, &bt_sk(sock->sk)->dst); - session = kzalloc(sizeof(struct cmtp_session), GFP_KERNEL); if (!session) return -ENOMEM; @@ -346,7 +343,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) BT_DBG("mtu %d", session->mtu); - sprintf(session->name, "%s", batostr(&dst)); + sprintf(session->name, "%s", batostr(&bt_sk(sock->sk)->dst)); session->sock = sock; session->state = BT_CONFIG; diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 1a9f0db..5fce3d6 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -37,9 +37,7 @@ static ssize_t show_link_type(struct device *dev, struct device_attribute *attr, static ssize_t show_link_address(struct device *dev, struct device_attribute *attr, char *buf) { struct hci_conn *conn = dev_get_drvdata(dev); - bdaddr_t bdaddr; - baswap(&bdaddr, &conn->dst); - return sprintf(buf, "%s\n", batostr(&bdaddr)); + return sprintf(buf, "%s\n", batostr(&conn->dst)); } static ssize_t show_link_features(struct device *dev, struct device_attribute *attr, char *buf) @@ -238,9 +236,7 @@ static ssize_t show_class(struct device *dev, struct device_attribute *attr, cha static ssize_t show_address(struct device *dev, struct device_attribute *attr, char *buf) { struct hci_dev *hdev = dev_get_drvdata(dev); - bdaddr_t bdaddr; - baswap(&bdaddr, &hdev->bdaddr); - return sprintf(buf, "%s\n", batostr(&bdaddr)); + return sprintf(buf, "%s\n", batostr(&hdev->bdaddr)); } static ssize_t show_features(struct device *dev, struct device_attribute *attr, char *buf) @@ -408,10 +404,8 @@ static int inquiry_cache_show(struct seq_file *f, void *p) for (e = cache->list; e; e = e->next) { struct inquiry_data *data = &e->data; - bdaddr_t bdaddr; - baswap(&bdaddr, &data->bdaddr); seq_printf(f, "%s %d %d %d 0x%.2x%.2x%.2x 0x%.4x %d %d %u\n", - batostr(&bdaddr), + batostr(&data->bdaddr), data->pscan_rep_mode, data->pscan_period_mode, data->pscan_mode, data->dev_class[2], data->dev_class[1], data->dev_class[0], @@ -445,13 +439,10 @@ static int blacklist_show(struct seq_file *f, void *p) list_for_each(l, &hdev->blacklist) { struct bdaddr_list *b; - bdaddr_t bdaddr; b = list_entry(l, struct bdaddr_list, list); - baswap(&bdaddr, &b->bdaddr); - - seq_printf(f, "%s\n", batostr(&bdaddr)); + seq_printf(f, "%s\n", batostr(&b->bdaddr)); } hci_dev_unlock_bh(hdev); diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index bfe641b..c0ee8b3 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -758,7 +758,6 @@ static int hidp_setup_hid(struct hidp_session *session, struct hidp_connadd_req *req) { struct hid_device *hid; - bdaddr_t src, dst; int err; session->rd_data = kzalloc(req->rd_size, GFP_KERNEL); @@ -781,9 +780,6 @@ static int hidp_setup_hid(struct hidp_session *session, hid->driver_data = session; - baswap(&src, &bt_sk(session->ctrl_sock->sk)->src); - baswap(&dst, &bt_sk(session->ctrl_sock->sk)->dst); - hid->bus = BUS_BLUETOOTH; hid->vendor = req->vendor; hid->product = req->product; @@ -791,8 +787,8 @@ static int hidp_setup_hid(struct hidp_session *session, hid->country = req->country; strncpy(hid->name, req->name, 128); - strncpy(hid->phys, batostr(&src), 64); - strncpy(hid->uniq, batostr(&dst), 64); + strncpy(hid->phys, batostr(&bt_sk(session->ctrl_sock->sk)->src), 64); + strncpy(hid->uniq, batostr(&bt_sk(session->ctrl_sock->sk)->dst), 64); hid->dev.parent = hidp_get_device(session); hid->ll_driver = &hidp_hid_driver; diff --git a/net/bluetooth/lib.c b/net/bluetooth/lib.c index ad2af58..b826d1b 100644 --- a/net/bluetooth/lib.c +++ b/net/bluetooth/lib.c @@ -51,8 +51,8 @@ char *batostr(bdaddr_t *ba) i ^= 1; sprintf(str[i], "%2.2X:%2.2X:%2.2X:%2.2X:%2.2X:%2.2X", - ba->b[0], ba->b[1], ba->b[2], - ba->b[3], ba->b[4], ba->b[5]); + ba->b[5], ba->b[4], ba->b[3], + ba->b[2], ba->b[1], ba->b[0]); return str[i]; } diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index befc3a5..2646163 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -183,9 +183,7 @@ static struct device *rfcomm_get_device(struct rfcomm_dev *dev) static ssize_t show_address(struct device *tty_dev, struct device_attribute *attr, char *buf) { struct rfcomm_dev *dev = dev_get_drvdata(tty_dev); - bdaddr_t bdaddr; - baswap(&bdaddr, &dev->dst); - return sprintf(buf, "%s\n", batostr(&bdaddr)); + return sprintf(buf, "%s\n", batostr(&dev->dst)); } static ssize_t show_channel(struct device *tty_dev, struct device_attribute *attr, char *buf) -- cgit v1.1 From 0175d629e096a5a59dde6cc91b32ac3160a44b9e Mon Sep 17 00:00:00 2001 From: "Gustavo F. Padovan" Date: Fri, 24 Sep 2010 20:30:57 -0300 Subject: Bluetooth: Use the proper error value from bt_skb_send_alloc() &err points to the proper error set by bt_skb_send_alloc() when it fails. Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 7da6432..01c4762 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -1652,7 +1652,7 @@ static inline int l2cap_skbuff_fromiovec(struct sock *sk, struct msghdr *msg, in *frag = bt_skb_send_alloc(sk, count, msg->msg_flags & MSG_DONTWAIT, &err); if (!*frag) - return -EFAULT; + return err; if (memcpy_fromiovec(skb_put(*frag, count), msg->msg_iov, count)) return -EFAULT; @@ -1678,7 +1678,7 @@ static struct sk_buff *l2cap_create_connless_pdu(struct sock *sk, struct msghdr skb = bt_skb_send_alloc(sk, count + hlen, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); /* Create L2CAP header */ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); @@ -1707,7 +1707,7 @@ static struct sk_buff *l2cap_create_basic_pdu(struct sock *sk, struct msghdr *ms skb = bt_skb_send_alloc(sk, count + hlen, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); /* Create L2CAP header */ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); @@ -1744,7 +1744,7 @@ static struct sk_buff *l2cap_create_iframe_pdu(struct sock *sk, struct msghdr *m skb = bt_skb_send_alloc(sk, count + hlen, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) - return ERR_PTR(-ENOMEM); + return ERR_PTR(err); /* Create L2CAP header */ lh = (struct l2cap_hdr *) skb_put(skb, L2CAP_HDR_SIZE); -- cgit v1.1 From ab3e571564a5d8fa4923e2e858e574c931a014c7 Mon Sep 17 00:00:00 2001 From: Haijun Liu Date: Thu, 30 Sep 2010 16:52:40 +0800 Subject: Bluetooth: Update conf_state before send config_req out Update conf_state with L2CAP_CONF_REQ_SENT before send config_req out in l2cap_config_req(). Signed-off-by: Haijun Liu Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 1 + 1 file changed, 1 insertion(+) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 01c4762..16049de 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -3171,6 +3171,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, struct l2cap_cmd_hdr if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT)) { u8 buf[64]; + l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, l2cap_build_conf_req(sk, buf), buf); l2cap_pi(sk)->num_conf_req++; -- cgit v1.1 From 534c92fde74c8d2143fc15b5f1d957f42cb43570 Mon Sep 17 00:00:00 2001 From: Andrei Emeltchenko Date: Fri, 1 Oct 2010 12:05:11 +0300 Subject: Bluetooth: clean up rfcomm code Remove dead code and unused rfcomm thread events Signed-off-by: Andrei Emeltchenko Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/rfcomm/core.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c index 4a22b14..39a5d87 100644 --- a/net/bluetooth/rfcomm/core.c +++ b/net/bluetooth/rfcomm/core.c @@ -113,11 +113,10 @@ static void rfcomm_session_del(struct rfcomm_session *s); #define __get_rpn_stop_bits(line) (((line) >> 2) & 0x1) #define __get_rpn_parity(line) (((line) >> 3) & 0x7) -static inline void rfcomm_schedule(uint event) +static inline void rfcomm_schedule(void) { if (!rfcomm_thread) return; - //set_bit(event, &rfcomm_event); set_bit(RFCOMM_SCHED_WAKEUP, &rfcomm_event); wake_up_process(rfcomm_thread); } @@ -203,13 +202,13 @@ static inline int __check_fcs(u8 *data, int type, u8 fcs) static void rfcomm_l2state_change(struct sock *sk) { BT_DBG("%p state %d", sk, sk->sk_state); - rfcomm_schedule(RFCOMM_SCHED_STATE); + rfcomm_schedule(); } static void rfcomm_l2data_ready(struct sock *sk, int bytes) { BT_DBG("%p bytes %d", sk, bytes); - rfcomm_schedule(RFCOMM_SCHED_RX); + rfcomm_schedule(); } static int rfcomm_l2sock_create(struct socket **sock) @@ -255,7 +254,7 @@ static void rfcomm_session_timeout(unsigned long arg) BT_DBG("session %p state %ld", s, s->state); set_bit(RFCOMM_TIMED_OUT, &s->flags); - rfcomm_schedule(RFCOMM_SCHED_TIMEO); + rfcomm_schedule(); } static void rfcomm_session_set_timer(struct rfcomm_session *s, long timeout) @@ -283,7 +282,7 @@ static void rfcomm_dlc_timeout(unsigned long arg) set_bit(RFCOMM_TIMED_OUT, &d->flags); rfcomm_dlc_put(d); - rfcomm_schedule(RFCOMM_SCHED_TIMEO); + rfcomm_schedule(); } static void rfcomm_dlc_set_timer(struct rfcomm_dlc *d, long timeout) @@ -465,7 +464,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) case BT_CONFIG: if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { set_bit(RFCOMM_AUTH_REJECT, &d->flags); - rfcomm_schedule(RFCOMM_SCHED_AUTH); + rfcomm_schedule(); break; } /* Fall through */ @@ -485,7 +484,7 @@ static int __rfcomm_dlc_close(struct rfcomm_dlc *d, int err) case BT_CONNECT2: if (test_and_clear_bit(RFCOMM_DEFER_SETUP, &d->flags)) { set_bit(RFCOMM_AUTH_REJECT, &d->flags); - rfcomm_schedule(RFCOMM_SCHED_AUTH); + rfcomm_schedule(); break; } /* Fall through */ @@ -533,7 +532,7 @@ int rfcomm_dlc_send(struct rfcomm_dlc *d, struct sk_buff *skb) skb_queue_tail(&d->tx_queue, skb); if (!test_bit(RFCOMM_TX_THROTTLED, &d->flags)) - rfcomm_schedule(RFCOMM_SCHED_TX); + rfcomm_schedule(); return len; } @@ -545,7 +544,7 @@ void __rfcomm_dlc_throttle(struct rfcomm_dlc *d) d->v24_sig |= RFCOMM_V24_FC; set_bit(RFCOMM_MSC_PENDING, &d->flags); } - rfcomm_schedule(RFCOMM_SCHED_TX); + rfcomm_schedule(); } void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) @@ -556,7 +555,7 @@ void __rfcomm_dlc_unthrottle(struct rfcomm_dlc *d) d->v24_sig &= ~RFCOMM_V24_FC; set_bit(RFCOMM_MSC_PENDING, &d->flags); } - rfcomm_schedule(RFCOMM_SCHED_TX); + rfcomm_schedule(); } /* @@ -577,7 +576,7 @@ int rfcomm_dlc_set_modem_status(struct rfcomm_dlc *d, u8 v24_sig) d->v24_sig = v24_sig; if (!test_and_set_bit(RFCOMM_MSC_PENDING, &d->flags)) - rfcomm_schedule(RFCOMM_SCHED_TX); + rfcomm_schedule(); return 0; } @@ -816,7 +815,7 @@ static int rfcomm_queue_disc(struct rfcomm_dlc *d) cmd->fcs = __fcs2((u8 *) cmd); skb_queue_tail(&d->tx_queue, skb); - rfcomm_schedule(RFCOMM_SCHED_TX); + rfcomm_schedule(); return 0; } @@ -1884,7 +1883,7 @@ static inline void rfcomm_accept_connection(struct rfcomm_session *s) * L2CAP MTU minus UIH header and FCS. */ s->mtu = min(l2cap_pi(nsock->sk)->omtu, l2cap_pi(nsock->sk)->imtu) - 5; - rfcomm_schedule(RFCOMM_SCHED_RX); + rfcomm_schedule(); } else sock_release(nsock); } @@ -2093,7 +2092,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt) rfcomm_session_put(s); - rfcomm_schedule(RFCOMM_SCHED_AUTH); + rfcomm_schedule(); } static struct hci_cb rfcomm_cb = { -- cgit v1.1 From 29b4433d991c88d86ca48a4c1cc33c671475be4b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 10:22:12 +0000 Subject: net: percpu net_device refcount We tried very hard to remove all possible dev_hold()/dev_put() pairs in network stack, using RCU conversions. There is still an unavoidable device refcount change for every dst we create/destroy, and this can slow down some workloads (routers or some app servers, mmap af_packet) We can switch to a percpu refcount implementation, now dynamic per_cpu infrastructure is mature. On a 64 cpus machine, this consumes 256 bytes per device. On x86, dev_hold(dev) code : before lock incl 0x280(%ebx) after: movl 0x260(%ebx),%eax incl fs:(%eax) Stress bench : (Sending 160.000.000 UDP frames, IP route cache disabled, dual E5540 @2.53GHz, 32bit kernel, FIB_TRIE) Before: real 1m1.662s user 0m14.373s sys 12m55.960s After: real 0m51.179s user 0m15.329s sys 10m15.942s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 40 +++++++++++++++++++++++++++++++++------- 1 file changed, 33 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 193eafa..04972a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5192,9 +5192,6 @@ int init_dummy_netdev(struct net_device *dev) */ dev->reg_state = NETREG_DUMMY; - /* initialize the ref count */ - atomic_set(&dev->refcnt, 1); - /* NAPI wants this */ INIT_LIST_HEAD(&dev->napi_list); @@ -5202,6 +5199,11 @@ int init_dummy_netdev(struct net_device *dev) set_bit(__LINK_STATE_PRESENT, &dev->state); set_bit(__LINK_STATE_START, &dev->state); + /* Note : We dont allocate pcpu_refcnt for dummy devices, + * because users of this 'device' dont need to change + * its refcount. + */ + return 0; } EXPORT_SYMBOL_GPL(init_dummy_netdev); @@ -5243,6 +5245,16 @@ out: } EXPORT_SYMBOL(register_netdev); +int netdev_refcnt_read(const struct net_device *dev) +{ + int i, refcnt = 0; + + for_each_possible_cpu(i) + refcnt += *per_cpu_ptr(dev->pcpu_refcnt, i); + return refcnt; +} +EXPORT_SYMBOL(netdev_refcnt_read); + /* * netdev_wait_allrefs - wait until all references are gone. * @@ -5257,11 +5269,14 @@ EXPORT_SYMBOL(register_netdev); static void netdev_wait_allrefs(struct net_device *dev) { unsigned long rebroadcast_time, warning_time; + int refcnt; linkwatch_forget_dev(dev); rebroadcast_time = warning_time = jiffies; - while (atomic_read(&dev->refcnt) != 0) { + refcnt = netdev_refcnt_read(dev); + + while (refcnt != 0) { if (time_after(jiffies, rebroadcast_time + 1 * HZ)) { rtnl_lock(); @@ -5288,11 +5303,13 @@ static void netdev_wait_allrefs(struct net_device *dev) msleep(250); + refcnt = netdev_refcnt_read(dev); + if (time_after(jiffies, warning_time + 10 * HZ)) { printk(KERN_EMERG "unregister_netdevice: " "waiting for %s to become free. Usage " "count = %d\n", - dev->name, atomic_read(&dev->refcnt)); + dev->name, refcnt); warning_time = jiffies; } } @@ -5350,7 +5367,7 @@ void netdev_run_todo(void) netdev_wait_allrefs(dev); /* paranoia */ - BUG_ON(atomic_read(&dev->refcnt)); + BUG_ON(netdev_refcnt_read(dev)); WARN_ON(rcu_dereference_raw(dev->ip_ptr)); WARN_ON(dev->ip6_ptr); WARN_ON(dev->dn_ptr); @@ -5520,9 +5537,13 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; - if (dev_addr_init(dev)) + dev->pcpu_refcnt = alloc_percpu(int); + if (!dev->pcpu_refcnt) goto free_tx; + if (dev_addr_init(dev)) + goto free_pcpu; + dev_mc_init(dev); dev_uc_init(dev); @@ -5553,6 +5574,8 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, free_tx: kfree(tx); +free_pcpu: + free_percpu(dev->pcpu_refcnt); free_p: kfree(p); return NULL; @@ -5586,6 +5609,9 @@ void free_netdev(struct net_device *dev) list_for_each_entry_safe(p, n, &dev->napi_list, dev_list) netif_napi_del(p); + free_percpu(dev->pcpu_refcnt); + dev->pcpu_refcnt = NULL; + /* Compatibility with error handling in drivers */ if (dev->reg_state == NETREG_UNINITIALIZED) { kfree((char *)dev - dev->padded); -- cgit v1.1 From cfd8e12f42746df396ecbdf7a1d8e92e8e4dbb97 Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Mon, 11 Oct 2010 10:28:59 -0700 Subject: wireless: Print wiphy name in sysfs. The index cannot be used to reliably reconstruct a phy name, so explicitly add the phy name to sysfs so that scripts can figure out the parent phy device for a particular wireless interface. Signed-off-by: Ben Greear Signed-off-by: John W. Linville --- net/wireless/sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'net') diff --git a/net/wireless/sysfs.c b/net/wireless/sysfs.c index 74a9e3c..4294fa2 100644 --- a/net/wireless/sysfs.c +++ b/net/wireless/sysfs.c @@ -35,6 +35,14 @@ SHOW_FMT(index, "%d", wiphy_idx); SHOW_FMT(macaddress, "%pM", wiphy.perm_addr); SHOW_FMT(address_mask, "%pM", wiphy.addr_mask); +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, + char *buf) { + struct wiphy *wiphy = &dev_to_rdev(dev)->wiphy; + return sprintf(buf, "%s\n", dev_name(&wiphy->dev)); +} + + static ssize_t addresses_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -57,6 +65,7 @@ static struct device_attribute ieee80211_dev_attrs[] = { __ATTR_RO(macaddress), __ATTR_RO(address_mask), __ATTR_RO(addresses), + __ATTR_RO(name), {} }; -- cgit v1.1 From 12b00c2c025b8af697d9a022ea2e928cad889ef1 Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 13 Oct 2010 15:56:56 +0200 Subject: netfilter: xtables: resolve indirect macros 1/3 Many of the used macros are just there for userspace compatibility. Substitute the in-kernel code to directly use the terminal macro and stuff the defines into #ifndef __KERNEL__ sections. Signed-off-by: Jan Engelhardt --- net/ipv4/netfilter/arp_tables.c | 10 +++++----- net/ipv4/netfilter/ip_tables.c | 12 ++++++------ net/ipv6/netfilter/ip6_tables.c | 12 ++++++------ 3 files changed, 17 insertions(+), 17 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e8f4f9a..e427a9e 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -895,7 +895,7 @@ static int compat_table_info(const struct xt_table_info *info, static int get_info(struct net *net, void __user *user, const int *len, int compat) { - char name[ARPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -908,7 +908,7 @@ static int get_info(struct net *net, void __user *user, if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[ARPT_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(NFPROTO_ARP); @@ -1474,7 +1474,7 @@ out_unlock: } struct compat_arpt_replace { - char name[ARPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; @@ -1628,7 +1628,7 @@ static int compat_copy_entries_to_user(unsigned int total_size, } struct compat_arpt_get_entries { - char name[ARPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_arpt_entry entrytable[0]; }; @@ -1840,7 +1840,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { { .name = ARPT_ERROR_TARGET, .target = arpt_error, - .targetsize = ARPT_FUNCTION_MAXNAMELEN, + .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_ARP, }, }; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index d163f2e..2efd41b 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -1092,7 +1092,7 @@ static int compat_table_info(const struct xt_table_info *info, static int get_info(struct net *net, void __user *user, const int *len, int compat) { - char name[IPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -1105,7 +1105,7 @@ static int get_info(struct net *net, void __user *user, if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[IPT_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(AF_INET); @@ -1400,7 +1400,7 @@ do_add_counters(struct net *net, const void __user *user, #ifdef CONFIG_COMPAT struct compat_ipt_replace { - char name[IPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; @@ -1884,7 +1884,7 @@ compat_do_ipt_set_ctl(struct sock *sk, int cmd, void __user *user, } struct compat_ipt_get_entries { - char name[IPT_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ipt_entry entrytable[0]; }; @@ -2039,7 +2039,7 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IPT_SO_GET_REVISION_MATCH: case IPT_SO_GET_REVISION_TARGET: { - struct ipt_get_revision rev; + struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { @@ -2188,7 +2188,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { { .name = IPT_ERROR_TARGET, .target = ipt_error, - .targetsize = IPT_FUNCTION_MAXNAMELEN, + .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV4, }, }; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 8e754be..4b973e1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -1105,7 +1105,7 @@ static int compat_table_info(const struct xt_table_info *info, static int get_info(struct net *net, void __user *user, const int *len, int compat) { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; struct xt_table *t; int ret; @@ -1118,7 +1118,7 @@ static int get_info(struct net *net, void __user *user, if (copy_from_user(name, user, sizeof(name)) != 0) return -EFAULT; - name[IP6T_TABLE_MAXNAMELEN-1] = '\0'; + name[XT_TABLE_MAXNAMELEN-1] = '\0'; #ifdef CONFIG_COMPAT if (compat) xt_compat_lock(AF_INET6); @@ -1415,7 +1415,7 @@ do_add_counters(struct net *net, const void __user *user, unsigned int len, #ifdef CONFIG_COMPAT struct compat_ip6t_replace { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; u32 valid_hooks; u32 num_entries; u32 size; @@ -1899,7 +1899,7 @@ compat_do_ip6t_set_ctl(struct sock *sk, int cmd, void __user *user, } struct compat_ip6t_get_entries { - char name[IP6T_TABLE_MAXNAMELEN]; + char name[XT_TABLE_MAXNAMELEN]; compat_uint_t size; struct compat_ip6t_entry entrytable[0]; }; @@ -2054,7 +2054,7 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) case IP6T_SO_GET_REVISION_MATCH: case IP6T_SO_GET_REVISION_TARGET: { - struct ip6t_get_revision rev; + struct xt_get_revision rev; int target; if (*len != sizeof(rev)) { @@ -2203,7 +2203,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { .name = IP6T_ERROR_TARGET, .target = ip6t_error, - .targetsize = IP6T_FUNCTION_MAXNAMELEN, + .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV6, }, }; -- cgit v1.1 From 87a2e70db62fec7348c6e5545eb7b7650c33d81b Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 13 Oct 2010 16:11:22 +0200 Subject: netfilter: xtables: resolve indirect macros 2/3 Signed-off-by: Jan Engelhardt --- net/ipv4/netfilter/arp_tables.c | 38 ++++++++++++++--------------- net/ipv4/netfilter/ip_tables.c | 54 ++++++++++++++++++++--------------------- net/ipv6/netfilter/ip6_tables.c | 54 ++++++++++++++++++++--------------------- net/sched/act_ipt.c | 12 ++++----- 4 files changed, 79 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index e427a9e..ed178cb 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -228,7 +228,7 @@ arpt_error(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; } -static inline const struct arpt_entry_target * +static inline const struct xt_entry_target * arpt_get_target_c(const struct arpt_entry *e) { return arpt_get_target((struct arpt_entry *)e); @@ -282,7 +282,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, arp = arp_hdr(skb); do { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; if (!arp_packet_match(arp, skb->dev, indev, outdev, &e->arp)) { e = arpt_next_entry(e); @@ -297,7 +297,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, if (!t->u.kernel.target->target) { int v; - v = ((struct arpt_standard_target *)t)->verdict; + v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != ARPT_RETURN) { @@ -377,7 +377,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - const struct arpt_standard_target *t + const struct xt_standard_target *t = (void *)arpt_get_target_c(e); int visited = e->comefrom & (1 << hook); @@ -464,14 +464,14 @@ static int mark_source_chains(const struct xt_table_info *newinfo, static inline int check_entry(const struct arpt_entry *e, const char *name) { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; if (!arp_checkentry(&e->arp)) { duprintf("arp_tables: arp check failed %p %s.\n", e, name); return -EINVAL; } - if (e->target_offset + sizeof(struct arpt_entry_target) > e->next_offset) + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; t = arpt_get_target_c(e); @@ -483,7 +483,7 @@ static inline int check_entry(const struct arpt_entry *e, const char *name) static inline int check_target(struct arpt_entry *e, const char *name) { - struct arpt_entry_target *t = arpt_get_target(e); + struct xt_entry_target *t = arpt_get_target(e); int ret; struct xt_tgchk_param par = { .table = name, @@ -506,7 +506,7 @@ static inline int check_target(struct arpt_entry *e, const char *name) static inline int find_check_entry(struct arpt_entry *e, const char *name, unsigned int size) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; @@ -536,7 +536,7 @@ out: static bool check_underflow(const struct arpt_entry *e) { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(&e->arp)) @@ -544,7 +544,7 @@ static bool check_underflow(const struct arpt_entry *e) t = arpt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; - verdict = ((struct arpt_standard_target *)t)->verdict; + verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } @@ -566,7 +566,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, } if (e->next_offset - < sizeof(struct arpt_entry) + sizeof(struct arpt_entry_target)) { + < sizeof(struct arpt_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -598,7 +598,7 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e, static inline void cleanup_entry(struct arpt_entry *e) { struct xt_tgdtor_param par; - struct arpt_entry_target *t; + struct xt_entry_target *t; t = arpt_get_target(e); par.target = t->u.kernel.target; @@ -794,7 +794,7 @@ static int copy_entries_to_user(unsigned int total_size, /* FIXME: use iterator macros --RR */ /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ - const struct arpt_entry_target *t; + const struct xt_entry_target *t; e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -807,7 +807,7 @@ static int copy_entries_to_user(unsigned int total_size, t = arpt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct arpt_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -844,7 +844,7 @@ static int compat_calc_entry(const struct arpt_entry *e, const struct xt_table_info *info, const void *base, struct xt_table_info *newinfo) { - const struct arpt_entry_target *t; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; @@ -1204,7 +1204,7 @@ static int do_add_counters(struct net *net, const void __user *user, #ifdef CONFIG_COMPAT static inline void compat_release_entry(struct compat_arpt_entry *e) { - struct arpt_entry_target *t; + struct xt_entry_target *t; t = compat_arpt_get_target(e); module_put(t->u.kernel.target->me); @@ -1220,7 +1220,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e, const unsigned int *underflows, const char *name) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; int ret, off, h; @@ -1288,7 +1288,7 @@ compat_copy_entry_from_user(struct compat_arpt_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; struct arpt_entry *de; unsigned int origsize; @@ -1567,7 +1567,7 @@ static int compat_copy_entry_to_user(struct arpt_entry *e, void __user **dstptr, struct xt_counters *counters, unsigned int i) { - struct arpt_entry_target *t; + struct xt_entry_target *t; struct compat_arpt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 2efd41b..cb10888 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -186,7 +186,7 @@ static inline bool unconditional(const struct ipt_ip *ip) } /* for const-correctness */ -static inline const struct ipt_entry_target * +static inline const struct xt_entry_target * ipt_get_target_c(const struct ipt_entry *e) { return ipt_get_target((struct ipt_entry *)e); @@ -230,7 +230,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { - const struct ipt_standard_target *t = (void *)ipt_get_target_c(s); + const struct xt_standard_target *t = (void *)ipt_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ @@ -346,7 +346,7 @@ ipt_do_table(struct sk_buff *skb, get_entry(table_base, private->underflow[hook])); do { - const struct ipt_entry_target *t; + const struct xt_entry_target *t; const struct xt_entry_match *ematch; IP_NF_ASSERT(e); @@ -380,7 +380,7 @@ ipt_do_table(struct sk_buff *skb, if (!t->u.kernel.target->target) { int v; - v = ((struct ipt_standard_target *)t)->verdict; + v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != IPT_RETURN) { @@ -461,7 +461,7 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - const struct ipt_standard_target *t + const struct xt_standard_target *t = (void *)ipt_get_target_c(e); int visited = e->comefrom & (1 << hook); @@ -552,7 +552,7 @@ mark_source_chains(const struct xt_table_info *newinfo, return 1; } -static void cleanup_match(struct ipt_entry_match *m, struct net *net) +static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; @@ -568,14 +568,14 @@ static void cleanup_match(struct ipt_entry_match *m, struct net *net) static int check_entry(const struct ipt_entry *e, const char *name) { - const struct ipt_entry_target *t; + const struct xt_entry_target *t; if (!ip_checkentry(&e->ip)) { duprintf("ip check failed %p %s.\n", e, par->match->name); return -EINVAL; } - if (e->target_offset + sizeof(struct ipt_entry_target) > + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -587,7 +587,7 @@ check_entry(const struct ipt_entry *e, const char *name) } static int -check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) +check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ipt_ip *ip = par->entryinfo; int ret; @@ -605,7 +605,7 @@ check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) } static int -find_check_match(struct ipt_entry_match *m, struct xt_mtchk_param *par) +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; @@ -630,7 +630,7 @@ err: static int check_target(struct ipt_entry *e, struct net *net, const char *name) { - struct ipt_entry_target *t = ipt_get_target(e); + struct xt_entry_target *t = ipt_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, @@ -656,7 +656,7 @@ static int find_check_entry(struct ipt_entry *e, struct net *net, const char *name, unsigned int size) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; @@ -707,7 +707,7 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name, static bool check_underflow(const struct ipt_entry *e) { - const struct ipt_entry_target *t; + const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(&e->ip)) @@ -715,7 +715,7 @@ static bool check_underflow(const struct ipt_entry *e) t = ipt_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; - verdict = ((struct ipt_standard_target *)t)->verdict; + verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } @@ -738,7 +738,7 @@ check_entry_size_and_hooks(struct ipt_entry *e, } if (e->next_offset - < sizeof(struct ipt_entry) + sizeof(struct ipt_entry_target)) { + < sizeof(struct ipt_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -771,7 +771,7 @@ static void cleanup_entry(struct ipt_entry *e, struct net *net) { struct xt_tgdtor_param par; - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ @@ -972,8 +972,8 @@ copy_entries_to_user(unsigned int total_size, /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; - const struct ipt_entry_match *m; - const struct ipt_entry_target *t; + const struct xt_entry_match *m; + const struct xt_entry_target *t; e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -990,7 +990,7 @@ copy_entries_to_user(unsigned int total_size, m = (void *)e + i; if (copy_to_user(userptr + off + i - + offsetof(struct ipt_entry_match, + + offsetof(struct xt_entry_match, u.user.name), m->u.kernel.match->name, strlen(m->u.kernel.match->name)+1) @@ -1002,7 +1002,7 @@ copy_entries_to_user(unsigned int total_size, t = ipt_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct ipt_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -1040,7 +1040,7 @@ static int compat_calc_entry(const struct ipt_entry *e, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; - const struct ipt_entry_target *t; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; @@ -1407,7 +1407,7 @@ struct compat_ipt_replace { u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; - compat_uptr_t counters; /* struct ipt_counters * */ + compat_uptr_t counters; /* struct xt_counters * */ struct compat_ipt_entry entries[0]; }; @@ -1416,7 +1416,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct compat_ipt_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; @@ -1451,7 +1451,7 @@ compat_copy_entry_to_user(struct ipt_entry *e, void __user **dstptr, } static int -compat_find_calc_match(struct ipt_entry_match *m, +compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ipt_ip *ip, unsigned int hookmask, @@ -1473,7 +1473,7 @@ compat_find_calc_match(struct ipt_entry_match *m, static void compat_release_entry(struct compat_ipt_entry *e) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ @@ -1494,7 +1494,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e, const char *name) { struct xt_entry_match *ematch; - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; @@ -1576,7 +1576,7 @@ compat_copy_entry_from_user(struct compat_ipt_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct ipt_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; struct ipt_entry *de; unsigned int origsize; diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 4b973e1..c7334c1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -215,7 +215,7 @@ static inline bool unconditional(const struct ip6t_ip6 *ipv6) return memcmp(ipv6, &uncond, sizeof(uncond)) == 0; } -static inline const struct ip6t_entry_target * +static inline const struct xt_entry_target * ip6t_get_target_c(const struct ip6t_entry *e) { return ip6t_get_target((struct ip6t_entry *)e); @@ -260,7 +260,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, const char *hookname, const char **chainname, const char **comment, unsigned int *rulenum) { - const struct ip6t_standard_target *t = (void *)ip6t_get_target_c(s); + const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ @@ -369,7 +369,7 @@ ip6t_do_table(struct sk_buff *skb, e = get_entry(table_base, private->hook_entry[hook]); do { - const struct ip6t_entry_target *t; + const struct xt_entry_target *t; const struct xt_entry_match *ematch; IP_NF_ASSERT(e); @@ -403,7 +403,7 @@ ip6t_do_table(struct sk_buff *skb, if (!t->u.kernel.target->target) { int v; - v = ((struct ip6t_standard_target *)t)->verdict; + v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ if (v != IP6T_RETURN) { @@ -474,7 +474,7 @@ mark_source_chains(const struct xt_table_info *newinfo, e->counters.pcnt = pos; for (;;) { - const struct ip6t_standard_target *t + const struct xt_standard_target *t = (void *)ip6t_get_target_c(e); int visited = e->comefrom & (1 << hook); @@ -565,7 +565,7 @@ mark_source_chains(const struct xt_table_info *newinfo, return 1; } -static void cleanup_match(struct ip6t_entry_match *m, struct net *net) +static void cleanup_match(struct xt_entry_match *m, struct net *net) { struct xt_mtdtor_param par; @@ -581,14 +581,14 @@ static void cleanup_match(struct ip6t_entry_match *m, struct net *net) static int check_entry(const struct ip6t_entry *e, const char *name) { - const struct ip6t_entry_target *t; + const struct xt_entry_target *t; if (!ip6_checkentry(&e->ipv6)) { duprintf("ip_tables: ip check failed %p %s.\n", e, name); return -EINVAL; } - if (e->target_offset + sizeof(struct ip6t_entry_target) > + if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset) return -EINVAL; @@ -599,7 +599,7 @@ check_entry(const struct ip6t_entry *e, const char *name) return 0; } -static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par) +static int check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { const struct ip6t_ip6 *ipv6 = par->entryinfo; int ret; @@ -618,7 +618,7 @@ static int check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par) } static int -find_check_match(struct ip6t_entry_match *m, struct xt_mtchk_param *par) +find_check_match(struct xt_entry_match *m, struct xt_mtchk_param *par) { struct xt_match *match; int ret; @@ -643,7 +643,7 @@ err: static int check_target(struct ip6t_entry *e, struct net *net, const char *name) { - struct ip6t_entry_target *t = ip6t_get_target(e); + struct xt_entry_target *t = ip6t_get_target(e); struct xt_tgchk_param par = { .net = net, .table = name, @@ -670,7 +670,7 @@ static int find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, unsigned int size) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; int ret; unsigned int j; @@ -721,7 +721,7 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name, static bool check_underflow(const struct ip6t_entry *e) { - const struct ip6t_entry_target *t; + const struct xt_entry_target *t; unsigned int verdict; if (!unconditional(&e->ipv6)) @@ -729,7 +729,7 @@ static bool check_underflow(const struct ip6t_entry *e) t = ip6t_get_target_c(e); if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0) return false; - verdict = ((struct ip6t_standard_target *)t)->verdict; + verdict = ((struct xt_standard_target *)t)->verdict; verdict = -verdict - 1; return verdict == NF_DROP || verdict == NF_ACCEPT; } @@ -752,7 +752,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, } if (e->next_offset - < sizeof(struct ip6t_entry) + sizeof(struct ip6t_entry_target)) { + < sizeof(struct ip6t_entry) + sizeof(struct xt_entry_target)) { duprintf("checking: element %p size %u\n", e, e->next_offset); return -EINVAL; @@ -784,7 +784,7 @@ check_entry_size_and_hooks(struct ip6t_entry *e, static void cleanup_entry(struct ip6t_entry *e, struct net *net) { struct xt_tgdtor_param par; - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ @@ -985,8 +985,8 @@ copy_entries_to_user(unsigned int total_size, /* ... then go back and fix counters and names */ for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ unsigned int i; - const struct ip6t_entry_match *m; - const struct ip6t_entry_target *t; + const struct xt_entry_match *m; + const struct xt_entry_target *t; e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off @@ -1003,7 +1003,7 @@ copy_entries_to_user(unsigned int total_size, m = (void *)e + i; if (copy_to_user(userptr + off + i - + offsetof(struct ip6t_entry_match, + + offsetof(struct xt_entry_match, u.user.name), m->u.kernel.match->name, strlen(m->u.kernel.match->name)+1) @@ -1015,7 +1015,7 @@ copy_entries_to_user(unsigned int total_size, t = ip6t_get_target_c(e); if (copy_to_user(userptr + off + e->target_offset - + offsetof(struct ip6t_entry_target, + + offsetof(struct xt_entry_target, u.user.name), t->u.kernel.target->name, strlen(t->u.kernel.target->name)+1) != 0) { @@ -1053,7 +1053,7 @@ static int compat_calc_entry(const struct ip6t_entry *e, const void *base, struct xt_table_info *newinfo) { const struct xt_entry_match *ematch; - const struct ip6t_entry_target *t; + const struct xt_entry_target *t; unsigned int entry_offset; int off, i, ret; @@ -1422,7 +1422,7 @@ struct compat_ip6t_replace { u32 hook_entry[NF_INET_NUMHOOKS]; u32 underflow[NF_INET_NUMHOOKS]; u32 num_counters; - compat_uptr_t counters; /* struct ip6t_counters * */ + compat_uptr_t counters; /* struct xt_counters * */ struct compat_ip6t_entry entries[0]; }; @@ -1431,7 +1431,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, unsigned int *size, struct xt_counters *counters, unsigned int i) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct compat_ip6t_entry __user *ce; u_int16_t target_offset, next_offset; compat_uint_t origsize; @@ -1466,7 +1466,7 @@ compat_copy_entry_to_user(struct ip6t_entry *e, void __user **dstptr, } static int -compat_find_calc_match(struct ip6t_entry_match *m, +compat_find_calc_match(struct xt_entry_match *m, const char *name, const struct ip6t_ip6 *ipv6, unsigned int hookmask, @@ -1488,7 +1488,7 @@ compat_find_calc_match(struct ip6t_entry_match *m, static void compat_release_entry(struct compat_ip6t_entry *e) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_entry_match *ematch; /* Cleanup all matches */ @@ -1509,7 +1509,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e, const char *name) { struct xt_entry_match *ematch; - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; unsigned int entry_offset; unsigned int j; @@ -1591,7 +1591,7 @@ compat_copy_entry_from_user(struct compat_ip6t_entry *e, void **dstptr, unsigned int *size, const char *name, struct xt_table_info *newinfo, unsigned char *base) { - struct ip6t_entry_target *t; + struct xt_entry_target *t; struct xt_target *target; struct ip6t_entry *de; unsigned int origsize; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index c7e59e6..f6d464f 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -39,7 +39,7 @@ static struct tcf_hashinfo ipt_hash_info = { .lock = &ipt_lock, }; -static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int hook) +static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook) { struct xt_tgchk_param par; struct xt_target *target; @@ -66,7 +66,7 @@ static int ipt_init_target(struct ipt_entry_target *t, char *table, unsigned int return 0; } -static void ipt_destroy_target(struct ipt_entry_target *t) +static void ipt_destroy_target(struct xt_entry_target *t) { struct xt_tgdtor_param par = { .target = t->u.kernel.target, @@ -99,7 +99,7 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = { [TCA_IPT_TABLE] = { .type = NLA_STRING, .len = IFNAMSIZ }, [TCA_IPT_HOOK] = { .type = NLA_U32 }, [TCA_IPT_INDEX] = { .type = NLA_U32 }, - [TCA_IPT_TARG] = { .len = sizeof(struct ipt_entry_target) }, + [TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) }, }; static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, @@ -108,7 +108,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, struct nlattr *tb[TCA_IPT_MAX + 1]; struct tcf_ipt *ipt; struct tcf_common *pc; - struct ipt_entry_target *td, *t; + struct xt_entry_target *td, *t; char *tname; int ret = 0, err; u32 hook = 0; @@ -126,7 +126,7 @@ static int tcf_ipt_init(struct nlattr *nla, struct nlattr *est, if (tb[TCA_IPT_TARG] == NULL) return -EINVAL; - td = (struct ipt_entry_target *)nla_data(tb[TCA_IPT_TARG]); + td = (struct xt_entry_target *)nla_data(tb[TCA_IPT_TARG]); if (nla_len(tb[TCA_IPT_TARG]) < td->u.target_size) return -EINVAL; @@ -249,7 +249,7 @@ static int tcf_ipt_dump(struct sk_buff *skb, struct tc_action *a, int bind, int { unsigned char *b = skb_tail_pointer(skb); struct tcf_ipt *ipt = a->priv; - struct ipt_entry_target *t; + struct xt_entry_target *t; struct tcf_t tm; struct tc_cnt c; -- cgit v1.1 From 243bf6e29eef642de0ff62f1ebf58bc2396d6d6e Mon Sep 17 00:00:00 2001 From: Jan Engelhardt Date: Wed, 13 Oct 2010 16:28:00 +0200 Subject: netfilter: xtables: resolve indirect macros 3/3 --- net/ipv4/netfilter/arp_tables.c | 14 +++++++------- net/ipv4/netfilter/arpt_mangle.c | 2 +- net/ipv4/netfilter/ip_tables.c | 18 +++++++++--------- net/ipv6/netfilter/ip6_tables.c | 18 +++++++++--------- net/sched/act_ipt.c | 2 +- 5 files changed, 27 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index ed178cb..d756eda 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -300,7 +300,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ - if (v != ARPT_RETURN) { + if (v != XT_RETURN) { verdict = (unsigned)(-v) - 1; break; } @@ -332,7 +332,7 @@ unsigned int arpt_do_table(struct sk_buff *skb, /* Target might have changed stuff. */ arp = arp_hdr(skb); - if (verdict == ARPT_CONTINUE) + if (verdict == XT_CONTINUE) e = arpt_next_entry(e); else /* Verdict */ @@ -392,13 +392,13 @@ static int mark_source_chains(const struct xt_table_info *newinfo, /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct arpt_entry) && (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < 0 && unconditional(&e->arp)) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -433,7 +433,7 @@ static int mark_source_chains(const struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - ARPT_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && newpos >= 0) { if (newpos > newinfo->size - sizeof(struct arpt_entry)) { @@ -1828,7 +1828,7 @@ void arpt_unregister_table(struct xt_table *table) /* The built-in targets: standard (NULL) and error. */ static struct xt_target arpt_builtin_tg[] __read_mostly = { { - .name = ARPT_STANDARD_TARGET, + .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_ARP, #ifdef CONFIG_COMPAT @@ -1838,7 +1838,7 @@ static struct xt_target arpt_builtin_tg[] __read_mostly = { #endif }, { - .name = ARPT_ERROR_TARGET, + .name = XT_ERROR_TARGET, .target = arpt_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_ARP, diff --git a/net/ipv4/netfilter/arpt_mangle.c b/net/ipv4/netfilter/arpt_mangle.c index e1be7dd..b8ddcc4 100644 --- a/net/ipv4/netfilter/arpt_mangle.c +++ b/net/ipv4/netfilter/arpt_mangle.c @@ -63,7 +63,7 @@ static int checkentry(const struct xt_tgchk_param *par) return false; if (mangle->target != NF_DROP && mangle->target != NF_ACCEPT && - mangle->target != ARPT_CONTINUE) + mangle->target != XT_CONTINUE) return false; return true; } diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index cb10888..d31b007 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -232,7 +232,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, { const struct xt_standard_target *t = (void *)ipt_get_target_c(s); - if (strcmp(t->target.u.kernel.target->name, IPT_ERROR_TARGET) == 0) { + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; @@ -241,7 +241,7 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e, if (s->target_offset == sizeof(struct ipt_entry) && strcmp(t->target.u.kernel.target->name, - IPT_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && t->verdict < 0 && unconditional(&s->ip)) { /* Tail of chains: STANDARD target (return/policy) */ @@ -383,7 +383,7 @@ ipt_do_table(struct sk_buff *skb, v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ - if (v != IPT_RETURN) { + if (v != XT_RETURN) { verdict = (unsigned)(-v) - 1; break; } @@ -421,7 +421,7 @@ ipt_do_table(struct sk_buff *skb, verdict = t->u.kernel.target->target(skb, &acpar); /* Target might have changed stuff. */ ip = ip_hdr(skb); - if (verdict == IPT_CONTINUE) + if (verdict == XT_CONTINUE) e = ipt_next_entry(e); else /* Verdict */ @@ -475,13 +475,13 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ipt_entry) && (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < 0 && unconditional(&e->ip)) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -524,7 +524,7 @@ mark_source_chains(const struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IPT_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ipt_entry)) { @@ -2176,7 +2176,7 @@ static int icmp_checkentry(const struct xt_mtchk_param *par) static struct xt_target ipt_builtin_tg[] __read_mostly = { { - .name = IPT_STANDARD_TARGET, + .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV4, #ifdef CONFIG_COMPAT @@ -2186,7 +2186,7 @@ static struct xt_target ipt_builtin_tg[] __read_mostly = { #endif }, { - .name = IPT_ERROR_TARGET, + .name = XT_ERROR_TARGET, .target = ipt_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV4, diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index c7334c1..c683e9e 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -262,7 +262,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, { const struct xt_standard_target *t = (void *)ip6t_get_target_c(s); - if (strcmp(t->target.u.kernel.target->name, IP6T_ERROR_TARGET) == 0) { + if (strcmp(t->target.u.kernel.target->name, XT_ERROR_TARGET) == 0) { /* Head of user chain: ERROR target with chainname */ *chainname = t->target.data; (*rulenum) = 0; @@ -271,7 +271,7 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e, if (s->target_offset == sizeof(struct ip6t_entry) && strcmp(t->target.u.kernel.target->name, - IP6T_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && t->verdict < 0 && unconditional(&s->ipv6)) { /* Tail of chains: STANDARD target (return/policy) */ @@ -406,7 +406,7 @@ ip6t_do_table(struct sk_buff *skb, v = ((struct xt_standard_target *)t)->verdict; if (v < 0) { /* Pop from stack? */ - if (v != IP6T_RETURN) { + if (v != XT_RETURN) { verdict = (unsigned)(-v) - 1; break; } @@ -434,7 +434,7 @@ ip6t_do_table(struct sk_buff *skb, acpar.targinfo = t->data; verdict = t->u.kernel.target->target(skb, &acpar); - if (verdict == IP6T_CONTINUE) + if (verdict == XT_CONTINUE) e = ip6t_next_entry(e); else /* Verdict */ @@ -488,13 +488,13 @@ mark_source_chains(const struct xt_table_info *newinfo, /* Unconditional return/END. */ if ((e->target_offset == sizeof(struct ip6t_entry) && (strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < 0 && unconditional(&e->ipv6)) || visited) { unsigned int oldpos, size; if ((strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0) && + XT_STANDARD_TARGET) == 0) && t->verdict < -NF_MAX_VERDICT - 1) { duprintf("mark_source_chains: bad " "negative verdict (%i)\n", @@ -537,7 +537,7 @@ mark_source_chains(const struct xt_table_info *newinfo, int newpos = t->verdict; if (strcmp(t->target.u.user.name, - IP6T_STANDARD_TARGET) == 0 && + XT_STANDARD_TARGET) == 0 && newpos >= 0) { if (newpos > newinfo->size - sizeof(struct ip6t_entry)) { @@ -2191,7 +2191,7 @@ static int icmp6_checkentry(const struct xt_mtchk_param *par) /* The built-in targets: standard (NULL) and error. */ static struct xt_target ip6t_builtin_tg[] __read_mostly = { { - .name = IP6T_STANDARD_TARGET, + .name = XT_STANDARD_TARGET, .targetsize = sizeof(int), .family = NFPROTO_IPV6, #ifdef CONFIG_COMPAT @@ -2201,7 +2201,7 @@ static struct xt_target ip6t_builtin_tg[] __read_mostly = { #endif }, { - .name = IP6T_ERROR_TARGET, + .name = XT_ERROR_TARGET, .target = ip6t_error, .targetsize = XT_FUNCTION_MAXNAMELEN, .family = NFPROTO_IPV6, diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index f6d464f..8daef96 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -230,7 +230,7 @@ static int tcf_ipt(struct sk_buff *skb, struct tc_action *a, result = TC_ACT_SHOT; ipt->tcf_qstats.drops++; break; - case IPT_CONTINUE: + case XT_CONTINUE: result = TC_ACT_PIPE; break; default: -- cgit v1.1 From a91fd267e327ca7599654b4e9ed7b62c5adaccee Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Wed, 13 Oct 2010 21:22:35 +0200 Subject: IPVS: ip_vs_dbg_callid() is only needed for debugging ip_vs_dbg_callid() and IP_VS_DEBUG_CALLID() are only needed it CONFIG_IP_VS_DEBUG is defined. This resolves the following build warning when CONFIG_IP_VS_DEBUG is not defined. net/netfilter/ipvs/ip_vs_pe_sip.c:11: warning: 'ip_vs_dbg_callid' defined but not used Reported-by: Patrick McHardy Signed-off-by: Simon Horman Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_pe_sip.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c index a0539f1..b8b4e96 100644 --- a/net/netfilter/ipvs/ip_vs_pe_sip.c +++ b/net/netfilter/ipvs/ip_vs_pe_sip.c @@ -8,6 +8,7 @@ #include #include +#ifdef CONFIG_IP_VS_DEBUG static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, const char *callid, size_t callid_len, int *idx) @@ -22,6 +23,7 @@ static const char *ip_vs_dbg_callid(char *buf, size_t buf_len, #define IP_VS_DEBUG_CALLID(callid, len) \ ip_vs_dbg_callid(ip_vs_dbg_buf, sizeof(ip_vs_dbg_buf), \ callid, len, &ip_vs_dbg_idx) +#endif static int get_callid(const char *dptr, unsigned int dataoff, unsigned int datalen, -- cgit v1.1 From 271733cf844a2f5f186ef3b40c26d6397b71039a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Oct 2010 12:06:23 +0200 Subject: cfg80211: notify drivers about frame registrations Drivers may need to adjust their filters according to frame registrations, so notify them about them. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/mlme.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index caf11a427..26838d9 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -764,6 +764,8 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, u16 frame_type, const u8 *match_data, int match_len) { + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct cfg80211_mgmt_registration *reg, *nreg; int err = 0; u16 mgmt_type; @@ -810,22 +812,37 @@ int cfg80211_mlme_register_mgmt(struct wireless_dev *wdev, u32 snd_pid, nreg->frame_type = cpu_to_le16(frame_type); list_add(&nreg->list, &wdev->mgmt_registrations); + if (rdev->ops->mgmt_frame_register) + rdev->ops->mgmt_frame_register(wiphy, wdev->netdev, + frame_type, true); + out: spin_unlock_bh(&wdev->mgmt_registrations_lock); + return err; } void cfg80211_mlme_unregister_socket(struct wireless_dev *wdev, u32 nlpid) { + struct wiphy *wiphy = wdev->wiphy; + struct cfg80211_registered_device *rdev = wiphy_to_dev(wiphy); struct cfg80211_mgmt_registration *reg, *tmp; spin_lock_bh(&wdev->mgmt_registrations_lock); list_for_each_entry_safe(reg, tmp, &wdev->mgmt_registrations, list) { - if (reg->nlpid == nlpid) { - list_del(®->list); - kfree(reg); + if (reg->nlpid != nlpid) + continue; + + if (rdev->ops->mgmt_frame_register) { + u16 frame_type = le16_to_cpu(reg->frame_type); + + rdev->ops->mgmt_frame_register(wiphy, wdev->netdev, + frame_type, false); } + + list_del(®->list); + kfree(reg); } spin_unlock_bh(&wdev->mgmt_registrations_lock); -- cgit v1.1 From 7be5086d4cb7cceb71d724a9524d5e927785d04f Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Oct 2010 12:06:24 +0200 Subject: mac80211: add probe request filter flag Using the frame registration notification, we can see when probe requests are requested and notify the low-level driver via filtering. The flag is also set in AP and IBSS modes. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/cfg.c | 18 ++++++++++++++++++ net/mac80211/ieee80211_i.h | 4 +++- net/mac80211/iface.c | 9 ++++++++- net/mac80211/main.c | 3 +++ 4 files changed, 32 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 25fb351..18bd0e5 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -1604,6 +1604,23 @@ static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct net_device *dev, return 0; } +static void ieee80211_mgmt_frame_register(struct wiphy *wiphy, + struct net_device *dev, + u16 frame_type, bool reg) +{ + struct ieee80211_local *local = wiphy_priv(wiphy); + + if (frame_type != (IEEE80211_FTYPE_MGMT | IEEE80211_STYPE_PROBE_REQ)) + return; + + if (reg) + local->probe_req_reg++; + else + local->probe_req_reg--; + + ieee80211_queue_work(&local->hw, &local->reconfig_filter); +} + struct cfg80211_ops mac80211_config_ops = { .add_virtual_intf = ieee80211_add_iface, .del_virtual_intf = ieee80211_del_iface, @@ -1655,4 +1672,5 @@ struct cfg80211_ops mac80211_config_ops = { .cancel_remain_on_channel = ieee80211_cancel_remain_on_channel, .mgmt_tx = ieee80211_mgmt_tx, .set_cqm_rssi_config = ieee80211_set_cqm_rssi_config, + .mgmt_frame_register = ieee80211_mgmt_frame_register, }; diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index f0610fa..b80c386 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -707,7 +707,9 @@ struct ieee80211_local { int open_count; int monitors, cooked_mntrs; /* number of interfaces with corresponding FIF_ flags */ - int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll; + int fif_fcsfail, fif_plcpfail, fif_control, fif_other_bss, fif_pspoll, + fif_probe_req; + int probe_req_reg; unsigned int filter_flags; /* FIF_* */ bool wiphy_ciphers_allocated; diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index e99d1b6..f9163b1 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -280,8 +280,11 @@ static int ieee80211_do_open(struct net_device *dev, bool coming_up) ieee80211_start_mesh(sdata); } else if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll++; + local->fif_probe_req++; ieee80211_configure_filter(local); + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + local->fif_probe_req++; } changed |= ieee80211_reset_erp_info(sdata); @@ -428,8 +431,12 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, if (sdata->flags & IEEE80211_SDATA_PROMISC) atomic_dec(&local->iff_promiscs); - if (sdata->vif.type == NL80211_IFTYPE_AP) + if (sdata->vif.type == NL80211_IFTYPE_AP) { local->fif_pspoll--; + local->fif_probe_req--; + } else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) { + local->fif_probe_req--; + } netif_addr_lock_bh(sdata->dev); spin_lock_bh(&local->filter_lock); diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 915ecf8..5162303 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -54,6 +54,9 @@ void ieee80211_configure_filter(struct ieee80211_local *local) if (local->monitors || local->scanning) new_flags |= FIF_BCN_PRBRESP_PROMISC; + if (local->fif_probe_req || local->probe_req_reg) + new_flags |= FIF_PROBE_REQ; + if (local->fif_fcsfail) new_flags |= FIF_FCSFAIL; -- cgit v1.1 From e4b55957eb695b43055b6badec026628b24fe80a Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Wed, 13 Oct 2010 19:23:21 +0200 Subject: mac80211: fix SMPS request It looks like I submitted a different patch than I tested, because clearly the code in mac80211 is missing actually propagating the requested SMPS mode. Fix that! Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/mac80211/ht.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c index 4214bb6..75d679d 100644 --- a/net/mac80211/ht.c +++ b/net/mac80211/ht.c @@ -291,6 +291,8 @@ void ieee80211_request_smps(struct ieee80211_vif *vif, if (WARN_ON(smps_mode == IEEE80211_SMPS_OFF)) smps_mode = IEEE80211_SMPS_AUTOMATIC; + sdata->u.mgd.driver_smps_mode = smps_mode; + ieee80211_queue_work(&sdata->local->hw, &sdata->u.mgd.request_smps_work); } -- cgit v1.1 From 7368ddf144afd79456fd853fa25f33e31da003a9 Mon Sep 17 00:00:00 2001 From: Paul Gortmaker Date: Tue, 12 Oct 2010 14:25:58 +0000 Subject: tipc: clean out all instances of #if 0'd unused code Remove all instances of legacy, or as yet to be implemented code that is currently living within an #if 0 ... #endif block. In the rare instance that some of it be needed in the future, it can still be dragged out of history, but there is no need for it to sit in mainline. Signed-off-by: Paul Gortmaker Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/tipc/config.c | 141 -------------------------------------------------- net/tipc/discover.c | 20 ------- net/tipc/discover.h | 3 -- net/tipc/link.c | 112 +-------------------------------------- net/tipc/link.h | 4 -- net/tipc/name_table.c | 17 ------ net/tipc/net.c | 9 ---- net/tipc/node.c | 26 ---------- net/tipc/port.c | 44 ---------------- 9 files changed, 1 insertion(+), 375 deletions(-) (limited to 'net') diff --git a/net/tipc/config.c b/net/tipc/config.c index 961d1b0..c429b0d 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -120,139 +120,6 @@ struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string) return buf; } - -#if 0 - -/* Now obsolete code for handling commands not yet implemented the new way */ - -/* - * Some of this code assumed that the manager structure contains two added - * fields: - * u32 link_subscriptions; - * struct list_head link_subscribers; - * which are currently not present. These fields may need to be re-introduced - * if and when support for link subscriptions is added. - */ - -void tipc_cfg_link_event(u32 addr, char *name, int up) -{ - /* TIPC DOESN'T HANDLE LINK EVENT SUBSCRIPTIONS AT THE MOMENT */ -} - -int tipc_cfg_cmd(const struct tipc_cmd_msg * msg, - char *data, - u32 sz, - u32 *ret_size, - struct tipc_portid *orig) -{ - int rv = -EINVAL; - u32 cmd = msg->cmd; - - *ret_size = 0; - switch (cmd) { - case TIPC_REMOVE_LINK: - case TIPC_CMD_BLOCK_LINK: - case TIPC_CMD_UNBLOCK_LINK: - if (!cfg_check_connection(orig)) - rv = link_control(msg->argv.link_name, msg->cmd, 0); - break; - case TIPC_ESTABLISH: - { - int connected; - - tipc_isconnected(mng.conn_port_ref, &connected); - if (connected || !orig) { - rv = TIPC_FAILURE; - break; - } - rv = tipc_connect2port(mng.conn_port_ref, orig); - if (rv == TIPC_OK) - orig = 0; - break; - } - case TIPC_GET_PEER_ADDRESS: - *ret_size = link_peer_addr(msg->argv.link_name, data, sz); - break; - case TIPC_GET_ROUTES: - rv = TIPC_OK; - break; - default: {} - } - if (*ret_size) - rv = TIPC_OK; - return rv; -} - -static void cfg_cmd_event(struct tipc_cmd_msg *msg, - char *data, - u32 sz, - struct tipc_portid const *orig) -{ - int rv = -EINVAL; - struct tipc_cmd_result_msg rmsg; - struct iovec msg_sect[2]; - int *arg; - - msg->cmd = ntohl(msg->cmd); - - cfg_prepare_res_msg(msg->cmd, msg->usr_handle, rv, &rmsg, msg_sect, - data, 0); - if (ntohl(msg->magic) != TIPC_MAGIC) - goto exit; - - switch (msg->cmd) { - case TIPC_CREATE_LINK: - if (!cfg_check_connection(orig)) - rv = disc_create_link(&msg->argv.create_link); - break; - case TIPC_LINK_SUBSCRIBE: - { - struct subscr_data *sub; - - if (mng.link_subscriptions > 64) - break; - sub = kmalloc(sizeof(*sub), - GFP_ATOMIC); - if (sub == NULL) { - warn("Memory squeeze; dropped remote link subscription\n"); - break; - } - INIT_LIST_HEAD(&sub->subd_list); - tipc_createport(mng.user_ref, - (void *)sub, - TIPC_HIGH_IMPORTANCE, - 0, - 0, - (tipc_conn_shutdown_event)cfg_linksubscr_cancel, - 0, - 0, - (tipc_conn_msg_event)cfg_linksubscr_cancel, - 0, - &sub->port_ref); - if (!sub->port_ref) { - kfree(sub); - break; - } - memcpy(sub->usr_handle,msg->usr_handle, - sizeof(sub->usr_handle)); - sub->domain = msg->argv.domain; - list_add_tail(&sub->subd_list, &mng.link_subscribers); - tipc_connect2port(sub->port_ref, orig); - rmsg.retval = TIPC_OK; - tipc_send(sub->port_ref, 2u, msg_sect); - mng.link_subscriptions++; - return; - } - default: - rv = tipc_cfg_cmd(msg, data, sz, (u32 *)&msg_sect[1].iov_len, orig); - } -exit: - rmsg.result_len = htonl(msg_sect[1].iov_len); - rmsg.retval = htonl(rv); - tipc_cfg_respond(msg_sect, 2u, orig); -} -#endif - #define MAX_STATS_INFO 2000 static struct sk_buff *tipc_show_stats(void) @@ -557,14 +424,6 @@ struct sk_buff *tipc_cfg_do_cmd(u32 orig_node, u16 cmd, const void *request_area case TIPC_CMD_SHOW_PORTS: rep_tlv_buf = tipc_port_get_ports(); break; -#if 0 - case TIPC_CMD_SHOW_PORT_STATS: - rep_tlv_buf = port_show_stats(req_tlv_area, req_tlv_space); - break; - case TIPC_CMD_RESET_PORT_STATS: - rep_tlv_buf = tipc_cfg_reply_error_string(TIPC_CFG_NOT_SUPPORTED); - break; -#endif case TIPC_CMD_SET_LOG_SIZE: rep_tlv_buf = tipc_log_resize_cmd(req_tlv_area, req_tlv_space); break; diff --git a/net/tipc/discover.c b/net/tipc/discover.c index f28d1ae..dbd79c6 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -46,16 +46,6 @@ #define TIPC_LINK_REQ_FAST 2000 /* normal delay if bearer has no links */ #define TIPC_LINK_REQ_SLOW 600000 /* normal delay if bearer has links */ -#if 0 -#define GET_NODE_INFO 300 -#define GET_NODE_INFO_RESULT 301 -#define FORWARD_LINK_PROBE 302 -#define LINK_REQUEST_REJECTED 303 -#define LINK_REQUEST_ACCEPTED 304 -#define DROP_LINK_REQUEST 305 -#define CHECK_LINK_COUNT 306 -#endif - /* * TODO: Most of the inter-cluster setup stuff should be * rewritten, and be made conformant with specification. @@ -79,16 +69,6 @@ struct link_req { }; -#if 0 -int disc_create_link(const struct tipc_link_create *argv) -{ - /* - * Code for inter cluster link setup here - */ - return TIPC_OK; -} -#endif - /* * disc_lost_link(): A link has lost contact */ diff --git a/net/tipc/discover.h b/net/tipc/discover.h index c36eaeb..9d064c3 100644 --- a/net/tipc/discover.h +++ b/net/tipc/discover.h @@ -51,8 +51,5 @@ void tipc_disc_stop_link_req(struct link_req *req); void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr); void tipc_disc_link_event(u32 addr, char *name, int up); -#if 0 -int disc_create_link(const struct tipc_link_create *argv); -#endif #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index b8cf1e9..4be78ec 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -99,23 +99,6 @@ struct link_name { char if_peer[TIPC_MAX_IF_NAME]; }; -#if 0 - -/* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */ - -/** - * struct link_event - link up/down event notification - */ - -struct link_event { - u32 addr; - int up; - void (*fcn)(u32, char *, int); - char name[TIPC_MAX_LINK_NAME]; -}; - -#endif - static void link_handle_out_of_seq_msg(struct link *l_ptr, struct sk_buff *buf); static void link_recv_proto_msg(struct link *l_ptr, struct sk_buff *buf); @@ -634,39 +617,9 @@ void tipc_link_stop(struct link *l_ptr) l_ptr->proto_msg_queue = NULL; } -#if 0 - /* LINK EVENT CODE IS NOT SUPPORTED AT PRESENT */ - -static void link_recv_event(struct link_event *ev) -{ - ev->fcn(ev->addr, ev->name, ev->up); - kfree(ev); -} - -static void link_send_event(void (*fcn)(u32 a, char *n, int up), - struct link *l_ptr, int up) -{ - struct link_event *ev; - - ev = kmalloc(sizeof(*ev), GFP_ATOMIC); - if (!ev) { - warn("Link event allocation failure\n"); - return; - } - ev->addr = l_ptr->addr; - ev->up = up; - ev->fcn = fcn; - memcpy(ev->name, l_ptr->name, TIPC_MAX_LINK_NAME); - tipc_k_signal((Handler)link_recv_event, (unsigned long)ev); -} - -#else - #define link_send_event(fcn, l_ptr, up) do { } while (0) -#endif - void tipc_link_reset(struct link *l_ptr) { struct sk_buff *buf; @@ -690,10 +643,7 @@ void tipc_link_reset(struct link *l_ptr) tipc_node_link_down(l_ptr->owner, l_ptr); tipc_bearer_remove_dest(l_ptr->b_ptr, l_ptr->addr); -#if 0 - tipc_printf(TIPC_CONS, "\nReset link <%s>\n", l_ptr->name); - dbg_link_dump(); -#endif + if (was_active_link && tipc_node_has_active_links(l_ptr->owner) && l_ptr->owner->permit_changeover) { l_ptr->reset_checkpoint = checkpoint; @@ -3197,44 +3147,6 @@ struct sk_buff *tipc_link_cmd_show_stats(const void *req_tlv_area, int req_tlv_s return buf; } -#if 0 -int link_control(const char *name, u32 op, u32 val) -{ - int res = -EINVAL; - struct link *l_ptr; - u32 bearer_id; - struct tipc_node * node; - u32 a; - - a = link_name2addr(name, &bearer_id); - read_lock_bh(&tipc_net_lock); - node = tipc_node_find(a); - if (node) { - tipc_node_lock(node); - l_ptr = node->links[bearer_id]; - if (l_ptr) { - if (op == TIPC_REMOVE_LINK) { - struct bearer *b_ptr = l_ptr->b_ptr; - spin_lock_bh(&b_ptr->publ.lock); - tipc_link_delete(l_ptr); - spin_unlock_bh(&b_ptr->publ.lock); - } - if (op == TIPC_CMD_BLOCK_LINK) { - tipc_link_reset(l_ptr); - l_ptr->blocked = 1; - } - if (op == TIPC_CMD_UNBLOCK_LINK) { - l_ptr->blocked = 0; - } - res = 0; - } - tipc_node_unlock(node); - } - read_unlock_bh(&tipc_net_lock); - return res; -} -#endif - /** * tipc_link_get_max_pkt - get maximum packet size to use when sending to destination * @dest: network address of destination node @@ -3265,28 +3177,6 @@ u32 tipc_link_get_max_pkt(u32 dest, u32 selector) return res; } -#if 0 -static void link_dump_rec_queue(struct link *l_ptr) -{ - struct sk_buff *crs; - - if (!l_ptr->oldest_deferred_in) { - info("Reception queue empty\n"); - return; - } - info("Contents of Reception queue:\n"); - crs = l_ptr->oldest_deferred_in; - while (crs) { - if (crs->data == (void *)0x0000a3a3) { - info("buffer %x invalid\n", crs); - return; - } - msg_dbg(buf_msg(crs), "In rec queue:\n"); - crs = crs->next; - } -} -#endif - static void link_dump_send_queue(struct link *l_ptr) { if (l_ptr->next_out) { diff --git a/net/tipc/link.h b/net/tipc/link.h index 26151d3..4e944ef 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -210,10 +210,6 @@ struct link { u32 msg_length_counts; u32 msg_lengths_total; u32 msg_length_profile[7]; -#if 0 - u32 sent_tunneled; - u32 recv_tunneled; -#endif } stats; struct print_buf print_buf; diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c index 9ca4b06..3a8de43 100644 --- a/net/tipc/name_table.c +++ b/net/tipc/name_table.c @@ -1009,16 +1009,6 @@ static void nametbl_list(struct print_buf *buf, u32 depth_info, } } -#if 0 -void tipc_nametbl_print(struct print_buf *buf, const char *str) -{ - tipc_printf(buf, str); - read_lock_bh(&tipc_nametbl_lock); - nametbl_list(buf, 0, 0, 0, 0); - read_unlock_bh(&tipc_nametbl_lock); -} -#endif - #define MAX_NAME_TBL_QUERY 32768 struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space) @@ -1051,13 +1041,6 @@ struct sk_buff *tipc_nametbl_get(const void *req_tlv_area, int req_tlv_space) return buf; } -#if 0 -void tipc_nametbl_dump(void) -{ - nametbl_list(TIPC_CONS, 0, 0, 0, 0); -} -#endif - int tipc_nametbl_init(void) { table.types = kcalloc(tipc_nametbl_size, sizeof(struct hlist_head), diff --git a/net/tipc/net.c b/net/tipc/net.c index 7e05af4..1a621cf 100644 --- a/net/tipc/net.c +++ b/net/tipc/net.c @@ -129,15 +129,6 @@ u32 tipc_net_select_router(u32 addr, u32 ref) return tipc_zone_select_router(tipc_net.zones[tipc_zone(addr)], addr, ref); } -#if 0 -u32 tipc_net_next_node(u32 a) -{ - if (tipc_net.zones[tipc_zone(a)]) - return tipc_zone_next_node(a); - return 0; -} -#endif - void tipc_net_remove_as_router(u32 router) { u32 z_num; diff --git a/net/tipc/node.c b/net/tipc/node.c index 7c49cd0..823e9ab 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -125,16 +125,6 @@ void tipc_node_delete(struct tipc_node *n_ptr) if (!n_ptr) return; -#if 0 - /* Not needed because links are already deleted via tipc_bearer_stop() */ - - u32 l_num; - - for (l_num = 0; l_num < MAX_BEARERS; l_num++) { - link_delete(n_ptr->links[l_num]); - } -#endif - dbg("node %x deleted\n", n_ptr->addr); kfree(n_ptr); } @@ -597,22 +587,6 @@ void tipc_node_remove_router(struct tipc_node *n_ptr, u32 router) node_lost_contact(n_ptr); } -#if 0 -void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str) -{ - u32 i; - - tipc_printf(buf, "\n\n%s", str); - for (i = 0; i < MAX_BEARERS; i++) { - if (!n_ptr->links[i]) - continue; - tipc_printf(buf, "Links[%u]: %x, ", i, n_ptr->links[i]); - } - tipc_printf(buf, "Active links: [%x,%x]\n", - n_ptr->active_links[0], n_ptr->active_links[1]); -} -#endif - u32 tipc_available_nodes(const u32 domain) { struct tipc_node *n_ptr; diff --git a/net/tipc/port.c b/net/tipc/port.c index d760336..5c4285b 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -710,50 +710,6 @@ struct sk_buff *tipc_port_get_ports(void) return buf; } -#if 0 - -#define MAX_PORT_STATS 2000 - -struct sk_buff *port_show_stats(const void *req_tlv_area, int req_tlv_space) -{ - u32 ref; - struct port *p_ptr; - struct sk_buff *buf; - struct tlv_desc *rep_tlv; - struct print_buf pb; - int str_len; - - if (!TLV_CHECK(req_tlv_area, req_tlv_space, TIPC_TLV_PORT_REF)) - return cfg_reply_error_string(TIPC_CFG_TLV_ERROR); - - ref = *(u32 *)TLV_DATA(req_tlv_area); - ref = ntohl(ref); - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return cfg_reply_error_string("port not found"); - - buf = tipc_cfg_reply_alloc(TLV_SPACE(MAX_PORT_STATS)); - if (!buf) { - tipc_port_unlock(p_ptr); - return NULL; - } - rep_tlv = (struct tlv_desc *)buf->data; - - tipc_printbuf_init(&pb, TLV_DATA(rep_tlv), MAX_PORT_STATS); - port_print(p_ptr, &pb, 1); - /* NEED TO FILL IN ADDITIONAL PORT STATISTICS HERE */ - tipc_port_unlock(p_ptr); - str_len = tipc_printbuf_validate(&pb); - - skb_put(buf, TLV_SPACE(str_len)); - TLV_SET(rep_tlv, TIPC_TLV_ULTRA_STRING, NULL, str_len); - - return buf; -} - -#endif - void tipc_port_reinit(void) { struct port *p_ptr; -- cgit v1.1 From b3d6255388de0680a14f0907deb7b7f4fa0d25d5 Mon Sep 17 00:00:00 2001 From: Kumar Sanghvi Date: Tue, 12 Oct 2010 20:14:43 +0000 Subject: Phonet: 'connect' socket implementation for Pipe controller MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Based on suggestion by Rémi Denis-Courmont to implement 'connect' for Pipe controller logic, this patch implements 'connect' socket call for the Pipe controller logic. The patch does following:- - Removes setsockopts for PNPIPE_CREATE and PNPIPE_DESTROY - Adds setsockopt for setting the Pipe handle value - Implements connect socket call - Updates the Pipe controller logic User-space should now follow below sequence with Pipe controller:- -socket -bind -setsockopt for PNPIPE_PIPE_HANDLE -connect -setsockopt for PNPIPE_ENCAP_IP -setsockopt for PNPIPE_ENABLE GPRS/3G data has been tested working fine with this. Signed-off-by: Kumar Sanghvi Acked-by: Rémi Denis-Courmont Signed-off-by: David S. Miller --- net/phonet/pep.c | 300 ++++++++++++++++++++-------------------------------- net/phonet/socket.c | 99 +++++++++++++++++ 2 files changed, 212 insertions(+), 187 deletions(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index f818f76..9c903f9 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -88,15 +88,6 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb, const struct pnpipehdr *oph = pnp_hdr(oskb); struct pnpipehdr *ph; struct sk_buff *skb; -#ifdef CONFIG_PHONET_PIPECTRLR - const struct phonethdr *hdr = pn_hdr(oskb); - struct sockaddr_pn spn = { - .spn_family = AF_PHONET, - .spn_resource = 0xD9, - .spn_dev = hdr->pn_sdev, - .spn_obj = hdr->pn_sobj, - }; -#endif skb = alloc_skb(MAX_PNPIPE_HEADER + len, priority); if (!skb) @@ -114,11 +105,7 @@ static int pep_reply(struct sock *sk, struct sk_buff *oskb, ph->pipe_handle = oph->pipe_handle; ph->error_code = code; -#ifdef CONFIG_PHONET_PIPECTRLR - return pn_skb_send(sk, skb, &spn); -#else return pn_skb_send(sk, skb, &pipe_srv); -#endif } #define PAD 0x00 @@ -188,18 +175,13 @@ static int pipe_get_flow_info(struct sock *sk, struct sk_buff *skb, return 0; } -static int pipe_handler_send_req(struct sock *sk, u16 dobj, u8 utid, - u8 msg_id, u8 p_handle, gfp_t priority) +static int pipe_handler_send_req(struct sock *sk, u8 utid, + u8 msg_id, gfp_t priority) { int len; struct pnpipehdr *ph; struct sk_buff *skb; - struct sockaddr_pn spn = { - .spn_family = AF_PHONET, - .spn_resource = 0xD9, - .spn_dev = pn_dev(dobj), - .spn_obj = pn_obj(dobj), - }; + struct pep_sock *pn = pep_sk(sk); static const u8 data[4] = { PAD, PAD, PAD, PAD, @@ -235,30 +217,25 @@ static int pipe_handler_send_req(struct sock *sk, u16 dobj, u8 utid, ph = pnp_hdr(skb); ph->utid = utid; ph->message_id = msg_id; - ph->pipe_handle = p_handle; + ph->pipe_handle = pn->pipe_handle; ph->error_code = PN_PIPE_NO_ERROR; - return pn_skb_send(sk, skb, &spn); + return pn_skb_send(sk, skb, &pn->remote_pep); } -static int pipe_handler_send_created_ind(struct sock *sk, u16 dobj, - u8 utid, u8 p_handle, u8 msg_id, u8 tx_fc, u8 rx_fc) +static int pipe_handler_send_created_ind(struct sock *sk, + u8 utid, u8 msg_id) { int err_code; struct pnpipehdr *ph; struct sk_buff *skb; - struct sockaddr_pn spn = { - .spn_family = AF_PHONET, - .spn_resource = 0xD9, - .spn_dev = pn_dev(dobj), - .spn_obj = pn_obj(dobj), - }; + struct pep_sock *pn = pep_sk(sk); static u8 data[4] = { 0x03, 0x04, }; - data[2] = tx_fc; - data[3] = rx_fc; + data[2] = pn->tx_fc; + data[3] = pn->rx_fc; /* * actually, below is number of sub-blocks and not error code. @@ -282,24 +259,18 @@ static int pipe_handler_send_created_ind(struct sock *sk, u16 dobj, ph = pnp_hdr(skb); ph->utid = utid; ph->message_id = msg_id; - ph->pipe_handle = p_handle; + ph->pipe_handle = pn->pipe_handle; ph->error_code = err_code; - return pn_skb_send(sk, skb, &spn); + return pn_skb_send(sk, skb, &pn->remote_pep); } -static int pipe_handler_send_ind(struct sock *sk, u16 dobj, u8 utid, - u8 p_handle, u8 msg_id) +static int pipe_handler_send_ind(struct sock *sk, u8 utid, u8 msg_id) { int err_code; struct pnpipehdr *ph; struct sk_buff *skb; - struct sockaddr_pn spn = { - .spn_family = AF_PHONET, - .spn_resource = 0xD9, - .spn_dev = pn_dev(dobj), - .spn_obj = pn_obj(dobj), - }; + struct pep_sock *pn = pep_sk(sk); /* * actually, below is a filler. @@ -321,10 +292,10 @@ static int pipe_handler_send_ind(struct sock *sk, u16 dobj, u8 utid, ph = pnp_hdr(skb); ph->utid = utid; ph->message_id = msg_id; - ph->pipe_handle = p_handle; + ph->pipe_handle = pn->pipe_handle; ph->error_code = err_code; - return pn_skb_send(sk, skb, &spn); + return pn_skb_send(sk, skb, &pn->remote_pep); } static int pipe_handler_enable_pipe(struct sock *sk, int enable) @@ -339,34 +310,7 @@ static int pipe_handler_enable_pipe(struct sock *sk, int enable) utid = PNS_PIPE_DISABLE_UTID; req = PNS_PEP_DISABLE_REQ; } - return pipe_handler_send_req(sk, pn->pn_sk.sobject, utid, req, - pn->pipe_handle, GFP_ATOMIC); -} - -static int pipe_handler_create_pipe(struct sock *sk, int pipe_handle, int cmd) -{ - int ret; - struct pep_sock *pn = pep_sk(sk); - - switch (cmd) { - case PNPIPE_CREATE: - ret = pipe_handler_send_req(sk, pn->pn_sk.sobject, - PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ, - pipe_handle, GFP_ATOMIC); - break; - - case PNPIPE_DESTROY: - ret = pipe_handler_send_req(sk, pn->remote_pep, - PNS_PEP_DISCONNECT_UTID, - PNS_PEP_DISCONNECT_REQ, - pn->pipe_handle, GFP_ATOMIC); - break; - - default: - ret = -EINVAL; - } - - return ret; + return pipe_handler_send_req(sk, utid, req, GFP_ATOMIC); } #endif @@ -434,14 +378,6 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; struct sk_buff *skb; -#ifdef CONFIG_PHONET_PIPECTRLR - struct sockaddr_pn spn = { - .spn_family = AF_PHONET, - .spn_resource = 0xD9, - .spn_dev = pn_dev(pn->remote_pep), - .spn_obj = pn_obj(pn->remote_pep), - }; -#endif skb = alloc_skb(MAX_PNPIPE_HEADER + 4, priority); if (!skb) @@ -462,7 +398,7 @@ static int pipe_snd_status(struct sock *sk, u8 type, u8 status, gfp_t priority) ph->data[4] = status; #ifdef CONFIG_PHONET_PIPECTRLR - return pn_skb_send(sk, skb, &spn); + return pn_skb_send(sk, skb, &pn->remote_pep); #else return pn_skb_send(sk, skb, &pipe_srv); #endif @@ -582,12 +518,6 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) struct pnpipehdr *hdr = pnp_hdr(skb); struct sk_buff_head *queue; int err = 0; -#ifdef CONFIG_PHONET_PIPECTRLR - struct phonethdr *ph = pn_hdr(skb); - static u8 host_pref_rx_fc[3], host_req_tx_fc[3]; - u8 remote_pref_rx_fc[3], remote_req_tx_fc[3]; - u8 negotiated_rx_fc, negotiated_tx_fc; -#endif BUG_ON(sk->sk_state == TCP_CLOSE_WAIT); @@ -596,40 +526,6 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) pep_reject_conn(sk, skb, PN_PIPE_ERR_PEP_IN_USE); break; -#ifdef CONFIG_PHONET_PIPECTRLR - case PNS_PEP_CONNECT_RESP: - if ((ph->pn_sdev == pn_dev(pn->remote_pep)) && - (ph->pn_sobj == pn_obj(pn->remote_pep))) { - pipe_get_flow_info(sk, skb, remote_pref_rx_fc, - remote_req_tx_fc); - - negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc, - host_pref_rx_fc, - sizeof(host_pref_rx_fc)); - negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc, - remote_pref_rx_fc, - sizeof(host_pref_rx_fc)); - - pn->pipe_state = PIPE_DISABLED; - pipe_handler_send_created_ind(sk, pn->remote_pep, - PNS_PIPE_CREATED_IND_UTID, - pn->pipe_handle, PNS_PIPE_CREATED_IND, - negotiated_tx_fc, negotiated_rx_fc); - pipe_handler_send_created_ind(sk, pn->pn_sk.sobject, - PNS_PIPE_CREATED_IND_UTID, - pn->pipe_handle, PNS_PIPE_CREATED_IND, - negotiated_tx_fc, negotiated_rx_fc); - } else { - pipe_handler_send_req(sk, pn->remote_pep, - PNS_PEP_CONNECT_UTID, - PNS_PEP_CONNECT_REQ, pn->pipe_handle, - GFP_ATOMIC); - pipe_get_flow_info(sk, skb, host_pref_rx_fc, - host_req_tx_fc); - } - break; -#endif - case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); sk->sk_state = TCP_CLOSE_WAIT; @@ -640,10 +536,7 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_PHONET_PIPECTRLR case PNS_PEP_DISCONNECT_RESP: pn->pipe_state = PIPE_IDLE; - pipe_handler_send_req(sk, pn->pn_sk.sobject, - PNS_PEP_DISCONNECT_UTID, - PNS_PEP_DISCONNECT_REQ, pn->pipe_handle, - GFP_KERNEL); + sk->sk_state = TCP_CLOSE; break; #endif @@ -654,21 +547,18 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_PHONET_PIPECTRLR case PNS_PEP_ENABLE_RESP: - if ((ph->pn_sdev == pn_dev(pn->remote_pep)) && - (ph->pn_sobj == pn_obj(pn->remote_pep))) { - pn->pipe_state = PIPE_ENABLED; - pipe_handler_send_ind(sk, pn->remote_pep, - PNS_PIPE_ENABLED_IND_UTID, - pn->pipe_handle, PNS_PIPE_ENABLED_IND); - pipe_handler_send_ind(sk, pn->pn_sk.sobject, - PNS_PIPE_ENABLED_IND_UTID, - pn->pipe_handle, PNS_PIPE_ENABLED_IND); - } else - pipe_handler_send_req(sk, pn->remote_pep, - PNS_PIPE_ENABLE_UTID, - PNS_PEP_ENABLE_REQ, pn->pipe_handle, - GFP_KERNEL); + pn->pipe_state = PIPE_ENABLED; + pipe_handler_send_ind(sk, PNS_PIPE_ENABLED_IND_UTID, + PNS_PIPE_ENABLED_IND); + if (!pn_flow_safe(pn->tx_fc)) { + atomic_set(&pn->tx_credits, 1); + sk->sk_write_space(sk); + } + if (sk->sk_state == TCP_ESTABLISHED) + break; /* Nothing to do */ + sk->sk_state = TCP_ESTABLISHED; + pipe_grant_credits(sk); break; #endif @@ -692,22 +582,12 @@ static int pipe_do_rcv(struct sock *sk, struct sk_buff *skb) #ifdef CONFIG_PHONET_PIPECTRLR case PNS_PEP_DISABLE_RESP: - if ((ph->pn_sdev == pn_dev(pn->remote_pep)) && - (ph->pn_sobj == pn_obj(pn->remote_pep))) { - pn->pipe_state = PIPE_DISABLED; - pipe_handler_send_ind(sk, pn->remote_pep, - PNS_PIPE_DISABLED_IND_UTID, - pn->pipe_handle, - PNS_PIPE_DISABLED_IND); - pipe_handler_send_ind(sk, pn->pn_sk.sobject, - PNS_PIPE_DISABLED_IND_UTID, - pn->pipe_handle, - PNS_PIPE_DISABLED_IND); - } else - pipe_handler_send_req(sk, pn->remote_pep, - PNS_PIPE_DISABLE_UTID, - PNS_PEP_DISABLE_REQ, pn->pipe_handle, - GFP_KERNEL); + pn->pipe_state = PIPE_DISABLED; + atomic_set(&pn->tx_credits, 0); + pipe_handler_send_ind(sk, PNS_PIPE_DISABLED_IND_UTID, + PNS_PIPE_DISABLED_IND); + sk->sk_state = TCP_SYN_RECV; + pn->rx_credits = 0; break; #endif @@ -802,6 +682,42 @@ static void pipe_destruct(struct sock *sk) skb_queue_purge(&pn->ctrlreq_queue); } +#ifdef CONFIG_PHONET_PIPECTRLR +static int pep_connresp_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct pep_sock *pn = pep_sk(sk); + u8 host_pref_rx_fc[3] = {3, 2, 1}, host_req_tx_fc[3] = {3, 2, 1}; + u8 remote_pref_rx_fc[3], remote_req_tx_fc[3]; + u8 negotiated_rx_fc, negotiated_tx_fc; + int ret; + + pipe_get_flow_info(sk, skb, remote_pref_rx_fc, + remote_req_tx_fc); + negotiated_tx_fc = pipe_negotiate_fc(remote_req_tx_fc, + host_pref_rx_fc, + sizeof(host_pref_rx_fc)); + negotiated_rx_fc = pipe_negotiate_fc(host_req_tx_fc, + remote_pref_rx_fc, + sizeof(host_pref_rx_fc)); + + pn->pipe_state = PIPE_DISABLED; + sk->sk_state = TCP_SYN_RECV; + sk->sk_backlog_rcv = pipe_do_rcv; + sk->sk_destruct = pipe_destruct; + pn->rx_credits = 0; + pn->rx_fc = negotiated_rx_fc; + pn->tx_fc = negotiated_tx_fc; + sk->sk_state_change(sk); + + ret = pipe_handler_send_created_ind(sk, + PNS_PIPE_CREATED_IND_UTID, + PNS_PIPE_CREATED_IND + ); + + return ret; +} +#endif + static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb) { struct sock *newsk; @@ -884,9 +800,6 @@ static int pep_connreq_rcv(struct sock *sk, struct sk_buff *skb) newpn->rx_fc = newpn->tx_fc = PN_LEGACY_FLOW_CONTROL; newpn->init_enable = enabled; newpn->aligned = aligned; -#ifdef CONFIG_PHONET_PIPECTRLR - newpn->remote_pep = pn->remote_pep; -#endif BUG_ON(!skb_queue_empty(&newsk->sk_receive_queue)); skb_queue_head(&newsk->sk_receive_queue, skb); @@ -968,6 +881,12 @@ static int pep_do_rcv(struct sock *sk, struct sk_buff *skb) err = pep_connreq_rcv(sk, skb); break; +#ifdef CONFIG_PHONET_PIPECTRLR + case PNS_PEP_CONNECT_RESP: + err = pep_connresp_rcv(sk, skb); + break; +#endif + case PNS_PEP_DISCONNECT_REQ: pep_reply(sk, skb, PN_PIPE_NO_ERROR, NULL, 0, GFP_ATOMIC); break; @@ -1032,6 +951,18 @@ static void pep_sock_close(struct sock *sk, long timeout) /* Forcefully remove dangling Phonet pipe */ pipe_do_remove(sk); +#ifdef CONFIG_PHONET_PIPECTRLR + if (pn->pipe_state != PIPE_IDLE) { + /* send pep disconnect request */ + pipe_handler_send_req(sk, + PNS_PEP_DISCONNECT_UTID, PNS_PEP_DISCONNECT_REQ, + GFP_KERNEL); + + pn->pipe_state = PIPE_IDLE; + sk->sk_state = TCP_CLOSE; + } +#endif + ifindex = pn->ifindex; pn->ifindex = 0; release_sock(sk); @@ -1108,6 +1039,20 @@ out: return newsk; } +#ifdef CONFIG_PHONET_PIPECTRLR +static int pep_sock_connect(struct sock *sk, struct sockaddr *addr, int len) +{ + struct pep_sock *pn = pep_sk(sk); + struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; + + memcpy(&pn->remote_pep, spn, sizeof(struct sockaddr_pn)); + + return pipe_handler_send_req(sk, + PNS_PEP_CONNECT_UTID, PNS_PEP_CONNECT_REQ, + GFP_ATOMIC); +} +#endif + static int pep_ioctl(struct sock *sk, int cmd, unsigned long arg) { struct pep_sock *pn = pep_sk(sk); @@ -1149,10 +1094,6 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, { struct pep_sock *pn = pep_sk(sk); int val = 0, err = 0; -#ifdef CONFIG_PHONET_PIPECTRLR - int remote_pep; - int pipe_handle; -#endif if (level != SOL_PNPIPE) return -ENOPROTOOPT; @@ -1164,28 +1105,15 @@ static int pep_setsockopt(struct sock *sk, int level, int optname, lock_sock(sk); switch (optname) { #ifdef CONFIG_PHONET_PIPECTRLR - case PNPIPE_CREATE: + case PNPIPE_PIPE_HANDLE: if (val) { if (pn->pipe_state > PIPE_IDLE) { err = -EFAULT; break; } - remote_pep = val & 0xFFFF; - pipe_handle = (val >> 16) & 0xFF; - pn->remote_pep = remote_pep; - err = pipe_handler_create_pipe(sk, pipe_handle, - PNPIPE_CREATE); - break; - } - - case PNPIPE_DESTROY: - if (pn->pipe_state < PIPE_DISABLED) { - err = -EFAULT; + pn->pipe_handle = val; break; } - - err = pipe_handler_create_pipe(sk, 0x0, PNPIPE_DESTROY); - break; #endif case PNPIPE_ENCAP: @@ -1278,14 +1206,6 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) struct pep_sock *pn = pep_sk(sk); struct pnpipehdr *ph; int err; -#ifdef CONFIG_PHONET_PIPECTRLR - struct sockaddr_pn spn = { - .spn_family = AF_PHONET, - .spn_resource = 0xD9, - .spn_dev = pn_dev(pn->remote_pep), - .spn_obj = pn_obj(pn->remote_pep), - }; -#endif if (pn_flow_safe(pn->tx_fc) && !atomic_add_unless(&pn->tx_credits, -1, 0)) { @@ -1304,7 +1224,7 @@ static int pipe_skb_send(struct sock *sk, struct sk_buff *skb) ph->message_id = PNS_PIPE_DATA; ph->pipe_handle = pn->pipe_handle; #ifdef CONFIG_PHONET_PIPECTRLR - err = pn_skb_send(sk, skb, &spn); + err = pn_skb_send(sk, skb, &pn->remote_pep); #else err = pn_skb_send(sk, skb, &pipe_srv); #endif @@ -1504,6 +1424,8 @@ static void pep_sock_unhash(struct sock *sk) struct sock *skparent = NULL; lock_sock(sk); + +#ifndef CONFIG_PHONET_PIPECTRLR if ((1 << sk->sk_state) & ~(TCPF_CLOSE|TCPF_LISTEN)) { skparent = pn->listener; release_sock(sk); @@ -1513,6 +1435,7 @@ static void pep_sock_unhash(struct sock *sk) sk_del_node_init(sk); sk = skparent; } +#endif /* Unhash a listening sock only when it is closed * and all of its active connected pipes are closed. */ if (hlist_empty(&pn->hlist)) @@ -1526,6 +1449,9 @@ static void pep_sock_unhash(struct sock *sk) static struct proto pep_proto = { .close = pep_sock_close, .accept = pep_sock_accept, +#ifdef CONFIG_PHONET_PIPECTRLR + .connect = pep_sock_connect, +#endif .ioctl = pep_ioctl, .init = pep_init, .setsockopt = pep_setsockopt, diff --git a/net/phonet/socket.c b/net/phonet/socket.c index aca8fba..25f746d 100644 --- a/net/phonet/socket.c +++ b/net/phonet/socket.c @@ -225,6 +225,101 @@ static int pn_socket_autobind(struct socket *sock) return 0; /* socket was already bound */ } +#ifdef CONFIG_PHONET_PIPECTRLR +static int pn_socket_connect(struct socket *sock, struct sockaddr *addr, + int len, int flags) +{ + struct sock *sk = sock->sk; + struct sockaddr_pn *spn = (struct sockaddr_pn *)addr; + long timeo; + int err; + + if (len < sizeof(struct sockaddr_pn)) + return -EINVAL; + if (spn->spn_family != AF_PHONET) + return -EAFNOSUPPORT; + + lock_sock(sk); + + switch (sock->state) { + case SS_UNCONNECTED: + sk->sk_state = TCP_CLOSE; + break; + case SS_CONNECTING: + switch (sk->sk_state) { + case TCP_SYN_RECV: + sock->state = SS_CONNECTED; + err = -EISCONN; + goto out; + case TCP_CLOSE: + err = -EALREADY; + if (flags & O_NONBLOCK) + goto out; + goto wait_connect; + } + break; + case SS_CONNECTED: + switch (sk->sk_state) { + case TCP_SYN_RECV: + err = -EISCONN; + goto out; + case TCP_CLOSE: + sock->state = SS_UNCONNECTED; + break; + } + break; + case SS_DISCONNECTING: + case SS_FREE: + break; + } + sk->sk_state = TCP_CLOSE; + sk_stream_kill_queues(sk); + + sock->state = SS_CONNECTING; + err = sk->sk_prot->connect(sk, addr, len); + if (err < 0) { + sock->state = SS_UNCONNECTED; + sk->sk_state = TCP_CLOSE; + goto out; + } + + err = -EINPROGRESS; +wait_connect: + if (sk->sk_state != TCP_SYN_RECV && (flags & O_NONBLOCK)) + goto out; + + timeo = sock_sndtimeo(sk, flags & O_NONBLOCK); + release_sock(sk); + + err = -ERESTARTSYS; + timeo = wait_event_interruptible_timeout(*sk_sleep(sk), + sk->sk_state != TCP_CLOSE, + timeo); + + lock_sock(sk); + if (timeo < 0) + goto out; /* -ERESTARTSYS */ + + err = -ETIMEDOUT; + if (timeo == 0 && sk->sk_state != TCP_SYN_RECV) + goto out; + + if (sk->sk_state != TCP_SYN_RECV) { + sock->state = SS_UNCONNECTED; + err = sock_error(sk); + if (!err) + err = -ECONNREFUSED; + goto out; + } + sock->state = SS_CONNECTED; + err = 0; + +out: + release_sock(sk); + return err; +} +#endif + static int pn_socket_accept(struct socket *sock, struct socket *newsock, int flags) { @@ -393,7 +488,11 @@ const struct proto_ops phonet_stream_ops = { .owner = THIS_MODULE, .release = pn_socket_release, .bind = pn_socket_bind, +#ifdef CONFIG_PHONET_PIPECTRLR + .connect = pn_socket_connect, +#else .connect = sock_no_connect, +#endif .socketpair = sock_no_socketpair, .accept = pn_socket_accept, .getname = pn_socket_getname, -- cgit v1.1 From 9ebad4ab87f2ffa6eca825327721e647c7457264 Mon Sep 17 00:00:00 2001 From: Johannes Berg Date: Thu, 14 Oct 2010 13:41:35 +0200 Subject: radiotap: fix vendor namespace parsing There's a bug with radiotap vendor namespace parsing if you don't register for the given namespace extensions. Fix this by passing only the unknown vendor namespaces and the registered data to frontends, but not both. Signed-off-by: Johannes Berg Signed-off-by: John W. Linville --- net/wireless/radiotap.c | 58 +++++++++++++++++++++++++++---------------------- 1 file changed, 32 insertions(+), 26 deletions(-) (limited to 'net') diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c index c774bc0..dbe35e1 100644 --- a/net/wireless/radiotap.c +++ b/net/wireless/radiotap.c @@ -201,7 +201,7 @@ int ieee80211_radiotap_iterator_next( { while (1) { int hit = 0; - int pad, align, size, subns, vnslen; + int pad, align, size, subns; uint32_t oui; /* if no more EXT bits, that's it */ @@ -261,6 +261,27 @@ int ieee80211_radiotap_iterator_next( if (pad) iterator->_arg += align - pad; + if (iterator->_arg_index % 32 == IEEE80211_RADIOTAP_VENDOR_NAMESPACE) { + int vnslen; + + if ((unsigned long)iterator->_arg + size - + (unsigned long)iterator->_rtheader > + (unsigned long)iterator->_max_length) + return -EINVAL; + + oui = (*iterator->_arg << 16) | + (*(iterator->_arg + 1) << 8) | + *(iterator->_arg + 2); + subns = *(iterator->_arg + 3); + + find_ns(iterator, oui, subns); + + vnslen = get_unaligned_le16(iterator->_arg + 4); + iterator->_next_ns_data = iterator->_arg + size + vnslen; + if (!iterator->current_namespace) + size += vnslen; + } + /* * this is what we will return to user, but we need to * move on first so next call has something fresh to test @@ -287,40 +308,25 @@ int ieee80211_radiotap_iterator_next( /* these special ones are valid in each bitmap word */ switch (iterator->_arg_index % 32) { case IEEE80211_RADIOTAP_VENDOR_NAMESPACE: - iterator->_bitmap_shifter >>= 1; - iterator->_arg_index++; - iterator->_reset_on_ext = 1; - vnslen = get_unaligned_le16(iterator->this_arg + 4); - iterator->_next_ns_data = iterator->_arg + vnslen; - oui = (*iterator->this_arg << 16) | - (*(iterator->this_arg + 1) << 8) | - *(iterator->this_arg + 2); - subns = *(iterator->this_arg + 3); - - find_ns(iterator, oui, subns); - iterator->is_radiotap_ns = 0; - /* allow parsers to show this information */ + /* + * If parser didn't register this vendor + * namespace with us, allow it to show it + * as 'raw. Do do that, set argument index + * to vendor namespace. + */ iterator->this_arg_index = IEEE80211_RADIOTAP_VENDOR_NAMESPACE; - iterator->this_arg_size += vnslen; - if ((unsigned long)iterator->this_arg + - iterator->this_arg_size - - (unsigned long)iterator->_rtheader > - (unsigned long)(unsigned long)iterator->_max_length) - return -EINVAL; - hit = 1; - break; + if (!iterator->current_namespace) + hit = 1; + goto next_entry; case IEEE80211_RADIOTAP_RADIOTAP_NAMESPACE: - iterator->_bitmap_shifter >>= 1; - iterator->_arg_index++; - iterator->_reset_on_ext = 1; iterator->current_namespace = &radiotap_ns; iterator->is_radiotap_ns = 1; - break; + goto next_entry; case IEEE80211_RADIOTAP_EXT: /* * bit 31 was set, there is more -- cgit v1.1 From 564824b0c52c34692d804bb6ea214451615b0b50 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 11 Oct 2010 19:05:25 +0000 Subject: net: allocate skbs on local node commit b30973f877 (node-aware skb allocation) spread a wrong habit of allocating net drivers skbs on a given memory node : The one closest to the NIC hardware. This is wrong because as soon as we try to scale network stack, we need to use many cpus to handle traffic and hit slub/slab management on cross-node allocations/frees when these cpus have to alloc/free skbs bound to a central node. skb allocated in RX path are ephemeral, they have a very short lifetime : Extra cost to maintain NUMA affinity is too expensive. What appeared as a nice idea four years ago is in fact a bad one. In 2010, NIC hardwares are multiqueue, or we use RPS to spread the load, and two 10Gb NIC might deliver more than 28 million packets per second, needing all the available cpus. Cost of cross-node handling in network and vm stacks outperforms the small benefit hardware had when doing its DMA transfert in its 'local' memory node at RX time. Even trying to differentiate the two allocations done for one skb (the sk_buff on local node, the data part on NIC hardware node) is not enough to bring good performance. Signed-off-by: Eric Dumazet Acked-by: Tom Herbert Signed-off-by: David S. Miller --- net/core/skbuff.c | 13 +------------ 1 file changed, 1 insertion(+), 12 deletions(-) (limited to 'net') diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 752c197..4e8b82e 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -247,10 +247,9 @@ EXPORT_SYMBOL(__alloc_skb); struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int length, gfp_t gfp_mask) { - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; struct sk_buff *skb; - skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, node); + skb = __alloc_skb(length + NET_SKB_PAD, gfp_mask, 0, NUMA_NO_NODE); if (likely(skb)) { skb_reserve(skb, NET_SKB_PAD); skb->dev = dev; @@ -259,16 +258,6 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, } EXPORT_SYMBOL(__netdev_alloc_skb); -struct page *__netdev_alloc_page(struct net_device *dev, gfp_t gfp_mask) -{ - int node = dev->dev.parent ? dev_to_node(dev->dev.parent) : -1; - struct page *page; - - page = alloc_pages_node(node, gfp_mask, 0); - return page; -} -EXPORT_SYMBOL(__netdev_alloc_page); - void skb_add_rx_frag(struct sk_buff *skb, int i, struct page *page, int off, int size) { -- cgit v1.1 From 2c1c00040af3d187b7d602dd53b5adbbf3a4caa4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Oct 2010 02:45:40 +0000 Subject: fib6: use FIB_LOOKUP_NOREF in fib6_rule_lookup() Avoid two atomic ops on found rule in fib6_rule_lookup() Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv6/fib6_rules.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/ipv6/fib6_rules.c b/net/ipv6/fib6_rules.c index b1108ed..d829874 100644 --- a/net/ipv6/fib6_rules.c +++ b/net/ipv6/fib6_rules.c @@ -34,11 +34,10 @@ struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi *fl, { struct fib_lookup_arg arg = { .lookup_ptr = lookup, + .flags = FIB_LOOKUP_NOREF, }; fib_rules_lookup(net->ipv6.fib6_rules_ops, fl, flags, &arg); - if (arg.rule) - fib_rule_put(arg.rule); if (arg.result) return arg.result; -- cgit v1.1 From a0a4a85a15df6335e3d11f83b2ac06ebebea313f Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Oct 2010 04:43:04 +0000 Subject: fib: remove a useless synchronize_rcu() call fib_nl_delrule() calls synchronize_rcu() for no apparent reason, while rtnl is held. I suspect it was done to avoid an atomic_inc_not_zero() in fib_rules_lookup(), which commit 7fa7cb7109d07 added anyway. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/fib_rules.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c index 21698f8..1bc3f25 100644 --- a/net/core/fib_rules.c +++ b/net/core/fib_rules.c @@ -494,7 +494,6 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg) } } - synchronize_rcu(); notify_rule_change(RTM_DELRULE, rule, ops, nlh, NETLINK_CB(skb).pid); fib_rule_put(rule); -- cgit v1.1 From 874ffa8f72444d6253d2669fed304875c128f86b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Oct 2010 06:56:11 +0000 Subject: fib_trie: use fls() instead of open coded loop fib_table_lookup() might use fls() to speedup an open coded loop. Noticed while doing a profile analysis. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 271c89b..31494f3 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1384,8 +1384,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, t_key cindex = 0; int current_prefix_length = KEYLENGTH; struct tnode *cn; - t_key node_prefix, key_prefix, pref_mismatch; - int mp; + t_key pref_mismatch; rcu_read_lock(); @@ -1500,10 +1499,7 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, * matching prefix. */ - node_prefix = mask_pfx(cn->key, cn->pos); - key_prefix = mask_pfx(key, cn->pos); - pref_mismatch = key_prefix^node_prefix; - mp = 0; + pref_mismatch = mask_pfx(cn->key ^ key, cn->pos); /* * In short: If skipped bits in this node do not match @@ -1511,13 +1507,9 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi *flp, * state.directly. */ if (pref_mismatch) { - while (!(pref_mismatch & (1<<(KEYLENGTH-1)))) { - mp++; - pref_mismatch = pref_mismatch << 1; - } - key_prefix = tkey_extract_bits(cn->key, mp, cn->pos-mp); + int mp = KEYLENGTH - fls(pref_mismatch); - if (key_prefix != 0) + if (tkey_extract_bits(cn->key, mp, cn->pos - mp) != 0) goto backtrace; if (current_prefix_length >= cn->pos) -- cgit v1.1 From 10da66f7552b3c7966c2f4f1f72009fb0b5539ec Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 13 Oct 2010 08:22:03 +0000 Subject: fib: avoid false sharing on fib_table_hash While doing profile analysis, I found fib_hash_table was sometime in a cache line shared by a possibly often written kernel structure. (CONFIG_IP_ROUTE_MULTIPATH || !CONFIG_IPV6_MULTIPLE_TABLES) It's hard to detect because not easily reproductible. Make sure we allocate a full cache line to keep this shared in all cpus caches. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 11 +++++------ net/ipv6/ip6_fib.c | 9 ++++++--- 2 files changed, 11 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 919f2ad..3df057e 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -1016,16 +1016,15 @@ static struct notifier_block fib_netdev_notifier = { static int __net_init ip_fib_net_init(struct net *net) { int err; - unsigned int i; + size_t size = sizeof(struct hlist_head) * FIB_TABLE_HASHSZ; + + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); - net->ipv4.fib_table_hash = kzalloc( - sizeof(struct hlist_head)*FIB_TABLE_HASHSZ, GFP_KERNEL); + net->ipv4.fib_table_hash = kzalloc(size, GFP_KERNEL); if (net->ipv4.fib_table_hash == NULL) return -ENOMEM; - for (i = 0; i < FIB_TABLE_HASHSZ; i++) - INIT_HLIST_HEAD(&net->ipv4.fib_table_hash[i]); - err = fib4_rules_init(net); if (err < 0) goto fail; diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index b6a5859..de38211 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1500,15 +1500,18 @@ static void fib6_gc_timer_cb(unsigned long arg) static int __net_init fib6_net_init(struct net *net) { + size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ; + setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net); net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL); if (!net->ipv6.rt6_stats) goto out_timer; - net->ipv6.fib_table_hash = kcalloc(FIB6_TABLE_HASHSZ, - sizeof(*net->ipv6.fib_table_hash), - GFP_KERNEL); + /* Avoid false sharing : Use at least a full cache line */ + size = max_t(size_t, size, L1_CACHE_BYTES); + + net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL); if (!net->ipv6.fib_table_hash) goto out_rt6_stats; -- cgit v1.1 From 31e3c3f6f1f9b154981a0e6620df700463db30ee Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Wed, 13 Oct 2010 13:20:35 +0000 Subject: tipc: cleanup function namespace Do some cleanups of TIPC based on make namespacecheck 1. Don't export unused symbols 2. Eliminate dead code 3. Make functions and variables local 4. Rename buf_acquire to tipc_buf_acquire since it is used in several files Compile tested only. This make break out of tree kernel modules that depend on TIPC routines. Signed-off-by: Stephen Hemminger Acked-by: Jon Maloy Acked-by: Paul Gortmaker Signed-off-by: David S. Miller --- net/tipc/addr.c | 5 -- net/tipc/bcast.c | 10 ++- net/tipc/bcast.h | 3 - net/tipc/cluster.c | 21 +---- net/tipc/cluster.h | 2 +- net/tipc/config.c | 7 +- net/tipc/config.h | 6 -- net/tipc/core.c | 28 ++---- net/tipc/core.h | 9 +- net/tipc/dbg.c | 13 ++- net/tipc/dbg.h | 3 - net/tipc/discover.c | 16 +--- net/tipc/discover.h | 2 - net/tipc/link.c | 45 +++++----- net/tipc/link.h | 4 - net/tipc/msg.c | 2 +- net/tipc/name_distr.c | 2 +- net/tipc/node.c | 19 +--- net/tipc/node.h | 1 - net/tipc/port.c | 234 +++++--------------------------------------------- net/tipc/port.h | 2 - net/tipc/ref.c | 17 ---- net/tipc/ref.h | 1 - net/tipc/subscr.c | 9 -- net/tipc/zone.c | 11 --- net/tipc/zone.h | 1 - 26 files changed, 84 insertions(+), 389 deletions(-) (limited to 'net') diff --git a/net/tipc/addr.c b/net/tipc/addr.c index 2ddc351..8a2e89b 100644 --- a/net/tipc/addr.c +++ b/net/tipc/addr.c @@ -41,11 +41,6 @@ #include "cluster.h" #include "net.h" -u32 tipc_get_addr(void) -{ - return tipc_own_addr; -} - /** * tipc_addr_domain_valid - validates a network domain address * diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c index ecfaac1..22a60fc 100644 --- a/net/tipc/bcast.c +++ b/net/tipc/bcast.c @@ -121,6 +121,9 @@ static DEFINE_SPINLOCK(bc_lock); const char tipc_bclink_name[] = "broadcast-link"; +static void tipc_nmap_diff(struct tipc_node_map *nm_a, + struct tipc_node_map *nm_b, + struct tipc_node_map *nm_diff); static u32 buf_seqno(struct sk_buff *buf) { @@ -287,7 +290,7 @@ static void bclink_send_nack(struct tipc_node *n_ptr) if (!less(n_ptr->bclink.gap_after, n_ptr->bclink.gap_to)) return; - buf = buf_acquire(INT_H_SIZE); + buf = tipc_buf_acquire(INT_H_SIZE); if (buf) { msg = buf_msg(buf); tipc_msg_init(msg, BCAST_PROTOCOL, STATE_MSG, @@ -871,8 +874,9 @@ void tipc_nmap_remove(struct tipc_node_map *nm_ptr, u32 node) * @nm_diff: output node map A-B (i.e. nodes of A that are not in B) */ -void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b, - struct tipc_node_map *nm_diff) +static void tipc_nmap_diff(struct tipc_node_map *nm_a, + struct tipc_node_map *nm_b, + struct tipc_node_map *nm_diff) { int stop = ARRAY_SIZE(nm_a->map); int w; diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h index e8c2b81..011c03f 100644 --- a/net/tipc/bcast.h +++ b/net/tipc/bcast.h @@ -84,9 +84,6 @@ static inline int tipc_nmap_equal(struct tipc_node_map *nm_a, struct tipc_node_m return !memcmp(nm_a, nm_b, sizeof(*nm_a)); } -void tipc_nmap_diff(struct tipc_node_map *nm_a, struct tipc_node_map *nm_b, - struct tipc_node_map *nm_diff); - void tipc_port_list_add(struct port_list *pl_ptr, u32 port); void tipc_port_list_free(struct port_list *pl_ptr); diff --git a/net/tipc/cluster.c b/net/tipc/cluster.c index e68f705..7fea14b 100644 --- a/net/tipc/cluster.c +++ b/net/tipc/cluster.c @@ -113,25 +113,6 @@ void tipc_cltr_delete(struct cluster *c_ptr) kfree(c_ptr); } -u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr) -{ - struct tipc_node *n_ptr; - u32 n_num = tipc_node(addr) + 1; - - if (!c_ptr) - return addr; - for (; n_num <= c_ptr->highest_node; n_num++) { - n_ptr = c_ptr->nodes[n_num]; - if (n_ptr && tipc_node_has_active_links(n_ptr)) - return n_ptr->addr; - } - for (n_num = 1; n_num < tipc_node(addr); n_num++) { - n_ptr = c_ptr->nodes[n_num]; - if (n_ptr && tipc_node_has_active_links(n_ptr)) - return n_ptr->addr; - } - return 0; -} void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr) { @@ -232,7 +213,7 @@ struct tipc_node *tipc_cltr_select_node(struct cluster *c_ptr, u32 selector) static struct sk_buff *tipc_cltr_prepare_routing_msg(u32 data_size, u32 dest) { u32 size = INT_H_SIZE + data_size; - struct sk_buff *buf = buf_acquire(size); + struct sk_buff *buf = tipc_buf_acquire(size); struct tipc_msg *msg; if (buf) { diff --git a/net/tipc/cluster.h b/net/tipc/cluster.h index 333efb0..32636d9 100644 --- a/net/tipc/cluster.h +++ b/net/tipc/cluster.h @@ -75,7 +75,7 @@ void tipc_cltr_attach_node(struct cluster *c_ptr, struct tipc_node *n_ptr); void tipc_cltr_send_slave_routes(struct cluster *c_ptr, u32 dest); void tipc_cltr_broadcast(struct sk_buff *buf); int tipc_cltr_init(void); -u32 tipc_cltr_next_node(struct cluster *c_ptr, u32 addr); + void tipc_cltr_bcast_new_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi); void tipc_cltr_send_local_routes(struct cluster *c_ptr, u32 dest); void tipc_cltr_bcast_lost_route(struct cluster *c_ptr, u32 dest, u32 lo, u32 hi); diff --git a/net/tipc/config.c b/net/tipc/config.c index c429b0d..50a6133 100644 --- a/net/tipc/config.c +++ b/net/tipc/config.c @@ -95,7 +95,7 @@ int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type, return 1; } -struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value) +static struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value) { struct sk_buff *buf; __be32 value_net; @@ -109,6 +109,11 @@ struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value) return buf; } +static struct sk_buff *tipc_cfg_reply_unsigned(u32 value) +{ + return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value); +} + struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string) { struct sk_buff *buf; diff --git a/net/tipc/config.h b/net/tipc/config.h index 5cd7cc5..481e12e 100644 --- a/net/tipc/config.h +++ b/net/tipc/config.h @@ -45,7 +45,6 @@ struct sk_buff *tipc_cfg_reply_alloc(int payload_size); int tipc_cfg_append_tlv(struct sk_buff *buf, int tlv_type, void *tlv_data, int tlv_data_size); -struct sk_buff *tipc_cfg_reply_unsigned_type(u16 tlv_type, u32 value); struct sk_buff *tipc_cfg_reply_string_type(u16 tlv_type, char *string); static inline struct sk_buff *tipc_cfg_reply_none(void) @@ -53,11 +52,6 @@ static inline struct sk_buff *tipc_cfg_reply_none(void) return tipc_cfg_reply_alloc(0); } -static inline struct sk_buff *tipc_cfg_reply_unsigned(u32 value) -{ - return tipc_cfg_reply_unsigned_type(TIPC_TLV_UNSIGNED, value); -} - static inline struct sk_buff *tipc_cfg_reply_error_string(char *string) { return tipc_cfg_reply_string_type(TIPC_TLV_ERROR_STRING, string); diff --git a/net/tipc/core.c b/net/tipc/core.c index 466b861..c005303 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -96,13 +96,13 @@ int tipc_net_id; int tipc_remote_management; -int tipc_get_mode(void) +static int tipc_get_mode(void) { return tipc_mode; } /** - * buf_acquire - creates a TIPC message buffer + * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) * * Returns a new buffer with data pointers set to the specified size. @@ -111,7 +111,7 @@ int tipc_get_mode(void) * There may also be unrequested tailroom present at the buffer's end. */ -struct sk_buff *buf_acquire(u32 size) +struct sk_buff *tipc_buf_acquire(u32 size) { struct sk_buff *skb; unsigned int buf_size = (BUF_HEADROOM + size + 3) & ~3u; @@ -129,7 +129,7 @@ struct sk_buff *buf_acquire(u32 size) * tipc_core_stop_net - shut down TIPC networking sub-systems */ -void tipc_core_stop_net(void) +static void tipc_core_stop_net(void) { tipc_eth_media_stop(); tipc_net_stop(); @@ -154,7 +154,7 @@ int tipc_core_start_net(unsigned long addr) * tipc_core_stop - switch TIPC from SINGLE NODE to NOT RUNNING mode */ -void tipc_core_stop(void) +static void tipc_core_stop(void) { if (tipc_mode != TIPC_NODE_MODE) return; @@ -176,7 +176,7 @@ void tipc_core_stop(void) * tipc_core_start - switch TIPC from NOT RUNNING to SINGLE NODE mode */ -int tipc_core_start(void) +static int tipc_core_start(void) { int res; @@ -246,7 +246,6 @@ MODULE_VERSION(TIPC_MOD_VER); EXPORT_SYMBOL(tipc_attach); EXPORT_SYMBOL(tipc_detach); -EXPORT_SYMBOL(tipc_get_addr); EXPORT_SYMBOL(tipc_get_mode); EXPORT_SYMBOL(tipc_createport); EXPORT_SYMBOL(tipc_deleteport); @@ -262,23 +261,10 @@ EXPORT_SYMBOL(tipc_withdraw); EXPORT_SYMBOL(tipc_connect2port); EXPORT_SYMBOL(tipc_disconnect); EXPORT_SYMBOL(tipc_shutdown); -EXPORT_SYMBOL(tipc_isconnected); -EXPORT_SYMBOL(tipc_peer); -EXPORT_SYMBOL(tipc_ref_valid); EXPORT_SYMBOL(tipc_send); -EXPORT_SYMBOL(tipc_send_buf); EXPORT_SYMBOL(tipc_send2name); -EXPORT_SYMBOL(tipc_forward2name); -EXPORT_SYMBOL(tipc_send_buf2name); -EXPORT_SYMBOL(tipc_forward_buf2name); EXPORT_SYMBOL(tipc_send2port); -EXPORT_SYMBOL(tipc_forward2port); -EXPORT_SYMBOL(tipc_send_buf2port); -EXPORT_SYMBOL(tipc_forward_buf2port); EXPORT_SYMBOL(tipc_multicast); -/* EXPORT_SYMBOL(tipc_multicast_buf); not available yet */ -EXPORT_SYMBOL(tipc_ispublished); -EXPORT_SYMBOL(tipc_available_nodes); /* TIPC API for external bearers (see tipc_bearer.h) */ @@ -295,6 +281,4 @@ EXPORT_SYMBOL(tipc_createport_raw); EXPORT_SYMBOL(tipc_reject_msg); EXPORT_SYMBOL(tipc_send_buf_fast); EXPORT_SYMBOL(tipc_acknowledge); -EXPORT_SYMBOL(tipc_get_port); -EXPORT_SYMBOL(tipc_get_handle); diff --git a/net/tipc/core.h b/net/tipc/core.h index 1887990..e19389e 100644 --- a/net/tipc/core.h +++ b/net/tipc/core.h @@ -83,9 +83,7 @@ * Note: TIPC_LOG is configured to echo its output to the system console; * user-defined buffers can be configured to do the same thing. */ - extern struct print_buf *const TIPC_NULL; -extern struct print_buf *const TIPC_CONS; extern struct print_buf *const TIPC_LOG; void tipc_printf(struct print_buf *, const char *fmt, ...); @@ -204,10 +202,7 @@ extern atomic_t tipc_user_count; * Routines available to privileged subsystems */ -extern int tipc_core_start(void); -extern void tipc_core_stop(void); -extern int tipc_core_start_net(unsigned long addr); -extern void tipc_core_stop_net(void); +extern int tipc_core_start_net(unsigned long); extern int tipc_handler_start(void); extern void tipc_handler_stop(void); extern int tipc_netlink_start(void); @@ -328,7 +323,7 @@ static inline struct tipc_msg *buf_msg(struct sk_buff *skb) return (struct tipc_msg *)skb->data; } -extern struct sk_buff *buf_acquire(u32 size); +extern struct sk_buff *tipc_buf_acquire(u32 size); /** * buf_discard - frees a TIPC message buffer diff --git a/net/tipc/dbg.c b/net/tipc/dbg.c index 6569d45..46f51d2 100644 --- a/net/tipc/dbg.c +++ b/net/tipc/dbg.c @@ -52,7 +52,7 @@ static struct print_buf null_buf = { NULL, 0, NULL, 0 }; struct print_buf *const TIPC_NULL = &null_buf; static struct print_buf cons_buf = { NULL, 0, NULL, 1 }; -struct print_buf *const TIPC_CONS = &cons_buf; +static struct print_buf *const TIPC_CONS = &cons_buf; static struct print_buf log_buf = { NULL, 0, NULL, 1 }; struct print_buf *const TIPC_LOG = &log_buf; @@ -76,6 +76,10 @@ struct print_buf *const TIPC_LOG = &log_buf; static char print_string[TIPC_PB_MAX_STR]; static DEFINE_SPINLOCK(print_lock); +static void tipc_printbuf_reset(struct print_buf *pb); +static int tipc_printbuf_empty(struct print_buf *pb); +static void tipc_printbuf_move(struct print_buf *pb_to, + struct print_buf *pb_from); #define FORMAT(PTR,LEN,FMT) \ {\ @@ -116,7 +120,7 @@ void tipc_printbuf_init(struct print_buf *pb, char *raw, u32 size) * @pb: pointer to print buffer structure */ -void tipc_printbuf_reset(struct print_buf *pb) +static void tipc_printbuf_reset(struct print_buf *pb) { if (pb->buf) { pb->crs = pb->buf; @@ -132,7 +136,7 @@ void tipc_printbuf_reset(struct print_buf *pb) * Returns non-zero if print buffer is empty. */ -int tipc_printbuf_empty(struct print_buf *pb) +static int tipc_printbuf_empty(struct print_buf *pb) { return !pb->buf || (pb->crs == pb->buf); } @@ -181,7 +185,8 @@ int tipc_printbuf_validate(struct print_buf *pb) * Source print buffer becomes empty if a successful move occurs. */ -void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from) +static void tipc_printbuf_move(struct print_buf *pb_to, + struct print_buf *pb_from) { int len; diff --git a/net/tipc/dbg.h b/net/tipc/dbg.h index 5ef1bc8..3ba6ba8 100644 --- a/net/tipc/dbg.h +++ b/net/tipc/dbg.h @@ -56,10 +56,7 @@ struct print_buf { #define TIPC_PB_MAX_STR 512 /* max printable string (with trailing NUL) */ void tipc_printbuf_init(struct print_buf *pb, char *buf, u32 size); -void tipc_printbuf_reset(struct print_buf *pb); -int tipc_printbuf_empty(struct print_buf *pb); int tipc_printbuf_validate(struct print_buf *pb); -void tipc_printbuf_move(struct print_buf *pb_to, struct print_buf *pb_from); int tipc_log_resize(int log_size); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index dbd79c6..4a7cd37 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -68,20 +68,6 @@ struct link_req { unsigned int timer_intv; }; - -/* - * disc_lost_link(): A link has lost contact - */ - -void tipc_disc_link_event(u32 addr, char *name, int up) -{ - if (in_own_cluster(addr)) - return; - /* - * Code for inter cluster link setup here - */ -} - /** * tipc_disc_init_msg - initialize a link setup message * @type: message type (request or response) @@ -95,7 +81,7 @@ static struct sk_buff *tipc_disc_init_msg(u32 type, u32 dest_domain, struct bearer *b_ptr) { - struct sk_buff *buf = buf_acquire(DSC_H_SIZE); + struct sk_buff *buf = tipc_buf_acquire(DSC_H_SIZE); struct tipc_msg *msg; if (buf) { diff --git a/net/tipc/discover.h b/net/tipc/discover.h index 9d064c3..f8e7506 100644 --- a/net/tipc/discover.h +++ b/net/tipc/discover.h @@ -50,6 +50,4 @@ void tipc_disc_stop_link_req(struct link_req *req); void tipc_disc_recv_msg(struct sk_buff *buf, struct bearer *b_ptr); -void tipc_disc_link_event(u32 addr, char *name, int up); - #endif diff --git a/net/tipc/link.c b/net/tipc/link.c index 4be78ec..b31992c 100644 --- a/net/tipc/link.c +++ b/net/tipc/link.c @@ -112,6 +112,9 @@ static void link_state_event(struct link *l_ptr, u32 event); static void link_reset_statistics(struct link *l_ptr); static void link_print(struct link *l_ptr, struct print_buf *buf, const char *str); +static void link_start(struct link *l_ptr); +static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf); + /* * Debugging code used by link routines only @@ -442,7 +445,7 @@ struct link *tipc_link_create(struct bearer *b_ptr, const u32 peer, k_init_timer(&l_ptr->timer, (Handler)link_timeout, (unsigned long)l_ptr); list_add_tail(&l_ptr->link_list, &b_ptr->links); - tipc_k_signal((Handler)tipc_link_start, (unsigned long)l_ptr); + tipc_k_signal((Handler)link_start, (unsigned long)l_ptr); dbg("tipc_link_create(): tolerance = %u,cont intv = %u, abort_limit = %u\n", l_ptr->tolerance, l_ptr->continuity_interval, l_ptr->abort_limit); @@ -482,9 +485,9 @@ void tipc_link_delete(struct link *l_ptr) kfree(l_ptr); } -void tipc_link_start(struct link *l_ptr) +static void link_start(struct link *l_ptr) { - dbg("tipc_link_start %x\n", l_ptr); + dbg("link_start %x\n", l_ptr); link_state_event(l_ptr, STARTING_EVT); } @@ -1000,7 +1003,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf) /* Fragmentation needed ? */ if (size > max_packet) - return tipc_link_send_long_buf(l_ptr, buf); + return link_send_long_buf(l_ptr, buf); /* Packet can be queued or sent: */ @@ -1036,7 +1039,7 @@ int tipc_link_send_buf(struct link *l_ptr, struct sk_buff *buf) /* Try creating a new bundle */ if (size <= max_packet * 2 / 3) { - struct sk_buff *bundler = buf_acquire(max_packet); + struct sk_buff *bundler = tipc_buf_acquire(max_packet); struct tipc_msg bundler_hdr; if (bundler) { @@ -1312,7 +1315,7 @@ again: /* Prepare header of first fragment: */ - buf_chain = buf = buf_acquire(max_pkt); + buf_chain = buf = tipc_buf_acquire(max_pkt); if (!buf) return -ENOMEM; buf->next = NULL; @@ -1369,7 +1372,7 @@ error: msg_set_size(&fragm_hdr, fragm_sz + INT_H_SIZE); msg_set_fragm_no(&fragm_hdr, ++fragm_no); prev = buf; - buf = buf_acquire(fragm_sz + INT_H_SIZE); + buf = tipc_buf_acquire(fragm_sz + INT_H_SIZE); if (!buf) goto error; @@ -2145,7 +2148,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg, if (tipc_bearer_congested(l_ptr->b_ptr, l_ptr)) { if (!l_ptr->proto_msg_queue) { l_ptr->proto_msg_queue = - buf_acquire(sizeof(l_ptr->proto_msg)); + tipc_buf_acquire(sizeof(l_ptr->proto_msg)); } buf = l_ptr->proto_msg_queue; if (!buf) @@ -2159,7 +2162,7 @@ void tipc_link_send_proto_msg(struct link *l_ptr, u32 msg_typ, int probe_msg, msg_dbg(msg, ">>"); - buf = buf_acquire(msg_size); + buf = tipc_buf_acquire(msg_size); if (!buf) return; @@ -2318,10 +2321,10 @@ exit: * tipc_link_tunnel(): Send one message via a link belonging to * another bearer. Owner node is locked. */ -void tipc_link_tunnel(struct link *l_ptr, - struct tipc_msg *tunnel_hdr, - struct tipc_msg *msg, - u32 selector) +static void tipc_link_tunnel(struct link *l_ptr, + struct tipc_msg *tunnel_hdr, + struct tipc_msg *msg, + u32 selector) { struct link *tunnel; struct sk_buff *buf; @@ -2334,7 +2337,7 @@ void tipc_link_tunnel(struct link *l_ptr, return; } msg_set_size(tunnel_hdr, length + INT_H_SIZE); - buf = buf_acquire(length + INT_H_SIZE); + buf = tipc_buf_acquire(length + INT_H_SIZE); if (!buf) { warn("Link changeover error, " "unable to send tunnel msg\n"); @@ -2380,7 +2383,7 @@ void tipc_link_changeover(struct link *l_ptr) if (!l_ptr->first_out) { struct sk_buff *buf; - buf = buf_acquire(INT_H_SIZE); + buf = tipc_buf_acquire(INT_H_SIZE); if (buf) { skb_copy_to_linear_data(buf, &tunnel_hdr, INT_H_SIZE); msg_set_size(&tunnel_hdr, INT_H_SIZE); @@ -2441,7 +2444,7 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *tunnel) msg_set_ack(msg, mod(l_ptr->next_in_no - 1)); /* Update */ msg_set_bcast_ack(msg, l_ptr->owner->bclink.last_in); msg_set_size(&tunnel_hdr, length + INT_H_SIZE); - outbuf = buf_acquire(length + INT_H_SIZE); + outbuf = tipc_buf_acquire(length + INT_H_SIZE); if (outbuf == NULL) { warn("Link changeover error, " "unable to send duplicate msg\n"); @@ -2477,7 +2480,7 @@ static struct sk_buff *buf_extract(struct sk_buff *skb, u32 from_pos) u32 size = msg_size(msg); struct sk_buff *eb; - eb = buf_acquire(size); + eb = tipc_buf_acquire(size); if (eb) skb_copy_to_linear_data(eb, msg, size); return eb; @@ -2605,11 +2608,11 @@ void tipc_link_recv_bundle(struct sk_buff *buf) /* - * tipc_link_send_long_buf: Entry for buffers needing fragmentation. + * link_send_long_buf: Entry for buffers needing fragmentation. * The buffer is complete, inclusive total message length. * Returns user data length. */ -int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) +static int link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) { struct tipc_msg *inmsg = buf_msg(buf); struct tipc_msg fragm_hdr; @@ -2648,7 +2651,7 @@ int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf) fragm_sz = rest; msg_set_type(&fragm_hdr, LAST_FRAGMENT); } - fragm = buf_acquire(fragm_sz + INT_H_SIZE); + fragm = tipc_buf_acquire(fragm_sz + INT_H_SIZE); if (fragm == NULL) { warn("Link unable to fragment message\n"); dsz = -ENOMEM; @@ -2753,7 +2756,7 @@ int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, buf_discard(fbuf); return 0; } - pbuf = buf_acquire(msg_size(imsg)); + pbuf = tipc_buf_acquire(msg_size(imsg)); if (pbuf != NULL) { pbuf->next = *pending; *pending = pbuf; diff --git a/net/tipc/link.h b/net/tipc/link.h index 4e944ef..f98bc61 100644 --- a/net/tipc/link.h +++ b/net/tipc/link.h @@ -225,7 +225,6 @@ void tipc_link_send_duplicate(struct link *l_ptr, struct link *dest); void tipc_link_reset_fragments(struct link *l_ptr); int tipc_link_is_up(struct link *l_ptr); int tipc_link_is_active(struct link *l_ptr); -void tipc_link_start(struct link *l_ptr); u32 tipc_link_push_packet(struct link *l_ptr); void tipc_link_stop(struct link *l_ptr); struct sk_buff *tipc_link_cmd_config(const void *req_tlv_area, int req_tlv_space, u16 cmd); @@ -239,9 +238,6 @@ int tipc_link_send_sections_fast(struct port* sender, struct iovec const *msg_sect, const u32 num_sect, u32 destnode); -int tipc_link_send_long_buf(struct link *l_ptr, struct sk_buff *buf); -void tipc_link_tunnel(struct link *l_ptr, struct tipc_msg *tnl_hdr, - struct tipc_msg *msg, u32 selector); void tipc_link_recv_bundle(struct sk_buff *buf); int tipc_link_recv_fragment(struct sk_buff **pending, struct sk_buff **fb, diff --git a/net/tipc/msg.c b/net/tipc/msg.c index 3810638..ecb532f 100644 --- a/net/tipc/msg.c +++ b/net/tipc/msg.c @@ -112,7 +112,7 @@ int tipc_msg_build(struct tipc_msg *hdr, return dsz; } - *buf = buf_acquire(sz); + *buf = tipc_buf_acquire(sz); if (!(*buf)) return -ENOMEM; skb_copy_to_linear_data(*buf, hdr, hsz); diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 6ac3c54..7b90717 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -98,7 +98,7 @@ static void publ_to_item(struct distr_item *i, struct publication *p) static struct sk_buff *named_prepare_buf(u32 type, u32 size, u32 dest) { - struct sk_buff *buf = buf_acquire(LONG_H_SIZE + size); + struct sk_buff *buf = tipc_buf_acquire(LONG_H_SIZE + size); struct tipc_msg *msg; if (buf != NULL) { diff --git a/net/tipc/node.c b/net/tipc/node.c index 823e9ab..b4d87eb 100644 --- a/net/tipc/node.c +++ b/net/tipc/node.c @@ -50,7 +50,8 @@ void node_print(struct print_buf *buf, struct tipc_node *n_ptr, char *str); static void node_lost_contact(struct tipc_node *n_ptr); static void node_established_contact(struct tipc_node *n_ptr); -struct tipc_node *tipc_nodes = NULL; /* sorted list of nodes within cluster */ +/* sorted list of nodes within cluster */ +static struct tipc_node *tipc_nodes = NULL; static DEFINE_SPINLOCK(node_create_lock); @@ -587,22 +588,6 @@ void tipc_node_remove_router(struct tipc_node *n_ptr, u32 router) node_lost_contact(n_ptr); } -u32 tipc_available_nodes(const u32 domain) -{ - struct tipc_node *n_ptr; - u32 cnt = 0; - - read_lock_bh(&tipc_net_lock); - for (n_ptr = tipc_nodes; n_ptr; n_ptr = n_ptr->next) { - if (!tipc_in_scope(domain, n_ptr->addr)) - continue; - if (tipc_node_is_up(n_ptr)) - cnt++; - } - read_unlock_bh(&tipc_net_lock); - return cnt; -} - struct sk_buff *tipc_node_get_nodes(const void *req_tlv_area, int req_tlv_space) { u32 domain; diff --git a/net/tipc/node.h b/net/tipc/node.h index 45f3db3..fff331b 100644 --- a/net/tipc/node.h +++ b/net/tipc/node.h @@ -96,7 +96,6 @@ struct tipc_node { } bclink; }; -extern struct tipc_node *tipc_nodes; extern u32 tipc_own_tag; struct tipc_node *tipc_node_create(u32 addr); diff --git a/net/tipc/port.c b/net/tipc/port.c index 5c4285b..82092ea 100644 --- a/net/tipc/port.c +++ b/net/tipc/port.c @@ -293,34 +293,6 @@ int tipc_deleteport(u32 ref) return 0; } -/** - * tipc_get_port() - return port associated with 'ref' - * - * Note: Port is not locked. - */ - -struct tipc_port *tipc_get_port(const u32 ref) -{ - return (struct tipc_port *)tipc_ref_deref(ref); -} - -/** - * tipc_get_handle - return user handle associated to port 'ref' - */ - -void *tipc_get_handle(const u32 ref) -{ - struct port *p_ptr; - void * handle; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return NULL; - handle = p_ptr->publ.usr_handle; - tipc_port_unlock(p_ptr); - return handle; -} - static int port_unreliable(struct port *p_ptr) { return msg_src_droppable(&p_ptr->publ.phdr); @@ -392,7 +364,7 @@ static struct sk_buff *port_build_proto_msg(u32 destport, u32 destnode, struct sk_buff *buf; struct tipc_msg *msg; - buf = buf_acquire(LONG_H_SIZE); + buf = tipc_buf_acquire(LONG_H_SIZE); if (buf) { msg = buf_msg(buf); tipc_msg_init(msg, usr, type, LONG_H_SIZE, destnode); @@ -433,7 +405,7 @@ int tipc_reject_msg(struct sk_buff *buf, u32 err) hdr_sz = MCAST_H_SIZE; else hdr_sz = LONG_H_SIZE; - rbuf = buf_acquire(data_sz + hdr_sz); + rbuf = tipc_buf_acquire(data_sz + hdr_sz); if (rbuf == NULL) { buf_discard(buf); return data_sz; @@ -1242,50 +1214,13 @@ int tipc_shutdown(u32 ref) return tipc_disconnect(ref); } -int tipc_isconnected(u32 ref, int *isconnected) -{ - struct port *p_ptr; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - *isconnected = p_ptr->publ.connected; - tipc_port_unlock(p_ptr); - return 0; -} - -int tipc_peer(u32 ref, struct tipc_portid *peer) -{ - struct port *p_ptr; - int res; - - p_ptr = tipc_port_lock(ref); - if (!p_ptr) - return -EINVAL; - if (p_ptr->publ.connected) { - peer->ref = port_peerport(p_ptr); - peer->node = port_peernode(p_ptr); - res = 0; - } else - res = -ENOTCONN; - tipc_port_unlock(p_ptr); - return res; -} - -int tipc_ref_valid(u32 ref) -{ - /* Works irrespective of type */ - return !!tipc_ref_deref(ref); -} - - /* * tipc_port_recv_sections(): Concatenate and deliver sectioned * message for this node. */ -int tipc_port_recv_sections(struct port *sender, unsigned int num_sect, - struct iovec const *msg_sect) +static int tipc_port_recv_sections(struct port *sender, unsigned int num_sect, + struct iovec const *msg_sect) { struct sk_buff *buf; int res; @@ -1336,65 +1271,16 @@ int tipc_send(u32 ref, unsigned int num_sect, struct iovec const *msg_sect) } /** - * tipc_send_buf - send message buffer on connection - */ - -int tipc_send_buf(u32 ref, struct sk_buff *buf, unsigned int dsz) -{ - struct port *p_ptr; - struct tipc_msg *msg; - u32 destnode; - u32 hsz; - u32 sz; - u32 res; - - p_ptr = tipc_port_deref(ref); - if (!p_ptr || !p_ptr->publ.connected) - return -EINVAL; - - msg = &p_ptr->publ.phdr; - hsz = msg_hdr_sz(msg); - sz = hsz + dsz; - msg_set_size(msg, sz); - if (skb_cow(buf, hsz)) - return -ENOMEM; - - skb_push(buf, hsz); - skb_copy_to_linear_data(buf, msg, hsz); - destnode = msg_destnode(msg); - p_ptr->publ.congested = 1; - if (!tipc_port_congested(p_ptr)) { - if (likely(destnode != tipc_own_addr)) - res = tipc_send_buf_fast(buf, destnode); - else { - tipc_port_recv_msg(buf); - res = sz; - } - if (likely(res != -ELINKCONG)) { - port_incr_out_seqno(p_ptr); - p_ptr->sent++; - p_ptr->publ.congested = 0; - return res; - } - } - if (port_unreliable(p_ptr)) { - p_ptr->publ.congested = 0; - return dsz; - } - return -ELINKCONG; -} - -/** * tipc_forward2name - forward message sections to port name */ -int tipc_forward2name(u32 ref, - struct tipc_name const *name, - u32 domain, - u32 num_sect, - struct iovec const *msg_sect, - struct tipc_portid const *orig, - unsigned int importance) +static int tipc_forward2name(u32 ref, + struct tipc_name const *name, + u32 domain, + u32 num_sect, + struct iovec const *msg_sect, + struct tipc_portid const *orig, + unsigned int importance) { struct port *p_ptr; struct tipc_msg *msg; @@ -1457,89 +1343,15 @@ int tipc_send2name(u32 ref, } /** - * tipc_forward_buf2name - forward message buffer to port name - */ - -int tipc_forward_buf2name(u32 ref, - struct tipc_name const *name, - u32 domain, - struct sk_buff *buf, - unsigned int dsz, - struct tipc_portid const *orig, - unsigned int importance) -{ - struct port *p_ptr; - struct tipc_msg *msg; - u32 destnode = domain; - u32 destport; - int res; - - p_ptr = (struct port *)tipc_ref_deref(ref); - if (!p_ptr || p_ptr->publ.connected) - return -EINVAL; - - msg = &p_ptr->publ.phdr; - if (importance <= TIPC_CRITICAL_IMPORTANCE) - msg_set_importance(msg, importance); - msg_set_type(msg, TIPC_NAMED_MSG); - msg_set_orignode(msg, orig->node); - msg_set_origport(msg, orig->ref); - msg_set_nametype(msg, name->type); - msg_set_nameinst(msg, name->instance); - msg_set_lookup_scope(msg, tipc_addr_scope(domain)); - msg_set_hdr_sz(msg, LONG_H_SIZE); - msg_set_size(msg, LONG_H_SIZE + dsz); - destport = tipc_nametbl_translate(name->type, name->instance, &destnode); - msg_set_destnode(msg, destnode); - msg_set_destport(msg, destport); - msg_dbg(msg, "forw2name ==> "); - if (skb_cow(buf, LONG_H_SIZE)) - return -ENOMEM; - skb_push(buf, LONG_H_SIZE); - skb_copy_to_linear_data(buf, msg, LONG_H_SIZE); - msg_dbg(buf_msg(buf),"PREP:"); - if (likely(destport)) { - p_ptr->sent++; - if (destnode == tipc_own_addr) - return tipc_port_recv_msg(buf); - res = tipc_send_buf_fast(buf, destnode); - if (likely(res != -ELINKCONG)) - return res; - if (port_unreliable(p_ptr)) - return dsz; - return -ELINKCONG; - } - return tipc_reject_msg(buf, TIPC_ERR_NO_NAME); -} - -/** - * tipc_send_buf2name - send message buffer to port name - */ - -int tipc_send_buf2name(u32 ref, - struct tipc_name const *dest, - u32 domain, - struct sk_buff *buf, - unsigned int dsz) -{ - struct tipc_portid orig; - - orig.ref = ref; - orig.node = tipc_own_addr; - return tipc_forward_buf2name(ref, dest, domain, buf, dsz, &orig, - TIPC_PORT_IMPORTANCE); -} - -/** * tipc_forward2port - forward message sections to port identity */ -int tipc_forward2port(u32 ref, - struct tipc_portid const *dest, - unsigned int num_sect, - struct iovec const *msg_sect, - struct tipc_portid const *orig, - unsigned int importance) +static int tipc_forward2port(u32 ref, + struct tipc_portid const *dest, + unsigned int num_sect, + struct iovec const *msg_sect, + struct tipc_portid const *orig, + unsigned int importance) { struct port *p_ptr; struct tipc_msg *msg; @@ -1591,12 +1403,12 @@ int tipc_send2port(u32 ref, /** * tipc_forward_buf2port - forward message buffer to port identity */ -int tipc_forward_buf2port(u32 ref, - struct tipc_portid const *dest, - struct sk_buff *buf, - unsigned int dsz, - struct tipc_portid const *orig, - unsigned int importance) +static int tipc_forward_buf2port(u32 ref, + struct tipc_portid const *dest, + struct sk_buff *buf, + unsigned int dsz, + struct tipc_portid const *orig, + unsigned int importance) { struct port *p_ptr; struct tipc_msg *msg; diff --git a/net/tipc/port.h b/net/tipc/port.h index e74bd956..73bbf44 100644 --- a/net/tipc/port.h +++ b/net/tipc/port.h @@ -109,8 +109,6 @@ struct port { extern spinlock_t tipc_port_list_lock; struct port_list; -int tipc_port_recv_sections(struct port *p_ptr, u32 num_sect, - struct iovec const *msg_sect); int tipc_port_reject_sections(struct port *p_ptr, struct tipc_msg *hdr, struct iovec const *msg_sect, u32 num_sect, int err); diff --git a/net/tipc/ref.c b/net/tipc/ref.c index 8dea665..ab8ad32 100644 --- a/net/tipc/ref.c +++ b/net/tipc/ref.c @@ -282,23 +282,6 @@ void *tipc_ref_lock(u32 ref) return NULL; } -/** - * tipc_ref_unlock - unlock referenced object - */ - -void tipc_ref_unlock(u32 ref) -{ - if (likely(tipc_ref_table.entries)) { - struct reference *entry; - - entry = &tipc_ref_table.entries[ref & - tipc_ref_table.index_mask]; - if (likely((entry->ref == ref) && (entry->object))) - spin_unlock_bh(&entry->lock); - else - err("Attempt to unlock non-existent reference\n"); - } -} /** * tipc_ref_deref - return pointer referenced object (without locking it) diff --git a/net/tipc/ref.h b/net/tipc/ref.h index 7e3798e..5bc8e7a 100644 --- a/net/tipc/ref.h +++ b/net/tipc/ref.h @@ -44,7 +44,6 @@ u32 tipc_ref_acquire(void *object, spinlock_t **lock); void tipc_ref_discard(u32 ref); void *tipc_ref_lock(u32 ref); -void tipc_ref_unlock(u32 ref); void *tipc_ref_deref(u32 ref); #endif diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index 1a5b9a6..18813ac 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -598,12 +598,3 @@ void tipc_subscr_stop(void) topsrv.user_ref = 0; } } - - -int tipc_ispublished(struct tipc_name const *name) -{ - u32 domain = 0; - - return tipc_nametbl_translate(name->type, name->instance, &domain) != 0; -} - diff --git a/net/tipc/zone.c b/net/tipc/zone.c index 2c01ba2..83f8b5e 100644 --- a/net/tipc/zone.c +++ b/net/tipc/zone.c @@ -160,14 +160,3 @@ u32 tipc_zone_select_router(struct _zone *z_ptr, u32 addr, u32 ref) } return 0; } - - -u32 tipc_zone_next_node(u32 addr) -{ - struct cluster *c_ptr = tipc_cltr_find(addr); - - if (c_ptr) - return tipc_cltr_next_node(c_ptr, addr); - return 0; -} - diff --git a/net/tipc/zone.h b/net/tipc/zone.h index 7bdc340..bd1c20c 100644 --- a/net/tipc/zone.h +++ b/net/tipc/zone.h @@ -61,7 +61,6 @@ void tipc_zone_send_external_routes(struct _zone *z_ptr, u32 dest); struct _zone *tipc_zone_create(u32 addr); void tipc_zone_delete(struct _zone *z_ptr); void tipc_zone_attach_cluster(struct _zone *z_ptr, struct cluster *c_ptr); -u32 tipc_zone_next_node(u32 addr); static inline struct _zone *tipc_zone_find(u32 addr) { -- cgit v1.1 From 1fdb936101637c91819efea47e921bb592e07e34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 14 Oct 2010 01:42:30 +0000 Subject: tcp: sack lost marking fixes MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When only fast rexmit should be done, tcp_mark_head_lost marks L too far. Also, sacked_upto below 1 is perfectly valid number, the packets == 0 then needs to be trapped elsewhere. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_input.c | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index e4fbdae..ee0df48 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2495,7 +2495,7 @@ static void tcp_timeout_skbs(struct sock *sk) /* Mark head of queue up as lost. With RFC3517 SACK, the packets is * is against sacked "cnt", otherwise it's against facked "cnt" */ -static void tcp_mark_head_lost(struct sock *sk, int packets) +static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head) { struct tcp_sock *tp = tcp_sk(sk); struct sk_buff *skb; @@ -2503,13 +2503,13 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) int err; unsigned int mss; - if (packets == 0) - return; - WARN_ON(packets > tp->packets_out); if (tp->lost_skb_hint) { skb = tp->lost_skb_hint; cnt = tp->lost_cnt_hint; + /* Head already handled? */ + if (mark_head && skb != tcp_write_queue_head(sk)) + return; } else { skb = tcp_write_queue_head(sk); cnt = 0; @@ -2544,6 +2544,9 @@ static void tcp_mark_head_lost(struct sock *sk, int packets) } tcp_skb_mark_lost(tp, skb); + + if (mark_head) + break; } tcp_verify_left_out(tp); } @@ -2555,17 +2558,18 @@ static void tcp_update_scoreboard(struct sock *sk, int fast_rexmit) struct tcp_sock *tp = tcp_sk(sk); if (tcp_is_reno(tp)) { - tcp_mark_head_lost(sk, 1); + tcp_mark_head_lost(sk, 1, 1); } else if (tcp_is_fack(tp)) { int lost = tp->fackets_out - tp->reordering; if (lost <= 0) lost = 1; - tcp_mark_head_lost(sk, lost); + tcp_mark_head_lost(sk, lost, 0); } else { int sacked_upto = tp->sacked_out - tp->reordering; - if (sacked_upto < fast_rexmit) - sacked_upto = fast_rexmit; - tcp_mark_head_lost(sk, sacked_upto); + if (sacked_upto >= 0) + tcp_mark_head_lost(sk, sacked_upto, 0); + else if (fast_rexmit) + tcp_mark_head_lost(sk, 1, 1); } tcp_timeout_skbs(sk); @@ -2971,7 +2975,7 @@ static void tcp_fastretrans_alert(struct sock *sk, int pkts_acked, int flag) before(tp->snd_una, tp->high_seq) && icsk->icsk_ca_state != TCP_CA_Open && tp->fackets_out > tp->reordering) { - tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering); + tcp_mark_head_lost(sk, tp->fackets_out - tp->reordering, 0); NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPLOSS); } -- cgit v1.1 From c60ce4e265404ca42ba860401f4b0f1e97332a67 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= Date: Thu, 14 Oct 2010 01:52:09 +0000 Subject: tcp: use correct counters in CA_CWR state too MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit As CWR is stronger than CA_Disorder state, we can miscount SACK/Reno failure into other timeouts. Not a bad problem as it can happen only due to ECN, FRTO detecting spurious RTO or xmit error which are the only callers of tcp_enter_cwr. And even then losses and RTO must still follow thereafter to actually end up into the relevant code paths. Compile tested. Signed-off-by: Ilpo Järvinen Signed-off-by: David S. Miller --- net/ipv4/tcp_timer.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c index f3c8c6c..74a6aa0 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c @@ -367,18 +367,19 @@ void tcp_retransmit_timer(struct sock *sk) if (icsk->icsk_retransmits == 0) { int mib_idx; - if (icsk->icsk_ca_state == TCP_CA_Disorder) { - if (tcp_is_sack(tp)) - mib_idx = LINUX_MIB_TCPSACKFAILURES; - else - mib_idx = LINUX_MIB_TCPRENOFAILURES; - } else if (icsk->icsk_ca_state == TCP_CA_Recovery) { + if (icsk->icsk_ca_state == TCP_CA_Recovery) { if (tcp_is_sack(tp)) mib_idx = LINUX_MIB_TCPSACKRECOVERYFAIL; else mib_idx = LINUX_MIB_TCPRENORECOVERYFAIL; } else if (icsk->icsk_ca_state == TCP_CA_Loss) { mib_idx = LINUX_MIB_TCPLOSSFAILURES; + } else if ((icsk->icsk_ca_state == TCP_CA_Disorder) || + tp->sacked_out) { + if (tcp_is_sack(tp)) + mib_idx = LINUX_MIB_TCPSACKFAILURES; + else + mib_idx = LINUX_MIB_TCPRENOFAILURES; } else { mib_idx = LINUX_MIB_TCPTIMEOUTS; } -- cgit v1.1 From 9bef83edfba72ba58b42c14fb046da2199574bc0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2010 20:53:04 +0000 Subject: fib_hash: embed initial hash table in fn_zone While looking for false sharing problems, I noticed sizeof(struct fn_zone) was small (28 bytes) and possibly sharing a cache line with an often written kernel structure. Most of the time, fn_zone uses its initial hash table of 16 slots. We can avoid the false sharing problem by embedding this initial hash table in fn_zone itself, so that sizeof(fn_zone) > L1_CACHE_BYTES We did a similar optimization in commit a6501e080c (Reduce memory needs and speedup lookups) Add a fz_revorder field to speedup fn_hash() a bit. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 52 +++++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 29 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 83cca68..10001aa 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -54,23 +54,23 @@ struct fib_node { struct fib_alias fn_embedded_alias; }; +#define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) + struct fn_zone { struct fn_zone *fz_next; /* Next not empty zone */ struct hlist_head *fz_hash; /* Hash table pointer */ - int fz_nent; /* Number of entries */ - - int fz_divisor; /* Hash divisor */ u32 fz_hashmask; /* (fz_divisor - 1) */ -#define FZ_HASHMASK(fz) ((fz)->fz_hashmask) - int fz_order; /* Zone order */ - __be32 fz_mask; + u8 fz_order; /* Zone order (0..32) */ + u8 fz_revorder; /* 32 - fz_order */ + __be32 fz_mask; /* inet_make_mask(order) */ #define FZ_MASK(fz) ((fz)->fz_mask) -}; -/* NOTE. On fast computers evaluation of fz_hashmask and fz_mask - * can be cheaper than memory lookup, so that FZ_* macros are used. - */ + struct hlist_head fz_embedded_hash[EMBEDDED_HASH_SIZE]; + + int fz_nent; /* Number of entries */ + int fz_divisor; /* Hash size (mask+1) */ +}; struct fn_hash { struct fn_zone *fn_zones[33]; @@ -79,11 +79,11 @@ struct fn_hash { static inline u32 fn_hash(__be32 key, struct fn_zone *fz) { - u32 h = ntohl(key)>>(32 - fz->fz_order); + u32 h = ntohl(key) >> fz->fz_revorder; h ^= (h>>20); h ^= (h>>10); h ^= (h>>5); - h &= FZ_HASHMASK(fz); + h &= fz->fz_hashmask; return h; } @@ -147,14 +147,14 @@ static void fn_rehash_zone(struct fn_zone *fz) int old_divisor, new_divisor; u32 new_hashmask; - old_divisor = fz->fz_divisor; + new_divisor = old_divisor = fz->fz_divisor; switch (old_divisor) { - case 16: - new_divisor = 256; + case EMBEDDED_HASH_SIZE: + new_divisor *= EMBEDDED_HASH_SIZE; break; - case 256: - new_divisor = 1024; + case EMBEDDED_HASH_SIZE*EMBEDDED_HASH_SIZE: + new_divisor *= (EMBEDDED_HASH_SIZE/2); break; default: if ((old_divisor << 1) > FZ_MAX_DIVISOR) { @@ -184,7 +184,8 @@ static void fn_rehash_zone(struct fn_zone *fz) fib_hash_genid++; write_unlock_bh(&fib_hash_lock); - fz_hash_free(old_ht, old_divisor); + if (old_ht != fz->fz_embedded_hash) + fz_hash_free(old_ht, old_divisor); } } @@ -210,18 +211,11 @@ fn_new_zone(struct fn_hash *table, int z) if (!fz) return NULL; - if (z) { - fz->fz_divisor = 16; - } else { - fz->fz_divisor = 1; - } - fz->fz_hashmask = (fz->fz_divisor - 1); - fz->fz_hash = fz_hash_alloc(fz->fz_divisor); - if (!fz->fz_hash) { - kfree(fz); - return NULL; - } + fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; + fz->fz_hashmask = fz->fz_divisor - 1; + fz->fz_hash = fz->fz_embedded_hash; fz->fz_order = z; + fz->fz_revorder = 32 - z; fz->fz_mask = inet_make_mask(z); /* Find the first not empty zone with more specific mask */ -- cgit v1.1 From 117a8cdea3647e8e11fac10d14eafefc20f9bda5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2010 20:53:34 +0000 Subject: fib_hash: RCU conversion phase 1 First step for RCU conversion of fib_hash : struct fn_zone are created and never deleted. Very classic conversion, using rcu_assign_pointer(), rcu_dereference() and rtnl_dereference() verbs. __rcu markers on fz_next and fn_zone_list They are created under RTNL, we dont need fib_hash_lock anymore in fn_new_zone(). Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 57 +++++++++++++++++++++++++++++++++++------------------ 1 file changed, 38 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 10001aa..04f05a9 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -57,7 +57,7 @@ struct fib_node { #define EMBEDDED_HASH_SIZE (L1_CACHE_BYTES / sizeof(struct hlist_head)) struct fn_zone { - struct fn_zone *fz_next; /* Next not empty zone */ + struct fn_zone __rcu *fz_next; /* Next not empty zone */ struct hlist_head *fz_hash; /* Hash table pointer */ u32 fz_hashmask; /* (fz_divisor - 1) */ @@ -73,8 +73,8 @@ struct fn_zone { }; struct fn_hash { - struct fn_zone *fn_zones[33]; - struct fn_zone *fn_zone_list; + struct fn_zone *fn_zones[33]; + struct fn_zone __rcu *fn_zone_list; }; static inline u32 fn_hash(__be32 key, struct fn_zone *fz) @@ -219,21 +219,21 @@ fn_new_zone(struct fn_hash *table, int z) fz->fz_mask = inet_make_mask(z); /* Find the first not empty zone with more specific mask */ - for (i=z+1; i<=32; i++) + for (i = z + 1; i <= 32; i++) if (table->fn_zones[i]) break; - write_lock_bh(&fib_hash_lock); - if (i>32) { + if (i > 32) { /* No more specific masks, we are the first. */ - fz->fz_next = table->fn_zone_list; - table->fn_zone_list = fz; + rcu_assign_pointer(fz->fz_next, + rtnl_dereference(table->fn_zone_list)); + rcu_assign_pointer(table->fn_zone_list, fz); } else { - fz->fz_next = table->fn_zones[i]->fz_next; - table->fn_zones[i]->fz_next = fz; + rcu_assign_pointer(fz->fz_next, + rtnl_dereference(table->fn_zones[i]->fz_next)); + rcu_assign_pointer(table->fn_zones[i]->fz_next, fz); } table->fn_zones[z] = fz; fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); return fz; } @@ -245,8 +245,11 @@ int fib_table_lookup(struct fib_table *tb, struct fn_zone *fz; struct fn_hash *t = (struct fn_hash *)tb->tb_data; + rcu_read_lock(); read_lock(&fib_hash_lock); - for (fz = t->fn_zone_list; fz; fz = fz->fz_next) { + for (fz = rcu_dereference(t->fn_zone_list); + fz != NULL; + fz = rcu_dereference(fz->fz_next)) { struct hlist_head *head; struct hlist_node *node; struct fib_node *f; @@ -267,6 +270,7 @@ int fib_table_lookup(struct fib_table *tb, err = 1; out: read_unlock(&fib_hash_lock); + rcu_read_unlock(); return err; } @@ -362,6 +366,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } +/* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; @@ -657,13 +662,16 @@ static int fn_flush_list(struct fn_zone *fz, int idx) return found; } +/* caller must hold RTNL. */ int fib_table_flush(struct fib_table *tb) { struct fn_hash *table = (struct fn_hash *) tb->tb_data; struct fn_zone *fz; int found = 0; - for (fz = table->fn_zone_list; fz; fz = fz->fz_next) { + for (fz = rtnl_dereference(table->fn_zone_list); + fz != NULL; + fz = rtnl_dereference(fz->fz_next)) { int i; for (i = fz->fz_divisor - 1; i >= 0; i--) @@ -741,23 +749,29 @@ fn_hash_dump_zone(struct sk_buff *skb, struct netlink_callback *cb, int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) { - int m, s_m; + int m = 0, s_m; struct fn_zone *fz; struct fn_hash *table = (struct fn_hash *)tb->tb_data; s_m = cb->args[2]; + rcu_read_lock(); read_lock(&fib_hash_lock); - for (fz = table->fn_zone_list, m=0; fz; fz = fz->fz_next, m++) { - if (m < s_m) continue; + for (fz = rcu_dereference(table->fn_zone_list); + fz != NULL; + fz = rcu_dereference(fz->fz_next), m++) { + if (m < s_m) + continue; if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[2] = m; read_unlock(&fib_hash_lock); + rcu_read_unlock(); return -1; } memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); } read_unlock(&fib_hash_lock); + rcu_read_unlock(); cb->args[2] = m; return skb->len; } @@ -820,8 +834,9 @@ static struct fib_alias *fib_get_first(struct seq_file *seq) iter->genid = fib_hash_genid; iter->valid = 1; - for (iter->zone = table->fn_zone_list; iter->zone; - iter->zone = iter->zone->fz_next) { + for (iter->zone = rcu_dereference(table->fn_zone_list); + iter->zone != NULL; + iter->zone = rcu_dereference(iter->zone->fz_next)) { int maxslot; if (!iter->zone->fz_nent) @@ -906,7 +921,7 @@ static struct fib_alias *fib_get_next(struct seq_file *seq) } } - iter->zone = iter->zone->fz_next; + iter->zone = rcu_dereference(iter->zone->fz_next); if (!iter->zone) goto out; @@ -946,9 +961,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) static void *fib_seq_start(struct seq_file *seq, loff_t *pos) __acquires(fib_hash_lock) + __acquires(RCU) { void *v = NULL; + rcu_read_lock(); read_lock(&fib_hash_lock); if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; @@ -963,8 +980,10 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void fib_seq_stop(struct seq_file *seq, void *v) __releases(fib_hash_lock) + __releases(RCU) { read_unlock(&fib_hash_lock); + rcu_read_unlock(); } static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) -- cgit v1.1 From 19f572565ef66a0439574fd2299a7c804147e133 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 14 Oct 2010 20:56:39 +0000 Subject: fib_hash: RCU conversion phase 2 Get rid of fib_hash_lock rwlock. The fn_zone hash table resize is the noticeable part of this patch. I added a seqlock per fn_zone, so that readers can restart their lookup in the (very rare) case a writer expanded the hash table. Add rcu heads in fib_alias and fib_node, use call_rcu() to defer their freeing, and use appropriate _rcu list manipulations. Stress test (160.000.000 udp frames sent, IP route cache disabled to mimic DDOS attack, FIB_HASH) Before: real 0m41.191s user 0m13.137s sys 8m55.241s After: real 0m38.091s user 0m13.189s sys 7m53.018s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 176 +++++++++++++++++++++++++++++--------------------- net/ipv4/fib_lookup.h | 2 - 2 files changed, 101 insertions(+), 77 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 04f05a9..4f1aafd 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -58,7 +58,8 @@ struct fib_node { struct fn_zone { struct fn_zone __rcu *fz_next; /* Next not empty zone */ - struct hlist_head *fz_hash; /* Hash table pointer */ + struct hlist_head __rcu *fz_hash; /* Hash table pointer */ + seqlock_t fz_lock; u32 fz_hashmask; /* (fz_divisor - 1) */ u8 fz_order; /* Zone order (0..32) */ @@ -92,7 +93,6 @@ static inline __be32 fz_key(__be32 dst, struct fn_zone *fz) return dst & FZ_MASK(fz); } -static DEFINE_RWLOCK(fib_hash_lock); static unsigned int fib_hash_genid; #define FZ_MAX_DIVISOR ((PAGE_SIZE<fn_hash); + hlist_del_rcu(&f->fn_hash); new_head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; - hlist_add_head(&f->fn_hash, new_head); + hlist_add_head_rcu(&f->fn_hash, new_head); } } } @@ -175,32 +174,55 @@ static void fn_rehash_zone(struct fn_zone *fz) ht = fz_hash_alloc(new_divisor); if (ht) { - write_lock_bh(&fib_hash_lock); + struct fn_zone nfz; + + memcpy(&nfz, fz, sizeof(nfz)); + + write_seqlock_bh(&fz->fz_lock); old_ht = fz->fz_hash; - fz->fz_hash = ht; + nfz.fz_hash = ht; + nfz.fz_hashmask = new_hashmask; + nfz.fz_divisor = new_divisor; + fn_rebuild_zone(&nfz, old_ht, old_divisor); + fib_hash_genid++; + rcu_assign_pointer(fz->fz_hash, ht); fz->fz_hashmask = new_hashmask; fz->fz_divisor = new_divisor; - fn_rebuild_zone(fz, old_ht, old_divisor); - fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); + write_sequnlock_bh(&fz->fz_lock); - if (old_ht != fz->fz_embedded_hash) + if (old_ht != fz->fz_embedded_hash) { + synchronize_rcu(); fz_hash_free(old_ht, old_divisor); + } } } -static inline void fn_free_node(struct fib_node * f) +static void fn_free_node_rcu(struct rcu_head *head) { + struct fib_node *f = container_of(head, struct fib_node, fn_embedded_alias.rcu); + kmem_cache_free(fn_hash_kmem, f); } +static inline void fn_free_node(struct fib_node *f) +{ + call_rcu(&f->fn_embedded_alias.rcu, fn_free_node_rcu); +} + +static void fn_free_alias_rcu(struct rcu_head *head) +{ + struct fib_alias *fa = container_of(head, struct fib_alias, rcu); + + kmem_cache_free(fn_alias_kmem, fa); +} + static inline void fn_free_alias(struct fib_alias *fa, struct fib_node *f) { fib_release_info(fa->fa_info); if (fa == &f->fn_embedded_alias) fa->fa_info = NULL; else - kmem_cache_free(fn_alias_kmem, fa); + call_rcu(&fa->rcu, fn_free_alias_rcu); } static struct fn_zone * @@ -211,6 +233,7 @@ fn_new_zone(struct fn_hash *table, int z) if (!fz) return NULL; + seqlock_init(&fz->fz_lock); fz->fz_divisor = z ? EMBEDDED_HASH_SIZE : 1; fz->fz_hashmask = fz->fz_divisor - 1; fz->fz_hash = fz->fz_embedded_hash; @@ -246,30 +269,34 @@ int fib_table_lookup(struct fib_table *tb, struct fn_hash *t = (struct fn_hash *)tb->tb_data; rcu_read_lock(); - read_lock(&fib_hash_lock); for (fz = rcu_dereference(t->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next)) { - struct hlist_head *head; + struct hlist_head __rcu *head; struct hlist_node *node; struct fib_node *f; - __be32 k = fz_key(flp->fl4_dst, fz); + __be32 k; + unsigned int seq; - head = &fz->fz_hash[fn_hash(k, fz)]; - hlist_for_each_entry(f, node, head, fn_hash) { - if (f->fn_key != k) - continue; + do { + seq = read_seqbegin(&fz->fz_lock); + k = fz_key(flp->fl4_dst, fz); + + head = &fz->fz_hash[fn_hash(k, fz)]; + hlist_for_each_entry_rcu(f, node, head, fn_hash) { + if (f->fn_key != k) + continue; - err = fib_semantic_match(&f->fn_alias, + err = fib_semantic_match(&f->fn_alias, flp, res, fz->fz_order, fib_flags); - if (err <= 0) - goto out; - } + if (err <= 0) + goto out; + } + } while (read_seqretry(&fz->fz_lock, seq)); } err = 1; out: - read_unlock(&fib_hash_lock); rcu_read_unlock(); return err; } @@ -292,11 +319,11 @@ void fib_table_select_default(struct fib_table *tb, last_resort = NULL; order = -1; - read_lock(&fib_hash_lock); - hlist_for_each_entry(f, node, &fz->fz_hash[0], fn_hash) { + rcu_read_lock(); + hlist_for_each_entry_rcu(f, node, &fz->fz_hash[0], fn_hash) { struct fib_alias *fa; - list_for_each_entry(fa, &f->fn_alias, fa_list) { + list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { struct fib_info *next_fi = fa->fa_info; if (fa->fa_scope != res->scope || @@ -340,7 +367,7 @@ void fib_table_select_default(struct fib_table *tb, fib_result_assign(res, last_resort); tb->tb_default = last_idx; out: - read_unlock(&fib_hash_lock); + rcu_read_unlock(); } /* Insert node F to FZ. */ @@ -348,7 +375,7 @@ static inline void fib_insert_node(struct fn_zone *fz, struct fib_node *f) { struct hlist_head *head = &fz->fz_hash[fn_hash(f->fn_key, fz)]; - hlist_add_head(&f->fn_hash, head); + hlist_add_head_rcu(&f->fn_hash, head); } /* Return the node in FZ matching KEY. */ @@ -358,7 +385,7 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) struct hlist_node *node; struct fib_node *f; - hlist_for_each_entry(f, node, head, fn_hash) { + hlist_for_each_entry_rcu(f, node, head, fn_hash) { if (f->fn_key == key) return f; } @@ -366,6 +393,16 @@ static struct fib_node *fib_find_node(struct fn_zone *fz, __be32 key) return NULL; } + +static struct fib_alias *fib_fast_alloc(struct fib_node *f) +{ + struct fib_alias *fa = &f->fn_embedded_alias; + + if (fa->fa_info != NULL) + fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); + return fa; +} + /* Caller must hold RTNL. */ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) { @@ -451,7 +488,6 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) } if (cfg->fc_nlflags & NLM_F_REPLACE) { - struct fib_info *fi_drop; u8 state; fa = fa_first; @@ -460,21 +496,25 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) err = 0; goto out; } - write_lock_bh(&fib_hash_lock); - fi_drop = fa->fa_info; - fa->fa_info = fi; - fa->fa_type = cfg->fc_type; - fa->fa_scope = cfg->fc_scope; + err = -ENOBUFS; + new_fa = fib_fast_alloc(f); + if (new_fa == NULL) + goto out; + + new_fa->fa_tos = fa->fa_tos; + new_fa->fa_info = fi; + new_fa->fa_type = cfg->fc_type; + new_fa->fa_scope = cfg->fc_scope; state = fa->fa_state; - fa->fa_state &= ~FA_S_ACCESSED; + new_fa->fa_state = state & ~FA_S_ACCESSED; fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); + list_replace_rcu(&fa->fa_list, &new_fa->fa_list); - fib_release_info(fi_drop); + fn_free_alias(fa, f); if (state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); - rtmsg_fib(RTM_NEWROUTE, key, fa, cfg->fc_dst_len, tb->tb_id, - &cfg->fc_nlinfo, NLM_F_REPLACE); + rtmsg_fib(RTM_NEWROUTE, key, new_fa, cfg->fc_dst_len, + tb->tb_id, &cfg->fc_nlinfo, NLM_F_REPLACE); return 0; } @@ -506,12 +546,10 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) f = new_f; } - new_fa = &f->fn_embedded_alias; - if (new_fa->fa_info != NULL) { - new_fa = kmem_cache_alloc(fn_alias_kmem, GFP_KERNEL); - if (new_fa == NULL) - goto out; - } + new_fa = fib_fast_alloc(f); + if (new_fa == NULL) + goto out; + new_fa->fa_info = fi; new_fa->fa_tos = tos; new_fa->fa_type = cfg->fc_type; @@ -522,13 +560,11 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) * Insert new entry to the list. */ - write_lock_bh(&fib_hash_lock); if (new_f) fib_insert_node(fz, new_f); - list_add_tail(&new_fa->fa_list, + list_add_tail_rcu(&new_fa->fa_list, (fa ? &fa->fa_list : &f->fn_alias)); fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); if (new_f) fz->fz_nent++; @@ -603,14 +639,12 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) tb->tb_id, &cfg->fc_nlinfo, 0); kill_fn = 0; - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(&f->fn_alias)) { - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); kill_fn = 1; } fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); if (fa->fa_state & FA_S_ACCESSED) rt_cache_flush(cfg->fc_nlinfo.nl_net, -1); @@ -641,14 +675,12 @@ static int fn_flush_list(struct fn_zone *fz, int idx) struct fib_info *fi = fa->fa_info; if (fi && (fi->fib_flags&RTNH_F_DEAD)) { - write_lock_bh(&fib_hash_lock); - list_del(&fa->fa_list); + list_del_rcu(&fa->fa_list); if (list_empty(&f->fn_alias)) { - hlist_del(&f->fn_hash); + hlist_del_rcu(&f->fn_hash); kill_f = 1; } fib_hash_genid++; - write_unlock_bh(&fib_hash_lock); fn_free_alias(fa, f); found++; @@ -693,10 +725,10 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, s_i = cb->args[4]; i = 0; - hlist_for_each_entry(f, node, head, fn_hash) { + hlist_for_each_entry_rcu(f, node, head, fn_hash) { struct fib_alias *fa; - list_for_each_entry(fa, &f->fn_alias, fa_list) { + list_for_each_entry_rcu(fa, &f->fn_alias, fa_list) { if (i < s_i) goto next; @@ -714,7 +746,7 @@ fn_hash_dump_bucket(struct sk_buff *skb, struct netlink_callback *cb, cb->args[4] = i; return -1; } - next: +next: i++; } } @@ -755,7 +787,6 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, s_m = cb->args[2]; rcu_read_lock(); - read_lock(&fib_hash_lock); for (fz = rcu_dereference(table->fn_zone_list); fz != NULL; fz = rcu_dereference(fz->fz_next), m++) { @@ -763,14 +794,12 @@ int fib_table_dump(struct fib_table *tb, struct sk_buff *skb, continue; if (fn_hash_dump_zone(skb, cb, tb, fz) < 0) { cb->args[2] = m; - read_unlock(&fib_hash_lock); rcu_read_unlock(); return -1; } memset(&cb->args[3], 0, sizeof(cb->args) - 3*sizeof(cb->args[0])); } - read_unlock(&fib_hash_lock); rcu_read_unlock(); cb->args[2] = m; return skb->len; @@ -960,13 +989,11 @@ static struct fib_alias *fib_get_idx(struct seq_file *seq, loff_t pos) } static void *fib_seq_start(struct seq_file *seq, loff_t *pos) - __acquires(fib_hash_lock) __acquires(RCU) { void *v = NULL; rcu_read_lock(); - read_lock(&fib_hash_lock); if (fib_get_table(seq_file_net(seq), RT_TABLE_MAIN)) v = *pos ? fib_get_idx(seq, *pos - 1) : SEQ_START_TOKEN; return v; @@ -979,17 +1006,16 @@ static void *fib_seq_next(struct seq_file *seq, void *v, loff_t *pos) } static void fib_seq_stop(struct seq_file *seq, void *v) - __releases(fib_hash_lock) __releases(RCU) { - read_unlock(&fib_hash_lock); rcu_read_unlock(); } static unsigned fib_flag_trans(int type, __be32 mask, struct fib_info *fi) { static const unsigned type2flags[RTN_MAX + 1] = { - [7] = RTF_REJECT, [8] = RTF_REJECT, + [7] = RTF_REJECT, + [8] = RTF_REJECT, }; unsigned flags = type2flags[type]; diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index b9c9a9f..5072d8e 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -12,9 +12,7 @@ struct fib_alias { u8 fa_type; u8 fa_scope; u8 fa_state; -#ifdef CONFIG_IP_FIB_TRIE struct rcu_head rcu; -#endif }; #define FA_S_ACCESSED 0x01 -- cgit v1.1 From d793fe8caa3911e6a1e826b45d4ee00d250cdec8 Mon Sep 17 00:00:00 2001 From: Nathan Holstein Date: Fri, 15 Oct 2010 11:54:02 -0400 Subject: Bluetooth: fix oops in l2cap_connect_req In error cases when the ACL is insecure or we fail to allocate a new struct sock, we jump to the "response" label. If so, "sk" will be null and the kernel crashes. Signed-off-by: Nathan Holstein Acked-by: Marcel Holtmann Signed-off-by: Gustavo F. Padovan --- net/bluetooth/l2cap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 0b54b7d..dc60205 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -2891,7 +2891,7 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd struct l2cap_chan_list *list = &conn->chan_list; struct l2cap_conn_req *req = (struct l2cap_conn_req *) data; struct l2cap_conn_rsp rsp; - struct sock *parent, *uninitialized_var(sk); + struct sock *parent, *sk = NULL; int result, status = L2CAP_CS_NO_INFO; u16 dcid = 0, scid = __le16_to_cpu(req->scid); @@ -3000,7 +3000,7 @@ sendresp: L2CAP_INFO_REQ, sizeof(info), &info); } - if (!(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) && + if (sk && !(l2cap_pi(sk)->conf_state & L2CAP_CONF_REQ_SENT) && result == L2CAP_CR_SUCCESS) { u8 buf[128]; l2cap_pi(sk)->conf_state |= L2CAP_CONF_REQ_SENT; -- cgit v1.1 From 724829b3ad8e8aeb0aec46de383d35bfa1ad3875 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 18 Oct 2010 01:06:20 -0700 Subject: tipc: Kill tipc_get_mode() completely. It's completely unused and exporting a static symbol makes no sense and breaks the build. Reported-by: Stephen Rothwell Signed-off-by: David S. Miller --- net/tipc/core.c | 6 ------ 1 file changed, 6 deletions(-) (limited to 'net') diff --git a/net/tipc/core.c b/net/tipc/core.c index c005303..e2a09eb 100644 --- a/net/tipc/core.c +++ b/net/tipc/core.c @@ -96,11 +96,6 @@ int tipc_net_id; int tipc_remote_management; -static int tipc_get_mode(void) -{ - return tipc_mode; -} - /** * tipc_buf_acquire - creates a TIPC message buffer * @size: message size (including TIPC header) @@ -246,7 +241,6 @@ MODULE_VERSION(TIPC_MOD_VER); EXPORT_SYMBOL(tipc_attach); EXPORT_SYMBOL(tipc_detach); -EXPORT_SYMBOL(tipc_get_mode); EXPORT_SYMBOL(tipc_createport); EXPORT_SYMBOL(tipc_deleteport); EXPORT_SYMBOL(tipc_ownidentity); -- cgit v1.1 From ccc901ee58cfb090a31216a6eda0f1e9dfc572fa Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Thu, 14 Oct 2010 13:58:26 +0000 Subject: tipc: Simplify bearer shutdown logic Optimize processing in TIPC's bearer shutdown code, including: 1. Remove an unnecessary check to see if TIPC bearer's can exist. 2. Don't release spinlocks before calling a media-specific disabling routine, since the routine can't sleep. 3. Make bearer_disable() operate directly on a struct bearer, instead of needlessly taking a name and then mapping that to the struct. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Reviewed-by: Neil Horman Signed-off-by: David S. Miller --- net/tipc/bearer.c | 38 +++++++++++--------------------------- 1 file changed, 11 insertions(+), 27 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index 9c10c6b..fd9c06c 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -288,9 +288,6 @@ static struct bearer *bearer_find(const char *name) struct bearer *b_ptr; u32 i; - if (tipc_mode != TIPC_NET_MODE) - return NULL; - for (i = 0, b_ptr = tipc_bearers; i < MAX_BEARERS; i++, b_ptr++) { if (b_ptr->active && (!strcmp(b_ptr->publ.name, name))) return b_ptr; @@ -630,30 +627,17 @@ int tipc_block_bearer(const char *name) * Note: This routine assumes caller holds tipc_net_lock. */ -static int bearer_disable(const char *name) +static int bearer_disable(struct bearer *b_ptr) { - struct bearer *b_ptr; struct link *l_ptr; struct link *temp_l_ptr; - b_ptr = bearer_find(name); - if (!b_ptr) { - warn("Attempt to disable unknown bearer <%s>\n", name); - return -EINVAL; - } - - info("Disabling bearer <%s>\n", name); + info("Disabling bearer <%s>\n", b_ptr->publ.name); tipc_disc_stop_link_req(b_ptr->link_req); spin_lock_bh(&b_ptr->publ.lock); b_ptr->link_req = NULL; b_ptr->publ.blocked = 1; - if (b_ptr->media->disable_bearer) { - spin_unlock_bh(&b_ptr->publ.lock); - write_unlock_bh(&tipc_net_lock); - b_ptr->media->disable_bearer(&b_ptr->publ); - write_lock_bh(&tipc_net_lock); - spin_lock_bh(&b_ptr->publ.lock); - } + b_ptr->media->disable_bearer(&b_ptr->publ); list_for_each_entry_safe(l_ptr, temp_l_ptr, &b_ptr->links, link_list) { tipc_link_delete(l_ptr); } @@ -664,10 +648,16 @@ static int bearer_disable(const char *name) int tipc_disable_bearer(const char *name) { + struct bearer *b_ptr; int res; write_lock_bh(&tipc_net_lock); - res = bearer_disable(name); + b_ptr = bearer_find(name); + if (b_ptr == NULL) { + warn("Attempt to disable unknown bearer <%s>\n", name); + res = -EINVAL; + } else + res = bearer_disable(b_ptr); write_unlock_bh(&tipc_net_lock); return res; } @@ -680,13 +670,7 @@ void tipc_bearer_stop(void) for (i = 0; i < MAX_BEARERS; i++) { if (tipc_bearers[i].active) - tipc_bearers[i].publ.blocked = 1; - } - for (i = 0; i < MAX_BEARERS; i++) { - if (tipc_bearers[i].active) - bearer_disable(tipc_bearers[i].publ.name); + bearer_disable(&tipc_bearers[i]); } media_count = 0; } - - -- cgit v1.1 From 76b6717bc6ccb715b04e36efc26566a6313ede5f Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 18 Oct 2010 11:13:30 +0200 Subject: netfilter: fix kconfig unmet dependency warning Fix netfilter kconfig unmet dependencies warning & spell out "compatible" while there. warning: (IP_NF_TARGET_TTL && NET && INET && NETFILTER && IP_NF_IPTABLES && NETFILTER_ADVANCED || IP6_NF_TARGET_HL && NET && INET && IPV6 && NETFILTER && IP6_NF_IPTABLES && NETFILTER_ADVANCED) selects NETFILTER_XT_TARGET_HL which has unmet direct dependencies ((IP_NF_MANGLE || IP6_NF_MANGLE) && NETFILTER_ADVANCED) Signed-off-by: Randy Dunlap Signed-off-by: Patrick McHardy --- net/ipv4/netfilter/Kconfig | 4 ++-- net/ipv6/netfilter/Kconfig | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig index 1833bdb..8e33506 100644 --- a/net/ipv4/netfilter/Kconfig +++ b/net/ipv4/netfilter/Kconfig @@ -324,10 +324,10 @@ config IP_NF_TARGET_ECN config IP_NF_TARGET_TTL tristate '"TTL" target support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && IP_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compat option for the user's convenience + This is a backwards-compatible option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_HL. diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig index 29d643b..44d2eea 100644 --- a/net/ipv6/netfilter/Kconfig +++ b/net/ipv6/netfilter/Kconfig @@ -132,10 +132,10 @@ config IP6_NF_MATCH_RT # The targets config IP6_NF_TARGET_HL tristate '"HL" hoplimit target support' - depends on NETFILTER_ADVANCED + depends on NETFILTER_ADVANCED && IP6_NF_MANGLE select NETFILTER_XT_TARGET_HL ---help--- - This is a backwards-compat option for the user's convenience + This is a backwards-compatible option for the user's convenience (e.g. when running oldconfig). It selects CONFIG_NETFILTER_XT_TARGET_HL. -- cgit v1.1 From 27a954bd560f3e385bbed38fde3051fe718b8d75 Mon Sep 17 00:00:00 2001 From: Andy Walls Date: Sun, 17 Oct 2010 15:11:22 +0000 Subject: IPv4: route.c: Change checks against 0xffffffff to ipv4_is_lbcast() Change a few checks against the hardcoded broadcast address, 0xffffffff, to ipv4_is_lbcast(). Remove some existing checks using ipv4_is_lbcast() that are now obviously superfluous. Signed-off-by: Andy Walls Signed-off-by: David S. Miller --- net/ipv4/route.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) (limited to 'net') diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 0755aa4..ff98983d 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2124,7 +2124,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, ipv4_is_loopback(saddr)) goto martian_source; - if (daddr == htonl(0xFFFFFFFF) || (saddr == 0 && daddr == 0)) + if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0)) goto brd_input; /* Accept zero addresses only to limited broadcast; @@ -2133,8 +2133,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (ipv4_is_zeronet(saddr)) goto martian_source; - if (ipv4_is_lbcast(daddr) || ipv4_is_zeronet(daddr) || - ipv4_is_loopback(daddr)) + if (ipv4_is_zeronet(daddr) || ipv4_is_loopback(daddr)) goto martian_destination; /* @@ -2367,11 +2366,11 @@ static int __mkroute_output(struct rtable **result, if (ipv4_is_loopback(fl->fl4_src) && !(dev_out->flags & IFF_LOOPBACK)) return -EINVAL; - if (fl->fl4_dst == htonl(0xFFFFFFFF)) + if (ipv4_is_lbcast(fl->fl4_dst)) res->type = RTN_BROADCAST; else if (ipv4_is_multicast(fl->fl4_dst)) res->type = RTN_MULTICAST; - else if (ipv4_is_lbcast(fl->fl4_dst) || ipv4_is_zeronet(fl->fl4_dst)) + else if (ipv4_is_zeronet(fl->fl4_dst)) return -EINVAL; if (dev_out->flags & IFF_LOOPBACK) @@ -2530,7 +2529,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, if (oldflp->oif == 0 && (ipv4_is_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF))) { + ipv4_is_lbcast(oldflp->fl4_dst))) { /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ dev_out = __ip_dev_find(net, oldflp->fl4_src, false); if (dev_out == NULL) @@ -2574,7 +2573,7 @@ static int ip_route_output_slow(struct net *net, struct rtable **rp, goto out; /* Wrong error code */ if (ipv4_is_local_multicast(oldflp->fl4_dst) || - oldflp->fl4_dst == htonl(0xFFFFFFFF)) { + ipv4_is_lbcast(oldflp->fl4_dst)) { if (!fl.fl4_src) fl.fl4_src = inet_select_addr(dev_out, 0, RT_SCOPE_LINK); -- cgit v1.1 From c2355e1ab910278a94d487b78590ee3c8eecd08a Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 13 Oct 2010 16:01:49 +0000 Subject: bonding: Fix bonding drivers improper modification of netpoll structure The bonding driver currently modifies the netpoll structure in its xmit path while sending frames from netpoll. This is racy, as other cpus can access the netpoll structure in parallel. Since the bonding driver points np->dev to a slave device, other cpus can inadvertently attempt to send data directly to slave devices, leading to improper locking with the bonding master, lost frames, and deadlocks. This patch fixes that up. This patch also removes the real_dev pointer from the netpoll structure as that data is really only used by bonding in the poll_controller, and we can emulate its behavior by check each slave for IS_UP. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 537e01a..4e98ffa 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -288,11 +288,11 @@ static int netpoll_owner_active(struct net_device *dev) return 0; } -void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) +void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb, + struct net_device *dev) { int status = NETDEV_TX_BUSY; unsigned long tries; - struct net_device *dev = np->dev; const struct net_device_ops *ops = dev->netdev_ops; /* It is up to the caller to keep npinfo alive. */ struct netpoll_info *npinfo = np->dev->npinfo; @@ -346,7 +346,7 @@ void netpoll_send_skb(struct netpoll *np, struct sk_buff *skb) schedule_delayed_work(&npinfo->tx_work,0); } } -EXPORT_SYMBOL(netpoll_send_skb); +EXPORT_SYMBOL(netpoll_send_skb_on_dev); void netpoll_send_udp(struct netpoll *np, const char *msg, int len) { -- cgit v1.1 From 990c3d6f9c4115347659fc2b163907c8c832ae44 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Wed, 13 Oct 2010 16:01:51 +0000 Subject: bonding: Fix napi poll for bonding driver Usually the netpoll path, when preforming a napi poll can get away with just polling all the napi instances of the configured device. Thats not the case for the bonding driver however, as the napi instances which may wind up getting flagged as needing polling after the poll_controller call don't belong to the bonded device, but rather to the slave devices. Fix this by checking the device in question for the IFF_MASTER flag, if set, we know we need to check the full poll list for this cpu, rather than just the devices napi instance list. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 4e98ffa..d79d221 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -156,8 +156,15 @@ static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int budget = 16; + struct softnet_data *sd = &__get_cpu_var(softnet_data); + struct list_head *nlist; - list_for_each_entry(napi, &dev->napi_list, dev_list) { + if (dev->flags & IFF_MASTER) + nlist = &sd->poll_list; + else + nlist = &dev->napi_list; + + list_for_each_entry(napi, nlist, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { budget = poll_one_napi(dev->npinfo, napi, budget); -- cgit v1.1 From ebbf41df4aabb6d506fa18ea8cb4c2b4388a18b9 Mon Sep 17 00:00:00 2001 From: Pablo Neira Ayuso Date: Tue, 19 Oct 2010 10:19:06 +0200 Subject: netfilter: ctnetlink: add expectation deletion events This patch allows to listen to events that inform about expectations destroyed. Signed-off-by: Pablo Neira Ayuso Signed-off-by: Patrick McHardy --- net/netfilter/nf_conntrack_expect.c | 6 ++++-- net/netfilter/nf_conntrack_netlink.c | 30 +++++++++++++++++++++--------- 2 files changed, 25 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c index b30a1f2..46e8966 100644 --- a/net/netfilter/nf_conntrack_expect.c +++ b/net/netfilter/nf_conntrack_expect.c @@ -41,7 +41,8 @@ static struct kmem_cache *nf_ct_expect_cachep __read_mostly; static HLIST_HEAD(nf_ct_userspace_expect_list); /* nf_conntrack_expect helper functions */ -void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) +void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp, + u32 pid, int report) { struct nf_conn_help *master_help = nfct_help(exp->master); struct net *net = nf_ct_exp_net(exp); @@ -55,11 +56,12 @@ void nf_ct_unlink_expect(struct nf_conntrack_expect *exp) if (!(exp->flags & NF_CT_EXPECT_USERSPACE)) master_help->expecting[exp->class]--; + nf_ct_expect_event_report(IPEXP_DESTROY, exp, pid, report); nf_ct_expect_put(exp); NF_CT_STAT_INC(net, expect_delete); } -EXPORT_SYMBOL_GPL(nf_ct_unlink_expect); +EXPORT_SYMBOL_GPL(nf_ct_unlink_expect_report); static void nf_ct_expectation_timed_out(unsigned long ul_expect) { diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index b4077be..62bad22 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -1632,17 +1632,20 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) struct nlmsghdr *nlh; struct nfgenmsg *nfmsg; struct sk_buff *skb; - unsigned int type; + unsigned int type, group; int flags = 0; - if (events & (1 << IPEXP_NEW)) { + if (events & (1 << IPEXP_DESTROY)) { + type = IPCTNL_MSG_EXP_DELETE; + group = NFNLGRP_CONNTRACK_EXP_DESTROY; + } else if (events & (1 << IPEXP_NEW)) { type = IPCTNL_MSG_EXP_NEW; flags = NLM_F_CREATE|NLM_F_EXCL; + group = NFNLGRP_CONNTRACK_EXP_NEW; } else return 0; - if (!item->report && - !nfnetlink_has_listeners(net, NFNLGRP_CONNTRACK_EXP_NEW)) + if (!item->report && !nfnetlink_has_listeners(net, group)) return 0; skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); @@ -1665,8 +1668,7 @@ ctnetlink_expect_event(unsigned int events, struct nf_exp_event *item) rcu_read_unlock(); nlmsg_end(skb, nlh); - nfnetlink_send(skb, net, item->pid, NFNLGRP_CONNTRACK_EXP_NEW, - item->report, GFP_ATOMIC); + nfnetlink_send(skb, net, item->pid, group, item->report, GFP_ATOMIC); return 0; nla_put_failure: @@ -1849,7 +1851,13 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, } /* after list removal, usage count == 1 */ - nf_ct_unexpect_related(exp); + spin_lock_bh(&nf_conntrack_lock); + if (del_timer(&exp->timeout)) { + nf_ct_unlink_expect_report(exp, NETLINK_CB(skb).pid, + nlmsg_report(nlh)); + nf_ct_expect_put(exp); + } + spin_unlock_bh(&nf_conntrack_lock); /* have to put what we 'get' above. * after this line usage count == 0 */ nf_ct_expect_put(exp); @@ -1866,7 +1874,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, m_help = nfct_help(exp->master); if (!strcmp(m_help->helper->name, name) && del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); nf_ct_expect_put(exp); } } @@ -1880,7 +1890,9 @@ ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb, &net->ct.expect_hash[i], hnode) { if (del_timer(&exp->timeout)) { - nf_ct_unlink_expect(exp); + nf_ct_unlink_expect_report(exp, + NETLINK_CB(skb).pid, + nlmsg_report(nlh)); nf_ct_expect_put(exp); } } -- cgit v1.1 From 714f095f74582764d629785f03b459a3d0503624 Mon Sep 17 00:00:00 2001 From: Hans Schillstrom Date: Tue, 19 Oct 2010 10:38:48 +0200 Subject: ipvs: IPv6 tunnel mode IPv6 encapsulation uses a bad source address for the tunnel. i.e. VIP will be used as local-addr and encap. dst addr. Decapsulation will not accept this. Example LVS (eth1 2003::2:0:1/96, VIP 2003::2:0:100) (eth0 2003::1:0:1/96) RS (ethX 2003::1:0:5/96) tcpdump 2003::2:0:100 > 2003::1:0:5: IP6 (hlim 63, next-header TCP (6) payload length: 40) 2003::3:0:10.50991 > 2003::2:0:100.http: Flags [S], cksum 0x7312 (correct), seq 3006460279, win 5760, options [mss 1440,sackOK,TS val 1904932 ecr 0,nop,wscale 3], length 0 In Linux IPv6 impl. you can't have a tunnel with an any cast address receiving packets (I have not tried to interpret RFC 2473) To have receive capabilities the tunnel must have: - Local address set as multicast addr or an unicast addr - Remote address set as an unicast addr. - Loop back addres or Link local address are not allowed. This causes us to setup a tunnel in the Real Server with the LVS as the remote address, here you can't use the VIP address since it's used inside the tunnel. Solution Use outgoing interface IPv6 address (match against the destination). i.e. use ip6_route_output() to look up the route cache and then use ipv6_dev_get_saddr(...) to set the source address of the encapsulated packet. Additionally, cache the results in new destination fields: dst_cookie and dst_saddr and properly check the returned dst from ip6_route_output. We now add xfrm_lookup call only for the tunneling method where the source address is a local one. Signed-off-by:Hans Schillstrom Signed-off-by: Patrick McHardy --- net/netfilter/ipvs/ip_vs_xmit.c | 171 +++++++++++++++++++++------------------- 1 file changed, 92 insertions(+), 79 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 8817afa..b0bd8af 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -26,6 +26,7 @@ #include /* for ip_route_output */ #include #include +#include #include #include #include @@ -37,26 +38,27 @@ * Destination cache to speed up outgoing route lookup */ static inline void -__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst) +__ip_vs_dst_set(struct ip_vs_dest *dest, u32 rtos, struct dst_entry *dst, + u32 dst_cookie) { struct dst_entry *old_dst; old_dst = dest->dst_cache; dest->dst_cache = dst; dest->dst_rtos = rtos; + dest->dst_cookie = dst_cookie; dst_release(old_dst); } static inline struct dst_entry * -__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) +__ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) { struct dst_entry *dst = dest->dst_cache; if (!dst) return NULL; - if ((dst->obsolete - || (dest->af == AF_INET && rtos != dest->dst_rtos)) && - dst->ops->check(dst, cookie) == NULL) { + if ((dst->obsolete || rtos != dest->dst_rtos) && + dst->ops->check(dst, dest->dst_cookie) == NULL) { dest->dst_cache = NULL; dst_release(dst); return NULL; @@ -66,15 +68,16 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos, u32 cookie) } static struct rtable * -__ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) { + struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct ip_vs_dest *dest = cp->dest; if (dest) { spin_lock(&dest->dst_lock); if (!(rt = (struct rtable *) - __ip_vs_dst_check(dest, rtos, 0))) { + __ip_vs_dst_check(dest, rtos))) { struct flowi fl = { .oif = 0, .nl_u = { @@ -84,13 +87,13 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { spin_unlock(&dest->dst_lock); IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &dest->addr.ip); return NULL; } - __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst)); + __ip_vs_dst_set(dest, rtos, dst_clone(&rt->dst), 0); IP_VS_DBG(10, "new dst %pI4, refcnt=%d, rtos=%X\n", &dest->addr.ip, atomic_read(&rt->dst.__refcnt), rtos); @@ -106,7 +109,7 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) .tos = rtos, } }, }; - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", &cp->daddr.ip); return NULL; @@ -117,62 +120,79 @@ __ip_vs_get_out_rt(struct ip_vs_conn *cp, u32 rtos) } #ifdef CONFIG_IP_VS_IPV6 + +static struct dst_entry * +__ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, + struct in6_addr *ret_saddr, int do_xfrm) +{ + struct dst_entry *dst; + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip6_u = { + .daddr = *daddr, + }, + }, + }; + + dst = ip6_route_output(net, NULL, &fl); + if (dst->error) + goto out_err; + if (!ret_saddr) + return dst; + if (ipv6_addr_any(&fl.fl6_src) && + ipv6_dev_get_saddr(net, ip6_dst_idev(dst)->dev, + &fl.fl6_dst, 0, &fl.fl6_src) < 0) + goto out_err; + if (do_xfrm && xfrm_lookup(net, &dst, &fl, NULL, 0) < 0) + goto out_err; + ipv6_addr_copy(ret_saddr, &fl.fl6_src); + return dst; + +out_err: + dst_release(dst); + IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", daddr); + return NULL; +} + static struct rt6_info * -__ip_vs_get_out_rt_v6(struct ip_vs_conn *cp) +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, + struct in6_addr *ret_saddr, int do_xfrm) { + struct net *net = dev_net(skb->dev); struct rt6_info *rt; /* Route to the other host */ struct ip_vs_dest *dest = cp->dest; + struct dst_entry *dst; if (dest) { spin_lock(&dest->dst_lock); - rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0, 0); + rt = (struct rt6_info *)__ip_vs_dst_check(dest, 0); if (!rt) { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = dest->addr.in6, - .saddr = { - .s6_addr32 = - { 0, 0, 0, 0 }, - }, - }, - }, - }; + u32 cookie; - rt = (struct rt6_info *)ip6_route_output(&init_net, - NULL, &fl); - if (!rt) { + dst = __ip_vs_route_output_v6(net, &dest->addr.in6, + &dest->dst_saddr, + do_xfrm); + if (!dst) { spin_unlock(&dest->dst_lock); - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", - &dest->addr.in6); return NULL; } - __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst)); - IP_VS_DBG(10, "new dst %pI6, refcnt=%d\n", - &dest->addr.in6, + rt = (struct rt6_info *) dst; + cookie = rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + __ip_vs_dst_set(dest, 0, dst_clone(&rt->dst), cookie); + IP_VS_DBG(10, "new dst %pI6, src %pI6, refcnt=%d\n", + &dest->addr.in6, &dest->dst_saddr, atomic_read(&rt->dst.__refcnt)); } + if (ret_saddr) + ipv6_addr_copy(ret_saddr, &dest->dst_saddr); spin_unlock(&dest->dst_lock); } else { - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = cp->daddr.in6, - .saddr = { - .s6_addr32 = { 0, 0, 0, 0 }, - }, - }, - }, - }; - - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("ip6_route_output error, dest: %pI6\n", - &cp->daddr.in6); + dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr, + do_xfrm); + if (!dst) return NULL; - } + rt = (struct rt6_info *) dst; } return rt; @@ -248,6 +268,7 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { + struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); u8 tos = iph->tos; @@ -263,7 +284,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (ip_route_output_key(&init_net, &rt, &fl)) { + if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", __func__, &iph->daddr); goto tx_error_icmp; @@ -313,25 +334,18 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { + struct net *net = dev_net(skb->dev); + struct dst_entry *dst; struct rt6_info *rt; /* Route to the other host */ struct ipv6hdr *iph = ipv6_hdr(skb); int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip6_u = { - .daddr = iph->daddr, - .saddr = { .s6_addr32 = {0, 0, 0, 0} }, } }, - }; EnterFunction(10); - rt = (struct rt6_info *)ip6_route_output(&init_net, NULL, &fl); - if (!rt) { - IP_VS_DBG_RL("%s(): ip6_route_output error, dest: %pI6\n", - __func__, &iph->daddr); + dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0); + if (!dst) goto tx_error_icmp; - } + rt = (struct rt6_info *) dst; /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -397,7 +411,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) goto tx_error_icmp; /* MTU checking */ @@ -472,7 +486,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); if (!rt) goto tx_error_icmp; @@ -557,7 +571,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct iphdr *old_iph = ip_hdr(skb); u8 tos = old_iph->tos; __be16 df = old_iph->frag_off; - sk_buff_data_t old_transport_header = skb->transport_header; struct iphdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; @@ -572,7 +585,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos)))) goto tx_error_icmp; tdev = rt->dst.dev; @@ -616,7 +629,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, old_iph = ip_hdr(skb); } - skb->transport_header = old_transport_header; + skb->transport_header = skb->network_header; /* fix old IP header checksum */ ip_send_check(old_iph); @@ -670,9 +683,9 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { struct rt6_info *rt; /* Route to the other host */ + struct in6_addr saddr; /* Source for tunnel */ struct net_device *tdev; /* Device to other host */ struct ipv6hdr *old_iph = ipv6_hdr(skb); - sk_buff_data_t old_transport_header = skb->transport_header; struct ipv6hdr *iph; /* Our new IP header */ unsigned int max_headroom; /* The extra header space needed */ int mtu; @@ -687,17 +700,17 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1); if (!rt) goto tx_error_icmp; tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); - /* TODO IPv6: do we need this check in IPv6? */ - if (mtu < 1280) { + if (mtu < IPV6_MIN_MTU) { dst_release(&rt->dst); - IP_VS_DBG_RL("%s(): mtu less than 1280\n", __func__); + IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, + IPV6_MIN_MTU); goto tx_error; } if (skb_dst(skb)) @@ -730,7 +743,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, old_iph = ipv6_hdr(skb); } - skb->transport_header = old_transport_header; + skb->transport_header = skb->network_header; skb_push(skb, sizeof(struct ipv6hdr)); skb_reset_network_header(skb); @@ -750,8 +763,8 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, be16_add_cpu(&iph->payload_len, sizeof(*old_iph)); iph->priority = old_iph->priority; memset(&iph->flow_lbl, 0, sizeof(iph->flow_lbl)); - iph->daddr = rt->rt6i_dst.addr; - iph->saddr = cp->vaddr.in6; /* rt->rt6i_src.addr; */ + ipv6_addr_copy(&iph->daddr, &cp->daddr.in6); + ipv6_addr_copy(&iph->saddr, &saddr); iph->hop_limit = old_iph->hop_limit; /* Another hack: avoid icmp_send in ip_fragment */ @@ -791,7 +804,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) goto tx_error_icmp; /* MTU checking */ @@ -843,7 +856,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); if (!rt) goto tx_error_icmp; @@ -919,7 +932,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(cp, RT_TOS(ip_hdr(skb)->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos)))) goto tx_error_icmp; /* MTU checking */ @@ -993,7 +1006,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - rt = __ip_vs_get_out_rt_v6(cp); + rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); if (!rt) goto tx_error_icmp; -- cgit v1.1 From 9e917dca74138cccf398ce8bb924c7fd2980ec1d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Oct 2010 00:39:18 +0000 Subject: net: avoid a dev refcount in ip_mc_find_dev() We hold RTNL in ip_mc_find_dev(), no need to touch device refcount. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_frontend.c | 2 +- net/ipv4/igmp.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 3df057e..36e27c2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -153,7 +153,7 @@ static void fib_flush(struct net *net) * @addr: the source address * @devref: if true, take a reference on the found device * - * If a caller uses devref=false, it should be protected by RCU + * If a caller uses devref=false, it should be protected by RCU, or RTNL */ struct net_device *__ip_dev_find(struct net *net, __be32 addr, bool devref) { diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 25f3396..a525328 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1418,6 +1418,7 @@ void ip_mc_destroy_dev(struct in_device *in_dev) write_unlock_bh(&in_dev->mc_list_lock); } +/* RTNL is locked */ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) { struct flowi fl = { .nl_u = { .ip4_u = @@ -1433,10 +1434,9 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) return idev; } if (imr->imr_address.s_addr) { - dev = ip_dev_find(net, imr->imr_address.s_addr); + dev = __ip_dev_find(net, imr->imr_address.s_addr, false); if (!dev) return NULL; - dev_put(dev); } if (!dev && !ip_route_output_key(net, &rt, &fl)) { -- cgit v1.1 From 8723e1b4ad9be4444423b4d41509ce859a629649 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 19 Oct 2010 00:39:26 +0000 Subject: inet: RCU changes in inetdev_by_index() Convert inetdev_by_index() to not increment in_dev refcount. Callers hold RCU or RTNL, and should not decrement in_dev refcount. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/devinet.c | 7 ++++--- net/ipv4/fib_semantics.c | 25 +++++++++++-------------- net/ipv4/igmp.c | 2 -- net/ipv4/ip_gre.c | 4 +--- 4 files changed, 16 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index c2ff48f..dc94b03 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -403,6 +403,9 @@ static int inet_set_ifa(struct net_device *dev, struct in_ifaddr *ifa) return inet_insert_ifa(ifa); } +/* Caller must hold RCU or RTNL : + * We dont take a reference on found in_device + */ struct in_device *inetdev_by_index(struct net *net, int ifindex) { struct net_device *dev; @@ -411,7 +414,7 @@ struct in_device *inetdev_by_index(struct net *net, int ifindex) rcu_read_lock(); dev = dev_get_by_index_rcu(net, ifindex); if (dev) - in_dev = in_dev_get(dev); + in_dev = rcu_dereference_rtnl(dev->ip_ptr); rcu_read_unlock(); return in_dev; } @@ -453,8 +456,6 @@ static int inet_rtm_deladdr(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg goto errout; } - __in_dev_put(in_dev); - for (ifap = &in_dev->ifa_list; (ifa = *ifap) != NULL; ifap = &ifa->ifa_next) { if (tb[IFA_LOCAL] && diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 0f80dfc..6734c9c 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -590,32 +590,29 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, if (!dev) goto out; dev_hold(dev); - err = -ENETDOWN; - if (!(dev->flags & IFF_UP)) - goto out; - err = 0; -out: - rcu_read_unlock(); - return err; + err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN; } else { struct in_device *in_dev; if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK)) return -EINVAL; + rcu_read_lock(); + err = -ENODEV; in_dev = inetdev_by_index(net, nh->nh_oif); if (in_dev == NULL) - return -ENODEV; - if (!(in_dev->dev->flags & IFF_UP)) { - in_dev_put(in_dev); - return -ENETDOWN; - } + goto out; + err = -ENETDOWN; + if (!(in_dev->dev->flags & IFF_UP)) + goto out; nh->nh_dev = in_dev->dev; dev_hold(nh->nh_dev); nh->nh_scope = RT_SCOPE_HOST; - in_dev_put(in_dev); + err = 0; } - return 0; +out: + rcu_read_unlock(); + return err; } static inline unsigned int fib_laddr_hashfn(__be32 val) diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index a525328..c8877c6 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -1429,8 +1429,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr) if (imr->imr_ifindex) { idev = inetdev_by_index(net, imr->imr_ifindex); - if (idev) - __in_dev_put(idev); return idev; } if (imr->imr_address.s_addr) { diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c index 9d421f4..d0ffcbe 100644 --- a/net/ipv4/ip_gre.c +++ b/net/ipv4/ip_gre.c @@ -1245,10 +1245,8 @@ static int ipgre_close(struct net_device *dev) if (ipv4_is_multicast(t->parms.iph.daddr) && t->mlink) { struct in_device *in_dev; in_dev = inetdev_by_index(dev_net(dev), t->mlink); - if (in_dev) { + if (in_dev) ip_mc_dec_group(in_dev, t->parms.iph.daddr); - in_dev_put(in_dev); - } } return 0; } -- cgit v1.1 From d86bef73b4a24e59e7c1f896a72bbf38430ac2c6 Mon Sep 17 00:00:00 2001 From: Eduardo Blanco Date: Tue, 19 Oct 2010 10:26:47 +0100 Subject: Fixed race condition at ip_vs.ko module init. Lists were initialized after the module was registered. Multiple ipvsadm processes at module load triggered a race condition that resulted in a null pointer dereference in do_ip_vs_get_ctl(). As a result, __ip_vs_mutex was left locked preventing all further ipvsadm commands. Signed-off-by: Eduardo J. Blanco Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ctl.c | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index a697591..0b884d3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -3400,6 +3400,16 @@ int __init ip_vs_control_init(void) EnterFunction(2); + /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ + for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_svc_table[idx]); + INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); + } + for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { + INIT_LIST_HEAD(&ip_vs_rtable[idx]); + } + smp_wmb(); + ret = nf_register_sockopt(&ip_vs_sockopts); if (ret) { pr_err("cannot register sockopt.\n"); @@ -3418,15 +3428,6 @@ int __init ip_vs_control_init(void) sysctl_header = register_sysctl_paths(net_vs_ctl_path, vs_vars); - /* Initialize ip_vs_svc_table, ip_vs_svc_fwm_table, ip_vs_rtable */ - for(idx = 0; idx < IP_VS_SVC_TAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_svc_table[idx]); - INIT_LIST_HEAD(&ip_vs_svc_fwm_table[idx]); - } - for(idx = 0; idx < IP_VS_RTAB_SIZE; idx++) { - INIT_LIST_HEAD(&ip_vs_rtable[idx]); - } - ip_vs_new_estimator(&ip_vs_stats); /* Hook the defense timer */ -- cgit v1.1 From f13d493d9cf772d510d78ae00bb9f4d680b3170b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 19 Oct 2010 07:04:26 +0000 Subject: netpoll: Revert napi_poll fix for bonding driver In an erlier patch I modified napi_poll so that devices with IFF_MASTER polled the per_cpu list instead of the device list for napi. I did this because the bonding driver has no napi instances to poll, it instead expects to check the slave devices napi instances, which napi_poll was unaware of. Looking at this more closely however, I now see this isn't strictly needed. As the bond driver poll_controller calls the slaves poll_controller via netpoll_poll_dev, which recursively calls poll_napi on each slave, allowing those napi instances to get serviced. The earlier patch isn't at all harmfull, its just not needed, so lets revert it to make the code cleaner. Sorry for the noise, Signed-off-by: Neil Horman Reviewed-by: WANG Cong Signed-off-by: David S. Miller --- net/core/netpoll.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) (limited to 'net') diff --git a/net/core/netpoll.c b/net/core/netpoll.c index d79d221..4e98ffa 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -156,15 +156,8 @@ static void poll_napi(struct net_device *dev) { struct napi_struct *napi; int budget = 16; - struct softnet_data *sd = &__get_cpu_var(softnet_data); - struct list_head *nlist; - if (dev->flags & IFF_MASTER) - nlist = &sd->poll_list; - else - nlist = &dev->napi_list; - - list_for_each_entry(napi, nlist, dev_list) { + list_for_each_entry(napi, &dev->napi_list, dev_list) { if (napi->poll_owner != smp_processor_id() && spin_trylock(&napi->poll_lock)) { budget = poll_one_napi(dev->npinfo, napi, budget); -- cgit v1.1 From c5e90f562047ff9713183cf5e18f5e8997bc7373 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Tue, 19 Oct 2010 21:51:03 +0000 Subject: phonet: remove the unused variable pn Signed-off-by: Changli Gao Signed-off-by: David S. Miller --- net/phonet/pep.c | 1 - 1 file changed, 1 deletion(-) (limited to 'net') diff --git a/net/phonet/pep.c b/net/phonet/pep.c index 9c903f9..3e60f2e 100644 --- a/net/phonet/pep.c +++ b/net/phonet/pep.c @@ -300,7 +300,6 @@ static int pipe_handler_send_ind(struct sock *sk, u8 utid, u8 msg_id) static int pipe_handler_enable_pipe(struct sock *sk, int enable) { - struct pep_sock *pn = pep_sk(sk); int utid, req; if (enable) { -- cgit v1.1 From 55513fb4281464e97aa1ff2b9c906ca5aed917c5 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 17:55:58 +0000 Subject: net: fail alloc_netdev_mq if queue count < 1 In alloc_netdev_mq fail if requested queue_count < 1. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 04972a4..f44d29a 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5511,6 +5511,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, BUG_ON(strlen(name) >= sizeof(dev->name)); + if (queue_count < 1) { + pr_err("alloc_netdev: Unable to allocate device " + "with zero queues.\n"); + return NULL; + } + alloc_size = sizeof(struct net_device); if (sizeof_priv) { /* ensure 32-byte alignment of private area */ -- cgit v1.1 From bd25fa7ba59cd26094319dfba0011b48465f7355 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:00:16 +0000 Subject: net: cleanups in RX queue allocation Clean up in RX queue allocation. In netif_set_real_num_rx_queues return error on attempt to set zero queues, or requested number is greater than number of allocated queues. In netif_alloc_rx_queues, do BUG_ON if queue_count is zero. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++++++++++------------------- 1 file changed, 17 insertions(+), 19 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index f44d29a..d33adec 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1583,12 +1583,12 @@ int netif_set_real_num_rx_queues(struct net_device *dev, unsigned int rxq) { int rc; + if (rxq < 1 || rxq > dev->num_rx_queues) + return -EINVAL; + if (dev->reg_state == NETREG_REGISTERED) { ASSERT_RTNL(); - if (rxq > dev->num_rx_queues) - return -EINVAL; - rc = net_rx_queue_update_kobjects(dev, dev->real_num_rx_queues, rxq); if (rc) @@ -5013,25 +5013,23 @@ static int netif_alloc_rx_queues(struct net_device *dev) { #ifdef CONFIG_RPS unsigned int i, count = dev->num_rx_queues; + struct netdev_rx_queue *rx; - if (count) { - struct netdev_rx_queue *rx; - - rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); - if (!rx) { - pr_err("netdev: Unable to allocate %u rx queues.\n", - count); - return -ENOMEM; - } - dev->_rx = rx; + BUG_ON(count < 1); - /* - * Set a pointer to first element in the array which holds the - * reference count. - */ - for (i = 0; i < count; i++) - rx[i].first = rx; + rx = kcalloc(count, sizeof(struct netdev_rx_queue), GFP_KERNEL); + if (!rx) { + pr_err("netdev: Unable to allocate %u rx queues.\n", count); + return -ENOMEM; } + dev->_rx = rx; + + /* + * Set a pointer to first element in the array which holds the + * reference count. + */ + for (i = 0; i < count; i++) + rx[i].first = rx; #endif return 0; } -- cgit v1.1 From e6484930d7c73d324bccda7d43d131088da697b9 Mon Sep 17 00:00:00 2001 From: Tom Herbert Date: Mon, 18 Oct 2010 18:04:39 +0000 Subject: net: allocate tx queues in register_netdevice This patch introduces netif_alloc_netdev_queues which is called from register_device instead of alloc_netdev_mq. This makes TX queue allocation symmetric with RX allocation. Also, queue locks allocation is done in netdev_init_one_queue. Change set_real_num_tx_queues to fail if requested number < 1 or greater than number of allocated queues. Signed-off-by: Tom Herbert Acked-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dev.c | 106 ++++++++++++++++++++++++++++----------------------------- 1 file changed, 53 insertions(+), 53 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index d33adec..4c3ac53 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1553,18 +1553,20 @@ static void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev) * Routine to help set real_num_tx_queues. To avoid skbs mapped to queues * greater then real_num_tx_queues stale skbs on the qdisc must be flushed. */ -void netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) +int netif_set_real_num_tx_queues(struct net_device *dev, unsigned int txq) { - unsigned int real_num = dev->real_num_tx_queues; + if (txq < 1 || txq > dev->num_tx_queues) + return -EINVAL; - if (unlikely(txq > dev->num_tx_queues)) - ; - else if (txq > real_num) - dev->real_num_tx_queues = txq; - else if (txq < real_num) { - dev->real_num_tx_queues = txq; - qdisc_reset_all_tx_gt(dev, txq); + if (dev->reg_state == NETREG_REGISTERED) { + ASSERT_RTNL(); + + if (txq < dev->real_num_tx_queues) + qdisc_reset_all_tx_gt(dev, txq); } + + dev->real_num_tx_queues = txq; + return 0; } EXPORT_SYMBOL(netif_set_real_num_tx_queues); @@ -4928,20 +4930,6 @@ static void rollback_registered(struct net_device *dev) rollback_registered_many(&single); } -static void __netdev_init_queue_locks_one(struct net_device *dev, - struct netdev_queue *dev_queue, - void *_unused) -{ - spin_lock_init(&dev_queue->_xmit_lock); - netdev_set_xmit_lockdep_class(&dev_queue->_xmit_lock, dev->type); - dev_queue->xmit_lock_owner = -1; -} - -static void netdev_init_queue_locks(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, __netdev_init_queue_locks_one, NULL); -} - unsigned long netdev_fix_features(unsigned long features, const char *name) { /* Fix illegal SG+CSUM combinations. */ @@ -5034,6 +5022,41 @@ static int netif_alloc_rx_queues(struct net_device *dev) return 0; } +static int netif_alloc_netdev_queues(struct net_device *dev) +{ + unsigned int count = dev->num_tx_queues; + struct netdev_queue *tx; + + BUG_ON(count < 1); + + tx = kcalloc(count, sizeof(struct netdev_queue), GFP_KERNEL); + if (!tx) { + pr_err("netdev: Unable to allocate %u tx queues.\n", + count); + return -ENOMEM; + } + dev->_tx = tx; + return 0; +} + +static void netdev_init_one_queue(struct net_device *dev, + struct netdev_queue *queue, + void *_unused) +{ + queue->dev = dev; + + /* Initialize queue lock */ + spin_lock_init(&queue->_xmit_lock); + netdev_set_xmit_lockdep_class(&queue->_xmit_lock, dev->type); + queue->xmit_lock_owner = -1; +} + +static void netdev_init_queues(struct net_device *dev) +{ + netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); + spin_lock_init(&dev->tx_global_lock); +} + /** * register_netdevice - register a network device * @dev: device to register @@ -5067,7 +5090,6 @@ int register_netdevice(struct net_device *dev) spin_lock_init(&dev->addr_list_lock); netdev_set_addr_lockdep_class(dev); - netdev_init_queue_locks(dev); dev->iflink = -1; @@ -5075,6 +5097,12 @@ int register_netdevice(struct net_device *dev) if (ret) goto out; + ret = netif_alloc_netdev_queues(dev); + if (ret) + goto out; + + netdev_init_queues(dev); + /* Init, if this function is available */ if (dev->netdev_ops->ndo_init) { ret = dev->netdev_ops->ndo_init(dev); @@ -5456,19 +5484,6 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev, } EXPORT_SYMBOL(dev_get_stats); -static void netdev_init_one_queue(struct net_device *dev, - struct netdev_queue *queue, - void *_unused) -{ - queue->dev = dev; -} - -static void netdev_init_queues(struct net_device *dev) -{ - netdev_for_each_tx_queue(dev, netdev_init_one_queue, NULL); - spin_lock_init(&dev->tx_global_lock); -} - struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) { struct netdev_queue *queue = dev_ingress_queue(dev); @@ -5480,7 +5495,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) if (!queue) return NULL; netdev_init_one_queue(dev, queue, NULL); - __netdev_init_queue_locks_one(dev, queue, NULL); queue->qdisc = &noop_qdisc; queue->qdisc_sleeping = &noop_qdisc; rcu_assign_pointer(dev->ingress_queue, queue); @@ -5502,7 +5516,6 @@ struct netdev_queue *dev_ingress_queue_create(struct net_device *dev) struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, void (*setup)(struct net_device *), unsigned int queue_count) { - struct netdev_queue *tx; struct net_device *dev; size_t alloc_size; struct net_device *p; @@ -5530,20 +5543,12 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, return NULL; } - tx = kcalloc(queue_count, sizeof(struct netdev_queue), GFP_KERNEL); - if (!tx) { - printk(KERN_ERR "alloc_netdev: Unable to allocate " - "tx qdiscs.\n"); - goto free_p; - } - - dev = PTR_ALIGN(p, NETDEV_ALIGN); dev->padded = (char *)dev - (char *)p; dev->pcpu_refcnt = alloc_percpu(int); if (!dev->pcpu_refcnt) - goto free_tx; + goto free_p; if (dev_addr_init(dev)) goto free_pcpu; @@ -5553,7 +5558,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev_net_set(dev, &init_net); - dev->_tx = tx; dev->num_tx_queues = queue_count; dev->real_num_tx_queues = queue_count; @@ -5564,8 +5568,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, dev->gso_max_size = GSO_MAX_SIZE; - netdev_init_queues(dev); - INIT_LIST_HEAD(&dev->ethtool_ntuple_list.list); dev->ethtool_ntuple_list.count = 0; INIT_LIST_HEAD(&dev->napi_list); @@ -5576,8 +5578,6 @@ struct net_device *alloc_netdev_mq(int sizeof_priv, const char *name, strcpy(dev->name, name); return dev; -free_tx: - kfree(tx); free_pcpu: free_percpu(dev->pcpu_refcnt); free_p: -- cgit v1.1 From 27b75c95f10d249574d9c4cb9dab878107faede8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Oct 2010 05:44:11 +0000 Subject: net: avoid RCU for NOCACHE dst There is no point using RCU for dst we allocate for a very short time (used once). Change dst_release() to take DST_NOCACHE into account, but also change skb_dst_set_noref() to force a refcount increment for such dst. This is a _huge_ gain, because we dont waste memory to store xx thousand of dsts. Instead of queueing them to RCU, we can free them instantly. CPU caches can stay hot, re-using same memory blocks to hold temporary dsts. Note : remove unneeded smp_mb__before_atomic_dec(); in dst_release(), since atomic_dec_return() implies a full memory barrier. Stress test, 160.000.000 udp frames sent, IP route cache disabled (DDOS). Before: real 0m38.091s user 0m13.189s sys 7m53.018s After: real 0m29.946s user 0m12.157s sys 7m40.605s For reference, if IP route cache was enabled : real 0m32.030s user 0m10.521s sys 8m15.243s Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/core/dst.c | 29 ++++++++++++++++++++++++++++- net/ipv4/route.c | 9 ++++----- 2 files changed, 32 insertions(+), 6 deletions(-) (limited to 'net') diff --git a/net/core/dst.c b/net/core/dst.c index 32e542d..8abe628 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -271,13 +271,40 @@ void dst_release(struct dst_entry *dst) if (dst) { int newrefcnt; - smp_mb__before_atomic_dec(); newrefcnt = atomic_dec_return(&dst->__refcnt); WARN_ON(newrefcnt < 0); + if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) { + dst = dst_destroy(dst); + if (dst) + __dst_free(dst); + } } } EXPORT_SYMBOL(dst_release); +/** + * skb_dst_set_noref - sets skb dst, without a reference + * @skb: buffer + * @dst: dst entry + * + * Sets skb dst, assuming a reference was not taken on dst + * skb_dst_drop() should not dst_release() this dst + */ +void skb_dst_set_noref(struct sk_buff *skb, struct dst_entry *dst) +{ + WARN_ON(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); + /* If dst not in cache, we must take a reference, because + * dst_release() will destroy dst as soon as its refcount becomes zero + */ + if (unlikely(dst->flags & DST_NOCACHE)) { + dst_hold(dst); + skb_dst_set(skb, dst); + } else { + skb->_skb_refdst = (unsigned long)dst | SKB_DST_NOREF; + } +} +EXPORT_SYMBOL(skb_dst_set_noref); + /* Dirty hack. We did it in 2.2 (in __dst_free), * we have _very_ good reasons not to repeat * this mistake in 2.3, but we have no choice diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ff98983d..d6cb2bf 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1105,9 +1105,9 @@ restart: * Note that we do rt_free on this new route entry, so that * once its refcount hits zero, we are still able to reap it * (Thanks Alexey) - * Note also the rt_free uses call_rcu. We don't actually - * need rcu protection here, this is just our path to get - * on the route gc list. + * Note: To avoid expensive rcu stuff for this uncached dst, + * we set DST_NOCACHE so that dst_release() can free dst without + * waiting a grace period. */ rt->dst.flags |= DST_NOCACHE; @@ -1117,12 +1117,11 @@ restart: if (net_ratelimit()) printk(KERN_WARNING "Neighbour table failure & not caching routes.\n"); - rt_drop(rt); + ip_rt_put(rt); return err; } } - rt_free(rt); goto skip_hashing; } -- cgit v1.1 From 13937911f93ef52ae652f4652761aea6a58d3193 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:01 +0000 Subject: ebtables: Allow filtering of hardware accelerated vlan frames. An upcoming commit will allow packets with hardware vlan acceleration information to be passed though more parts of the network stack, including packets trunked through the bridge. This adds support for matching and filtering those packets through ebtables. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 16 +++++++++------- net/bridge/netfilter/ebt_vlan.c | 21 ++++++++++++++------- net/bridge/netfilter/ebtables.c | 15 +++++++++++---- 3 files changed, 34 insertions(+), 18 deletions(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 7f9ce96..47c2dab 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -64,22 +64,24 @@ static int brnf_filter_pppoe_tagged __read_mostly = 0; static inline __be16 vlan_proto(const struct sk_buff *skb) { - return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + if (vlan_tx_tag_present(skb)) + return skb->protocol; + else if (skb->protocol == htons(ETH_P_8021Q)) + return vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; + else + return 0; } #define IS_VLAN_IP(skb) \ - (skb->protocol == htons(ETH_P_8021Q) && \ - vlan_proto(skb) == htons(ETH_P_IP) && \ + (vlan_proto(skb) == htons(ETH_P_IP) && \ brnf_filter_vlan_tagged) #define IS_VLAN_IPV6(skb) \ - (skb->protocol == htons(ETH_P_8021Q) && \ - vlan_proto(skb) == htons(ETH_P_IPV6) &&\ + (vlan_proto(skb) == htons(ETH_P_IPV6) && \ brnf_filter_vlan_tagged) #define IS_VLAN_ARP(skb) \ - (skb->protocol == htons(ETH_P_8021Q) && \ - vlan_proto(skb) == htons(ETH_P_ARP) && \ + (vlan_proto(skb) == htons(ETH_P_ARP) && \ brnf_filter_vlan_tagged) static inline __be16 pppoe_proto(const struct sk_buff *skb) diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index 87b53b3..cc11d6b 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -39,8 +39,6 @@ static bool ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct ebt_vlan_info *info = par->matchinfo; - const struct vlan_hdr *fp; - struct vlan_hdr _frame; unsigned short TCI; /* Whole TCI, given from parsed frame */ unsigned short id; /* VLAN ID, given from frame TCI */ @@ -48,9 +46,20 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) /* VLAN encapsulated Type/Length field, given from orig frame */ __be16 encap; - fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); - if (fp == NULL) - return false; + if (vlan_tx_tag_present(skb)) { + TCI = vlan_tx_tag_get(skb); + encap = skb->protocol; + } else { + const struct vlan_hdr *fp; + struct vlan_hdr _frame; + + fp = skb_header_pointer(skb, 0, sizeof(_frame), &_frame); + if (fp == NULL) + return false; + + TCI = ntohs(fp->h_vlan_TCI); + encap = fp->h_vlan_encapsulated_proto; + } /* Tag Control Information (TCI) consists of the following elements: * - User_priority. The user_priority field is three bits in length, @@ -59,10 +68,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par) * (CFI) is a single bit flag value. Currently ignored. * - VLAN Identifier (VID). The VID is encoded as * an unsigned binary number. */ - TCI = ntohs(fp->h_vlan_TCI); id = TCI & VLAN_VID_MASK; prio = (TCI >> 13) & 0x7; - encap = fp->h_vlan_encapsulated_proto; /* Checking VLAN Identifier (VID) */ if (GET_BITMASK(EBT_VLAN_ID)) diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index bcc102e..a1dcf83 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -124,16 +124,23 @@ ebt_dev_check(const char *entry, const struct net_device *device) #define FWINV2(bool,invflg) ((bool) ^ !!(e->invflags & invflg)) /* process standard matches */ static inline int -ebt_basic_match(const struct ebt_entry *e, const struct ethhdr *h, +ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out) { + const struct ethhdr *h = eth_hdr(skb); + __be16 ethproto; int verdict, i; + if (vlan_tx_tag_present(skb)) + ethproto = htons(ETH_P_8021Q); + else + ethproto = h->h_proto; + if (e->bitmask & EBT_802_3) { - if (FWINV2(ntohs(h->h_proto) >= 1536, EBT_IPROTO)) + if (FWINV2(ntohs(ethproto) >= 1536, EBT_IPROTO)) return 1; } else if (!(e->bitmask & EBT_NOPROTO) && - FWINV2(e->ethproto != h->h_proto, EBT_IPROTO)) + FWINV2(e->ethproto != ethproto, EBT_IPROTO)) return 1; if (FWINV2(ebt_dev_check(e->in, in), EBT_IIN)) @@ -213,7 +220,7 @@ unsigned int ebt_do_table (unsigned int hook, struct sk_buff *skb, base = private->entries; i = 0; while (i < nentries) { - if (ebt_basic_match(point, eth_hdr(skb), in, out)) + if (ebt_basic_match(point, skb, in, out)) goto letscontinue; if (EBT_MATCH_ITERATE(point, ebt_do_match, skb, &acpar) != 0) -- cgit v1.1 From b738127dfb469bb9f595cdace30e7f881e8146b2 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:02 +0000 Subject: vlan: Rename VLAN_GROUP_ARRAY_LEN to VLAN_N_VID. VLAN_GROUP_ARRAY_LEN is simply the number of possible vlan VIDs. Since vlan groups will soon be more of an implementation detail for vlan devices, rename the constant to be descriptive of its actual purpose. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/8021q/vlan.c | 16 ++++++++-------- net/bridge/netfilter/ebt_vlan.c | 4 ++-- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 25c2133..54f22d8 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -439,7 +439,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, switch (event) { case NETDEV_CHANGE: /* Propagate real device state to vlan devices */ - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -450,7 +450,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_CHANGEADDR: /* Adjust unicast filters on underlying device */ - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -464,7 +464,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, break; case NETDEV_CHANGEMTU: - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -478,7 +478,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_FEAT_CHANGE: /* Propagate device features to underlying device */ - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -490,7 +490,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_DOWN: /* Put all VLANs for this dev in the down state too. */ - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -508,7 +508,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, case NETDEV_UP: /* Put all VLANs for this dev in the up state too. */ - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -532,7 +532,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, /* Delete all VLANs for this dev. */ grp->killall = 1; - for (i = 0; i < VLAN_GROUP_ARRAY_LEN; i++) { + for (i = 0; i < VLAN_N_VID; i++) { vlandev = vlan_group_get_device(grp, i); if (!vlandev) continue; @@ -540,7 +540,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, /* unregistration of last vlan destroys group, abort * afterwards */ if (grp->nr_vlans == 1) - i = VLAN_GROUP_ARRAY_LEN; + i = VLAN_N_VID; unregister_vlan_dev(vlandev, &list); } diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c index cc11d6b..eae67bf 100644 --- a/net/bridge/netfilter/ebt_vlan.c +++ b/net/bridge/netfilter/ebt_vlan.c @@ -118,10 +118,10 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par) * 0 - The null VLAN ID. * 1 - The default Port VID (PVID) * 0x0FFF - Reserved for implementation use. - * if_vlan.h: VLAN_GROUP_ARRAY_LEN 4096. */ + * if_vlan.h: VLAN_N_VID 4096. */ if (GET_BITMASK(EBT_VLAN_ID)) { if (!!info->id) { /* if id!=0 => check vid range */ - if (info->id > VLAN_GROUP_ARRAY_LEN) { + if (info->id > VLAN_N_VID) { pr_debug("id %d is out of range (1-4096)\n", info->id); return -EINVAL; -- cgit v1.1 From 7b9c60903714bf0a19d746b228864bad3497284e Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:04 +0000 Subject: vlan: Enable software emulation for vlan accleration. Currently users of hardware vlan accleration need to know whether the device supports it before generating packets. However, vlan acceleration will soon be available in a more flexible manner so knowing ahead of time becomes much more difficult. This adds a software fallback path for vlan packets on devices without the necessary offloading support, similar to other types of hardware accleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/dev.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 4c3ac53..1bfd96b 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1694,7 +1694,12 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol) static bool dev_can_checksum(struct net_device *dev, struct sk_buff *skb) { - if (can_checksum_protocol(dev->features, skb->protocol)) + int features = dev->features; + + if (vlan_tx_tag_present(skb)) + features &= dev->vlan_features; + + if (can_checksum_protocol(features, skb->protocol)) return true; if (skb->protocol == htons(ETH_P_8021Q)) { @@ -1793,6 +1798,16 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, int features) __be16 type = skb->protocol; int err; + if (type == htons(ETH_P_8021Q)) { + struct vlan_ethhdr *veh; + + if (unlikely(!pskb_may_pull(skb, VLAN_ETH_HLEN))) + return ERR_PTR(-EINVAL); + + veh = (struct vlan_ethhdr *)skb->data; + type = veh->h_vlan_encapsulated_proto; + } + skb_reset_mac_header(skb); skb->mac_len = skb->network_header - skb->mac_header; __skb_pull(skb, skb->mac_len); @@ -1964,9 +1979,14 @@ static inline void skb_orphan_try(struct sk_buff *skb) static inline int skb_needs_linearize(struct sk_buff *skb, struct net_device *dev) { + int features = dev->features; + + if (skb->protocol == htons(ETH_P_8021Q) || vlan_tx_tag_present(skb)) + features &= dev->vlan_features; + return skb_is_nonlinear(skb) && - ((skb_has_frag_list(skb) && !(dev->features & NETIF_F_FRAGLIST)) || - (skb_shinfo(skb)->nr_frags && (!(dev->features & NETIF_F_SG) || + ((skb_has_frag_list(skb) && !(features & NETIF_F_FRAGLIST)) || + (skb_shinfo(skb)->nr_frags && (!(features & NETIF_F_SG) || illegal_highdma(dev, skb)))); } @@ -1989,6 +2009,15 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev, skb_orphan_try(skb); + if (vlan_tx_tag_present(skb) && + !(dev->features & NETIF_F_HW_VLAN_TX)) { + skb = __vlan_put_tag(skb, vlan_tx_tag_get(skb)); + if (unlikely(!skb)) + goto out; + + skb->vlan_tci = 0; + } + if (netif_needs_gso(dev, skb)) { if (unlikely(dev_gso_segment(skb))) goto out_kfree_skb; @@ -2050,6 +2079,7 @@ out_kfree_gso_skb: skb->destructor = DEV_GSO_CB(skb)->destructor; out_kfree_skb: kfree_skb(skb); +out: return rc; } -- cgit v1.1 From 65ac6a5fa658b90f1be700c55e7cd72e4611015d Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:05 +0000 Subject: vlan: Avoid hash table lookup to find group. A struct net_device always maps to zero or one vlan groups and we always know the device when we are looking up a group. We currently do a hash table lookup on the device to find the group but it is much simpler to just store a pointer. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/8021q/vlan.c | 64 ++++++++-------------------------------------------- net/8021q/vlan.h | 17 -------------- net/8021q/vlan_dev.c | 2 +- 3 files changed, 11 insertions(+), 72 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index 54f22d8..f862dcc 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -44,9 +44,6 @@ int vlan_net_id __read_mostly; -/* Our listing of VLAN group(s) */ -static struct hlist_head vlan_group_hash[VLAN_GRP_HASH_SIZE]; - const char vlan_fullname[] = "802.1Q VLAN Support"; const char vlan_version[] = DRV_VERSION; static const char vlan_copyright[] = "Ben Greear "; @@ -59,40 +56,6 @@ static struct packet_type vlan_packet_type __read_mostly = { /* End of global variables definitions. */ -static inline unsigned int vlan_grp_hashfn(unsigned int idx) -{ - return ((idx >> VLAN_GRP_HASH_SHIFT) ^ idx) & VLAN_GRP_HASH_MASK; -} - -/* Must be invoked with RCU read lock (no preempt) */ -static struct vlan_group *__vlan_find_group(struct net_device *real_dev) -{ - struct vlan_group *grp; - struct hlist_node *n; - int hash = vlan_grp_hashfn(real_dev->ifindex); - - hlist_for_each_entry_rcu(grp, n, &vlan_group_hash[hash], hlist) { - if (grp->real_dev == real_dev) - return grp; - } - - return NULL; -} - -/* Find the protocol handler. Assumes VID < VLAN_VID_MASK. - * - * Must be invoked with RCU read lock (no preempt) - */ -struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id) -{ - struct vlan_group *grp = __vlan_find_group(real_dev); - - if (grp) - return vlan_group_get_device(grp, vlan_id); - - return NULL; -} - static void vlan_group_free(struct vlan_group *grp) { int i; @@ -111,8 +74,6 @@ static struct vlan_group *vlan_group_alloc(struct net_device *real_dev) return NULL; grp->real_dev = real_dev; - hlist_add_head_rcu(&grp->hlist, - &vlan_group_hash[vlan_grp_hashfn(real_dev->ifindex)]); return grp; } @@ -151,7 +112,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) ASSERT_RTNL(); - grp = __vlan_find_group(real_dev); + grp = real_dev->vlgrp; BUG_ON(!grp); /* Take it out of our own structures, but be sure to interlock with @@ -173,11 +134,10 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) if (grp->nr_vlans == 0) { vlan_gvrp_uninit_applicant(real_dev); + rcu_assign_pointer(real_dev->vlgrp, NULL); if (real_dev->features & NETIF_F_HW_VLAN_RX) ops->ndo_vlan_rx_register(real_dev, NULL); - hlist_del_rcu(&grp->hlist); - /* Free the group, after all cpu's are done. */ call_rcu(&grp->rcu, vlan_rcu_free); } @@ -207,7 +167,7 @@ int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id) return -EOPNOTSUPP; } - if (__find_vlan_dev(real_dev, vlan_id) != NULL) + if (vlan_find_dev(real_dev, vlan_id) != NULL) return -EEXIST; return 0; @@ -222,7 +182,7 @@ int register_vlan_dev(struct net_device *dev) struct vlan_group *grp, *ngrp = NULL; int err; - grp = __vlan_find_group(real_dev); + grp = real_dev->vlgrp; if (!grp) { ngrp = grp = vlan_group_alloc(real_dev); if (!grp) @@ -252,8 +212,11 @@ int register_vlan_dev(struct net_device *dev) vlan_group_set_device(grp, vlan_id, dev); grp->nr_vlans++; - if (ngrp && real_dev->features & NETIF_F_HW_VLAN_RX) - ops->ndo_vlan_rx_register(real_dev, ngrp); + if (ngrp) { + if (real_dev->features & NETIF_F_HW_VLAN_RX) + ops->ndo_vlan_rx_register(real_dev, ngrp); + rcu_assign_pointer(real_dev->vlgrp, ngrp); + } if (real_dev->features & NETIF_F_HW_VLAN_FILTER) ops->ndo_vlan_rx_add_vid(real_dev, vlan_id); @@ -264,7 +227,6 @@ out_uninit_applicant: vlan_gvrp_uninit_applicant(real_dev); out_free_group: if (ngrp) { - hlist_del_rcu(&ngrp->hlist); /* Free the group, after all cpu's are done. */ call_rcu(&ngrp->rcu, vlan_rcu_free); } @@ -428,7 +390,7 @@ static int vlan_device_event(struct notifier_block *unused, unsigned long event, dev->netdev_ops->ndo_vlan_rx_add_vid(dev, 0); } - grp = __vlan_find_group(dev); + grp = dev->vlgrp; if (!grp) goto out; @@ -746,8 +708,6 @@ err0: static void __exit vlan_cleanup_module(void) { - unsigned int i; - vlan_ioctl_set(NULL); vlan_netlink_fini(); @@ -755,10 +715,6 @@ static void __exit vlan_cleanup_module(void) dev_remove_pack(&vlan_packet_type); - /* This table must be empty if there are no module references left. */ - for (i = 0; i < VLAN_GRP_HASH_SIZE; i++) - BUG_ON(!hlist_empty(&vlan_group_hash[i])); - unregister_pernet_subsys(&vlan_net_ops); rcu_barrier(); /* Wait for completion of call_rcu()'s */ diff --git a/net/8021q/vlan.h b/net/8021q/vlan.h index 8d9503a..db01b31 100644 --- a/net/8021q/vlan.h +++ b/net/8021q/vlan.h @@ -72,23 +72,6 @@ static inline struct vlan_dev_info *vlan_dev_info(const struct net_device *dev) return netdev_priv(dev); } -#define VLAN_GRP_HASH_SHIFT 5 -#define VLAN_GRP_HASH_SIZE (1 << VLAN_GRP_HASH_SHIFT) -#define VLAN_GRP_HASH_MASK (VLAN_GRP_HASH_SIZE - 1) - -/* Find a VLAN device by the MAC address of its Ethernet device, and - * it's VLAN ID. The default configuration is to have VLAN's scope - * to be box-wide, so the MAC will be ignored. The mac will only be - * looked at if we are configured to have a separate set of VLANs per - * each MAC addressable interface. Note that this latter option does - * NOT follow the spec for VLANs, but may be useful for doing very - * large quantities of VLAN MUX/DEMUX onto FrameRelay or ATM PVCs. - * - * Must be invoked with rcu_read_lock (ie preempt disabled) - * or with RTNL. - */ -struct net_device *__find_vlan_dev(struct net_device *real_dev, u16 vlan_id); - /* found in vlan_dev.c */ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, struct packet_type *ptype, struct net_device *orig_dev); diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index f54251e..14e3d1f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -158,7 +158,7 @@ int vlan_skb_recv(struct sk_buff *skb, struct net_device *dev, vlan_id = vlan_tci & VLAN_VID_MASK; rcu_read_lock(); - vlan_dev = __find_vlan_dev(dev, vlan_id); + vlan_dev = vlan_find_dev(dev, vlan_id); /* If the VLAN device is defined, we use it. * If not, and the VID is 0, it is a 802.1p packet (not -- cgit v1.1 From 3701e51382a026cba10c60b03efabe534fba4ca4 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:06 +0000 Subject: vlan: Centralize handling of hardware acceleration. Currently each driver that is capable of vlan hardware acceleration must be aware of the vlan groups that are configured and then pass the stripped tag to a specialized receive function. This is different from other types of hardware offload in that it places a significant amount of knowledge in the driver itself rather keeping it in the networking core. This makes vlan offloading function more similarly to other forms of offloading (such as checksum offloading or TSO) by doing the following: * On receive, stripped vlans are passed directly to the network core, without attempting to check for vlan groups or reconstructing the header if no group * vlans are made less special by folding the logic into the main receive routines * On transmit, the device layer will add the vlan header in software if the hardware doesn't support it, instead of spreading that logic out in upper layers, such as bonding. There are a number of advantages to this: * Fixes all bugs with drivers incorrectly dropping vlan headers at once. * Avoids having to disable VLAN acceleration when in promiscuous mode (good for bridging since it always puts devices in promiscuous mode). * Keeps VLAN tag separate until given to ultimate consumer, which avoids needing to do header reconstruction as in tg3 unless absolutely necessary. * Consolidates common code in core networking. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/8021q/vlan.c | 9 +--- net/8021q/vlan_core.c | 125 +++++++++++--------------------------------------- net/core/dev.c | 47 ++++++------------- 3 files changed, 44 insertions(+), 137 deletions(-) (limited to 'net') diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c index f862dcc..05b867e 100644 --- a/net/8021q/vlan.c +++ b/net/8021q/vlan.c @@ -135,7 +135,7 @@ void unregister_vlan_dev(struct net_device *dev, struct list_head *head) vlan_gvrp_uninit_applicant(real_dev); rcu_assign_pointer(real_dev->vlgrp, NULL); - if (real_dev->features & NETIF_F_HW_VLAN_RX) + if (ops->ndo_vlan_rx_register) ops->ndo_vlan_rx_register(real_dev, NULL); /* Free the group, after all cpu's are done. */ @@ -156,11 +156,6 @@ int vlan_check_real_dev(struct net_device *real_dev, u16 vlan_id) return -EOPNOTSUPP; } - if ((real_dev->features & NETIF_F_HW_VLAN_RX) && !ops->ndo_vlan_rx_register) { - pr_info("8021q: device %s has buggy VLAN hw accel\n", name); - return -EOPNOTSUPP; - } - if ((real_dev->features & NETIF_F_HW_VLAN_FILTER) && (!ops->ndo_vlan_rx_add_vid || !ops->ndo_vlan_rx_kill_vid)) { pr_info("8021q: Device %s has buggy VLAN hw accel\n", name); @@ -213,7 +208,7 @@ int register_vlan_dev(struct net_device *dev) grp->nr_vlans++; if (ngrp) { - if (real_dev->features & NETIF_F_HW_VLAN_RX) + if (ops->ndo_vlan_rx_register) ops->ndo_vlan_rx_register(real_dev, ngrp); rcu_assign_pointer(real_dev->vlgrp, ngrp); } diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c index dee727ce..69b2f79 100644 --- a/net/8021q/vlan_core.c +++ b/net/8021q/vlan_core.c @@ -4,54 +4,29 @@ #include #include "vlan.h" -/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */ -int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, - u16 vlan_tci, int polling) +bool vlan_hwaccel_do_receive(struct sk_buff **skbp) { + struct sk_buff *skb = *skbp; + u16 vlan_id = skb->vlan_tci & VLAN_VID_MASK; struct net_device *vlan_dev; - u16 vlan_id; - - if (netpoll_rx(skb)) - return NET_RX_DROP; - - if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master))) - skb->deliver_no_wcard = 1; + struct vlan_rx_stats *rx_stats; - skb->skb_iif = skb->dev->ifindex; - __vlan_hwaccel_put_tag(skb, vlan_tci); - vlan_id = vlan_tci & VLAN_VID_MASK; - vlan_dev = vlan_group_get_device(grp, vlan_id); - - if (vlan_dev) - skb->dev = vlan_dev; - else if (vlan_id) { - if (!(skb->dev->flags & IFF_PROMISC)) - goto drop; - skb->pkt_type = PACKET_OTHERHOST; + vlan_dev = vlan_find_dev(skb->dev, vlan_id); + if (!vlan_dev) { + if (vlan_id) + skb->pkt_type = PACKET_OTHERHOST; + return false; } - return polling ? netif_receive_skb(skb) : netif_rx(skb); + skb = *skbp = skb_share_check(skb, GFP_ATOMIC); + if (unlikely(!skb)) + return false; -drop: - atomic_long_inc(&skb->dev->rx_dropped); - dev_kfree_skb_any(skb); - return NET_RX_DROP; -} -EXPORT_SYMBOL(__vlan_hwaccel_rx); - -void vlan_hwaccel_do_receive(struct sk_buff *skb) -{ - struct net_device *dev = skb->dev; - struct vlan_rx_stats *rx_stats; - - skb->dev = vlan_dev_real_dev(dev); - netif_nit_deliver(skb); - - skb->dev = dev; - skb->priority = vlan_get_ingress_priority(dev, skb->vlan_tci); + skb->dev = vlan_dev; + skb->priority = vlan_get_ingress_priority(vlan_dev, skb->vlan_tci); skb->vlan_tci = 0; - rx_stats = this_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats); + rx_stats = this_cpu_ptr(vlan_dev_info(vlan_dev)->vlan_rx_stats); u64_stats_update_begin(&rx_stats->syncp); rx_stats->rx_packets++; @@ -68,11 +43,13 @@ void vlan_hwaccel_do_receive(struct sk_buff *skb) * This allows the VLAN to have a different MAC than the * underlying device, and still route correctly. */ if (!compare_ether_addr(eth_hdr(skb)->h_dest, - dev->dev_addr)) + vlan_dev->dev_addr)) skb->pkt_type = PACKET_HOST; break; } u64_stats_update_end(&rx_stats->syncp); + + return true; } struct net_device *vlan_dev_real_dev(const struct net_device *dev) @@ -87,75 +64,27 @@ u16 vlan_dev_vlan_id(const struct net_device *dev) } EXPORT_SYMBOL(vlan_dev_vlan_id); -static gro_result_t -vlan_gro_common(struct napi_struct *napi, struct vlan_group *grp, - unsigned int vlan_tci, struct sk_buff *skb) +/* VLAN rx hw acceleration helper. This acts like netif_{rx,receive_skb}(). */ +int __vlan_hwaccel_rx(struct sk_buff *skb, struct vlan_group *grp, + u16 vlan_tci, int polling) { - struct sk_buff *p; - struct net_device *vlan_dev; - u16 vlan_id; - - if (skb_bond_should_drop(skb, ACCESS_ONCE(skb->dev->master))) - skb->deliver_no_wcard = 1; - - skb->skb_iif = skb->dev->ifindex; __vlan_hwaccel_put_tag(skb, vlan_tci); - vlan_id = vlan_tci & VLAN_VID_MASK; - vlan_dev = vlan_group_get_device(grp, vlan_id); - - if (vlan_dev) - skb->dev = vlan_dev; - else if (vlan_id) { - if (!(skb->dev->flags & IFF_PROMISC)) - goto drop; - skb->pkt_type = PACKET_OTHERHOST; - } - - for (p = napi->gro_list; p; p = p->next) { - unsigned long diffs; - - diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; - diffs |= compare_ether_header(skb_mac_header(p), - skb_gro_mac_header(skb)); - NAPI_GRO_CB(p)->same_flow = !diffs; - NAPI_GRO_CB(p)->flush = 0; - } - - return dev_gro_receive(napi, skb); - -drop: - atomic_long_inc(&skb->dev->rx_dropped); - return GRO_DROP; + return polling ? netif_receive_skb(skb) : netif_rx(skb); } +EXPORT_SYMBOL(__vlan_hwaccel_rx); gro_result_t vlan_gro_receive(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci, struct sk_buff *skb) { - if (netpoll_rx_on(skb)) - return vlan_hwaccel_receive_skb(skb, grp, vlan_tci) - ? GRO_DROP : GRO_NORMAL; - - skb_gro_reset_offset(skb); - - return napi_skb_finish(vlan_gro_common(napi, grp, vlan_tci, skb), skb); + __vlan_hwaccel_put_tag(skb, vlan_tci); + return napi_gro_receive(napi, skb); } EXPORT_SYMBOL(vlan_gro_receive); gro_result_t vlan_gro_frags(struct napi_struct *napi, struct vlan_group *grp, unsigned int vlan_tci) { - struct sk_buff *skb = napi_frags_skb(napi); - - if (!skb) - return GRO_DROP; - - if (netpoll_rx_on(skb)) { - skb->protocol = eth_type_trans(skb, skb->dev); - return vlan_hwaccel_receive_skb(skb, grp, vlan_tci) - ? GRO_DROP : GRO_NORMAL; - } - - return napi_frags_finish(napi, skb, - vlan_gro_common(napi, grp, vlan_tci, skb)); + __vlan_hwaccel_put_tag(napi->skb, vlan_tci); + return napi_gro_frags(napi); } EXPORT_SYMBOL(vlan_gro_frags); diff --git a/net/core/dev.c b/net/core/dev.c index 1bfd96b..97fd6bc 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2789,33 +2789,6 @@ out: } #endif -/* - * netif_nit_deliver - deliver received packets to network taps - * @skb: buffer - * - * This function is used to deliver incoming packets to network - * taps. It should be used when the normal netif_receive_skb path - * is bypassed, for example because of VLAN acceleration. - */ -void netif_nit_deliver(struct sk_buff *skb) -{ - struct packet_type *ptype; - - if (list_empty(&ptype_all)) - return; - - skb_reset_network_header(skb); - skb_reset_transport_header(skb); - skb->mac_len = skb->network_header - skb->mac_header; - - rcu_read_lock(); - list_for_each_entry_rcu(ptype, &ptype_all, list) { - if (!ptype->dev || ptype->dev == skb->dev) - deliver_skb(skb, ptype, skb->dev); - } - rcu_read_unlock(); -} - /** * netdev_rx_handler_register - register receive handler * @dev: device to register a handler for @@ -2925,9 +2898,6 @@ static int __netif_receive_skb(struct sk_buff *skb) if (!netdev_tstamp_prequeue) net_timestamp_check(skb); - if (vlan_tx_tag_present(skb)) - vlan_hwaccel_do_receive(skb); - /* if we've gotten here through NAPI, check netpoll */ if (netpoll_receive_skb(skb)) return NET_RX_DROP; @@ -2940,8 +2910,7 @@ static int __netif_receive_skb(struct sk_buff *skb) * be delivered to pkt handlers that are exact matches. Also * the deliver_no_wcard flag will be set. If packet handlers * are sensitive to duplicate packets these skbs will need to - * be dropped at the handler. The vlan accel path may have - * already set the deliver_no_wcard flag. + * be dropped at the handler. */ null_or_orig = NULL; orig_dev = skb->dev; @@ -3000,6 +2969,18 @@ ncls: goto out; } + if (vlan_tx_tag_present(skb)) { + if (pt_prev) { + ret = deliver_skb(skb, pt_prev, orig_dev); + pt_prev = NULL; + } + if (vlan_hwaccel_do_receive(&skb)) { + ret = __netif_receive_skb(skb); + goto out; + } else if (unlikely(!skb)) + goto out; + } + /* * Make sure frames received on VLAN interfaces stacked on * bonding interfaces still make their way to any base bonding @@ -3264,6 +3245,7 @@ __napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) unsigned long diffs; diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev; + diffs |= p->vlan_tci ^ skb->vlan_tci; diffs |= compare_ether_header(skb_mac_header(p), skb_gro_mac_header(skb)); NAPI_GRO_CB(p)->same_flow = !diffs; @@ -3323,6 +3305,7 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); + skb->vlan_tci = 0; napi->skb = skb; } -- cgit v1.1 From d5dbda23804156ae6f35025ade5307a49d1db6d7 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:07 +0000 Subject: ethtool: Add support for vlan accleration. Now that vlan acceleration is handled consistently regardless of usage, it is possible to enable and disable it at will. This adds support for Ethtool operations that change the offloading status for debugging purposes, similar to other forms of hardware acceleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/core/ethtool.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/ethtool.c b/net/core/ethtool.c index 685c700..956a9f4 100644 --- a/net/core/ethtool.c +++ b/net/core/ethtool.c @@ -132,7 +132,8 @@ EXPORT_SYMBOL(ethtool_op_set_ufo); * NETIF_F_xxx values in include/linux/netdevice.h */ static const u32 flags_dup_features = - (ETH_FLAG_LRO | ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH); + (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE | + ETH_FLAG_RXHASH); u32 ethtool_op_get_flags(struct net_device *dev) { -- cgit v1.1 From 361ff8a6cf90d62c0071b7e532e37369bfd3ae77 Mon Sep 17 00:00:00 2001 From: Jesse Gross Date: Wed, 20 Oct 2010 13:56:08 +0000 Subject: bridge: Add support for TX vlan offload. If some of the underlying devices support it, enable vlan offload on transmit for bridge devices. This allows senders to take advantage of the hardware support, similar to other forms of acceleration. Signed-off-by: Jesse Gross Signed-off-by: David S. Miller --- net/bridge/br_device.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index cf09fe59..17cb0b6 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -212,6 +212,11 @@ static int br_set_tx_csum(struct net_device *dev, u32 data) return 0; } +static int br_set_flags(struct net_device *netdev, u32 data) +{ + return ethtool_op_set_flags(netdev, data, ETH_FLAG_TXVLAN); +} + #ifdef CONFIG_NET_POLL_CONTROLLER static void br_poll_controller(struct net_device *br_dev) { @@ -304,6 +309,7 @@ static const struct ethtool_ops br_ethtool_ops = { .get_ufo = ethtool_op_get_ufo, .set_ufo = ethtool_op_set_ufo, .get_flags = ethtool_op_get_flags, + .set_flags = br_set_flags, }; static const struct net_device_ops br_netdev_ops = { @@ -343,5 +349,5 @@ void br_dev_setup(struct net_device *dev) dev->features = NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HIGHDMA | NETIF_F_GSO_MASK | NETIF_F_NO_CSUM | NETIF_F_LLTX | - NETIF_F_NETNS_LOCAL | NETIF_F_GSO; + NETIF_F_NETNS_LOCAL | NETIF_F_GSO | NETIF_F_HW_VLAN_TX; } -- cgit v1.1 From 5bc9068e9d962ca6b8bec3f0eb6f60ab4dee1d04 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:14:31 +0300 Subject: ipvs: fix CHECKSUM_PARTIAL for TCP, UDP Fix CHECKSUM_PARTIAL handling. Tested for IPv4 TCP, UDP not tested because it needs network card with HW CSUM support. May be fixes problem where IPVS can not be used in virtual boxes. Problem appears with DNAT to local address when the local stack sends reply in CHECKSUM_PARTIAL mode. Fix tcp_dnat_handler and udp_dnat_handler to provide vaddr and daddr in right order (old and new IP) when calling tcp_partial_csum_update/udp_partial_csum_update (CHECKSUM_PARTIAL). Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_proto_tcp.c | 10 +++++----- net/netfilter/ipvs/ip_vs_proto_udp.c | 10 +++++----- 2 files changed, 10 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 282d24d..318d011 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -101,15 +101,15 @@ tcp_partial_csum_update(int af, struct tcphdr *tcph, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) tcph->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); + csum_unfold(tcph->check)))); else #endif tcph->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(tcph->check)))); + csum_unfold(tcph->check)))); } @@ -223,7 +223,7 @@ tcp_dnat_handler(struct sk_buff *skb, * Adjust TCP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, + tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); } else if (!cp->app) { diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 8553231..f929089 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -102,15 +102,15 @@ udp_partial_csum_update(int af, struct udphdr *uhdr, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) uhdr->check = - csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, + ~csum_fold(ip_vs_check_diff16(oldip->ip6, newip->ip6, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); + csum_unfold(uhdr->check)))); else #endif uhdr->check = - csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, + ~csum_fold(ip_vs_check_diff4(oldip->ip, newip->ip, ip_vs_check_diff2(oldlen, newlen, - ~csum_unfold(uhdr->check)))); + csum_unfold(uhdr->check)))); } @@ -229,7 +229,7 @@ udp_dnat_handler(struct sk_buff *skb, * Adjust UDP checksums */ if (skb->ip_summed == CHECKSUM_PARTIAL) { - udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, + udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); } else if (!cp->app && (udph->check != 0)) { -- cgit v1.1 From 8b27b10f5863a5b63e46304a71aa01463d1efac4 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:17:20 +0300 Subject: ipvs: optimize checksums for apps Avoid full checksum calculation for apps that can provide info whether csum was broken after payload mangling. For now only ip_vs_ftp mangles payload and it updates the csum, so the full recalculation is avoided for all packets. Add CHECKSUM_UNNECESSARY for snat_handler (TCP and UDP). It is needed to support SNAT from local address for the case when csum is fully recalculated. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_ftp.c | 7 ++++++- net/netfilter/ipvs/ip_vs_proto_tcp.c | 31 +++++++++++++++++++++++++------ net/netfilter/ipvs/ip_vs_proto_udp.c | 31 +++++++++++++++++++++++++------ 3 files changed, 56 insertions(+), 13 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_ftp.c b/net/netfilter/ipvs/ip_vs_ftp.c index 090889a..7545500 100644 --- a/net/netfilter/ipvs/ip_vs_ftp.c +++ b/net/netfilter/ipvs/ip_vs_ftp.c @@ -242,9 +242,14 @@ static int ip_vs_ftp_out(struct ip_vs_app *app, struct ip_vs_conn *cp, ret = nf_nat_mangle_tcp_packet(skb, ct, ctinfo, start-data, end-start, buf, buf_len); - if (ret) + if (ret) { ip_vs_nfct_expect_related(skb, ct, n_cp, IPPROTO_TCP, 0, 0); + if (skb->ip_summed == CHECKSUM_COMPLETE) + skb->ip_summed = CHECKSUM_UNNECESSARY; + /* csum is updated */ + ret = 1; + } } /* diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 318d011..64dc295 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -120,6 +120,7 @@ tcp_snat_handler(struct sk_buff *skb, struct tcphdr *tcph; unsigned int tcphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -134,13 +135,20 @@ tcp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; /* Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -151,12 +159,13 @@ tcp_snat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; @@ -174,6 +183,7 @@ tcp_snat_handler(struct sk_buff *skb, skb->len - tcphoff, cp->protocol, skb->csum); + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, tcph->check, @@ -190,6 +200,7 @@ tcp_dnat_handler(struct sk_buff *skb, struct tcphdr *tcph; unsigned int tcphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -204,6 +215,8 @@ tcp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -212,8 +225,13 @@ tcp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn and iph ack_seq stuff */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - tcphoff; + else + payload_csum = 1; } tcph = (void *)skb_network_header(skb) + tcphoff; @@ -226,12 +244,13 @@ tcp_dnat_handler(struct sk_buff *skb, tcp_partial_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - tcphoff)); - } else if (!cp->app) { + } else if (!payload_csum) { /* Only port and addr are changed, do fast csum update */ tcp_fast_csum_update(cp->af, tcph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ tcph->check = 0; diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index f929089..9c558c4 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -121,6 +121,7 @@ udp_snat_handler(struct sk_buff *skb, struct udphdr *udph; unsigned int udphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -135,6 +136,8 @@ udp_snat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -142,8 +145,13 @@ udp_snat_handler(struct sk_buff *skb, /* * Call application helper if needed */ - if (!ip_vs_app_pkt_out(cp, skb)) + if (!(ret = ip_vs_app_pkt_out(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -156,12 +164,13 @@ udp_snat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->daddr, &cp->vaddr, cp->dport, cp->vport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; @@ -181,6 +190,7 @@ udp_snat_handler(struct sk_buff *skb, skb->csum); if (udph->check == 0) udph->check = CSUM_MANGLED_0; + skb->ip_summed = CHECKSUM_UNNECESSARY; IP_VS_DBG(11, "O-pkt: %s O-csum=%d (+%zd)\n", pp->name, udph->check, (char*)&(udph->check) - (char*)udph); @@ -196,6 +206,7 @@ udp_dnat_handler(struct sk_buff *skb, struct udphdr *udph; unsigned int udphoff; int oldlen; + int payload_csum = 0; #ifdef CONFIG_IP_VS_IPV6 if (cp->af == AF_INET6) @@ -210,6 +221,8 @@ udp_dnat_handler(struct sk_buff *skb, return 0; if (unlikely(cp->app != NULL)) { + int ret; + /* Some checks before mangling */ if (pp->csum_check && !pp->csum_check(cp->af, skb, pp)) return 0; @@ -218,8 +231,13 @@ udp_dnat_handler(struct sk_buff *skb, * Attempt ip_vs_app call. * It will fix ip_vs_conn */ - if (!ip_vs_app_pkt_in(cp, skb)) + if (!(ret = ip_vs_app_pkt_in(cp, skb))) return 0; + /* ret=2: csum update is needed after payload mangling */ + if (ret == 1) + oldlen = skb->len - udphoff; + else + payload_csum = 1; } udph = (void *)skb_network_header(skb) + udphoff; @@ -232,12 +250,13 @@ udp_dnat_handler(struct sk_buff *skb, udp_partial_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, htons(oldlen), htons(skb->len - udphoff)); - } else if (!cp->app && (udph->check != 0)) { + } else if (!payload_csum && (udph->check != 0)) { /* Only port and addr are changed, do fast csum update */ udp_fast_csum_update(cp->af, udph, &cp->vaddr, &cp->daddr, cp->vport, cp->dport); if (skb->ip_summed == CHECKSUM_COMPLETE) - skb->ip_summed = CHECKSUM_NONE; + skb->ip_summed = (cp->app && pp->csum_check) ? + CHECKSUM_UNNECESSARY : CHECKSUM_NONE; } else { /* full checksum calculation */ udph->check = 0; -- cgit v1.1 From cf356d69db0afef692cd640917bc70f708c27f14 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:21:07 +0300 Subject: ipvs: switch to notrack mode Change skb->ipvs_property semantic. This is preparation to support ip_vs_out processing in LOCAL_OUT. ipvs_property=1 will be used to avoid expensive lookups for traffic sent by transmitters. Now when conntrack support is not used we call ip_vs_notrack method to avoid problems in OUTPUT and POST_ROUTING hooks instead of exiting POST_ROUTING as before. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 39 ++++----------------------------------- net/netfilter/ipvs/ip_vs_xmit.c | 7 +++++-- 2 files changed, 9 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index e5fef7a..2224530 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -507,23 +507,6 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, return NF_DROP; } -/* - * It is hooked before NF_IP_PRI_NAT_SRC at the NF_INET_POST_ROUTING - * chain and is used to avoid double NAT and confirmation when we do - * not want to keep the conntrack structure - */ -static unsigned int ip_vs_post_routing(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - if (!skb->ipvs_property) - return NF_ACCEPT; - /* The packet was sent from IPVS, exit this chain */ - return NF_STOP; -} - __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) { return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); @@ -682,8 +665,9 @@ static int handle_response_icmp(int af, struct sk_buff *skb, /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); verdict = NF_ACCEPT; @@ -929,8 +913,9 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); + skb->ipvs_property = 1; if (!(cp->flags & IP_VS_CONN_F_NFCT)) - skb->ipvs_property = 1; + ip_vs_notrack(skb); else ip_vs_update_conntrack(skb, cp, 0); ip_vs_conn_put(cp); @@ -1496,14 +1481,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP_PRI_NAT_SRC-1, - }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1532,14 +1509,6 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, - /* Before the netfilter connection tracking, exit from POST_ROUTING */ - { - .hook = ip_vs_post_routing, - .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_POST_ROUTING, - .priority = NF_IP6_PRI_NAT_SRC-1, - }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index b0bd8af..94b53b4 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -217,6 +217,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) ({ \ int __ret = NF_ACCEPT; \ \ + (skb)->ipvs_property = 1; \ if (unlikely((cp)->flags & IP_VS_CONN_F_NFCT)) \ __ret = ip_vs_confirm_conntrack(skb, cp); \ if (__ret == NF_ACCEPT) { \ @@ -228,8 +229,9 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) #define IP_VS_XMIT_NAT(pf, skb, cp) \ do { \ + (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - (skb)->ipvs_property = 1; \ + ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ skb_forward_csum(skb); \ @@ -239,8 +241,9 @@ do { \ #define IP_VS_XMIT(pf, skb, cp) \ do { \ + (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ - (skb)->ipvs_property = 1; \ + ip_vs_notrack(skb); \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ -- cgit v1.1 From 190ecd27cd7294105e3b26ca71663c7d940acbbb Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:24:37 +0300 Subject: ipvs: do not schedule conns from real servers This patch is needed to avoid scheduling of packets from local real server when we add ip_vs_in in LOCAL_OUT hook to support local client. Currently, when ip_vs_in can not find existing connection it tries to create new one by calling ip_vs_schedule. The default indication from ip_vs_schedule was if connection was scheduled to real server. If real server is not available we try to use the bypass forwarding method or to send ICMP error. But in some cases we do not want to use the bypass feature. So, add flag 'ignored' to indicate if the scheduler ignores this packet. Make sure we do not create new connections from replies. We can hit this problem for persistent services and local real server when ip_vs_in is added to LOCAL_OUT hook to handle local clients. Also, make sure ip_vs_schedule ignores SYN packets for Active FTP DATA from local real server. The FTP DATA connection should be created on SYN+ACK from client to assign correct connection daddr. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 34 ++++++++++++++++++++++++++++++++-- net/netfilter/ipvs/ip_vs_proto_sctp.c | 6 ++++-- net/netfilter/ipvs/ip_vs_proto_tcp.c | 7 +++++-- net/netfilter/ipvs/ip_vs_proto_udp.c | 6 ++++-- 4 files changed, 45 insertions(+), 8 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 2224530..0090d6d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -342,7 +342,8 @@ ip_vs_sched_persist(struct ip_vs_service *svc, * Protocols supported: TCP, UDP */ struct ip_vs_conn * -ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) +ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, + struct ip_vs_protocol *pp, int *ignored) { struct ip_vs_conn *cp = NULL; struct ip_vs_iphdr iph; @@ -350,16 +351,43 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) __be16 _ports[2], *pptr; unsigned int flags; + *ignored = 1; ip_vs_fill_iphdr(svc->af, skb_network_header(skb), &iph); pptr = skb_header_pointer(skb, iph.len, sizeof(_ports), _ports); if (pptr == NULL) return NULL; /* + * FTPDATA needs this check when using local real server. + * Never schedule Active FTPDATA connections from real server. + * For LVS-NAT they must be already created. For other methods + * with persistence the connection is created on SYN+ACK. + */ + if (pptr[0] == FTPDATA) { + IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA"); + return NULL; + } + + /* + * Do not schedule replies from local real server. It is risky + * for fwmark services but mostly for persistent services. + */ + if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && + (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { + IP_VS_DBG_PKT(12, pp, skb, 0, + "Not scheduling reply for existing connection"); + __ip_vs_conn_put(cp); + return NULL; + } + + /* * Persistent service */ - if (svc->flags & IP_VS_SVC_F_PERSISTENT) + if (svc->flags & IP_VS_SVC_F_PERSISTENT) { + *ignored = 0; return ip_vs_sched_persist(svc, skb, pptr); + } /* * Non-persistent service @@ -372,6 +400,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb) return NULL; } + *ignored = 0; + dest = svc->scheduler->schedule(svc, skb); if (dest == NULL) { IP_VS_DBG(1, "Schedule: no dest found.\n"); diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 4c0855c..9ab5232 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -31,6 +31,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, if ((sch->type == SCTP_CID_INIT) && (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, sh->dest))) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -44,8 +46,8 @@ sctp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 64dc295..85d80a6 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -43,9 +43,12 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } + /* No !th->ack check to allow scheduling on SYN+ACK for Active FTP */ if (th->syn && (svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, th->dest))) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -60,8 +63,8 @@ tcp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 9c558c4..5d21f08 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -46,6 +46,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, svc = ip_vs_service_get(af, skb->mark, iph.protocol, &iph.daddr, uh->dest); if (svc) { + int ignored; + if (ip_vs_todrop()) { /* * It seems that we are very loaded. @@ -60,8 +62,8 @@ udp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * Let the virtual server select a real server for the * incoming connection, and create a connection entry. */ - *cpp = ip_vs_schedule(svc, skb); - if (!*cpp) { + *cpp = ip_vs_schedule(svc, skb, pp, &ignored); + if (!*cpp && !ignored) { *verdict = ip_vs_leave(svc, skb, pp); return 0; } -- cgit v1.1 From 489fdedaed5ddb437dd2840eb93df37a6dd8c7de Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:27:31 +0300 Subject: ipvs: stop ICMP from FORWARD to local Delivering locally ICMP from FORWARD hook is not supported. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 0090d6d..27ecb25 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -48,6 +48,7 @@ #ifdef CONFIG_IP_VS_IPV6 #include #include +#include #endif #include @@ -1191,7 +1192,14 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) if (IPPROTO_TCP == cih->protocol || IPPROTO_UDP == cih->protocol) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit(skb, cp, pp, offset); - /* do not touch skb anymore */ + /* LOCALNODE from FORWARD hook is not supported */ + if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && + skb_rtable(skb)->rt_flags & RTCF_LOCAL) { + IP_VS_DBG(1, "%s(): " + "local delivery to %pI4 but in FORWARD\n", + __func__, &skb_rtable(skb)->rt_dst); + verdict = NF_DROP; + } out: __ip_vs_conn_put(cp); @@ -1212,6 +1220,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) struct ip_vs_protocol *pp; unsigned int offset, verdict; union nf_inet_addr snet; + struct rt6_info *rt; *related = 1; @@ -1290,7 +1299,15 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) IPPROTO_SCTP == cih->nexthdr) offset += 2 * sizeof(__u16); verdict = ip_vs_icmp_xmit_v6(skb, cp, pp, offset); - /* do not touch skb anymore */ + /* LOCALNODE from FORWARD hook is not supported */ + if (verdict == NF_ACCEPT && hooknum == NF_INET_FORWARD && + (rt = (struct rt6_info *) skb_dst(skb)) && + rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "local delivery to %pI6 but in FORWARD\n", + __func__, &rt->rt6i_dst); + verdict = NF_DROP; + } __ip_vs_conn_put(cp); -- cgit v1.1 From 4256f1aaa662697c1faa0984b7a698c2c8c57735 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:29:40 +0300 Subject: ipvs: fix CHECKSUM_PARTIAL for TUN method The recent change in IP_VS_XMIT_TUNNEL to set CHECKSUM_NONE is not correct. After adding IPIP header skb->csum becomes invalid but the CHECKSUM_PARTIAL case must be supported. So, use skb_forward_csum() which is most suitable for us to allow local clients to send IPIP to remote real server. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_xmit.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 94b53b4..63cc0fe 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -222,7 +222,7 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) __ret = ip_vs_confirm_conntrack(skb, cp); \ if (__ret == NF_ACCEPT) { \ nf_reset(skb); \ - (skb)->ip_summed = CHECKSUM_NONE; \ + skb_forward_csum(skb); \ } \ __ret; \ }) -- cgit v1.1 From 1ca5bb5450aa2401fa272efeb741ebb260d0fbb0 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:32:29 +0300 Subject: ipvs: create ip_vs_defrag_user Create new function ip_vs_defrag_user to return correct IP_DEFRAG_xxx user depending on the hooknum. It will be needed when we add handlers in LOCAL_OUT. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 55 +++++++++++++++++++++++++---------------- 1 file changed, 34 insertions(+), 21 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 27ecb25..f7f5283 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -543,6 +543,15 @@ __sum16 ip_vs_checksum_complete(struct sk_buff *skb, int offset) return csum_fold(skb_checksum(skb, offset, skb->len - offset, 0)); } +static inline enum ip_defrag_users ip_vs_defrag_user(unsigned int hooknum) +{ + if (NF_INET_LOCAL_IN == hooknum) + return IP_DEFRAG_VS_IN; + if (NF_INET_FORWARD == hooknum) + return IP_DEFRAG_VS_FWD; + return IP_DEFRAG_VS_OUT; +} + static inline int ip_vs_gather_frags(struct sk_buff *skb, u_int32_t user) { int err = ip_defrag(skb, user); @@ -714,7 +723,8 @@ out: * Find any that might be relevant, check against existing connections. * Currently handles error types - unreachable, quench, ttl exceeded. */ -static int ip_vs_out_icmp(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct iphdr *iph; struct icmphdr _icmph, *ic; @@ -729,7 +739,7 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -788,7 +798,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related) } #ifdef CONFIG_IP_VS_IPV6 -static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) +static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, + unsigned int hooknum) { struct ipv6hdr *iph; struct icmp6hdr _icmph, *ic; @@ -804,7 +815,7 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related) /* reassemble IP fragments */ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -986,7 +997,9 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); + int related; + int verdict = ip_vs_out_icmp_v6(skb, &related, + hooknum); if (related) { if (sysctl_ip_vs_snat_reroute && @@ -1000,7 +1013,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_out_icmp(skb, &related); + int related; + int verdict = ip_vs_out_icmp(skb, &related, hooknum); if (related) { if (sysctl_ip_vs_snat_reroute && @@ -1019,19 +1033,19 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, /* reassemble IP fragments */ #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { - if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_out_icmp_v6(skb, &related); - - if (related) - return verdict; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { + if (ip_vs_gather_frags_v6(skb, + ip_vs_defrag_user(hooknum))) + return NF_STOLEN; } + + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } else #endif if (unlikely(ip_hdr(skb)->frag_off & htons(IP_MF|IP_OFFSET) && !pp->dont_defrag)) { - if (ip_vs_gather_frags(skb, IP_DEFRAG_VS_OUT)) + if (ip_vs_gather_frags(skb, + ip_vs_defrag_user(hooknum))) return NF_STOLEN; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); @@ -1114,8 +1128,7 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ip_hdr(skb)->frag_off & htons(IP_MF | IP_OFFSET)) { - if (ip_vs_gather_frags(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : IP_DEFRAG_VS_FWD)) + if (ip_vs_gather_frags(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1226,9 +1239,7 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) /* reassemble IP fragments */ if (ipv6_hdr(skb)->nexthdr == IPPROTO_FRAGMENT) { - if (ip_vs_gather_frags_v6(skb, hooknum == NF_INET_LOCAL_IN ? - IP_DEFRAG_VS_IN : - IP_DEFRAG_VS_FWD)) + if (ip_vs_gather_frags_v6(skb, ip_vs_defrag_user(hooknum))) return NF_STOLEN; } @@ -1349,7 +1360,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { if (unlikely(iph.protocol == IPPROTO_ICMPV6)) { - int related, verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); + int related; + int verdict = ip_vs_in_icmp_v6(skb, &related, hooknum); if (related) return verdict; @@ -1358,7 +1370,8 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, } else #endif if (unlikely(iph.protocol == IPPROTO_ICMP)) { - int related, verdict = ip_vs_in_icmp(skb, &related, hooknum); + int related; + int verdict = ip_vs_in_icmp(skb, &related, hooknum); if (related) return verdict; -- cgit v1.1 From f5a41847acc535e2e2018e397b1876ba7577d9d9 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:35:46 +0300 Subject: ipvs: move ip_route_me_harder for ICMP Currently, ip_route_me_harder after ip_vs_out_icmp is called even if packet is not related to IPVS connection. Move it into handle_response_icmp. Also, force rerouting if sending to local client because IPv4 stack uses addresses from the route. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 41 +++++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 20 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index f7f5283..c4f091d 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -702,6 +702,17 @@ static int handle_response_icmp(int af, struct sk_buff *skb, #endif ip_vs_nat_icmp(skb, pp, cp, 1); +#ifdef CONFIG_IP_VS_IPV6 + if (af == AF_INET6) { + if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + goto out; + } else +#endif + if ((sysctl_ip_vs_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto out; + /* do the statistics and put it back */ ip_vs_out_stats(cp, skb); @@ -940,16 +951,16 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, * if it came from this machine itself. So re-compute * the routing information. */ - if (sysctl_ip_vs_snat_reroute) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) { - if (ip6_route_me_harder(skb) != 0) - goto drop; - } else + if (af == AF_INET6) { + if (sysctl_ip_vs_snat_reroute && ip6_route_me_harder(skb) != 0) + goto drop; + } else #endif - if (ip_route_me_harder(skb, RTN_LOCAL) != 0) - goto drop; - } + if ((sysctl_ip_vs_snat_reroute || + skb_rtable(skb)->rt_flags & RTCF_LOCAL) && + ip_route_me_harder(skb, RTN_LOCAL) != 0) + goto drop; IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); @@ -1001,13 +1012,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int verdict = ip_vs_out_icmp_v6(skb, &related, hooknum); - if (related) { - if (sysctl_ip_vs_snat_reroute && - NF_ACCEPT == verdict && - ip6_route_me_harder(skb)) - verdict = NF_DROP; + if (related) return verdict; - } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } } else @@ -1016,13 +1022,8 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int related; int verdict = ip_vs_out_icmp(skb, &related, hooknum); - if (related) { - if (sysctl_ip_vs_snat_reroute && - NF_ACCEPT == verdict && - ip_route_me_harder(skb, RTN_LOCAL)) - verdict = NF_DROP; + if (related) return verdict; - } ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); } -- cgit v1.1 From fc604767613b6d2036cdc35b660bc39451040a47 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:38:15 +0300 Subject: ipvs: changes for local real server This patch deals with local real servers: - Add support for DNAT to local address (different real server port). It needs ip_vs_out hook in LOCAL_OUT for both families because skb->protocol is not set for locally generated packets and can not be used to set 'af'. - Skip packets in ip_vs_in marked with skb->ipvs_property because ip_vs_out processing can be executed in LOCAL_OUT but we still have the conn_out_get check in ip_vs_in. - Ignore packets with inet->nodefrag from local stack - Require skb_dst(skb) != NULL because we use it to get struct net - Add support for changing the route to local IPv4 stack after DNAT depending on the source address type. Local client sets output route and the remote client sets input route. It looks like IPv6 does not need such rerouting because the replies use addresses from initial incoming header, not from skb route. - All transmitters now have strict checks for the destination address type: redirect from non-local address to local real server requires NAT method, local address can not be used as source address when talking to remote real server. - Now LOCALNODE is not set explicitly as forwarding method in real server to allow the connections to provide correct forwarding method to the backup server. Not sure if this breaks tools that expect to see 'Local' real server type. If needed, this can be supported with new flag IP_VS_DEST_F_LOCAL. Now it should be possible connections in backup that lost their fwmark information during sync to be forwarded properly to their daddr, even if it is local address in the backup server. By this way backup could be used as real server for DR or TUN, for NAT there are some restrictions because tuple collisions in conntracks can create problems for the traffic. - Call ip_vs_dst_reset when destination is updated in case some real server IP type is changed between local and remote. [ horms@verge.net.au: removed trailing whitespace ] Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 123 ++++++++++-- net/netfilter/ipvs/ip_vs_ctl.c | 18 +- net/netfilter/ipvs/ip_vs_xmit.c | 433 ++++++++++++++++++++++++++++++++-------- 3 files changed, 457 insertions(+), 117 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index c4f091d..a6c8aff1 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -984,26 +984,34 @@ drop: } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. * Check if outgoing packet belongs to the established ip_vs_conn. */ static unsigned int -ip_vs_out(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int af; EnterFunction(11); - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - + /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } + + if (unlikely(!skb_dst(skb))) + return NF_ACCEPT; + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1106,6 +1114,69 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, return handle_response(af, skb, pp, cp, iph.len); } +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_out(hooknum, skb, AF_INET6); +} + +/* + * It is hooked at the NF_INET_LOCAL_OUT chain, used only for VS/NAT. + * Check if packet is reply for established ip_vs_conn. + */ +static unsigned int +ip_vs_local_reply6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_out(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif /* * Handle ICMP messages in the outside-to-inside direction (incoming). @@ -1342,6 +1413,10 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, struct ip_vs_conn *cp; int ret, restart, af, pkts; + /* Already marked as IPVS request or reply? */ + if (skb->ipvs_property) + return NF_ACCEPT; + af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); @@ -1525,13 +1600,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = 100, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply4, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -99, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1542,6 +1617,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, #ifdef CONFIG_IP_VS_IPV6 /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be @@ -1553,13 +1636,13 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_IN, .priority = 100, }, - /* After packet filtering, change source only for VS/NAT */ + /* Before ip_vs_in, change source only for VS/NAT */ { - .hook = ip_vs_out, + .hook = ip_vs_local_reply6, .owner = THIS_MODULE, - .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 100, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -99, }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ @@ -1570,6 +1653,14 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_FORWARD, .priority = 99, }, + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_FORWARD, + .priority = 100, + }, #endif }; diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 0b884d3..5f5daa3 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -777,20 +777,6 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, conn_flags = udest->conn_flags & IP_VS_CONN_F_DEST_MASK; conn_flags |= IP_VS_CONN_F_INACTIVE; - /* check if local node and update the flags */ -#ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) { - if (__ip_vs_addr_is_local_v6(&udest->addr.in6)) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - } else -#endif - if (inet_addr_type(&init_net, udest->addr.ip) == RTN_LOCAL) { - conn_flags = (conn_flags & ~IP_VS_CONN_F_FWD_MASK) - | IP_VS_CONN_F_LOCALNODE; - } - /* set the IP_VS_CONN_F_NOOUTPUT flag if not masquerading/NAT */ if ((conn_flags & IP_VS_CONN_F_FWD_MASK) != IP_VS_CONN_F_MASQ) { conn_flags |= IP_VS_CONN_F_NOOUTPUT; @@ -824,6 +810,10 @@ __ip_vs_update_dest(struct ip_vs_service *svc, struct ip_vs_dest *dest, dest->u_threshold = udest->u_threshold; dest->l_threshold = udest->l_threshold; + spin_lock(&dest->dst_lock); + ip_vs_dst_reset(dest); + spin_unlock(&dest->dst_lock); + if (add) ip_vs_new_estimator(&dest->stats); diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 63cc0fe..8608882 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -67,12 +67,19 @@ __ip_vs_dst_check(struct ip_vs_dest *dest, u32 rtos) return dst; } +/* + * Get route to destination or remote server + * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, + * &4=Allow redirect from remote daddr to local + */ static struct rtable * -__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) +__ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_dest *dest, + __be32 daddr, u32 rtos, int rt_mode) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct rtable *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rtable *ort; /* Original route */ + int local; if (dest) { spin_lock(&dest->dst_lock); @@ -104,23 +111,95 @@ __ip_vs_get_out_rt(struct sk_buff *skb, struct ip_vs_conn *cp, u32 rtos) .oif = 0, .nl_u = { .ip4_u = { - .daddr = cp->daddr.ip, + .daddr = daddr, .saddr = 0, .tos = rtos, } }, }; if (ip_route_output_key(net, &rt, &fl)) { IP_VS_DBG_RL("ip_route_output error, dest: %pI4\n", - &cp->daddr.ip); + &daddr); return NULL; } } + local = rt->rt_flags & RTCF_LOCAL; + if (!((local ? 1 : 2) & rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI4\n", + (rt->rt_flags & RTCF_LOCAL) ? + "local":"non-local", &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + if (local && !(rt_mode & 4) && !((ort = skb_rtable(skb)) && + ort->rt_flags & RTCF_LOCAL)) { + IP_VS_DBG_RL("Redirect from non-local address %pI4 to local " + "requires NAT method, dest: %pI4\n", + &ip_hdr(skb)->daddr, &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + if (unlikely(!local && ipv4_is_loopback(ip_hdr(skb)->saddr))) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI4 " + "to non-local address, dest: %pI4\n", + &ip_hdr(skb)->saddr, &rt->rt_dst); + ip_rt_put(rt); + return NULL; + } + return rt; } +/* Reroute packet to local IPv4 stack after DNAT */ +static int +__ip_vs_reroute_locally(struct sk_buff *skb) +{ + struct rtable *rt = skb_rtable(skb); + struct net_device *dev = rt->dst.dev; + struct net *net = dev_net(dev); + struct iphdr *iph = ip_hdr(skb); + + if (rt->fl.iif) { + unsigned long orefdst = skb->_skb_refdst; + + if (ip_route_input(skb, iph->daddr, iph->saddr, + iph->tos, skb->dev)) + return 0; + refdst_drop(orefdst); + } else { + struct flowi fl = { + .oif = 0, + .nl_u = { + .ip4_u = { + .daddr = iph->daddr, + .saddr = iph->saddr, + .tos = RT_TOS(iph->tos), + } + }, + .mark = skb->mark, + }; + struct rtable *rt; + + if (ip_route_output_key(net, &rt, &fl)) + return 0; + if (!(rt->rt_flags & RTCF_LOCAL)) { + ip_rt_put(rt); + return 0; + } + /* Drop old route. */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } + return 1; +} + #ifdef CONFIG_IP_VS_IPV6 +static inline int __ip_vs_is_local_route6(struct rt6_info *rt) +{ + return rt->rt6i_dev && rt->rt6i_dev->flags & IFF_LOOPBACK; +} + static struct dst_entry * __ip_vs_route_output_v6(struct net *net, struct in6_addr *daddr, struct in6_addr *ret_saddr, int do_xfrm) @@ -155,14 +234,21 @@ out_err: return NULL; } +/* + * Get route to destination or remote server + * rt_mode: flags, &1=Allow local dest, &2=Allow non-local dest, + * &4=Allow redirect from remote daddr to local + */ static struct rt6_info * -__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, - struct in6_addr *ret_saddr, int do_xfrm) +__ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_dest *dest, + struct in6_addr *daddr, struct in6_addr *ret_saddr, + int do_xfrm, int rt_mode) { - struct net *net = dev_net(skb->dev); + struct net *net = dev_net(skb_dst(skb)->dev); struct rt6_info *rt; /* Route to the other host */ - struct ip_vs_dest *dest = cp->dest; + struct rt6_info *ort; /* Original route */ struct dst_entry *dst; + int local; if (dest) { spin_lock(&dest->dst_lock); @@ -188,13 +274,38 @@ __ip_vs_get_out_rt_v6(struct sk_buff *skb, struct ip_vs_conn *cp, ipv6_addr_copy(ret_saddr, &dest->dst_saddr); spin_unlock(&dest->dst_lock); } else { - dst = __ip_vs_route_output_v6(net, &cp->daddr.in6, ret_saddr, - do_xfrm); + dst = __ip_vs_route_output_v6(net, daddr, ret_saddr, do_xfrm); if (!dst) return NULL; rt = (struct rt6_info *) dst; } + local = __ip_vs_is_local_route6(rt); + if (!((local ? 1 : 2) & rt_mode)) { + IP_VS_DBG_RL("Stopping traffic to %s address, dest: %pI6\n", + local ? "local":"non-local", daddr); + dst_release(&rt->dst); + return NULL; + } + if (local && !(rt_mode & 4) && + !((ort = (struct rt6_info *) skb_dst(skb)) && + __ip_vs_is_local_route6(ort))) { + IP_VS_DBG_RL("Redirect from non-local address %pI6 to local " + "requires NAT method, dest: %pI6\n", + &ipv6_hdr(skb)->daddr, daddr); + dst_release(&rt->dst); + return NULL; + } + if (unlikely(!local && (!skb->dev || skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&ipv6_hdr(skb)->saddr) & + IPV6_ADDR_LOOPBACK)) { + IP_VS_DBG_RL("Stopping traffic from loopback address %pI6 " + "to non-local address, dest: %pI6\n", + &ipv6_hdr(skb)->saddr, daddr); + dst_release(&rt->dst); + return NULL; + } + return rt; } #endif @@ -227,23 +338,27 @@ ip_vs_dst_reset(struct ip_vs_dest *dest) __ret; \ }) -#define IP_VS_XMIT_NAT(pf, skb, cp) \ +#define IP_VS_XMIT_NAT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ else \ ip_vs_update_conntrack(skb, cp, 1); \ + if (local) \ + return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ } while (0) -#define IP_VS_XMIT(pf, skb, cp) \ +#define IP_VS_XMIT(pf, skb, cp, local) \ do { \ (skb)->ipvs_property = 1; \ if (likely(!((cp)->flags & IP_VS_CONN_F_NFCT))) \ ip_vs_notrack(skb); \ + if (local) \ + return NF_ACCEPT; \ skb_forward_csum(skb); \ NF_HOOK(pf, NF_INET_LOCAL_OUT, (skb), NULL, \ skb_dst(skb)->dev, dst_output); \ @@ -258,7 +373,7 @@ ip_vs_null_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { /* we do not touch skb and do not need pskb ptr */ - return NF_ACCEPT; + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); } @@ -271,27 +386,15 @@ int ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { - struct net *net = dev_net(skb->dev); struct rtable *rt; /* Route to the other host */ struct iphdr *iph = ip_hdr(skb); - u8 tos = iph->tos; int mtu; - struct flowi fl = { - .oif = 0, - .nl_u = { - .ip4_u = { - .daddr = iph->daddr, - .saddr = 0, - .tos = RT_TOS(tos), } }, - }; EnterFunction(10); - if (ip_route_output_key(net, &rt, &fl)) { - IP_VS_DBG_RL("%s(): ip_route_output error, dest: %pI4\n", - __func__, &iph->daddr); + if (!(rt = __ip_vs_get_out_rt(skb, NULL, iph->daddr, + RT_TOS(iph->tos), 2))) goto tx_error_icmp; - } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -319,7 +422,7 @@ ip_vs_bypass_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -337,18 +440,14 @@ int ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct ip_vs_protocol *pp) { - struct net *net = dev_net(skb->dev); - struct dst_entry *dst; struct rt6_info *rt; /* Route to the other host */ struct ipv6hdr *iph = ipv6_hdr(skb); int mtu; EnterFunction(10); - dst = __ip_vs_route_output_v6(net, &iph->daddr, NULL, 0); - if (!dst) + if (!(rt = __ip_vs_get_out_rt_v6(skb, NULL, &iph->daddr, NULL, 0, 2))) goto tx_error_icmp; - rt = (struct rt6_info *) dst; /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -376,7 +475,7 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -401,6 +500,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int mtu; struct iphdr *iph = ip_hdr(skb); + int local; EnterFunction(10); @@ -414,16 +514,40 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), 1|2|4))) goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { + IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { - ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -433,16 +557,27 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) - goto tx_error; + goto tx_error_put; ip_hdr(skb)->daddr = cp->daddr.ip; ip_send_check(ip_hdr(skb)); + if (!local) { + /* drop old route */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. + */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length @@ -452,7 +587,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); LeaveFunction(10); return NF_STOLEN; @@ -475,6 +610,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, { struct rt6_info *rt; /* Route to the other host */ int mtu; + int local; EnterFunction(10); @@ -489,18 +625,44 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, IP_VS_DBG(10, "filled cport=%d\n", ntohs(*p)); } - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2|4))) goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG_RL_PKT(10, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to local address"); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG_RL_PKT(1, pp, skb, 0, + "ip_vs_nat_xmit_v6(): " + "stopping DNAT to loopback address"); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -510,14 +672,19 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop old route */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - /* mangle the packet */ if (pp->dnat_handler && !pp->dnat_handler(skb, pp, cp)) goto tx_error; - ipv6_hdr(skb)->daddr = cp->daddr.in6; + ipv6_addr_copy(&ipv6_hdr(skb)->daddr, &cp->daddr.in6); + + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? */ + dst_release(&rt->dst); + } IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); @@ -528,7 +695,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); LeaveFunction(10); return NF_STOLEN; @@ -588,16 +755,20 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(tos), 1|2))) goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct iphdr); if (mtu < 68) { - ip_rt_put(rt); IP_VS_DBG_RL("%s(): mtu less than 68\n", __func__); - goto tx_error; + goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); @@ -607,9 +778,8 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if ((old_iph->frag_off & htons(IP_DF)) && mtu < ntohs(old_iph->tot_len)) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - ip_rt_put(rt); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* @@ -678,6 +848,9 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; +tx_error_put: + ip_rt_put(rt); + goto tx_error; } #ifdef CONFIG_IP_VS_IPV6 @@ -703,27 +876,29 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - rt = __ip_vs_get_out_rt_v6(skb, cp, &saddr, 1); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, + &saddr, 1, 1|2))) goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } tdev = rt->dst.dev; mtu = dst_mtu(&rt->dst) - sizeof(struct ipv6hdr); if (mtu < IPV6_MIN_MTU) { - dst_release(&rt->dst); IP_VS_DBG_RL("%s(): mtu less than %d\n", __func__, IPV6_MIN_MTU); - goto tx_error; + goto tx_error_put; } if (skb_dst(skb)) skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* @@ -789,6 +964,9 @@ tx_error: kfree_skb(skb); LeaveFunction(10); return NF_STOLEN; +tx_error_put: + dst_release(&rt->dst); + goto tx_error; } #endif @@ -807,8 +985,13 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(iph->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(iph->tos), 1|2))) goto tx_error_icmp; + if (rt->rt_flags & RTCF_LOCAL) { + ip_rt_put(rt); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 1); + } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -836,7 +1019,7 @@ ip_vs_dr_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT(NFPROTO_IPV4, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -859,9 +1042,13 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2))) goto tx_error_icmp; + if (__ip_vs_is_local_route6(rt)) { + dst_release(&rt->dst); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 1); + } /* MTU checking */ mtu = dst_mtu(&rt->dst); @@ -889,7 +1076,7 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT(NFPROTO_IPV6, skb, cp, 0); LeaveFunction(10); return NF_STOLEN; @@ -915,6 +1102,7 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct rtable *rt; /* Route to the other host */ int mtu; int rc; + int local; EnterFunction(10); @@ -935,16 +1123,43 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - if (!(rt = __ip_vs_get_out_rt(skb, cp, RT_TOS(ip_hdr(skb)->tos)))) + if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, + RT_TOS(ip_hdr(skb)->tos), 1|2|4))) goto tx_error_icmp; + local = rt->rt_flags & RTCF_LOCAL; + + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI4\n", + __func__, &cp->daddr.ip); + goto tx_error_put; + } /* MTU checking */ mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (ip_hdr(skb)->frag_off & htons(IP_DF))) { - ip_rt_put(rt); icmp_send(skb, ICMP_DEST_UNREACH, ICMP_FRAG_NEEDED, htonl(mtu)); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -954,16 +1169,27 @@ ip_vs_icmp_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - ip_vs_nat_icmp(skb, pp, cp, 0); + if (!local) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + ip_rt_put(rt); + /* + * Some IPv4 replies get local address from routes, + * not from iph, so while we DNAT after routing + * we need this second input/output route. + */ + if (!__ip_vs_reroute_locally(skb)) + goto tx_error; + } + /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV4, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV4, skb, cp, local); rc = NF_STOLEN; goto out; @@ -989,6 +1215,7 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct rt6_info *rt; /* Route to the other host */ int mtu; int rc; + int local; EnterFunction(10); @@ -1009,17 +1236,44 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, * mangle and send the packet here (only for VS/NAT) */ - rt = __ip_vs_get_out_rt_v6(skb, cp, NULL, 0); - if (!rt) + if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, NULL, + 0, 1|2|4))) goto tx_error_icmp; + local = __ip_vs_is_local_route6(rt); + /* + * Avoid duplicate tuple in reply direction for NAT traffic + * to local address when connection is sync-ed + */ +#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) + if (cp->flags & IP_VS_CONN_F_SYNC && local) { + enum ip_conntrack_info ctinfo; + struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); + + if (ct && !nf_ct_is_untracked(ct)) { + IP_VS_DBG(10, "%s(): " + "stopping DNAT to local address %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + } +#endif + + /* From world but DNAT to loopback address? */ + if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && + ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { + IP_VS_DBG(1, "%s(): " + "stopping DNAT to loopback %pI6\n", + __func__, &cp->daddr.in6); + goto tx_error_put; + } + /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); - goto tx_error; + goto tx_error_put; } /* copy-on-write the packet before mangling it */ @@ -1029,16 +1283,21 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, if (skb_cow(skb, rt->dst.dev->hard_header_len)) goto tx_error_put; - /* drop the old route when skb is not shared */ - skb_dst_drop(skb); - skb_dst_set(skb, &rt->dst); - ip_vs_nat_icmp_v6(skb, pp, cp, 0); + if (!local || !skb->dev) { + /* drop the old route when skb is not shared */ + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + } else { + /* destined to loopback, do we need to change route? */ + dst_release(&rt->dst); + } + /* Another hack: avoid icmp_send in ip_fragment */ skb->local_df = 1; - IP_VS_XMIT(NFPROTO_IPV6, skb, cp); + IP_VS_XMIT_NAT(NFPROTO_IPV6, skb, cp, local); rc = NF_STOLEN; goto out; -- cgit v1.1 From cb59155f21d4c0507d2034c2953f6a3f7806913d Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:40:51 +0300 Subject: ipvs: changes for local client This patch deals with local client processing. Prefer LOCAL_OUT hook for scheduling connections from local clients. LOCAL_IN is still supported if the packets are not marked as processed in LOCAL_OUT. The idea to process requests in LOCAL_OUT is to alter conntrack reply before it is confirmed at POST_ROUTING. If the local requests are processed in LOCAL_IN the conntrack can not be updated and matching by state is impossible. Add the following handlers: - ip_vs_reply[46] at LOCAL_IN:99 to process replies from remote real servers to local clients. Now when both replies from remote real servers (ip_vs_reply*) and local real servers (ip_vs_local_reply*) are handled it is safe to remove the conn_out_get call from ip_vs_in because it does not support related ICMP packets. - ip_vs_local_request[46] at LOCAL_OUT:-98 to process requests from local client Handling in LOCAL_OUT causes some changes: - as skb->dev, skb->protocol and skb->pkt_type are not defined in LOCAL_OUT make sure we set skb->dev before calling icmpv6_send, prefer skb_dst(skb) for struct net and remove the skb->protocol checks from TUN transmitters. [ horms@verge.net.au: removed trailing whitespace ] Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 266 ++++++++++++++++++++++++++++------------ net/netfilter/ipvs/ip_vs_xmit.c | 51 +++++--- 2 files changed, 225 insertions(+), 92 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index a6c8aff1..5fbcf67 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -529,9 +529,14 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb, * ICMP_PORT_UNREACH is sent here no matter it is TCP/UDP. --WZ */ #ifdef CONFIG_IP_VS_IPV6 - if (svc->af == AF_INET6) + if (svc->af == AF_INET6) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_PORT_UNREACH, 0); - else + } else #endif icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0); @@ -1065,57 +1070,61 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) */ cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (unlikely(!cp)) { - if (sysctl_ip_vs_nat_icmp_send && - (pp->protocol == IPPROTO_TCP || - pp->protocol == IPPROTO_UDP || - pp->protocol == IPPROTO_SCTP)) { - __be16 _ports[2], *pptr; - - pptr = skb_header_pointer(skb, iph.len, - sizeof(_ports), _ports); - if (pptr == NULL) - return NF_ACCEPT; /* Not for me */ - if (ip_vs_lookup_real_service(af, iph.protocol, - &iph.saddr, - pptr[0])) { - /* - * Notify the real server: there is no - * existing entry if it is not RST - * packet or not TCP packet. - */ - if ((iph.protocol != IPPROTO_TCP && - iph.protocol != IPPROTO_SCTP) - || ((iph.protocol == IPPROTO_TCP - && !is_tcp_reset(skb, iph.len)) - || (iph.protocol == IPPROTO_SCTP - && !is_sctp_abort(skb, - iph.len)))) { + if (likely(cp)) + return handle_response(af, skb, pp, cp, iph.len); + if (sysctl_ip_vs_nat_icmp_send && + (pp->protocol == IPPROTO_TCP || + pp->protocol == IPPROTO_UDP || + pp->protocol == IPPROTO_SCTP)) { + __be16 _ports[2], *pptr; + + pptr = skb_header_pointer(skb, iph.len, + sizeof(_ports), _ports); + if (pptr == NULL) + return NF_ACCEPT; /* Not for me */ + if (ip_vs_lookup_real_service(af, iph.protocol, + &iph.saddr, + pptr[0])) { + /* + * Notify the real server: there is no + * existing entry if it is not RST + * packet or not TCP packet. + */ + if ((iph.protocol != IPPROTO_TCP && + iph.protocol != IPPROTO_SCTP) + || ((iph.protocol == IPPROTO_TCP + && !is_tcp_reset(skb, iph.len)) + || (iph.protocol == IPPROTO_SCTP + && !is_sctp_abort(skb, + iph.len)))) { #ifdef CONFIG_IP_VS_IPV6 - if (af == AF_INET6) - icmpv6_send(skb, - ICMPV6_DEST_UNREACH, - ICMPV6_PORT_UNREACH, - 0); - else + if (af == AF_INET6) { + struct net *net = + dev_net(skb_dst(skb)->dev); + + if (!skb->dev) + skb->dev = net->loopback_dev; + icmpv6_send(skb, + ICMPV6_DEST_UNREACH, + ICMPV6_PORT_UNREACH, + 0); + } else #endif - icmp_send(skb, - ICMP_DEST_UNREACH, - ICMP_PORT_UNREACH, 0); - return NF_DROP; - } + icmp_send(skb, + ICMP_DEST_UNREACH, + ICMP_PORT_UNREACH, 0); + return NF_DROP; } } - IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); - return NF_ACCEPT; } - - return handle_response(af, skb, pp, cp, iph.len); + IP_VS_DBG_PKT(12, pp, skb, 0, + "ip_vs_out: packet continues traversal as normal"); + return NF_ACCEPT; } /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. * Check if packet is reply for established ip_vs_conn. */ static unsigned int @@ -1147,7 +1156,8 @@ ip_vs_local_reply4(unsigned int hooknum, struct sk_buff *skb, #ifdef CONFIG_IP_VS_IPV6 /* - * It is hooked at the NF_INET_FORWARD chain, used only for VS/NAT. + * It is hooked at the NF_INET_FORWARD and NF_INET_LOCAL_IN chain, + * used only for VS/NAT. * Check if packet is reply for established ip_vs_conn. */ static unsigned int @@ -1404,34 +1414,43 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) * and send it on its way... */ static unsigned int -ip_vs_in(unsigned int hooknum, struct sk_buff *skb, - const struct net_device *in, const struct net_device *out, - int (*okfn)(struct sk_buff *)) +ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) { struct ip_vs_iphdr iph; struct ip_vs_protocol *pp; struct ip_vs_conn *cp; - int ret, restart, af, pkts; + int ret, restart, pkts; /* Already marked as IPVS request or reply? */ if (skb->ipvs_property) return NF_ACCEPT; - af = (skb->protocol == htons(ETH_P_IP)) ? AF_INET : AF_INET6; - - ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); - /* - * Big tappo: only PACKET_HOST, including loopback for local client - * Don't handle local packets on IPv6 for now + * Big tappo: + * - remote client: only PACKET_HOST + * - route: used for struct net when skb->dev is unset */ - if (unlikely(skb->pkt_type != PACKET_HOST)) { - IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s ignored\n", - skb->pkt_type, - iph.protocol, - IP_VS_DBG_ADDR(af, &iph.daddr)); + if (unlikely((skb->pkt_type != PACKET_HOST && + hooknum != NF_INET_LOCAL_OUT) || + !skb_dst(skb))) { + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + IP_VS_DBG_BUF(12, "packet type=%d proto=%d daddr=%s" + " ignored in hook %u\n", + skb->pkt_type, iph.protocol, + IP_VS_DBG_ADDR(af, &iph.daddr), hooknum); return NF_ACCEPT; } + ip_vs_fill_iphdr(af, skb_network_header(skb), &iph); + + /* Bad... Do not break raw sockets */ + if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT && + af == AF_INET)) { + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(skb->sk); + + if (inet && sk->sk_family == PF_INET && inet->nodefrag) + return NF_ACCEPT; + } #ifdef CONFIG_IP_VS_IPV6 if (af == AF_INET6) { @@ -1467,11 +1486,6 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { int v; - /* For local client packets, it could be a response */ - cp = pp->conn_out_get(af, skb, pp, &iph, iph.len, 0); - if (cp) - return handle_response(af, skb, pp, cp, iph.len); - if (!pp->conn_schedule(af, skb, pp, &v, &cp)) return v; } @@ -1479,7 +1493,7 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ IP_VS_DBG_PKT(12, pp, skb, 0, - "packet continues traversal as normal"); + "ip_vs_in: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1550,6 +1564,72 @@ out: return ret; } +/* + * AF_INET handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET); +} + +/* + * AF_INET handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request4(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET); + local_bh_enable(); + return verdict; +} + +#ifdef CONFIG_IP_VS_IPV6 + +/* + * AF_INET6 handler in NF_INET_LOCAL_IN chain + * Schedule and forward packets from remote clients + */ +static unsigned int +ip_vs_remote_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ip_vs_in(hooknum, skb, AF_INET6); +} + +/* + * AF_INET6 handler in NF_INET_LOCAL_OUT chain + * Schedule and forward packets from local clients + */ +static unsigned int +ip_vs_local_request6(unsigned int hooknum, struct sk_buff *skb, + const struct net_device *in, const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + unsigned int verdict; + + /* Disable BH in LOCAL_OUT until all places are fixed */ + local_bh_disable(); + verdict = ip_vs_in(hooknum, skb, AF_INET6); + local_bh_enable(); + return verdict; +} + +#endif + /* * It is hooked at the NF_INET_FORWARD chain, in order to catch ICMP @@ -1590,15 +1670,23 @@ ip_vs_forward_icmp_v6(unsigned int hooknum, struct sk_buff *skb, static struct nf_hook_ops ip_vs_ops[] __read_mostly = { + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_IN, + .priority = 99, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_in, + .hook = ip_vs_remote_request4, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, + .hooknum = NF_INET_LOCAL_IN, + .priority = 101, }, /* Before ip_vs_in, change source only for VS/NAT */ { @@ -1608,14 +1696,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_OUT, .priority = -99, }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request4, + .owner = THIS_MODULE, + .pf = PF_INET, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -98, + }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp, .owner = THIS_MODULE, .pf = PF_INET, - .hooknum = NF_INET_FORWARD, - .priority = 99, + .hooknum = NF_INET_FORWARD, + .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { @@ -1626,15 +1722,23 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .priority = 100, }, #ifdef CONFIG_IP_VS_IPV6 + /* After packet filtering, change source only for VS/NAT */ + { + .hook = ip_vs_reply6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_IN, + .priority = 99, + }, /* After packet filtering, forward packet through VS/DR, VS/TUN, * or VS/NAT(change destination), so that filtering rules can be * applied to IPVS. */ { - .hook = ip_vs_in, + .hook = ip_vs_remote_request6, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_INET_LOCAL_IN, - .priority = 100, + .hooknum = NF_INET_LOCAL_IN, + .priority = 101, }, /* Before ip_vs_in, change source only for VS/NAT */ { @@ -1644,14 +1748,22 @@ static struct nf_hook_ops ip_vs_ops[] __read_mostly = { .hooknum = NF_INET_LOCAL_OUT, .priority = -99, }, + /* After mangle, schedule and forward local requests */ + { + .hook = ip_vs_local_request6, + .owner = THIS_MODULE, + .pf = PF_INET6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = -98, + }, /* After packet filtering (but before ip_vs_out_icmp), catch icmp * destined for 0.0.0.0/0, which is for incoming IPVS connections */ { .hook = ip_vs_forward_icmp_v6, .owner = THIS_MODULE, .pf = PF_INET6, - .hooknum = NF_INET_FORWARD, - .priority = 99, + .hooknum = NF_INET_FORWARD, + .priority = 99, }, /* After packet filtering, change source only for VS/NAT */ { diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 8608882..97b5361 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -11,6 +11,16 @@ * * Changes: * + * Description of forwarding methods: + * - all transmitters are called from LOCAL_IN (remote clients) and + * LOCAL_OUT (local clients) but for ICMP can be called from FORWARD + * - not all connections have destination server, for example, + * connections in backup server when fwmark is used + * - bypass connections use daddr from packet + * LOCAL_OUT rules: + * - skb->dev is NULL, skb->protocol is not set (both are set in POST_ROUTING) + * - skb->pkt_type is not set yet + * - the only place where we can see skb->sk != NULL */ #define KMSG_COMPONENT "IPVS" @@ -452,8 +462,13 @@ ip_vs_bypass_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { - dst_release(&rt->dst); + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); + dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error; } @@ -659,6 +674,11 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); @@ -748,13 +768,6 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (skb->protocol != htons(ETH_P_IP)) { - IP_VS_DBG_RL("%s(): protocol error, " - "ETH_P_IP: %d, skb protocol: %d\n", - __func__, htons(ETH_P_IP), skb->protocol); - goto tx_error; - } - if (!(rt = __ip_vs_get_out_rt(skb, cp->dest, cp->daddr.ip, RT_TOS(tos), 1|2))) goto tx_error_icmp; @@ -869,13 +882,6 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, EnterFunction(10); - if (skb->protocol != htons(ETH_P_IPV6)) { - IP_VS_DBG_RL("%s(): protocol error, " - "ETH_P_IPV6: %d, skb protocol: %d\n", - __func__, htons(ETH_P_IPV6), skb->protocol); - goto tx_error; - } - if (!(rt = __ip_vs_get_out_rt_v6(skb, cp->dest, &cp->daddr.in6, &saddr, 1, 1|2))) goto tx_error_icmp; @@ -896,6 +902,11 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb_dst(skb)->ops->update_pmtu(skb_dst(skb), mtu); if (mtu < ntohs(old_iph->payload_len) + sizeof(struct ipv6hdr)) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; @@ -1053,6 +1064,11 @@ ip_vs_dr_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); dst_release(&rt->dst); IP_VS_DBG_RL("%s(): frag needed\n", __func__); @@ -1271,6 +1287,11 @@ ip_vs_icmp_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* MTU checking */ mtu = dst_mtu(&rt->dst); if (skb->len > mtu) { + if (!skb->dev) { + struct net *net = dev_net(skb_dst(skb)->dev); + + skb->dev = net->loopback_dev; + } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); IP_VS_DBG_RL("%s(): frag needed\n", __func__); goto tx_error_put; -- cgit v1.1 From 3233759be7eeca9998c514b8f49e8cf2b85e64d3 Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:43:36 +0300 Subject: ipvs: inherit forwarding method in backup Connections in backup server should inherit the forwarding method from real server. It is a way to fix a problem where the forwarding method in backup connection is damaged by logical OR operation with the real server's connection flags. And the change is needed for setups where the backup server uses different forwarding method for the same real servers. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_conn.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index 1d1a529..e9adecd 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -563,6 +563,8 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) */ if (!(cp->flags & IP_VS_CONN_F_TEMPLATE)) conn_flags &= ~IP_VS_CONN_F_INACTIVE; + /* connections inherit forwarding method from dest */ + cp->flags &= ~IP_VS_CONN_F_FWD_MASK; } cp->flags |= conn_flags; cp->dest = dest; -- cgit v1.1 From 0d79641a96d612aaa6d57a4d4f521d7ed9c9ccdd Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Sun, 17 Oct 2010 16:46:17 +0300 Subject: ipvs: provide address family for debugging As skb->protocol is not valid in LOCAL_OUT add parameter for address family in packet debugging functions. Even if ports are not present in AH and ESP change them to use ip_vs_tcpudp_debug_packet to show at least valid addresses as before. This patch removes the last user of skb->protocol in IPVS. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/netfilter/ipvs/ip_vs_core.c | 41 +++++++++++++++----------- net/netfilter/ipvs/ip_vs_proto.c | 8 ++--- net/netfilter/ipvs/ip_vs_proto_ah_esp.c | 52 ++------------------------------- net/netfilter/ipvs/ip_vs_proto_sctp.c | 2 +- net/netfilter/ipvs/ip_vs_proto_tcp.c | 4 +-- net/netfilter/ipvs/ip_vs_proto_udp.c | 4 +-- net/netfilter/ipvs/ip_vs_xmit.c | 18 +++++++----- 7 files changed, 45 insertions(+), 84 deletions(-) (limited to 'net') diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c index 5fbcf67..b4e51e9 100644 --- a/net/netfilter/ipvs/ip_vs_core.c +++ b/net/netfilter/ipvs/ip_vs_core.c @@ -365,7 +365,8 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, * with persistence the connection is created on SYN+ACK. */ if (pptr[0] == FTPDATA) { - IP_VS_DBG_PKT(12, pp, skb, 0, "Not scheduling FTPDATA"); + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, + "Not scheduling FTPDATA"); return NULL; } @@ -376,7 +377,7 @@ ip_vs_schedule(struct ip_vs_service *svc, struct sk_buff *skb, if ((!skb->dev || skb->dev->flags & IFF_LOOPBACK) && (svc->flags & IP_VS_SVC_F_PERSISTENT || svc->fwmark) && (cp = pp->conn_in_get(svc->af, skb, pp, &iph, iph.len, 1))) { - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, svc->af, pp, skb, 0, "Not scheduling reply for existing connection"); __ip_vs_conn_put(cp); return NULL; @@ -617,10 +618,10 @@ void ip_vs_nat_icmp(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_UNNECESSARY; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered outgoing ICMP"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, + IP_VS_DBG_PKT(11, AF_INET, pp, skb, (void *)ciph - (void *)iph, "Forwarding altered incoming ICMP"); } @@ -662,11 +663,13 @@ void ip_vs_nat_icmp_v6(struct sk_buff *skb, struct ip_vs_protocol *pp, skb->ip_summed = CHECKSUM_PARTIAL; if (inout) - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered outgoing ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered outgoing ICMPv6"); else - IP_VS_DBG_PKT(11, pp, skb, (void *)ciph - (void *)iph, - "Forwarding altered incoming ICMPv6"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, + (void *)ciph - (void *)iph, + "Forwarding altered incoming ICMPv6"); } #endif @@ -798,7 +801,8 @@ static int ip_vs_out_icmp(struct sk_buff *skb, int *related, pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking outgoing ICMP for"); offset += cih->ihl * 4; @@ -874,7 +878,8 @@ static int ip_vs_out_icmp_v6(struct sk_buff *skb, int *related, if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking outgoing ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking outgoing ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -922,7 +927,7 @@ static unsigned int handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, struct ip_vs_conn *cp, int ihl) { - IP_VS_DBG_PKT(11, pp, skb, 0, "Outgoing packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Outgoing packet"); if (!skb_make_writable(skb, ihl)) goto drop; @@ -967,7 +972,7 @@ handle_response(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, ip_route_me_harder(skb, RTN_LOCAL) != 0) goto drop; - IP_VS_DBG_PKT(10, pp, skb, 0, "After SNAT"); + IP_VS_DBG_PKT(10, af, pp, skb, 0, "After SNAT"); ip_vs_out_stats(cp, skb); ip_vs_set_state(cp, IP_VS_DIR_OUTPUT, skb, pp); @@ -1117,7 +1122,7 @@ ip_vs_out(unsigned int hooknum, struct sk_buff *skb, int af) } } } - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_out: packet continues traversal as normal"); return NF_ACCEPT; } @@ -1253,7 +1258,8 @@ ip_vs_in_icmp(struct sk_buff *skb, int *related, unsigned int hooknum) pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMP for"); + IP_VS_DBG_PKT(11, AF_INET, pp, skb, offset, + "Checking incoming ICMP for"); offset += cih->ihl * 4; @@ -1364,7 +1370,8 @@ ip_vs_in_icmp_v6(struct sk_buff *skb, int *related, unsigned int hooknum) if (unlikely(cih->nexthdr == IPPROTO_FRAGMENT && pp->dont_defrag)) return NF_ACCEPT; - IP_VS_DBG_PKT(11, pp, skb, offset, "Checking incoming ICMPv6 for"); + IP_VS_DBG_PKT(11, AF_INET6, pp, skb, offset, + "Checking incoming ICMPv6 for"); offset += sizeof(struct ipv6hdr); @@ -1492,12 +1499,12 @@ ip_vs_in(unsigned int hooknum, struct sk_buff *skb, int af) if (unlikely(!cp)) { /* sorry, all this trouble for a no-hit :) */ - IP_VS_DBG_PKT(12, pp, skb, 0, + IP_VS_DBG_PKT(12, af, pp, skb, 0, "ip_vs_in: packet continues traversal as normal"); return NF_ACCEPT; } - IP_VS_DBG_PKT(11, pp, skb, 0, "Incoming packet"); + IP_VS_DBG_PKT(11, af, pp, skb, 0, "Incoming packet"); /* Check the server status */ if (cp->dest && !(cp->dest->flags & IP_VS_DEST_F_AVAILABLE)) { diff --git a/net/netfilter/ipvs/ip_vs_proto.c b/net/netfilter/ipvs/ip_vs_proto.c index 027f654..c539983 100644 --- a/net/netfilter/ipvs/ip_vs_proto.c +++ b/net/netfilter/ipvs/ip_vs_proto.c @@ -172,8 +172,8 @@ ip_vs_tcpudp_debug_packet_v4(struct ip_vs_protocol *pp, else if (ih->frag_off & htons(IP_OFFSET)) sprintf(buf, "%pI4->%pI4 frag", &ih->saddr, &ih->daddr); else { - __be16 _ports[2], *pptr -; + __be16 _ports[2], *pptr; + pptr = skb_header_pointer(skb, offset + ih->ihl*4, sizeof(_ports), _ports); if (pptr == NULL) @@ -223,13 +223,13 @@ ip_vs_tcpudp_debug_packet_v6(struct ip_vs_protocol *pp, void -ip_vs_tcpudp_debug_packet(struct ip_vs_protocol *pp, +ip_vs_tcpudp_debug_packet(int af, struct ip_vs_protocol *pp, const struct sk_buff *skb, int offset, const char *msg) { #ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) + if (af == AF_INET6) ip_vs_tcpudp_debug_packet_v6(pp, skb, offset, msg); else #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c index 8956ef3..3a04611 100644 --- a/net/netfilter/ipvs/ip_vs_proto_ah_esp.c +++ b/net/netfilter/ipvs/ip_vs_proto_ah_esp.c @@ -117,54 +117,6 @@ ah_esp_conn_schedule(int af, struct sk_buff *skb, struct ip_vs_protocol *pp, return 0; } - -static void -ah_esp_debug_packet_v4(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct iphdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "TRUNCATED"); - else - sprintf(buf, "%pI4->%pI4", &ih->saddr, &ih->daddr); - - pr_debug("%s: %s %s\n", msg, pp->name, buf); -} - -#ifdef CONFIG_IP_VS_IPV6 -static void -ah_esp_debug_packet_v6(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ - char buf[256]; - struct ipv6hdr _iph, *ih; - - ih = skb_header_pointer(skb, offset, sizeof(_iph), &_iph); - if (ih == NULL) - sprintf(buf, "TRUNCATED"); - else - sprintf(buf, "%pI6->%pI6", &ih->saddr, &ih->daddr); - - pr_debug("%s: %s %s\n", msg, pp->name, buf); -} -#endif - -static void -ah_esp_debug_packet(struct ip_vs_protocol *pp, const struct sk_buff *skb, - int offset, const char *msg) -{ -#ifdef CONFIG_IP_VS_IPV6 - if (skb->protocol == htons(ETH_P_IPV6)) - ah_esp_debug_packet_v6(pp, skb, offset, msg); - else -#endif - ah_esp_debug_packet_v4(pp, skb, offset, msg); -} - - static void ah_esp_init(struct ip_vs_protocol *pp) { /* nothing to do now */ @@ -195,7 +147,7 @@ struct ip_vs_protocol ip_vs_protocol_ah = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ .set_state_timeout = NULL, }; @@ -219,7 +171,7 @@ struct ip_vs_protocol ip_vs_protocol_esp = { .register_app = NULL, .unregister_app = NULL, .app_conn_bind = NULL, - .debug_packet = ah_esp_debug_packet, + .debug_packet = ip_vs_tcpudp_debug_packet, .timeout_change = NULL, /* ISAKMP */ }; #endif diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c index 9ab5232..d254345 100644 --- a/net/netfilter/ipvs/ip_vs_proto_sctp.c +++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c @@ -176,7 +176,7 @@ sctp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) if (val != cmp) { /* CRC failure, dump it. */ - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_tcp.c b/net/netfilter/ipvs/ip_vs_proto_tcp.c index 85d80a6..f6c5200 100644 --- a/net/netfilter/ipvs/ip_vs_proto_tcp.c +++ b/net/netfilter/ipvs/ip_vs_proto_tcp.c @@ -300,7 +300,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -311,7 +311,7 @@ tcp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - tcphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_proto_udp.c b/net/netfilter/ipvs/ip_vs_proto_udp.c index 5d21f08..9d106a0 100644 --- a/net/netfilter/ipvs/ip_vs_proto_udp.c +++ b/net/netfilter/ipvs/ip_vs_proto_udp.c @@ -314,7 +314,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ipv6_hdr(skb)->nexthdr, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } @@ -325,7 +325,7 @@ udp_csum_check(int af, struct sk_buff *skb, struct ip_vs_protocol *pp) skb->len - udphoff, ip_hdr(skb)->protocol, skb->csum)) { - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, af, pp, skb, 0, "Failed checksum for"); return 0; } diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c index 97b5361..de04ea3 100644 --- a/net/netfilter/ipvs/ip_vs_xmit.c +++ b/net/netfilter/ipvs/ip_vs_xmit.c @@ -543,7 +543,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, pp, skb, 0, "ip_vs_nat_xmit(): " + IP_VS_DBG_RL_PKT(10, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): " "stopping DNAT to local address"); goto tx_error_put; } @@ -552,7 +553,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && ipv4_is_loopback(rt->rt_dst) && skb_rtable(skb)->fl.iif) { - IP_VS_DBG_RL_PKT(1, pp, skb, 0, "ip_vs_nat_xmit(): " + IP_VS_DBG_RL_PKT(1, AF_INET, pp, skb, 0, "ip_vs_nat_xmit(): " "stopping DNAT to loopback address"); goto tx_error_put; } @@ -561,7 +562,8 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, mtu = dst_mtu(&rt->dst); if ((skb->len > mtu) && (iph->frag_off & htons(IP_DF))) { icmp_send(skb, ICMP_DEST_UNREACH,ICMP_FRAG_NEEDED, htonl(mtu)); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, "ip_vs_nat_xmit(): frag needed for"); + IP_VS_DBG_RL_PKT(0, AF_INET, pp, skb, 0, + "ip_vs_nat_xmit(): frag needed for"); goto tx_error_put; } @@ -593,7 +595,7 @@ ip_vs_nat_xmit(struct sk_buff *skb, struct ip_vs_conn *cp, goto tx_error; } - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still @@ -654,7 +656,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, struct nf_conn *ct = ct = nf_ct_get(skb, &ctinfo); if (ct && !nf_ct_is_untracked(ct)) { - IP_VS_DBG_RL_PKT(10, pp, skb, 0, + IP_VS_DBG_RL_PKT(10, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to local address"); goto tx_error_put; @@ -665,7 +667,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, /* From world but DNAT to loopback address? */ if (local && skb->dev && !(skb->dev->flags & IFF_LOOPBACK) && ipv6_addr_type(&rt->rt6i_dst.addr) & IPV6_ADDR_LOOPBACK) { - IP_VS_DBG_RL_PKT(1, pp, skb, 0, + IP_VS_DBG_RL_PKT(1, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): " "stopping DNAT to loopback address"); goto tx_error_put; @@ -680,7 +682,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, skb->dev = net->loopback_dev; } icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu); - IP_VS_DBG_RL_PKT(0, pp, skb, 0, + IP_VS_DBG_RL_PKT(0, AF_INET6, pp, skb, 0, "ip_vs_nat_xmit_v6(): frag needed for"); goto tx_error_put; } @@ -706,7 +708,7 @@ ip_vs_nat_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp, dst_release(&rt->dst); } - IP_VS_DBG_PKT(10, pp, skb, 0, "After DNAT"); + IP_VS_DBG_PKT(10, AF_INET6, pp, skb, 0, "After DNAT"); /* FIXME: when application helper enlarges the packet and the length is larger than the MTU of outgoing device, there will be still -- cgit v1.1 From 7b5edbc4cfe2297b0915adea5aa1eafcafadbf06 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 15 Oct 2010 19:22:34 +0000 Subject: net/sched: fix missing spinlock init Under network load, doing : tc qdisc del dev eth0 root triggers : [ 167.193087] BUG: spinlock bad magic on CPU#3, udpflood/4928 [ 167.193139] lock: c15bc324, .magic: 00000000, .owner: /-1, .owner_cpu: -1 [ 167.193193] Pid: 4928, comm: udpflood Not tainted 2.6.36-rc7-11417-g215340c-dirty #323 [ 167.193245] Call Trace: [ 167.193292] [] ? printk+0x18/0x20 [ 167.193342] [] spin_bug+0xa3/0xf0 [ 167.193389] [] do_raw_spin_lock+0x7d/0x160 [ 167.193440] [] ? __dev_xmit_skb+0x27e/0x2b0 [ 167.193496] [] ? trace_hardirqs_on+0xb/0x10 [ 167.193545] [] _raw_spin_lock+0x3a/0x40 [ 167.193593] [] ? __dev_xmit_skb+0x27e/0x2b0 [ 167.193641] [] __dev_xmit_skb+0x27e/0x2b0 commit 79640a4ca695 (add additional lock to qdisc to increase throughput) forgot to initialize noop_qdisc and noqueue_qdisc busylock Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/sched/sch_generic.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'net') diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 3d57681..0abcc49 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -383,6 +383,7 @@ struct Qdisc noop_qdisc = { .list = LIST_HEAD_INIT(noop_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noop_qdisc.q.lock), .dev_queue = &noop_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noop_qdisc.busylock), }; EXPORT_SYMBOL(noop_qdisc); @@ -409,6 +410,7 @@ static struct Qdisc noqueue_qdisc = { .list = LIST_HEAD_INIT(noqueue_qdisc.list), .q.lock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.q.lock), .dev_queue = &noqueue_netdev_queue, + .busylock = __SPIN_LOCK_UNLOCKED(noqueue_qdisc.busylock), }; -- cgit v1.1 From 9b0c290e78d667e6a483bde8c7cef7dd15f49017 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 20 Oct 2010 22:03:38 +0000 Subject: fib: introduce fib_alias_accessed() helper Perf tools session at NFWS 2010 pointed out a false sharing on struct fib_alias that can be avoided pretty easily, if we set FA_S_ACCESSED bit only if needed (ie : not already set) Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/fib_hash.c | 3 ++- net/ipv4/fib_lookup.h | 7 +++++++ net/ipv4/fib_semantics.c | 2 +- net/ipv4/fib_trie.c | 3 ++- 4 files changed, 12 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 4f1aafd..43e1c59 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -335,7 +335,8 @@ void fib_table_select_default(struct fib_table *tb, if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; - fa->fa_state |= FA_S_ACCESSED; + + fib_alias_accessed(fa); if (fi == NULL) { if (next_fi != res->fi) diff --git a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h index 5072d8e..a29edf2 100644 --- a/net/ipv4/fib_lookup.h +++ b/net/ipv4/fib_lookup.h @@ -17,6 +17,13 @@ struct fib_alias { #define FA_S_ACCESSED 0x01 +/* Dont write on fa_state unless needed, to keep it shared on all cpus */ +static inline void fib_alias_accessed(struct fib_alias *fa) +{ + if (!(fa->fa_state & FA_S_ACCESSED)) + fa->fa_state |= FA_S_ACCESSED; +} + /* Exported by fib_semantics.c */ extern int fib_semantic_match(struct list_head *head, const struct flowi *flp, diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6734c9c..3e0da3e 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -901,7 +901,7 @@ int fib_semantic_match(struct list_head *head, const struct flowi *flp, if (fa->fa_scope < flp->fl4_scope) continue; - fa->fa_state |= FA_S_ACCESSED; + fib_alias_accessed(fa); err = fib_props[fa->fa_type].error; if (err == 0) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 31494f3..cd5e13a 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1838,7 +1838,8 @@ void fib_table_select_default(struct fib_table *tb, if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) continue; - fa->fa_state |= FA_S_ACCESSED; + + fib_alias_accessed(fa); if (fi == NULL) { if (next_fi != res->fi) -- cgit v1.1 From 11165f1457181e4499e5eada442434a07827ffd8 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 18 Oct 2010 14:27:29 +0000 Subject: socket: localize functions A couple of functions in socket.c are only used there and should be localized. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/socket.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/socket.c b/net/socket.c index 717a5f1..72da57d 100644 --- a/net/socket.c +++ b/net/socket.c @@ -209,8 +209,8 @@ int move_addr_to_kernel(void __user *uaddr, int ulen, struct sockaddr *kaddr) * specified. Zero is returned for a success. */ -int move_addr_to_user(struct sockaddr *kaddr, int klen, void __user *uaddr, - int __user *ulen) +static int move_addr_to_user(struct sockaddr *kaddr, int klen, + void __user *uaddr, int __user *ulen) { int err; int len; @@ -661,7 +661,8 @@ void __sock_recv_timestamp(struct msghdr *msg, struct sock *sk, } EXPORT_SYMBOL_GPL(__sock_recv_timestamp); -inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, struct sk_buff *skb) +static inline void sock_recv_drops(struct msghdr *msg, struct sock *sk, + struct sk_buff *skb) { if (sock_flag(sk, SOCK_RXQ_OVFL) && skb && skb->dropcount) put_cmsg(msg, SOL_SOCKET, SO_RXQ_OVFL, -- cgit v1.1 From d0280232813a6a5e2bfca6e9257b866352115c09 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Mon, 18 Oct 2010 14:03:21 +0000 Subject: bridge: make br_parse_ip_options static Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_netfilter.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 47c2dab..865fd76 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -215,7 +215,7 @@ static inline void nf_bridge_update_protocol(struct sk_buff *skb) * expected format */ -int br_parse_ip_options(struct sk_buff *skb) +static int br_parse_ip_options(struct sk_buff *skb) { struct ip_options *opt; struct iphdr *iph; -- cgit v1.1 From 6f747aca5e61778190d1e27bdc469f49149f0230 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 15 Oct 2010 05:15:59 +0000 Subject: xfrm6: make xfrm6_tunnel_free_spi local Function only defined and used in one file. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv6/xfrm6_tunnel.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index ac7584b..2969cad 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -199,7 +199,7 @@ static void x6spi_destroy_rcu(struct rcu_head *head) container_of(head, struct xfrm6_tunnel_spi, rcu_head)); } -void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) +static void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) { struct xfrm6_tunnel_net *xfrm6_tn = xfrm6_tunnel_pernet(net); struct xfrm6_tunnel_spi *x6spi; @@ -223,8 +223,6 @@ void xfrm6_tunnel_free_spi(struct net *net, xfrm_address_t *saddr) spin_unlock_bh(&xfrm6_tunnel_spi_lock); } -EXPORT_SYMBOL(xfrm6_tunnel_free_spi); - static int xfrm6_tunnel_output(struct xfrm_state *x, struct sk_buff *skb) { skb_push(skb, -skb_network_offset(skb)); -- cgit v1.1 From 8d8a0b1cc2a8f9794a3f1f747089b6a93774408d Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 15 Oct 2010 05:12:01 +0000 Subject: rtnetlink: remove rtnl_kill_links The function rtnl_kill_links is defined but never used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/rtnetlink.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net') diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index b2a718d..8121268 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -299,14 +299,6 @@ static void __rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) unregister_netdevice_many(&list_kill); } -void rtnl_kill_links(struct net *net, struct rtnl_link_ops *ops) -{ - rtnl_lock(); - __rtnl_kill_links(net, ops); - rtnl_unlock(); -} -EXPORT_SYMBOL_GPL(rtnl_kill_links); - /** * __rtnl_link_unregister - Unregister rtnl_link_ops from rtnetlink. * @ops: struct rtnl_link_ops * to unregister -- cgit v1.1 From 1c4c40c42da468ef02dc04940930c1926c964558 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Fri, 15 Oct 2010 05:14:19 +0000 Subject: xfrm: make xfrm_bundle_ok local Only used in one place. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/xfrm/xfrm_policy.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cbab6e1..044e778 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -50,6 +50,9 @@ static struct xfrm_policy_afinfo *xfrm_policy_get_afinfo(unsigned short family); static void xfrm_policy_put_afinfo(struct xfrm_policy_afinfo *afinfo); static void xfrm_init_pmtu(struct dst_entry *dst); static int stale_bundle(struct dst_entry *dst); +static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *xdst, + struct flowi *fl, int family, int strict); + static struct xfrm_policy *__xfrm_policy_unlink(struct xfrm_policy *pol, int dir); @@ -2276,7 +2279,7 @@ static void xfrm_init_pmtu(struct dst_entry *dst) * still valid. */ -int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, +static int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, struct flowi *fl, int family, int strict) { struct dst_entry *dst = &first->u.dst; @@ -2358,8 +2361,6 @@ int xfrm_bundle_ok(struct xfrm_policy *pol, struct xfrm_dst *first, return 1; } -EXPORT_SYMBOL(xfrm_bundle_ok); - int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo) { struct net *net; -- cgit v1.1 From 3511c9132f8b1e1b5634e41a3331c44b0c13be70 Mon Sep 17 00:00:00 2001 From: Changli Gao Date: Sat, 16 Oct 2010 13:04:08 +0000 Subject: net_sched: remove the unused parameter of qdisc_create_dflt() The first parameter dev isn't in use in qdisc_create_dflt(). Signed-off-by: Changli Gao Acked-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- net/sched/sch_atm.c | 5 ++--- net/sched/sch_cbq.c | 12 ++++++------ net/sched/sch_drr.c | 4 ++-- net/sched/sch_dsmark.c | 6 ++---- net/sched/sch_fifo.c | 3 +-- net/sched/sch_generic.c | 10 ++++------ net/sched/sch_hfsc.c | 8 +++----- net/sched/sch_htb.c | 8 +++----- net/sched/sch_mq.c | 2 +- net/sched/sch_multiq.c | 3 +-- net/sched/sch_netem.c | 3 +-- net/sched/sch_prio.c | 2 +- 12 files changed, 27 insertions(+), 39 deletions(-) (limited to 'net') diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 6318e11..2825407 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -275,8 +275,7 @@ static int atm_tc_change(struct Qdisc *sch, u32 classid, u32 parent, goto err_out; } flow->filter_list = NULL; - flow->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, classid); + flow->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (!flow->q) flow->q = &noop_qdisc; pr_debug("atm_tc_change: qdisc %p\n", flow->q); @@ -543,7 +542,7 @@ static int atm_tc_init(struct Qdisc *sch, struct nlattr *opt) INIT_LIST_HEAD(&p->flows); INIT_LIST_HEAD(&p->link.list); list_add(&p->link.list, &p->flows); - p->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + p->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (!p->link.q) p->link.q = &noop_qdisc; diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index 28c01ef..eb76315 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1379,9 +1379,9 @@ static int cbq_init(struct Qdisc *sch, struct nlattr *opt) q->link.sibling = &q->link; q->link.common.classid = sch->handle; q->link.qdisc = sch; - if (!(q->link.q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, - sch->handle))) + q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, + sch->handle); + if (!q->link.q) q->link.q = &noop_qdisc; q->link.priority = TC_CBQ_MAXPRIO-1; @@ -1623,7 +1623,7 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, struct cbq_class *cl = (struct cbq_class*)arg; if (new == NULL) { - new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid); if (new == NULL) return -ENOBUFS; @@ -1874,8 +1874,8 @@ cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **t cl->R_tab = rtab; rtab = NULL; cl->refcnt = 1; - if (!(cl->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, classid))) + cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); + if (!cl->q) cl->q = &noop_qdisc; cl->common.classid = classid; cl->tparent = parent; diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index b74046a..aa8b531 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -110,7 +110,7 @@ static int drr_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->refcnt = 1; cl->common.classid = classid; cl->quantum = quantum; - cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; @@ -218,7 +218,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg, struct drr_class *cl = (struct drr_class *)arg; if (new == NULL) { - new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid); if (new == NULL) new = &noop_qdisc; diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 63d41f8..1d295d6 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -61,8 +61,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg, sch, p, new, old); if (new == NULL) { - new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (new == NULL) new = &noop_qdisc; @@ -384,8 +383,7 @@ static int dsmark_init(struct Qdisc *sch, struct nlattr *opt) p->default_index = default_index; p->set_tc_index = nla_get_flag(tb[TCA_DSMARK_SET_TC_INDEX]); - p->q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, sch->handle); + p->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (p->q == NULL) p->q = &noop_qdisc; diff --git a/net/sched/sch_fifo.c b/net/sched/sch_fifo.c index 5948baf..4dfecb0 100644 --- a/net/sched/sch_fifo.c +++ b/net/sched/sch_fifo.c @@ -172,8 +172,7 @@ struct Qdisc *fifo_create_dflt(struct Qdisc *sch, struct Qdisc_ops *ops, struct Qdisc *q; int err = -ENOMEM; - q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - ops, TC_H_MAKE(sch->handle, 1)); + q = qdisc_create_dflt(sch->dev_queue, ops, TC_H_MAKE(sch->handle, 1)); if (q) { err = fifo_set_limit(q, limit); if (err < 0) { diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c index 0abcc49..5dbb3cd 100644 --- a/net/sched/sch_generic.c +++ b/net/sched/sch_generic.c @@ -576,10 +576,8 @@ errout: return ERR_PTR(err); } -struct Qdisc * qdisc_create_dflt(struct net_device *dev, - struct netdev_queue *dev_queue, - struct Qdisc_ops *ops, - unsigned int parentid) +struct Qdisc *qdisc_create_dflt(struct netdev_queue *dev_queue, + struct Qdisc_ops *ops, unsigned int parentid) { struct Qdisc *sch; @@ -684,7 +682,7 @@ static void attach_one_default_qdisc(struct net_device *dev, struct Qdisc *qdisc; if (dev->tx_queue_len) { - qdisc = qdisc_create_dflt(dev, dev_queue, + qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, TC_H_ROOT); if (!qdisc) { printk(KERN_INFO "%s: activation failed\n", dev->name); @@ -711,7 +709,7 @@ static void attach_default_qdiscs(struct net_device *dev) dev->qdisc = txq->qdisc_sleeping; atomic_inc(&dev->qdisc->refcnt); } else { - qdisc = qdisc_create_dflt(dev, txq, &mq_qdisc_ops, TC_H_ROOT); + qdisc = qdisc_create_dflt(txq, &mq_qdisc_ops, TC_H_ROOT); if (qdisc) { qdisc->ops->attach(qdisc); dev->qdisc = qdisc; diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index 4749609..069c62b 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1088,7 +1088,7 @@ hfsc_change_class(struct Qdisc *sch, u32 classid, u32 parentid, cl->refcnt = 1; cl->sched = q; cl->cl_parent = parent; - cl->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + cl->qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); if (cl->qdisc == NULL) cl->qdisc = &noop_qdisc; @@ -1209,8 +1209,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (cl->level > 0) return -EINVAL; if (new == NULL) { - new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, + new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->cl_common.classid); if (new == NULL) new = &noop_qdisc; @@ -1452,8 +1451,7 @@ hfsc_init_qdisc(struct Qdisc *sch, struct nlattr *opt) q->root.cl_common.classid = sch->handle; q->root.refcnt = 1; q->root.sched = q; - q->root.qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, + q->root.qdisc = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, sch->handle); if (q->root.qdisc == NULL) q->root.qdisc = &noop_qdisc; diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 87d9449..01b519d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -1121,8 +1121,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new, if (cl->level) return -EINVAL; if (new == NULL && - (new = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, + (new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->common.classid)) == NULL) return -ENOBUFS; @@ -1247,8 +1246,7 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg) return -EBUSY; if (!cl->level && htb_parent_last_child(cl)) { - new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &pfifo_qdisc_ops, + new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, cl->parent->common.classid); last_child = 1; } @@ -1377,7 +1375,7 @@ static int htb_change_class(struct Qdisc *sch, u32 classid, /* create leaf qdisc early because it uses kmalloc(GFP_KERNEL) so that can't be used inside of sch_tree_lock -- thanks to Karlis Peisenieks */ - new_q = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + new_q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid); sch_tree_lock(sch); if (parent && !parent->level) { diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index fe91e50..ecc302f 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -56,7 +56,7 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt) for (ntx = 0; ntx < dev->num_tx_queues; ntx++) { dev_queue = netdev_get_tx_queue(dev, ntx); - qdisc = qdisc_create_dflt(dev, dev_queue, &pfifo_fast_ops, + qdisc = qdisc_create_dflt(dev_queue, &pfifo_fast_ops, TC_H_MAKE(TC_H_MAJ(sch->handle), TC_H_MIN(ntx + 1))); if (qdisc == NULL) diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index 6ae2512..32690de 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -227,8 +227,7 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt) for (i = 0; i < q->bands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; - child = qdisc_create_dflt(qdisc_dev(sch), - sch->dev_queue, + child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1)); diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 4714ff1..e5593c0 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -538,8 +538,7 @@ static int netem_init(struct Qdisc *sch, struct nlattr *opt) qdisc_watchdog_init(&q->watchdog, sch); - q->qdisc = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, - &tfifo_qdisc_ops, + q->qdisc = qdisc_create_dflt(sch->dev_queue, &tfifo_qdisc_ops, TC_H_MAKE(sch->handle, 1)); if (!q->qdisc) { pr_debug("netem: qdisc create failed\n"); diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 0748fb1..b1c95bc 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -200,7 +200,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt) for (i=0; ibands; i++) { if (q->queues[i] == &noop_qdisc) { struct Qdisc *child, *old; - child = qdisc_create_dflt(qdisc_dev(sch), sch->dev_queue, + child = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, TC_H_MAKE(sch->handle, i + 1)); if (child) { -- cgit v1.1 From 106e4c26b1529e559d1aae777f11b4f8f7bafc26 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 12:45:14 +0200 Subject: tproxy: kick out TIME_WAIT sockets in case a new connection comes in with the same tuple Without tproxy redirections an incoming SYN kicks out conflicting TIME_WAIT sockets, in order to handle clients that reuse ports within the TIME_WAIT period. The same mechanism didn't work in case TProxy is involved in finding the proper socket, as the time_wait processing code looked up the listening socket assuming that the listener addr/port matches those of the established connection. This is not the case with TProxy as the listener addr/port is possibly changed with the tproxy rule. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/nf_tproxy_core.c | 29 +++++++++++++----- net/netfilter/xt_TPROXY.c | 68 ++++++++++++++++++++++++++++++++++++++---- net/netfilter/xt_socket.c | 2 +- 3 files changed, 85 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index 5490fc3..8589e5e 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -22,21 +22,34 @@ struct sock * nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, const __be32 saddr, const __be32 daddr, const __be16 sport, const __be16 dport, - const struct net_device *in, bool listening_only) + const struct net_device *in, int lookup_type) { struct sock *sk; /* look up socket */ switch (protocol) { case IPPROTO_TCP: - if (listening_only) - sk = __inet_lookup_listener(net, &tcp_hashinfo, - daddr, ntohs(dport), - in->ifindex); - else + switch (lookup_type) { + case NFT_LOOKUP_ANY: sk = __inet_lookup(net, &tcp_hashinfo, saddr, sport, daddr, dport, in->ifindex); + break; + case NFT_LOOKUP_LISTENER: + sk = inet_lookup_listener(net, &tcp_hashinfo, + daddr, dport, + in->ifindex); + break; + case NFT_LOOKUP_ESTABLISHED: + sk = inet_lookup_established(net, &tcp_hashinfo, + saddr, sport, daddr, dport, + in->ifindex); + break; + default: + WARN_ON(1); + sk = NULL; + break; + } break; case IPPROTO_UDP: sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, @@ -47,8 +60,8 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, sk = NULL; } - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, listener only: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), listening_only, sk); + pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", + protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); return sk; } diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index 21bb2af..e0b6900 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -24,6 +24,57 @@ #include #include +/** + * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk) +{ + const struct iphdr *iph = ip_hdr(skb); + const struct xt_tproxy_target_info *tgi = par->targinfo; + struct tcphdr _hdr, *hp; + + hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, tgi->lport ? tgi->lport : hp->dest, + par->in, NFT_LOOKUP_LISTENER); + if (sk2) { + /* yeah, there's one, let's kill the TIME_WAIT + * socket and redirect to the listener + */ + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + static unsigned int tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) { @@ -37,11 +88,18 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) return NF_DROP; sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, - tgi->laddr ? tgi->laddr : iph->daddr, - hp->source, - tgi->lport ? tgi->lport : hp->dest, - par->in, true); + iph->saddr, iph->daddr, + hp->source, hp->dest, + par->in, NFT_LOOKUP_ESTABLISHED); + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + sk = tproxy_handle_time_wait(skb, par, sk); + else if (!sk) + sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, + iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, + hp->source, tgi->lport ? tgi->lport : hp->dest, + par->in, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ if (sk && nf_tproxy_assign_sock(skb, sk)) { diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 1ca8990..266faa0 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -142,7 +142,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, #endif sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), protocol, - saddr, daddr, sport, dport, par->in, false); + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); if (sk != NULL) { bool wildcard; bool transparent = true; -- cgit v1.1 From 6006db84a91838813cdad8a6622a4e39efe9ea47 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 12:47:34 +0200 Subject: tproxy: add lookup type checks for UDP in nf_tproxy_get_sock_v4() Also, inline this function as the lookup_type is always a literal and inlining removes branches performed at runtime. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/nf_tproxy_core.c | 48 ------------------------------------------ 1 file changed, 48 deletions(-) (limited to 'net') diff --git a/net/netfilter/nf_tproxy_core.c b/net/netfilter/nf_tproxy_core.c index 8589e5e..db65563 100644 --- a/net/netfilter/nf_tproxy_core.c +++ b/net/netfilter/nf_tproxy_core.c @@ -18,54 +18,6 @@ #include #include -struct sock * -nf_tproxy_get_sock_v4(struct net *net, const u8 protocol, - const __be32 saddr, const __be32 daddr, - const __be16 sport, const __be16 dport, - const struct net_device *in, int lookup_type) -{ - struct sock *sk; - - /* look up socket */ - switch (protocol) { - case IPPROTO_TCP: - switch (lookup_type) { - case NFT_LOOKUP_ANY: - sk = __inet_lookup(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, - daddr, dport, - in->ifindex); - break; - case NFT_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - break; - } - break; - case IPPROTO_UDP: - sk = udp4_lib_lookup(net, saddr, sport, daddr, dport, - in->ifindex); - break; - default: - WARN_ON(1); - sk = NULL; - } - - pr_debug("tproxy socket lookup: proto %u %08x:%u -> %08x:%u, lookup type: %d, sock %p\n", - protocol, ntohl(saddr), ntohs(sport), ntohl(daddr), ntohs(dport), lookup_type, sk); - - return sk; -} -EXPORT_SYMBOL_GPL(nf_tproxy_get_sock_v4); static void nf_tproxy_destructor(struct sk_buff *skb) -- cgit v1.1 From d2ed817766987fd05e69b7da65d4861b38f1aa2a Mon Sep 17 00:00:00 2001 From: Ben Greear Date: Thu, 21 Oct 2010 04:06:29 -0700 Subject: net/core: Allow tagged VLAN packets to flow through VETH devices. When there are VLANs on a VETH device, the packets being transmitted through the VETH device may be 4 bytes bigger than MTU. A check in dev_forward_skb did not take this into account and so dropped these packets. This patch is needed at least as far back as 2.6.34.7 and should be considered for -stable. Signed-off-by: Ben Greear Signed-off-by: David S. Miller --- net/core/dev.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 660dd41..8e07109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1485,7 +1485,7 @@ int dev_forward_skb(struct net_device *dev, struct sk_buff *skb) nf_reset(skb); if (!(dev->flags & IFF_UP) || - (skb->len > (dev->mtu + dev->hard_header_len))) { + (skb->len > (dev->mtu + dev->hard_header_len + VLAN_HLEN))) { kfree_skb(skb); return NET_RX_DROP; } -- cgit v1.1 From 093d282321daeb19c107e5f1f16d7f68484f3ade Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 13:06:43 +0200 Subject: tproxy: fix hash locking issue when using port redirection in __inet_inherit_port() When __inet_inherit_port() is called on a tproxy connection the wrong locks are held for the inet_bind_bucket it is added to. __inet_inherit_port() made an implicit assumption that the listener's port number (and thus its bind bucket). Unfortunately, if you're using the TPROXY target to redirect skbs to a transparent proxy that assumption is not true anymore and things break. This patch adds code to __inet_inherit_port() so that it can handle this case by looking up or creating a new bind bucket for the child socket and updates callers of __inet_inherit_port() to gracefully handle __inet_inherit_port() failing. Reported by and original patch from Stephen Buck . See http://marc.info/?t=128169268200001&r=1&w=2 for the original discussion. Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/dccp/ipv4.c | 10 +++++++--- net/dccp/ipv6.c | 10 +++++++--- net/ipv4/inet_hashtables.c | 28 ++++++++++++++++++++++++++-- net/ipv4/tcp_ipv4.c | 10 +++++++--- net/ipv6/tcp_ipv6.c | 12 ++++++++---- 5 files changed, 55 insertions(+), 15 deletions(-) (limited to 'net') diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index d4a166f..3f69ea1 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -392,7 +392,7 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto exit; + goto exit_nonewsk; sk_setup_caps(newsk, dst); @@ -409,16 +409,20 @@ struct sock *dccp_v4_request_recv_sock(struct sock *sk, struct sk_buff *skb, dccp_sync_mss(newsk, dst_mtu(dst)); + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto exit; + } __inet_hash_nolisten(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - dst_release(dst); return NULL; } diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 6e3f325..dca711d 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -564,7 +564,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newsk = dccp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -632,18 +632,22 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, newinet->inet_daddr = newinet->inet_saddr = LOOPBACK4_IPV6; newinet->inet_rcv_saddr = LOOPBACK4_IPV6; + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } __inet6_hash(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +out_nonewsk: + dst_release(dst); out: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); if (opt != NULL && opt != np->opt) sock_kfree_s(sk, opt, opt->tot_len); - dst_release(dst); return NULL; } diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index fb7ad5a..1b344f3 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -101,19 +101,43 @@ void inet_put_port(struct sock *sk) } EXPORT_SYMBOL(inet_put_port); -void __inet_inherit_port(struct sock *sk, struct sock *child) +int __inet_inherit_port(struct sock *sk, struct sock *child) { struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(child)->inet_num, + unsigned short port = inet_sk(child)->inet_num; + const int bhash = inet_bhashfn(sock_net(sk), port, table->bhash_size); struct inet_bind_hashbucket *head = &table->bhash[bhash]; struct inet_bind_bucket *tb; spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; + if (tb->port != port) { + /* NOTE: using tproxy and redirecting skbs to a proxy + * on a different listener port breaks the assumption + * that the listener socket's icsk_bind_hash is the same + * as that of the child socket. We have to look up or + * create a new bind bucket for the child here. */ + struct hlist_node *node; + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (net_eq(ib_net(tb), sock_net(sk)) && + tb->port == port) + break; + } + if (!node) { + tb = inet_bind_bucket_create(table->bind_bucket_cachep, + sock_net(sk), head, port); + if (!tb) { + spin_unlock(&head->lock); + return -ENOMEM; + } + } + } sk_add_bind_node(child, &tb->owners); inet_csk(child)->icsk_bind_hash = tb; spin_unlock(&head->lock); + + return 0; } EXPORT_SYMBOL_GPL(__inet_inherit_port); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index a0232f3..8f8527d 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1422,7 +1422,7 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = tcp_create_openreq_child(sk, req, skb); if (!newsk) - goto exit; + goto exit_nonewsk; newsk->sk_gso_type = SKB_GSO_TCPV4; sk_setup_caps(newsk, dst); @@ -1469,16 +1469,20 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto exit; + } __inet_hash_nolisten(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; exit_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); +exit_nonewsk: + dst_release(dst); exit: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); - dst_release(dst); return NULL; } EXPORT_SYMBOL(tcp_v4_syn_recv_sock); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index fe6d404..ba5258ef 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1409,7 +1409,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newsk = tcp_create_openreq_child(sk, req, skb); if (newsk == NULL) - goto out; + goto out_nonewsk; /* * No need to charge this sock to the relevant IPv6 refcnt debug socks @@ -1497,18 +1497,22 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, } #endif + if (__inet_inherit_port(sk, newsk) < 0) { + sock_put(newsk); + goto out; + } __inet6_hash(newsk, NULL); - __inet_inherit_port(sk, newsk); return newsk; out_overflow: NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS); -out: - NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); +out_nonewsk: if (opt && opt != np->opt) sock_kfree_s(sk, opt, opt->tot_len); dst_release(dst); +out: + NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_LISTENDROPS); return NULL; } -- cgit v1.1 From 8c974438085d2c81b006daeaab8801eedbd19758 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 21 Oct 2010 01:06:15 +0000 Subject: Revert c6537d6742985da1fbf12ae26cde6a096fd35b5c Backout the tipc changes to the flags int he subscription message. These changees, while reasonable on the surface, interefere with user space ABI compatibility which is a no-no. This was part of the changes to fix the endianess issues in the TIPC protocol, which would be really nice to do but we need to do so in a way that is backwards compatible with user space. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/tipc/subscr.c | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index ab6eab4..ff123e5 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -274,7 +274,7 @@ static void subscr_cancel(struct tipc_subscr *s, { struct subscription *sub; struct subscription *sub_temp; - __u32 type, lower, upper, timeout, filter; + __u32 type, lower, upper; int found = 0; /* Find first matching subscription, exit if not found */ @@ -282,18 +282,12 @@ static void subscr_cancel(struct tipc_subscr *s, type = ntohl(s->seq.type); lower = ntohl(s->seq.lower); upper = ntohl(s->seq.upper); - timeout = ntohl(s->timeout); - filter = ntohl(s->filter) & ~TIPC_SUB_CANCEL; list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, subscription_list) { if ((type == sub->seq.type) && (lower == sub->seq.lower) && - (upper == sub->seq.upper) && - (timeout == sub->timeout) && - (filter == sub->filter) && - !memcmp(s->usr_handle,sub->evt.s.usr_handle, - sizeof(s->usr_handle)) ){ + (upper == sub->seq.upper)) { found = 1; break; } @@ -310,7 +304,7 @@ static void subscr_cancel(struct tipc_subscr *s, k_term_timer(&sub->timer); spin_lock_bh(subscriber->lock); } - dbg("Cancel: removing sub %u,%u,%u from subscriber %p list\n", + dbg("Cancel: removing sub %u,%u,%u from subscriber %x list\n", sub->seq.type, sub->seq.lower, sub->seq.upper, subscriber); subscr_del(sub); } @@ -358,7 +352,8 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s, sub->seq.upper = ntohl(s->seq.upper); sub->timeout = ntohl(s->timeout); sub->filter = ntohl(s->filter); - if ((sub->filter && (sub->filter != TIPC_SUB_PORTS)) || + if ((!(sub->filter & TIPC_SUB_PORTS) == + !(sub->filter & TIPC_SUB_SERVICE)) || (sub->seq.lower > sub->seq.upper)) { warn("Subscription rejected, illegal request\n"); kfree(sub); -- cgit v1.1 From db5a753bf198ef7a50e17d2ff358adf37efe8648 Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Thu, 21 Oct 2010 01:06:16 +0000 Subject: Revert d88dca79d3852a3623f606f781e013d61486828a TIPC needs to have its endianess issues fixed. Unfortunately, the format of a subscriber message is passed in directly from user space, so requiring this message to be in network byte order breaks user space ABI. Revert this change until such time as we can determine how to do this in a backwards compatible manner. Signed-off-by: Neil Horman Signed-off-by: David S. Miller --- net/tipc/subscr.c | 57 ++++++++++++++++++++++++++++++++++--------------------- net/tipc/subscr.h | 2 ++ 2 files changed, 37 insertions(+), 22 deletions(-) (limited to 'net') diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c index ff123e5..ac91f0d 100644 --- a/net/tipc/subscr.c +++ b/net/tipc/subscr.c @@ -76,6 +76,19 @@ struct top_srv { static struct top_srv topsrv = { 0 }; /** + * htohl - convert value to endianness used by destination + * @in: value to convert + * @swap: non-zero if endianness must be reversed + * + * Returns converted value + */ + +static u32 htohl(u32 in, int swap) +{ + return swap ? swab32(in) : in; +} + +/** * subscr_send_event - send a message containing a tipc_event to the subscriber * * Note: Must not hold subscriber's server port lock, since tipc_send() will @@ -94,11 +107,11 @@ static void subscr_send_event(struct subscription *sub, msg_sect.iov_base = (void *)&sub->evt; msg_sect.iov_len = sizeof(struct tipc_event); - sub->evt.event = htonl(event); - sub->evt.found_lower = htonl(found_lower); - sub->evt.found_upper = htonl(found_upper); - sub->evt.port.ref = htonl(port_ref); - sub->evt.port.node = htonl(node); + sub->evt.event = htohl(event, sub->swap); + sub->evt.found_lower = htohl(found_lower, sub->swap); + sub->evt.found_upper = htohl(found_upper, sub->swap); + sub->evt.port.ref = htohl(port_ref, sub->swap); + sub->evt.port.node = htohl(node, sub->swap); tipc_send(sub->server_ref, 1, &msg_sect); } @@ -274,23 +287,16 @@ static void subscr_cancel(struct tipc_subscr *s, { struct subscription *sub; struct subscription *sub_temp; - __u32 type, lower, upper; int found = 0; /* Find first matching subscription, exit if not found */ - type = ntohl(s->seq.type); - lower = ntohl(s->seq.lower); - upper = ntohl(s->seq.upper); - list_for_each_entry_safe(sub, sub_temp, &subscriber->subscription_list, subscription_list) { - if ((type == sub->seq.type) && - (lower == sub->seq.lower) && - (upper == sub->seq.upper)) { - found = 1; - break; - } + if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) { + found = 1; + break; + } } if (!found) return; @@ -319,10 +325,16 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s, struct subscriber *subscriber) { struct subscription *sub; + int swap; + + /* Determine subscriber's endianness */ + + swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE)); /* Detect & process a subscription cancellation request */ - if (ntohl(s->filter) & TIPC_SUB_CANCEL) { + if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) { + s->filter &= ~htohl(TIPC_SUB_CANCEL, swap); subscr_cancel(s, subscriber); return NULL; } @@ -347,11 +359,11 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s, /* Initialize subscription object */ - sub->seq.type = ntohl(s->seq.type); - sub->seq.lower = ntohl(s->seq.lower); - sub->seq.upper = ntohl(s->seq.upper); - sub->timeout = ntohl(s->timeout); - sub->filter = ntohl(s->filter); + sub->seq.type = htohl(s->seq.type, swap); + sub->seq.lower = htohl(s->seq.lower, swap); + sub->seq.upper = htohl(s->seq.upper, swap); + sub->timeout = htohl(s->timeout, swap); + sub->filter = htohl(s->filter, swap); if ((!(sub->filter & TIPC_SUB_PORTS) == !(sub->filter & TIPC_SUB_SERVICE)) || (sub->seq.lower > sub->seq.upper)) { @@ -364,6 +376,7 @@ static struct subscription *subscr_subscribe(struct tipc_subscr *s, INIT_LIST_HEAD(&sub->nameseq_list); list_add(&sub->subscription_list, &subscriber->subscription_list); sub->server_ref = subscriber->port_ref; + sub->swap = swap; memcpy(&sub->evt.s, s, sizeof(struct tipc_subscr)); atomic_inc(&topsrv.subscription_count); if (sub->timeout != TIPC_WAIT_FOREVER) { diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h index c20f496..45d89bf 100644 --- a/net/tipc/subscr.h +++ b/net/tipc/subscr.h @@ -53,6 +53,7 @@ typedef void (*tipc_subscr_event) (struct subscription *sub, * @nameseq_list: adjacent subscriptions in name sequence's subscription list * @subscription_list: adjacent subscriptions in subscriber's subscription list * @server_ref: object reference of server port associated with subscription + * @swap: indicates if subscriber uses opposite endianness in its messages * @evt: template for events generated by subscription */ @@ -65,6 +66,7 @@ struct subscription { struct list_head nameseq_list; struct list_head subscription_list; u32 server_ref; + int swap; struct tipc_event evt; }; -- cgit v1.1 From a5c30b349b872aa2ac13babbd5ceb26737f17e95 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Tue, 19 Oct 2010 06:04:42 +0000 Subject: net/neighbour: cancel_delayed_work() + flush_scheduled_work() -> cancel_delayed_work_sync() flush_scheduled_work() is going away. Prepare for it. Signed-off-by: Tejun Heo Signed-off-by: David S. Miller --- net/core/neighbour.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/neighbour.c b/net/core/neighbour.c index b165b96..8cc8f9a 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -1588,8 +1588,7 @@ int neigh_table_clear(struct neigh_table *tbl) struct neigh_table **tp; /* It is not clean... Fix it to unload IPv6 module safely */ - cancel_delayed_work(&tbl->gc_work); - flush_scheduled_work(); + cancel_delayed_work_sync(&tbl->gc_work); del_timer_sync(&tbl->proxy_timer); pneigh_queue_purge(&tbl->proxy_queue); neigh_ifdown(tbl, NULL); -- cgit v1.1 From 1e253c3b8a1aeed51eef6fc366812f219b97de65 Mon Sep 17 00:00:00 2001 From: Benjamin Poirier Date: Mon, 18 Oct 2010 16:09:35 +0000 Subject: bridge: Forward reserved group addresses if !STP Make all frames sent to reserved group MAC addresses (01:80:c2:00:00:00 to 01:80:c2:00:00:0f) be forwarded if STP is disabled. This enables forwarding EAPOL frames, among other things. Signed-off-by: Benjamin Poirier Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_input.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net') diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index 6d04cfd..25207a1 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -159,7 +159,7 @@ struct sk_buff *br_handle_frame(struct sk_buff *skb) goto drop; /* If STP is turned off, then forward */ - if (p->br->stp_enabled == BR_NO_STP && dest[5] == 0) + if (p->br->stp_enabled == BR_NO_STP) goto forward; if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, skb, skb->dev, -- cgit v1.1 From a0e00369f1e1ff9142a20efe4785890e52b2e525 Mon Sep 17 00:00:00 2001 From: Allan Stephens Date: Mon, 18 Oct 2010 11:43:56 +0000 Subject: tipc: delete needless memset from bearer enabling. Eliminates zeroing out of the new bearer structure at the start of activation, since it is already in that state. Signed-off-by: Allan Stephens Signed-off-by: Paul Gortmaker Acked-by: Neil Horman Signed-off-by: David S. Miller --- net/tipc/bearer.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'net') diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c index fd9c06c..9927d1d 100644 --- a/net/tipc/bearer.c +++ b/net/tipc/bearer.c @@ -556,8 +556,6 @@ restart: } b_ptr = &tipc_bearers[bearer_id]; - memset(b_ptr, 0, sizeof(struct bearer)); - strcpy(b_ptr->publ.name, name); res = m_ptr->enable_bearer(&b_ptr->publ); if (res) { -- cgit v1.1 From d0c2b0d265a0f1f92922a99a31def9da582197ac Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 19 Oct 2010 07:12:10 +0000 Subject: napi: unexport napi_reuse_skb The function napi_reuse_skb is only used inside core. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/core/dev.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net') diff --git a/net/core/dev.c b/net/core/dev.c index 97fd6bc..b2269ac 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3301,7 +3301,7 @@ gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb) } EXPORT_SYMBOL(napi_gro_receive); -void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) +static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) { __skb_pull(skb, skb_headlen(skb)); skb_reserve(skb, NET_IP_ALIGN - skb_headroom(skb)); @@ -3309,7 +3309,6 @@ void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) napi->skb = skb; } -EXPORT_SYMBOL(napi_reuse_skb); struct sk_buff *napi_get_frags(struct napi_struct *napi) { -- cgit v1.1 From ff51bf841587c75b58d25ed77263158619784dd3 Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 19 Oct 2010 08:08:33 +0000 Subject: rds: make local functions/variables static The RDS protocol has lots of functions that should be declared static. rds_message_get/add_version_extension is removed since it defined but never used. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/rds/connection.c | 2 +- net/rds/ib.c | 10 +++++----- net/rds/ib.h | 4 ---- net/rds/ib_rdma.c | 2 +- net/rds/ib_sysctl.c | 2 +- net/rds/iw.c | 4 ++-- net/rds/iw.h | 4 ---- net/rds/iw_rdma.c | 3 ++- net/rds/iw_sysctl.c | 2 +- net/rds/message.c | 24 ++---------------------- net/rds/page.c | 3 ++- net/rds/rdma_transport.c | 4 ++-- net/rds/rdma_transport.h | 4 ---- net/rds/rds.h | 5 ----- net/rds/recv.c | 3 +-- net/rds/send.c | 4 +++- net/rds/tcp.c | 6 +++--- net/rds/tcp.h | 2 -- net/rds/tcp_recv.c | 3 ++- net/rds/tcp_send.c | 2 +- 20 files changed, 29 insertions(+), 64 deletions(-) (limited to 'net') diff --git a/net/rds/connection.c b/net/rds/connection.c index 870992e..9334d89 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -88,7 +88,7 @@ static struct rds_connection *rds_conn_lookup(struct hlist_head *head, * and receiving over this connection again in the future. It is up to * the transport to have serialized this call with its send and recv. */ -void rds_conn_reset(struct rds_connection *conn) +static void rds_conn_reset(struct rds_connection *conn) { rdsdebug("connection %pI4 to %pI4 reset\n", &conn->c_laddr, &conn->c_faddr); diff --git a/net/rds/ib.c b/net/rds/ib.c index b12a395..4123967 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -42,7 +42,7 @@ #include "rds.h" #include "ib.h" -unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; +static unsigned int fmr_pool_size = RDS_FMR_POOL_SIZE; unsigned int fmr_message_size = RDS_FMR_SIZE + 1; /* +1 allows for unaligned MRs */ unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT; @@ -65,7 +65,7 @@ struct list_head rds_ib_devices; DEFINE_SPINLOCK(ib_nodev_conns_lock); LIST_HEAD(ib_nodev_conns); -void rds_ib_nodev_connect(void) +static void rds_ib_nodev_connect(void) { struct rds_ib_connection *ic; @@ -75,7 +75,7 @@ void rds_ib_nodev_connect(void) spin_unlock(&ib_nodev_conns_lock); } -void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) +static void rds_ib_dev_shutdown(struct rds_ib_device *rds_ibdev) { struct rds_ib_connection *ic; unsigned long flags; @@ -118,7 +118,7 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) queue_work(rds_wq, &rds_ibdev->free_work); } -void rds_ib_add_one(struct ib_device *device) +static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; struct ib_device_attr *dev_attr; @@ -229,7 +229,7 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device) * * This can be called at any time and can be racing with any other RDS path. */ -void rds_ib_remove_one(struct ib_device *device) +static void rds_ib_remove_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; diff --git a/net/rds/ib.h b/net/rds/ib.h index 7ad3d57..e34ad03 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -265,13 +265,10 @@ static inline void rds_ib_dma_sync_sg_for_device(struct ib_device *dev, /* ib.c */ extern struct rds_transport rds_ib_transport; -extern void rds_ib_add_one(struct ib_device *device); -extern void rds_ib_remove_one(struct ib_device *device); struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device); void rds_ib_dev_put(struct rds_ib_device *rds_ibdev); extern struct ib_client rds_ib_client; -extern unsigned int fmr_pool_size; extern unsigned int fmr_message_size; extern unsigned int rds_ib_retry_count; @@ -374,6 +371,5 @@ extern unsigned long rds_ib_sysctl_max_unsig_wrs; extern unsigned long rds_ib_sysctl_max_unsig_bytes; extern unsigned long rds_ib_sysctl_max_recv_allocation; extern unsigned int rds_ib_sysctl_flow_control; -extern ctl_table rds_ib_sysctl_table[]; #endif diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index b5a8841..18a833c 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -38,7 +38,7 @@ #include "ib.h" #include "xlist.h" -struct workqueue_struct *rds_ib_fmr_wq; +static struct workqueue_struct *rds_ib_fmr_wq; static DEFINE_PER_CPU(unsigned long, clean_list_grace); #define CLEAN_LIST_BUSY_BIT 0 diff --git a/net/rds/ib_sysctl.c b/net/rds/ib_sysctl.c index fc3da37..1253b00 100644 --- a/net/rds/ib_sysctl.c +++ b/net/rds/ib_sysctl.c @@ -61,7 +61,7 @@ static unsigned long rds_ib_sysctl_max_unsig_wr_max = 64; */ unsigned int rds_ib_sysctl_flow_control = 0; -ctl_table rds_ib_sysctl_table[] = { +static ctl_table rds_ib_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_ib_sysctl_max_send_wr, diff --git a/net/rds/iw.c b/net/rds/iw.c index 56808ca..5a9676f 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -56,7 +56,7 @@ struct list_head rds_iw_devices; DEFINE_SPINLOCK(iw_nodev_conns_lock); LIST_HEAD(iw_nodev_conns); -void rds_iw_add_one(struct ib_device *device) +static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct ib_device_attr *dev_attr; @@ -124,7 +124,7 @@ free_attr: kfree(dev_attr); } -void rds_iw_remove_one(struct ib_device *device) +static void rds_iw_remove_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; struct rds_iw_cm_id *i_cm_id, *next; diff --git a/net/rds/iw.h b/net/rds/iw.h index 543e665..9015192 100644 --- a/net/rds/iw.h +++ b/net/rds/iw.h @@ -268,8 +268,6 @@ static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic) /* ib.c */ extern struct rds_transport rds_iw_transport; -extern void rds_iw_add_one(struct ib_device *device); -extern void rds_iw_remove_one(struct ib_device *device); extern struct ib_client rds_iw_client; extern unsigned int fastreg_pool_size; @@ -318,7 +316,6 @@ void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents, void rds_iw_sync_mr(void *trans_private, int dir); void rds_iw_free_mr(void *trans_private, int invalidate); void rds_iw_flush_mrs(void); -void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id); /* ib_recv.c */ int rds_iw_recv_init(void); @@ -378,7 +375,6 @@ extern unsigned long rds_iw_sysctl_max_unsig_wrs; extern unsigned long rds_iw_sysctl_max_unsig_bytes; extern unsigned long rds_iw_sysctl_max_recv_allocation; extern unsigned int rds_iw_sysctl_flow_control; -extern ctl_table rds_iw_sysctl_table[]; /* * Helper functions for getting/setting the header and data SGEs in diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c index 0e7accc..59509e9 100644 --- a/net/rds/iw_rdma.c +++ b/net/rds/iw_rdma.c @@ -157,7 +157,8 @@ static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id * return 0; } -void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id) +static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev, + struct rdma_cm_id *cm_id) { struct rds_iw_cm_id *i_cm_id; diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c index 23e3a9a..e2e4717 100644 --- a/net/rds/iw_sysctl.c +++ b/net/rds/iw_sysctl.c @@ -55,7 +55,7 @@ static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL; unsigned int rds_iw_sysctl_flow_control = 1; -ctl_table rds_iw_sysctl_table[] = { +static ctl_table rds_iw_sysctl_table[] = { { .procname = "max_send_wr", .data = &rds_iw_sysctl_max_send_wr, diff --git a/net/rds/message.c b/net/rds/message.c index 84f937f..a84545d 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -106,8 +106,8 @@ void rds_message_populate_header(struct rds_header *hdr, __be16 sport, } EXPORT_SYMBOL_GPL(rds_message_populate_header); -int rds_message_add_extension(struct rds_header *hdr, - unsigned int type, const void *data, unsigned int len) +int rds_message_add_extension(struct rds_header *hdr, unsigned int type, + const void *data, unsigned int len) { unsigned int ext_len = sizeof(u8) + len; unsigned char *dst; @@ -177,26 +177,6 @@ none: return RDS_EXTHDR_NONE; } -int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version) -{ - struct rds_ext_header_version ext_hdr; - - ext_hdr.h_version = cpu_to_be32(version); - return rds_message_add_extension(hdr, RDS_EXTHDR_VERSION, &ext_hdr, sizeof(ext_hdr)); -} - -int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version) -{ - struct rds_ext_header_version ext_hdr; - unsigned int pos = 0, len = sizeof(ext_hdr); - - /* We assume the version extension is the only one present */ - if (rds_message_next_extension(hdr, &pos, &ext_hdr, &len) != RDS_EXTHDR_VERSION) - return 0; - *version = be32_to_cpu(ext_hdr.h_version); - return 1; -} - int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset) { struct rds_ext_header_rdma_dest ext_hdr; diff --git a/net/rds/page.c b/net/rds/page.c index 5e44f5ae..a3e2e0a 100644 --- a/net/rds/page.c +++ b/net/rds/page.c @@ -40,7 +40,8 @@ struct rds_page_remainder { unsigned long r_offset; }; -DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders); +static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, + rds_page_remainders); /* * returns 0 on success or -errno on failure. diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index e6ed10a..4195a05 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -207,7 +207,7 @@ static void rds_rdma_listen_stop(void) } } -int rds_rdma_init(void) +static int rds_rdma_init(void) { int ret; @@ -234,7 +234,7 @@ out: } module_init(rds_rdma_init); -void rds_rdma_exit(void) +static void rds_rdma_exit(void) { /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h index 2f2c7d9..faba4e3 100644 --- a/net/rds/rdma_transport.h +++ b/net/rds/rdma_transport.h @@ -11,10 +11,6 @@ int rds_rdma_conn_connect(struct rds_connection *conn); int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id, struct rdma_cm_event *event); -/* from rdma_transport.c */ -int rds_rdma_init(void); -void rds_rdma_exit(void); - /* from ib.c */ extern struct rds_transport rds_ib_transport; int rds_ib_init(void); diff --git a/net/rds/rds.h b/net/rds/rds.h index 8103dcf..9542449 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -619,7 +619,6 @@ struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp); void rds_conn_shutdown(struct rds_connection *conn); void rds_conn_destroy(struct rds_connection *conn); -void rds_conn_reset(struct rds_connection *conn); void rds_conn_drop(struct rds_connection *conn); void rds_conn_connect_if_down(struct rds_connection *conn); void rds_for_each_conn_info(struct socket *sock, unsigned int len, @@ -668,8 +667,6 @@ int rds_message_add_extension(struct rds_header *hdr, unsigned int type, const void *data, unsigned int len); int rds_message_next_extension(struct rds_header *hdr, unsigned int *pos, void *buf, unsigned int *buflen); -int rds_message_add_version_extension(struct rds_header *hdr, unsigned int version); -int rds_message_get_version_extension(struct rds_header *hdr, unsigned int *version); int rds_message_add_rdma_dest_extension(struct rds_header *hdr, u32 r_key, u32 offset); int rds_message_inc_copy_to_user(struct rds_incoming *inc, struct iovec *first_iov, size_t size); @@ -706,7 +703,6 @@ void rds_page_exit(void); /* recv.c */ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, __be32 saddr); -void rds_inc_addref(struct rds_incoming *inc); void rds_inc_put(struct rds_incoming *inc); void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr, struct rds_incoming *inc, gfp_t gfp, enum km_type km); @@ -728,7 +724,6 @@ void rds_send_drop_to(struct rds_sock *rs, struct sockaddr_in *dest); typedef int (*is_acked_func)(struct rds_message *rm, uint64_t ack); void rds_send_drop_acked(struct rds_connection *conn, u64 ack, is_acked_func is_acked); -void rds_send_remove_from_sock(struct list_head *messages, int status); int rds_send_pong(struct rds_connection *conn, __be16 dport); struct rds_message *rds_send_get_message(struct rds_connection *, struct rm_rdma_op *); diff --git a/net/rds/recv.c b/net/rds/recv.c index 68800f0..596689e 100644 --- a/net/rds/recv.c +++ b/net/rds/recv.c @@ -48,12 +48,11 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn, } EXPORT_SYMBOL_GPL(rds_inc_init); -void rds_inc_addref(struct rds_incoming *inc) +static void rds_inc_addref(struct rds_incoming *inc) { rdsdebug("addref inc %p ref %d\n", inc, atomic_read(&inc->i_refcount)); atomic_inc(&inc->i_refcount); } -EXPORT_SYMBOL_GPL(rds_inc_addref); void rds_inc_put(struct rds_incoming *inc) { diff --git a/net/rds/send.c b/net/rds/send.c index 9b951a0..0bc9db1 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -52,6 +52,8 @@ static int send_batch_count = 64; module_param(send_batch_count, int, 0444); MODULE_PARM_DESC(send_batch_count, " batch factor when working the send queue"); +static void rds_send_remove_from_sock(struct list_head *messages, int status); + /* * Reset the send state. Callers must ensure that this doesn't race with * rds_send_xmit(). @@ -555,7 +557,7 @@ EXPORT_SYMBOL_GPL(rds_send_get_message); * removing the messages from the 'messages' list regardless of if it found * the messages on the socket list or not. */ -void rds_send_remove_from_sock(struct list_head *messages, int status) +static void rds_send_remove_from_sock(struct list_head *messages, int status) { unsigned long flags; struct rds_sock *rs = NULL; diff --git a/net/rds/tcp.c b/net/rds/tcp.c index eeb08e6..08a8c6c 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -41,7 +41,7 @@ /* only for info exporting */ static DEFINE_SPINLOCK(rds_tcp_tc_list_lock); static LIST_HEAD(rds_tcp_tc_list); -unsigned int rds_tcp_tc_count; +static unsigned int rds_tcp_tc_count; /* Track rds_tcp_connection structs so they can be cleaned up */ static DEFINE_SPINLOCK(rds_tcp_conn_lock); @@ -243,7 +243,7 @@ static void rds_tcp_destroy_conns(void) } } -void rds_tcp_exit(void) +static void rds_tcp_exit(void) { rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info); rds_tcp_listen_stop(); @@ -274,7 +274,7 @@ struct rds_transport rds_tcp_transport = { .t_prefer_loopback = 1, }; -int rds_tcp_init(void) +static int rds_tcp_init(void) { int ret; diff --git a/net/rds/tcp.h b/net/rds/tcp.h index f5e6f7b..9cf2927 100644 --- a/net/rds/tcp.h +++ b/net/rds/tcp.h @@ -43,8 +43,6 @@ struct rds_tcp_statistics { }; /* tcp.c */ -int rds_tcp_init(void); -void rds_tcp_exit(void); void rds_tcp_tune(struct socket *sock); void rds_tcp_nonagle(struct socket *sock); void rds_tcp_set_callbacks(struct socket *sock, struct rds_connection *conn); diff --git a/net/rds/tcp_recv.c b/net/rds/tcp_recv.c index 67263fb..78205e2 100644 --- a/net/rds/tcp_recv.c +++ b/net/rds/tcp_recv.c @@ -272,7 +272,8 @@ out: } /* the caller has to hold the sock lock */ -int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, enum km_type km) +static int rds_tcp_read_sock(struct rds_connection *conn, gfp_t gfp, + enum km_type km) { struct rds_tcp_connection *tc = conn->c_transport_data; struct socket *sock = tc->t_sock; diff --git a/net/rds/tcp_send.c b/net/rds/tcp_send.c index aa16841..1b4fd68 100644 --- a/net/rds/tcp_send.c +++ b/net/rds/tcp_send.c @@ -63,7 +63,7 @@ void rds_tcp_xmit_complete(struct rds_connection *conn) } /* the core send_sem serializes this with other xmit and shutdown */ -int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) +static int rds_tcp_sendmsg(struct socket *sock, void *data, unsigned int len) { struct kvec vec = { .iov_base = data, -- cgit v1.1 From 32a875adcdcf5f470bf967250cfd01722e23844f Mon Sep 17 00:00:00 2001 From: stephen hemminger Date: Tue, 19 Oct 2010 06:48:16 +0000 Subject: 9p: client code cleanup Make p9_client_version static since only used in one file. Remove p9_client_auth because it is defined but never used. Compile tested only. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/9p/client.c | 51 +-------------------------------------------------- 1 file changed, 1 insertion(+), 50 deletions(-) (limited to 'net') diff --git a/net/9p/client.c b/net/9p/client.c index b5e1aa8..83bf0541 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -671,7 +671,7 @@ static void p9_fid_destroy(struct p9_fid *fid) kfree(fid); } -int p9_client_version(struct p9_client *c) +static int p9_client_version(struct p9_client *c) { int err = 0; struct p9_req_t *req; @@ -730,7 +730,6 @@ error: return err; } -EXPORT_SYMBOL(p9_client_version); struct p9_client *p9_client_create(const char *dev_name, char *options) { @@ -887,54 +886,6 @@ error: } EXPORT_SYMBOL(p9_client_attach); -struct p9_fid * -p9_client_auth(struct p9_client *clnt, char *uname, u32 n_uname, char *aname) -{ - int err; - struct p9_req_t *req; - struct p9_qid qid; - struct p9_fid *afid; - - P9_DPRINTK(P9_DEBUG_9P, ">>> TAUTH uname %s aname %s\n", uname, aname); - err = 0; - - afid = p9_fid_create(clnt); - if (IS_ERR(afid)) { - err = PTR_ERR(afid); - afid = NULL; - goto error; - } - - req = p9_client_rpc(clnt, P9_TAUTH, "dss?d", - afid ? afid->fid : P9_NOFID, uname, aname, n_uname); - if (IS_ERR(req)) { - err = PTR_ERR(req); - goto error; - } - - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); - if (err) { - p9pdu_dump(1, req->rc); - p9_free_req(clnt, req); - goto error; - } - - P9_DPRINTK(P9_DEBUG_9P, "<<< RAUTH qid %x.%llx.%x\n", - qid.type, - (unsigned long long)qid.path, - qid.version); - - memmove(&afid->qid, &qid, sizeof(struct p9_qid)); - p9_free_req(clnt, req); - return afid; - -error: - if (afid) - p9_fid_destroy(afid); - return ERR_PTR(err); -} -EXPORT_SYMBOL(p9_client_auth); - struct p9_fid *p9_client_walk(struct p9_fid *oldfid, int nwname, char **wnames, int clone) { -- cgit v1.1 From 1e55659ce6ddb5247cee0b1f720d77a799902b85 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Tue, 19 Oct 2010 09:32:04 +0000 Subject: can-raw: add msg_flags to distinguish local traffic CAN has no addressing scheme. It is currently impossible for userspace to tell is a received CAN frame comes from another process on the local host, or from a remote CAN device. This patch add support for userspace applications to distinguish between 'own', 'local' and 'remote' CAN traffic. The distinction is made by returning flags in msg->msg_flags in the call to recvmsg(). The added documentation explains the introduced flags. Signed-off-by: Kurt Van Dijck Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/can/raw.c | 33 ++++++++++++++++++++++++++++++--- 1 file changed, 30 insertions(+), 3 deletions(-) (limited to 'net') diff --git a/net/can/raw.c b/net/can/raw.c index 7d77e67..e88f610 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -90,23 +90,39 @@ struct raw_sock { can_err_mask_t err_mask; }; +/* + * Return pointer to store the extra msg flags for raw_recvmsg(). + * We use the space of one unsigned int beyond the 'struct sockaddr_can' + * in skb->cb. + */ +static inline unsigned int *raw_flags(struct sk_buff *skb) +{ + BUILD_BUG_ON(sizeof(skb->cb) <= (sizeof(struct sockaddr_can) + + sizeof(unsigned int))); + + /* return pointer after struct sockaddr_can */ + return (unsigned int *)(&((struct sockaddr_can *)skb->cb)[1]); +} + static inline struct raw_sock *raw_sk(const struct sock *sk) { return (struct raw_sock *)sk; } -static void raw_rcv(struct sk_buff *skb, void *data) +static void raw_rcv(struct sk_buff *oskb, void *data) { struct sock *sk = (struct sock *)data; struct raw_sock *ro = raw_sk(sk); struct sockaddr_can *addr; + struct sk_buff *skb; + unsigned int *pflags; /* check the received tx sock reference */ - if (!ro->recv_own_msgs && skb->sk == sk) + if (!ro->recv_own_msgs && oskb->sk == sk) return; /* clone the given skb to be able to enqueue it into the rcv queue */ - skb = skb_clone(skb, GFP_ATOMIC); + skb = skb_clone(oskb, GFP_ATOMIC); if (!skb) return; @@ -123,6 +139,14 @@ static void raw_rcv(struct sk_buff *skb, void *data) addr->can_family = AF_CAN; addr->can_ifindex = skb->dev->ifindex; + /* add CAN specific message flags for raw_recvmsg() */ + pflags = raw_flags(skb); + *pflags = 0; + if (oskb->sk) + *pflags |= MSG_DONTROUTE; + if (oskb->sk == sk) + *pflags |= MSG_CONFIRM; + if (sock_queue_rcv_skb(sk, skb) < 0) kfree_skb(skb); } @@ -707,6 +731,9 @@ static int raw_recvmsg(struct kiocb *iocb, struct socket *sock, memcpy(msg->msg_name, skb->cb, msg->msg_namelen); } + /* assign the flags that have been recorded in raw_rcv() */ + msg->msg_flags |= *(raw_flags(skb)); + skb_free_datagram(sk, skb); return size; -- cgit v1.1 From b0aeef30433ea6854e985c2e9842fa19f51b95cc Mon Sep 17 00:00:00 2001 From: Julian Anastasov Date: Mon, 11 Oct 2010 11:23:07 +0300 Subject: nf_nat: restrict ICMP translation for embedded header Skip ICMP translation of embedded protocol header if NAT bits are not set. Needed for IPVS to see the original embedded addresses because for IPVS traffic the IPS_SRC_NAT_BIT and IPS_DST_NAT_BIT bits are not set. It happens when IPVS performs DNAT for client packets after using nf_conntrack_alter_reply to expect replies from real server. Signed-off-by: Julian Anastasov Signed-off-by: Simon Horman --- net/ipv4/netfilter/nf_nat_core.c | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'net') diff --git a/net/ipv4/netfilter/nf_nat_core.c b/net/ipv4/netfilter/nf_nat_core.c index e2e00c4..0047923 100644 --- a/net/ipv4/netfilter/nf_nat_core.c +++ b/net/ipv4/netfilter/nf_nat_core.c @@ -462,6 +462,18 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, return 0; } + if (manip == IP_NAT_MANIP_SRC) + statusbit = IPS_SRC_NAT; + else + statusbit = IPS_DST_NAT; + + /* Invert if this is reply dir. */ + if (dir == IP_CT_DIR_REPLY) + statusbit ^= IPS_NAT_MASK; + + if (!(ct->status & statusbit)) + return 1; + pr_debug("icmp_reply_translation: translating error %p manip %u " "dir %s\n", skb, manip, dir == IP_CT_DIR_ORIGINAL ? "ORIG" : "REPLY"); @@ -496,20 +508,9 @@ int nf_nat_icmp_reply_translation(struct nf_conn *ct, /* Change outer to look the reply to an incoming packet * (proto 0 means don't invert per-proto part). */ - if (manip == IP_NAT_MANIP_SRC) - statusbit = IPS_SRC_NAT; - else - statusbit = IPS_DST_NAT; - - /* Invert if this is reply dir. */ - if (dir == IP_CT_DIR_REPLY) - statusbit ^= IPS_NAT_MASK; - - if (ct->status & statusbit) { - nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); - if (!manip_pkt(0, skb, 0, &target, manip)) - return 0; - } + nf_ct_invert_tuplepr(&target, &ct->tuplehash[!dir].tuple); + if (!manip_pkt(0, skb, 0, &target, manip)) + return 0; return 1; } -- cgit v1.1 From e83726b0460f4741c8f10e1488fbcc0a9e5f0b7e Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 21 Oct 2010 04:39:09 -0700 Subject: l2tp: small cleanup Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/l2tp/l2tp_ip.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net') diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c index 226a0ae..1c770c0 100644 --- a/net/l2tp/l2tp_ip.c +++ b/net/l2tp/l2tp_ip.c @@ -65,9 +65,7 @@ static struct sock *__l2tp_ip_bind_lookup(struct net *net, __be32 laddr, int dif continue; if ((l2tp->conn_id == tunnel_id) && -#ifdef CONFIG_NET_NS - (sk->sk_net == net) && -#endif + net_eq(sock_net(sk), net) && !(inet->inet_rcv_saddr && inet->inet_rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; -- cgit v1.1 From e97c3e278e951501c2f385de70c3ceacdea78c4a Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:03:43 +0200 Subject: tproxy: split off ipv6 defragmentation to a separate module Like with IPv4, TProxy needs IPv6 defragmentation but does not require connection tracking. Since defragmentation was coupled with conntrack, I split off the two, creating an nf_defrag_ipv6 module, similar to the already existing nf_defrag_ipv4. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/ipv6/netfilter/Makefile | 5 +- net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c | 78 +-------------- net/ipv6/netfilter/nf_conntrack_reasm.c | 14 ++- net/ipv6/netfilter/nf_defrag_ipv6_hooks.c | 131 +++++++++++++++++++++++++ 4 files changed, 150 insertions(+), 78 deletions(-) create mode 100644 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c (limited to 'net') diff --git a/net/ipv6/netfilter/Makefile b/net/ipv6/netfilter/Makefile index aafbba3..3f8e4a3 100644 --- a/net/ipv6/netfilter/Makefile +++ b/net/ipv6/netfilter/Makefile @@ -11,10 +11,11 @@ obj-$(CONFIG_IP6_NF_RAW) += ip6table_raw.o obj-$(CONFIG_IP6_NF_SECURITY) += ip6table_security.o # objects for l3 independent conntrack -nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o nf_conntrack_reasm.o +nf_conntrack_ipv6-objs := nf_conntrack_l3proto_ipv6.o nf_conntrack_proto_icmpv6.o +nf_defrag_ipv6-objs := nf_defrag_ipv6_hooks.o nf_conntrack_reasm.o # l3 independent conntrack -obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o +obj-$(CONFIG_NF_CONNTRACK_IPV6) += nf_conntrack_ipv6.o nf_defrag_ipv6.o # matches obj-$(CONFIG_IP6_NF_MATCH_AH) += ip6t_ah.o diff --git a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c index ff43461..c8af58b 100644 --- a/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c +++ b/net/ipv6/netfilter/nf_conntrack_l3proto_ipv6.c @@ -16,7 +16,6 @@ #include #include #include -#include #include #include @@ -29,6 +28,7 @@ #include #include #include +#include #include static bool ipv6_pkt_to_tuple(const struct sk_buff *skb, unsigned int nhoff, @@ -189,53 +189,6 @@ out: return nf_conntrack_confirm(skb); } -static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, - struct sk_buff *skb) -{ - u16 zone = NF_CT_DEFAULT_ZONE; - - if (skb->nfct) - zone = nf_ct_zone((struct nf_conn *)skb->nfct); - -#ifdef CONFIG_BRIDGE_NETFILTER - if (skb->nf_bridge && - skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) - return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; -#endif - if (hooknum == NF_INET_PRE_ROUTING) - return IP6_DEFRAG_CONNTRACK_IN + zone; - else - return IP6_DEFRAG_CONNTRACK_OUT + zone; - -} - -static unsigned int ipv6_defrag(unsigned int hooknum, - struct sk_buff *skb, - const struct net_device *in, - const struct net_device *out, - int (*okfn)(struct sk_buff *)) -{ - struct sk_buff *reasm; - - /* Previously seen (loopback)? */ - if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) - return NF_ACCEPT; - - reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); - /* queued */ - if (reasm == NULL) - return NF_STOLEN; - - /* error occured or not fragmented */ - if (reasm == skb) - return NF_ACCEPT; - - nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, - (struct net_device *)out, okfn); - - return NF_STOLEN; -} - static unsigned int __ipv6_conntrack_in(struct net *net, unsigned int hooknum, struct sk_buff *skb, @@ -288,13 +241,6 @@ static unsigned int ipv6_conntrack_local(unsigned int hooknum, static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { { - .hook = ipv6_defrag, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_PRE_ROUTING, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, - }, - { .hook = ipv6_conntrack_in, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, @@ -309,13 +255,6 @@ static struct nf_hook_ops ipv6_conntrack_ops[] __read_mostly = { .priority = NF_IP6_PRI_CONNTRACK, }, { - .hook = ipv6_defrag, - .owner = THIS_MODULE, - .pf = NFPROTO_IPV6, - .hooknum = NF_INET_LOCAL_OUT, - .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, - }, - { .hook = ipv6_confirm, .owner = THIS_MODULE, .pf = NFPROTO_IPV6, @@ -387,10 +326,6 @@ struct nf_conntrack_l3proto nf_conntrack_l3proto_ipv6 __read_mostly = { .nlattr_to_tuple = ipv6_nlattr_to_tuple, .nla_policy = ipv6_nla_policy, #endif -#ifdef CONFIG_SYSCTL - .ctl_table_path = nf_net_netfilter_sysctl_path, - .ctl_table = nf_ct_ipv6_sysctl_table, -#endif .me = THIS_MODULE, }; @@ -403,16 +338,12 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) int ret = 0; need_conntrack(); + nf_defrag_ipv6_enable(); - ret = nf_ct_frag6_init(); - if (ret < 0) { - pr_err("nf_conntrack_ipv6: can't initialize frag6.\n"); - return ret; - } ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_tcp6); if (ret < 0) { pr_err("nf_conntrack_ipv6: can't register tcp.\n"); - goto cleanup_frag6; + return ret; } ret = nf_conntrack_l4proto_register(&nf_conntrack_l4proto_udp6); @@ -450,8 +381,6 @@ static int __init nf_conntrack_l3proto_ipv6_init(void) nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); cleanup_tcp: nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - cleanup_frag6: - nf_ct_frag6_cleanup(); return ret; } @@ -463,7 +392,6 @@ static void __exit nf_conntrack_l3proto_ipv6_fini(void) nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_icmpv6); nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_udp6); nf_conntrack_l4proto_unregister(&nf_conntrack_l4proto_tcp6); - nf_ct_frag6_cleanup(); } module_init(nf_conntrack_l3proto_ipv6_init); diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index 138a8b3..489d71b 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -73,7 +73,7 @@ static struct inet_frags nf_frags; static struct netns_frags nf_init_frags; #ifdef CONFIG_SYSCTL -struct ctl_table nf_ct_ipv6_sysctl_table[] = { +struct ctl_table nf_ct_frag6_sysctl_table[] = { { .procname = "nf_conntrack_frag6_timeout", .data = &nf_init_frags.timeout, @@ -97,6 +97,8 @@ struct ctl_table nf_ct_ipv6_sysctl_table[] = { }, { } }; + +static struct ctl_table_header *nf_ct_frag6_sysctl_header; #endif static unsigned int nf_hashfn(struct inet_frag_queue *q) @@ -623,11 +625,21 @@ int nf_ct_frag6_init(void) inet_frags_init_net(&nf_init_frags); inet_frags_init(&nf_frags); + nf_ct_frag6_sysctl_header = register_sysctl_paths(nf_net_netfilter_sysctl_path, + nf_ct_frag6_sysctl_table); + if (!nf_ct_frag6_sysctl_header) { + inet_frags_fini(&nf_frags); + return -ENOMEM; + } + return 0; } void nf_ct_frag6_cleanup(void) { + unregister_sysctl_table(nf_ct_frag6_sysctl_header); + nf_ct_frag6_sysctl_header = NULL; + inet_frags_fini(&nf_frags); nf_init_frags.low_thresh = 0; diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c new file mode 100644 index 0000000..99abfb5 --- /dev/null +++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c @@ -0,0 +1,131 @@ +/* (C) 1999-2001 Paul `Rusty' Russell + * (C) 2002-2004 Netfilter Core Team + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static enum ip6_defrag_users nf_ct6_defrag_user(unsigned int hooknum, + struct sk_buff *skb) +{ + u16 zone = NF_CT_DEFAULT_ZONE; + + if (skb->nfct) + zone = nf_ct_zone((struct nf_conn *)skb->nfct); + +#ifdef CONFIG_BRIDGE_NETFILTER + if (skb->nf_bridge && + skb->nf_bridge->mask & BRNF_NF_BRIDGE_PREROUTING) + return IP6_DEFRAG_CONNTRACK_BRIDGE_IN + zone; +#endif + if (hooknum == NF_INET_PRE_ROUTING) + return IP6_DEFRAG_CONNTRACK_IN + zone; + else + return IP6_DEFRAG_CONNTRACK_OUT + zone; + +} + +static unsigned int ipv6_defrag(unsigned int hooknum, + struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + struct sk_buff *reasm; + + /* Previously seen (loopback)? */ + if (skb->nfct && !nf_ct_is_template((struct nf_conn *)skb->nfct)) + return NF_ACCEPT; + + reasm = nf_ct_frag6_gather(skb, nf_ct6_defrag_user(hooknum, skb)); + /* queued */ + if (reasm == NULL) + return NF_STOLEN; + + /* error occured or not fragmented */ + if (reasm == skb) + return NF_ACCEPT; + + nf_ct_frag6_output(hooknum, reasm, (struct net_device *)in, + (struct net_device *)out, okfn); + + return NF_STOLEN; +} + +static struct nf_hook_ops ipv6_defrag_ops[] = { + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_PRE_ROUTING, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, + { + .hook = ipv6_defrag, + .owner = THIS_MODULE, + .pf = NFPROTO_IPV6, + .hooknum = NF_INET_LOCAL_OUT, + .priority = NF_IP6_PRI_CONNTRACK_DEFRAG, + }, +}; + +static int __init nf_defrag_init(void) +{ + int ret = 0; + + ret = nf_ct_frag6_init(); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't initialize frag6.\n"); + return ret; + } + ret = nf_register_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + if (ret < 0) { + pr_err("nf_defrag_ipv6: can't register hooks\n"); + goto cleanup_frag6; + } + return ret; + +cleanup_frag6: + nf_ct_frag6_cleanup(); + return ret; + +} + +static void __exit nf_defrag_fini(void) +{ + nf_unregister_hooks(ipv6_defrag_ops, ARRAY_SIZE(ipv6_defrag_ops)); + nf_ct_frag6_cleanup(); +} + +void nf_defrag_ipv6_enable(void) +{ +} +EXPORT_SYMBOL_GPL(nf_defrag_ipv6_enable); + +module_init(nf_defrag_init); +module_exit(nf_defrag_fini); + +MODULE_LICENSE("GPL"); -- cgit v1.1 From 88440ae70eda83d0cc94148d404f4990c9f1289c Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:04:33 +0200 Subject: tproxy: added const specifiers to udp lookup functions The parameters for various UDP lookup functions were non-const, even though they could be const. TProxy has some const references and instead of downcasting it, I added const specifiers along the path. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/ipv6/udp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5acb356..33e3683 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -122,8 +122,8 @@ static void udp_v6_rehash(struct sock *sk) static inline int compute_score(struct sock *sk, struct net *net, unsigned short hnum, - struct in6_addr *saddr, __be16 sport, - struct in6_addr *daddr, __be16 dport, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) { int score = -1; @@ -239,8 +239,8 @@ exact_match: } static struct sock *__udp6_lib_lookup(struct net *net, - struct in6_addr *saddr, __be16 sport, - struct in6_addr *daddr, __be16 dport, + const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif, struct udp_table *udptable) { struct sock *sk, *result; -- cgit v1.1 From aa976fc011efa1f0e3290c6c9addf7c20757f885 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:05:41 +0200 Subject: tproxy: added udp6_lib_lookup function Just like with IPv4, we need access to the UDP hash table to look up local sockets, but instead of exporting the global udp_table, export a lookup function. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/ipv6/udp.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'net') diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 33e3683..c84dad4 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -320,6 +320,14 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb, udptable); } +struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport, + const struct in6_addr *daddr, __be16 dport, int dif) +{ + return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table); +} +EXPORT_SYMBOL_GPL(udp6_lib_lookup); + + /* * This should be easy, if there is something there we * return it, otherwise we block. -- cgit v1.1 From 6c46862280c5f55eda7750391bc65cd7e08c7535 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:08:28 +0200 Subject: tproxy: added tproxy sockopt interface in the IPV6 layer Support for IPV6_RECVORIGDSTADDR sockopt for UDP sockets were contributed by Harry Mason. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/ipv6/datagram.c | 19 +++++++++++++++++++ net/ipv6/ipv6_sockglue.c | 23 +++++++++++++++++++++++ 2 files changed, 42 insertions(+) (limited to 'net') diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index ef371aa..320bdb8 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -577,6 +577,25 @@ int datagram_recv_ctl(struct sock *sk, struct msghdr *msg, struct sk_buff *skb) u8 *ptr = nh + opt->dst1; put_cmsg(msg, SOL_IPV6, IPV6_2292DSTOPTS, (ptr[1]+1)<<3, ptr); } + if (np->rxopt.bits.rxorigdstaddr) { + struct sockaddr_in6 sin6; + u16 *ports = (u16 *) skb_transport_header(skb); + + if (skb_transport_offset(skb) + 4 <= skb->len) { + /* All current transport protocols have the port numbers in the + * first four bytes of the transport header and this function is + * written with this assumption in mind. + */ + + sin6.sin6_family = AF_INET6; + ipv6_addr_copy(&sin6.sin6_addr, &ipv6_hdr(skb)->daddr); + sin6.sin6_port = ports[1]; + sin6.sin6_flowinfo = 0; + sin6.sin6_scope_id = 0; + + put_cmsg(msg, SOL_IPV6, IPV6_ORIGDSTADDR, sizeof(sin6), &sin6); + } + } return 0; } diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index a7f66bc..0553867 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -342,6 +342,21 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, retv = 0; break; + case IPV6_TRANSPARENT: + if (optlen < sizeof(int)) + goto e_inval; + /* we don't have a separate transparent bit for IPV6 we use the one in the IPv4 socket */ + inet_sk(sk)->transparent = valbool; + retv = 0; + break; + + case IPV6_RECVORIGDSTADDR: + if (optlen < sizeof(int)) + goto e_inval; + np->rxopt.bits.rxorigdstaddr = valbool; + retv = 0; + break; + case IPV6_HOPOPTS: case IPV6_RTHDRDSTOPTS: case IPV6_RTHDR: @@ -1104,6 +1119,14 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; } + case IPV6_TRANSPARENT: + val = inet_sk(sk)->transparent; + break; + + case IPV6_RECVORIGDSTADDR: + val = np->rxopt.bits.rxorigdstaddr; + break; + case IPV6_UNICAST_HOPS: case IPV6_MULTICAST_HOPS: { -- cgit v1.1 From 0a513f6af962525ed4b3395f8c8d5daae8682aa9 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:10:03 +0200 Subject: tproxy: allow non-local binds of IPv6 sockets if IP_TRANSPARENT is enabled Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/ipv6/af_inet6.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net') diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 56b9bf2..4869797 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -343,7 +343,8 @@ int inet6_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) */ v4addr = LOOPBACK4_IPV6; if (!(addr_type & IPV6_ADDR_MULTICAST)) { - if (!ipv6_chk_addr(net, &addr->sin6_addr, + if (!inet->transparent && + !ipv6_chk_addr(net, &addr->sin6_addr, dev, 0)) { err = -EADDRNOTAVAIL; goto out_unlock; -- cgit v1.1 From 6ad7889327a5ee6ab4220bd34e4428c7d0de0f32 Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:17:26 +0200 Subject: tproxy: added IPv6 support to the TPROXY target This requires a new revision as the old target structure was IPv4 specific. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/xt_TPROXY.c | 262 +++++++++++++++++++++++++++++++++++++++------- 1 file changed, 225 insertions(+), 37 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index e0b6900..d5f97e2 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -1,7 +1,7 @@ /* * Transparent proxy support for Linux/iptables * - * Copyright (c) 2006-2007 BalaBit IT Ltd. + * Copyright (c) 2006-2010 BalaBit IT Ltd. * Author: Balazs Scheidler, Krisztian Kovacs * * This program is free software; you can redistribute it and/or modify @@ -19,15 +19,18 @@ #include #include +#include #include #include +#include #include /** - * tproxy_handle_time_wait() - handle TCP TIME_WAIT reopen redirections + * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections * @skb: The skb being processed. - * @par: Iptables target parameters. + * @laddr: IPv4 address to redirect to or zero. + * @lport: TCP port to redirect to or zero. * @sk: The TIME_WAIT TCP socket found by the lookup. * * We have to handle SYN packets arriving to TIME_WAIT sockets @@ -35,16 +38,16 @@ * redirect the new connection to the proxy if there's a listener * socket present. * - * tproxy_handle_time_wait() consumes the socket reference passed in. + * tproxy_handle_time_wait4() consumes the socket reference passed in. * * Returns the listener socket if there's one, the TIME_WAIT socket if * no such listener is found, or NULL if the TCP header is incomplete. */ static struct sock * -tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk) +tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, + struct sock *sk) { const struct iphdr *iph = ip_hdr(skb); - const struct xt_tproxy_target_info *tgi = par->targinfo; struct tcphdr _hdr, *hp; hp = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_hdr), &_hdr); @@ -59,13 +62,64 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, struct sock *sk2; sk2 = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, - hp->source, tgi->lport ? tgi->lport : hp->dest, - par->in, NFT_LOOKUP_LISTENER); + iph->saddr, laddr ? laddr : iph->daddr, + hp->source, lport ? lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + +/** + * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + const struct xt_action_param *par, + struct sock *sk) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr _hdr, *hp; + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, + !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr, + hp->source, + tgi->lport ? tgi->lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); if (sk2) { - /* yeah, there's one, let's kill the TIME_WAIT - * socket and redirect to the listener - */ inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); inet_twsk_put(inet_twsk(sk)); sk = sk2; @@ -76,10 +130,10 @@ tproxy_handle_time_wait(struct sk_buff *skb, const struct xt_action_param *par, } static unsigned int -tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) +tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, + u_int32_t mark_mask, u_int32_t mark_value) { const struct iphdr *iph = ip_hdr(skb); - const struct xt_tproxy_target_info *tgi = par->targinfo; struct udphdr _hdr, *hp; struct sock *sk; @@ -87,18 +141,105 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) if (hp == NULL) return NF_DROP; + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, iph->saddr, iph->daddr, hp->source, hp->dest, - par->in, NFT_LOOKUP_ESTABLISHED); + skb->dev, NFT_LOOKUP_ESTABLISHED); /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) - sk = tproxy_handle_time_wait(skb, par, sk); + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait4(skb, laddr, lport, sk); else if (!sk) + /* no, there's no established connection, check if + * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, tgi->laddr ? tgi->laddr : iph->daddr, - hp->source, tgi->lport ? tgi->lport : hp->dest, + iph->saddr, laddr ? laddr : iph->daddr, + hp->source, lport ? lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + + /* NOTE: assign_sock consumes our sk reference */ + if (sk && nf_tproxy_assign_sock(skb, sk)) { + /* This should be in a separate target, but we don't do multiple + targets on the same rule yet */ + skb->mark = (skb->mark & ~mark_mask) ^ mark_value; + + pr_debug("redirecting: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->daddr, ntohs(hp->dest), + &laddr, ntohs(lport), skb->mark); + return NF_ACCEPT; + } + + pr_debug("no socket, dropping: proto %hhu %08x:%hu -> %08x:%hu, mark: %x\n", + iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), + ntohl(laddr), ntohs(lport), skb->mark); + return NF_DROP; +} + +static unsigned int +tproxy_tg4_v0(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +static unsigned int +tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + return tproxy_tg4(skb, tgi->laddr.ip, tgi->lport, tgi->mark_mask, tgi->mark_value); +} + +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static unsigned int +tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + struct udphdr _hdr, *hp; + struct sock *sk; + int thoff; + int tproto; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + pr_debug("unable to grab transport header contents in IPv6 packet, dropping\n"); + return NF_DROP; + } + + /* check if there's an ongoing connection on the packet + * addresses, this happens if the redirect already happened + * and the current packet belongs to an already established + * connection */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, &iph->daddr, + hp->source, hp->dest, + par->in, NFT_LOOKUP_ESTABLISHED); + + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ + if (sk && sk->sk_state == TCP_TIME_WAIT) + /* reopening a TIME_WAIT connection needs special handling */ + sk = tproxy_handle_time_wait6(skb, tproto, thoff, par, sk); + else if (!sk) + /* no there's no established connection, check if + * there's a listener on the redirected addr/port */ + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, + !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr, + hp->source, + tgi->lport ? tgi->lport : hp->dest, par->in, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ @@ -107,19 +248,33 @@ tproxy_tg(struct sk_buff *skb, const struct xt_action_param *par) targets on the same rule yet */ skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; - pr_debug("redirecting: proto %u %08x:%u -> %08x:%u, mark: %x\n", - iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), - ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark); + pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->dest), + &tgi->laddr.in6, ntohs(tgi->lport), skb->mark); return NF_ACCEPT; } - pr_debug("no socket, dropping: proto %u %08x:%u -> %08x:%u, mark: %x\n", - iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), - ntohl(tgi->laddr), ntohs(tgi->lport), skb->mark); + pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", + tproto, &iph->saddr, ntohs(hp->dest), + &tgi->laddr.in6, ntohs(tgi->lport), skb->mark); return NF_DROP; } -static int tproxy_tg_check(const struct xt_tgchk_param *par) +static int tproxy_tg6_check(const struct xt_tgchk_param *par) +{ + const struct ip6t_ip6 *i = par->entryinfo; + + if ((i->proto == IPPROTO_TCP || i->proto == IPPROTO_UDP) + && !(i->flags & IP6T_INV_PROTO)) + return 0; + + pr_info("Can be used only in combination with " + "either -p tcp or -p udp\n"); + return -EINVAL; +} +#endif + +static int tproxy_tg4_check(const struct xt_tgchk_param *par) { const struct ipt_ip *i = par->entryinfo; @@ -132,31 +287,64 @@ static int tproxy_tg_check(const struct xt_tgchk_param *par) return -EINVAL; } -static struct xt_target tproxy_tg_reg __read_mostly = { - .name = "TPROXY", - .family = NFPROTO_IPV4, - .table = "mangle", - .target = tproxy_tg, - .targetsize = sizeof(struct xt_tproxy_target_info), - .checkentry = tproxy_tg_check, - .hooks = 1 << NF_INET_PRE_ROUTING, - .me = THIS_MODULE, +static struct xt_target tproxy_tg_reg[] __read_mostly = { + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v0, + .revision = 0, + .targetsize = sizeof(struct xt_tproxy_target_info), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, + { + .name = "TPROXY", + .family = NFPROTO_IPV4, + .table = "mangle", + .target = tproxy_tg4_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg4_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + { + .name = "TPROXY", + .family = NFPROTO_IPV6, + .table = "mangle", + .target = tproxy_tg6_v1, + .revision = 1, + .targetsize = sizeof(struct xt_tproxy_target_info_v1), + .checkentry = tproxy_tg6_check, + .hooks = 1 << NF_INET_PRE_ROUTING, + .me = THIS_MODULE, + }, +#endif + }; static int __init tproxy_tg_init(void) { nf_defrag_ipv4_enable(); - return xt_register_target(&tproxy_tg_reg); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + nf_defrag_ipv6_enable(); +#endif + + return xt_register_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); } static void __exit tproxy_tg_exit(void) { - xt_unregister_target(&tproxy_tg_reg); + xt_unregister_targets(tproxy_tg_reg, ARRAY_SIZE(tproxy_tg_reg)); } module_init(tproxy_tg_init); module_exit(tproxy_tg_exit); MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Krisztian Kovacs"); +MODULE_AUTHOR("Balazs Scheidler, Krisztian Kovacs"); MODULE_DESCRIPTION("Netfilter transparent proxy (TPROXY) target module."); MODULE_ALIAS("ipt_TPROXY"); +MODULE_ALIAS("ip6t_TPROXY"); -- cgit v1.1 From b64c9256a9b76fc9f059f71bd08ba88fb0cbba2e Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:19:42 +0200 Subject: tproxy: added IPv6 support to the socket match The ICMP extraction bits were contributed by Harry Mason. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/xt_socket.c | 165 ++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 154 insertions(+), 11 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c index 266faa0..2dbd4c8 100644 --- a/net/netfilter/xt_socket.c +++ b/net/netfilter/xt_socket.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -21,6 +22,7 @@ #include #include #include +#include #include @@ -30,7 +32,7 @@ #endif static int -extract_icmp_fields(const struct sk_buff *skb, +extract_icmp4_fields(const struct sk_buff *skb, u8 *protocol, __be32 *raddr, __be32 *laddr, @@ -86,7 +88,6 @@ extract_icmp_fields(const struct sk_buff *skb, return 0; } - static bool socket_match(const struct sk_buff *skb, struct xt_action_param *par, const struct xt_socket_mtinfo1 *info) @@ -115,7 +116,7 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, dport = hp->dest; } else if (iph->protocol == IPPROTO_ICMP) { - if (extract_icmp_fields(skb, &protocol, &saddr, &daddr, + if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr, &sport, &dport)) return false; } else { @@ -165,32 +166,157 @@ socket_match(const struct sk_buff *skb, struct xt_action_param *par, sk = NULL; } - pr_debug("proto %u %08x:%u -> %08x:%u (orig %08x:%u) sock %p\n", - protocol, ntohl(saddr), ntohs(sport), - ntohl(daddr), ntohs(dport), - ntohl(iph->daddr), hp ? ntohs(hp->dest) : 0, sk); + pr_debug("proto %hhu %pI4:%hu -> %pI4:%hu (orig %pI4:%hu) sock %p\n", + protocol, &saddr, ntohs(sport), + &daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); return (sk != NULL); } static bool -socket_mt_v0(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v0(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, NULL); } static bool -socket_mt_v1(const struct sk_buff *skb, struct xt_action_param *par) +socket_mt4_v1(const struct sk_buff *skb, struct xt_action_param *par) { return socket_match(skb, par, par->matchinfo); } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +static int +extract_icmp6_fields(const struct sk_buff *skb, + unsigned int outside_hdrlen, + u8 *protocol, + struct in6_addr **raddr, + struct in6_addr **laddr, + __be16 *rport, + __be16 *lport) +{ + struct ipv6hdr *inside_iph, _inside_iph; + struct icmp6hdr *icmph, _icmph; + __be16 *ports, _ports[2]; + u8 inside_nexthdr; + int inside_hdrlen; + + icmph = skb_header_pointer(skb, outside_hdrlen, + sizeof(_icmph), &_icmph); + if (icmph == NULL) + return 1; + + if (icmph->icmp6_type & ICMPV6_INFOMSG_MASK) + return 1; + + inside_iph = skb_header_pointer(skb, outside_hdrlen + sizeof(_icmph), sizeof(_inside_iph), &_inside_iph); + if (inside_iph == NULL) + return 1; + inside_nexthdr = inside_iph->nexthdr; + + inside_hdrlen = ipv6_skip_exthdr(skb, outside_hdrlen + sizeof(_icmph) + sizeof(_inside_iph), &inside_nexthdr); + if (inside_hdrlen < 0) + return 1; /* hjm: Packet has no/incomplete transport layer headers. */ + + if (inside_nexthdr != IPPROTO_TCP && + inside_nexthdr != IPPROTO_UDP) + return 1; + + ports = skb_header_pointer(skb, inside_hdrlen, + sizeof(_ports), &_ports); + if (ports == NULL) + return 1; + + /* the inside IP packet is the one quoted from our side, thus + * its saddr is the local address */ + *protocol = inside_nexthdr; + *laddr = &inside_iph->saddr; + *lport = ports[0]; + *raddr = &inside_iph->daddr; + *rport = ports[1]; + + return 0; +} + +static bool +socket_mt6_v1(const struct sk_buff *skb, struct xt_action_param *par) +{ + struct ipv6hdr *iph = ipv6_hdr(skb); + struct udphdr _hdr, *hp = NULL; + struct sock *sk; + struct in6_addr *daddr, *saddr; + __be16 dport, sport; + int thoff; + u8 tproto; + const struct xt_socket_mtinfo1 *info = (struct xt_socket_mtinfo1 *) par->matchinfo; + + tproto = ipv6_find_hdr(skb, &thoff, -1, NULL); + if (tproto < 0) { + pr_debug("unable to find transport header in IPv6 packet, dropping\n"); + return NF_DROP; + } + + if (tproto == IPPROTO_UDP || tproto == IPPROTO_TCP) { + hp = skb_header_pointer(skb, thoff, + sizeof(_hdr), &_hdr); + if (hp == NULL) + return false; + + saddr = &iph->saddr; + sport = hp->source; + daddr = &iph->daddr; + dport = hp->dest; + + } else if (tproto == IPPROTO_ICMPV6) { + if (extract_icmp6_fields(skb, thoff, &tproto, &saddr, &daddr, + &sport, &dport)) + return false; + } else { + return false; + } + + sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + saddr, daddr, sport, dport, par->in, NFT_LOOKUP_ANY); + if (sk != NULL) { + bool wildcard; + bool transparent = true; + + /* Ignore sockets listening on INADDR_ANY */ + wildcard = (sk->sk_state != TCP_TIME_WAIT && + ipv6_addr_any(&inet6_sk(sk)->rcv_saddr)); + + /* Ignore non-transparent sockets, + if XT_SOCKET_TRANSPARENT is used */ + if (info && info->flags & XT_SOCKET_TRANSPARENT) + transparent = ((sk->sk_state != TCP_TIME_WAIT && + inet_sk(sk)->transparent) || + (sk->sk_state == TCP_TIME_WAIT && + inet_twsk(sk)->tw_transparent)); + + nf_tproxy_put_sock(sk); + + if (wildcard || !transparent) + sk = NULL; + } + + pr_debug("proto %hhu %pI6:%hu -> %pI6:%hu " + "(orig %pI6:%hu) sock %p\n", + tproto, saddr, ntohs(sport), + daddr, ntohs(dport), + &iph->daddr, hp ? ntohs(hp->dest) : 0, sk); + + return (sk != NULL); +} +#endif + static struct xt_match socket_mt_reg[] __read_mostly = { { .name = "socket", .revision = 0, .family = NFPROTO_IPV4, - .match = socket_mt_v0, + .match = socket_mt4_v0, .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, @@ -199,17 +325,33 @@ static struct xt_match socket_mt_reg[] __read_mostly = { .name = "socket", .revision = 1, .family = NFPROTO_IPV4, - .match = socket_mt_v1, + .match = socket_mt4_v1, .matchsize = sizeof(struct xt_socket_mtinfo1), .hooks = (1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_IN), .me = THIS_MODULE, }, +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + { + .name = "socket", + .revision = 1, + .family = NFPROTO_IPV6, + .match = socket_mt6_v1, + .matchsize = sizeof(struct xt_socket_mtinfo1), + .hooks = (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN), + .me = THIS_MODULE, + }, +#endif }; static int __init socket_mt_init(void) { nf_defrag_ipv4_enable(); +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + nf_defrag_ipv6_enable(); +#endif + return xt_register_matches(socket_mt_reg, ARRAY_SIZE(socket_mt_reg)); } @@ -225,3 +367,4 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Krisztian Kovacs, Balazs Scheidler"); MODULE_DESCRIPTION("x_tables socket match module"); MODULE_ALIAS("ipt_socket"); +MODULE_ALIAS("ip6t_socket"); -- cgit v1.1 From cc6eb433856983e91071469c4ce57accb6947ccb Mon Sep 17 00:00:00 2001 From: Balazs Scheidler Date: Thu, 21 Oct 2010 16:21:10 +0200 Subject: tproxy: use the interface primary IP address as a default value for --on-ip The REDIRECT target and the older TProxy versions used the primary address of the incoming interface as the default value of the --on-ip parameter. This was unintentionally changed during the initial TProxy submission and caused confusion among users. Since IPv6 has no notion of primary address, we just select the first address on the list: this way the socket lookup finds wildcard bound sockets properly and we cannot really do better without the user telling us the IPv6 address of the proxy. This is implemented for both IPv4 and IPv6. Signed-off-by: Balazs Scheidler Signed-off-by: KOVACS Krisztian Signed-off-by: Patrick McHardy --- net/netfilter/xt_TPROXY.c | 202 ++++++++++++++++++++++++++++++---------------- 1 file changed, 132 insertions(+), 70 deletions(-) (limited to 'net') diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c index d5f97e2..19c482c 100644 --- a/net/netfilter/xt_TPROXY.c +++ b/net/netfilter/xt_TPROXY.c @@ -16,15 +16,41 @@ #include #include #include - +#include #include #include -#include -#include #include +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +#include +#include +#include #include +#endif + #include +#include + +static inline __be32 +tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr) +{ + struct in_device *indev; + __be32 laddr; + + if (user_laddr) + return user_laddr; + + laddr = 0; + rcu_read_lock(); + indev = __in_dev_get_rcu(skb->dev); + for_primary_ifa(indev) { + laddr = ifa->ifa_local; + break; + } endfor_ifa(indev); + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} /** * tproxy_handle_time_wait4() - handle IPv4 TCP TIME_WAIT reopen redirections @@ -75,60 +101,6 @@ tproxy_handle_time_wait4(struct sk_buff *skb, __be32 laddr, __be16 lport, return sk; } -/** - * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections - * @skb: The skb being processed. - * @tproto: Transport protocol. - * @thoff: Transport protocol header offset. - * @par: Iptables target parameters. - * @sk: The TIME_WAIT TCP socket found by the lookup. - * - * We have to handle SYN packets arriving to TIME_WAIT sockets - * differently: instead of reopening the connection we should rather - * redirect the new connection to the proxy if there's a listener - * socket present. - * - * tproxy_handle_time_wait6() consumes the socket reference passed in. - * - * Returns the listener socket if there's one, the TIME_WAIT socket if - * no such listener is found, or NULL if the TCP header is incomplete. - */ -static struct sock * -tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, - const struct xt_action_param *par, - struct sock *sk) -{ - const struct ipv6hdr *iph = ipv6_hdr(skb); - struct tcphdr _hdr, *hp; - const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; - - hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); - if (hp == NULL) { - inet_twsk_put(inet_twsk(sk)); - return NULL; - } - - if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { - /* SYN to a TIME_WAIT socket, we'd rather redirect it - * to a listener socket if there's one */ - struct sock *sk2; - - sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, - &iph->saddr, - !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr, - hp->source, - tgi->lport ? tgi->lport : hp->dest, - skb->dev, NFT_LOOKUP_LISTENER); - if (sk2) { - inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); - inet_twsk_put(inet_twsk(sk)); - sk = sk2; - } - } - - return sk; -} - static unsigned int tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, u_int32_t mark_mask, u_int32_t mark_value) @@ -150,6 +122,10 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, hp->source, hp->dest, skb->dev, NFT_LOOKUP_ESTABLISHED); + laddr = tproxy_laddr4(skb, laddr, iph->daddr); + if (!lport) + lport = hp->dest; + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ @@ -158,8 +134,8 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, /* no, there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v4(dev_net(skb->dev), iph->protocol, - iph->saddr, laddr ? laddr : iph->daddr, - hp->source, lport ? lport : hp->dest, + iph->saddr, laddr, + hp->source, lport, skb->dev, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ @@ -174,9 +150,9 @@ tproxy_tg4(struct sk_buff *skb, __be32 laddr, __be16 lport, return NF_ACCEPT; } - pr_debug("no socket, dropping: proto %hhu %08x:%hu -> %08x:%hu, mark: %x\n", - iph->protocol, ntohl(iph->daddr), ntohs(hp->dest), - ntohl(laddr), ntohs(lport), skb->mark); + pr_debug("no socket, dropping: proto %hhu %pI4:%hu -> %pI4:%hu, mark: %x\n", + iph->protocol, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); return NF_DROP; } @@ -197,6 +173,88 @@ tproxy_tg4_v1(struct sk_buff *skb, const struct xt_action_param *par) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + +static inline const struct in6_addr * +tproxy_laddr6(struct sk_buff *skb, const struct in6_addr *user_laddr, + const struct in6_addr *daddr) +{ + struct inet6_dev *indev; + struct inet6_ifaddr *ifa; + struct in6_addr *laddr; + + if (!ipv6_addr_any(user_laddr)) + return user_laddr; + laddr = NULL; + + rcu_read_lock(); + indev = __in6_dev_get(skb->dev); + if (indev) + list_for_each_entry(ifa, &indev->addr_list, if_list) { + if (ifa->flags & (IFA_F_TENTATIVE | IFA_F_DEPRECATED)) + continue; + + laddr = &ifa->addr; + break; + } + rcu_read_unlock(); + + return laddr ? laddr : daddr; +} + +/** + * tproxy_handle_time_wait6() - handle IPv6 TCP TIME_WAIT reopen redirections + * @skb: The skb being processed. + * @tproto: Transport protocol. + * @thoff: Transport protocol header offset. + * @par: Iptables target parameters. + * @sk: The TIME_WAIT TCP socket found by the lookup. + * + * We have to handle SYN packets arriving to TIME_WAIT sockets + * differently: instead of reopening the connection we should rather + * redirect the new connection to the proxy if there's a listener + * socket present. + * + * tproxy_handle_time_wait6() consumes the socket reference passed in. + * + * Returns the listener socket if there's one, the TIME_WAIT socket if + * no such listener is found, or NULL if the TCP header is incomplete. + */ +static struct sock * +tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff, + const struct xt_action_param *par, + struct sock *sk) +{ + const struct ipv6hdr *iph = ipv6_hdr(skb); + struct tcphdr _hdr, *hp; + const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; + + hp = skb_header_pointer(skb, thoff, sizeof(_hdr), &_hdr); + if (hp == NULL) { + inet_twsk_put(inet_twsk(sk)); + return NULL; + } + + if (hp->syn && !hp->rst && !hp->ack && !hp->fin) { + /* SYN to a TIME_WAIT socket, we'd rather redirect it + * to a listener socket if there's one */ + struct sock *sk2; + + sk2 = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, + &iph->saddr, + tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr), + hp->source, + tgi->lport ? tgi->lport : hp->dest, + skb->dev, NFT_LOOKUP_LISTENER); + if (sk2) { + inet_twsk_deschedule(inet_twsk(sk), &tcp_death_row); + inet_twsk_put(inet_twsk(sk)); + sk = sk2; + } + } + + return sk; +} + static unsigned int tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) { @@ -204,6 +262,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) const struct xt_tproxy_target_info_v1 *tgi = par->targinfo; struct udphdr _hdr, *hp; struct sock *sk; + const struct in6_addr *laddr; + __be16 lport; int thoff; int tproto; @@ -228,6 +288,9 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) hp->source, hp->dest, par->in, NFT_LOOKUP_ESTABLISHED); + laddr = tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr); + lport = tgi->lport ? tgi->lport : hp->dest; + /* UDP has no TCP_TIME_WAIT state, so we never enter here */ if (sk && sk->sk_state == TCP_TIME_WAIT) /* reopening a TIME_WAIT connection needs special handling */ @@ -236,10 +299,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) /* no there's no established connection, check if * there's a listener on the redirected addr/port */ sk = nf_tproxy_get_sock_v6(dev_net(skb->dev), tproto, - &iph->saddr, - !ipv6_addr_any(&tgi->laddr.in6) ? &tgi->laddr.in6 : &iph->daddr, - hp->source, - tgi->lport ? tgi->lport : hp->dest, + &iph->saddr, laddr, + hp->source, lport, par->in, NFT_LOOKUP_LISTENER); /* NOTE: assign_sock consumes our sk reference */ @@ -249,14 +310,15 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par) skb->mark = (skb->mark & ~tgi->mark_mask) ^ tgi->mark_value; pr_debug("redirecting: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", - tproto, &iph->saddr, ntohs(hp->dest), - &tgi->laddr.in6, ntohs(tgi->lport), skb->mark); + tproto, &iph->saddr, ntohs(hp->source), + laddr, ntohs(lport), skb->mark); return NF_ACCEPT; } pr_debug("no socket, dropping: proto %hhu %pI6:%hu -> %pI6:%hu, mark: %x\n", - tproto, &iph->saddr, ntohs(hp->dest), - &tgi->laddr.in6, ntohs(tgi->lport), skb->mark); + tproto, &iph->saddr, ntohs(hp->source), + &iph->daddr, ntohs(hp->dest), skb->mark); + return NF_DROP; } -- cgit v1.1