From df71837d5024e2524cd51c93621e558aa7dd9f3f Mon Sep 17 00:00:00 2001 From: Trent Jaeger Date: Tue, 13 Dec 2005 23:12:27 -0800 Subject: [LSM-IPSec]: Security association restriction. This patch series implements per packet access control via the extension of the Linux Security Modules (LSM) interface by hooks in the XFRM and pfkey subsystems that leverage IPSec security associations to label packets. Extensions to the SELinux LSM are included that leverage the patch for this purpose. This patch implements the changes necessary to the XFRM subsystem, pfkey interface, ipv4/ipv6, and xfrm_user interface to restrict a socket to use only authorized security associations (or no security association) to send/receive network packets. Patch purpose: The patch is designed to enable access control per packets based on the strongly authenticated IPSec security association. Such access controls augment the existing ones based on network interface and IP address. The former are very coarse-grained, and the latter can be spoofed. By using IPSec, the system can control access to remote hosts based on cryptographic keys generated using the IPSec mechanism. This enables access control on a per-machine basis or per-application if the remote machine is running the same mechanism and trusted to enforce the access control policy. Patch design approach: The overall approach is that policy (xfrm_policy) entries set by user-level programs (e.g., setkey for ipsec-tools) are extended with a security context that is used at policy selection time in the XFRM subsystem to restrict the sockets that can send/receive packets via security associations (xfrm_states) that are built from those policies. A presentation available at www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf from the SELinux symposium describes the overall approach. Patch implementation details: On output, the policy retrieved (via xfrm_policy_lookup or xfrm_sk_policy_lookup) must be authorized for the security context of the socket and the same security context is required for resultant security association (retrieved or negotiated via racoon in ipsec-tools). This is enforced in xfrm_state_find. On input, the policy retrieved must also be authorized for the socket (at __xfrm_policy_check), and the security context of the policy must also match the security association being used. The patch has virtually no impact on packets that do not use IPSec. The existing Netfilter (outgoing) and LSM rcv_skb hooks are used as before. Also, if IPSec is used without security contexts, the impact is minimal. The LSM must allow such policies to be selected for the combination of socket and remote machine, but subsequent IPSec processing proceeds as in the original case. Testing: The pfkey interface is tested using the ipsec-tools. ipsec-tools have been modified (a separate ipsec-tools patch is available for version 0.5) that supports assignment of xfrm_policy entries and security associations with security contexts via setkey and the negotiation using the security contexts via racoon. The xfrm_user interface is tested via ad hoc programs that set security contexts. These programs are also available from me, and contain programs for setting, getting, and deleting policy for testing this interface. Testing of sa functions was done by tracing kernel behavior. Signed-off-by: Trent Jaeger Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/pfkeyv2.h | 13 +++- include/linux/security.h | 132 +++++++++++++++++++++++++++++++ include/linux/xfrm.h | 29 +++++++ include/net/flow.h | 7 +- include/net/xfrm.h | 27 ++++++- net/core/flow.c | 8 +- net/key/af_key.c | 197 +++++++++++++++++++++++++++++++++++++++++++++-- net/xfrm/xfrm_policy.c | 88 +++++++++++++-------- net/xfrm/xfrm_state.c | 9 ++- net/xfrm/xfrm_user.c | 148 +++++++++++++++++++++++++++++++++-- security/Kconfig | 13 ++++ security/dummy.c | 45 ++++++++++- 12 files changed, 655 insertions(+), 61 deletions(-) diff --git a/include/linux/pfkeyv2.h b/include/linux/pfkeyv2.h index 7240667..6351c40 100644 --- a/include/linux/pfkeyv2.h +++ b/include/linux/pfkeyv2.h @@ -216,6 +216,16 @@ struct sadb_x_nat_t_port { } __attribute__((packed)); /* sizeof(struct sadb_x_nat_t_port) == 8 */ +/* Generic LSM security context */ +struct sadb_x_sec_ctx { + uint16_t sadb_x_sec_len; + uint16_t sadb_x_sec_exttype; + uint8_t sadb_x_ctx_alg; /* LSMs: e.g., selinux == 1 */ + uint8_t sadb_x_ctx_doi; + uint16_t sadb_x_ctx_len; +} __attribute__((packed)); +/* sizeof(struct sadb_sec_ctx) = 8 */ + /* Message types */ #define SADB_RESERVED 0 #define SADB_GETSPI 1 @@ -325,7 +335,8 @@ struct sadb_x_nat_t_port { #define SADB_X_EXT_NAT_T_SPORT 21 #define SADB_X_EXT_NAT_T_DPORT 22 #define SADB_X_EXT_NAT_T_OA 23 -#define SADB_EXT_MAX 23 +#define SADB_X_EXT_SEC_CTX 24 +#define SADB_EXT_MAX 24 /* Identity Extension values */ #define SADB_IDENTTYPE_RESERVED 0 diff --git a/include/linux/security.h b/include/linux/security.h index f7e0ae0..ef75365 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -59,6 +59,12 @@ struct sk_buff; struct sock; struct sockaddr; struct socket; +struct flowi; +struct dst_entry; +struct xfrm_selector; +struct xfrm_policy; +struct xfrm_state; +struct xfrm_user_sec_ctx; extern int cap_netlink_send(struct sock *sk, struct sk_buff *skb); extern int cap_netlink_recv(struct sk_buff *skb); @@ -788,6 +794,52 @@ struct swap_info_struct; * which is used to copy security attributes between local stream sockets. * @sk_free_security: * Deallocate security structure. + * @sk_getsid: + * Retrieve the LSM-specific sid for the sock to enable caching of network + * authorizations. + * + * Security hooks for XFRM operations. + * + * @xfrm_policy_alloc_security: + * @xp contains the xfrm_policy being added to Security Policy Database + * used by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level policy update program (e.g., setkey). + * Allocate a security structure to the xp->selector.security field. + * The security field is initialized to NULL when the xfrm_policy is + * allocated. + * Return 0 if operation was successful (memory to allocate, legal context) + * @xfrm_policy_clone_security: + * @old contains an existing xfrm_policy in the SPD. + * @new contains a new xfrm_policy being cloned from old. + * Allocate a security structure to the new->selector.security field + * that contains the information from the old->selector.security field. + * Return 0 if operation was successful (memory to allocate). + * @xfrm_policy_free_security: + * @xp contains the xfrm_policy + * Deallocate xp->selector.security. + * @xfrm_state_alloc_security: + * @x contains the xfrm_state being added to the Security Association + * Database by the XFRM system. + * @sec_ctx contains the security context information being provided by + * the user-level SA generation program (e.g., setkey or racoon). + * Allocate a security structure to the x->sel.security field. The + * security field is initialized to NULL when the xfrm_state is + * allocated. + * Return 0 if operation was successful (memory to allocate, legal context). + * @xfrm_state_free_security: + * @x contains the xfrm_state. + * Deallocate x>sel.security. + * @xfrm_policy_lookup: + * @xp contains the xfrm_policy for which the access control is being + * checked. + * @sk_sid contains the sock security label that is used to authorize + * access to the policy xp. + * @dir contains the direction of the flow (input or output). + * Check permission when a sock selects a xfrm_policy for processing + * XFRMs on a packet. The hook is called when selecting either a + * per-socket policy or a generic xfrm policy. + * Return 0 if permission is granted. * * Security hooks affecting all Key Management operations * @@ -1237,8 +1289,18 @@ struct security_operations { int (*socket_getpeersec) (struct socket *sock, char __user *optval, int __user *optlen, unsigned len); int (*sk_alloc_security) (struct sock *sk, int family, gfp_t priority); void (*sk_free_security) (struct sock *sk); + unsigned int (*sk_getsid) (struct sock *sk, struct flowi *fl, u8 dir); #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM + int (*xfrm_policy_alloc_security) (struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx); + int (*xfrm_policy_clone_security) (struct xfrm_policy *old, struct xfrm_policy *new); + void (*xfrm_policy_free_security) (struct xfrm_policy *xp); + int (*xfrm_state_alloc_security) (struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx); + void (*xfrm_state_free_security) (struct xfrm_state *x); + int (*xfrm_policy_lookup)(struct xfrm_policy *xp, u32 sk_sid, u8 dir); +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + /* key management security hooks */ #ifdef CONFIG_KEYS int (*key_alloc)(struct key *key); @@ -2679,6 +2741,11 @@ static inline void security_sk_free(struct sock *sk) { return security_ops->sk_free_security(sk); } + +static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir) +{ + return security_ops->sk_getsid(sk, fl, dir); +} #else /* CONFIG_SECURITY_NETWORK */ static inline int security_unix_stream_connect(struct socket * sock, struct socket * other, @@ -2795,8 +2862,73 @@ static inline int security_sk_alloc(struct sock *sk, int family, gfp_t priority) static inline void security_sk_free(struct sock *sk) { } + +static inline unsigned int security_sk_sid(struct sock *sk, struct flowi *fl, u8 dir) +{ + return 0; +} #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM +static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx) +{ + return security_ops->xfrm_policy_alloc_security(xp, sec_ctx); +} + +static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) +{ + return security_ops->xfrm_policy_clone_security(old, new); +} + +static inline void security_xfrm_policy_free(struct xfrm_policy *xp) +{ + security_ops->xfrm_policy_free_security(xp); +} + +static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +{ + return security_ops->xfrm_state_alloc_security(x, sec_ctx); +} + +static inline void security_xfrm_state_free(struct xfrm_state *x) +{ + security_ops->xfrm_state_free_security(x); +} + +static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + return security_ops->xfrm_policy_lookup(xp, sk_sid, dir); +} +#else /* CONFIG_SECURITY_NETWORK_XFRM */ +static inline int security_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static inline int security_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) +{ + return 0; +} + +static inline void security_xfrm_policy_free(struct xfrm_policy *xp) +{ +} + +static inline int security_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static inline void security_xfrm_state_free(struct xfrm_state *x) +{ +} + +static inline int security_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + return 0; +} +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ + #ifdef CONFIG_KEYS #ifdef CONFIG_SECURITY static inline int security_key_alloc(struct key *key) diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h index 0fb077d..82fbb75 100644 --- a/include/linux/xfrm.h +++ b/include/linux/xfrm.h @@ -27,6 +27,22 @@ struct xfrm_id __u8 proto; }; +struct xfrm_sec_ctx { + __u8 ctx_doi; + __u8 ctx_alg; + __u16 ctx_len; + __u32 ctx_sid; + char ctx_str[0]; +}; + +/* Security Context Domains of Interpretation */ +#define XFRM_SC_DOI_RESERVED 0 +#define XFRM_SC_DOI_LSM 1 + +/* Security Context Algorithms */ +#define XFRM_SC_ALG_RESERVED 0 +#define XFRM_SC_ALG_SELINUX 1 + /* Selector, used as selector both on policy rules (SPD) and SAs. */ struct xfrm_selector @@ -146,6 +162,18 @@ enum { #define XFRM_NR_MSGTYPES (XFRM_MSG_MAX + 1 - XFRM_MSG_BASE) +/* + * Generic LSM security context for comunicating to user space + * NOTE: Same format as sadb_x_sec_ctx + */ +struct xfrm_user_sec_ctx { + __u16 len; + __u16 exttype; + __u8 ctx_alg; /* LSMs: e.g., selinux == 1 */ + __u8 ctx_doi; + __u16 ctx_len; +}; + struct xfrm_user_tmpl { struct xfrm_id id; __u16 family; @@ -176,6 +204,7 @@ enum xfrm_attr_type_t { XFRMA_TMPL, /* 1 or more struct xfrm_user_tmpl */ XFRMA_SA, XFRMA_POLICY, + XFRMA_SEC_CTX, /* struct xfrm_sec_ctx */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) diff --git a/include/net/flow.h b/include/net/flow.h index 9a5c94b..ec7eb86 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -84,11 +84,12 @@ struct flowi { #define FLOW_DIR_OUT 1 #define FLOW_DIR_FWD 2 -typedef void (*flow_resolve_t)(struct flowi *key, u16 family, u8 dir, +struct sock; +typedef void (*flow_resolve_t)(struct flowi *key, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp); -extern void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, - flow_resolve_t resolver); +extern void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, + flow_resolve_t resolver); extern void flow_cache_flush(void); extern atomic_t flow_cache_genid; diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 1cdb879..487abca 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -144,6 +144,9 @@ struct xfrm_state * transformer. */ struct xfrm_type *type; + /* Security context */ + struct xfrm_sec_ctx *security; + /* Private data of this transformer, format is opaque, * interpreted by xfrm_type methods. */ void *data; @@ -298,6 +301,7 @@ struct xfrm_policy __u8 flags; __u8 dead; __u8 xfrm_nr; + struct xfrm_sec_ctx *security; struct xfrm_tmpl xfrm_vec[XFRM_MAX_DEPTH]; }; @@ -510,6 +514,25 @@ xfrm_selector_match(struct xfrm_selector *sel, struct flowi *fl, return 0; } +#ifdef CONFIG_SECURITY_NETWORK_XFRM +/* If neither has a context --> match + * Otherwise, both must have a context and the sids, doi, alg must match + */ +static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) +{ + return ((!s1 && !s2) || + (s1 && s2 && + (s1->ctx_sid == s2->ctx_sid) && + (s1->ctx_doi == s2->ctx_doi) && + (s1->ctx_alg == s2->ctx_alg))); +} +#else +static inline int xfrm_sec_ctx_match(struct xfrm_sec_ctx *s1, struct xfrm_sec_ctx *s2) +{ + return 1; +} +#endif + /* A struct encoding bundle of transformations to apply to some set of flow. * * dst->child points to the next element of bundle. @@ -878,8 +901,8 @@ static inline int xfrm_dst_lookup(struct xfrm_dst **dst, struct flowi *fl, unsig struct xfrm_policy *xfrm_policy_alloc(gfp_t gfp); extern int xfrm_policy_walk(int (*func)(struct xfrm_policy *, int, int, void*), void *); int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete); +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete); void xfrm_policy_flush(void); u32 xfrm_get_acqseq(void); diff --git a/net/core/flow.c b/net/core/flow.c index 7e95b39..c4f2538 100644 --- a/net/core/flow.c +++ b/net/core/flow.c @@ -23,6 +23,7 @@ #include #include #include +#include struct flow_cache_entry { struct flow_cache_entry *next; @@ -30,6 +31,7 @@ struct flow_cache_entry { u8 dir; struct flowi key; u32 genid; + u32 sk_sid; void *object; atomic_t *object_ref; }; @@ -162,7 +164,7 @@ static int flow_key_compare(struct flowi *key1, struct flowi *key2) return 0; } -void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, +void *flow_cache_lookup(struct flowi *key, u32 sk_sid, u16 family, u8 dir, flow_resolve_t resolver) { struct flow_cache_entry *fle, **head; @@ -186,6 +188,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, for (fle = *head; fle; fle = fle->next) { if (fle->family == family && fle->dir == dir && + fle->sk_sid == sk_sid && flow_key_compare(key, &fle->key) == 0) { if (fle->genid == atomic_read(&flow_cache_genid)) { void *ret = fle->object; @@ -210,6 +213,7 @@ void *flow_cache_lookup(struct flowi *key, u16 family, u8 dir, *head = fle; fle->family = family; fle->dir = dir; + fle->sk_sid = sk_sid; memcpy(&fle->key, key, sizeof(*key)); fle->object = NULL; flow_count(cpu)++; @@ -221,7 +225,7 @@ nocache: void *obj; atomic_t *obj_ref; - resolver(key, family, dir, &obj, &obj_ref); + resolver(key, sk_sid, family, dir, &obj, &obj_ref); if (fle) { fle->genid = atomic_read(&flow_cache_genid); diff --git a/net/key/af_key.c b/net/key/af_key.c index 3903168..d32f779 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -336,6 +336,7 @@ static u8 sadb_ext_min_len[] = { [SADB_X_EXT_NAT_T_SPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_DPORT] = (u8) sizeof(struct sadb_x_nat_t_port), [SADB_X_EXT_NAT_T_OA] = (u8) sizeof(struct sadb_address), + [SADB_X_EXT_SEC_CTX] = (u8) sizeof(struct sadb_x_sec_ctx), }; /* Verify sadb_address_{len,prefixlen} against sa_family. */ @@ -383,6 +384,55 @@ static int verify_address_len(void *p) return 0; } +static inline int pfkey_sec_ctx_len(struct sadb_x_sec_ctx *sec_ctx) +{ + int len = 0; + + len += sizeof(struct sadb_x_sec_ctx); + len += sec_ctx->sadb_x_ctx_len; + len += sizeof(uint64_t) - 1; + len /= sizeof(uint64_t); + + return len; +} + +static inline int verify_sec_ctx_len(void *p) +{ + struct sadb_x_sec_ctx *sec_ctx = (struct sadb_x_sec_ctx *)p; + int len; + + if (sec_ctx->sadb_x_ctx_len > PAGE_SIZE) + return -EINVAL; + + len = pfkey_sec_ctx_len(sec_ctx); + + if (sec_ctx->sadb_x_sec_len != len) + return -EINVAL; + + return 0; +} + +static inline struct xfrm_user_sec_ctx *pfkey_sadb2xfrm_user_sec_ctx(struct sadb_x_sec_ctx *sec_ctx) +{ + struct xfrm_user_sec_ctx *uctx = NULL; + int ctx_size = sec_ctx->sadb_x_ctx_len; + + uctx = kmalloc((sizeof(*uctx)+ctx_size), GFP_KERNEL); + + if (!uctx) + return NULL; + + uctx->len = pfkey_sec_ctx_len(sec_ctx); + uctx->exttype = sec_ctx->sadb_x_sec_exttype; + uctx->ctx_doi = sec_ctx->sadb_x_ctx_doi; + uctx->ctx_alg = sec_ctx->sadb_x_ctx_alg; + uctx->ctx_len = sec_ctx->sadb_x_ctx_len; + memcpy(uctx + 1, sec_ctx + 1, + uctx->ctx_len); + + return uctx; +} + static int present_and_same_family(struct sadb_address *src, struct sadb_address *dst) { @@ -438,6 +488,10 @@ static int parse_exthdrs(struct sk_buff *skb, struct sadb_msg *hdr, void **ext_h if (verify_address_len(p)) return -EINVAL; } + if (ext_type == SADB_X_EXT_SEC_CTX) { + if (verify_sec_ctx_len(p)) + return -EINVAL; + } ext_hdrs[ext_type-1] = p; } p += ext_len; @@ -586,6 +640,9 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, struct sadb_key *key; struct sadb_x_sa2 *sa2; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; + int ctx_size = 0; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -609,6 +666,12 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, sizeof(struct sadb_address)*2 + sockaddr_size*2 + sizeof(struct sadb_x_sa2); + + if ((xfrm_ctx = x->security)) { + ctx_size = PFKEY_ALIGN8(xfrm_ctx->ctx_len); + size += sizeof(struct sadb_x_sec_ctx) + ctx_size; + } + /* identity & sensitivity */ if ((x->props.family == AF_INET && @@ -899,6 +962,20 @@ static struct sk_buff * pfkey_xfrm_state2msg(struct xfrm_state *x, int add_keys, n_port->sadb_x_nat_t_port_reserved = 0; } + /* security context */ + if (xfrm_ctx) { + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, + sizeof(struct sadb_x_sec_ctx) + ctx_size); + sec_ctx->sadb_x_sec_len = + (sizeof(struct sadb_x_sec_ctx) + ctx_size) / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + return skb; } @@ -909,6 +986,7 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, struct sadb_lifetime *lifetime; struct sadb_sa *sa; struct sadb_key *key; + struct sadb_x_sec_ctx *sec_ctx; uint16_t proto; int err; @@ -993,6 +1071,21 @@ static struct xfrm_state * pfkey_msg2xfrm_state(struct sadb_msg *hdr, x->lft.soft_add_expires_seconds = lifetime->sadb_lifetime_addtime; x->lft.soft_use_expires_seconds = lifetime->sadb_lifetime_usetime; } + + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + goto out; + + err = security_xfrm_state_alloc(x, uctx); + kfree(uctx); + + if (err) + goto out; + } + key = (struct sadb_key*) ext_hdrs[SADB_EXT_KEY_AUTH-1]; if (sa->sadb_sa_auth) { int keysize = 0; @@ -1720,6 +1813,18 @@ parse_ipsecrequests(struct xfrm_policy *xp, struct sadb_x_policy *pol) return 0; } +static inline int pfkey_xfrm_policy2sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + + if (xfrm_ctx) { + int len = sizeof(struct sadb_x_sec_ctx); + len += xfrm_ctx->ctx_len; + return PFKEY_ALIGN8(len); + } + return 0; +} + static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) { int sockaddr_size = pfkey_sockaddr_size(xp->family); @@ -1733,7 +1838,8 @@ static int pfkey_xfrm_policy2msg_size(struct xfrm_policy *xp) (sockaddr_size * 2) + sizeof(struct sadb_x_policy) + (xp->xfrm_nr * (sizeof(struct sadb_x_ipsecrequest) + - (socklen * 2))); + (socklen * 2))) + + pfkey_xfrm_policy2sec_ctx_size(xp); } static struct sk_buff * pfkey_xfrm_policy2msg_prep(struct xfrm_policy *xp) @@ -1757,6 +1863,8 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i struct sadb_lifetime *lifetime; struct sadb_x_policy *pol; struct sockaddr_in *sin; + struct sadb_x_sec_ctx *sec_ctx; + struct xfrm_sec_ctx *xfrm_ctx; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) struct sockaddr_in6 *sin6; #endif @@ -1941,6 +2049,21 @@ static void pfkey_xfrm_policy2msg(struct sk_buff *skb, struct xfrm_policy *xp, i } } } + + /* security context */ + if ((xfrm_ctx = xp->security)) { + int ctx_size = pfkey_xfrm_policy2sec_ctx_size(xp); + + sec_ctx = (struct sadb_x_sec_ctx *) skb_put(skb, ctx_size); + sec_ctx->sadb_x_sec_len = ctx_size / sizeof(uint64_t); + sec_ctx->sadb_x_sec_exttype = SADB_X_EXT_SEC_CTX; + sec_ctx->sadb_x_ctx_doi = xfrm_ctx->ctx_doi; + sec_ctx->sadb_x_ctx_alg = xfrm_ctx->ctx_alg; + sec_ctx->sadb_x_ctx_len = xfrm_ctx->ctx_len; + memcpy(sec_ctx + 1, xfrm_ctx->ctx_str, + xfrm_ctx->ctx_len); + } + hdr->sadb_msg_len = size / sizeof(uint64_t); hdr->sadb_msg_reserved = atomic_read(&xp->refcnt); } @@ -1976,12 +2099,13 @@ out: static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *hdr, void **ext_hdrs) { - int err; + int err = 0; struct sadb_lifetime *lifetime; struct sadb_address *sa; struct sadb_x_policy *pol; struct xfrm_policy *xp; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2028,6 +2152,22 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h if (xp->selector.dport) xp->selector.dport_mask = ~0; + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) { + err = -ENOBUFS; + goto out; + } + + err = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (err) + goto out; + } + xp->lft.soft_byte_limit = XFRM_INF; xp->lft.hard_byte_limit = XFRM_INF; xp->lft.soft_packet_limit = XFRM_INF; @@ -2051,10 +2191,9 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h err = xfrm_policy_insert(pol->sadb_x_policy_dir-1, xp, hdr->sadb_msg_type != SADB_X_SPDUPDATE); - if (err) { - kfree(xp); - return err; - } + + if (err) + goto out; if (hdr->sadb_msg_type == SADB_X_SPDUPDATE) c.event = XFRM_MSG_UPDPOLICY; @@ -2069,6 +2208,7 @@ static int pfkey_spdadd(struct sock *sk, struct sk_buff *skb, struct sadb_msg *h return 0; out: + security_xfrm_policy_free(xp); kfree(xp); return err; } @@ -2078,9 +2218,10 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg int err; struct sadb_address *sa; struct sadb_x_policy *pol; - struct xfrm_policy *xp; + struct xfrm_policy *xp, tmp; struct xfrm_selector sel; struct km_event c; + struct sadb_x_sec_ctx *sec_ctx; if (!present_and_same_family(ext_hdrs[SADB_EXT_ADDRESS_SRC-1], ext_hdrs[SADB_EXT_ADDRESS_DST-1]) || @@ -2109,7 +2250,24 @@ static int pfkey_spddelete(struct sock *sk, struct sk_buff *skb, struct sadb_msg if (sel.dport) sel.dport_mask = ~0; - xp = xfrm_policy_bysel(pol->sadb_x_policy_dir-1, &sel, 1); + sec_ctx = (struct sadb_x_sec_ctx *) ext_hdrs[SADB_X_EXT_SEC_CTX-1]; + memset(&tmp, 0, sizeof(struct xfrm_policy)); + + if (sec_ctx != NULL) { + struct xfrm_user_sec_ctx *uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + + if (!uctx) + return -ENOMEM; + + err = security_xfrm_policy_alloc(&tmp, uctx); + kfree(uctx); + + if (err) + return err; + } + + xp = xfrm_policy_bysel_ctx(pol->sadb_x_policy_dir-1, &sel, tmp.security, 1); + security_xfrm_policy_free(&tmp); if (xp == NULL) return -ENOENT; @@ -2660,6 +2818,7 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, { struct xfrm_policy *xp; struct sadb_x_policy *pol = (struct sadb_x_policy*)data; + struct sadb_x_sec_ctx *sec_ctx; switch (family) { case AF_INET: @@ -2709,10 +2868,32 @@ static struct xfrm_policy *pfkey_compile_policy(u16 family, int opt, (*dir = parse_ipsecrequests(xp, pol)) < 0) goto out; + /* security context too */ + if (len >= (pol->sadb_x_policy_len*8 + + sizeof(struct sadb_x_sec_ctx))) { + char *p = (char *)pol; + struct xfrm_user_sec_ctx *uctx; + + p += pol->sadb_x_policy_len*8; + sec_ctx = (struct sadb_x_sec_ctx *)p; + if (len < pol->sadb_x_policy_len*8 + + sec_ctx->sadb_x_sec_len) + goto out; + if ((*dir = verify_sec_ctx_len(p))) + goto out; + uctx = pfkey_sadb2xfrm_user_sec_ctx(sec_ctx); + *dir = security_xfrm_policy_alloc(xp, uctx); + kfree(uctx); + + if (*dir) + goto out; + } + *dir = pol->sadb_x_policy_dir-1; return xp; out: + security_xfrm_policy_free(xp); kfree(xp); return NULL; } diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index d19e274..64a4473 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -10,7 +10,7 @@ * YOSHIFUJI Hideaki * Split up af-specific portion * Derek Atkins Add the post_input processor - * + * */ #include @@ -256,6 +256,7 @@ void __xfrm_policy_destroy(struct xfrm_policy *policy) if (del_timer(&policy->timer)) BUG(); + security_xfrm_policy_free(policy); kfree(policy); } EXPORT_SYMBOL(__xfrm_policy_destroy); @@ -350,7 +351,8 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL;) { - if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0) { + if (!delpol && memcmp(&policy->selector, &pol->selector, sizeof(pol->selector)) == 0 && + xfrm_sec_ctx_match(pol->security, policy->security)) { if (excl) { write_unlock_bh(&xfrm_policy_lock); return -EEXIST; @@ -416,14 +418,15 @@ int xfrm_policy_insert(int dir, struct xfrm_policy *policy, int excl) } EXPORT_SYMBOL(xfrm_policy_insert); -struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, - int delete) +struct xfrm_policy *xfrm_policy_bysel_ctx(int dir, struct xfrm_selector *sel, + struct xfrm_sec_ctx *ctx, int delete) { struct xfrm_policy *pol, **p; write_lock_bh(&xfrm_policy_lock); for (p = &xfrm_policy_list[dir]; (pol=*p)!=NULL; p = &pol->next) { - if (memcmp(sel, &pol->selector, sizeof(*sel)) == 0) { + if ((memcmp(sel, &pol->selector, sizeof(*sel)) == 0) && + (xfrm_sec_ctx_match(ctx, pol->security))) { xfrm_pol_hold(pol); if (delete) *p = pol->next; @@ -438,7 +441,7 @@ struct xfrm_policy *xfrm_policy_bysel(int dir, struct xfrm_selector *sel, } return pol; } -EXPORT_SYMBOL(xfrm_policy_bysel); +EXPORT_SYMBOL(xfrm_policy_bysel_ctx); struct xfrm_policy *xfrm_policy_byid(int dir, u32 id, int delete) { @@ -519,7 +522,7 @@ EXPORT_SYMBOL(xfrm_policy_walk); /* Find policy to apply to this flow. */ -static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, +static void xfrm_policy_lookup(struct flowi *fl, u32 sk_sid, u16 family, u8 dir, void **objp, atomic_t **obj_refp) { struct xfrm_policy *pol; @@ -533,9 +536,12 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, continue; match = xfrm_selector_match(sel, fl, family); + if (match) { - xfrm_pol_hold(pol); - break; + if (!security_xfrm_policy_lookup(pol, sk_sid, dir)) { + xfrm_pol_hold(pol); + break; + } } } read_unlock_bh(&xfrm_policy_lock); @@ -543,15 +549,37 @@ static void xfrm_policy_lookup(struct flowi *fl, u16 family, u8 dir, *obj_refp = &pol->refcnt; } -static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl) +static inline int policy_to_flow_dir(int dir) +{ + if (XFRM_POLICY_IN == FLOW_DIR_IN && + XFRM_POLICY_OUT == FLOW_DIR_OUT && + XFRM_POLICY_FWD == FLOW_DIR_FWD) + return dir; + switch (dir) { + default: + case XFRM_POLICY_IN: + return FLOW_DIR_IN; + case XFRM_POLICY_OUT: + return FLOW_DIR_OUT; + case XFRM_POLICY_FWD: + return FLOW_DIR_FWD; + }; +} + +static struct xfrm_policy *xfrm_sk_policy_lookup(struct sock *sk, int dir, struct flowi *fl, u32 sk_sid) { struct xfrm_policy *pol; read_lock_bh(&xfrm_policy_lock); if ((pol = sk->sk_policy[dir]) != NULL) { - int match = xfrm_selector_match(&pol->selector, fl, + int match = xfrm_selector_match(&pol->selector, fl, sk->sk_family); + int err = 0; + if (match) + err = security_xfrm_policy_lookup(pol, sk_sid, policy_to_flow_dir(dir)); + + if (match && !err) xfrm_pol_hold(pol); else pol = NULL; @@ -624,6 +652,10 @@ static struct xfrm_policy *clone_policy(struct xfrm_policy *old, int dir) if (newp) { newp->selector = old->selector; + if (security_xfrm_policy_clone(old, newp)) { + kfree(newp); + return NULL; /* ENOMEM */ + } newp->lft = old->lft; newp->curlft = old->curlft; newp->action = old->action; @@ -735,22 +767,6 @@ xfrm_bundle_create(struct xfrm_policy *policy, struct xfrm_state **xfrm, int nx, return err; } -static inline int policy_to_flow_dir(int dir) -{ - if (XFRM_POLICY_IN == FLOW_DIR_IN && - XFRM_POLICY_OUT == FLOW_DIR_OUT && - XFRM_POLICY_FWD == FLOW_DIR_FWD) - return dir; - switch (dir) { - default: - case XFRM_POLICY_IN: - return FLOW_DIR_IN; - case XFRM_POLICY_OUT: - return FLOW_DIR_OUT; - case XFRM_POLICY_FWD: - return FLOW_DIR_FWD; - }; -} static int stale_bundle(struct dst_entry *dst); @@ -769,19 +785,20 @@ int xfrm_lookup(struct dst_entry **dst_p, struct flowi *fl, int err; u32 genid; u16 family = dst_orig->ops->family; + u8 dir = policy_to_flow_dir(XFRM_POLICY_OUT); + u32 sk_sid = security_sk_sid(sk, fl, dir); restart: genid = atomic_read(&flow_cache_genid); policy = NULL; if (sk && sk->sk_policy[1]) - policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl); + policy = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl, sk_sid); if (!policy) { /* To accelerate a bit... */ if ((dst_orig->flags & DST_NOXFRM) || !xfrm_policy_list[XFRM_POLICY_OUT]) return 0; - policy = flow_cache_lookup(fl, family, - policy_to_flow_dir(XFRM_POLICY_OUT), + policy = flow_cache_lookup(fl, sk_sid, family, dir, xfrm_policy_lookup); } @@ -962,16 +979,20 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, { struct xfrm_policy *pol; struct flowi fl; + u8 fl_dir = policy_to_flow_dir(dir); + u32 sk_sid; if (_decode_session(skb, &fl, family) < 0) return 0; + sk_sid = security_sk_sid(sk, &fl, fl_dir); + /* First, check used SA against their selectors. */ if (skb->sp) { int i; for (i=skb->sp->len-1; i>=0; i--) { - struct sec_decap_state *xvec = &(skb->sp->x[i]); + struct sec_decap_state *xvec = &(skb->sp->x[i]); if (!xfrm_selector_match(&xvec->xvec->sel, &fl, family)) return 0; @@ -986,11 +1007,10 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, pol = NULL; if (sk && sk->sk_policy[dir]) - pol = xfrm_sk_policy_lookup(sk, dir, &fl); + pol = xfrm_sk_policy_lookup(sk, dir, &fl, sk_sid); if (!pol) - pol = flow_cache_lookup(&fl, family, - policy_to_flow_dir(dir), + pol = flow_cache_lookup(&fl, sk_sid, family, fl_dir, xfrm_policy_lookup); if (!pol) diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 479effc..e12d0be 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -10,7 +10,7 @@ * Split up af-specific functions * Derek Atkins * Add UDP Encapsulation - * + * */ #include @@ -70,6 +70,7 @@ static void xfrm_state_gc_destroy(struct xfrm_state *x) x->type->destructor(x); xfrm_put_type(x->type); } + security_xfrm_state_free(x); kfree(x); } @@ -343,7 +344,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, selector. */ if (x->km.state == XFRM_STATE_VALID) { - if (!xfrm_selector_match(&x->sel, fl, family)) + if (!xfrm_selector_match(&x->sel, fl, family) || + !xfrm_sec_ctx_match(pol->security, x->security)) continue; if (!best || best->km.dying > x->km.dying || @@ -354,7 +356,8 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, acquire_in_progress = 1; } else if (x->km.state == XFRM_STATE_ERROR || x->km.state == XFRM_STATE_EXPIRED) { - if (xfrm_selector_match(&x->sel, fl, family)) + if (xfrm_selector_match(&x->sel, fl, family) && + xfrm_sec_ctx_match(pol->security, x->security)) error = -ESRCH; } } diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 0cdd9a0..92e2b80 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -7,7 +7,7 @@ * Kazunori MIYAZAWA @USAGI * Kunihiro Ishiguro * IPv6 support - * + * */ #include @@ -88,6 +88,34 @@ static int verify_encap_tmpl(struct rtattr **xfrma) return 0; } + +static inline int verify_sec_ctx_len(struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX - 1]; + struct xfrm_user_sec_ctx *uctx; + int len = 0; + + if (!rt) + return 0; + + if (rt->rta_len < sizeof(*uctx)) + return -EINVAL; + + uctx = RTA_DATA(rt); + + if (uctx->ctx_len > PAGE_SIZE) + return -EINVAL; + + len += sizeof(struct xfrm_user_sec_ctx); + len += uctx->ctx_len; + + if (uctx->len != len) + return -EINVAL; + + return 0; +} + + static int verify_newsa_info(struct xfrm_usersa_info *p, struct rtattr **xfrma) { @@ -145,6 +173,8 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, goto out; if ((err = verify_encap_tmpl(xfrma))) goto out; + if ((err = verify_sec_ctx_len(xfrma))) + goto out; err = -EINVAL; switch (p->mode) { @@ -209,6 +239,30 @@ static int attach_encap_tmpl(struct xfrm_encap_tmpl **encapp, struct rtattr *u_a return 0; } + +static inline int xfrm_user_sec_ctx_size(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *xfrm_ctx = xp->security; + int len = 0; + + if (xfrm_ctx) { + len += sizeof(struct xfrm_user_sec_ctx); + len += xfrm_ctx->ctx_len; + } + return len; +} + +static int attach_sec_ctx(struct xfrm_state *x, struct rtattr *u_arg) +{ + struct xfrm_user_sec_ctx *uctx; + + if (!u_arg) + return 0; + + uctx = RTA_DATA(u_arg); + return security_xfrm_state_alloc(x, uctx); +} + static void copy_from_user_state(struct xfrm_state *x, struct xfrm_usersa_info *p) { memcpy(&x->id, &p->id, sizeof(x->id)); @@ -253,6 +307,9 @@ static struct xfrm_state *xfrm_state_construct(struct xfrm_usersa_info *p, if (err) goto error; + if ((err = attach_sec_ctx(x, xfrma[XFRMA_SEC_CTX-1]))) + goto error; + x->km.seq = p->seq; return x; @@ -272,11 +329,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfrma) int err; struct km_event c; - err = verify_newsa_info(p, (struct rtattr **) xfrma); + err = verify_newsa_info(p, (struct rtattr **)xfrma); if (err) return err; - x = xfrm_state_construct(p, (struct rtattr **) xfrma, &err); + x = xfrm_state_construct(p, (struct rtattr **)xfrma, &err); if (!x) return err; @@ -390,6 +447,19 @@ static int dump_one_state(struct xfrm_state *x, int count, void *ptr) if (x->encap) RTA_PUT(skb, XFRMA_ENCAP, sizeof(*x->encap), x->encap); + if (x->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + x->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = x->security->ctx_doi; + uctx->ctx_alg = x->security->ctx_alg; + uctx->ctx_len = x->security->ctx_len; + memcpy(uctx + 1, x->security->ctx_str, x->security->ctx_len); + } nlh->nlmsg_len = skb->tail - b; out: sp->this_idx++; @@ -603,6 +673,18 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) return verify_policy_dir(p->dir); } +static int copy_from_user_sec_ctx(struct xfrm_policy *pol, struct rtattr **xfrma) +{ + struct rtattr *rt = xfrma[XFRMA_SEC_CTX-1]; + struct xfrm_user_sec_ctx *uctx; + + if (!rt) + return 0; + + uctx = RTA_DATA(rt); + return security_xfrm_policy_alloc(pol, uctx); +} + static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, int nr) { @@ -681,7 +763,10 @@ static struct xfrm_policy *xfrm_policy_construct(struct xfrm_userpolicy_info *p, } copy_from_user_policy(xp, p); - err = copy_from_user_tmpl(xp, xfrma); + + if (!(err = copy_from_user_tmpl(xp, xfrma))) + err = copy_from_user_sec_ctx(xp, xfrma); + if (err) { *errp = err; kfree(xp); @@ -702,8 +787,11 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr err = verify_newpolicy_info(p); if (err) return err; + err = verify_sec_ctx_len((struct rtattr **)xfrma); + if (err) + return err; - xp = xfrm_policy_construct(p, (struct rtattr **) xfrma, &err); + xp = xfrm_policy_construct(p, (struct rtattr **)xfrma, &err); if (!xp) return err; @@ -761,6 +849,27 @@ rtattr_failure: return -1; } +static int copy_to_user_sec_ctx(struct xfrm_policy *xp, struct sk_buff *skb) +{ + if (xp->security) { + int ctx_size = sizeof(struct xfrm_sec_ctx) + + xp->security->ctx_len; + struct rtattr *rt = __RTA_PUT(skb, XFRMA_SEC_CTX, ctx_size); + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + uctx->exttype = XFRMA_SEC_CTX; + uctx->len = ctx_size; + uctx->ctx_doi = xp->security->ctx_doi; + uctx->ctx_alg = xp->security->ctx_alg; + uctx->ctx_len = xp->security->ctx_len; + memcpy(uctx + 1, xp->security->ctx_str, xp->security->ctx_len); + } + return 0; + + rtattr_failure: + return -1; +} + static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr) { struct xfrm_dump_info *sp = ptr; @@ -782,6 +891,8 @@ static int dump_one_policy(struct xfrm_policy *xp, int dir, int count, void *ptr copy_to_user_policy(xp, p, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; out: @@ -852,8 +963,25 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, void **xfr if (p->index) xp = xfrm_policy_byid(p->dir, p->index, delete); - else - xp = xfrm_policy_bysel(p->dir, &p->sel, delete); + else { + struct rtattr **rtattrs = (struct rtattr **)xfrma; + struct rtattr *rt = rtattrs[XFRMA_SEC_CTX-1]; + struct xfrm_policy tmp; + + err = verify_sec_ctx_len(rtattrs); + if (err) + return err; + + memset(&tmp, 0, sizeof(struct xfrm_policy)); + if (rt) { + struct xfrm_user_sec_ctx *uctx = RTA_DATA(rt); + + if ((err = security_xfrm_policy_alloc(&tmp, uctx))) + return err; + } + xp = xfrm_policy_bysel_ctx(p->dir, &p->sel, tmp.security, delete); + security_xfrm_policy_free(&tmp); + } if (xp == NULL) return -ENOENT; @@ -1224,6 +1352,8 @@ static int build_acquire(struct sk_buff *skb, struct xfrm_state *x, if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; nlh->nlmsg_len = skb->tail - b; return skb->len; @@ -1241,6 +1371,7 @@ static int xfrm_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *xt, len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_acquire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; @@ -1324,6 +1455,8 @@ static int build_polexpire(struct sk_buff *skb, struct xfrm_policy *xp, copy_to_user_policy(xp, &upe->pol, dir); if (copy_to_user_tmpl(xp, skb) < 0) goto nlmsg_failure; + if (copy_to_user_sec_ctx(xp, skb)) + goto nlmsg_failure; upe->hard = !!hard; nlh->nlmsg_len = skb->tail - b; @@ -1341,6 +1474,7 @@ static int xfrm_exp_policy_notify(struct xfrm_policy *xp, int dir, struct km_eve len = RTA_SPACE(sizeof(struct xfrm_user_tmpl) * xp->xfrm_nr); len += NLMSG_SPACE(sizeof(struct xfrm_user_polexpire)); + len += RTA_SPACE(xfrm_user_sec_ctx_size(xp)); skb = alloc_skb(len, GFP_ATOMIC); if (skb == NULL) return -ENOMEM; diff --git a/security/Kconfig b/security/Kconfig index 64d3f1e..34f5934 100644 --- a/security/Kconfig +++ b/security/Kconfig @@ -54,6 +54,19 @@ config SECURITY_NETWORK implement socket and networking access controls. If you are unsure how to answer this question, answer N. +config SECURITY_NETWORK_XFRM + bool "XFRM (IPSec) Networking Security Hooks" + depends on XFRM && SECURITY_NETWORK + help + This enables the XFRM (IPSec) networking security hooks. + If enabled, a security module can use these hooks to + implement per-packet access controls based on labels + derived from IPSec policy. Non-IPSec communications are + designated as unlabelled, and only sockets authorized + to communicate unlabelled data can send without using + IPSec. + If you are unsure how to answer this question, answer N. + config SECURITY_CAPABILITIES tristate "Default Linux Capabilities" depends on SECURITY diff --git a/security/dummy.c b/security/dummy.c index 3ca5f2b..a15c547 100644 --- a/security/dummy.c +++ b/security/dummy.c @@ -776,8 +776,42 @@ static inline int dummy_sk_alloc_security (struct sock *sk, int family, gfp_t pr static inline void dummy_sk_free_security (struct sock *sk) { } + +static unsigned int dummy_sk_getsid(struct sock *sk, struct flowi *fl, u8 dir) +{ + return 0; +} #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM +static int dummy_xfrm_policy_alloc_security(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static inline int dummy_xfrm_policy_clone_security(struct xfrm_policy *old, struct xfrm_policy *new) +{ + return 0; +} + +static void dummy_xfrm_policy_free_security(struct xfrm_policy *xp) +{ +} + +static int dummy_xfrm_state_alloc_security(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx) +{ + return 0; +} + +static void dummy_xfrm_state_free_security(struct xfrm_state *x) +{ +} + +static int dummy_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + return 0; +} +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ static int dummy_register_security (const char *name, struct security_operations *ops) { return -EINVAL; @@ -970,7 +1004,16 @@ void security_fixup_ops (struct security_operations *ops) set_to_dummy_if_null(ops, socket_getpeersec); set_to_dummy_if_null(ops, sk_alloc_security); set_to_dummy_if_null(ops, sk_free_security); -#endif /* CONFIG_SECURITY_NETWORK */ + set_to_dummy_if_null(ops, sk_getsid); + #endif /* CONFIG_SECURITY_NETWORK */ +#ifdef CONFIG_SECURITY_NETWORK_XFRM + set_to_dummy_if_null(ops, xfrm_policy_alloc_security); + set_to_dummy_if_null(ops, xfrm_policy_clone_security); + set_to_dummy_if_null(ops, xfrm_policy_free_security); + set_to_dummy_if_null(ops, xfrm_state_alloc_security); + set_to_dummy_if_null(ops, xfrm_state_free_security); + set_to_dummy_if_null(ops, xfrm_policy_lookup); +#endif /* CONFIG_SECURITY_NETWORK_XFRM */ #ifdef CONFIG_KEYS set_to_dummy_if_null(ops, key_alloc); set_to_dummy_if_null(ops, key_free); -- cgit v1.1 From d28d1e080132f28ab773291f10ad6acca4c8bba2 Mon Sep 17 00:00:00 2001 From: Trent Jaeger Date: Tue, 13 Dec 2005 23:12:40 -0800 Subject: [LSM-IPSec]: Per-packet access control. This patch series implements per packet access control via the extension of the Linux Security Modules (LSM) interface by hooks in the XFRM and pfkey subsystems that leverage IPSec security associations to label packets. Extensions to the SELinux LSM are included that leverage the patch for this purpose. This patch implements the changes necessary to the SELinux LSM to create, deallocate, and use security contexts for policies (xfrm_policy) and security associations (xfrm_state) that enable control of a socket's ability to send and receive packets. Patch purpose: The patch is designed to enable the SELinux LSM to implement access control on individual packets based on the strongly authenticated IPSec security association. Such access controls augment the existing ones in SELinux based on network interface and IP address. The former are very coarse-grained, and the latter can be spoofed. By using IPSec, the SELinux can control access to remote hosts based on cryptographic keys generated using the IPSec mechanism. This enables access control on a per-machine basis or per-application if the remote machine is running the same mechanism and trusted to enforce the access control policy. Patch design approach: The patch's main function is to authorize a socket's access to a IPSec policy based on their security contexts. Since the communication is implemented by a security association, the patch ensures that the security association's negotiated and used have the same security context. The patch enables allocation and deallocation of such security contexts for policies and security associations. It also enables copying of the security context when policies are cloned. Lastly, the patch ensures that packets that are sent without using a IPSec security assocation with a security context are allowed to be sent in that manner. A presentation available at www.selinux-symposium.org/2005/presentations/session2/2-3-jaeger.pdf from the SELinux symposium describes the overall approach. Patch implementation details: The function which authorizes a socket to perform a requested operation (send/receive) on a IPSec policy (xfrm_policy) is selinux_xfrm_policy_lookup. The Netfilter and rcv_skb hooks ensure that if a IPSec SA with a securit y association has not been used, then the socket is allowed to send or receive the packet, respectively. The patch implements SELinux function for allocating security contexts when policies (xfrm_policy) are created via the pfkey or xfrm_user interfaces via selinux_xfrm_policy_alloc. When a security association is built, SELinux allocates the security context designated by the XFRM subsystem which is based on that of the authorized policy via selinux_xfrm_state_alloc. When a xfrm_policy is cloned, the security context of that policy, if any, is copied to the clone via selinux_xfrm_policy_clone. When a xfrm_policy or xfrm_state is freed, its security context, if any is also freed at selinux_xfrm_policy_free or selinux_xfrm_state_free. Testing: The SELinux authorization function is tested using ipsec-tools. We created policies and security associations with particular security contexts and added SELinux access control policy entries to verify the authorization decision. We also made sure that packets for which no security context was supplied (which either did or did not use security associations) were authorized using an unlabelled context. Signed-off-by: Trent Jaeger Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- security/selinux/Makefile | 2 + security/selinux/hooks.c | 39 ++++ security/selinux/include/av_perm_to_string.h | 2 + security/selinux/include/av_permissions.h | 2 + security/selinux/include/xfrm.h | 54 +++++ security/selinux/xfrm.c | 311 +++++++++++++++++++++++++++ 6 files changed, 410 insertions(+) create mode 100644 security/selinux/include/xfrm.h create mode 100644 security/selinux/xfrm.c diff --git a/security/selinux/Makefile b/security/selinux/Makefile index b038cd0..06d54d9 100644 --- a/security/selinux/Makefile +++ b/security/selinux/Makefile @@ -8,5 +8,7 @@ selinux-y := avc.o hooks.o selinuxfs.o netlink.o nlmsgtab.o selinux-$(CONFIG_SECURITY_NETWORK) += netif.o +selinux-$(CONFIG_SECURITY_NETWORK_XFRM) += xfrm.o + EXTRA_CFLAGS += -Isecurity/selinux/include diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c index fc77443..3d496ea 100644 --- a/security/selinux/hooks.c +++ b/security/selinux/hooks.c @@ -73,6 +73,7 @@ #include "avc.h" #include "objsec.h" #include "netif.h" +#include "xfrm.h" #define XATTR_SELINUX_SUFFIX "selinux" #define XATTR_NAME_SELINUX XATTR_SECURITY_PREFIX XATTR_SELINUX_SUFFIX @@ -3349,6 +3350,10 @@ static int selinux_socket_sock_rcv_skb(struct sock *sk, struct sk_buff *skb) err = avc_has_perm(sock_sid, port_sid, sock_class, recv_perm, &ad); } + + if (!err) + err = selinux_xfrm_sock_rcv_skb(sock_sid, skb); + out: return err; } @@ -3401,6 +3406,24 @@ static void selinux_sk_free_security(struct sock *sk) sk_free_security(sk); } +static unsigned int selinux_sk_getsid_security(struct sock *sk, struct flowi *fl, u8 dir) +{ + struct inode_security_struct *isec; + u32 sock_sid = SECINITSID_ANY_SOCKET; + + if (!sk) + return selinux_no_sk_sid(fl); + + read_lock_bh(&sk->sk_callback_lock); + isec = get_sock_isec(sk); + + if (isec) + sock_sid = isec->sid; + + read_unlock_bh(&sk->sk_callback_lock); + return sock_sid; +} + static int selinux_nlmsg_perm(struct sock *sk, struct sk_buff *skb) { int err = 0; @@ -3536,6 +3559,11 @@ static unsigned int selinux_ip_postroute_last(unsigned int hooknum, send_perm, &ad) ? NF_DROP : NF_ACCEPT; } + if (err != NF_ACCEPT) + goto out; + + err = selinux_xfrm_postroute_last(isec->sid, skb); + out: return err; } @@ -4380,6 +4408,16 @@ static struct security_operations selinux_ops = { .socket_getpeersec = selinux_socket_getpeersec, .sk_alloc_security = selinux_sk_alloc_security, .sk_free_security = selinux_sk_free_security, + .sk_getsid = selinux_sk_getsid_security, +#endif + +#ifdef CONFIG_SECURITY_NETWORK_XFRM + .xfrm_policy_alloc_security = selinux_xfrm_policy_alloc, + .xfrm_policy_clone_security = selinux_xfrm_policy_clone, + .xfrm_policy_free_security = selinux_xfrm_policy_free, + .xfrm_state_alloc_security = selinux_xfrm_state_alloc, + .xfrm_state_free_security = selinux_xfrm_state_free, + .xfrm_policy_lookup = selinux_xfrm_policy_lookup, #endif }; @@ -4491,6 +4529,7 @@ static int __init selinux_nf_ip_init(void) panic("SELinux: nf_register_hook for IPv6: error %d\n", err); #endif /* IPV6 */ + out: return err; } diff --git a/security/selinux/include/av_perm_to_string.h b/security/selinux/include/av_perm_to_string.h index 1deb59e..71aeb12 100644 --- a/security/selinux/include/av_perm_to_string.h +++ b/security/selinux/include/av_perm_to_string.h @@ -238,3 +238,5 @@ S_(SECCLASS_NSCD, NSCD__SHMEMHOST, "shmemhost") S_(SECCLASS_ASSOCIATION, ASSOCIATION__SENDTO, "sendto") S_(SECCLASS_ASSOCIATION, ASSOCIATION__RECVFROM, "recvfrom") + S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELFROM, "relabelfrom") + S_(SECCLASS_ASSOCIATION, ASSOCIATION__RELABELTO, "relabelto") diff --git a/security/selinux/include/av_permissions.h b/security/selinux/include/av_permissions.h index a78b5d5..d1d0996 100644 --- a/security/selinux/include/av_permissions.h +++ b/security/selinux/include/av_permissions.h @@ -908,6 +908,8 @@ #define ASSOCIATION__SENDTO 0x00000001UL #define ASSOCIATION__RECVFROM 0x00000002UL +#define ASSOCIATION__RELABELFROM 0x00000004UL +#define ASSOCIATION__RELABELTO 0x00000008UL #define NETLINK_KOBJECT_UEVENT_SOCKET__IOCTL 0x00000001UL #define NETLINK_KOBJECT_UEVENT_SOCKET__READ 0x00000002UL diff --git a/security/selinux/include/xfrm.h b/security/selinux/include/xfrm.h new file mode 100644 index 0000000..8e87996 --- /dev/null +++ b/security/selinux/include/xfrm.h @@ -0,0 +1,54 @@ +/* + * SELinux support for the XFRM LSM hooks + * + * Author : Trent Jaeger, + */ +#ifndef _SELINUX_XFRM_H_ +#define _SELINUX_XFRM_H_ + +int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *sec_ctx); +int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new); +void selinux_xfrm_policy_free(struct xfrm_policy *xp); +int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *sec_ctx); +void selinux_xfrm_state_free(struct xfrm_state *x); +int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir); + +/* + * Extract the security blob from the sock (it's actually on the socket) + */ +static inline struct inode_security_struct *get_sock_isec(struct sock *sk) +{ + if (!sk->sk_socket) + return NULL; + + return SOCK_INODE(sk->sk_socket)->i_security; +} + + +static inline u32 selinux_no_sk_sid(struct flowi *fl) +{ + /* NOTE: no sock occurs on ICMP reply, forwards, ... */ + /* icmp_reply: authorize as kernel packet */ + if (fl && fl->proto == IPPROTO_ICMP) { + return SECINITSID_KERNEL; + } + + return SECINITSID_ANY_SOCKET; +} + +#ifdef CONFIG_SECURITY_NETWORK_XFRM +int selinux_xfrm_sock_rcv_skb(u32 sid, struct sk_buff *skb); +int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb); +#else +static inline int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb) +{ + return 0; +} + +static inline int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb) +{ + return NF_ACCEPT; +} +#endif + +#endif /* _SELINUX_XFRM_H_ */ diff --git a/security/selinux/xfrm.c b/security/selinux/xfrm.c new file mode 100644 index 0000000..c4d87d4 --- /dev/null +++ b/security/selinux/xfrm.c @@ -0,0 +1,311 @@ +/* + * NSA Security-Enhanced Linux (SELinux) security module + * + * This file contains the SELinux XFRM hook function implementations. + * + * Authors: Serge Hallyn + * Trent Jaeger + * + * Copyright (C) 2005 International Business Machines Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2, + * as published by the Free Software Foundation. + */ + +/* + * USAGE: + * NOTES: + * 1. Make sure to enable the following options in your kernel config: + * CONFIG_SECURITY=y + * CONFIG_SECURITY_NETWORK=y + * CONFIG_SECURITY_NETWORK_XFRM=y + * CONFIG_SECURITY_SELINUX=m/y + * ISSUES: + * 1. Caching packets, so they are not dropped during negotiation + * 2. Emulating a reasonable SO_PEERSEC across machines + * 3. Testing addition of sk_policy's with security context via setsockopt + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "avc.h" +#include "objsec.h" +#include "xfrm.h" + + +/* + * Returns true if an LSM/SELinux context + */ +static inline int selinux_authorizable_ctx(struct xfrm_sec_ctx *ctx) +{ + return (ctx && + (ctx->ctx_doi == XFRM_SC_DOI_LSM) && + (ctx->ctx_alg == XFRM_SC_ALG_SELINUX)); +} + +/* + * Returns true if the xfrm contains a security blob for SELinux + */ +static inline int selinux_authorizable_xfrm(struct xfrm_state *x) +{ + return selinux_authorizable_ctx(x->security); +} + +/* + * LSM hook implementation that authorizes that a socket can be used + * with the corresponding xfrm_sec_ctx and direction. + */ +int selinux_xfrm_policy_lookup(struct xfrm_policy *xp, u32 sk_sid, u8 dir) +{ + int rc = 0; + u32 sel_sid = SECINITSID_UNLABELED; + struct xfrm_sec_ctx *ctx; + + /* Context sid is either set to label or ANY_ASSOC */ + if ((ctx = xp->security)) { + if (!selinux_authorizable_ctx(ctx)) + return -EINVAL; + + sel_sid = ctx->ctx_sid; + } + + rc = avc_has_perm(sk_sid, sel_sid, SECCLASS_ASSOCIATION, + ((dir == FLOW_DIR_IN) ? ASSOCIATION__RECVFROM : + ((dir == FLOW_DIR_OUT) ? ASSOCIATION__SENDTO : + (ASSOCIATION__SENDTO | ASSOCIATION__RECVFROM))), + NULL); + + return rc; +} + +/* + * Security blob allocation for xfrm_policy and xfrm_state + * CTX does not have a meaningful value on input + */ +static int selinux_xfrm_sec_ctx_alloc(struct xfrm_sec_ctx **ctxp, struct xfrm_user_sec_ctx *uctx) +{ + int rc = 0; + struct task_security_struct *tsec = current->security; + struct xfrm_sec_ctx *ctx; + + BUG_ON(!uctx); + BUG_ON(uctx->ctx_doi != XFRM_SC_ALG_SELINUX); + + if (uctx->ctx_len >= PAGE_SIZE) + return -ENOMEM; + + *ctxp = ctx = kmalloc(sizeof(*ctx) + + uctx->ctx_len, + GFP_KERNEL); + + if (!ctx) + return -ENOMEM; + + ctx->ctx_doi = uctx->ctx_doi; + ctx->ctx_len = uctx->ctx_len; + ctx->ctx_alg = uctx->ctx_alg; + + memcpy(ctx->ctx_str, + uctx+1, + ctx->ctx_len); + rc = security_context_to_sid(ctx->ctx_str, + ctx->ctx_len, + &ctx->ctx_sid); + + if (rc) + goto out; + + /* + * Does the subject have permission to set security or permission to + * do the relabel? + * Must be permitted to relabel from default socket type (process type) + * to specified context + */ + rc = avc_has_perm(tsec->sid, tsec->sid, + SECCLASS_ASSOCIATION, + ASSOCIATION__RELABELFROM, NULL); + if (rc) + goto out; + + rc = avc_has_perm(tsec->sid, ctx->ctx_sid, + SECCLASS_ASSOCIATION, + ASSOCIATION__RELABELTO, NULL); + if (rc) + goto out; + + return rc; + +out: + *ctxp = 0; + kfree(ctx); + return rc; +} + +/* + * LSM hook implementation that allocs and transfers uctx spec to + * xfrm_policy. + */ +int selinux_xfrm_policy_alloc(struct xfrm_policy *xp, struct xfrm_user_sec_ctx *uctx) +{ + int err; + + BUG_ON(!xp); + + err = selinux_xfrm_sec_ctx_alloc(&xp->security, uctx); + return err; +} + + +/* + * LSM hook implementation that copies security data structure from old to + * new for policy cloning. + */ +int selinux_xfrm_policy_clone(struct xfrm_policy *old, struct xfrm_policy *new) +{ + struct xfrm_sec_ctx *old_ctx, *new_ctx; + + old_ctx = old->security; + + if (old_ctx) { + new_ctx = new->security = kmalloc(sizeof(*new_ctx) + + old_ctx->ctx_len, + GFP_KERNEL); + + if (!new_ctx) + return -ENOMEM; + + memcpy(new_ctx, old_ctx, sizeof(*new_ctx)); + memcpy(new_ctx->ctx_str, old_ctx->ctx_str, new_ctx->ctx_len); + } + return 0; +} + +/* + * LSM hook implementation that frees xfrm_policy security information. + */ +void selinux_xfrm_policy_free(struct xfrm_policy *xp) +{ + struct xfrm_sec_ctx *ctx = xp->security; + if (ctx) + kfree(ctx); +} + +/* + * LSM hook implementation that allocs and transfers sec_ctx spec to + * xfrm_state. + */ +int selinux_xfrm_state_alloc(struct xfrm_state *x, struct xfrm_user_sec_ctx *uctx) +{ + int err; + + BUG_ON(!x); + + err = selinux_xfrm_sec_ctx_alloc(&x->security, uctx); + return err; +} + +/* + * LSM hook implementation that frees xfrm_state security information. + */ +void selinux_xfrm_state_free(struct xfrm_state *x) +{ + struct xfrm_sec_ctx *ctx = x->security; + if (ctx) + kfree(ctx); +} + +/* + * LSM hook that controls access to unlabelled packets. If + * a xfrm_state is authorizable (defined by macro) then it was + * already authorized by the IPSec process. If not, then + * we need to check for unlabelled access since this may not have + * gone thru the IPSec process. + */ +int selinux_xfrm_sock_rcv_skb(u32 isec_sid, struct sk_buff *skb) +{ + int i, rc = 0; + struct sec_path *sp; + + sp = skb->sp; + + if (sp) { + /* + * __xfrm_policy_check does not approve unless xfrm_policy_ok + * says that spi's match for policy and the socket. + * + * Only need to verify the existence of an authorizable sp. + */ + for (i = 0; i < sp->len; i++) { + struct xfrm_state *x = sp->x[i].xvec; + + if (x && selinux_authorizable_xfrm(x)) + goto accept; + } + } + + /* check SELinux sock for unlabelled access */ + rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION, + ASSOCIATION__RECVFROM, NULL); + if (rc) + goto drop; + +accept: + return 0; + +drop: + return rc; +} + +/* + * POSTROUTE_LAST hook's XFRM processing: + * If we have no security association, then we need to determine + * whether the socket is allowed to send to an unlabelled destination. + * If we do have a authorizable security association, then it has already been + * checked in xfrm_policy_lookup hook. + */ +int selinux_xfrm_postroute_last(u32 isec_sid, struct sk_buff *skb) +{ + struct dst_entry *dst; + int rc = 0; + + dst = skb->dst; + + if (dst) { + struct dst_entry *dst_test; + + for (dst_test = dst; dst_test != 0; + dst_test = dst_test->child) { + struct xfrm_state *x = dst_test->xfrm; + + if (x && selinux_authorizable_xfrm(x)) + goto accept; + } + } + + rc = avc_has_perm(isec_sid, SECINITSID_UNLABELED, SECCLASS_ASSOCIATION, + ASSOCIATION__SENDTO, NULL); + if (rc) + goto drop; + +accept: + return NF_ACCEPT; + +drop: + return NF_DROP; +} -- cgit v1.1 From 018da8f44c066d5fc390011b8c953135f973b3a9 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Dec 2005 23:13:00 -0800 Subject: [TCP] BIC: remove low utilization code. The latest BICTCP patch at: http://www.csc.ncsu.edu:8080/faculty/rhee/export/bitcp/index_files/Page546.htm disables the low_utilization feature of BICTCP because it doesn't work in some cases. This patch removes it. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 81 +----------------------------------------------------- 1 file changed, 1 insertion(+), 80 deletions(-) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index 1d0cd86..cf8c75f 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -30,8 +30,6 @@ static int fast_convergence = 1; static int max_increment = 16; static int low_window = 14; static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ -static int low_utilization_threshold = 153; -static int low_utilization_period = 2; static int initial_ssthresh = 100; static int smooth_part = 20; @@ -43,10 +41,6 @@ module_param(low_window, int, 0644); MODULE_PARM_DESC(low_window, "lower bound on congestion window (for TCP friendliness)"); module_param(beta, int, 0644); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); -module_param(low_utilization_threshold, int, 0644); -MODULE_PARM_DESC(low_utilization_threshold, "percent (scaled by 1024) for low utilization mode"); -module_param(low_utilization_period, int, 0644); -MODULE_PARM_DESC(low_utilization_period, "if average delay exceeds then goto to low utilization mode (seconds)"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); module_param(smooth_part, int, 0644); @@ -60,11 +54,6 @@ struct bictcp { u32 loss_cwnd; /* congestion window at last loss */ u32 last_cwnd; /* the last snd_cwnd */ u32 last_time; /* time when updated last_cwnd */ - u32 delay_min; /* min delay */ - u32 delay_max; /* max delay */ - u32 last_delay; - u8 low_utilization;/* 0: high; 1: low */ - u32 low_utilization_start; /* starting time of low utilization detection*/ u32 epoch_start; /* beginning of an epoch */ #define ACK_RATIO_SHIFT 4 u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ @@ -77,11 +66,6 @@ static inline void bictcp_reset(struct bictcp *ca) ca->loss_cwnd = 0; ca->last_cwnd = 0; ca->last_time = 0; - ca->delay_min = 0; - ca->delay_max = 0; - ca->last_delay = 0; - ca->low_utilization = 0; - ca->low_utilization_start = 0; ca->epoch_start = 0; ca->delayed_ack = 2 << ACK_RATIO_SHIFT; } @@ -143,8 +127,7 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) } /* if in slow start or link utilization is very low */ - if ( ca->loss_cwnd == 0 || - (cwnd > ca->loss_cwnd && ca->low_utilization)) { + if (ca->loss_cwnd == 0) { if (ca->cnt > 20) /* increase cwnd 5% per RTT */ ca->cnt = 20; } @@ -154,69 +137,12 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->cnt = 1; } - -/* Detect low utilization in congestion avoidance */ -static inline void bictcp_low_utilization(struct sock *sk, int flag) -{ - const struct tcp_sock *tp = tcp_sk(sk); - struct bictcp *ca = inet_csk_ca(sk); - u32 dist, delay; - - /* No time stamp */ - if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || - /* Discard delay samples right after fast recovery */ - tcp_time_stamp < ca->epoch_start + HZ || - /* this delay samples may not be accurate */ - flag == 0) { - ca->last_delay = 0; - goto notlow; - } - - delay = ca->last_delay<<3; /* use the same scale as tp->srtt*/ - ca->last_delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; - if (delay == 0) /* no previous delay sample */ - goto notlow; - - /* first time call or link delay decreases */ - if (ca->delay_min == 0 || ca->delay_min > delay) { - ca->delay_min = ca->delay_max = delay; - goto notlow; - } - - if (ca->delay_max < delay) - ca->delay_max = delay; - - /* utilization is low, if avg delay < dist*threshold - for checking_period time */ - dist = ca->delay_max - ca->delay_min; - if (dist <= ca->delay_min>>6 || - tp->srtt - ca->delay_min >= (dist*low_utilization_threshold)>>10) - goto notlow; - - if (ca->low_utilization_start == 0) { - ca->low_utilization = 0; - ca->low_utilization_start = tcp_time_stamp; - } else if ((s32)(tcp_time_stamp - ca->low_utilization_start) - > low_utilization_period*HZ) { - ca->low_utilization = 1; - } - - return; - - notlow: - ca->low_utilization = 0; - ca->low_utilization_start = 0; - -} - static void bictcp_cong_avoid(struct sock *sk, u32 ack, u32 seq_rtt, u32 in_flight, int data_acked) { struct tcp_sock *tp = tcp_sk(sk); struct bictcp *ca = inet_csk_ca(sk); - bictcp_low_utilization(sk, data_acked); - if (!tcp_is_cwnd_limited(sk, in_flight)) return; @@ -249,11 +175,6 @@ static u32 bictcp_recalc_ssthresh(struct sock *sk) ca->epoch_start = 0; /* end of epoch */ - /* in case of wrong delay_max*/ - if (ca->delay_min > 0 && ca->delay_max > ca->delay_min) - ca->delay_max = ca->delay_min - + ((ca->delay_max - ca->delay_min)* 90) / 100; - /* Wmax and fast convergence */ if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) -- cgit v1.1 From 05d054503a9c4652212b8730150608787547ecc3 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Dec 2005 23:13:13 -0800 Subject: [TCP] BIC: spelling and whitespace Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_bic.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/tcp_bic.c b/net/ipv4/tcp_bic.c index cf8c75f..035f209 100644 --- a/net/ipv4/tcp_bic.c +++ b/net/ipv4/tcp_bic.c @@ -210,14 +210,14 @@ static void bictcp_state(struct sock *sk, u8 new_state) bictcp_reset(inet_csk_ca(sk)); } -/* Track delayed acknowledgement ratio using sliding window +/* Track delayed acknowledgment ratio using sliding window * ratio = (15*ratio + sample) / 16 */ static void bictcp_acked(struct sock *sk, u32 cnt) { const struct inet_connection_sock *icsk = inet_csk(sk); - if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { struct bictcp *ca = inet_csk_ca(sk); cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; ca->delayed_ack += cnt; -- cgit v1.1 From df3271f3361b61ce02da0026b4a53e63bc2720cb Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Dec 2005 23:13:28 -0800 Subject: [TCP] BIC: CUBIC window growth (2.0) Replace existing BIC version 1.1 with new version 2.0. The main change is to replace the window growth function with a cubic function as described in: http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/Kconfig | 8 + net/ipv4/Makefile | 1 + net/ipv4/tcp_cubic.c | 445 +++++++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 454 insertions(+) create mode 100644 net/ipv4/tcp_cubic.c diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig index e55136a..011cca7 100644 --- a/net/ipv4/Kconfig +++ b/net/ipv4/Kconfig @@ -456,6 +456,14 @@ config TCP_CONG_BIC increase provides TCP friendliness. See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/ +config TCP_CONG_CUBIC + tristate "CUBIC TCP" + default m + ---help--- + This is version 2.0 of BIC-TCP which uses a cubic growth function + among other techniques. + See http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + config TCP_CONG_WESTWOOD tristate "TCP Westwood+" default m diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index f0435d0..c54edd7 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -34,6 +34,7 @@ obj-$(CONFIG_INET_DIAG) += inet_diag.o obj-$(CONFIG_IP_ROUTE_MULTIPATH_CACHED) += multipath.o obj-$(CONFIG_INET_TCP_DIAG) += tcp_diag.o obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o +obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o obj-$(CONFIG_TCP_CONG_WESTWOOD) += tcp_westwood.o obj-$(CONFIG_TCP_CONG_HSTCP) += tcp_highspeed.o obj-$(CONFIG_TCP_CONG_HYBLA) += tcp_hybla.o diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c new file mode 100644 index 0000000..bb5dc4b --- /dev/null +++ b/net/ipv4/tcp_cubic.c @@ -0,0 +1,445 @@ +/* + * TCP CUBIC: Binary Increase Congestion control for TCP v2.0 + * + * This is from the implementation of CUBIC TCP in + * Injong Rhee, Lisong Xu. + * "CUBIC: A New TCP-Friendly High-Speed TCP Variant + * in PFLDnet 2005 + * Available from: + * http://www.csc.ncsu.edu/faculty/rhee/export/bitcp/cubic-paper.pdf + * + * Unless CUBIC is enabled and congestion window is large + * this behaves the same as the original Reno. + */ + +#include +#include +#include +#include + + +#define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation + * max_cwnd = snd_cwnd * beta + */ +#define BICTCP_B 4 /* + * In binary search, + * go to point (max+min)/N + */ +#define BICTCP_HZ 10 /* BIC HZ 2^10 = 1024 */ + +static int fast_convergence = 1; +static int max_increment = 16; +static int beta = 819; /* = 819/1024 (BICTCP_BETA_SCALE) */ +static int initial_ssthresh = 100; +static int bic_scale = 41; +static int tcp_friendliness = 1; + +module_param(fast_convergence, int, 0644); +MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); +module_param(max_increment, int, 0644); +MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); +module_param(beta, int, 0644); +MODULE_PARM_DESC(beta, "beta for multiplicative increase"); +module_param(initial_ssthresh, int, 0644); +MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); +module_param(bic_scale, int, 0644); +MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); +module_param(tcp_friendliness, int, 0644); +MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); + + +/* BIC TCP Parameters */ +struct bictcp { + u32 cnt; /* increase cwnd by 1 after ACKs */ + u32 last_max_cwnd; /* last maximum snd_cwnd */ + u32 loss_cwnd; /* congestion window at last loss */ + u32 last_cwnd; /* the last snd_cwnd */ + u32 last_time; /* time when updated last_cwnd */ + u32 bic_origin_point;/* origin point of bic function */ + u32 bic_K; /* time to origin point from the beginning of the current epoch */ + u32 delay_min; /* min delay */ + u32 epoch_start; /* beginning of an epoch */ + u32 ack_cnt; /* number of acks */ + u32 tcp_cwnd; /* estimated tcp cwnd */ +#define ACK_RATIO_SHIFT 4 + u32 delayed_ack; /* estimate the ratio of Packets/ACKs << 4 */ +}; + +static inline void bictcp_reset(struct bictcp *ca) +{ + ca->cnt = 0; + ca->last_max_cwnd = 0; + ca->loss_cwnd = 0; + ca->last_cwnd = 0; + ca->last_time = 0; + ca->bic_origin_point = 0; + ca->bic_K = 0; + ca->delay_min = 0; + ca->epoch_start = 0; + ca->delayed_ack = 2 << ACK_RATIO_SHIFT; + ca->ack_cnt = 0; + ca->tcp_cwnd = 0; +} + +static void bictcp_init(struct sock *sk) +{ + bictcp_reset(inet_csk_ca(sk)); + if (initial_ssthresh) + tcp_sk(sk)->snd_ssthresh = initial_ssthresh; +} + +/* 65536 times the cubic root */ +static const u64 cubic_table[8] + = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; + +/* + * calculate the cubic root of x + * the basic idea is that x can be expressed as i*8^j + * so cubic_root(x) = cubic_root(i)*2^j + * in the following code, x is i, and y is 2^j + * because of integer calculation, there are errors in calculation + * so finally use binary search to find out the exact solution + */ +static u32 cubic_root(u64 x) +{ + u64 y, app, target, start, end, mid, start_diff, end_diff; + + if (x == 0) + return 0; + + target = x; + + /* first estimate lower and upper bound */ + y = 1; + while (x >= 8){ + x = (x >> 3); + y = (y << 1); + } + start = (y*cubic_table[x])>>16; + if (x==7) + end = (y<<1); + else + end = (y*cubic_table[x+1]+65535)>>16; + + /* binary search for more accurate one */ + while (start < end-1) { + mid = (start+end) >> 1; + app = mid*mid*mid; + if (app < target) + start = mid; + else if (app > target) + end = mid; + else + return mid; + } + + /* find the most accurate one from start and end */ + app = start*start*start; + if (app < target) + start_diff = target - app; + else + start_diff = app - target; + app = end*end*end; + if (app < target) + end_diff = target - app; + else + end_diff = app - target; + + if (start_diff < end_diff) + return (u32)start; + else + return (u32)end; +} + +static inline u32 bictcp_K(u32 dist, u32 srtt) +{ + u64 d64; + u32 d32; + u32 count; + u32 result; + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + so K = cubic_root( (wmax-cwnd)*rtt/c ) + the unit of K is bictcp_HZ=2^10, not HZ + + c = bic_scale >> 10 + rtt = (tp->srtt >> 3 ) / HZ + + the following code has been designed and tested for + cwnd < 1 million packets + RTT < 100 seconds + HZ < 1,000,00 (corresponding to 10 nano-second) + + */ + + /* 1/c * 2^2*bictcp_HZ */ + d32 = (1 << (10+2*BICTCP_HZ)) / bic_scale; + d64 = (__u64)d32; + + /* srtt * 2^count / HZ + 1) to get a better accuracy of the following d32, + the larger the "count", the better the accuracy + 2) and avoid overflow of the following d64 + the larger the "count", the high possibility of overflow + 3) so find a "count" between bictcp_hz-3 and bictcp_hz + "count" may be less than bictcp_HZ, + then d64 becomes 0. that is OK + */ + d32 = srtt; + count = 0; + while (((d32 & 0x80000000)==0) && (count < BICTCP_HZ)){ + d32 = d32 << 1; + count++; + } + d32 = d32 / HZ; + + /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */ + d64 = (d64 * dist * d32) >> (count+3-BICTCP_HZ); + + /* cubic root */ + d64 = cubic_root(d64); + + result = (u32)d64; + return result; +} + +/* + * Compute congestion window to use. + */ +static inline void bictcp_update(struct bictcp *ca, u32 cwnd) +{ + u64 d64; + u32 d32, t, srtt, bic_target, min_cnt, max_cnt; + + ca->ack_cnt++; /* count the number of ACKs */ + + if (ca->last_cwnd == cwnd && + (s32)(tcp_time_stamp - ca->last_time) <= HZ / 32) + return; + + ca->last_cwnd = cwnd; + ca->last_time = tcp_time_stamp; + + srtt = (HZ << 3)/10; /* use real time-based growth function */ + + if (ca->epoch_start == 0) { + ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ + ca->ack_cnt = 1; /* start counting */ + ca->tcp_cwnd = cwnd; /* syn with cubic */ + + if (ca->last_max_cwnd <= cwnd) { + ca->bic_K = 0; + ca->bic_origin_point = cwnd; + } else { + ca->bic_K = bictcp_K(ca->last_max_cwnd-cwnd, srtt); + ca->bic_origin_point = ca->last_max_cwnd; + } + } + + /* cubic function - calc*/ + /* calculate c * time^3 / rtt, + * while considering overflow in calculation of time^3 + * (so time^3 is done by using d64) + * and without the support of division of 64bit numbers + * (so all divisions are done by using d32) + * also NOTE the unit of those veriables + * time = (t - K) / 2^bictcp_HZ + * c = bic_scale >> 10 + * rtt = (srtt >> 3) / HZ + * !!! The following code does not have overflow problems, + * if the cwnd < 1 million packets !!! + */ + + /* change the unit from HZ to bictcp_HZ */ + t = ((tcp_time_stamp + ca->delay_min - ca->epoch_start) + << BICTCP_HZ) / HZ; + + if (t < ca->bic_K) /* t - K */ + d32 = ca->bic_K - t; + else + d32 = t - ca->bic_K; + + d64 = (u64)d32; + d32 = (bic_scale << 3) * HZ / srtt; /* 1024*c/rtt */ + d64 = (d32 * d64 * d64 * d64) >> (10+3*BICTCP_HZ); /* c/rtt * (t-K)^3 */ + d32 = (u32)d64; + if (t < ca->bic_K) /* below origin*/ + bic_target = ca->bic_origin_point - d32; + else /* above origin*/ + bic_target = ca->bic_origin_point + d32; + + /* cubic function - calc bictcp_cnt*/ + if (bic_target > cwnd) { + ca->cnt = cwnd / (bic_target - cwnd); + } else { + ca->cnt = 100 * cwnd; /* very small increment*/ + } + + if (ca->delay_min > 0) { + /* max increment = Smax * rtt / 0.1 */ + min_cnt = (cwnd * HZ * 8)/(10 * max_increment * ca->delay_min); + if (ca->cnt < min_cnt) + ca->cnt = min_cnt; + } + + /* slow start and low utilization */ + if (ca->loss_cwnd == 0) /* could be aggressive in slow start */ + ca->cnt = 50; + + /* TCP Friendly */ + if (tcp_friendliness) { + u32 scale = 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta); + d32 = (cwnd * scale) >> 3; + while (ca->ack_cnt > d32) { /* update tcp cwnd */ + ca->ack_cnt -= d32; + ca->tcp_cwnd++; + } + + if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ + d32 = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / d32; + if (ca->cnt > max_cnt) + ca->cnt = max_cnt; + } + } + + ca->cnt = (ca->cnt << ACK_RATIO_SHIFT) / ca->delayed_ack; + if (ca->cnt == 0) /* cannot be zero */ + ca->cnt = 1; +} + + +/* Keep track of minimum rtt */ +static inline void measure_delay(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + u32 delay; + + /* No time stamp */ + if (!(tp->rx_opt.saw_tstamp && tp->rx_opt.rcv_tsecr) || + /* Discard delay samples right after fast recovery */ + (s32)(tcp_time_stamp - ca->epoch_start) < HZ) + return; + + delay = tcp_time_stamp - tp->rx_opt.rcv_tsecr; + if (delay == 0) + delay = 1; + + /* first time call or link delay decreases */ + if (ca->delay_min == 0 || ca->delay_min > delay) + ca->delay_min = delay; +} + +static void bictcp_cong_avoid(struct sock *sk, u32 ack, + u32 seq_rtt, u32 in_flight, int data_acked) +{ + struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + if (data_acked) + measure_delay(sk); + + if (!tcp_is_cwnd_limited(sk, in_flight)) + return; + + if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + else { + bictcp_update(ca, tp->snd_cwnd); + + /* In dangerous area, increase slowly. + * In theory this is tp->snd_cwnd += 1 / tp->snd_cwnd + */ + if (tp->snd_cwnd_cnt >= ca->cnt) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + tp->snd_cwnd_cnt = 0; + } else + tp->snd_cwnd_cnt++; + } + +} + +static u32 bictcp_recalc_ssthresh(struct sock *sk) +{ + const struct tcp_sock *tp = tcp_sk(sk); + struct bictcp *ca = inet_csk_ca(sk); + + ca->epoch_start = 0; /* end of epoch */ + + /* Wmax and fast convergence */ + if (tp->snd_cwnd < ca->last_max_cwnd && fast_convergence) + ca->last_max_cwnd = (tp->snd_cwnd * (BICTCP_BETA_SCALE + beta)) + / (2 * BICTCP_BETA_SCALE); + else + ca->last_max_cwnd = tp->snd_cwnd; + + ca->loss_cwnd = tp->snd_cwnd; + + return max((tp->snd_cwnd * beta) / BICTCP_BETA_SCALE, 2U); +} + +static u32 bictcp_undo_cwnd(struct sock *sk) +{ + struct bictcp *ca = inet_csk_ca(sk); + + return max(tcp_sk(sk)->snd_cwnd, ca->last_max_cwnd); +} + +static u32 bictcp_min_cwnd(struct sock *sk) +{ + return tcp_sk(sk)->snd_ssthresh; +} + +static void bictcp_state(struct sock *sk, u8 new_state) +{ + if (new_state == TCP_CA_Loss) + bictcp_reset(inet_csk_ca(sk)); +} + +/* Track delayed acknowledgment ratio using sliding window + * ratio = (15*ratio + sample) / 16 + */ +static void bictcp_acked(struct sock *sk, u32 cnt) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + + if (cnt > 0 && icsk->icsk_ca_state == TCP_CA_Open) { + struct bictcp *ca = inet_csk_ca(sk); + cnt -= ca->delayed_ack >> ACK_RATIO_SHIFT; + ca->delayed_ack += cnt; + } +} + + +static struct tcp_congestion_ops cubictcp = { + .init = bictcp_init, + .ssthresh = bictcp_recalc_ssthresh, + .cong_avoid = bictcp_cong_avoid, + .set_state = bictcp_state, + .undo_cwnd = bictcp_undo_cwnd, + .min_cwnd = bictcp_min_cwnd, + .pkts_acked = bictcp_acked, + .owner = THIS_MODULE, + .name = "cubic", +}; + +static int __init cubictcp_register(void) +{ + BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + return tcp_register_congestion_control(&cubictcp); +} + +static void __exit cubictcp_unregister(void) +{ + tcp_unregister_congestion_control(&cubictcp); +} + +module_init(cubictcp_register); +module_exit(cubictcp_unregister); + +MODULE_AUTHOR("Sangtae Ha, Stephen Hemminger"); +MODULE_LICENSE("GPL"); +MODULE_DESCRIPTION("CUBIC TCP"); +MODULE_VERSION("2.0"); -- cgit v1.1 From 318360646941d6f3d4c6e4ee99107392728a4079 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 13 Dec 2005 23:13:48 -0800 Subject: [NETFILTER] ip_tables: NUMA-aware allocation Part of a performance problem with ip_tables is that memory allocation is not NUMA aware, but 'only' SMP aware (ie each CPU normally touch separate cache lines) Even with small iptables rules, the cost of this misplacement can be high on common workloads. Instead of using one vmalloc() area (located in the node of the iptables process), we now allocate an area for each possible CPU, using vmalloc_node() so that memory should be allocated in the CPU's node if possible. Port to arp_tables and ip6_tables by Harald Welte. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/ipv4/netfilter/arp_tables.c | 175 ++++++++++++++++++++++++----------- net/ipv4/netfilter/ip_tables.c | 199 +++++++++++++++++++++++++++------------- net/ipv6/netfilter/ip6_tables.c | 190 +++++++++++++++++++++++++------------- 3 files changed, 382 insertions(+), 182 deletions(-) diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c index 3c2e963..bba1563 100644 --- a/net/ipv4/netfilter/arp_tables.c +++ b/net/ipv4/netfilter/arp_tables.c @@ -68,19 +68,14 @@ struct arpt_table_info { unsigned int initial_entries; unsigned int hook_entry[NF_ARP_NUMHOOKS]; unsigned int underflow[NF_ARP_NUMHOOKS]; - char entries[0] __attribute__((aligned(SMP_CACHE_BYTES))); + void *entries[NR_CPUS]; }; static LIST_HEAD(arpt_target); static LIST_HEAD(arpt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - static inline int arp_devaddr_compare(const struct arpt_devaddr_info *ap, char *hdr_addr, int len) { @@ -269,9 +264,7 @@ unsigned int arpt_do_table(struct sk_buff **pskb, outdev = out ? out->name : nulldevname; read_lock_bh(&table->lock); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, - smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); back = get_entry(table_base, table->private->underflow[hook]); @@ -462,7 +455,8 @@ static inline int unconditional(const struct arpt_arp *arp) /* Figures out from what hook each rule can be called: returns 0 if * there are loops. Puts hook bitmask in comefrom. */ -static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int valid_hooks) +static int mark_source_chains(struct arpt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -472,7 +466,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali for (hook = 0; hook < NF_ARP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct arpt_entry *e - = (struct arpt_entry *)(newinfo->entries + pos); + = (struct arpt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -514,13 +508,13 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali goto next; e = (struct arpt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct arpt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -537,7 +531,7 @@ static int mark_source_chains(struct arpt_table_info *newinfo, unsigned int vali newpos = pos + e->next_offset; } e = (struct arpt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -689,6 +683,7 @@ static inline int cleanup_entry(struct arpt_entry *e, unsigned int *i) static int translate_table(const char *name, unsigned int valid_hooks, struct arpt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -710,11 +705,11 @@ static int translate_table(const char *name, i = 0; /* Walk through entries, checking offsets. */ - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); duprintf("translate_table: ARPT_ENTRY_ITERATE gives %d\n", ret); if (ret != 0) @@ -743,29 +738,26 @@ static int translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) { + if (!mark_source_chains(newinfo, valid_hooks, entry0)) { duprintf("Looping hook\n"); return -ELOOP; } /* Finally, each sanity check must pass */ i = 0; - ret = ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = ARPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ARPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -807,15 +799,42 @@ static inline int add_entry_to_counter(const struct arpt_entry *e, return 0; } +static inline int set_entry_to_counter(const struct arpt_entry *e, + struct arpt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct arpt_table_info *t, struct arpt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + ARPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - ARPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + ARPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -831,6 +850,7 @@ static int copy_entries_to_user(unsigned int total_size, struct arpt_entry *e; struct arpt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change * (other than comefrom, which userspace doesn't care @@ -843,13 +863,13 @@ static int copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -859,7 +879,7 @@ static int copy_entries_to_user(unsigned int total_size, for (off = 0, num = 0; off < total_size; off += e->next_offset, num++){ struct arpt_entry_target *t; - e = (struct arpt_entry *)(table->private->entries + off); + e = (struct arpt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct arpt_entry, counters), &counters[num], @@ -911,6 +931,47 @@ static int get_entries(const struct arpt_get_entries *entries, return ret; } +static void free_table_info(struct arpt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct arpt_table_info *alloc_table_info(unsigned int size) +{ + struct arpt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct arpt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { int ret; @@ -918,6 +979,7 @@ static int do_replace(void __user *user, unsigned int len) struct arpt_table *t; struct arpt_table_info *newinfo, *oldinfo; struct arpt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -930,13 +992,13 @@ static int do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -947,10 +1009,9 @@ static int do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct arpt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -989,8 +1050,10 @@ static int do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - ARPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct arpt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1002,11 +1065,11 @@ static int do_replace(void __user *user, unsigned int len) module_put(t->me); up(&arpt_mutex); free_newinfo_counters_untrans: - ARPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry, NULL); + ARPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry, NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1030,6 +1093,7 @@ static int do_add_counters(void __user *user, unsigned int len) struct arpt_counters_info tmp, *paddc; struct arpt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1059,7 +1123,9 @@ static int do_add_counters(void __user *user, unsigned int len) } i = 0; - ARPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1220,30 +1286,32 @@ int arpt_register_table(struct arpt_table *table, struct arpt_table_info *newinfo; static struct arpt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct arpt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) { ret = -ENOMEM; return ret; } - memcpy(newinfo->entries, repl->entries, repl->size); + + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); duprintf("arpt_register_table: translate table gives %d\n", ret); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&arpt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1272,20 +1340,23 @@ int arpt_register_table(struct arpt_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void arpt_unregister_table(struct arpt_table *table) { + void *loc_cpu_entry; + down(&arpt_mutex); LIST_DELETE(&arpt_tables, table); up(&arpt_mutex); /* Decrease module usage counts and free resources */ - ARPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + ARPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* The built-in targets: standard (NULL) and error. */ diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c index 45886c8..2a26d16 100644 --- a/net/ipv4/netfilter/ip_tables.c +++ b/net/ipv4/netfilter/ip_tables.c @@ -83,11 +83,6 @@ static DECLARE_MUTEX(ipt_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -105,20 +100,15 @@ struct ipt_table_info unsigned int underflow[NF_IP_NUMHOOKS]; /* ipt_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ipt_target); static LIST_HEAD(ipt_match); static LIST_HEAD(ipt_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -290,8 +280,7 @@ ipt_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -563,7 +552,8 @@ unconditional(const struct ipt_ip *ip) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ipt_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -572,7 +562,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ipt_entry *e - = (struct ipt_entry *)(newinfo->entries + pos); + = (struct ipt_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -622,13 +612,13 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ipt_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ipt_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -645,7 +635,7 @@ mark_source_chains(struct ipt_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ipt_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -855,6 +845,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ipt_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -875,11 +866,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -907,27 +898,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IPT_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IPT_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -943,15 +931,12 @@ replace_table(struct ipt_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ipt_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ipt_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -986,16 +971,44 @@ add_entry_to_counter(const struct ipt_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ipt_entry *e, + struct ipt_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ipt_table_info *t, struct ipt_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IPT_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IPT_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IPT_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1012,24 +1025,29 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry *e; struct ipt_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care about). */ countersize = sizeof(struct ipt_counters) * table->private->number; - counters = vmalloc(countersize); + counters = vmalloc_node(countersize, numa_node_id()); if (counters == NULL) return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on our node/cpu, ... + * This choice is lazy (because current thread is + * allowed to migrate to another cpu) + */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + /* ... then copy entire thing ... */ + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1041,7 +1059,7 @@ copy_entries_to_user(unsigned int total_size, struct ipt_entry_match *m; struct ipt_entry_target *t; - e = (struct ipt_entry *)(table->private->entries + off); + e = (struct ipt_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ipt_entry, counters), &counters[num], @@ -1110,6 +1128,45 @@ get_entries(const struct ipt_get_entries *entries, return ret; } +static void free_table_info(struct ipt_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ipt_table_info *alloc_table_info(unsigned int size) +{ + struct ipt_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ipt_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, cpu_to_node(cpu)); + if (newinfo->entries[cpu] == 0) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1118,6 +1175,7 @@ do_replace(void __user *user, unsigned int len) struct ipt_table *t; struct ipt_table_info *newinfo, *oldinfo; struct ipt_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1130,13 +1188,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1147,10 +1205,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ipt_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1189,8 +1246,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IPT_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ipt_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1202,11 +1260,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ipt_mutex); free_newinfo_counters_untrans: - IPT_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IPT_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1239,6 +1297,7 @@ do_add_counters(void __user *user, unsigned int len) struct ipt_counters_info tmp, *paddc; struct ipt_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1246,7 +1305,7 @@ do_add_counters(void __user *user, unsigned int len) if (len != sizeof(tmp) + tmp.num_counters*sizeof(struct ipt_counters)) return -EINVAL; - paddc = vmalloc(len); + paddc = vmalloc_node(len, numa_node_id()); if (!paddc) return -ENOMEM; @@ -1268,7 +1327,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IPT_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1460,28 +1521,31 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) struct ipt_table_info *newinfo; static struct ipt_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ipt_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu + * but dont care of preemption + */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ipt_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1510,20 +1574,23 @@ int ipt_register_table(struct ipt_table *table, const struct ipt_replace *repl) return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ipt_unregister_table(struct ipt_table *table) { + void *loc_cpu_entry; + down(&ipt_mutex); LIST_DELETE(&ipt_tables, table); up(&ipt_mutex); /* Decrease module usage counts and free resources */ - IPT_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IPT_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index 95d4692..dd80020 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -86,11 +86,6 @@ static DECLARE_MUTEX(ip6t_mutex); context stops packets coming through and allows user context to read the counters or update the rules. - To be cache friendly on SMP, we arrange them like so: - [ n-entries ] - ... cache-align padding ... - [ n-entries ] - Hence the start of any table is given by get_table() below. */ /* The table itself */ @@ -108,20 +103,15 @@ struct ip6t_table_info unsigned int underflow[NF_IP6_NUMHOOKS]; /* ip6t_entry tables: one per CPU */ - char entries[0] ____cacheline_aligned; + void *entries[NR_CPUS]; }; static LIST_HEAD(ip6t_target); static LIST_HEAD(ip6t_match); static LIST_HEAD(ip6t_tables); +#define SET_COUNTER(c,b,p) do { (c).bcnt = (b); (c).pcnt = (p); } while(0) #define ADD_COUNTER(c,b,p) do { (c).bcnt += (b); (c).pcnt += (p); } while(0) -#ifdef CONFIG_SMP -#define TABLE_OFFSET(t,p) (SMP_ALIGN((t)->size)*(p)) -#else -#define TABLE_OFFSET(t,p) 0 -#endif - #if 0 #define down(x) do { printk("DOWN:%u:" #x "\n", __LINE__); down(x); } while(0) #define down_interruptible(x) ({ int __r; printk("DOWNi:%u:" #x "\n", __LINE__); __r = down_interruptible(x); if (__r != 0) printk("ABORT-DOWNi:%u\n", __LINE__); __r; }) @@ -376,8 +366,7 @@ ip6t_do_table(struct sk_buff **pskb, read_lock_bh(&table->lock); IP_NF_ASSERT(table->valid_hooks & (1 << hook)); - table_base = (void *)table->private->entries - + TABLE_OFFSET(table->private, smp_processor_id()); + table_base = (void *)table->private->entries[smp_processor_id()]; e = get_entry(table_base, table->private->hook_entry[hook]); #ifdef CONFIG_NETFILTER_DEBUG @@ -649,7 +638,8 @@ unconditional(const struct ip6t_ip6 *ipv6) /* Figures out from what hook each rule can be called: returns 0 if there are loops. Puts hook bitmask in comefrom. */ static int -mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) +mark_source_chains(struct ip6t_table_info *newinfo, + unsigned int valid_hooks, void *entry0) { unsigned int hook; @@ -658,7 +648,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) for (hook = 0; hook < NF_IP6_NUMHOOKS; hook++) { unsigned int pos = newinfo->hook_entry[hook]; struct ip6t_entry *e - = (struct ip6t_entry *)(newinfo->entries + pos); + = (struct ip6t_entry *)(entry0 + pos); if (!(valid_hooks & (1 << hook))) continue; @@ -708,13 +698,13 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) goto next; e = (struct ip6t_entry *) - (newinfo->entries + pos); + (entry0 + pos); } while (oldpos == pos + e->next_offset); /* Move along one */ size = e->next_offset; e = (struct ip6t_entry *) - (newinfo->entries + pos + size); + (entry0 + pos + size); e->counters.pcnt = pos; pos += size; } else { @@ -731,7 +721,7 @@ mark_source_chains(struct ip6t_table_info *newinfo, unsigned int valid_hooks) newpos = pos + e->next_offset; } e = (struct ip6t_entry *) - (newinfo->entries + newpos); + (entry0 + newpos); e->counters.pcnt = pos; pos = newpos; } @@ -941,6 +931,7 @@ static int translate_table(const char *name, unsigned int valid_hooks, struct ip6t_table_info *newinfo, + void *entry0, unsigned int size, unsigned int number, const unsigned int *hook_entries, @@ -961,11 +952,11 @@ translate_table(const char *name, duprintf("translate_table: size %u\n", newinfo->size); i = 0; /* Walk through entries, checking offsets. */ - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry_size_and_hooks, newinfo, - newinfo->entries, - newinfo->entries + size, + entry0, + entry0 + size, hook_entries, underflows, &i); if (ret != 0) return ret; @@ -993,27 +984,24 @@ translate_table(const char *name, } } - if (!mark_source_chains(newinfo, valid_hooks)) + if (!mark_source_chains(newinfo, valid_hooks, entry0)) return -ELOOP; /* Finally, each sanity check must pass */ i = 0; - ret = IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + ret = IP6T_ENTRY_ITERATE(entry0, newinfo->size, check_entry, name, size, &i); if (ret != 0) { - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, + IP6T_ENTRY_ITERATE(entry0, newinfo->size, cleanup_entry, &i); return ret; } /* And one copy for every other CPU */ for_each_cpu(i) { - if (i == 0) - continue; - memcpy(newinfo->entries + SMP_ALIGN(newinfo->size) * i, - newinfo->entries, - SMP_ALIGN(newinfo->size)); + if (newinfo->entries[i] && newinfo->entries[i] != entry0) + memcpy(newinfo->entries[i], entry0, newinfo->size); } return ret; @@ -1029,15 +1017,12 @@ replace_table(struct ip6t_table *table, #ifdef CONFIG_NETFILTER_DEBUG { - struct ip6t_entry *table_base; - unsigned int i; + int cpu; - for_each_cpu(i) { - table_base = - (void *)newinfo->entries - + TABLE_OFFSET(newinfo, i); - - table_base->comefrom = 0xdead57ac; + for_each_cpu(cpu) { + struct ip6t_entry *table_base = newinfo->entries[cpu]; + if (table_base) + table_base->comefrom = 0xdead57ac; } } #endif @@ -1072,16 +1057,44 @@ add_entry_to_counter(const struct ip6t_entry *e, return 0; } +static inline int +set_entry_to_counter(const struct ip6t_entry *e, + struct ip6t_counters total[], + unsigned int *i) +{ + SET_COUNTER(total[*i], e->counters.bcnt, e->counters.pcnt); + + (*i)++; + return 0; +} + static void get_counters(const struct ip6t_table_info *t, struct ip6t_counters counters[]) { unsigned int cpu; unsigned int i; + unsigned int curcpu; + + /* Instead of clearing (by a previous call to memset()) + * the counters and using adds, we set the counters + * with data used by 'current' CPU + * We dont care about preemption here. + */ + curcpu = raw_smp_processor_id(); + + i = 0; + IP6T_ENTRY_ITERATE(t->entries[curcpu], + t->size, + set_entry_to_counter, + counters, + &i); for_each_cpu(cpu) { + if (cpu == curcpu) + continue; i = 0; - IP6T_ENTRY_ITERATE(t->entries + TABLE_OFFSET(t, cpu), + IP6T_ENTRY_ITERATE(t->entries[cpu], t->size, add_entry_to_counter, counters, @@ -1098,6 +1111,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry *e; struct ip6t_counters *counters; int ret = 0; + void *loc_cpu_entry; /* We need atomic snapshot of counters: rest doesn't change (other than comefrom, which userspace doesn't care @@ -1109,13 +1123,13 @@ copy_entries_to_user(unsigned int total_size, return -ENOMEM; /* First, sum counters... */ - memset(counters, 0, countersize); write_lock_bh(&table->lock); get_counters(table->private, counters); write_unlock_bh(&table->lock); - /* ... then copy entire thing from CPU 0... */ - if (copy_to_user(userptr, table->private->entries, total_size) != 0) { + /* choose the copy that is on ourc node/cpu */ + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + if (copy_to_user(userptr, loc_cpu_entry, total_size) != 0) { ret = -EFAULT; goto free_counters; } @@ -1127,7 +1141,7 @@ copy_entries_to_user(unsigned int total_size, struct ip6t_entry_match *m; struct ip6t_entry_target *t; - e = (struct ip6t_entry *)(table->private->entries + off); + e = (struct ip6t_entry *)(loc_cpu_entry + off); if (copy_to_user(userptr + off + offsetof(struct ip6t_entry, counters), &counters[num], @@ -1196,6 +1210,46 @@ get_entries(const struct ip6t_get_entries *entries, return ret; } +static void free_table_info(struct ip6t_table_info *info) +{ + int cpu; + for_each_cpu(cpu) { + if (info->size <= PAGE_SIZE) + kfree(info->entries[cpu]); + else + vfree(info->entries[cpu]); + } + kfree(info); +} + +static struct ip6t_table_info *alloc_table_info(unsigned int size) +{ + struct ip6t_table_info *newinfo; + int cpu; + + newinfo = kzalloc(sizeof(struct ip6t_table_info), GFP_KERNEL); + if (!newinfo) + return NULL; + + newinfo->size = size; + + for_each_cpu(cpu) { + if (size <= PAGE_SIZE) + newinfo->entries[cpu] = kmalloc_node(size, + GFP_KERNEL, + cpu_to_node(cpu)); + else + newinfo->entries[cpu] = vmalloc_node(size, + cpu_to_node(cpu)); + if (newinfo->entries[cpu] == NULL) { + free_table_info(newinfo); + return NULL; + } + } + + return newinfo; +} + static int do_replace(void __user *user, unsigned int len) { @@ -1204,6 +1258,7 @@ do_replace(void __user *user, unsigned int len) struct ip6t_table *t; struct ip6t_table_info *newinfo, *oldinfo; struct ip6t_counters *counters; + void *loc_cpu_entry, *loc_cpu_old_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1212,13 +1267,13 @@ do_replace(void __user *user, unsigned int len) if ((SMP_ALIGN(tmp.size) >> PAGE_SHIFT) + 2 > num_physpages) return -ENOMEM; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(tmp.size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(tmp.size); if (!newinfo) return -ENOMEM; - if (copy_from_user(newinfo->entries, user + sizeof(tmp), + /* choose the copy that is on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + if (copy_from_user(loc_cpu_entry, user + sizeof(tmp), tmp.size) != 0) { ret = -EFAULT; goto free_newinfo; @@ -1229,10 +1284,9 @@ do_replace(void __user *user, unsigned int len) ret = -ENOMEM; goto free_newinfo; } - memset(counters, 0, tmp.num_counters * sizeof(struct ip6t_counters)); ret = translate_table(tmp.name, tmp.valid_hooks, - newinfo, tmp.size, tmp.num_entries, + newinfo, loc_cpu_entry, tmp.size, tmp.num_entries, tmp.hook_entry, tmp.underflow); if (ret != 0) goto free_newinfo_counters; @@ -1271,8 +1325,9 @@ do_replace(void __user *user, unsigned int len) /* Get the old counters. */ get_counters(oldinfo, counters); /* Decrease module usage counts and free resource */ - IP6T_ENTRY_ITERATE(oldinfo->entries, oldinfo->size, cleanup_entry,NULL); - vfree(oldinfo); + loc_cpu_old_entry = oldinfo->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_old_entry, oldinfo->size, cleanup_entry,NULL); + free_table_info(oldinfo); if (copy_to_user(tmp.counters, counters, sizeof(struct ip6t_counters) * tmp.num_counters) != 0) ret = -EFAULT; @@ -1284,11 +1339,11 @@ do_replace(void __user *user, unsigned int len) module_put(t->me); up(&ip6t_mutex); free_newinfo_counters_untrans: - IP6T_ENTRY_ITERATE(newinfo->entries, newinfo->size, cleanup_entry,NULL); + IP6T_ENTRY_ITERATE(loc_cpu_entry, newinfo->size, cleanup_entry,NULL); free_newinfo_counters: vfree(counters); free_newinfo: - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1321,6 +1376,7 @@ do_add_counters(void __user *user, unsigned int len) struct ip6t_counters_info tmp, *paddc; struct ip6t_table *t; int ret = 0; + void *loc_cpu_entry; if (copy_from_user(&tmp, user, sizeof(tmp)) != 0) return -EFAULT; @@ -1350,7 +1406,9 @@ do_add_counters(void __user *user, unsigned int len) } i = 0; - IP6T_ENTRY_ITERATE(t->private->entries, + /* Choose the copy that is on our node */ + loc_cpu_entry = t->private->entries[smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, t->private->size, add_counter_to_entry, paddc->counters, @@ -1543,28 +1601,29 @@ int ip6t_register_table(struct ip6t_table *table, struct ip6t_table_info *newinfo; static struct ip6t_table_info bootstrap = { 0, 0, 0, { 0 }, { 0 }, { } }; + void *loc_cpu_entry; - newinfo = vmalloc(sizeof(struct ip6t_table_info) - + SMP_ALIGN(repl->size) * - (highest_possible_processor_id()+1)); + newinfo = alloc_table_info(repl->size); if (!newinfo) return -ENOMEM; - memcpy(newinfo->entries, repl->entries, repl->size); + /* choose the copy on our node/cpu */ + loc_cpu_entry = newinfo->entries[raw_smp_processor_id()]; + memcpy(loc_cpu_entry, repl->entries, repl->size); ret = translate_table(table->name, table->valid_hooks, - newinfo, repl->size, + newinfo, loc_cpu_entry, repl->size, repl->num_entries, repl->hook_entry, repl->underflow); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } ret = down_interruptible(&ip6t_mutex); if (ret != 0) { - vfree(newinfo); + free_table_info(newinfo); return ret; } @@ -1593,20 +1652,23 @@ int ip6t_register_table(struct ip6t_table *table, return ret; free_unlock: - vfree(newinfo); + free_table_info(newinfo); goto unlock; } void ip6t_unregister_table(struct ip6t_table *table) { + void *loc_cpu_entry; + down(&ip6t_mutex); LIST_DELETE(&ip6t_tables, table); up(&ip6t_mutex); /* Decrease module usage counts and free resources */ - IP6T_ENTRY_ITERATE(table->private->entries, table->private->size, + loc_cpu_entry = table->private->entries[raw_smp_processor_id()]; + IP6T_ENTRY_ITERATE(loc_cpu_entry, table->private->size, cleanup_entry, NULL); - vfree(table->private); + free_table_info(table->private); } /* Returns 1 if the port is matched by the range, 0 otherwise */ -- cgit v1.1 From d5228a4f49db32d22a39c653281b527ef371129c Mon Sep 17 00:00:00 2001 From: Bart De Schuymer Date: Tue, 13 Dec 2005 23:14:08 -0800 Subject: [NETFILTER] ebtables: Support nf_log API from ebt_log and ebt_ulog This makes ebt_log and ebt_ulog use the new nf_log api. This enables the bridging packet filter to log packets e.g. via nfnetlink_log. Signed-off-by: Bart De Schuymer Signed-off-by: Harald Welte Signed-off-by: David S. Miller --- net/bridge/netfilter/Kconfig | 6 +++- net/bridge/netfilter/ebt_log.c | 72 +++++++++++++++++++++++++++++++---------- net/bridge/netfilter/ebt_ulog.c | 53 ++++++++++++++++++++++++++++-- 3 files changed, 110 insertions(+), 21 deletions(-) diff --git a/net/bridge/netfilter/Kconfig b/net/bridge/netfilter/Kconfig index c70b3be..b84fc60 100644 --- a/net/bridge/netfilter/Kconfig +++ b/net/bridge/netfilter/Kconfig @@ -196,9 +196,13 @@ config BRIDGE_EBT_LOG To compile it as a module, choose M here. If unsure, say N. config BRIDGE_EBT_ULOG - tristate "ebt: ulog support" + tristate "ebt: ulog support (OBSOLETE)" depends on BRIDGE_NF_EBTABLES help + This option enables the old bridge-specific "ebt_ulog" implementation + which has been obsoleted by the new "nfnetlink_log" code (see + CONFIG_NETFILTER_NETLINK_LOG). + This option adds the ulog watcher, that you can use in any rule in any ebtables table. The packet is passed to a userspace logging daemon using netlink multicast sockets. This differs diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index 662975b..c436e6c 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -3,6 +3,7 @@ * * Authors: * Bart De Schuymer + * Harald Welte * * April, 2002 * @@ -10,6 +11,7 @@ #include #include +#include #include #include #include @@ -55,27 +57,30 @@ static void print_MAC(unsigned char *p) } #define myNIPQUAD(a) a[0], a[1], a[2], a[3] -static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, - const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) +static void +ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *loginfo, + const char *prefix) { - struct ebt_log_info *info = (struct ebt_log_info *)data; - char level_string[4] = "< >"; + unsigned int bitmask; - level_string[1] = '0' + info->loglevel; spin_lock_bh(&ebt_log_lock); - printk(level_string); - printk("%s IN=%s OUT=%s ", info->prefix, in ? in->name : "", - out ? out->name : ""); + printk("<%c>%s IN=%s OUT=%s MAC source = ", '0' + loginfo->u.log.level, + prefix, in ? in->name : "", out ? out->name : ""); - printk("MAC source = "); print_MAC(eth_hdr(skb)->h_source); printk("MAC dest = "); print_MAC(eth_hdr(skb)->h_dest); printk("proto = 0x%04x", ntohs(eth_hdr(skb)->h_proto)); - if ((info->bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == + if (loginfo->type == NF_LOG_TYPE_LOG) + bitmask = loginfo->u.log.logflags; + else + bitmask = NF_LOG_MASK; + + if ((bitmask & EBT_LOG_IP) && eth_hdr(skb)->h_proto == htons(ETH_P_IP)){ struct iphdr _iph, *ih; @@ -84,10 +89,9 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, printk(" INCOMPLETE IP header"); goto out; } - printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u,", - NIPQUAD(ih->saddr), NIPQUAD(ih->daddr)); - printk(" IP tos=0x%02X, IP proto=%d", ih->tos, - ih->protocol); + printk(" IP SRC=%u.%u.%u.%u IP DST=%u.%u.%u.%u, IP " + "tos=0x%02X, IP proto=%d", NIPQUAD(ih->saddr), + NIPQUAD(ih->daddr), ih->tos, ih->protocol); if (ih->protocol == IPPROTO_TCP || ih->protocol == IPPROTO_UDP) { struct tcpudphdr _ports, *pptr; @@ -104,7 +108,7 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, goto out; } - if ((info->bitmask & EBT_LOG_ARP) && + if ((bitmask & EBT_LOG_ARP) && ((eth_hdr(skb)->h_proto == htons(ETH_P_ARP)) || (eth_hdr(skb)->h_proto == htons(ETH_P_RARP)))) { struct arphdr _arph, *ah; @@ -144,6 +148,21 @@ static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, out: printk("\n"); spin_unlock_bh(&ebt_log_lock); + +} + +static void ebt_log(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_log_info *info = (struct ebt_log_info *)data; + struct nf_loginfo li; + + li.type = NF_LOG_TYPE_LOG; + li.u.log.level = info->loglevel; + li.u.log.logflags = info->bitmask; + + nf_log_packet(PF_BRIDGE, hooknr, skb, in, out, &li, info->prefix); } static struct ebt_watcher log = @@ -154,13 +173,32 @@ static struct ebt_watcher log = .me = THIS_MODULE, }; +static struct nf_logger ebt_log_logger = { + .name = "ebt_log", + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { - return ebt_register_watcher(&log); + int ret; + + ret = ebt_register_watcher(&log); + if (ret < 0) + return ret; + if (nf_log_register(PF_BRIDGE, &ebt_log_logger) < 0) { + printk(KERN_WARNING "ebt_log: not logging via system console " + "since somebody else already registered for PF_INET\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + + return 0; } static void __exit fini(void) { + nf_log_unregister_logger(&ebt_log_logger); ebt_unregister_watcher(&log); } diff --git a/net/bridge/netfilter/ebt_ulog.c b/net/bridge/netfilter/ebt_ulog.c index aae26ae..ce617b3 100644 --- a/net/bridge/netfilter/ebt_ulog.c +++ b/net/bridge/netfilter/ebt_ulog.c @@ -3,6 +3,7 @@ * * Authors: * Bart De Schuymer + * Harald Welte * * November, 2004 * @@ -115,14 +116,13 @@ static struct sk_buff *ulog_alloc_skb(unsigned int size) return skb; } -static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, +static void ebt_ulog_packet(unsigned int hooknr, const struct sk_buff *skb, const struct net_device *in, const struct net_device *out, - const void *data, unsigned int datalen) + const struct ebt_ulog_info *uloginfo, const char *prefix) { ebt_ulog_packet_msg_t *pm; size_t size, copy_len; struct nlmsghdr *nlh; - struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; unsigned int group = uloginfo->nlgroup; ebt_ulog_buff_t *ub = &ulog_buffers[group]; spinlock_t *lock = &ub->lock; @@ -216,6 +216,39 @@ alloc_failure: goto unlock; } +/* this function is registered with the netfilter core */ +static void ebt_log_packet(unsigned int pf, unsigned int hooknum, + const struct sk_buff *skb, const struct net_device *in, + const struct net_device *out, const struct nf_loginfo *li, + const char *prefix) +{ + struct ebt_ulog_info loginfo; + + if (!li || li->type != NF_LOG_TYPE_ULOG) { + loginfo.nlgroup = EBT_ULOG_DEFAULT_NLGROUP; + loginfo.cprange = 0; + loginfo.qthreshold = EBT_ULOG_DEFAULT_QTHRESHOLD; + loginfo.prefix[0] = '\0'; + } else { + loginfo.nlgroup = li->u.ulog.group; + loginfo.cprange = li->u.ulog.copy_len; + loginfo.qthreshold = li->u.ulog.qthreshold; + strlcpy(loginfo.prefix, prefix, sizeof(loginfo.prefix)); + } + + ebt_ulog_packet(hooknum, skb, in, out, &loginfo, prefix); +} + +static void ebt_ulog(const struct sk_buff *skb, unsigned int hooknr, + const struct net_device *in, const struct net_device *out, + const void *data, unsigned int datalen) +{ + struct ebt_ulog_info *uloginfo = (struct ebt_ulog_info *)data; + + ebt_ulog_packet(hooknr, skb, in, out, uloginfo, NULL); +} + + static int ebt_ulog_check(const char *tablename, unsigned int hookmask, const struct ebt_entry *e, void *data, unsigned int datalen) { @@ -240,6 +273,12 @@ static struct ebt_watcher ulog = { .me = THIS_MODULE, }; +static struct nf_logger ebt_ulog_logger = { + .name = EBT_ULOG_WATCHER, + .logfn = &ebt_log_packet, + .me = THIS_MODULE, +}; + static int __init init(void) { int i, ret = 0; @@ -265,6 +304,13 @@ static int __init init(void) else if ((ret = ebt_register_watcher(&ulog))) sock_release(ebtulognl->sk_socket); + if (nf_log_register(PF_BRIDGE, &ebt_ulog_logger) < 0) { + printk(KERN_WARNING "ebt_ulog: not logging via ulog " + "since somebody else already registered for PF_BRIDGE\n"); + /* we cannot make module load fail here, since otherwise + * ebtables userspace would abort */ + } + return ret; } @@ -273,6 +319,7 @@ static void __exit fini(void) ebt_ulog_buff_t *ub; int i; + nf_log_unregister_logger(&ebt_ulog_logger); ebt_unregister_watcher(&ulog); for (i = 0; i < EBT_ULOG_MAXNLGROUPS; i++) { ub = &ulog_buffers[i]; -- cgit v1.1 From 89cee8b1cbb9dac40c92ef1968aea2b45f82fd18 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 13 Dec 2005 23:14:27 -0800 Subject: [IPV4]: Safer reassembly Another spin of Herbert Xu's "safer ip reassembly" patch for 2.6.16. (The original patch is here: http://marc.theaimsgroup.com/?l=linux-netdev&m=112281936522415&w=2 and my only contribution is to have tested it.) This patch (optionally) does additional checks before accepting IP fragments, which can greatly reduce the possibility of reassembling fragments which originated from different IP datagrams. Signed-off-by: Herbert Xu Signed-off-by: Arthur Kepner Signed-off-by: David S. Miller --- Documentation/networking/ip-sysctl.txt | 23 ++++++++++++ include/linux/sysctl.h | 1 + include/net/inetpeer.h | 1 + include/net/ip.h | 2 + net/ipv4/inetpeer.c | 1 + net/ipv4/ip_fragment.c | 68 +++++++++++++++++++++++++++++++++- net/ipv4/ip_output.c | 1 + net/ipv4/sysctl_net_ipv4.c | 10 +++++ 8 files changed, 106 insertions(+), 1 deletion(-) diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt index ebc09a1..2b7cf19 100644 --- a/Documentation/networking/ip-sysctl.txt +++ b/Documentation/networking/ip-sysctl.txt @@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER for the hash secret) for IP fragments. Default: 600 +ipfrag_max_dist - INTEGER + ipfrag_max_dist is a non-negative integer value which defines the + maximum "disorder" which is allowed among fragments which share a + common IP source address. Note that reordering of packets is + not unusual, but if a large number of fragments arrive from a source + IP address while a particular fragment queue remains incomplete, it + probably indicates that one or more fragments belonging to that queue + have been lost. When ipfrag_max_dist is positive, an additional check + is done on fragments before they are added to a reassembly queue - if + ipfrag_max_dist (or more) fragments have arrived from a particular IP + address between additions to any IP fragment queue using that source + address, it's presumed that one or more fragments in the queue are + lost. The existing fragment queue will be dropped, and a new one + started. An ipfrag_max_dist value of zero disables this check. + + Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can + result in unnecessarily dropping fragment queues when normal + reordering of packets occurs, which could lead to poor application + performance. Using a very large value, e.g. 50000, increases the + likelihood of incorrectly reassembling IP fragments that originate + from different IP datagrams, which could result in data corruption. + Default: 64 + INET peer storage: inet_peer_threshold - INTEGER diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 4be34ef..93fa765 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -390,6 +390,7 @@ enum NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_TCP_CONG_CONTROL=110, NET_TCP_ABC=111, + NET_IPV4_IPFRAG_MAX_DIST=112, }; enum { diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h index 7fda471..0965515 100644 --- a/include/net/inetpeer.h +++ b/include/net/inetpeer.h @@ -25,6 +25,7 @@ struct inet_peer __u32 v4daddr; /* peer's address */ __u16 avl_height; __u16 ip_id_count; /* IP ID for the next packet */ + atomic_t rid; /* Frag reception counter */ __u32 tcp_ts; unsigned long tcp_ts_stamp; }; diff --git a/include/net/ip.h b/include/net/ip.h index e4563bb..4d6294ba 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -45,6 +45,7 @@ struct inet_skb_parm #define IPSKB_TRANSLATED 2 #define IPSKB_FORWARDED 4 #define IPSKB_XFRM_TUNNEL_SIZE 8 +#define IPSKB_FRAG_COMPLETE 16 }; struct ipcm_cookie @@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh; extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_time; extern int sysctl_ipfrag_secret_interval; +extern int sysctl_ipfrag_max_dist; /* From inetpeer.c */ extern int inet_peer_threshold; diff --git a/net/ipv4/inetpeer.c b/net/ipv4/inetpeer.c index 2fc3fd3..ce5fe3f 100644 --- a/net/ipv4/inetpeer.c +++ b/net/ipv4/inetpeer.c @@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) return NULL; n->v4daddr = daddr; atomic_set(&n->refcnt, 1); + atomic_set(&n->rid, 0); n->ip_id_count = secure_ip_id(daddr); n->tcp_ts_stamp = 0; diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index 8ce0ce2..ce2b70c 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -22,6 +22,7 @@ * Patrick McHardy : LRU queue of frag heads for evictor. */ +#include #include #include #include @@ -38,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -56,6 +58,8 @@ int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_low_thresh = 192*1024; +int sysctl_ipfrag_max_dist = 64; + /* Important NOTE! Fragment queue must be destroyed before MSL expires. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. */ @@ -89,8 +93,10 @@ struct ipq { spinlock_t lock; atomic_t refcnt; struct timer_list timer; /* when will this queue expire? */ - int iif; struct timeval stamp; + int iif; + unsigned int rid; + struct inet_peer *peer; }; /* Hash table. */ @@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(del_timer(&qp->timer) == 0); + if (qp->peer) + inet_putpeer(qp->peer); + /* Release all fragment data. */ fp = qp->fragments; while (fp) { @@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) qp->meat = 0; qp->fragments = NULL; qp->iif = 0; + qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL; /* Initialize a timer for this entry. */ init_timer(&qp->timer); @@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) return ip_frag_create(hash, iph, user); } +/* Is the fragment too far ahead to be part of ipq? */ +static inline int ip_frag_too_far(struct ipq *qp) +{ + struct inet_peer *peer = qp->peer; + unsigned int max = sysctl_ipfrag_max_dist; + unsigned int start, end; + + int rc; + + if (!peer || !max) + return 0; + + start = qp->rid; + end = atomic_inc_return(&peer->rid); + qp->rid = end; + + rc = qp->fragments && (end - start) > max; + + if (rc) { + IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS); + } + + return rc; +} + +static int ip_frag_reinit(struct ipq *qp) +{ + struct sk_buff *fp; + + if (!mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time)) { + atomic_inc(&qp->refcnt); + return -ETIMEDOUT; + } + + fp = qp->fragments; + do { + struct sk_buff *xp = fp->next; + frag_kfree_skb(fp, NULL); + fp = xp; + } while (fp); + + qp->last_in = 0; + qp->len = 0; + qp->meat = 0; + qp->fragments = NULL; + qp->iif = 0; + + return 0; +} + /* Add new segment to existing queue. */ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) { @@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) if (qp->last_in & COMPLETE) goto err; + if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) && + unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) { + ipq_kill(qp); + goto err; + } + offset = ntohs(skb->nh.iph->frag_off); flags = offset & ~IP_OFFSET; offset &= IP_OFFSET; diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index eba64e2..2a830de 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) hlen = iph->ihl * 4; mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ + IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE; /* When frag_list is given, use it. First, check its validity: * some transformers could create wrong frag_list or break existing diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 01444a0..dbf8295 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -22,6 +22,7 @@ extern int sysctl_ip_nonlocal_bind; #ifdef CONFIG_SYSCTL +static int zero; static int tcp_retr1_max = 255; static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_max[] = { 65535, 65535 }; @@ -614,6 +615,15 @@ ctl_table ipv4_table[] = { .strategy = &sysctl_jiffies }, { + .ctl_name = NET_IPV4_IPFRAG_MAX_DIST, + .procname = "ipfrag_max_dist", + .data = &sysctl_ipfrag_max_dist, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .extra1 = &zero + }, + { .ctl_name = NET_TCP_NO_METRICS_SAVE, .procname = "tcp_no_metrics_save", .data = &sysctl_tcp_nometrics_save, -- cgit v1.1 From 971af18bbfabb7b7c9c548da34a51e30869c08fc Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:14:47 -0800 Subject: [IPV6]: Reuse inet_csk_get_port in tcp_v6_get_port Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 6 ++- net/dccp/ipv4.c | 3 +- net/ipv4/inet_connection_sock.c | 11 +++-- net/ipv4/tcp_ipv4.c | 3 +- net/ipv6/tcp_ipv6.c | 95 ++------------------------------------ 5 files changed, 21 insertions(+), 97 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index b0c9906..edc68e8 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -192,8 +192,12 @@ extern struct request_sock *inet_csk_search_req(const struct sock *sk, const __u16 rport, const __u32 raddr, const __u32 laddr); +extern int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb); extern int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum); + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)); extern struct dst_entry* inet_csk_route_req(struct sock *sk, const struct request_sock *req); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 656e13e..1ac3e30 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL_GPL(dccp_hashinfo); static int dccp_v4_get_port(struct sock *sk, const unsigned short snum) { - return inet_csk_get_port(&dccp_hashinfo, sk, snum); + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void dccp_v4_hash(struct sock *sk) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index 3fe021f..f05b6e7 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -37,7 +37,8 @@ EXPORT_SYMBOL(inet_csk_timer_bug_msg); */ int sysctl_local_port_range[2] = { 1024, 4999 }; -static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucket *tb) +int inet_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const u32 sk_rcv_saddr = inet_rcv_saddr(sk); struct sock *sk2; @@ -62,11 +63,15 @@ static inline int inet_csk_bind_conflict(struct sock *sk, struct inet_bind_bucke return node != NULL; } +EXPORT_SYMBOL_GPL(inet_csk_bind_conflict); + /* Obtain a reference to a local port for the given sock, * if snum is zero it means select any available local port. */ int inet_csk_get_port(struct inet_hashinfo *hashinfo, - struct sock *sk, unsigned short snum) + struct sock *sk, unsigned short snum, + int (*bind_conflict)(const struct sock *sk, + const struct inet_bind_bucket *tb)) { struct inet_bind_hashbucket *head; struct hlist_node *node; @@ -125,7 +130,7 @@ tb_found: goto success; } else { ret = 1; - if (inet_csk_bind_conflict(sk, tb)) + if (bind_conflict(sk, tb)) goto fail_unlock; } } diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 4d5021e..2aa19c8 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -97,7 +97,8 @@ struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { static int tcp_v4_get_port(struct sock *sk, unsigned short snum) { - return inet_csk_get_port(&tcp_hashinfo, sk, snum); + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet_csk_bind_conflict); } static void tcp_v4_hash(struct sock *sk) diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8827389..76c8f5a 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -76,8 +76,8 @@ static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); static struct tcp_func ipv6_mapped; static struct tcp_func ipv6_specific; -static inline int tcp_v6_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) { const struct sock *sk2; const struct hlist_node *node; @@ -97,97 +97,10 @@ static inline int tcp_v6_bind_conflict(const struct sock *sk, return node != NULL; } -/* Grrr, addr_type already calculated by caller, but I don't want - * to add some silly "cookie" argument to this method just for that. - * But it doesn't matter, the recalculation is in the rarest path - * this function ever takes. - */ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - struct hlist_node *node; - int ret; - - local_bh_disable(); - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - - do { - head = &tcp_hashinfo.bhash[inet_bhashfn(rover, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == rover) - goto next; - break; - next: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - /* Exhausted local port range during search? It is not - * possible for us to be holding one of the bind hash - * locks if this test triggers, because if 'remaining' - * drops to zero, we broke out of the do/while loop at - * the top level, not from the 'break;' statement. - */ - ret = 1; - if (unlikely(remaining <= 0)) - goto fail; - - /* OK, here is the one we will use. */ - snum = rover; - } else { - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - inet_bind_bucket_for_each(tb, node, &head->chain) - if (tb->port == snum) - goto tb_found; - } - tb = NULL; - goto tb_not_found; -tb_found: - if (tb && !hlist_empty(&tb->owners)) { - if (tb->fastreuse > 0 && sk->sk_reuse && - sk->sk_state != TCP_LISTEN) { - goto success; - } else { - ret = 1; - if (tcp_v6_bind_conflict(sk, tb)) - goto fail_unlock; - } - } -tb_not_found: - ret = 1; - if (tb == NULL) { - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, snum); - if (tb == NULL) - goto fail_unlock; - } - if (hlist_empty(&tb->owners)) { - if (sk->sk_reuse && sk->sk_state != TCP_LISTEN) - tb->fastreuse = 1; - else - tb->fastreuse = 0; - } else if (tb->fastreuse && - (!sk->sk_reuse || sk->sk_state == TCP_LISTEN)) - tb->fastreuse = 0; - -success: - if (!inet_csk(sk)->icsk_bind_hash) - inet_bind_hash(sk, tb, snum); - BUG_TRAP(inet_csk(sk)->icsk_bind_hash == tb); - ret = 0; - -fail_unlock: - spin_unlock(&head->lock); -fail: - local_bh_enable(); - return ret; + return inet_csk_get_port(&tcp_hashinfo, sk, snum, + inet6_csk_bind_conflict); } static __inline__ void __tcp_v6_hash(struct sock *sk) -- cgit v1.1 From 90b19d31695371bd3ed256d4c9e280861cd6ae7e Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:01 -0800 Subject: [IPV6]: Generalise __tcp_v6_hash, renaming it to __inet6_hash Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet6_hashtables.h | 26 ++++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 34 ++++------------------------------ 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index 5a2beed..a4a204f 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -48,6 +48,32 @@ static inline int inet6_sk_ehashfn(const struct sock *sk) return inet6_ehashfn(laddr, lport, faddr, fport); } +static inline void __inet6_hash(struct inet_hashinfo *hashinfo, + struct sock *sk) +{ + struct hlist_head *list; + rwlock_t *lock; + + BUG_TRAP(sk_unhashed(sk)); + + if (sk->sk_state == TCP_LISTEN) { + list = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)]; + lock = &hashinfo->lhash_lock; + inet_listen_wlock(hashinfo); + } else { + unsigned int hash; + sk->sk_hash = hash = inet6_sk_ehashfn(sk); + hash &= (hashinfo->ehash_size - 1); + list = &hashinfo->ehash[hash].chain; + lock = &hashinfo->ehash[hash].lock; + write_lock(lock); + } + + __sk_add_node(sk, list); + sock_prot_inc_use(sk->sk_prot); + write_unlock(lock); +} + /* * Sockets in TCP_CLOSE state are _always_ taken out of the hash, so * we need not check it for TCP lookups anymore, thanks Alexey. -DaveM diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 76c8f5a..bf41f84 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -103,32 +103,6 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) inet6_csk_bind_conflict); } -static __inline__ void __tcp_v6_hash(struct sock *sk) -{ - struct hlist_head *list; - rwlock_t *lock; - - BUG_TRAP(sk_unhashed(sk)); - - if (sk->sk_state == TCP_LISTEN) { - list = &tcp_hashinfo.listening_hash[inet_sk_listen_hashfn(sk)]; - lock = &tcp_hashinfo.lhash_lock; - inet_listen_wlock(&tcp_hashinfo); - } else { - unsigned int hash; - sk->sk_hash = hash = inet6_sk_ehashfn(sk); - hash &= (tcp_hashinfo.ehash_size - 1); - list = &tcp_hashinfo.ehash[hash].chain; - lock = &tcp_hashinfo.ehash[hash].lock; - write_lock(lock); - } - - __sk_add_node(sk, list); - sock_prot_inc_use(sk->sk_prot); - write_unlock(lock); -} - - static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { @@ -139,7 +113,7 @@ static void tcp_v6_hash(struct sock *sk) return; } local_bh_disable(); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); local_bh_enable(); } } @@ -374,7 +348,7 @@ ok: inet_bind_hash(sk, tb, port); if (sk_unhashed(sk)) { inet_sk(sk)->sport = htons(port); - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); } spin_unlock(&head->lock); @@ -392,7 +366,7 @@ ok: spin_lock_bh(&head->lock); if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __tcp_v6_hash(sk); + __inet6_hash(&tcp_hashinfo, sk); spin_unlock_bh(&head->lock); return 0; } else { @@ -1295,7 +1269,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; - __tcp_v6_hash(newsk); + __inet6_hash(&tcp_hashinfo, newsk); inet_inherit_port(&tcp_hashinfo, sk, newsk); return newsk; -- cgit v1.1 From c2977c2213993bff51911f4117281b31c4612591 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:12 -0800 Subject: [ICSK]: make inet_csk_reqsk_queue_hash_add timeout arg unsigned long Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 +- net/ipv4/inet_connection_sock.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index edc68e8..ccc81a1 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -211,7 +211,7 @@ static inline void inet_csk_reqsk_queue_add(struct sock *sk, extern void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout); + unsigned long timeout); static inline void inet_csk_reqsk_queue_removed(struct sock *sk, struct request_sock *req) diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f05b6e7..e2bf508 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -385,7 +385,7 @@ struct request_sock *inet_csk_search_req(const struct sock *sk, EXPORT_SYMBOL_GPL(inet_csk_search_req); void inet_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, - const unsigned timeout) + unsigned long timeout) { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; -- cgit v1.1 From 8129765ac07c2455c927051e3a8b048b619b56ee Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:24 -0800 Subject: [IPV6]: Generalise tcp_v6_search_req & tcp_v6_synq_add More work is needed tho to introduce inet6_request_sock from tcp6_request_sock, in the same layout considerations as ipv6_pinfo in inet_sock, next changeset will do that. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet6_connection_sock.h | 31 ++++++++++++ include/net/request_sock.h | 2 +- net/ipv6/Makefile | 3 +- net/ipv6/inet6_connection_sock.c | 96 +++++++++++++++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 78 +++--------------------------- 5 files changed, 137 insertions(+), 73 deletions(-) create mode 100644 include/net/inet6_connection_sock.h create mode 100644 net/ipv6/inet6_connection_sock.c diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h new file mode 100644 index 0000000..aa30ebd --- /dev/null +++ b/include/net/inet6_connection_sock.h @@ -0,0 +1,31 @@ +/* + * NET Generic infrastructure for INET6 connection oriented protocols. + * + * Authors: Many people, see the TCPv6 sources + * + * From code originally in TCPv6 + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET6_CONNECTION_SOCK_H +#define _INET6_CONNECTION_SOCK_H + +#include + +struct sock; +struct request_sock; + +extern struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif); + +extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout); +#endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/include/net/request_sock.h b/include/net/request_sock.h index b52cc52..11641c9 100644 --- a/include/net/request_sock.h +++ b/include/net/request_sock.h @@ -244,7 +244,7 @@ static inline int reqsk_queue_is_full(const struct request_sock_queue *queue) static inline void reqsk_queue_hash_req(struct request_sock_queue *queue, u32 hash, struct request_sock *req, - unsigned timeout) + unsigned long timeout) { struct listen_sock *lopt = queue->listen_opt; diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile index 6460eec..9601fd7 100644 --- a/net/ipv6/Makefile +++ b/net/ipv6/Makefile @@ -8,7 +8,8 @@ ipv6-objs := af_inet6.o anycast.o ip6_output.o ip6_input.o addrconf.o sit.o \ route.o ip6_fib.o ipv6_sockglue.o ndisc.o udp.o raw.o \ protocol.o icmp.o mcast.o reassembly.o tcp_ipv6.o \ exthdrs.o sysctl_net_ipv6.o datagram.o proc.o \ - ip6_flowlabel.o ipv6_syms.o netfilter.o + ip6_flowlabel.o ipv6_syms.o netfilter.o \ + inet6_connection_sock.o ipv6-$(CONFIG_XFRM) += xfrm6_policy.o xfrm6_state.o xfrm6_input.o \ xfrm6_output.o diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c new file mode 100644 index 0000000..04ff443 --- /dev/null +++ b/net/ipv6/inet6_connection_sock.c @@ -0,0 +1,96 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Support for INET6 connection oriented protocols. + * + * Authors: See the TCPv6 sources + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or(at your option) any later version. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +/* + * request_sock (formerly open request) hash tables. + */ +static u32 inet6_synq_hash(const struct in6_addr *raddr, const u16 rport, + const u32 rnd, const u16 synq_hsize) +{ + u32 a = raddr->s6_addr32[0]; + u32 b = raddr->s6_addr32[1]; + u32 c = raddr->s6_addr32[2]; + + a += JHASH_GOLDEN_RATIO; + b += JHASH_GOLDEN_RATIO; + c += rnd; + __jhash_mix(a, b, c); + + a += raddr->s6_addr32[3]; + b += (u32)rport; + __jhash_mix(a, b, c); + + return c & (synq_hsize - 1); +} + +struct request_sock *inet6_csk_search_req(const struct sock *sk, + struct request_sock ***prevp, + const __u16 rport, + const struct in6_addr *raddr, + const struct in6_addr *laddr, + const int iif) +{ + const struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + struct request_sock *req, **prev; + + for (prev = &lopt->syn_table[inet6_synq_hash(raddr, rport, + lopt->hash_rnd, + lopt->nr_table_entries)]; + (req = *prev) != NULL; + prev = &req->dl_next) { + const struct tcp6_request_sock *treq = tcp6_rsk(req); + + if (inet_rsk(req)->rmt_port == rport && + req->rsk_ops->family == AF_INET6 && + ipv6_addr_equal(&treq->rmt_addr, raddr) && + ipv6_addr_equal(&treq->loc_addr, laddr) && + (!treq->iif || treq->iif == iif)) { + BUG_TRAP(req->sk == NULL); + *prevp = prev; + return req; + } + } + + return NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_search_req); + +void inet6_csk_reqsk_queue_hash_add(struct sock *sk, + struct request_sock *req, + const unsigned long timeout) +{ + struct inet_connection_sock *icsk = inet_csk(sk); + struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; + const u32 h = inet6_synq_hash(&tcp6_rsk(req)->rmt_addr, + inet_rsk(req)->rmt_port, + lopt->hash_rnd, lopt->nr_table_entries); + + reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, timeout); + inet_csk_reqsk_queue_added(sk, timeout); +} + +EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index bf41f84..5a10d30 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -48,6 +48,7 @@ #include #include #include +#include #include #include #include @@ -118,60 +119,6 @@ static void tcp_v6_hash(struct sock *sk) } } -/* - * Open request hash tables. - */ - -static u32 tcp_v6_synq_hash(const struct in6_addr *raddr, const u16 rport, const u32 rnd) -{ - u32 a, b, c; - - a = raddr->s6_addr32[0]; - b = raddr->s6_addr32[1]; - c = raddr->s6_addr32[2]; - - a += JHASH_GOLDEN_RATIO; - b += JHASH_GOLDEN_RATIO; - c += rnd; - __jhash_mix(a, b, c); - - a += raddr->s6_addr32[3]; - b += (u32) rport; - __jhash_mix(a, b, c); - - return c & (TCP_SYNQ_HSIZE - 1); -} - -static struct request_sock *tcp_v6_search_req(const struct sock *sk, - struct request_sock ***prevp, - __u16 rport, - struct in6_addr *raddr, - struct in6_addr *laddr, - int iif) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - struct request_sock *req, **prev; - - for (prev = &lopt->syn_table[tcp_v6_synq_hash(raddr, rport, lopt->hash_rnd)]; - (req = *prev) != NULL; - prev = &req->dl_next) { - const struct tcp6_request_sock *treq = tcp6_rsk(req); - - if (inet_rsk(req)->rmt_port == rport && - req->rsk_ops->family == AF_INET6 && - ipv6_addr_equal(&treq->rmt_addr, raddr) && - ipv6_addr_equal(&treq->loc_addr, laddr) && - (!treq->iif || treq->iif == iif)) { - BUG_TRAP(req->sk == NULL); - *prevp = prev; - return req; - } - } - - return NULL; -} - static __inline__ u16 tcp_v6_check(struct tcphdr *th, int len, struct in6_addr *saddr, struct in6_addr *daddr, @@ -662,8 +609,8 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, if (sock_owned_by_user(sk)) goto out; - req = tcp_v6_search_req(sk, &prev, th->dest, &hdr->daddr, - &hdr->saddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->dest, &hdr->daddr, + &hdr->saddr, inet6_iif(skb)); if (!req) goto out; @@ -978,8 +925,9 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) struct sock *nsk; /* Find possible connection requests. */ - req = tcp_v6_search_req(sk, &prev, th->source, &skb->nh.ipv6h->saddr, - &skb->nh.ipv6h->daddr, inet6_iif(skb)); + req = inet6_csk_search_req(sk, &prev, th->source, + &skb->nh.ipv6h->saddr, + &skb->nh.ipv6h->daddr, inet6_iif(skb)); if (req) return tcp_check_req(sk, skb, req, prev); @@ -1003,17 +951,6 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) return sk; } -static void tcp_v6_synq_add(struct sock *sk, struct request_sock *req) -{ - struct inet_connection_sock *icsk = inet_csk(sk); - struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = tcp_v6_synq_hash(&tcp6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd); - - reqsk_queue_hash_req(&icsk->icsk_accept_queue, h, req, TCP_TIMEOUT_INIT); - inet_csk_reqsk_queue_added(sk, TCP_TIMEOUT_INIT); -} - - /* FIXME: this is substantially similar to the ipv4 code. * Can some kind of merge be done? -- erics */ @@ -1083,8 +1020,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (tcp_v6_send_synack(sk, req, NULL)) goto drop; - tcp_v6_synq_add(sk, req); - + inet6_csk_reqsk_queue_hash_add(sk, req, TCP_TIMEOUT_INIT); return 0; drop: -- cgit v1.1 From ca304b6104ffdd120bb6687a88a0625e58bc71cd Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:40 -0800 Subject: [IPV6]: Introduce inet6_rsk() And inet6_rsk_offset in inet_request_sock, for the same reasons as inet_sock's pinfo6 member. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ip.h | 4 ++++ include/linux/ipv6.h | 39 +++++++++++++++++++++++++++++++++------ net/ipv4/inet_diag.c | 8 ++++---- net/ipv6/inet6_connection_sock.c | 4 ++-- net/ipv6/tcp_ipv6.c | 19 +++++++++---------- 5 files changed, 52 insertions(+), 22 deletions(-) diff --git a/include/linux/ip.h b/include/linux/ip.h index 33e8a19..5a560da 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -110,6 +110,10 @@ struct ip_options { struct inet_request_sock { struct request_sock req; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u16 inet6_rsk_offset; + /* 2 bytes hole, try to pack */ +#endif u32 loc_addr; u32 rmt_addr; u16 rmt_port; diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index e0b9227..7d3e86d 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -199,18 +199,17 @@ static inline int inet6_iif(const struct sk_buff *skb) return IP6CB(skb)->iif; } -struct tcp6_request_sock { - struct tcp_request_sock req; +struct inet6_request_sock { struct in6_addr loc_addr; struct in6_addr rmt_addr; struct sk_buff *pktopts; int iif; }; -static inline struct tcp6_request_sock *tcp6_rsk(const struct request_sock *sk) -{ - return (struct tcp6_request_sock *)sk; -} +struct tcp6_request_sock { + struct tcp_request_sock tcp6rsk_tcp; + struct inet6_request_sock tcp6rsk_inet6; +}; /** * struct ipv6_pinfo - ipv6 private area @@ -304,6 +303,28 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) return inet_sk(__sk)->pinet6; } +static inline struct inet6_request_sock * + inet6_rsk(const struct request_sock *rsk) +{ + return (struct inet6_request_sock *)(((u8 *)rsk) + + inet_rsk(rsk)->inet6_rsk_offset); +} + +static inline u32 inet6_rsk_offset(struct request_sock *rsk) +{ + return rsk->rsk_ops->obj_size - sizeof(struct inet6_request_sock); +} + +static inline struct request_sock *inet6_reqsk_alloc(struct request_sock_ops *ops) +{ + struct request_sock *req = reqsk_alloc(ops); + + if (req != NULL) + inet_rsk(req)->inet6_rsk_offset = inet6_rsk_offset(req); + + return req; +} + static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return (struct raw6_sock *)sk; @@ -361,6 +382,12 @@ static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) return NULL; } +static inline struct inet6_request_sock * + inet6_rsk(const struct request_sock *rsk) +{ + return NULL; +} + static inline struct raw6_sock *raw6_sk(const struct sock *sk) { return NULL; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 39061ed..3ce73b1 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -489,9 +489,9 @@ static int inet_diag_fill_req(struct sk_buff *skb, struct sock *sk, #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6_rsk(req)->loc_addr); + &inet6_rsk(req)->loc_addr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6_rsk(req)->rmt_addr); + &inet6_rsk(req)->rmt_addr); } #endif nlh->nlmsg_len = skb->tail - b; @@ -553,13 +553,13 @@ static int inet_diag_dump_reqs(struct sk_buff *skb, struct sock *sk, entry.saddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->loc_addr.s6_addr32 : + inet6_rsk(req)->loc_addr.s6_addr32 : #endif &ireq->loc_addr; entry.daddr = #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) (entry.family == AF_INET6) ? - tcp6_rsk(req)->rmt_addr.s6_addr32 : + inet6_rsk(req)->rmt_addr.s6_addr32 : #endif &ireq->rmt_addr; entry.dport = ntohs(ireq->rmt_port); diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index 04ff443..fe874ee 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -61,7 +61,7 @@ struct request_sock *inet6_csk_search_req(const struct sock *sk, lopt->nr_table_entries)]; (req = *prev) != NULL; prev = &req->dl_next) { - const struct tcp6_request_sock *treq = tcp6_rsk(req); + const struct inet6_request_sock *treq = inet6_rsk(req); if (inet_rsk(req)->rmt_port == rport && req->rsk_ops->family == AF_INET6 && @@ -85,7 +85,7 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, { struct inet_connection_sock *icsk = inet_csk(sk); struct listen_sock *lopt = icsk->icsk_accept_queue.listen_opt; - const u32 h = inet6_synq_hash(&tcp6_rsk(req)->rmt_addr, + const u32 h = inet6_synq_hash(&inet6_rsk(req)->rmt_addr, inet_rsk(req)->rmt_port, lopt->hash_rnd, lopt->nr_table_entries); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5a10d30..c2472d7 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -656,7 +656,7 @@ out: static int tcp_v6_send_synack(struct sock *sk, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff * skb; struct ipv6_txoptions *opt = NULL; @@ -722,8 +722,8 @@ done: static void tcp_v6_reqsk_destructor(struct request_sock *req) { - if (tcp6_rsk(req)->pktopts) - kfree_skb(tcp6_rsk(req)->pktopts); + if (inet6_rsk(req)->pktopts) + kfree_skb(inet6_rsk(req)->pktopts); } static struct request_sock_ops tcp6_request_sock_ops = { @@ -956,7 +956,7 @@ static struct sock *tcp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) */ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) { - struct tcp6_request_sock *treq; + struct inet6_request_sock *treq; struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_options_received tmp_opt; struct tcp_sock *tp = tcp_sk(sk); @@ -981,7 +981,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) goto drop; - req = reqsk_alloc(&tcp6_request_sock_ops); + req = inet6_reqsk_alloc(&tcp6_request_sock_ops); if (req == NULL) goto drop; @@ -994,7 +994,7 @@ static int tcp_v6_conn_request(struct sock *sk, struct sk_buff *skb) tmp_opt.tstamp_ok = tmp_opt.saw_tstamp; tcp_openreq_init(req, &tmp_opt, skb); - treq = tcp6_rsk(req); + treq = inet6_rsk(req); ipv6_addr_copy(&treq->rmt_addr, &skb->nh.ipv6h->saddr); ipv6_addr_copy(&treq->loc_addr, &skb->nh.ipv6h->daddr); TCP_ECN_create_request(req, skb->h.th); @@ -1035,7 +1035,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp6_request_sock *treq = tcp6_rsk(req); + struct inet6_request_sock *treq = inet6_rsk(req); struct ipv6_pinfo *newnp, *np = inet6_sk(sk); struct tcp6_sock *newtcp6sk; struct inet_sock *newinet; @@ -1723,14 +1723,13 @@ static int tcp_v6_destroy_sock(struct sock *sk) static void get_openreq6(struct seq_file *seq, struct sock *sk, struct request_sock *req, int i, int uid) { - struct in6_addr *dest, *src; int ttd = req->expires - jiffies; + struct in6_addr *src = &inet6_rsk(req)->loc_addr; + struct in6_addr *dest = &inet6_rsk(req)->rmt_addr; if (ttd < 0) ttd = 0; - src = &tcp6_rsk(req)->loc_addr; - dest = &tcp6_rsk(req)->rmt_addr; seq_printf(seq, "%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X " "%02X %08X:%08X %02X:%08lX %08X %5d %8d %d %d %p\n", -- cgit v1.1 From 8292a17a399ffb7c5c8b083db4ad994e090055f7 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:15:52 -0800 Subject: [ICSK]: Rename struct tcp_func to struct inet_connection_sock_af_ops And move it to struct inet_connection_sock. DCCP will use it in the upcoming changesets. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/tcp.h | 2 -- include/net/inet_connection_sock.h | 26 ++++++++++++++++++++ include/net/tcp.h | 50 +------------------------------------- include/net/transp_v6.h | 2 +- net/ipv4/syncookies.c | 4 +-- net/ipv4/tcp.c | 8 +++--- net/ipv4/tcp_input.c | 11 +++++---- net/ipv4/tcp_ipv4.c | 11 ++++----- net/ipv4/tcp_minisocks.c | 8 +++--- net/ipv4/tcp_output.c | 17 ++++++------- net/ipv6/ipv6_sockglue.c | 2 +- net/ipv6/tcp_ipv6.c | 28 ++++++++++----------- 12 files changed, 71 insertions(+), 98 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 0e1da66..4e14340 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -295,8 +295,6 @@ struct tcp_sock { struct sk_buff_head out_of_order_queue; /* Out of order segments go here */ - struct tcp_func *af_specific; /* Operations which are AF_INET{4,6} specific */ - __u32 rcv_wnd; /* Current receiver window */ __u32 rcv_wup; /* rcv_nxt on last window update sent */ __u32 write_seq; /* Tail(+1) of data held in tcp send buffer */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index ccc81a1..9e20d20 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -15,6 +15,7 @@ #ifndef _INET_CONNECTION_SOCK_H #define _INET_CONNECTION_SOCK_H +#include #include #include #include @@ -29,6 +30,29 @@ struct inet_bind_bucket; struct inet_hashinfo; struct tcp_congestion_ops; +/* + * Pointers to address related TCP functions + * (i.e. things that depend on the address family) + */ +struct inet_connection_sock_af_ops { + int (*queue_xmit)(struct sk_buff *skb, int ipfragok); + void (*send_check)(struct sock *sk, int len, + struct sk_buff *skb); + int (*rebuild_header)(struct sock *sk); + int (*conn_request)(struct sock *sk, struct sk_buff *skb); + struct sock *(*syn_recv_sock)(struct sock *sk, struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst); + int (*remember_stamp)(struct sock *sk); + __u16 net_header_len; + int (*setsockopt)(struct sock *sk, int level, int optname, + char __user *optval, int optlen); + int (*getsockopt)(struct sock *sk, int level, int optname, + char __user *optval, int __user *optlen); + void (*addr2sockaddr)(struct sock *sk, struct sockaddr *); + int sockaddr_len; +}; + /** inet_connection_sock - INET connection oriented sock * * @icsk_accept_queue: FIFO of established children @@ -37,6 +61,7 @@ struct tcp_congestion_ops; * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout * @icsk_ca_ops Pluggable congestion control hook + * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ca_state: Congestion control state * @icsk_retransmits: Number of unrecovered [RTO] timeouts * @icsk_pending: Scheduled timer event @@ -55,6 +80,7 @@ struct inet_connection_sock { struct timer_list icsk_delack_timer; __u32 icsk_rto; struct tcp_congestion_ops *icsk_ca_ops; + struct inet_connection_sock_af_ops *icsk_af_ops; __u8 icsk_ca_state; __u8 icsk_retransmits; __u8 icsk_pending; diff --git a/include/net/tcp.h b/include/net/tcp.h index d78025f..83b117a 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -225,53 +225,6 @@ extern atomic_t tcp_sockets_allocated; extern int tcp_memory_pressure; /* - * Pointers to address related TCP functions - * (i.e. things that depend on the address family) - */ - -struct tcp_func { - int (*queue_xmit) (struct sk_buff *skb, - int ipfragok); - - void (*send_check) (struct sock *sk, - struct tcphdr *th, - int len, - struct sk_buff *skb); - - int (*rebuild_header) (struct sock *sk); - - int (*conn_request) (struct sock *sk, - struct sk_buff *skb); - - struct sock * (*syn_recv_sock) (struct sock *sk, - struct sk_buff *skb, - struct request_sock *req, - struct dst_entry *dst); - - int (*remember_stamp) (struct sock *sk); - - __u16 net_header_len; - - int (*setsockopt) (struct sock *sk, - int level, - int optname, - char __user *optval, - int optlen); - - int (*getsockopt) (struct sock *sk, - int level, - int optname, - char __user *optval, - int __user *optlen); - - - void (*addr2sockaddr) (struct sock *sk, - struct sockaddr *); - - int sockaddr_len; -}; - -/* * The next routines deal with comparing 32 bit unsigned ints * and worry about wraparound (automatic with unsigned arithmetic). */ @@ -405,8 +358,7 @@ extern void tcp_parse_options(struct sk_buff *skb, * TCP v4 functions exported for the inet6 API */ -extern void tcp_v4_send_check(struct sock *sk, - struct tcphdr *th, int len, +extern void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); extern int tcp_v4_conn_request(struct sock *sk, diff --git a/include/net/transp_v6.h b/include/net/transp_v6.h index 4e86f2d..61f724c 100644 --- a/include/net/transp_v6.h +++ b/include/net/transp_v6.h @@ -44,7 +44,7 @@ extern int datagram_send_ctl(struct msghdr *msg, /* * address family specific functions */ -extern struct tcp_func ipv4_specific; +extern struct inet_connection_sock_af_ops ipv4_specific; extern int inet6_destroy_sock(struct sock *sk); diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index a34e60e..e20be33 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -173,10 +173,10 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, struct request_sock *req, struct dst_entry *dst) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct sock *child; - child = tp->af_specific->syn_recv_sock(sk, skb, req, dst); + child = icsk->icsk_af_ops->syn_recv_sock(sk, skb, req, dst); if (child) inet_csk_reqsk_queue_add(sk, req, child); else diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index ef98b14..eacfe6a 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1696,8 +1696,8 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, char __user *optval, int err = 0; if (level != SOL_TCP) - return tp->af_specific->setsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->setsockopt(sk, level, optname, + optval, optlen); /* This is a string value all the others are int's */ if (optname == TCP_CONGESTION) { @@ -1939,8 +1939,8 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int val, len; if (level != SOL_TCP) - return tp->af_specific->getsockopt(sk, level, optname, - optval, optlen); + return icsk->icsk_af_ops->getsockopt(sk, level, optname, + optval, optlen); if (get_user(len, optlen)) return -EFAULT; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index bf2e230..7de6184 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -4071,8 +4071,10 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, mb(); tcp_set_state(sk, TCP_ESTABLISHED); + icsk = inet_csk(sk); + /* Make sure socket is routed, for correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); @@ -4098,8 +4100,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, sk_wake_async(sk, 0, POLL_OUT); } - icsk = inet_csk(sk); - if (sk->sk_write_pending || icsk->icsk_accept_queue.rskq_defer_accept || icsk->icsk_ack.pingpong) { @@ -4220,6 +4220,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int queued = 0; tp->rx_opt.saw_tstamp = 0; @@ -4236,7 +4237,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, goto discard; if(th->syn) { - if(tp->af_specific->conn_request(sk, skb) < 0) + if (icsk->icsk_af_ops->conn_request(sk, skb) < 0) return 1; /* Now we have several options: In theory there is @@ -4349,7 +4350,7 @@ int tcp_rcv_state_process(struct sock *sk, struct sk_buff *skb, /* Make sure socket is routed, for * correct metrics. */ - tp->af_specific->rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); tcp_init_metrics(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 2aa19c8..704cf21 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -86,8 +86,7 @@ int sysctl_tcp_low_latency; /* Socket used for sending RSTs */ static struct socket *tcp_socket; -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb); +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb); struct inet_hashinfo __cacheline_aligned tcp_hashinfo = { .lhash_lock = RW_LOCK_UNLOCKED, @@ -645,10 +644,10 @@ out: } /* This routine computes an IPv4 TCP checksum. */ -void tcp_v4_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +void tcp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct inet_sock *inet = inet_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~tcp_v4_check(th, len, inet->saddr, inet->daddr, 0); @@ -1383,7 +1382,7 @@ int tcp_v4_tw_remember_stamp(struct inet_timewait_sock *tw) return 0; } -struct tcp_func ipv4_specific = { +struct inet_connection_sock_af_ops ipv4_specific = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1434,7 +1433,7 @@ static int tcp_v4_init_sock(struct sock *sk) sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); - tp->af_specific = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 1b66a2a..9c02968 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -274,18 +274,18 @@ kill: void tcp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; + const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); int recycle_ok = 0; if (tcp_death_row.sysctl_tw_recycle && tp->rx_opt.ts_recent_stamp) - recycle_ok = tp->af_specific->remember_stamp(sk); + recycle_ok = icsk->icsk_af_ops->remember_stamp(sk); if (tcp_death_row.tw_count < tcp_death_row.sysctl_max_tw_buckets) tw = inet_twsk_alloc(sk, state); if (tw != NULL) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); - const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); tw->tw_rcv_wscale = tp->rx_opt.rcv_wscale; @@ -456,7 +456,6 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, struct request_sock **prev) { struct tcphdr *th = skb->h.th; - struct tcp_sock *tp = tcp_sk(sk); u32 flg = tcp_flag_word(th) & (TCP_FLAG_RST|TCP_FLAG_SYN|TCP_FLAG_ACK); int paws_reject = 0; struct tcp_options_received tmp_opt; @@ -613,7 +612,8 @@ struct sock *tcp_check_req(struct sock *sk,struct sk_buff *skb, * ESTABLISHED STATE. If it will be dropped after * socket is created, wait for troubles. */ - child = tp->af_specific->syn_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, + req, NULL); if (child == NULL) goto listen_overflow; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index b7325e0..af1946c 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -371,7 +371,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_ECN_send(sk, tp, skb, tcp_header_size); } - tp->af_specific->send_check(sk, th, skb->len, skb); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (likely(tcb->flags & TCPCB_FLAG_ACK)) tcp_event_ack_sent(sk, tcp_skb_pcount(skb)); @@ -381,7 +381,7 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it, TCP_INC_STATS(TCP_MIB_OUTSEGS); - err = tp->af_specific->queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (unlikely(err <= 0)) return err; @@ -638,12 +638,11 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); - int mss_now; - /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - mss_now = pmtu - tp->af_specific->net_header_len - sizeof(struct tcphdr); + int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ if (mss_now > tp->rx_opt.mss_clamp) @@ -705,9 +704,9 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) xmit_size_goal = mss_now; if (doing_tso) { - xmit_size_goal = 65535 - - tp->af_specific->net_header_len - - tp->ext_header_len - tp->tcp_header_len; + xmit_size_goal = (65535 - + inet_csk(sk)->icsk_af_ops->net_header_len - + tp->ext_header_len - tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) @@ -1422,7 +1421,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb) (sysctl_tcp_retrans_collapse != 0)) tcp_retrans_try_collapse(sk, skb, cur_mss); - if(tp->af_specific->rebuild_header(sk)) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk)) return -EHOSTUNREACH; /* Routing failure or similar. */ /* Some Solaris stacks overoptimize and ignore the FIN on a diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index 3620718..b6b63fa 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -170,7 +170,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - tp->af_specific = &ipv4_specific; + inet_csk(sk)->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, tp->pmtu_cookie); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index c2472d7..8ce8a13 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -68,14 +68,14 @@ static void tcp_v6_send_reset(struct sk_buff *skb); static void tcp_v6_reqsk_send_ack(struct sk_buff *skb, struct request_sock *req); -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); -static struct tcp_func ipv6_mapped; -static struct tcp_func ipv6_specific; +static struct inet_connection_sock_af_ops ipv6_mapped; +static struct inet_connection_sock_af_ops ipv6_specific; int inet6_csk_bind_conflict(const struct sock *sk, const struct inet_bind_bucket *tb) @@ -107,9 +107,7 @@ static int tcp_v6_get_port(struct sock *sk, unsigned short snum) static void tcp_v6_hash(struct sock *sk) { if (sk->sk_state != TCP_CLOSE) { - struct tcp_sock *tp = tcp_sk(sk); - - if (tp->af_specific == &ipv6_mapped) { + if (inet_csk(sk)->icsk_af_ops == &ipv6_mapped) { tcp_prot.hash(sk); return; } @@ -417,14 +415,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - tp->af_specific = &ipv6_mapped; + inet_csk(sk)->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { tp->ext_header_len = exthdrlen; - tp->af_specific = &ipv6_specific; + inet_csk(sk)->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -751,10 +749,10 @@ static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) } -static void tcp_v6_send_check(struct sock *sk, struct tcphdr *th, int len, - struct sk_buff *skb) +static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); + struct tcphdr *th = skb->h.th; if (skb->ip_summed == CHECKSUM_HW) { th->check = ~csum_ipv6_magic(&np->saddr, &np->daddr, len, IPPROTO_TCP, 0); @@ -1070,7 +1068,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); - newtp->af_specific = &ipv6_mapped; + inet_csk(newsk)->icsk_af_ops = &ipv6_mapped; newsk->sk_backlog_rcv = tcp_v4_do_rcv; newnp->pktoptions = NULL; newnp->opt = NULL; @@ -1084,7 +1082,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, */ /* It is tricky place. Until this moment IPv4 tcp - worked with IPv6 af_tcp.af_specific. + worked with IPv6 icsk.icsk_af_ops. Sync it now. */ tcp_sync_mss(newsk, newtp->pmtu_cookie); @@ -1631,7 +1629,7 @@ static int tcp_v6_remember_stamp(struct sock *sk) return 0; } -static struct tcp_func ipv6_specific = { +static struct inet_connection_sock_af_ops ipv6_specific = { .queue_xmit = tcp_v6_xmit, .send_check = tcp_v6_send_check, .rebuild_header = tcp_v6_rebuild_header, @@ -1650,7 +1648,7 @@ static struct tcp_func ipv6_specific = { * TCP over IPv4 via INET6 API */ -static struct tcp_func ipv6_mapped = { +static struct inet_connection_sock_af_ops ipv6_mapped = { .queue_xmit = ip_queue_xmit, .send_check = tcp_v4_send_check, .rebuild_header = inet_sk_rebuild_header, @@ -1700,7 +1698,7 @@ static int tcp_v6_init_sock(struct sock *sk) sk->sk_state = TCP_CLOSE; - tp->af_specific = &ipv6_specific; + icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); -- cgit v1.1 From af05dc9394feb193d221bc9d4c6db768facb4b40 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:16:04 -0800 Subject: [ICSK]: Move v4_addr2sockaddr from TCP to icsk Renaming it to inet_csk_addr2sockaddr. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/inet_connection_sock.h | 2 ++ net/ipv4/inet_connection_sock.c | 12 ++++++++++++ net/ipv4/tcp_ipv4.c | 12 +----------- 3 files changed, 15 insertions(+), 11 deletions(-) diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 9e20d20..e50e2b8 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -303,4 +303,6 @@ static inline unsigned int inet_csk_listen_poll(const struct sock *sk) extern int inet_csk_listen_start(struct sock *sk, const int nr_table_entries); extern void inet_csk_listen_stop(struct sock *sk); +extern void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); + #endif /* _INET_CONNECTION_SOCK_H */ diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index e2bf508..ae20281 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -636,3 +636,15 @@ void inet_csk_listen_stop(struct sock *sk) } EXPORT_SYMBOL_GPL(inet_csk_listen_stop); + +void inet_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr) +{ + struct sockaddr_in *sin = (struct sockaddr_in *)uaddr; + const struct inet_sock *inet = inet_sk(sk); + + sin->sin_family = AF_INET; + sin->sin_addr.s_addr = inet->daddr; + sin->sin_port = inet->dport; +} + +EXPORT_SYMBOL_GPL(inet_csk_addr2sockaddr); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 704cf21..0b5ab04 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1314,16 +1314,6 @@ do_time_wait: goto discard_it; } -static void v4_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct sockaddr_in *sin = (struct sockaddr_in *) uaddr; - struct inet_sock *inet = inet_sk(sk); - - sin->sin_family = AF_INET; - sin->sin_addr.s_addr = inet->daddr; - sin->sin_port = inet->dport; -} - /* VJ's idea. Save last timestamp seen from this destination * and hold it at least for normal timewait interval to use for duplicate * segment detection in subsequent connections, before they enter synchronized @@ -1392,7 +1382,7 @@ struct inet_connection_sock_af_ops ipv4_specific = { .net_header_len = sizeof(struct iphdr), .setsockopt = ip_setsockopt, .getsockopt = ip_getsockopt, - .addr2sockaddr = v4_addr2sockaddr, + .addr2sockaddr = inet_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in), }; -- cgit v1.1 From 57cca05af1e20fdc65b55be52c042c234f86c866 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:16:16 -0800 Subject: [DCCP]: Introduce dccp_ipv4_af_ops And make the core DCCP code AF agnostic, just like TCP, now its time to work on net/dccp/ipv6.c, we are close to the end! Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/input.c | 5 +++-- net/dccp/ipv4.c | 23 +++++++++++++++++++++++ net/dccp/minisocks.c | 2 +- net/dccp/output.c | 14 ++++++-------- net/dccp/proto.c | 9 ++++++--- 5 files changed, 39 insertions(+), 14 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 3454d59..c81488f 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -329,7 +329,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, dccp_set_state(sk, DCCP_PARTOPEN); /* Make sure socket is routed, for correct metrics. */ - inet_sk_rebuild_header(sk); + icsk->icsk_af_ops->rebuild_header(sk); if (!sock_flag(sk, SOCK_DEAD)) { sk->sk_state_change(sk); @@ -444,7 +444,8 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, */ if (sk->sk_state == DCCP_LISTEN) { if (dh->dccph_type == DCCP_PKT_REQUEST) { - if (dccp_v4_conn_request(sk, skb) < 0) + if (inet_csk(sk)->icsk_af_ops->conn_request(sk, + skb) < 0) return 1; /* FIXME: do congestion control initialization */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 1ac3e30..0ce7d0f 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -607,6 +607,15 @@ out: sock_put(sk); } +/* This routine computes an IPv4 DCCP checksum. */ +static void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + const struct inet_sock *inet = inet_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr); +} + int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; @@ -1195,6 +1204,19 @@ do_time_wait: goto no_dccp_socket; } +struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v4_conn_request, + .syn_recv_sock = dccp_v4_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ip_setsockopt, + .getsockopt = ip_getsockopt, + .addr2sockaddr = inet_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in), +}; + static int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); @@ -1240,6 +1262,7 @@ static int dccp_v4_init_sock(struct sock *sk) inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 1393461..c7ff80c 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -214,7 +214,7 @@ struct sock *dccp_check_req(struct sock *sk, struct sk_buff *skb, goto drop; } - child = dccp_v4_request_recv_sock(sk, skb, req, NULL); + child = inet_csk(sk)->icsk_af_ops->syn_recv_sock(sk, skb, req, NULL); if (child == NULL) goto listen_overflow; diff --git a/net/dccp/output.c b/net/dccp/output.c index 74ff870..f358805 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -43,6 +43,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) { if (likely(skb != NULL)) { const struct inet_sock *inet = inet_sk(sk); + const struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); struct dccp_hdr *dh; @@ -108,8 +109,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) break; } - dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, - inet->daddr); + icsk->icsk_af_ops->send_check(sk, skb->len, skb); if (set_ack) dccp_event_ack_sent(sk); @@ -117,7 +117,7 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) DCCP_INC_STATS(DCCP_MIB_OUTSEGS); memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt)); - err = ip_queue_xmit(skb, 0); + err = icsk->icsk_af_ops->queue_xmit(skb, 0); if (err <= 0) return err; @@ -135,16 +135,14 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { struct dccp_sock *dp = dccp_sk(sk); - int mss_now; - /* * FIXME: we really should be using the af_specific thing to support * IPv6. * mss_now = pmtu - tp->af_specific->net_header_len - * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); */ - mss_now = pmtu - sizeof(struct iphdr) - sizeof(struct dccp_hdr) - - sizeof(struct dccp_hdr_ext); + int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); /* Now subtract optional transport overhead */ mss_now -= dp->dccps_ext_header_len; @@ -266,7 +264,7 @@ int dccp_write_xmit(struct sock *sk, struct sk_buff *skb, long *timeo) int dccp_retransmit_skb(struct sock *sk, struct sk_buff *skb) { - if (inet_sk_rebuild_header(sk) != 0) + if (inet_csk(sk)->icsk_af_ops->rebuild_header(sk) != 0) return -EHOSTUNREACH; /* Routing failure or similar. */ return dccp_transmit_skb(sk, (skb_cloned(skb) ? diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 8a6b2a9..7b30c12 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -254,7 +254,9 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, int val; if (level != SOL_DCCP) - return ip_setsockopt(sk, level, optname, optval, optlen); + return inet_csk(sk)->icsk_af_ops->setsockopt(sk, level, + optname, optval, + optlen); if (optlen < sizeof(int)) return -EINVAL; @@ -320,8 +322,9 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, int val, len; if (level != SOL_DCCP) - return ip_getsockopt(sk, level, optname, optval, optlen); - + return inet_csk(sk)->icsk_af_ops->getsockopt(sk, level, + optname, optval, + optlen); if (get_user(len, optlen)) return -EFAULT; -- cgit v1.1 From 3305b80c214c642b89cd5c21af83bc91ec13f8bd Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 13 Dec 2005 23:16:37 -0800 Subject: [IP]: Simplify and consolidate MSG_PEEK error handling When a packet is obtained from skb_recv_datagram with MSG_PEEK enabled it is left on the socket receive queue. This means that when we detect a checksum error we have to be careful when trying to free the packet as someone could have dequeued it in the time being. Currently this delicate logic is duplicated three times between UDPv4, UDPv6 and RAWv6. This patch moves them into a one place and simplifies the code somewhat. This is based on a suggestion by Eric Dumazet. Signed-off-by: Herbert Xu Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 ++ net/core/datagram.c | 36 ++++++++++++++++++++++++++++++++++++ net/ipv4/udp.c | 15 +-------------- net/ipv6/raw.c | 16 +++------------- net/ipv6/udp.c | 16 ++-------------- 5 files changed, 44 insertions(+), 41 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 8c5d600..97f6580 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -1239,6 +1239,8 @@ extern int skb_copy_and_csum_datagram_iovec(struct sk_buff *skb, int hlen, struct iovec *iov); extern void skb_free_datagram(struct sock *sk, struct sk_buff *skb); +extern void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, + unsigned int flags); extern unsigned int skb_checksum(const struct sk_buff *skb, int offset, int len, unsigned int csum); extern int skb_copy_bits(const struct sk_buff *skb, int offset, diff --git a/net/core/datagram.c b/net/core/datagram.c index 1bcfef5..f8d322e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include @@ -200,6 +201,41 @@ void skb_free_datagram(struct sock *sk, struct sk_buff *skb) } /** + * skb_kill_datagram - Free a datagram skbuff forcibly + * @sk: socket + * @skb: datagram skbuff + * @flags: MSG_ flags + * + * This function frees a datagram skbuff that was received by + * skb_recv_datagram. The flags argument must match the one + * used for skb_recv_datagram. + * + * If the MSG_PEEK flag is set, and the packet is still on the + * receive queue of the socket, it will be taken off the queue + * before it is freed. + * + * This function currently only disables BH when acquiring the + * sk_receive_queue lock. Therefore it must not be used in a + * context where that lock is acquired in an IRQ context. + */ + +void skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags) +{ + if (flags & MSG_PEEK) { + spin_lock_bh(&sk->sk_receive_queue.lock); + if (skb == skb_peek(&sk->sk_receive_queue)) { + __skb_unlink(skb, &sk->sk_receive_queue); + atomic_dec(&skb->users); + } + spin_unlock_bh(&sk->sk_receive_queue.lock); + } + + kfree_skb(skb); +} + +EXPORT_SYMBOL(skb_kill_datagram); + +/** * skb_copy_datagram_iovec - Copy a datagram to an iovec. * @skb: buffer to copy * @offset: offset in the buffer to start copying from diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 2422a5f..012c462 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -846,20 +846,7 @@ out: csum_copy_err: UDP_INC_STATS_BH(UDP_MIB_INERRORS); - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (noblock) return -EAGAIN; diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index a66900c..66f1d12 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -433,25 +434,14 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } + skb_kill_datagram(sk, skb, flags); /* Error for blocking case is chosen to masquerade as some normal condition. */ err = (flags&MSG_DONTWAIT) ? -EAGAIN : -EHOSTUNREACH; /* FIXME: increment a raw6 drops counter here */ - goto out_free; + goto out; } static int rawv6_push_pending_frames(struct sock *sk, struct flowi *fl, diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 5cc8731..d8538dc 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -36,6 +36,7 @@ #include #include #include +#include #include #include @@ -300,20 +301,7 @@ out: return err; csum_copy_err: - /* Clear queue. */ - if (flags&MSG_PEEK) { - int clear = 0; - spin_lock_bh(&sk->sk_receive_queue.lock); - if (skb == skb_peek(&sk->sk_receive_queue)) { - __skb_unlink(skb, &sk->sk_receive_queue); - clear = 1; - } - spin_unlock_bh(&sk->sk_receive_queue.lock); - if (clear) - kfree_skb(skb); - } - - skb_free_datagram(sk, skb); + skb_kill_datagram(sk, skb, flags); if (flags & MSG_DONTWAIT) { UDP6_INC_STATS_USER(UDP_MIB_INERRORS); -- cgit v1.1 From 65a45441d7e91ef6107fadbc55f775db4ba23874 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 13 Dec 2005 23:17:02 -0800 Subject: [UDP]: udp_checksum_init return value Since udp_checksum_init always returns 0 there is no point in having it return a value. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/udp.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 012c462..67c0363 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -1081,7 +1081,7 @@ static int udp_v4_mcast_deliver(struct sk_buff *skb, struct udphdr *uh, * Otherwise, csum completion requires chacksumming packet body, * including udp header and folding it to skb->csum. */ -static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, +static void udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, unsigned short ulen, u32 saddr, u32 daddr) { if (uh->check == 0) { @@ -1095,7 +1095,6 @@ static int udp_checksum_init(struct sk_buff *skb, struct udphdr *uh, /* Probably, we should checksum udp header (it should be in cache * in any case) and data in tiny packets (< rx copybreak). */ - return 0; } /* @@ -1128,8 +1127,7 @@ int udp_rcv(struct sk_buff *skb) if (pskb_trim_rcsum(skb, ulen)) goto short_packet; - if (udp_checksum_init(skb, uh, ulen, saddr, daddr) < 0) - goto csum_error; + udp_checksum_init(skb, uh, ulen, saddr, daddr); if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); -- cgit v1.1 From f1f71e03b17db3b9edb0264a8be7719bd5c35582 Mon Sep 17 00:00:00 2001 From: Roberto Nibali Date: Tue, 13 Dec 2005 23:17:20 -0800 Subject: [IPVS]: remove dead code This patch removes dead code. I don't see the reason to keep this cruft around, besides cluttering the nice and functionally working code. Signed-off-by: Roberto Nibali Signed-off-by: Horms Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_app.c | 28 ---------------------------- net/ipv4/ipvs/ip_vs_lblc.c | 27 --------------------------- net/ipv4/ipvs/ip_vs_lblcr.c | 27 --------------------------- net/ipv4/ipvs/ip_vs_proto_tcp.c | 22 ---------------------- 4 files changed, 104 deletions(-) diff --git a/net/ipv4/ipvs/ip_vs_app.c b/net/ipv4/ipvs/ip_vs_app.c index d7eb680..9b176a9 100644 --- a/net/ipv4/ipvs/ip_vs_app.c +++ b/net/ipv4/ipvs/ip_vs_app.c @@ -224,34 +224,6 @@ void unregister_ip_vs_app(struct ip_vs_app *app) } -#if 0000 -/* - * Get reference to app by name (called from user context) - */ -struct ip_vs_app *ip_vs_app_get_by_name(char *appname) -{ - struct ip_vs_app *app, *a = NULL; - - down(&__ip_vs_app_mutex); - - list_for_each_entry(ent, &ip_vs_app_list, a_list) { - if (strcmp(app->name, appname)) - continue; - - /* softirq may call ip_vs_app_get too, so the caller - must disable softirq on the current CPU */ - if (ip_vs_app_get(app)) - a = app; - break; - } - - up(&__ip_vs_app_mutex); - - return a; -} -#endif - - /* * Bind ip_vs_conn to its ip_vs_app (called by cp constructor) */ diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index 561cda3..b6dad7e 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -228,33 +228,6 @@ ip_vs_lblc_hash(struct ip_vs_lblc_table *tbl, struct ip_vs_lblc_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblc_entry from ip_vs_lblc_table. - * returns bool success. - */ -static int ip_vs_lblc_unhash(struct ip_vs_lblc_table *tbl, - struct ip_vs_lblc_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblc_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblc_entry associated with supplied parameters. */ diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index ce456db..8c78ef7 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -414,33 +414,6 @@ ip_vs_lblcr_hash(struct ip_vs_lblcr_table *tbl, struct ip_vs_lblcr_entry *en) } -#if 0000 -/* - * Unhash ip_vs_lblcr_entry from ip_vs_lblcr_table. - * returns bool success. - */ -static int ip_vs_lblcr_unhash(struct ip_vs_lblcr_table *tbl, - struct ip_vs_lblcr_entry *en) -{ - if (list_empty(&en->list)) { - IP_VS_ERR("ip_vs_lblcr_unhash(): request for not hashed entry, " - "called from %p\n", __builtin_return_address(0)); - return 0; - } - - /* - * Remove it from the table - */ - write_lock(&tbl->lock); - list_del(&en->list); - INIT_LIST_HEAD(&en->list); - write_unlock(&tbl->lock); - - return 1; -} -#endif - - /* * Get ip_vs_lblcr_entry associated with supplied parameters. */ diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 0e878fd..638ba8c 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -275,28 +275,6 @@ static int tcp_timeouts[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_LAST] = 2*HZ, }; - -#if 0 - -/* FIXME: This is going to die */ - -static int tcp_timeouts_dos[IP_VS_TCP_S_LAST+1] = { - [IP_VS_TCP_S_NONE] = 2*HZ, - [IP_VS_TCP_S_ESTABLISHED] = 8*60*HZ, - [IP_VS_TCP_S_SYN_SENT] = 60*HZ, - [IP_VS_TCP_S_SYN_RECV] = 10*HZ, - [IP_VS_TCP_S_FIN_WAIT] = 60*HZ, - [IP_VS_TCP_S_TIME_WAIT] = 60*HZ, - [IP_VS_TCP_S_CLOSE] = 10*HZ, - [IP_VS_TCP_S_CLOSE_WAIT] = 60*HZ, - [IP_VS_TCP_S_LAST_ACK] = 30*HZ, - [IP_VS_TCP_S_LISTEN] = 2*60*HZ, - [IP_VS_TCP_S_SYNACK] = 100*HZ, - [IP_VS_TCP_S_LAST] = 2*HZ, -}; - -#endif - static char * tcp_state_name_table[IP_VS_TCP_S_LAST+1] = { [IP_VS_TCP_S_NONE] = "NONE", [IP_VS_TCP_S_ESTABLISHED] = "ESTABLISHED", -- cgit v1.1 From c1cbe4b7ad0bc4b1d98ea708a3fecb7362aa4088 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 13 Dec 2005 23:22:19 -0800 Subject: [NET]: Avoid atomic xchg() for non-error case It also looks like there were 2 places where the test on sk_err was missing from the event wait logic (in sk_stream_wait_connect and sk_stream_wait_memory), while the rest of the sock_error() users look to be doing the right thing. This version of the patch fixes those, and cleans up a few places that were testing ->sk_err directly. Signed-off-by: Benjamin LaHaise Signed-off-by: David S. Miller --- include/net/sock.h | 5 ++++- net/bluetooth/af_bluetooth.c | 5 ++--- net/bluetooth/l2cap.c | 5 +++-- net/bluetooth/sco.c | 5 +++-- net/core/stream.c | 10 +++++++--- net/irda/af_irda.c | 5 +++-- net/llc/af_llc.c | 5 ++--- 7 files changed, 24 insertions(+), 16 deletions(-) diff --git a/include/net/sock.h b/include/net/sock.h index 982b4ec..0fbae85 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -1166,7 +1166,10 @@ static inline int sock_queue_err_skb(struct sock *sk, struct sk_buff *skb) static inline int sock_error(struct sock *sk) { - int err = xchg(&sk->sk_err, 0); + int err; + if (likely(!sk->sk_err)) + return 0; + err = xchg(&sk->sk_err, 0); return -err; } diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index ea616e3..fb031fe 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -287,10 +287,9 @@ int bt_sock_wait_state(struct sock *sk, int state, unsigned long timeo) timeo = schedule_timeout(timeo); lock_sock(sk); - if (sk->sk_err) { - err = sock_error(sk); + err = sock_error(sk); + if (err) break; - } } set_current_state(TASK_RUNNING); remove_wait_queue(sk->sk_sleep, &wait); diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index e3bb11c..95f33cc 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -767,8 +767,9 @@ static int l2cap_sock_sendmsg(struct kiocb *iocb, struct socket *sock, struct ms BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 9cb00dc..6481814 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -637,8 +637,9 @@ static int sco_sock_sendmsg(struct kiocb *iocb, struct socket *sock, BT_DBG("sock %p, sk %p", sock, sk); - if (sk->sk_err) - return sock_error(sk); + err = sock_error(sk); + if (err) + return err; if (msg->msg_flags & MSG_OOB) return -EOPNOTSUPP; diff --git a/net/core/stream.c b/net/core/stream.c index 15bfd03..35e2525 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -55,8 +55,9 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) int done; do { - if (sk->sk_err) - return sock_error(sk); + int err = sock_error(sk); + if (err) + return err; if ((1 << sk->sk_state) & ~(TCPF_SYN_SENT | TCPF_SYN_RECV)) return -EPIPE; if (!*timeo_p) @@ -67,6 +68,7 @@ int sk_stream_wait_connect(struct sock *sk, long *timeo_p) prepare_to_wait(sk->sk_sleep, &wait, TASK_INTERRUPTIBLE); sk->sk_write_pending++; done = sk_wait_event(sk, timeo_p, + !sk->sk_err && !((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT))); finish_wait(sk->sk_sleep, &wait); @@ -137,7 +139,9 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) set_bit(SOCK_NOSPACE, &sk->sk_socket->flags); sk->sk_write_pending++; - sk_wait_event(sk, ¤t_timeo, sk_stream_memory_free(sk) && + sk_wait_event(sk, ¤t_timeo, !sk->sk_err && + !(sk->sk_shutdown & SEND_SHUTDOWN) && + sk_stream_memory_free(sk) && vm_wait); sk->sk_write_pending--; diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 6f92f9c..f121f7d 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1438,8 +1438,9 @@ static int irda_recvmsg_stream(struct kiocb *iocb, struct socket *sock, /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) - ret = sock_error(sk); + ret = sock_error(sk); + if (ret) + break; else if (sk->sk_shutdown & RCV_SHUTDOWN) ; else if (noblock) diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index c3f0b07..b6d3df5 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -566,10 +566,9 @@ static int llc_wait_data(struct sock *sk, long timeo) /* * POSIX 1003.1g mandates this order. */ - if (sk->sk_err) { - rc = sock_error(sk); + rc = sock_error(sk); + if (rc) break; - } rc = 0; if (sk->sk_shutdown & RCV_SHUTDOWN) break; -- cgit v1.1 From 830a1e5c212fb3fdc83b66359c780c3b3a294897 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 13 Dec 2005 23:22:32 -0800 Subject: [AF_UNIX]: Remove superfluous reference counting in unix_stream_sendmsg AF_UNIX stream socket performance on P4 CPUs tends to suffer due to a lot of pipeline flushes from atomic operations. The patch below removes the sock_hold() and sock_put() in unix_stream_sendmsg(). This should be safe as the socket still holds a reference to its peer which is only released after the file descriptor's final user invokes unix_release_sock(). The only consideration is that we must add a memory barrier before setting the peer initially. Signed-off-by: Benjamin LaHaise Signed-off-by: David S. Miller --- net/unix/af_unix.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index acc73ba..1dc3685 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1063,10 +1063,12 @@ restart: /* Set credentials */ sk->sk_peercred = other->sk_peercred; - sock_hold(newsk); - unix_peer(sk) = newsk; sock->state = SS_CONNECTED; sk->sk_state = TCP_ESTABLISHED; + sock_hold(newsk); + + smp_mb__after_atomic_inc(); /* sock_hold() does an atomic_inc() */ + unix_peer(sk) = newsk; unix_state_wunlock(sk); @@ -1414,7 +1416,7 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, } else { sunaddr = NULL; err = -ENOTCONN; - other = unix_peer_get(sk); + other = unix_peer(sk); if (!other) goto out_err; } @@ -1476,7 +1478,6 @@ static int unix_stream_sendmsg(struct kiocb *kiocb, struct socket *sock, other->sk_data_ready(other, size); sent+=size; } - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; @@ -1491,8 +1492,6 @@ pipe_err: send_sig(SIGPIPE,current,0); err = -EPIPE; out_err: - if (other) - sock_put(other); scm_destroy(siocb->scm); siocb->scm = NULL; return sent ? : err; -- cgit v1.1 From b9750ce13c08aa8a71a9b138d741f3046aefd991 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:22:54 -0800 Subject: [IPV6]: Generalise some functions Using sk->sk_protocol instead of IPPROTO_TCP. Will be used by DCCPv6 in the next changesets. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 2 + include/net/inet6_connection_sock.h | 13 ++- net/ipv6/af_inet6.c | 52 ++++++++++++ net/ipv6/inet6_connection_sock.c | 103 ++++++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 153 +----------------------------------- 5 files changed, 173 insertions(+), 150 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7d3e86d..69a0dec 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -297,6 +297,8 @@ struct tcp6_sock { struct ipv6_pinfo inet6; }; +extern int inet6_sk_rebuild_header(struct sock *sk); + #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) static inline struct ipv6_pinfo * inet6_sk(const struct sock *__sk) { diff --git a/include/net/inet6_connection_sock.h b/include/net/inet6_connection_sock.h index aa30ebd..b33b438 100644 --- a/include/net/inet6_connection_sock.h +++ b/include/net/inet6_connection_sock.h @@ -15,8 +15,15 @@ #include -struct sock; +struct in6_addr; +struct inet_bind_bucket; struct request_sock; +struct sk_buff; +struct sock; +struct sockaddr; + +extern int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb); extern struct request_sock *inet6_csk_search_req(const struct sock *sk, struct request_sock ***prevp, @@ -28,4 +35,8 @@ extern struct request_sock *inet6_csk_search_req(const struct sock *sk, extern void inet6_csk_reqsk_queue_hash_add(struct sock *sk, struct request_sock *req, const unsigned long timeout); + +extern void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr); + +extern int inet6_csk_xmit(struct sk_buff *skb, int ipfragok); #endif /* _INET6_CONNECTION_SOCK_H */ diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index d954638..fd040e9 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -609,6 +609,58 @@ inet6_unregister_protosw(struct inet_protosw *p) } } +int inet6_sk_rebuild_header(struct sock *sk) +{ + int err; + struct dst_entry *dst; + struct ipv6_pinfo *np = inet6_sk(sk); + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) { + sk->sk_route_caps = 0; + return err; + } + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + return 0; +} + +EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c index fe874ee..792f90f 100644 --- a/net/ipv6/inet6_connection_sock.c +++ b/net/ipv6/inet6_connection_sock.c @@ -21,8 +21,34 @@ #include #include +#include +#include +#include #include +int inet6_csk_bind_conflict(const struct sock *sk, + const struct inet_bind_bucket *tb) +{ + const struct sock *sk2; + const struct hlist_node *node; + + /* We must walk the whole port owner list in this case. -DaveM */ + sk_for_each_bound(sk2, node, &tb->owners) { + if (sk != sk2 && + (!sk->sk_bound_dev_if || + !sk2->sk_bound_dev_if || + sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && + (!sk->sk_reuse || !sk2->sk_reuse || + sk2->sk_state == TCP_LISTEN) && + ipv6_rcv_saddr_equal(sk, sk2)) + break; + } + + return node != NULL; +} + +EXPORT_SYMBOL_GPL(inet6_csk_bind_conflict); + /* * request_sock (formerly open request) hash tables. */ @@ -94,3 +120,80 @@ void inet6_csk_reqsk_queue_hash_add(struct sock *sk, } EXPORT_SYMBOL_GPL(inet6_csk_reqsk_queue_hash_add); + +void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; + + sin6->sin6_family = AF_INET6; + ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); + sin6->sin6_port = inet_sk(sk)->dport; + /* We do not store received flowlabel for TCP */ + sin6->sin6_flowinfo = 0; + sin6->sin6_scope_id = 0; + if (sk->sk_bound_dev_if && + ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) + sin6->sin6_scope_id = sk->sk_bound_dev_if; +} + +EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr); + +int inet6_csk_xmit(struct sk_buff *skb, int ipfragok) +{ + struct sock *sk = skb->sk; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct flowi fl; + struct dst_entry *dst; + struct in6_addr *final_p = NULL, final; + + memset(&fl, 0, sizeof(fl)); + fl.proto = sk->sk_protocol; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.fl6_flowlabel = np->flow_label; + IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_sport = inet->sport; + fl.fl_ip_dport = inet->dport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + int err = ip6_dst_lookup(sk, &dst, &fl); + + if (err) { + sk->sk_err_soft = -err; + return err; + } + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_route_caps = 0; + return err; + } + + ip6_dst_store(sk, dst, NULL); + sk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + } + + skb->dst = dst_clone(dst); + + /* Restore final destination back after routing done */ + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + + return ip6_xmit(sk, skb, &fl, np->opt, 0); +} + +EXPORT_SYMBOL_GPL(inet6_csk_xmit); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 8ce8a13..2f932ce 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -72,32 +72,10 @@ static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok); static struct inet_connection_sock_af_ops ipv6_mapped; static struct inet_connection_sock_af_ops ipv6_specific; -int inet6_csk_bind_conflict(const struct sock *sk, - const struct inet_bind_bucket *tb) -{ - const struct sock *sk2; - const struct hlist_node *node; - - /* We must walk the whole port owner list in this case. -DaveM */ - sk_for_each_bound(sk2, node, &tb->owners) { - if (sk != sk2 && - (!sk->sk_bound_dev_if || - !sk2->sk_bound_dev_if || - sk->sk_bound_dev_if == sk2->sk_bound_dev_if) && - (!sk->sk_reuse || !sk2->sk_reuse || - sk2->sk_state == TCP_LISTEN) && - ipv6_rcv_saddr_equal(sk, sk2)) - break; - } - - return node != NULL; -} - static int tcp_v6_get_port(struct sock *sk, unsigned short snum) { return inet_csk_get_port(&tcp_hashinfo, sk, snum, @@ -1500,129 +1478,6 @@ do_time_wait: goto discard_it; } -static int tcp_v6_rebuild_header(struct sock *sk) -{ - int err; - struct dst_entry *dst; - struct ipv6_pinfo *np = inet6_sk(sk); - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - struct inet_sock *inet = inet_sk(sk); - struct in6_addr *final_p = NULL, final; - struct flowi fl; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_dport = inet->dport; - fl.fl_ip_sport = inet->sport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - err = ip6_dst_lookup(sk, &dst, &fl); - if (err) { - sk->sk_route_caps = 0; - return err; - } - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_err_soft = -err; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - return 0; -} - -static int tcp_v6_xmit(struct sk_buff *skb, int ipfragok) -{ - struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(sk); - struct ipv6_pinfo *np = inet6_sk(sk); - struct flowi fl; - struct dst_entry *dst; - struct in6_addr *final_p = NULL, final; - - memset(&fl, 0, sizeof(fl)); - fl.proto = IPPROTO_TCP; - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - ipv6_addr_copy(&fl.fl6_src, &np->saddr); - fl.fl6_flowlabel = np->flow_label; - IP6_ECN_flow_xmit(sk, fl.fl6_flowlabel); - fl.oif = sk->sk_bound_dev_if; - fl.fl_ip_sport = inet->sport; - fl.fl_ip_dport = inet->dport; - - if (np->opt && np->opt->srcrt) { - struct rt0_hdr *rt0 = (struct rt0_hdr *) np->opt->srcrt; - ipv6_addr_copy(&final, &fl.fl6_dst); - ipv6_addr_copy(&fl.fl6_dst, rt0->addr); - final_p = &final; - } - - dst = __sk_dst_check(sk, np->dst_cookie); - - if (dst == NULL) { - int err = ip6_dst_lookup(sk, &dst, &fl); - - if (err) { - sk->sk_err_soft = -err; - return err; - } - - if (final_p) - ipv6_addr_copy(&fl.fl6_dst, final_p); - - if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { - sk->sk_route_caps = 0; - return err; - } - - ip6_dst_store(sk, dst, NULL); - sk->sk_route_caps = dst->dev->features & - ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - } - - skb->dst = dst_clone(dst); - - /* Restore final destination back after routing done */ - ipv6_addr_copy(&fl.fl6_dst, &np->daddr); - - return ip6_xmit(sk, skb, &fl, np->opt, 0); -} - -static void v6_addr2sockaddr(struct sock *sk, struct sockaddr * uaddr) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) uaddr; - - sin6->sin6_family = AF_INET6; - ipv6_addr_copy(&sin6->sin6_addr, &np->daddr); - sin6->sin6_port = inet_sk(sk)->dport; - /* We do not store received flowlabel for TCP */ - sin6->sin6_flowinfo = 0; - sin6->sin6_scope_id = 0; - if (sk->sk_bound_dev_if && - ipv6_addr_type(&sin6->sin6_addr) & IPV6_ADDR_LINKLOCAL) - sin6->sin6_scope_id = sk->sk_bound_dev_if; -} - static int tcp_v6_remember_stamp(struct sock *sk) { /* Alas, not yet... */ @@ -1630,9 +1485,9 @@ static int tcp_v6_remember_stamp(struct sock *sk) } static struct inet_connection_sock_af_ops ipv6_specific = { - .queue_xmit = tcp_v6_xmit, + .queue_xmit = inet6_csk_xmit, .send_check = tcp_v6_send_check, - .rebuild_header = tcp_v6_rebuild_header, + .rebuild_header = inet6_sk_rebuild_header, .conn_request = tcp_v6_conn_request, .syn_recv_sock = tcp_v6_syn_recv_sock, .remember_stamp = tcp_v6_remember_stamp, @@ -1640,7 +1495,7 @@ static struct inet_connection_sock_af_ops ipv6_specific = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; @@ -1659,7 +1514,7 @@ static struct inet_connection_sock_af_ops ipv6_mapped = { .setsockopt = ipv6_setsockopt, .getsockopt = ipv6_getsockopt, - .addr2sockaddr = v6_addr2sockaddr, + .addr2sockaddr = inet6_csk_addr2sockaddr, .sockaddr_len = sizeof(struct sockaddr_in6) }; -- cgit v1.1 From 0fa1a53e1f055a6c790f40e7728f42a825b29248 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:23:09 -0800 Subject: [IPV6]: Introduce inet6_timewait_sock Out of tcp6_timewait_sock, that now is just an aggregation of inet_timewait_sock and inet6_timewait_sock, using tw_ipv6_offset in struct inet_timewait_sock, that is common to the IPv6 transport protocols that use timewait sockets, like DCCP and TCP. tw_ipv6_offset plays the struct inet_sock pinfo6 role, i.e. for the generic code to find the IPv6 area in a timewait sock. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 32 +++++++++++++++++++++----------- include/net/inet6_hashtables.h | 6 +++--- include/net/inet_timewait_sock.h | 3 ++- net/ipv4/inet_diag.c | 6 +++--- net/ipv4/tcp_minisocks.c | 8 +++++--- net/ipv6/addrconf.c | 2 +- net/ipv6/tcp_ipv6.c | 12 ++++++------ 7 files changed, 41 insertions(+), 28 deletions(-) diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 69a0dec..7d39085 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -348,26 +348,36 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #include +struct inet6_timewait_sock { + struct in6_addr tw_v6_daddr; + struct in6_addr tw_v6_rcv_saddr; +}; + struct tcp6_timewait_sock { - struct tcp_timewait_sock tw_v6_sk; - struct in6_addr tw_v6_daddr; - struct in6_addr tw_v6_rcv_saddr; + struct tcp_timewait_sock tcp6tw_tcp; + struct inet6_timewait_sock tcp6tw_inet6; }; -static inline struct tcp6_timewait_sock *tcp6_twsk(const struct sock *sk) +static inline u16 inet6_tw_offset(const struct proto *prot) +{ + return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock); +} + +static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) { - return (struct tcp6_timewait_sock *)sk; + return (struct inet6_timewait_sock *)(((u8 *)sk) + + inet_twsk(sk)->tw_ipv6_offset); } -static inline struct in6_addr *__tcp_v6_rcv_saddr(const struct sock *sk) +static inline struct in6_addr *__inet6_rcv_saddr(const struct sock *sk) { return likely(sk->sk_state != TCP_TIME_WAIT) ? - &inet6_sk(sk)->rcv_saddr : &tcp6_twsk(sk)->tw_v6_rcv_saddr; + &inet6_sk(sk)->rcv_saddr : &inet6_twsk(sk)->tw_v6_rcv_saddr; } -static inline struct in6_addr *tcp_v6_rcv_saddr(const struct sock *sk) +static inline struct in6_addr *inet6_rcv_saddr(const struct sock *sk) { - return sk->sk_family == AF_INET6 ? __tcp_v6_rcv_saddr(sk) : NULL; + return sk->sk_family == AF_INET6 ? __inet6_rcv_saddr(sk) : NULL; } static inline int inet_v6_ipv6only(const struct sock *sk) @@ -395,8 +405,8 @@ static inline struct raw6_sock *raw6_sk(const struct sock *sk) return NULL; } -#define __tcp_v6_rcv_saddr(__sk) NULL -#define tcp_v6_rcv_saddr(__sk) NULL +#define __inet6_rcv_saddr(__sk) NULL +#define inet6_rcv_saddr(__sk) NULL #define tcp_twsk_ipv6only(__sk) 0 #define inet_v6_ipv6only(__sk) 0 #endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */ diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h index a4a204f..25f708f 100644 --- a/include/net/inet6_hashtables.h +++ b/include/net/inet6_hashtables.h @@ -110,10 +110,10 @@ static inline struct sock * if(*((__u32 *)&(tw->tw_dport)) == ports && sk->sk_family == PF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); - if (ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + if (ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && (!sk->sk_bound_dev_if || sk->sk_bound_dev_if == dif)) goto hit; } diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 28f7b21..ca240f8 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -127,7 +127,8 @@ struct inet_timewait_sock { __u16 tw_num; /* And these are ours. */ __u8 tw_ipv6only:1; - /* 31 bits hole, try to pack */ + /* 15 bits hole, try to pack */ + __u16 tw_ipv6_offset; int tw_timeout; unsigned long tw_ttd; struct inet_bind_bucket *tw_tb; diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c index 3ce73b1..c499081 100644 --- a/net/ipv4/inet_diag.c +++ b/net/ipv4/inet_diag.c @@ -112,12 +112,12 @@ static int inet_diag_fill(struct sk_buff *skb, struct sock *sk, r->idiag_inode = 0; #if defined(CONFIG_IPV6) || defined (CONFIG_IPV6_MODULE) if (r->idiag_family == AF_INET6) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk); ipv6_addr_copy((struct in6_addr *)r->id.idiag_src, - &tcp6tw->tw_v6_rcv_saddr); + &tw6->tw_v6_rcv_saddr); ipv6_addr_copy((struct in6_addr *)r->id.idiag_dst, - &tcp6tw->tw_v6_daddr); + &tw6->tw_v6_daddr); } #endif nlh->nlmsg_len = skb->tail - b; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 9c02968..2b9b7f6 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -298,10 +298,12 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { struct ipv6_pinfo *np = inet6_sk(sk); - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6; - ipv6_addr_copy(&tcp6tw->tw_v6_daddr, &np->daddr); - ipv6_addr_copy(&tcp6tw->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); tw->tw_ipv6only = np->ipv6only; } #endif diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index a60585f..704fb73 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -1195,7 +1195,7 @@ struct inet6_ifaddr * ipv6_get_ifaddr(struct in6_addr *addr, struct net_device * int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2) { const struct in6_addr *sk_rcv_saddr6 = &inet6_sk(sk)->rcv_saddr; - const struct in6_addr *sk2_rcv_saddr6 = tcp_v6_rcv_saddr(sk2); + const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2); u32 sk_rcv_saddr = inet_sk(sk)->rcv_saddr; u32 sk2_rcv_saddr = inet_rcv_saddr(sk2); int sk_ipv6only = ipv6_only_sock(sk); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 2f932ce..cb88007 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -138,14 +138,14 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, /* Check TIME-WAIT sockets first. */ sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct tcp6_timewait_sock *tcp6tw = tcp6_twsk(sk2); + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); tw = inet_twsk(sk2); if(*((__u32 *)&(tw->tw_dport)) == ports && sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tcp6tw->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tcp6tw->tw_v6_rcv_saddr, daddr) && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); struct tcp_sock *tp = tcp_sk(sk); @@ -1663,14 +1663,14 @@ static void get_timewait6_sock(struct seq_file *seq, { struct in6_addr *dest, *src; __u16 destp, srcp; - struct tcp6_timewait_sock *tcp6tw = tcp6_twsk((struct sock *)tw); + struct inet6_timewait_sock *tw6 = inet6_twsk((struct sock *)tw); int ttd = tw->tw_ttd - jiffies; if (ttd < 0) ttd = 0; - dest = &tcp6tw->tw_v6_daddr; - src = &tcp6tw->tw_v6_rcv_saddr; + dest = &tw6->tw_v6_daddr; + src = &tw6->tw_v6_rcv_saddr; destp = ntohs(tw->tw_dport); srcp = ntohs(tw->tw_sport); -- cgit v1.1 From 3cf3dc6c2e05e67b12e522f547c0b71d509a516c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:23:20 -0800 Subject: [IPV6]: Export some symbols for DCCPv6 Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/ipv6/af_inet6.c | 2 ++ net/ipv6/exthdrs.c | 4 ++++ net/ipv6/ip6_flowlabel.c | 2 ++ net/ipv6/ip6_output.c | 2 ++ 4 files changed, 10 insertions(+) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index fd040e9..23675ef 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -389,6 +389,8 @@ int inet6_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(inet6_destroy_sock); + /* * This does both peername and sockname. */ diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c index be6faf3..113374d 100644 --- a/net/ipv6/exthdrs.c +++ b/net/ipv6/exthdrs.c @@ -413,6 +413,8 @@ ipv6_invert_rthdr(struct sock *sk, struct ipv6_rt_hdr *hdr) return opt; } +EXPORT_SYMBOL_GPL(ipv6_invert_rthdr); + /********************************** Hop-by-hop options. **********************************/ @@ -579,6 +581,8 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt) return opt2; } +EXPORT_SYMBOL_GPL(ipv6_dup_options); + static int ipv6_renew_option(void *ohdr, struct ipv6_opt_hdr __user *newopt, int newoptlen, int inherit, diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index 1cf0276..89d12b4 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -200,6 +200,8 @@ struct ip6_flowlabel * fl6_sock_lookup(struct sock *sk, u32 label) return NULL; } +EXPORT_SYMBOL_GPL(fl6_sock_lookup); + void fl6_free_socklist(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 8523c76..b4c4beb 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -775,6 +775,8 @@ out_err_release: return err; } +EXPORT_SYMBOL_GPL(ip6_dst_lookup); + static inline int ip6_ufo_append_data(struct sock *sk, int getfrag(void *from, char *to, int offset, int len, int odd, struct sk_buff *skb), -- cgit v1.1 From 34ca6860810342441f801226b19ae6c9e0ecb34f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:23:32 -0800 Subject: [DCCP]: Just rename dccp_v4_prot to dccp_prot To match TCP equivalent. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/dccp.h | 2 +- net/dccp/ipv4.c | 2 +- net/dccp/proto.c | 8 ++++---- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index f97b85d..e711f85 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -59,7 +59,7 @@ extern void dccp_time_wait(struct sock *sk, int state, int timeo); #define DCCP_RTO_MAX ((unsigned)(120 * HZ)) /* FIXME: using TCP value */ -extern struct proto dccp_v4_prot; +extern struct proto dccp_prot; /* is seq1 < seq2 ? */ static inline int before48(const u64 seq1, const u64 seq2) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 0ce7d0f..9f69a67 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -1317,7 +1317,7 @@ static struct request_sock_ops dccp_request_sock_ops = { .send_reset = dccp_v4_ctl_send_reset, }; -struct proto dccp_v4_prot = { +struct proto dccp_prot = { .name = "DCCP", .owner = THIS_MODULE, .close = dccp_close, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7b30c12..9cb2989 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -684,7 +684,7 @@ extern struct net_proto_family inet_family_ops; static struct inet_protosw dccp_v4_protosw = { .type = SOCK_DCCP, .protocol = IPPROTO_DCCP, - .prot = &dccp_v4_prot, + .prot = &dccp_prot, .ops = &inet_dccp_ops, .capability = -1, .no_check = 0, @@ -769,7 +769,7 @@ static int __init dccp_init(void) { unsigned long goal; int ehash_order, bhash_order, i; - int rc = proto_register(&dccp_v4_prot, 1); + int rc = proto_register(&dccp_prot, 1); if (rc) goto out; @@ -872,7 +872,7 @@ out_free_bind_bucket_cachep: kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); dccp_hashinfo.bind_bucket_cachep = NULL; out_proto_unregister: - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); goto out; } @@ -895,7 +895,7 @@ static void __exit dccp_fini(void) get_order(dccp_hashinfo.ehash_size * sizeof(struct inet_ehash_bucket))); kmem_cache_destroy(dccp_hashinfo.bind_bucket_cachep); - proto_unregister(&dccp_v4_prot); + proto_unregister(&dccp_prot); } module_init(dccp_init); -- cgit v1.1 From f21e68caa0ddffddf98a1e729e734a470957b6ec Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:24:16 -0800 Subject: [DCCP]: Prepare the AF agnostic core for the introduction of DCCPv6 Basically exports a similar set of functions as the one exported by the non-AF specific TCP code. In the process moved some non-AF specific code from dccp_v4_connect to dccp_connect_init and moved the checksum verification from dccp_invalid_packet to dccp_v4_rcv, so as to use it in dccp_v6_rcv too. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/dccp.h | 22 ++++++++++++++++ net/dccp/input.c | 4 +++ net/dccp/ipv4.c | 73 ++++++++++++++++++++++++---------------------------- net/dccp/minisocks.c | 8 ++++++ net/dccp/output.c | 27 ++++++++++++------- net/dccp/proto.c | 32 ++++++++++++++++++++--- 6 files changed, 114 insertions(+), 52 deletions(-) diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index e711f85..93f26dd 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -228,6 +228,9 @@ extern int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, extern int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned len); +extern int dccp_v4_init_sock(struct sock *sk); +extern int dccp_v4_destroy_sock(struct sock *sk); + extern void dccp_close(struct sock *sk, long timeout); extern struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, @@ -238,6 +241,7 @@ extern struct sk_buff *dccp_make_reset(struct sock *sk, extern int dccp_connect(struct sock *sk); extern int dccp_disconnect(struct sock *sk, int flags); +extern void dccp_unhash(struct sock *sk); extern int dccp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen); extern int dccp_setsockopt(struct sock *sk, int level, int optname, @@ -249,6 +253,13 @@ extern int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len); extern void dccp_shutdown(struct sock *sk, int how); +extern int inet_dccp_listen(struct socket *sock, int backlog); +extern unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait); +extern void dccp_v4_send_check(struct sock *sk, int len, + struct sk_buff *skb); +extern int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len); extern int dccp_v4_checksum(const struct sk_buff *skb, const u32 saddr, const u32 daddr); @@ -256,6 +267,17 @@ extern int dccp_v4_checksum(const struct sk_buff *skb, extern int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code); extern void dccp_send_close(struct sock *sk, const int active); +extern int dccp_invalid_packet(struct sk_buff *skb); + +static inline int dccp_bad_service_code(const struct sock *sk, + const __u32 service) +{ + const struct dccp_sock *dp = dccp_sk(sk); + + if (dp->dccps_service == service) + return 0; + return !dccp_list_has_service(dp->dccps_service_list, service); +} struct dccp_skb_cb { __u8 dccpd_type:4; diff --git a/net/dccp/input.c b/net/dccp/input.c index c81488f..9a724ff 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -250,6 +250,8 @@ discard: return 0; } +EXPORT_SYMBOL_GPL(dccp_rcv_established); + static int dccp_rcv_request_sent_state_process(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, @@ -567,3 +569,5 @@ discard: } return 0; } + +EXPORT_SYMBOL_GPL(dccp_rcv_state_process); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 9f69a67..3108c9d 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -46,11 +46,13 @@ static void dccp_v4_hash(struct sock *sk) inet_hash(&dccp_hashinfo, sk); } -static void dccp_v4_unhash(struct sock *sk) +void dccp_unhash(struct sock *sk) { inet_unhash(&dccp_hashinfo, sk); } +EXPORT_SYMBOL_GPL(dccp_unhash); + /* called with local bh disabled */ static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, struct inet_timewait_sock **twp) @@ -209,8 +211,7 @@ out: } } -static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, - int addr_len) +int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); struct dccp_sock *dp = dccp_sk(sk); @@ -288,16 +289,6 @@ static int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, usin->sin_port); dccp_update_gss(sk, dp->dccps_iss); - /* - * SWL and AWL are initially adjusted so that they are not less than - * the initial Sequence Numbers received and sent, respectively: - * SWL := max(GSR + 1 - floor(W/4), ISR), - * AWL := max(GSS - W' + 1, ISS). - * These adjustments MUST be applied only at the beginning of the - * connection. - */ - dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); - inet->id = dp->dccps_iss ^ jiffies; err = dccp_connect(sk); @@ -317,6 +308,8 @@ failure: goto out; } +EXPORT_SYMBOL_GPL(dccp_v4_connect); + /* * This routine does path mtu discovery as defined in RFC1191. */ @@ -608,7 +601,7 @@ out: } /* This routine computes an IPv4 DCCP checksum. */ -static void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) +void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) { const struct inet_sock *inet = inet_sk(sk); struct dccp_hdr *dh = dccp_hdr(skb); @@ -616,6 +609,8 @@ static void dccp_v4_send_check(struct sock *sk, int len, struct sk_buff *skb) dh->dccph_checksum = dccp_v4_checksum(skb, inet->saddr, inet->daddr); } +EXPORT_SYMBOL_GPL(dccp_v4_send_check); + int dccp_v4_send_reset(struct sock *sk, enum dccp_reset_codes code) { struct sk_buff *skb; @@ -651,16 +646,6 @@ static inline u64 dccp_v4_init_sequence(const struct sock *sk, dccp_hdr(skb)->dccph_sport); } -static inline int dccp_bad_service_code(const struct sock *sk, - const __u32 service) -{ - const struct dccp_sock *dp = dccp_sk(sk); - - if (dp->dccps_service == service) - return 0; - return !dccp_list_has_service(dp->dccps_service_list, service); -} - int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -672,7 +657,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) const __u32 service = dccp_hdr_request(skb)->dccph_req_service; struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; - struct dst_entry *dst = NULL; /* Never answer to DCCP_PKT_REQUESTs send to broadcast or multicast */ if (((struct rtable *)skb->dst)->rt_flags & @@ -713,7 +697,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) ireq = inet_rsk(req); ireq->loc_addr = daddr; ireq->rmt_addr = saddr; - /* FIXME: Merge Aristeu's option parsing code when ready */ req->rcv_wnd = 100; /* Fake, option parsing will get the right value */ ireq->opt = NULL; @@ -731,7 +714,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) dreq->dreq_iss = dccp_v4_init_sequence(sk, skb); dreq->dreq_service = service; - if (dccp_v4_send_response(sk, req, dst)) + if (dccp_v4_send_response(sk, req, NULL)) goto drop_and_free; inet_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); @@ -748,6 +731,8 @@ drop: return -1; } +EXPORT_SYMBOL_GPL(dccp_v4_conn_request); + /* * The three way handshake has completed - we got a valid ACK or DATAACK - * now create the new socket. @@ -802,6 +787,8 @@ exit: return NULL; } +EXPORT_SYMBOL_GPL(dccp_v4_request_recv_sock); + static struct sock *dccp_v4_hnd_req(struct sock *sk, struct sk_buff *skb) { const struct dccp_hdr *dh = dccp_hdr(skb); @@ -1021,7 +1008,9 @@ discard: return 0; } -static inline int dccp_invalid_packet(struct sk_buff *skb) +EXPORT_SYMBOL_GPL(dccp_v4_do_rcv); + +int dccp_invalid_packet(struct sk_buff *skb) { const struct dccp_hdr *dh; @@ -1075,17 +1064,11 @@ static inline int dccp_invalid_packet(struct sk_buff *skb) return 1; } - /* If the header checksum is incorrect, drop packet and return */ - if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, - skb->nh.iph->daddr) < 0) { - LIMIT_NETDEBUG(KERN_WARNING "DCCP: header checksum is " - "incorrect\n"); - return 1; - } - return 0; } +EXPORT_SYMBOL_GPL(dccp_invalid_packet); + /* this is called when real data arrives */ int dccp_v4_rcv(struct sk_buff *skb) { @@ -1098,6 +1081,14 @@ int dccp_v4_rcv(struct sk_buff *skb) if (dccp_invalid_packet(skb)) goto discard_it; + /* If the header checksum is incorrect, drop packet and return */ + if (dccp_v4_verify_checksum(skb, skb->nh.iph->saddr, + skb->nh.iph->daddr) < 0) { + LIMIT_NETDEBUG(KERN_WARNING "%s: incorrect header checksum\n", + __FUNCTION__); + goto discard_it; + } + dh = dccp_hdr(skb); DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); @@ -1217,7 +1208,7 @@ struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { .sockaddr_len = sizeof(struct sockaddr_in), }; -static int dccp_v4_init_sock(struct sock *sk) +int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); static int dccp_ctl_socket_init = 1; @@ -1270,7 +1261,9 @@ static int dccp_v4_init_sock(struct sock *sk) return 0; } -static int dccp_v4_destroy_sock(struct sock *sk) +EXPORT_SYMBOL_GPL(dccp_v4_init_sock); + +int dccp_v4_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); @@ -1303,6 +1296,8 @@ static int dccp_v4_destroy_sock(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_v4_destroy_sock); + static void dccp_v4_reqsk_destructor(struct request_sock *req) { kfree(inet_rsk(req)->opt); @@ -1331,7 +1326,7 @@ struct proto dccp_prot = { .recvmsg = dccp_recvmsg, .backlog_rcv = dccp_v4_do_rcv, .hash = dccp_v4_hash, - .unhash = dccp_v4_unhash, + .unhash = dccp_unhash, .accept = inet_csk_accept, .get_port = dccp_v4_get_port, .shutdown = dccp_shutdown, diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index c7ff80c..5c767b5 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -40,6 +40,8 @@ struct inet_timewait_death_row dccp_death_row = { (unsigned long)&dccp_death_row), }; +EXPORT_SYMBOL_GPL(dccp_death_row); + void dccp_time_wait(struct sock *sk, int state, int timeo) { struct inet_timewait_sock *tw = NULL; @@ -170,6 +172,8 @@ out_free: return newsk; } +EXPORT_SYMBOL_GPL(dccp_create_openreq_child); + /* * Process an incoming packet for RESPOND sockets represented * as an request_sock. @@ -236,6 +240,8 @@ drop: goto out; } +EXPORT_SYMBOL_GPL(dccp_check_req); + /* * Queue segment on the new socket if the new socket is active, * otherwise we just shortcircuit this and continue with @@ -266,3 +272,5 @@ int dccp_child_process(struct sock *parent, struct sock *child, sock_put(child); return ret; } + +EXPORT_SYMBOL_GPL(dccp_child_process); diff --git a/net/dccp/output.c b/net/dccp/output.c index f358805..c40f7f8 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -135,12 +135,6 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { struct dccp_sock *dp = dccp_sk(sk); - /* - * FIXME: we really should be using the af_specific thing to support - * IPv6. - * mss_now = pmtu - tp->af_specific->net_header_len - - * sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext); - */ int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); @@ -164,6 +158,8 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) return mss_now; } +EXPORT_SYMBOL_GPL(dccp_sync_mss); + void dccp_write_space(struct sock *sk) { read_lock(&sk->sk_callback_lock); @@ -319,6 +315,8 @@ struct sk_buff *dccp_make_response(struct sock *sk, struct dst_entry *dst, return skb; } +EXPORT_SYMBOL_GPL(dccp_make_response); + struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, const enum dccp_reset_codes code) @@ -375,6 +373,7 @@ struct sk_buff *dccp_make_reset(struct sock *sk, struct dst_entry *dst, */ static inline void dccp_connect_init(struct sock *sk) { + struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = __sk_dst_get(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -383,10 +382,16 @@ static inline void dccp_connect_init(struct sock *sk) dccp_sync_mss(sk, dst_mtu(dst)); - /* - * FIXME: set dp->{dccps_swh,dccps_swl}, with - * something like dccp_inc_seq - */ + dccp_update_gss(sk, dp->dccps_iss); + /* + * SWL and AWL are initially adjusted so that they are not less than + * the initial Sequence Numbers received and sent, respectively: + * SWL := max(GSR + 1 - floor(W/4), ISR), + * AWL := max(GSS - W' + 1, ISS). + * These adjustments MUST be applied only at the beginning of the + * connection. + */ + dccp_set_seqno(&dp->dccps_awl, max48(dp->dccps_awl, dp->dccps_iss)); icsk->icsk_retransmits = 0; } @@ -418,6 +423,8 @@ int dccp_connect(struct sock *sk) return 0; } +EXPORT_SYMBOL_GPL(dccp_connect); + void dccp_send_ack(struct sock *sk) { /* If we have been reset, we may not send again. */ diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 9cb2989..51dfacd2 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -41,8 +41,12 @@ DEFINE_SNMP_STAT(struct dccp_mib, dccp_statistics) __read_mostly; +EXPORT_SYMBOL_GPL(dccp_statistics); + atomic_t dccp_orphan_count = ATOMIC_INIT(0); +EXPORT_SYMBOL_GPL(dccp_orphan_count); + static struct net_protocol dccp_protocol = { .handler = dccp_v4_rcv, .err_handler = dccp_v4_err, @@ -149,6 +153,8 @@ int dccp_disconnect(struct sock *sk, int flags) return err; } +EXPORT_SYMBOL_GPL(dccp_disconnect); + /* * Wait for a DCCP event. * @@ -156,8 +162,8 @@ int dccp_disconnect(struct sock *sk, int flags) * take care of normal races (between the test and the event) and we don't * go look at any of the socket buffers directly. */ -static unsigned int dccp_poll(struct file *file, struct socket *sock, - poll_table *wait) +unsigned int dccp_poll(struct file *file, struct socket *sock, + poll_table *wait) { unsigned int mask; struct sock *sk = sock->sk; @@ -205,12 +211,16 @@ static unsigned int dccp_poll(struct file *file, struct socket *sock, return mask; } +EXPORT_SYMBOL_GPL(dccp_poll); + int dccp_ioctl(struct sock *sk, int cmd, unsigned long arg) { dccp_pr_debug("entry\n"); return -ENOIOCTLCMD; } +EXPORT_SYMBOL_GPL(dccp_ioctl); + static int dccp_setsockopt_service(struct sock *sk, const u32 service, char __user *optval, int optlen) { @@ -284,6 +294,8 @@ int dccp_setsockopt(struct sock *sk, int level, int optname, return err; } +EXPORT_SYMBOL_GPL(dccp_setsockopt); + static int dccp_getsockopt_service(struct sock *sk, int len, u32 __user *optval, int __user *optlen) @@ -357,6 +369,8 @@ int dccp_getsockopt(struct sock *sk, int level, int optname, return 0; } +EXPORT_SYMBOL_GPL(dccp_getsockopt); + int dccp_sendmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len) { @@ -413,6 +427,8 @@ out_discard: goto out_release; } +EXPORT_SYMBOL_GPL(dccp_sendmsg); + int dccp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, size_t len, int nonblock, int flags, int *addr_len) { @@ -510,7 +526,9 @@ out: return len; } -static int inet_dccp_listen(struct socket *sock, int backlog) +EXPORT_SYMBOL_GPL(dccp_recvmsg); + +int inet_dccp_listen(struct socket *sock, int backlog) { struct sock *sk = sock->sk; unsigned char old_state; @@ -546,6 +564,8 @@ out: return err; } +EXPORT_SYMBOL_GPL(inet_dccp_listen); + static const unsigned char dccp_new_state[] = { /* current state: new state: action: */ [0] = DCCP_CLOSED, @@ -651,11 +671,15 @@ adjudge_to_death: sock_put(sk); } +EXPORT_SYMBOL_GPL(dccp_close); + void dccp_shutdown(struct sock *sk, int how) { dccp_pr_debug("entry\n"); } +EXPORT_SYMBOL_GPL(dccp_shutdown); + static struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, @@ -763,6 +787,8 @@ MODULE_PARM_DESC(thash_entries, "Number of ehash buckets"); int dccp_debug; module_param(dccp_debug, int, 0444); MODULE_PARM_DESC(dccp_debug, "Enable debug messages"); + +EXPORT_SYMBOL_GPL(dccp_debug); #endif static int __init dccp_init(void) -- cgit v1.1 From 399c07def62a77678d633f5b3005431423a424a8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:24:28 -0800 Subject: [IPV6]: Export ipv6_opt_accepted It was already non-TCP specific, will be used by DCCPv6. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/ipv6.h | 2 ++ net/ipv6/af_inet6.c | 21 +++++++++++++++++++++ net/ipv6/tcp_ipv6.c | 16 ---------------- 3 files changed, 23 insertions(+), 16 deletions(-) diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 0a2ad51..8513761 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -240,6 +240,8 @@ extern struct ipv6_txoptions * ipv6_renew_options(struct sock *sk, struct ipv6_t struct ipv6_txoptions *ipv6_fixup_options(struct ipv6_txoptions *opt_space, struct ipv6_txoptions *opt); +extern int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb); + extern int ip6_frag_nqueues; extern atomic_t ip6_frag_mem; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 23675ef..bf17aab 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -663,6 +663,27 @@ int inet6_sk_rebuild_header(struct sock *sk) EXPORT_SYMBOL_GPL(inet6_sk_rebuild_header); +int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_skb_parm *opt = IP6CB(skb); + + if (np->rxopt.all) { + if ((opt->hop && (np->rxopt.bits.hopopts || + np->rxopt.bits.ohopopts)) || + ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && + np->rxopt.bits.rxflow) || + (opt->srcrt && (np->rxopt.bits.srcrt || + np->rxopt.bits.osrcrt)) || + ((opt->dst1 || opt->dst0) && + (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) + return 1; + } + return 0; +} + +EXPORT_SYMBOL_GPL(ipv6_opt_accepted); + int snmp6_mib_init(void *ptr[2], size_t mibsize, size_t mibalign) { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index cb88007..e5c8a66 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -711,22 +711,6 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; -static int ipv6_opt_accepted(struct sock *sk, struct sk_buff *skb) -{ - struct ipv6_pinfo *np = inet6_sk(sk); - struct inet6_skb_parm *opt = IP6CB(skb); - - if (np->rxopt.all) { - if ((opt->hop && (np->rxopt.bits.hopopts || np->rxopt.bits.ohopopts)) || - ((IPV6_FLOWINFO_MASK & *(u32*)skb->nh.raw) && np->rxopt.bits.rxflow) || - (opt->srcrt && (np->rxopt.bits.srcrt || np->rxopt.bits.osrcrt)) || - ((opt->dst1 || opt->dst0) && (np->rxopt.bits.dstopts || np->rxopt.bits.odstopts))) - return 1; - } - return 0; -} - - static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); -- cgit v1.1 From 3df80d9320bcaea72b1b4761a319c79cb3fdaf5f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:24:53 -0800 Subject: [DCCP]: Introduce DCCPv6 Still needs mucho polishing, specially in the checksum code, but works just fine, inet_diag/iproute2 and all 8) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/Makefile | 4 + net/dccp/ipv6.c | 1438 ++++++++++++++++++++++++++++++++++++++++++++++++++ net/dccp/ipv6.h | 37 ++ net/dccp/minisocks.c | 13 +- 4 files changed, 1491 insertions(+), 1 deletion(-) create mode 100644 net/dccp/ipv6.c create mode 100644 net/dccp/ipv6.h diff --git a/net/dccp/Makefile b/net/dccp/Makefile index 344a8da..87b27ff 100644 --- a/net/dccp/Makefile +++ b/net/dccp/Makefile @@ -1,3 +1,7 @@ +obj-$(CONFIG_IPV6) += dccp_ipv6.o + +dccp_ipv6-y := ipv6.o + obj-$(CONFIG_IP_DCCP) += dccp.o dccp-y := ccid.o input.o ipv4.o minisocks.o options.o output.o proto.o \ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c new file mode 100644 index 0000000..a7d2aee --- /dev/null +++ b/net/dccp/ipv6.c @@ -0,0 +1,1438 @@ +/* + * DCCP over IPv6 + * Linux INET6 implementation + * + * Based on net/dccp6/ipv6.c + * + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "dccp.h" +#include "ipv6.h" + +static void dccp_v6_ctl_send_reset(struct sk_buff *skb); +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req); +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb); + +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb); + +static struct inet_connection_sock_af_ops dccp_ipv6_mapped; +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops; + +static int dccp_v6_get_port(struct sock *sk, unsigned short snum) +{ + return inet_csk_get_port(&dccp_hashinfo, sk, snum, + inet6_csk_bind_conflict); +} + +static void dccp_v6_hash(struct sock *sk) +{ + if (sk->sk_state != DCCP_CLOSED) { + if (inet_csk(sk)->icsk_af_ops == &dccp_ipv6_mapped) { + dccp_prot.hash(sk); + return; + } + local_bh_disable(); + __inet6_hash(&dccp_hashinfo, sk); + local_bh_enable(); + } +} + +static inline u16 dccp_v6_check(struct dccp_hdr *dh, int len, + struct in6_addr *saddr, + struct in6_addr *daddr, + unsigned long base) +{ + return csum_ipv6_magic(saddr, daddr, len, IPPROTO_DCCP, base); +} + +static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + + if (skb->protocol == htons(ETH_P_IPV6)) + return secure_tcpv6_sequence_number(skb->nh.ipv6h->daddr.s6_addr32, + skb->nh.ipv6h->saddr.s6_addr32, + dh->dccph_dport, + dh->dccph_sport); + else + return secure_dccp_sequence_number(skb->nh.iph->daddr, + skb->nh.iph->saddr, + dh->dccph_dport, + dh->dccph_sport); +} + +static int __dccp_v6_check_established(struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, + saddr, inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); + + tw = inet_twsk(sk2); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) + goto not_unique; + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hash = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, &dccp_death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 dccp_v6_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + + return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +static int dccp_v6_hash_connect(struct sock *sk) +{ + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + dccp_v6_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &dccp_hashinfo.bhash[inet_bhashfn(port, + dccp_hashinfo.bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__dccp_v6_check_established(sk, + port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, + head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet6_hash(&dccp_hashinfo, sk); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, &dccp_death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &dccp_hashinfo.bhash[inet_bhashfn(snum, + dccp_hashinfo.bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __inet6_hash(&dccp_hashinfo, sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __dccp_v6_check_established(sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_sock *inet = inet_sk(sk); + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_sock *dp = dccp_sk(sk); + struct in6_addr *saddr = NULL, *final_p = NULL, final; + struct flowi fl; + struct dst_entry *dst; + int addr_type; + int err; + + dp->dccps_role = DCCP_ROLE_CLIENT; + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + if (usin->sin6_family != AF_INET6) + return -EAFNOSUPPORT; + + memset(&fl, 0, sizeof(fl)); + + if (np->sndflow) { + fl.fl6_flowlabel = usin->sin6_flowinfo & IPV6_FLOWINFO_MASK; + IP6_ECN_flow_init(fl.fl6_flowlabel); + if (fl.fl6_flowlabel & IPV6_FLOWLABEL_MASK) { + struct ip6_flowlabel *flowlabel; + flowlabel = fl6_sock_lookup(sk, fl.fl6_flowlabel); + if (flowlabel == NULL) + return -EINVAL; + ipv6_addr_copy(&usin->sin6_addr, &flowlabel->dst); + fl6_sock_release(flowlabel); + } + } + + /* + * connect() to INADDR_ANY means loopback (BSD'ism). + */ + + if (ipv6_addr_any(&usin->sin6_addr)) + usin->sin6_addr.s6_addr[15] = 0x1; + + addr_type = ipv6_addr_type(&usin->sin6_addr); + + if(addr_type & IPV6_ADDR_MULTICAST) + return -ENETUNREACH; + + if (addr_type & IPV6_ADDR_LINKLOCAL) { + if (addr_len >= sizeof(struct sockaddr_in6) && + usin->sin6_scope_id) { + /* If interface is set while binding, indices + * must coincide. + */ + if (sk->sk_bound_dev_if && + sk->sk_bound_dev_if != usin->sin6_scope_id) + return -EINVAL; + + sk->sk_bound_dev_if = usin->sin6_scope_id; + } + + /* Connect to link-local address requires an interface */ + if (!sk->sk_bound_dev_if) + return -EINVAL; + } + + ipv6_addr_copy(&np->daddr, &usin->sin6_addr); + np->flow_label = fl.fl6_flowlabel; + + /* + * DCCP over IPv4 + */ + + if (addr_type == IPV6_ADDR_MAPPED) { + u32 exthdrlen = dp->dccps_ext_header_len; + struct sockaddr_in sin; + + SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); + + if (__ipv6_only_sock(sk)) + return -ENETUNREACH; + + sin.sin_family = AF_INET; + sin.sin_port = usin->sin6_port; + sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; + + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_mapped; + sk->sk_backlog_rcv = dccp_v4_do_rcv; + + err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); + + if (err) { + dp->dccps_ext_header_len = exthdrlen; + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_backlog_rcv = dccp_v6_do_rcv; + goto failure; + } else { + ipv6_addr_set(&np->saddr, 0, 0, htonl(0x0000FFFF), + inet->saddr); + ipv6_addr_set(&np->rcv_saddr, 0, 0, htonl(0x0000FFFF), + inet->rcv_saddr); + } + + return err; + } + + if (!ipv6_addr_any(&np->rcv_saddr)) + saddr = &np->rcv_saddr; + + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, saddr ? saddr : &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = usin->sin6_port; + fl.fl_ip_sport = inet->sport; + + if (np->opt && np->opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)np->opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto failure; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto failure; + + if (saddr == NULL) { + saddr = &fl.fl6_src; + ipv6_addr_copy(&np->rcv_saddr, saddr); + } + + /* set the source address */ + ipv6_addr_copy(&np->saddr, saddr); + inet->rcv_saddr = LOOPBACK4_IPV6; + + ip6_dst_store(sk, dst, NULL); + + dp->dccps_ext_header_len = 0; + if (np->opt) + dp->dccps_ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + + inet->dport = usin->sin6_port; + + dccp_set_state(sk, DCCP_REQUESTING); + err = dccp_v6_hash_connect(sk); + if (err) + goto late_failure; + /* FIXME */ +#if 0 + dp->dccps_gar = secure_dccp_v6_sequence_number(np->saddr.s6_addr32, + np->daddr.s6_addr32, + inet->sport, + inet->dport); +#endif + err = dccp_connect(sk); + if (err) + goto late_failure; + + return 0; + +late_failure: + dccp_set_state(sk, DCCP_CLOSED); + __sk_dst_reset(sk); +failure: + inet->dport = 0; + sk->sk_route_caps = 0; + return err; +} + +static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, + int type, int code, int offset, __u32 info) +{ + struct ipv6hdr *hdr = (struct ipv6hdr *)skb->data; + const struct dccp_hdr *dh = (struct dccp_hdr *)(skb->data + offset); + struct ipv6_pinfo *np; + struct sock *sk; + int err; + __u64 seq; + + sk = inet6_lookup(&dccp_hashinfo, &hdr->daddr, dh->dccph_dport, + &hdr->saddr, dh->dccph_sport, skb->dev->ifindex); + + if (sk == NULL) { + ICMP6_INC_STATS_BH(__in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); + return; + } + + if (sk->sk_state == DCCP_TIME_WAIT) { + inet_twsk_put((struct inet_timewait_sock *)sk); + return; + } + + bh_lock_sock(sk); + if (sock_owned_by_user(sk)) + NET_INC_STATS_BH(LINUX_MIB_LOCKDROPPEDICMPS); + + if (sk->sk_state == DCCP_CLOSED) + goto out; + + np = inet6_sk(sk); + + if (type == ICMPV6_PKT_TOOBIG) { + struct dccp_sock *dp = dccp_sk(sk); + struct dst_entry *dst = NULL; + + if (sock_owned_by_user(sk)) + goto out; + if ((1 << sk->sk_state) & (DCCPF_LISTEN | DCCPF_CLOSED)) + goto out; + + /* icmp should have updated the destination cache entry */ + dst = __sk_dst_check(sk, np->dst_cookie); + + if (dst == NULL) { + struct inet_sock *inet = inet_sk(sk); + struct flowi fl; + + /* BUGGG_FUTURE: Again, it is not clear how + to handle rthdr case. Ignore this complexity + for now. + */ + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &np->daddr); + ipv6_addr_copy(&fl.fl6_src, &np->saddr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet->dport; + fl.fl_ip_sport = inet->sport; + + if ((err = ip6_dst_lookup(sk, &dst, &fl))) { + sk->sk_err_soft = -err; + goto out; + } + + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) { + sk->sk_err_soft = -err; + goto out; + } + + } else + dst_hold(dst); + + if (dp->dccps_pmtu_cookie > dst_mtu(dst)) { + dccp_sync_mss(sk, dst_mtu(dst)); + } /* else let the usual retransmit timer handle it */ + dst_release(dst); + goto out; + } + + icmpv6_err_convert(type, code, &err); + + seq = DCCP_SKB_CB(skb)->dccpd_seq; + /* Might be for an request_sock */ + switch (sk->sk_state) { + struct request_sock *req, **prev; + case DCCP_LISTEN: + if (sock_owned_by_user(sk)) + goto out; + + req = inet6_csk_search_req(sk, &prev, dh->dccph_dport, + &hdr->daddr, &hdr->saddr, + inet6_iif(skb)); + if (!req) + goto out; + + /* ICMPs are not backlogged, hence we cannot get + * an established socket here. + */ + BUG_TRAP(req->sk == NULL); + + if (seq != dccp_rsk(req)->dreq_iss) { + NET_INC_STATS_BH(LINUX_MIB_OUTOFWINDOWICMPS); + goto out; + } + + inet_csk_reqsk_queue_drop(sk, req, prev); + goto out; + + case DCCP_REQUESTING: + case DCCP_RESPOND: /* Cannot happen. + It can, it SYNs are crossed. --ANK */ + if (!sock_owned_by_user(sk)) { + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + sk->sk_err = err; + /* + * Wake people up to see the error + * (see connect in sock.c) + */ + sk->sk_error_report(sk); + + dccp_done(sk); + } else + sk->sk_err_soft = err; + goto out; + } + + if (!sock_owned_by_user(sk) && np->recverr) { + sk->sk_err = err; + sk->sk_error_report(sk); + } else + sk->sk_err_soft = err; + +out: + bh_unlock_sock(sk); + sock_put(sk); +} + + +static int dccp_v6_send_response(struct sock *sk, struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *skb; + struct ipv6_txoptions *opt = NULL; + struct in6_addr *final_p = NULL, final; + struct flowi fl; + int err = -1; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.fl6_flowlabel = 0; + fl.oif = ireq6->iif; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (dst == NULL) { + opt = np->opt; + if (opt == NULL && + np->rxopt.bits.osrcrt == 2 && + ireq6->pktopts) { + struct sk_buff *pktopts = ireq6->pktopts; + struct inet6_skb_parm *rxopt = IP6CB(pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(pktopts->nh.raw + + rxopt->srcrt)); + } + + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *)opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + + err = ip6_dst_lookup(sk, &dst, &fl); + if (err) + goto done; + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + if ((err = xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto done; + } + + skb = dccp_make_response(sk, dst, req); + if (skb != NULL) { + struct dccp_hdr *dh = dccp_hdr(skb); + dh->dccph_checksum = dccp_v6_check(dh, skb->len, + &ireq6->loc_addr, + &ireq6->rmt_addr, + csum_partial((char *)dh, + skb->len, + skb->csum)); + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + err = ip6_xmit(sk, skb, &fl, opt, 0); + if (err == NET_XMIT_CN) + err = 0; + } + +done: + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + return err; +} + +static void dccp_v6_reqsk_destructor(struct request_sock *req) +{ + if (inet6_rsk(req)->pktopts != NULL) + kfree_skb(inet6_rsk(req)->pktopts); +} + +static struct request_sock_ops dccp6_request_sock_ops = { + .family = AF_INET6, + .obj_size = sizeof(struct dccp6_request_sock), + .rtx_syn_ack = dccp_v6_send_response, + .send_ack = dccp_v6_reqsk_send_ack, + .destructor = dccp_v6_reqsk_destructor, + .send_reset = dccp_v6_ctl_send_reset, +}; + +static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct dccp_hdr *dh = dccp_hdr(skb); + + dh->dccph_checksum = csum_ipv6_magic(&np->saddr, &np->daddr, + len, IPPROTO_DCCP, + csum_partial((char *)dh, + dh->dccph_doff << 2, + skb->csum)); +} + +static void dccp_v6_ctl_send_reset(struct sk_buff *rxskb) +{ + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_reset_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_reset); + struct sk_buff *skb; + struct flowi fl; + u64 seqno; + + if (rxdh->dccph_type == DCCP_PKT_RESET) + return; + + if (!ipv6_unicast_destination(rxskb)) + return; + + /* + * We need to grab some memory, and put together an RST, + * and then put it into the queue to be sent. + */ + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_reset_len); + + skb->h.raw = skb_push(skb, dccp_hdr_reset_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_reset_len); + + /* Swap the send and the receive. */ + dh->dccph_type = DCCP_PKT_RESET; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_reset_len / 4; + dh->dccph_x = 1; + dccp_hdr_reset(skb)->dccph_reset_code = + DCCP_SKB_CB(rxskb)->dccpd_reset_code; + + /* See "8.3.1. Abnormal Termination" in draft-ietf-dccp-spec-11 */ + seqno = 0; + if (DCCP_SKB_CB(rxskb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_set_seqno(&seqno, DCCP_SKB_CB(rxskb)->dccpd_ack_seq + 1); + + dccp_hdr_set_seq(dh, seqno); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + dh->dccph_checksum = csum_ipv6_magic(&fl.fl6_src, &fl.fl6_dst, + sizeof(*dh), IPPROTO_DCCP, + skb->csum); + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + /* sk = NULL, but it is safe for now. RST socket required. */ + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + DCCP_INC_STATS_BH(DCCP_MIB_OUTRSTS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_ctl_send_ack(struct sk_buff *rxskb) +{ + struct flowi fl; + struct dccp_hdr *rxdh = dccp_hdr(rxskb), *dh; + const int dccp_hdr_ack_len = sizeof(struct dccp_hdr) + + sizeof(struct dccp_hdr_ext) + + sizeof(struct dccp_hdr_ack_bits); + struct sk_buff *skb; + + skb = alloc_skb(MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len, GFP_ATOMIC); + if (skb == NULL) + return; + + skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr) + + dccp_hdr_ack_len); + + skb->h.raw = skb_push(skb, dccp_hdr_ack_len); + dh = dccp_hdr(skb); + memset(dh, 0, dccp_hdr_ack_len); + + /* Build DCCP header and checksum it. */ + dh->dccph_type = DCCP_PKT_ACK; + dh->dccph_sport = rxdh->dccph_dport; + dh->dccph_dport = rxdh->dccph_sport; + dh->dccph_doff = dccp_hdr_ack_len / 4; + dh->dccph_x = 1; + + dccp_hdr_set_seq(dh, DCCP_SKB_CB(rxskb)->dccpd_ack_seq); + dccp_hdr_set_ack(dccp_hdr_ack_bits(skb), + DCCP_SKB_CB(rxskb)->dccpd_seq); + + memset(&fl, 0, sizeof(fl)); + ipv6_addr_copy(&fl.fl6_dst, &rxskb->nh.ipv6h->saddr); + ipv6_addr_copy(&fl.fl6_src, &rxskb->nh.ipv6h->daddr); + + /* FIXME: calculate checksum, IPv4 also should... */ + + fl.proto = IPPROTO_DCCP; + fl.oif = inet6_iif(rxskb); + fl.fl_ip_dport = dh->dccph_dport; + fl.fl_ip_sport = dh->dccph_sport; + + if (!ip6_dst_lookup(NULL, &skb->dst, &fl)) { + if (xfrm_lookup(&skb->dst, &fl, NULL, 0) >= 0) { + ip6_xmit(NULL, skb, &fl, NULL, 0); + DCCP_INC_STATS_BH(DCCP_MIB_OUTSEGS); + return; + } + } + + kfree_skb(skb); +} + +static void dccp_v6_reqsk_send_ack(struct sk_buff *skb, + struct request_sock *req) +{ + dccp_v6_ctl_send_ack(skb); +} + +static struct sock *dccp_v6_hnd_req(struct sock *sk,struct sk_buff *skb) +{ + const struct dccp_hdr *dh = dccp_hdr(skb); + const struct ipv6hdr *iph = skb->nh.ipv6h; + struct sock *nsk; + struct request_sock **prev; + /* Find possible connection requests. */ + struct request_sock *req = inet6_csk_search_req(sk, &prev, + dh->dccph_sport, + &iph->saddr, + &iph->daddr, + inet6_iif(skb)); + if (req != NULL) + return dccp_check_req(sk, skb, req, prev); + + nsk = __inet6_lookup_established(&dccp_hashinfo, + &iph->saddr, dh->dccph_sport, + &iph->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + + if (nsk != NULL) { + if (nsk->sk_state != DCCP_TIME_WAIT) { + bh_lock_sock(nsk); + return nsk; + } + inet_twsk_put((struct inet_timewait_sock *)nsk); + return NULL; + } + + return sk; +} + +static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) +{ + struct inet_request_sock *ireq; + struct dccp_sock dp; + struct request_sock *req; + struct dccp_request_sock *dreq; + struct inet6_request_sock *ireq6; + struct ipv6_pinfo *np = inet6_sk(sk); + const __u32 service = dccp_hdr_request(skb)->dccph_req_service; + struct dccp_skb_cb *dcb = DCCP_SKB_CB(skb); + __u8 reset_code = DCCP_RESET_CODE_TOO_BUSY; + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_conn_request(sk, skb); + + if (!ipv6_unicast_destination(skb)) + goto drop; + + if (dccp_bad_service_code(sk, service)) { + reset_code = DCCP_RESET_CODE_BAD_SERVICE_CODE; + goto drop; + } + /* + * There are no SYN attacks on IPv6, yet... + */ + if (inet_csk_reqsk_queue_is_full(sk)) + goto drop; + + if (sk_acceptq_is_full(sk) && inet_csk_reqsk_queue_young(sk) > 1) + goto drop; + + req = inet6_reqsk_alloc(sk->sk_prot->rsk_prot); + if (req == NULL) + goto drop; + + /* FIXME: process options */ + + dccp_openreq_init(req, &dp, skb); + + ireq6 = inet6_rsk(req); + ireq = inet_rsk(req); + ipv6_addr_copy(&ireq6->rmt_addr, &skb->nh.ipv6h->saddr); + ipv6_addr_copy(&ireq6->loc_addr, &skb->nh.ipv6h->daddr); + req->rcv_wnd = 100; /* Fake, option parsing will get the + right value */ + ireq6->pktopts = NULL; + + if (ipv6_opt_accepted(sk, skb) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { + atomic_inc(&skb->users); + ireq6->pktopts = skb; + } + ireq6->iif = sk->sk_bound_dev_if; + + /* So that link locals have meaning */ + if (!sk->sk_bound_dev_if && + ipv6_addr_type(&ireq6->rmt_addr) & IPV6_ADDR_LINKLOCAL) + ireq6->iif = inet6_iif(skb); + + /* + * Step 3: Process LISTEN state + * + * Set S.ISR, S.GSR, S.SWL, S.SWH from packet or Init Cookie + * + * In fact we defer setting S.GSR, S.SWL, S.SWH to + * dccp_create_openreq_child. + */ + dreq = dccp_rsk(req); + dreq->dreq_isr = dcb->dccpd_seq; + dreq->dreq_iss = dccp_v6_init_sequence(sk, skb); + dreq->dreq_service = service; + + if (dccp_v6_send_response(sk, req, NULL)) + goto drop_and_free; + + inet6_csk_reqsk_queue_hash_add(sk, req, DCCP_TIMEOUT_INIT); + return 0; + +drop_and_free: + reqsk_free(req); +drop: + DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); + dcb->dccpd_reset_code = reset_code; + return -1; +} + +static struct sock *dccp_v6_request_recv_sock(struct sock *sk, + struct sk_buff *skb, + struct request_sock *req, + struct dst_entry *dst) +{ + struct inet6_request_sock *ireq6 = inet6_rsk(req); + struct ipv6_pinfo *newnp, *np = inet6_sk(sk); + struct inet_sock *newinet; + struct dccp_sock *newdp; + struct dccp6_sock *newdp6; + struct sock *newsk; + struct ipv6_txoptions *opt; + + if (skb->protocol == htons(ETH_P_IP)) { + /* + * v6 mapped + */ + + newsk = dccp_v4_request_recv_sock(sk, skb, req, dst); + if (newsk == NULL) + return NULL; + + newdp6 = (struct dccp6_sock *)newsk; + newdp = dccp_sk(newsk); + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_set(&newnp->daddr, 0, 0, htonl(0x0000FFFF), + newinet->daddr); + + ipv6_addr_set(&newnp->saddr, 0, 0, htonl(0x0000FFFF), + newinet->saddr); + + ipv6_addr_copy(&newnp->rcv_saddr, &newnp->saddr); + + inet_csk(newsk)->icsk_af_ops = &dccp_ipv6_mapped; + newsk->sk_backlog_rcv = dccp_v4_do_rcv; + newnp->pktoptions = NULL; + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks count + * here, dccp_create_openreq_child now does this for us, see the comment in + * that function for the gory details. -acme + */ + + /* It is tricky place. Until this moment IPv4 tcp + worked with IPv6 icsk.icsk_af_ops. + Sync it now. + */ + dccp_sync_mss(newsk, newdp->dccps_pmtu_cookie); + + return newsk; + } + + opt = np->opt; + + if (sk_acceptq_is_full(sk)) + goto out_overflow; + + if (np->rxopt.bits.osrcrt == 2 && + opt == NULL && ireq6->pktopts) { + struct inet6_skb_parm *rxopt = IP6CB(ireq6->pktopts); + if (rxopt->srcrt) + opt = ipv6_invert_rthdr(sk, + (struct ipv6_rt_hdr *)(ireq6->pktopts->nh.raw + + rxopt->srcrt)); + } + + if (dst == NULL) { + struct in6_addr *final_p = NULL, final; + struct flowi fl; + + memset(&fl, 0, sizeof(fl)); + fl.proto = IPPROTO_DCCP; + ipv6_addr_copy(&fl.fl6_dst, &ireq6->rmt_addr); + if (opt && opt->srcrt) { + struct rt0_hdr *rt0 = (struct rt0_hdr *) opt->srcrt; + ipv6_addr_copy(&final, &fl.fl6_dst); + ipv6_addr_copy(&fl.fl6_dst, rt0->addr); + final_p = &final; + } + ipv6_addr_copy(&fl.fl6_src, &ireq6->loc_addr); + fl.oif = sk->sk_bound_dev_if; + fl.fl_ip_dport = inet_rsk(req)->rmt_port; + fl.fl_ip_sport = inet_sk(sk)->sport; + + if (ip6_dst_lookup(sk, &dst, &fl)) + goto out; + + if (final_p) + ipv6_addr_copy(&fl.fl6_dst, final_p); + + if ((xfrm_lookup(&dst, &fl, sk, 0)) < 0) + goto out; + } + + newsk = dccp_create_openreq_child(sk, req, skb); + if (newsk == NULL) + goto out; + + /* + * No need to charge this sock to the relevant IPv6 refcnt debug socks + * count here, dccp_create_openreq_child now does this for us, see the + * comment in that function for the gory details. -acme + */ + + ip6_dst_store(newsk, dst, NULL); + newsk->sk_route_caps = dst->dev->features & + ~(NETIF_F_IP_CSUM | NETIF_F_TSO); + + newdp6 = (struct dccp6_sock *)newsk; + newinet = inet_sk(newsk); + newinet->pinet6 = &newdp6->inet6; + newdp = dccp_sk(newsk); + newnp = inet6_sk(newsk); + + memcpy(newnp, np, sizeof(struct ipv6_pinfo)); + + ipv6_addr_copy(&newnp->daddr, &ireq6->rmt_addr); + ipv6_addr_copy(&newnp->saddr, &ireq6->loc_addr); + ipv6_addr_copy(&newnp->rcv_saddr, &ireq6->loc_addr); + newsk->sk_bound_dev_if = ireq6->iif; + + /* Now IPv6 options... + + First: no IPv4 options. + */ + newinet->opt = NULL; + + /* Clone RX bits */ + newnp->rxopt.all = np->rxopt.all; + + /* Clone pktoptions received with SYN */ + newnp->pktoptions = NULL; + if (ireq6->pktopts != NULL) { + newnp->pktoptions = skb_clone(ireq6->pktopts, GFP_ATOMIC); + kfree_skb(ireq6->pktopts); + ireq6->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } + newnp->opt = NULL; + newnp->mcast_oif = inet6_iif(skb); + newnp->mcast_hops = skb->nh.ipv6h->hop_limit; + + /* Clone native IPv6 options from listening socket (if any) + + Yes, keeping reference count would be much more clever, + but we make one more one thing there: reattach optmem + to newsk. + */ + if (opt) { + newnp->opt = ipv6_dup_options(newsk, opt); + if (opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + } + + newdp->dccps_ext_header_len = 0; + if (newnp->opt) + newdp->dccps_ext_header_len = newnp->opt->opt_nflen + + newnp->opt->opt_flen; + + dccp_sync_mss(newsk, dst_mtu(dst)); + + newinet->daddr = newinet->saddr = newinet->rcv_saddr = LOOPBACK4_IPV6; + + __inet6_hash(&dccp_hashinfo, newsk); + inet_inherit_port(&dccp_hashinfo, sk, newsk); + + return newsk; + +out_overflow: + NET_INC_STATS_BH(LINUX_MIB_LISTENOVERFLOWS); +out: + NET_INC_STATS_BH(LINUX_MIB_LISTENDROPS); + if (opt && opt != np->opt) + sock_kfree_s(sk, opt, opt->tot_len); + dst_release(dst); + return NULL; +} + +/* The socket must have it's spinlock held when we get + * here. + * + * We have a potential double-lock case here, so even when + * doing backlog processing we use the BH locking scheme. + * This is because we cannot sleep with the original spinlock + * held. + */ +static int dccp_v6_do_rcv(struct sock *sk, struct sk_buff *skb) +{ + struct ipv6_pinfo *np = inet6_sk(sk); + struct sk_buff *opt_skb = NULL; + + /* Imagine: socket is IPv6. IPv4 packet arrives, + goes to IPv4 receive handler and backlogged. + From backlog it always goes here. Kerboom... + Fortunately, dccp_rcv_established and rcv_established + handle them correctly, but it is not case with + dccp_v6_hnd_req and dccp_v6_ctl_send_reset(). --ANK + */ + + if (skb->protocol == htons(ETH_P_IP)) + return dccp_v4_do_rcv(sk, skb); + + if (sk_filter(sk, skb, 0)) + goto discard; + + /* + * socket locking is here for SMP purposes as backlog rcv + * is currently called with bh processing disabled. + */ + + /* Do Stevens' IPV6_PKTOPTIONS. + + Yes, guys, it is the only place in our code, where we + may make it not affecting IPv4. + The rest of code is protocol independent, + and I do not like idea to uglify IPv4. + + Actually, all the idea behind IPV6_PKTOPTIONS + looks not very well thought. For now we latch + options, received in the last packet, enqueued + by tcp. Feel free to propose better solution. + --ANK (980728) + */ + if (np->rxopt.all) + opt_skb = skb_clone(skb, GFP_ATOMIC); + + if (sk->sk_state == DCCP_OPEN) { /* Fast path */ + if (dccp_rcv_established(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + } + + if (sk->sk_state == DCCP_LISTEN) { + struct sock *nsk = dccp_v6_hnd_req(sk, skb); + if (!nsk) + goto discard; + + /* + * Queue it on the new socket if the new socket is active, + * otherwise we just shortcircuit this and continue with + * the new socket.. + */ + if(nsk != sk) { + if (dccp_child_process(sk, nsk, skb)) + goto reset; + if (opt_skb) + __kfree_skb(opt_skb); + return 0; + } + } + + if (dccp_rcv_state_process(sk, skb, dccp_hdr(skb), skb->len)) + goto reset; + return 0; + +reset: + dccp_v6_ctl_send_reset(skb); +discard: + if (opt_skb) + __kfree_skb(opt_skb); + kfree_skb(skb); + return 0; +} + +static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) +{ + const struct dccp_hdr *dh; + struct sk_buff *skb = *pskb; + struct sock *sk; + int rc; + + /* Step 1: Check header basics: */ + + if (dccp_invalid_packet(skb)) + goto discard_it; + + dh = dccp_hdr(skb); + + DCCP_SKB_CB(skb)->dccpd_seq = dccp_hdr_seq(skb); + DCCP_SKB_CB(skb)->dccpd_type = dh->dccph_type; + + if (dccp_packet_without_ack(skb)) + DCCP_SKB_CB(skb)->dccpd_ack_seq = DCCP_PKT_WITHOUT_ACK_SEQ; + else + DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb); + + /* Step 2: + * Look up flow ID in table and get corresponding socket */ + sk = __inet6_lookup(&dccp_hashinfo, &skb->nh.ipv6h->saddr, + dh->dccph_sport, + &skb->nh.ipv6h->daddr, ntohs(dh->dccph_dport), + inet6_iif(skb)); + /* + * Step 2: + * If no socket ... + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (sk == NULL) + goto no_dccp_socket; + + /* + * Step 2: + * ... or S.state == TIMEWAIT, + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + + if (sk->sk_state == DCCP_TIME_WAIT) + goto do_time_wait; + + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + goto discard_and_relse; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + rc = 0; + if (!sock_owned_by_user(sk)) + rc = dccp_v6_do_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); + + sock_put(sk); + return rc ? -1 : 0; + +no_dccp_socket: + if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) + goto discard_it; + /* + * Step 2: + * Generate Reset(No Connection) unless P.type == Reset + * Drop packet and return + */ + if (dh->dccph_type != DCCP_PKT_RESET) { + DCCP_SKB_CB(skb)->dccpd_reset_code = + DCCP_RESET_CODE_NO_CONNECTION; + dccp_v6_ctl_send_reset(skb); + } +discard_it: + + /* + * Discard frame + */ + + kfree_skb(skb); + return 0; + +discard_and_relse: + sock_put(sk); + goto discard_it; + +do_time_wait: + inet_twsk_put((struct inet_timewait_sock *)sk); + goto no_dccp_socket; +} + +static struct inet_connection_sock_af_ops dccp_ipv6_af_ops = { + .queue_xmit = inet6_csk_xmit, + .send_check = dccp_v6_send_check, + .rebuild_header = inet6_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct ipv6hdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* + * DCCP over IPv4 via INET6 API + */ +static struct inet_connection_sock_af_ops dccp_ipv6_mapped = { + .queue_xmit = ip_queue_xmit, + .send_check = dccp_v4_send_check, + .rebuild_header = inet_sk_rebuild_header, + .conn_request = dccp_v6_conn_request, + .syn_recv_sock = dccp_v6_request_recv_sock, + .net_header_len = sizeof(struct iphdr), + .setsockopt = ipv6_setsockopt, + .getsockopt = ipv6_getsockopt, + .addr2sockaddr = inet6_csk_addr2sockaddr, + .sockaddr_len = sizeof(struct sockaddr_in6) +}; + +/* NOTE: A lot of things set to zero explicitly by call to + * sk_alloc() so need not be done here. + */ +static int dccp_v6_init_sock(struct sock *sk) +{ + int err = dccp_v4_init_sock(sk); + + if (err == 0) + inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + + return err; +} + +static int dccp_v6_destroy_sock(struct sock *sk) +{ + dccp_v4_destroy_sock(sk); + return inet6_destroy_sock(sk); +} + +static struct proto dccp_v6_prot = { + .name = "DCCPv6", + .owner = THIS_MODULE, + .close = dccp_close, + .connect = dccp_v6_connect, + .disconnect = dccp_disconnect, + .ioctl = dccp_ioctl, + .init = dccp_v6_init_sock, + .setsockopt = dccp_setsockopt, + .getsockopt = dccp_getsockopt, + .sendmsg = dccp_sendmsg, + .recvmsg = dccp_recvmsg, + .backlog_rcv = dccp_v6_do_rcv, + .hash = dccp_v6_hash, + .unhash = dccp_unhash, + .accept = inet_csk_accept, + .get_port = dccp_v6_get_port, + .shutdown = dccp_shutdown, + .destroy = dccp_v6_destroy_sock, + .orphan_count = &dccp_orphan_count, + .max_header = MAX_DCCP_HEADER, + .obj_size = sizeof(struct dccp6_sock), + .rsk_prot = &dccp6_request_sock_ops, + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + +static struct inet6_protocol dccp_v6_protocol = { + .handler = dccp_v6_rcv, + .err_handler = dccp_v6_err, + .flags = INET6_PROTO_NOPOLICY | INET6_PROTO_FINAL, +}; + +static struct proto_ops inet6_dccp_ops = { + .family = PF_INET6, + .owner = THIS_MODULE, + .release = inet6_release, + .bind = inet6_bind, + .connect = inet_stream_connect, + .socketpair = sock_no_socketpair, + .accept = inet_accept, + .getname = inet6_getname, + .poll = dccp_poll, + .ioctl = inet6_ioctl, + .listen = inet_dccp_listen, + .shutdown = inet_shutdown, + .setsockopt = sock_common_setsockopt, + .getsockopt = sock_common_getsockopt, + .sendmsg = inet_sendmsg, + .recvmsg = sock_common_recvmsg, + .mmap = sock_no_mmap, + .sendpage = sock_no_sendpage, +}; + +static struct inet_protosw dccp_v6_protosw = { + .type = SOCK_DCCP, + .protocol = IPPROTO_DCCP, + .prot = &dccp_v6_prot, + .ops = &inet6_dccp_ops, + .capability = -1, +}; + +static int __init dccp_v6_init(void) +{ + int err = proto_register(&dccp_v6_prot, 1); + + if (err != 0) + goto out; + + err = inet6_add_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + if (err != 0) + goto out_unregister_proto; + + inet6_register_protosw(&dccp_v6_protosw); +out: + return err; +out_unregister_proto: + proto_unregister(&dccp_v6_prot); + goto out; +} + +static void __exit dccp_v6_exit(void) +{ + inet6_del_protocol(&dccp_v6_protocol, IPPROTO_DCCP); + inet6_unregister_protosw(&dccp_v6_protosw); + proto_unregister(&dccp_v6_prot); +} + +module_init(dccp_v6_init); +module_exit(dccp_v6_exit); + +/* + * __stringify doesn't likes enums, so use SOCK_DCCP (6) and IPPROTO_DCCP (33) + * values directly, Also cover the case where the protocol is not specified, + * i.e. net-pf-PF_INET6-proto-0-type-SOCK_DCCP + */ +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-33-type-6"); +MODULE_ALIAS("net-pf-" __stringify(PF_INET6) "-proto-0-type-6"); +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Arnaldo Carvalho de Melo "); +MODULE_DESCRIPTION("DCCPv6 - Datagram Congestion Controlled Protocol"); diff --git a/net/dccp/ipv6.h b/net/dccp/ipv6.h new file mode 100644 index 0000000..e4d4e93 --- /dev/null +++ b/net/dccp/ipv6.h @@ -0,0 +1,37 @@ +#ifndef _DCCP_IPV6_H +#define _DCCP_IPV6_H +/* + * net/dccp/ipv6.h + * + * An implementation of the DCCP protocol + * Copyright (c) 2005 Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include + +struct dccp6_sock { + struct dccp_sock dccp; + /* + * ipv6_pinfo has to be the last member of dccp6_sock, + * see inet6_sk_generic. + */ + struct ipv6_pinfo inet6; +}; + +struct dccp6_request_sock { + struct dccp_request_sock dccp; + struct inet6_request_sock inet6; +}; + +struct dccp6_timewait_sock { + struct inet_timewait_sock inet; + struct inet6_timewait_sock tw6; +}; + +#endif /* _DCCP_IPV6_H */ diff --git a/net/dccp/minisocks.c b/net/dccp/minisocks.c index 5c767b5..29261fc 100644 --- a/net/dccp/minisocks.c +++ b/net/dccp/minisocks.c @@ -52,7 +52,18 @@ void dccp_time_wait(struct sock *sk, int state, int timeo) if (tw != NULL) { const struct inet_connection_sock *icsk = inet_csk(sk); const int rto = (icsk->icsk_rto << 2) - (icsk->icsk_rto >> 1); - +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + if (tw->tw_family == PF_INET6) { + const struct ipv6_pinfo *np = inet6_sk(sk); + struct inet6_timewait_sock *tw6; + + tw->tw_ipv6_offset = inet6_tw_offset(sk->sk_prot); + tw6 = inet6_twsk((struct sock *)tw); + ipv6_addr_copy(&tw6->tw_v6_daddr, &np->daddr); + ipv6_addr_copy(&tw6->tw_v6_rcv_saddr, &np->rcv_saddr); + tw->tw_ipv6only = np->ipv6only; + } +#endif /* Linkage updates. */ __inet_twsk_hashdance(tw, sk, &dccp_hashinfo); -- cgit v1.1 From fc44b9805324c0ad2733ea2feea9935cc056709d Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:06 -0800 Subject: [DCCP]: Use reqsk_free in dccp_v4_conn_request Now we have the destructor (dccp_v4_reqsk_destructor) in our request_sock_ops vtable. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/ipv4.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 3108c9d..bc28d71 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -721,10 +721,7 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) return 0; drop_and_free: - /* - * FIXME: should be reqsk_free after implementing req->rsk_ops - */ - __reqsk_free(req); + reqsk_free(req); drop: DCCP_INC_STATS_BH(DCCP_MIB_ATTEMPTFAILS); dcb->dccpd_reset_code = reset_code; -- cgit v1.1 From 6d6ee43e0b8b8d4847627fd43739b98ec2b9404f Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:19 -0800 Subject: [TWSK]: Introduce struct timewait_sock_ops So that we can share several timewait sockets related functions and make the timewait mini sockets infrastructure closer to the request mini sockets one. Next changesets will take advantage of this, moving more code out of TCP and DCCP v4 and v6 to common infrastructure. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/ipv6.h | 3 +- include/net/inet_timewait_sock.h | 3 +- include/net/sock.h | 4 +-- include/net/tcp.h | 3 ++ include/net/timewait_sock.h | 31 ++++++++++++++++++ net/core/sock.c | 21 ++++++------ net/dccp/ipv4.c | 9 ++++- net/dccp/ipv6.c | 6 +++- net/ipv4/inet_timewait_sock.c | 5 +-- net/ipv4/tcp_ipv4.c | 71 ++++++++++++++++++++++++---------------- net/ipv6/tcp_ipv6.c | 25 +++++--------- 11 files changed, 118 insertions(+), 63 deletions(-) create mode 100644 include/net/timewait_sock.h diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index 7d39085..a0d0489 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -360,7 +360,8 @@ struct tcp6_timewait_sock { static inline u16 inet6_tw_offset(const struct proto *prot) { - return prot->twsk_obj_size - sizeof(struct inet6_timewait_sock); + return prot->twsk_prot->twsk_obj_size - + sizeof(struct inet6_timewait_sock); } static inline struct inet6_timewait_sock *inet6_twsk(const struct sock *sk) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index ca240f8..e396a65 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -26,6 +26,7 @@ #include #include +#include #include @@ -200,7 +201,7 @@ static inline void inet_twsk_put(struct inet_timewait_sock *tw) printk(KERN_DEBUG "%s timewait_sock %p released\n", tw->tw_prot->name, tw); #endif - kmem_cache_free(tw->tw_prot->twsk_slab, tw); + kmem_cache_free(tw->tw_prot->twsk_prot->twsk_slab, tw); module_put(owner); } } diff --git a/include/net/sock.h b/include/net/sock.h index 0fbae85..91d2895 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -493,6 +493,7 @@ extern void sk_stream_kill_queues(struct sock *sk); extern int sk_wait_data(struct sock *sk, long *timeo); struct request_sock_ops; +struct timewait_sock_ops; /* Networking protocol blocks we attach to sockets. * socket layer -> transport layer interface @@ -557,11 +558,10 @@ struct proto { kmem_cache_t *slab; unsigned int obj_size; - kmem_cache_t *twsk_slab; - unsigned int twsk_obj_size; atomic_t *orphan_count; struct request_sock_ops *rsk_prot; + struct timewait_sock_ops *twsk_prot; struct module *owner; diff --git a/include/net/tcp.h b/include/net/tcp.h index 83b117a..176221c 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -287,6 +287,9 @@ extern int tcp_rcv_established(struct sock *sk, extern void tcp_rcv_space_adjust(struct sock *sk); +extern int tcp_twsk_unique(struct sock *sk, + struct sock *sktw, void *twp); + static inline void tcp_dec_quickack_mode(struct sock *sk, const unsigned int pkts) { diff --git a/include/net/timewait_sock.h b/include/net/timewait_sock.h new file mode 100644 index 0000000..2544281 --- /dev/null +++ b/include/net/timewait_sock.h @@ -0,0 +1,31 @@ +/* + * NET Generic infrastructure for Network protocols. + * + * Authors: Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _TIMEWAIT_SOCK_H +#define _TIMEWAIT_SOCK_H + +#include +#include + +struct timewait_sock_ops { + kmem_cache_t *twsk_slab; + unsigned int twsk_obj_size; + int (*twsk_unique)(struct sock *sk, + struct sock *sktw, void *twp); +}; + +static inline int twsk_unique(struct sock *sk, struct sock *sktw, void *twp) +{ + if (sk->sk_prot->twsk_prot->twsk_unique != NULL) + return sk->sk_prot->twsk_prot->twsk_unique(sk, sktw, twp); + return 0; +} + +#endif /* _TIMEWAIT_SOCK_H */ diff --git a/net/core/sock.c b/net/core/sock.c index 13cc3be..6465b0e 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1488,7 +1488,7 @@ int proto_register(struct proto *prot, int alloc_slab) } } - if (prot->twsk_obj_size) { + if (prot->twsk_prot != NULL) { static const char mask[] = "tw_sock_%s"; timewait_sock_slab_name = kmalloc(strlen(prot->name) + sizeof(mask) - 1, GFP_KERNEL); @@ -1497,11 +1497,12 @@ int proto_register(struct proto *prot, int alloc_slab) goto out_free_request_sock_slab; sprintf(timewait_sock_slab_name, mask, prot->name); - prot->twsk_slab = kmem_cache_create(timewait_sock_slab_name, - prot->twsk_obj_size, - 0, SLAB_HWCACHE_ALIGN, - NULL, NULL); - if (prot->twsk_slab == NULL) + prot->twsk_prot->twsk_slab = + kmem_cache_create(timewait_sock_slab_name, + prot->twsk_prot->twsk_obj_size, + 0, SLAB_HWCACHE_ALIGN, + NULL, NULL); + if (prot->twsk_prot->twsk_slab == NULL) goto out_free_timewait_sock_slab_name; } } @@ -1548,12 +1549,12 @@ void proto_unregister(struct proto *prot) prot->rsk_prot->slab = NULL; } - if (prot->twsk_slab != NULL) { - const char *name = kmem_cache_name(prot->twsk_slab); + if (prot->twsk_prot != NULL && prot->twsk_prot->twsk_slab != NULL) { + const char *name = kmem_cache_name(prot->twsk_prot->twsk_slab); - kmem_cache_destroy(prot->twsk_slab); + kmem_cache_destroy(prot->twsk_prot->twsk_slab); kfree(name); - prot->twsk_slab = NULL; + prot->twsk_prot->twsk_slab = NULL; } } diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index bc28d71..e11cda0 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include @@ -1309,6 +1310,10 @@ static struct request_sock_ops dccp_request_sock_ops = { .send_reset = dccp_v4_ctl_send_reset, }; +static struct timewait_sock_ops dccp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct inet_timewait_sock), +}; + struct proto dccp_prot = { .name = "DCCP", .owner = THIS_MODULE, @@ -1332,5 +1337,7 @@ struct proto dccp_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp_sock), .rsk_prot = &dccp_request_sock_ops, - .twsk_obj_size = sizeof(struct inet_timewait_sock), + .twsk_prot = &dccp_timewait_sock_ops, }; + +EXPORT_SYMBOL_GPL(dccp_prot); diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index a7d2aee..4d078f5 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -652,6 +652,10 @@ static struct request_sock_ops dccp6_request_sock_ops = { .send_reset = dccp_v6_ctl_send_reset, }; +static struct timewait_sock_ops dccp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct dccp6_timewait_sock), +}; + static void dccp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -1359,7 +1363,7 @@ static struct proto dccp_v6_prot = { .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), .rsk_prot = &dccp6_request_sock_ops, - .twsk_obj_size = sizeof(struct dccp6_timewait_sock), + .twsk_prot = &dccp6_timewait_sock_ops, }; static struct inet6_protocol dccp_v6_protocol = { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index a010e9a..417f126 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -90,8 +90,9 @@ EXPORT_SYMBOL_GPL(__inet_twsk_hashdance); struct inet_timewait_sock *inet_twsk_alloc(const struct sock *sk, const int state) { - struct inet_timewait_sock *tw = kmem_cache_alloc(sk->sk_prot_creator->twsk_slab, - SLAB_ATOMIC); + struct inet_timewait_sock *tw = + kmem_cache_alloc(sk->sk_prot_creator->twsk_prot->twsk_slab, + SLAB_ATOMIC); if (tw != NULL) { const struct inet_sock *inet = inet_sk(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 0b5ab04..6728772 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include @@ -118,6 +119,39 @@ static inline __u32 tcp_v4_init_sequence(struct sock *sk, struct sk_buff *skb) skb->h.th->source); } +int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) +{ + const struct tcp_timewait_sock *tcptw = tcp_twsk(sktw); + struct tcp_sock *tp = tcp_sk(sk); + + /* With PAWS, it is safe from the viewpoint + of data integrity. Even without PAWS it is safe provided sequence + spaces do not overlap i.e. at data rates <= 80Mbit/sec. + + Actually, the idea is close to VJ's one, only timestamp cache is + held not per host, but per port pair and TW bucket is used as state + holder. + + If TW bucket has been already destroyed we fall back to VJ's scheme + and use initial timestamp retrieved from peer table. + */ + if (tcptw->tw_ts_recent_stamp && + (twp == NULL || (sysctl_tcp_tw_reuse && + xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { + tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; + if (tp->write_seq == 0) + tp->write_seq = 1; + tp->rx_opt.ts_recent = tcptw->tw_ts_recent; + tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; + sock_hold(sktw); + return 1; + } + + return 0; +} + +EXPORT_SYMBOL_GPL(tcp_twsk_unique); + /* called with local bh disabled */ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, struct inet_timewait_sock **twp) @@ -142,35 +176,9 @@ static int __tcp_v4_check_established(struct sock *sk, __u16 lport, tw = inet_twsk(sk2); if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - /* With PAWS, it is safe from the viewpoint - of data integrity. Even without PAWS it - is safe provided sequence spaces do not - overlap i.e. at data rates <= 80Mbit/sec. - - Actually, the idea is close to VJ's one, - only timestamp cache is held not per host, - but per port pair and TW bucket is used - as state holder. - - If TW bucket has been already destroyed we - fall back to VJ's scheme and use initial - timestamp retrieved from peer table. - */ - if (tcptw->tw_ts_recent_stamp && - (!twp || (sysctl_tcp_tw_reuse && - xtime.tv_sec - - tcptw->tw_ts_recent_stamp > 1))) { - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (tp->write_seq == 0) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); + if (twsk_unique(sk, sk2, twp)) goto unique; - } else + else goto not_unique; } } @@ -869,6 +877,11 @@ struct request_sock_ops tcp_request_sock_ops = { .send_reset = tcp_v4_send_reset, }; +static struct timewait_sock_ops tcp_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + int tcp_v4_conn_request(struct sock *sk, struct sk_buff *skb) { struct inet_request_sock *ireq; @@ -1979,7 +1992,7 @@ struct proto tcp_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp_sock), - .twsk_obj_size = sizeof(struct tcp_timewait_sock), + .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, }; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index e5c8a66..514b57b 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -60,6 +60,7 @@ #include #include #include +#include #include @@ -147,22 +148,9 @@ static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - const struct tcp_timewait_sock *tcptw = tcp_twsk(sk2); - struct tcp_sock *tp = tcp_sk(sk); - - if (tcptw->tw_ts_recent_stamp && - (!twp || - (sysctl_tcp_tw_reuse && - xtime.tv_sec - tcptw->tw_ts_recent_stamp > 1))) { - /* See comment in tcp_ipv4.c */ - tp->write_seq = tcptw->tw_snd_nxt + 65535 + 2; - if (!tp->write_seq) - tp->write_seq = 1; - tp->rx_opt.ts_recent = tcptw->tw_ts_recent; - tp->rx_opt.ts_recent_stamp = tcptw->tw_ts_recent_stamp; - sock_hold(sk2); + if (twsk_unique(sk, sk2, twp)) goto unique; - } else + else goto not_unique; } } @@ -711,6 +699,11 @@ static struct request_sock_ops tcp6_request_sock_ops = { .send_reset = tcp_v6_send_reset }; +static struct timewait_sock_ops tcp6_timewait_sock_ops = { + .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_unique = tcp_twsk_unique, +}; + static void tcp_v6_send_check(struct sock *sk, int len, struct sk_buff *skb) { struct ipv6_pinfo *np = inet6_sk(sk); @@ -1752,7 +1745,7 @@ struct proto tcpv6_prot = { .sysctl_rmem = sysctl_tcp_rmem, .max_header = MAX_TCP_HEADER, .obj_size = sizeof(struct tcp6_sock), - .twsk_obj_size = sizeof(struct tcp6_timewait_sock), + .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, }; -- cgit v1.1 From a7f5e7f164788a22eb5d3de8e2d3cee1bf58fdca Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:31 -0800 Subject: [INET]: Generalise tcp_v4_hash_connect Renaming it to inet_hash_connect, making it possible to ditch dccp_v4_hash_connect and share the same code with TCP instead. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/char/random.c | 6 +- include/linux/random.h | 2 +- include/net/inet_hashtables.h | 3 + net/dccp/ipv4.c | 160 +------------------------------------ net/ipv4/inet_hashtables.c | 178 ++++++++++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 173 +--------------------------------------- 6 files changed, 186 insertions(+), 336 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 7999da2..79b59d9 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1554,10 +1554,8 @@ __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, EXPORT_SYMBOL(secure_tcp_sequence_number); - - -/* Generate secure starting point for ephemeral TCP port search */ -u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) +/* Generate secure starting point for ephemeral IPV4 transport port search */ +u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) { struct keydata *keyptr = get_keyptr(); u32 hash[4]; diff --git a/include/linux/random.h b/include/linux/random.h index 7b2adb3..01424a8 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -52,7 +52,7 @@ extern void get_random_bytes(void *buf, int nbytes); void generate_random_uuid(unsigned char uuid_out[16]); extern __u32 secure_ip_id(__u32 daddr); -extern u32 secure_tcp_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); +extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index 07840ba..c83baa7 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -434,4 +434,7 @@ static inline struct sock *inet_lookup(struct inet_hashinfo *hashinfo, return sk; } + +extern int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); #endif /* _INET_HASHTABLES_H */ diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index e11cda0..671fbf3 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -54,164 +54,6 @@ void dccp_unhash(struct sock *sk) EXPORT_SYMBOL_GPL(dccp_unhash); -/* called with local bh disabled */ -static int __dccp_v4_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const u32 daddr = inet->rcv_saddr; - const u32 saddr = inet->daddr; - const int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - const struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp != NULL) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw != NULL) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -/* - * Bind a port for a connect operation and hash it. - */ -static int dccp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int remaining = (high - low) + 1; - int rover = net_random() % (high - low) + low; - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - do { - head = &dccp_hashinfo.bhash[inet_bhashfn(rover, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == rover) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v4_check_established(sk, - rover, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, rover); - if (tb == NULL) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - if (++rover > high) - rover = low; - } while (--remaining > 0); - - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - /* All locks still held and bhs disabled */ - inet_bind_hash(sk, tb, rover); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(rover); - __inet_hash(&dccp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw != NULL) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { - __inet_hash(&dccp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_sock *inet = inet_sk(sk); @@ -272,7 +114,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v4_hash_connect(sk); + err = inet_hash_connect(&dccp_death_row, sk); if (err != 0) goto failure; diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index e8d29fe..3322811 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -15,12 +15,14 @@ #include #include +#include #include #include #include #include #include +#include /* * Allocate and initialize a new local port bind bucket. @@ -163,3 +165,179 @@ struct sock *__inet_lookup_listener(const struct hlist_head *head, const u32 dad } EXPORT_SYMBOL_GPL(__inet_lookup_listener); + +/* called with local bh disabled */ +static int __inet_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + struct inet_sock *inet = inet_sk(sk); + u32 daddr = inet->rcv_saddr; + u32 saddr = inet->daddr; + int dif = sk->sk_bound_dev_if; + INET_ADDR_COOKIE(acookie, saddr, daddr) + const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + tw = inet_twsk(sk2); + + if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + /* Must record num and sport now. Otherwise we will see + * in hash table socket with a funny identity. */ + inet->num = lport; + inet->sport = htons(lport); + sk->sk_hash = hash; + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + return secure_ipv4_port_ephemeral(inet->rcv_saddr, inet->daddr, + inet->dport); +} + +/* + * Bind a port for a connect operation and hash it. + */ +int inet_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (!snum) { + int low = sysctl_local_port_range[0]; + int high = sysctl_local_port_range[1]; + int range = high - low; + int i; + int port; + static u32 hint; + u32 offset = hint + inet_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet_hash(hinfo, sk, 0); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row);; + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { + __inet_hash(hinfo, sk, 0); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet_hash_connect); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 6728772..c2fe61b 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -152,177 +152,6 @@ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp) EXPORT_SYMBOL_GPL(tcp_twsk_unique); -/* called with local bh disabled */ -static int __tcp_v4_check_established(struct sock *sk, __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - u32 daddr = inet->rcv_saddr; - u32 saddr = inet->daddr; - int dif = sk->sk_bound_dev_if; - INET_ADDR_COOKIE(acookie, saddr, daddr) - const __u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet_ehashfn(daddr, lport, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - tw = inet_twsk(sk2); - - if (INET_TW_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET_MATCH(sk2, hash, acookie, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - /* Must record num and sport now. Otherwise we will see - * in hash table socket with a funny identity. */ - inet->num = lport; - inet->sport = htons(lport); - sk->sk_hash = hash; - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 connect_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - - return secure_tcp_port_ephemeral(inet->rcv_saddr, inet->daddr, - inet->dport); -} - -/* - * Bind a port for a connect operation and hash it. - */ -static inline int tcp_v4_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + connect_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v4_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet_hash(&tcp_hashinfo, sk, 0); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row);; - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet_hash(&tcp_hashinfo, sk, 0); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v4_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - /* This will initiate an outgoing connection. */ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -403,7 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) * complete initialization after this. */ tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v4_hash_connect(sk); + err = inet_hash_connect(&tcp_death_row, sk); if (err) goto failure; -- cgit v1.1 From d8313f5ca2b1f86b7df6c99fc4b3fffa1f84e92b Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:44 -0800 Subject: [INET6]: Generalise tcp_v6_hash_connect Renaming it to inet6_hash_connect, making it possible to ditch dccp_v6_hash_connect and share the same code with TCP instead. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/char/random.c | 4 +- include/linux/random.h | 4 +- include/net/ipv6.h | 3 + net/dccp/ipv6.c | 171 +---------------------------------------- net/ipv6/inet6_hashtables.c | 183 +++++++++++++++++++++++++++++++++++++++++++- net/ipv6/tcp_ipv6.c | 173 +---------------------------------------- 6 files changed, 190 insertions(+), 348 deletions(-) diff --git a/drivers/char/random.c b/drivers/char/random.c index 79b59d9..bdfdfd2 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1573,7 +1573,7 @@ u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport) } #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) -u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) +u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dport) { struct keydata *keyptr = get_keyptr(); u32 hash[12]; @@ -1584,7 +1584,7 @@ u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, __u16 dp return twothirdsMD4Transform(daddr, hash); } -EXPORT_SYMBOL(secure_tcpv6_port_ephemeral); +EXPORT_SYMBOL(secure_ipv6_port_ephemeral); #endif #if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE) diff --git a/include/linux/random.h b/include/linux/random.h index 01424a8..5d6456b 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -53,8 +53,8 @@ void generate_random_uuid(unsigned char uuid_out[16]); extern __u32 secure_ip_id(__u32 daddr); extern u32 secure_ipv4_port_ephemeral(__u32 saddr, __u32 daddr, __u16 dport); -extern u32 secure_tcpv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, - __u16 dport); +extern u32 secure_ipv6_port_ephemeral(const __u32 *saddr, const __u32 *daddr, + __u16 dport); extern __u32 secure_tcp_sequence_number(__u32 saddr, __u32 daddr, __u16 sport, __u16 dport); extern __u32 secure_tcpv6_sequence_number(__u32 *saddr, __u32 *daddr, diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 8513761..e3d5d7b 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -527,6 +527,9 @@ extern int inet6_getname(struct socket *sock, struct sockaddr *uaddr, extern int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); +extern int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk); + /* * reassembly.c */ diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 4d078f5..71bf04e 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -84,175 +84,6 @@ static __u32 dccp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) dh->dccph_sport); } -static int __dccp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - const unsigned int hash = inet6_ehashfn(daddr, inet->num, - saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&dccp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + dccp_hashinfo.ehash_size)->chain) { - const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) - goto not_unique; - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &dccp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 dccp_v6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int dccp_v6_hash_connect(struct sock *sk) -{ - const unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (snum == 0) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + dccp_v6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &dccp_hashinfo.bhash[inet_bhashfn(port, - dccp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__dccp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(dccp_hashinfo.bind_bucket_cachep, - head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet6_hash(&dccp_hashinfo, sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &dccp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &dccp_hashinfo.bhash[inet_bhashfn(snum, - dccp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet6_hash(&dccp_hashinfo, sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __dccp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -403,7 +234,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin6_port; dccp_set_state(sk, DCCP_REQUESTING); - err = dccp_v6_hash_connect(sk); + err = inet6_hash_connect(&dccp_death_row, sk); if (err) goto late_failure; /* FIXME */ diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 01d5f46..4154f3a 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -5,7 +5,8 @@ * * Generic INET6 transport hashtables * - * Authors: Lotsa people, from code originally in tcp + * Authors: Lotsa people, from code originally in tcp, generalised here + * by Arnaldo Carvalho de Melo * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -14,12 +15,13 @@ */ #include - #include +#include #include #include #include +#include struct sock *inet6_lookup_listener(struct inet_hashinfo *hashinfo, const struct in6_addr *daddr, @@ -79,3 +81,180 @@ struct sock *inet6_lookup(struct inet_hashinfo *hashinfo, } EXPORT_SYMBOL_GPL(inet6_lookup); + +static int __inet6_check_established(struct inet_timewait_death_row *death_row, + struct sock *sk, const __u16 lport, + struct inet_timewait_sock **twp) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + const struct in6_addr *daddr = &np->rcv_saddr; + const struct in6_addr *saddr = &np->daddr; + const int dif = sk->sk_bound_dev_if; + const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); + const unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, + inet->dport); + struct inet_ehash_bucket *head = inet_ehash_bucket(hinfo, hash); + struct sock *sk2; + const struct hlist_node *node; + struct inet_timewait_sock *tw; + + prefetch(head->chain.first); + write_lock(&head->lock); + + /* Check TIME-WAIT sockets first. */ + sk_for_each(sk2, node, &(head + hinfo->ehash_size)->chain) { + const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); + + tw = inet_twsk(sk2); + + if(*((__u32 *)&(tw->tw_dport)) == ports && + sk2->sk_family == PF_INET6 && + ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && + ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && + sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { + if (twsk_unique(sk, sk2, twp)) + goto unique; + else + goto not_unique; + } + } + tw = NULL; + + /* And established part... */ + sk_for_each(sk2, node, &head->chain) { + if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) + goto not_unique; + } + +unique: + BUG_TRAP(sk_unhashed(sk)); + __sk_add_node(sk, &head->chain); + sk->sk_hash = hash; + sock_prot_inc_use(sk->sk_prot); + write_unlock(&head->lock); + + if (twp != NULL) { + *twp = tw; + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + } else if (tw != NULL) { + /* Silly. Should hash-dance instead... */ + inet_twsk_deschedule(tw, death_row); + NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); + + inet_twsk_put(tw); + } + return 0; + +not_unique: + write_unlock(&head->lock); + return -EADDRNOTAVAIL; +} + +static inline u32 inet6_sk_port_offset(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const struct ipv6_pinfo *np = inet6_sk(sk); + return secure_ipv6_port_ephemeral(np->rcv_saddr.s6_addr32, + np->daddr.s6_addr32, + inet->dport); +} + +int inet6_hash_connect(struct inet_timewait_death_row *death_row, + struct sock *sk) +{ + struct inet_hashinfo *hinfo = death_row->hashinfo; + const unsigned short snum = inet_sk(sk)->num; + struct inet_bind_hashbucket *head; + struct inet_bind_bucket *tb; + int ret; + + if (snum == 0) { + const int low = sysctl_local_port_range[0]; + const int high = sysctl_local_port_range[1]; + const int range = high - low; + int i, port; + static u32 hint; + const u32 offset = hint + inet6_sk_port_offset(sk); + struct hlist_node *node; + struct inet_timewait_sock *tw = NULL; + + local_bh_disable(); + for (i = 1; i <= range; i++) { + port = low + (i + offset) % range; + head = &hinfo->bhash[inet_bhashfn(port, hinfo->bhash_size)]; + spin_lock(&head->lock); + + /* Does not bother with rcv_saddr checks, + * because the established check is already + * unique enough. + */ + inet_bind_bucket_for_each(tb, node, &head->chain) { + if (tb->port == port) { + BUG_TRAP(!hlist_empty(&tb->owners)); + if (tb->fastreuse >= 0) + goto next_port; + if (!__inet6_check_established(death_row, + sk, port, + &tw)) + goto ok; + goto next_port; + } + } + + tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep, + head, port); + if (!tb) { + spin_unlock(&head->lock); + break; + } + tb->fastreuse = -1; + goto ok; + + next_port: + spin_unlock(&head->lock); + } + local_bh_enable(); + + return -EADDRNOTAVAIL; + +ok: + hint += i; + + /* Head lock still held and bh's disabled */ + inet_bind_hash(sk, tb, port); + if (sk_unhashed(sk)) { + inet_sk(sk)->sport = htons(port); + __inet6_hash(hinfo, sk); + } + spin_unlock(&head->lock); + + if (tw) { + inet_twsk_deschedule(tw, death_row); + inet_twsk_put(tw); + } + + ret = 0; + goto out; + } + + head = &hinfo->bhash[inet_bhashfn(snum, hinfo->bhash_size)]; + tb = inet_csk(sk)->icsk_bind_hash; + spin_lock_bh(&head->lock); + + if (sk_head(&tb->owners) == sk && sk->sk_bind_node.next == NULL) { + __inet6_hash(hinfo, sk); + spin_unlock_bh(&head->lock); + return 0; + } else { + spin_unlock(&head->lock); + /* No definite answer... Walk to established hash table */ + ret = __inet6_check_established(death_row, sk, snum, NULL); +out: + local_bh_enable(); + return ret; + } +} + +EXPORT_SYMBOL_GPL(inet6_hash_connect); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 514b57b..a682eb9 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -119,177 +119,6 @@ static __u32 tcp_v6_init_sequence(struct sock *sk, struct sk_buff *skb) } } -static int __tcp_v6_check_established(struct sock *sk, const __u16 lport, - struct inet_timewait_sock **twp) -{ - struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - const struct in6_addr *daddr = &np->rcv_saddr; - const struct in6_addr *saddr = &np->daddr; - const int dif = sk->sk_bound_dev_if; - const u32 ports = INET_COMBINED_PORTS(inet->dport, lport); - unsigned int hash = inet6_ehashfn(daddr, inet->num, saddr, inet->dport); - struct inet_ehash_bucket *head = inet_ehash_bucket(&tcp_hashinfo, hash); - struct sock *sk2; - const struct hlist_node *node; - struct inet_timewait_sock *tw; - - prefetch(head->chain.first); - write_lock(&head->lock); - - /* Check TIME-WAIT sockets first. */ - sk_for_each(sk2, node, &(head + tcp_hashinfo.ehash_size)->chain) { - const struct inet6_timewait_sock *tw6 = inet6_twsk(sk2); - - tw = inet_twsk(sk2); - - if(*((__u32 *)&(tw->tw_dport)) == ports && - sk2->sk_family == PF_INET6 && - ipv6_addr_equal(&tw6->tw_v6_daddr, saddr) && - ipv6_addr_equal(&tw6->tw_v6_rcv_saddr, daddr) && - sk2->sk_bound_dev_if == sk->sk_bound_dev_if) { - if (twsk_unique(sk, sk2, twp)) - goto unique; - else - goto not_unique; - } - } - tw = NULL; - - /* And established part... */ - sk_for_each(sk2, node, &head->chain) { - if (INET6_MATCH(sk2, hash, saddr, daddr, ports, dif)) - goto not_unique; - } - -unique: - BUG_TRAP(sk_unhashed(sk)); - __sk_add_node(sk, &head->chain); - sk->sk_hash = hash; - sock_prot_inc_use(sk->sk_prot); - write_unlock(&head->lock); - - if (twp) { - *twp = tw; - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - } else if (tw) { - /* Silly. Should hash-dance instead... */ - inet_twsk_deschedule(tw, &tcp_death_row); - NET_INC_STATS_BH(LINUX_MIB_TIMEWAITRECYCLED); - - inet_twsk_put(tw); - } - return 0; - -not_unique: - write_unlock(&head->lock); - return -EADDRNOTAVAIL; -} - -static inline u32 tcpv6_port_offset(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const struct ipv6_pinfo *np = inet6_sk(sk); - - return secure_tcpv6_port_ephemeral(np->rcv_saddr.s6_addr32, - np->daddr.s6_addr32, - inet->dport); -} - -static int tcp_v6_hash_connect(struct sock *sk) -{ - unsigned short snum = inet_sk(sk)->num; - struct inet_bind_hashbucket *head; - struct inet_bind_bucket *tb; - int ret; - - if (!snum) { - int low = sysctl_local_port_range[0]; - int high = sysctl_local_port_range[1]; - int range = high - low; - int i; - int port; - static u32 hint; - u32 offset = hint + tcpv6_port_offset(sk); - struct hlist_node *node; - struct inet_timewait_sock *tw = NULL; - - local_bh_disable(); - for (i = 1; i <= range; i++) { - port = low + (i + offset) % range; - head = &tcp_hashinfo.bhash[inet_bhashfn(port, tcp_hashinfo.bhash_size)]; - spin_lock(&head->lock); - - /* Does not bother with rcv_saddr checks, - * because the established check is already - * unique enough. - */ - inet_bind_bucket_for_each(tb, node, &head->chain) { - if (tb->port == port) { - BUG_TRAP(!hlist_empty(&tb->owners)); - if (tb->fastreuse >= 0) - goto next_port; - if (!__tcp_v6_check_established(sk, - port, - &tw)) - goto ok; - goto next_port; - } - } - - tb = inet_bind_bucket_create(tcp_hashinfo.bind_bucket_cachep, head, port); - if (!tb) { - spin_unlock(&head->lock); - break; - } - tb->fastreuse = -1; - goto ok; - - next_port: - spin_unlock(&head->lock); - } - local_bh_enable(); - - return -EADDRNOTAVAIL; - -ok: - hint += i; - - /* Head lock still held and bh's disabled */ - inet_bind_hash(sk, tb, port); - if (sk_unhashed(sk)) { - inet_sk(sk)->sport = htons(port); - __inet6_hash(&tcp_hashinfo, sk); - } - spin_unlock(&head->lock); - - if (tw) { - inet_twsk_deschedule(tw, &tcp_death_row); - inet_twsk_put(tw); - } - - ret = 0; - goto out; - } - - head = &tcp_hashinfo.bhash[inet_bhashfn(snum, tcp_hashinfo.bhash_size)]; - tb = inet_csk(sk)->icsk_bind_hash; - spin_lock_bh(&head->lock); - - if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) { - __inet6_hash(&tcp_hashinfo, sk); - spin_unlock_bh(&head->lock); - return 0; - } else { - spin_unlock(&head->lock); - /* No definite answer... Walk to established hash table */ - ret = __tcp_v6_check_established(sk, snum, NULL); -out: - local_bh_enable(); - return ret; - } -} - static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { @@ -450,7 +279,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - err = tcp_v6_hash_connect(sk); + err = inet6_hash_connect(&tcp_death_row, sk); if (err) goto late_failure; -- cgit v1.1 From 22712813620fa8e682dbfb253a60ca0131da1e07 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:25:56 -0800 Subject: [TCP]: Move the TCPF_ enum to tcp_states.h Upcoming patches will make, for instance, ip_sockglue.c need just this enum and not all of tcp.h. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/tcp.h | 16 ---------------- include/net/tcp_states.h | 16 ++++++++++++++++ 2 files changed, 16 insertions(+), 16 deletions(-) diff --git a/include/linux/tcp.h b/include/linux/tcp.h index 4e14340..da38eea 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -55,22 +55,6 @@ struct tcphdr { __u16 urg_ptr; }; -#define TCP_ACTION_FIN (1 << 7) - -enum { - TCPF_ESTABLISHED = (1 << 1), - TCPF_SYN_SENT = (1 << 2), - TCPF_SYN_RECV = (1 << 3), - TCPF_FIN_WAIT1 = (1 << 4), - TCPF_FIN_WAIT2 = (1 << 5), - TCPF_TIME_WAIT = (1 << 6), - TCPF_CLOSE = (1 << 7), - TCPF_CLOSE_WAIT = (1 << 8), - TCPF_LAST_ACK = (1 << 9), - TCPF_LISTEN = (1 << 10), - TCPF_CLOSING = (1 << 11) -}; - /* * The union cast uses a gcc extension to avoid aliasing problems * (union is compatible to any of its members) diff --git a/include/net/tcp_states.h b/include/net/tcp_states.h index b9d4176..b0b6459 100644 --- a/include/net/tcp_states.h +++ b/include/net/tcp_states.h @@ -31,4 +31,20 @@ enum { #define TCP_STATE_MASK 0xF +#define TCP_ACTION_FIN (1 << 7) + +enum { + TCPF_ESTABLISHED = (1 << 1), + TCPF_SYN_SENT = (1 << 2), + TCPF_SYN_RECV = (1 << 3), + TCPF_FIN_WAIT1 = (1 << 4), + TCPF_FIN_WAIT2 = (1 << 5), + TCPF_TIME_WAIT = (1 << 6), + TCPF_CLOSE = (1 << 7), + TCPF_CLOSE_WAIT = (1 << 8), + TCPF_LAST_ACK = (1 << 9), + TCPF_LISTEN = (1 << 10), + TCPF_CLOSING = (1 << 11) +}; + #endif /* _LINUX_TCP_STATES_H */ -- cgit v1.1 From d83d8461f902c672bc1bd8fbc6a94e19f092da97 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 13 Dec 2005 23:26:10 -0800 Subject: [IP_SOCKGLUE]: Remove most of the tcp specific calls As DCCP needs to be called in the same spots. Now we have a member in inet_sock (is_icsk), set at sock creation time from struct inet_protosw->flags (if INET_PROTOSW_ICSK is set, like for TCP and DCCP) to see if a struct sock instance is a inet_connection_sock for places like the ones in ip_sockglue.c (v4 and v6) where we previously were looking if sk_type was SOCK_STREAM, that is insufficient because we now use the same code for DCCP, that has sk_type SOCK_DCCP. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/linux/dccp.h | 4 ---- include/linux/ip.h | 1 + include/linux/tcp.h | 3 +-- include/net/inet_connection_sock.h | 6 +++++- include/net/protocol.h | 1 + net/dccp/diag.c | 2 +- net/dccp/input.c | 2 +- net/dccp/ipv4.c | 12 +++++++----- net/dccp/ipv6.c | 26 ++++++++++++++------------ net/dccp/output.c | 7 ++++--- net/dccp/proto.c | 2 +- net/ipv4/af_inet.c | 4 +++- net/ipv4/ip_sockglue.c | 13 ++++++------- net/ipv4/tcp.c | 2 +- net/ipv4/tcp_input.c | 10 ++++------ net/ipv4/tcp_ipv4.c | 12 ++++++------ net/ipv4/tcp_output.c | 18 ++++++++++-------- net/ipv6/af_inet6.c | 1 + net/ipv6/ipv6_sockglue.c | 24 +++++++++++++----------- net/ipv6/tcp_ipv6.c | 30 +++++++++++++++++------------- 20 files changed, 97 insertions(+), 83 deletions(-) diff --git a/include/linux/dccp.h b/include/linux/dccp.h index 71fab43..d0bdb49 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -408,8 +408,6 @@ struct dccp_ackvec; * @dccps_gar - greatest valid ack number received on a non-Sync; initialized to %dccps_iss * @dccps_timestamp_time - time of latest TIMESTAMP option * @dccps_timestamp_echo - latest timestamp received on a TIMESTAMP option - * @dccps_ext_header_len - network protocol overhead (IP/IPv6 options) - * @dccps_pmtu_cookie - Last pmtu seen by socket * @dccps_packet_size - Set thru setsockopt * @dccps_role - Role of this sock, one of %dccp_role * @dccps_ndp_count - number of Non Data Packets since last data packet @@ -434,8 +432,6 @@ struct dccp_sock { __u32 dccps_timestamp_echo; __u32 dccps_packet_size; unsigned long dccps_ndp_count; - __u16 dccps_ext_header_len; - __u32 dccps_pmtu_cookie; __u32 dccps_mss_cache; struct dccp_options dccps_options; struct dccp_ackvec *dccps_hc_rx_ackvec; diff --git a/include/linux/ip.h b/include/linux/ip.h index 5a560da..6ccc596 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -155,6 +155,7 @@ struct inet_sock { __u8 mc_ttl; /* Multicasting TTL */ __u8 pmtudisc; unsigned recverr : 1, + is_icsk : 1, /* inet_connection_sock? */ freebind : 1, hdrincl : 1, mc_loop : 1; diff --git a/include/linux/tcp.h b/include/linux/tcp.h index da38eea..f2bb239 100644 --- a/include/linux/tcp.h +++ b/include/linux/tcp.h @@ -238,10 +238,9 @@ struct tcp_sock { __u32 snd_wl1; /* Sequence for window update */ __u32 snd_wnd; /* The window we expect to receive */ __u32 max_window; /* Maximal window ever seen from peer */ - __u32 pmtu_cookie; /* Last pmtu seen by socket */ __u32 mss_cache; /* Cached effective mss, not including SACKS */ __u16 xmit_size_goal; /* Goal for segmenting output packets */ - __u16 ext_header_len; /* Network protocol overhead (IP/IPv6 options) */ + /* XXX Two bytes hole, try to pack */ __u32 window_clamp; /* Maximal window to advertise */ __u32 rcv_ssthresh; /* Current window clamp */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index e50e2b8..9188896 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -60,6 +60,7 @@ struct inet_connection_sock_af_ops { * @icsk_timeout: Timeout * @icsk_retransmit_timer: Resend (no ack) * @icsk_rto: Retransmit timeout + * @icsk_pmtu_cookie Last pmtu seen by socket * @icsk_ca_ops Pluggable congestion control hook * @icsk_af_ops Operations which are AF_INET{4,6} specific * @icsk_ca_state: Congestion control state @@ -68,6 +69,7 @@ struct inet_connection_sock_af_ops { * @icsk_backoff: Backoff * @icsk_syn_retries: Number of allowed SYN (or equivalent) retries * @icsk_probes_out: unanswered 0 window probes + * @icsk_ext_hdr_len: Network protocol overhead (IP/IPv6 options) * @icsk_ack: Delayed ACK control data */ struct inet_connection_sock { @@ -79,15 +81,17 @@ struct inet_connection_sock { struct timer_list icsk_retransmit_timer; struct timer_list icsk_delack_timer; __u32 icsk_rto; + __u32 icsk_pmtu_cookie; struct tcp_congestion_ops *icsk_ca_ops; struct inet_connection_sock_af_ops *icsk_af_ops; + unsigned int (*icsk_sync_mss)(struct sock *sk, u32 pmtu); __u8 icsk_ca_state; __u8 icsk_retransmits; __u8 icsk_pending; __u8 icsk_backoff; __u8 icsk_syn_retries; __u8 icsk_probes_out; - /* 2 BYTES HOLE, TRY TO PACK! */ + __u16 icsk_ext_hdr_len; struct { __u8 pending; /* ACK is pending */ __u8 quick; /* Scheduled number of quick acks */ diff --git a/include/net/protocol.h b/include/net/protocol.h index 357691f..a29cb29 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -76,6 +76,7 @@ struct inet_protosw { }; #define INET_PROTOSW_REUSE 0x01 /* Are ports automatically reusable? */ #define INET_PROTOSW_PERMANENT 0x02 /* Permanent protocols are unremovable. */ +#define INET_PROTOSW_ICSK 0x04 /* Is this an inet_connection_sock? */ extern struct net_protocol *inet_protocol_base; extern struct net_protocol *inet_protos[MAX_INET_PROTOS]; diff --git a/net/dccp/diag.c b/net/dccp/diag.c index f675d8e6..3f78c00 100644 --- a/net/dccp/diag.c +++ b/net/dccp/diag.c @@ -28,7 +28,7 @@ static void dccp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_retransmits = icsk->icsk_retransmits; info->tcpi_probes = icsk->icsk_probes_out; info->tcpi_backoff = icsk->icsk_backoff; - info->tcpi_pmtu = dp->dccps_pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; if (dp->dccps_options.dccpo_send_ack_vector) info->tcpi_options |= TCPI_OPT_SACK; diff --git a/net/dccp/input.c b/net/dccp/input.c index 9a724ff..55e921b 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -311,7 +311,7 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } - dccp_sync_mss(sk, dp->dccps_pmtu_cookie); + dccp_sync_mss(sk, icsk->icsk_pmtu_cookie); /* * Step 10: Process REQUEST state (second part) diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 671fbf3..c363051 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -104,9 +104,9 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - dp->dccps_ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt != NULL) - dp->dccps_ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; /* * Socket identity is still unknown (sport may be zero). * However we set state to DCCP_REQUESTING and not releasing socket @@ -191,7 +191,7 @@ static inline void dccp_do_pmtu_discovery(struct sock *sk, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - dp->dccps_pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { dccp_sync_mss(sk, mtu); /* @@ -1051,6 +1051,7 @@ struct inet_connection_sock_af_ops dccp_ipv4_af_ops = { int dccp_v4_init_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); static int dccp_ctl_socket_init = 1; dccp_options_init(&dp->dccps_options); @@ -1090,10 +1091,11 @@ int dccp_v4_init_sock(struct sock *sk) dccp_ctl_socket_init = 0; dccp_init_xmit_timers(sk); - inet_csk(sk)->icsk_rto = DCCP_TIMEOUT_INIT; + icsk->icsk_rto = DCCP_TIMEOUT_INIT; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; - inet_csk(sk)->icsk_af_ops = &dccp_ipv4_af_ops; + icsk->icsk_af_ops = &dccp_ipv4_af_ops; + icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_role = DCCP_ROLE_UNDEFINED; dp->dccps_service = DCCP_SERVICE_INVALID_VALUE; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 71bf04e..599b0be 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -88,6 +88,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; + struct inet_connection_sock *icsk = inet_csk(sk); struct inet_sock *inet = inet_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct dccp_sock *dp = dccp_sk(sk); @@ -158,7 +159,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = dp->dccps_ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -170,14 +171,14 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_mapped; + icsk->icsk_af_ops = &dccp_ipv6_mapped; sk->sk_backlog_rcv = dccp_v4_do_rcv; err = dccp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - dp->dccps_ext_header_len = exthdrlen; - inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &dccp_ipv6_af_ops; sk->sk_backlog_rcv = dccp_v6_do_rcv; goto failure; } else { @@ -227,9 +228,10 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr, ip6_dst_store(sk, dst, NULL); - dp->dccps_ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - dp->dccps_ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); inet->dport = usin->sin6_port; @@ -292,7 +294,6 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, np = inet6_sk(sk); if (type == ICMPV6_PKT_TOOBIG) { - struct dccp_sock *dp = dccp_sk(sk); struct dst_entry *dst = NULL; if (sock_owned_by_user(sk)) @@ -332,7 +333,7 @@ static void dccp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (dp->dccps_pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { dccp_sync_mss(sk, dst_mtu(dst)); } /* else let the usual retransmit timer handle it */ dst_release(dst); @@ -808,7 +809,7 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, worked with IPv6 icsk.icsk_af_ops. Sync it now. */ - dccp_sync_mss(newsk, newdp->dccps_pmtu_cookie); + dccp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -916,10 +917,10 @@ static struct sock *dccp_v6_request_recv_sock(struct sock *sk, sock_kfree_s(sk, opt, opt->tot_len); } - newdp->dccps_ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newdp->dccps_ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); dccp_sync_mss(newsk, dst_mtu(dst)); @@ -1230,6 +1231,7 @@ static struct inet_protosw dccp_v6_protosw = { .prot = &dccp_v6_prot, .ops = &inet6_dccp_ops, .capability = -1, + .flags = INET_PROTOSW_ICSK, }; static int __init dccp_v6_init(void) diff --git a/net/dccp/output.c b/net/dccp/output.c index c40f7f8..95a3c2c 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -134,12 +134,13 @@ static int dccp_transmit_skb(struct sock *sk, struct sk_buff *skb) unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) { + struct inet_connection_sock *icsk = inet_csk(sk); struct dccp_sock *dp = dccp_sk(sk); - int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct dccp_hdr) - sizeof(struct dccp_hdr_ext)); /* Now subtract optional transport overhead */ - mss_now -= dp->dccps_ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* * FIXME: this should come from the CCID infrastructure, where, say, @@ -152,7 +153,7 @@ unsigned int dccp_sync_mss(struct sock *sk, u32 pmtu) mss_now -= ((5 + 6 + 10 + 6 + 6 + 6 + 3) / 4) * 4; /* And store cached results */ - dp->dccps_pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; dp->dccps_mss_cache = mss_now; return mss_now; diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 51dfacd2..40a4c68 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -712,7 +712,7 @@ static struct inet_protosw dccp_v4_protosw = { .ops = &inet_dccp_ops, .capability = -1, .no_check = 0, - .flags = 0, + .flags = INET_PROTOSW_ICSK, }; /* diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d368cf2..617e858 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -302,6 +302,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; @@ -869,7 +870,8 @@ static struct inet_protosw inetsw_array[] = .ops = &inet_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }, { diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index 4f2d872..add019c 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -29,8 +29,7 @@ #include #include #include -#include -#include +#include #include #include #include @@ -427,8 +426,8 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, err = ip_options_get_from_user(&opt, optval, optlen); if (err) break; - if (sk->sk_type == SOCK_STREAM) { - struct tcp_sock *tp = tcp_sk(sk); + if (inet->is_icsk) { + struct inet_connection_sock *icsk = inet_csk(sk); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (sk->sk_family == PF_INET || (!((1 << sk->sk_state) & @@ -436,10 +435,10 @@ int ip_setsockopt(struct sock *sk, int level, int optname, char __user *optval, inet->daddr != LOOPBACK4_IPV6)) { #endif if (inet->opt) - tp->ext_header_len -= inet->opt->optlen; + icsk->icsk_ext_hdr_len -= inet->opt->optlen; if (opt) - tp->ext_header_len += opt->optlen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len += opt->optlen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) } #endif diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index eacfe6a..00aa80e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1914,7 +1914,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_last_data_recv = jiffies_to_msecs(now - icsk->icsk_ack.lrcvtime); info->tcpi_last_ack_recv = jiffies_to_msecs(now - tp->rcv_tstamp); - info->tcpi_pmtu = tp->pmtu_cookie; + info->tcpi_pmtu = icsk->icsk_pmtu_cookie; info->tcpi_rcv_ssthresh = tp->rcv_ssthresh; info->tcpi_rtt = jiffies_to_usecs(tp->srtt)>>3; info->tcpi_rttvar = jiffies_to_usecs(tp->mdev)>>2; diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 7de6184..981d120 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2342,7 +2342,7 @@ static int tcp_ack_update_window(struct sock *sk, struct tcp_sock *tp, if (nwin > tp->max_window) { tp->max_window = nwin; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, inet_csk(sk)->icsk_pmtu_cookie); } } } @@ -3967,12 +3967,12 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, struct tcphdr *th, unsigned len) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); int saved_clamp = tp->rx_opt.mss_clamp; tcp_parse_options(skb, &tp->rx_opt, 0); if (th->ack) { - struct inet_connection_sock *icsk; /* rfc793: * "If the state is SYN-SENT then * first check the ACK bit @@ -4061,7 +4061,7 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, if (tp->rx_opt.sack_ok && sysctl_tcp_fack) tp->rx_opt.sack_ok |= 2; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); /* Remember, tcp_poll() does not lock socket! @@ -4071,8 +4071,6 @@ static int tcp_rcv_synsent_state_process(struct sock *sk, struct sk_buff *skb, mb(); tcp_set_state(sk, TCP_ESTABLISHED); - icsk = inet_csk(sk); - /* Make sure socket is routed, for correct metrics. */ icsk->icsk_af_ops->rebuild_header(sk); @@ -4173,7 +4171,7 @@ discard: if (tp->ecn_flags&TCP_ECN_OK) sock_set_flag(sk, SOCK_NO_LARGESEND); - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); tcp_initialize_rcv_mss(sk); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index c2fe61b..9b62d80 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -220,9 +220,9 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->dport = usin->sin_port; inet->daddr = daddr; - tp->ext_header_len = 0; + inet_csk(sk)->icsk_ext_hdr_len = 0; if (inet->opt) - tp->ext_header_len = inet->opt->optlen; + inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen; tp->rx_opt.mss_clamp = 536; @@ -275,7 +275,6 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); - struct tcp_sock *tp = tcp_sk(sk); /* We are not interested in TCP_LISTEN and open_requests (SYN-ACKs * send out by Linux are always <576bytes so they should go through @@ -304,7 +303,7 @@ static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, mtu = dst_mtu(dst); if (inet->pmtudisc != IP_PMTUDISC_DONT && - tp->pmtu_cookie > mtu) { + inet_csk(sk)->icsk_pmtu_cookie > mtu) { tcp_sync_mss(sk, mtu); /* Resend the TCP packet because it's @@ -895,9 +894,9 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ireq->opt = NULL; newinet->mc_index = inet_iif(skb); newinet->mc_ttl = skb->nh.iph->ttl; - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newinet->opt) - newtp->ext_header_len = newinet->opt->optlen; + inet_csk(newsk)->icsk_ext_hdr_len = newinet->opt->optlen; newinet->id = newtp->write_seq ^ jiffies; tcp_sync_mss(newsk, dst_mtu(dst)); @@ -1266,6 +1265,7 @@ static int tcp_v4_init_sock(struct sock *sk) sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); icsk->icsk_af_ops = &ipv4_specific; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_sndbuf = sysctl_tcp_wmem[1]; sk->sk_rcvbuf = sysctl_tcp_rmem[1]; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index af1946c..3a0a914 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -621,7 +621,7 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) It is minimum of user_mss and mss received with SYN. It also does not include TCP options. - tp->pmtu_cookie is last pmtu, seen by this function. + inet_csk(sk)->icsk_pmtu_cookie is last pmtu, seen by this function. tp->mss_cache is current effective sending mss, including all tcp options except for SACKs. It is evaluated, @@ -631,17 +631,18 @@ int tcp_trim_head(struct sock *sk, struct sk_buff *skb, u32 len) NOTE1. rfc1122 clearly states that advertised MSS DOES NOT include either tcp or ip options. - NOTE2. tp->pmtu_cookie and tp->mss_cache are READ ONLY outside - this function. --ANK (980731) + NOTE2. inet_csk(sk)->icsk_pmtu_cookie and tp->mss_cache + are READ ONLY outside this function. --ANK (980731) */ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) { struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); /* Calculate base mss without TCP options: It is MMS_S - sizeof(tcphdr) of rfc1122 */ - int mss_now = (pmtu - inet_csk(sk)->icsk_af_ops->net_header_len - + int mss_now = (pmtu - icsk->icsk_af_ops->net_header_len - sizeof(struct tcphdr)); /* Clamp it (mss_clamp does not include tcp options) */ @@ -649,7 +650,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = tp->rx_opt.mss_clamp; /* Now subtract optional transport overhead */ - mss_now -= tp->ext_header_len; + mss_now -= icsk->icsk_ext_hdr_len; /* Then reserve room for full set of TCP options and 8 bytes of data */ if (mss_now < 48) @@ -663,7 +664,7 @@ unsigned int tcp_sync_mss(struct sock *sk, u32 pmtu) mss_now = max((tp->max_window>>1), 68U - tp->tcp_header_len); /* And store cached results */ - tp->pmtu_cookie = pmtu; + icsk->icsk_pmtu_cookie = pmtu; tp->mss_cache = mss_now; return mss_now; @@ -693,7 +694,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (dst) { u32 mtu = dst_mtu(dst); - if (mtu != tp->pmtu_cookie) + if (mtu != inet_csk(sk)->icsk_pmtu_cookie) mss_now = tcp_sync_mss(sk, mtu); } @@ -706,7 +707,8 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) if (doing_tso) { xmit_size_goal = (65535 - inet_csk(sk)->icsk_af_ops->net_header_len - - tp->ext_header_len - tp->tcp_header_len); + inet_csk(sk)->icsk_ext_hdr_len - + tp->tcp_header_len); if (tp->max_window && (xmit_size_goal > (tp->max_window >> 1))) diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index bf17aab..70a510f 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -167,6 +167,7 @@ lookup_protocol: sk->sk_reuse = 1; inet = inet_sk(sk); + inet->is_icsk = INET_PROTOSW_ICSK & answer_flags; if (SOCK_RAW == sock->type) { inet->num = protocol; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index b6b63fa..c63868d 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -163,17 +163,17 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sk_refcnt_debug_dec(sk); if (sk->sk_protocol == IPPROTO_TCP) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); local_bh_disable(); sock_prot_dec_use(sk->sk_prot); sock_prot_inc_use(&tcp_prot); local_bh_enable(); sk->sk_prot = &tcp_prot; - inet_csk(sk)->icsk_af_ops = &ipv4_specific; + icsk->icsk_af_ops = &ipv4_specific; sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; - tcp_sync_mss(sk, tp->pmtu_cookie); + tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); } else { local_bh_disable(); sock_prot_dec_use(sk->sk_prot); @@ -317,14 +317,15 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, } retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); @@ -380,14 +381,15 @@ sticky_done: goto done; update: retv = 0; - if (sk->sk_type == SOCK_STREAM) { + if (inet_sk(sk)->is_icsk) { if (opt) { - struct tcp_sock *tp = tcp_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); if (!((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) && inet_sk(sk)->daddr != LOOPBACK4_IPV6) { - tp->ext_header_len = opt->opt_flen + opt->opt_nflen; - tcp_sync_mss(sk, tp->pmtu_cookie); + icsk->icsk_ext_hdr_len = + opt->opt_flen + opt->opt_nflen; + icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie); } } opt = xchg(&np->opt, opt); diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index a682eb9..2947bc5 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -123,7 +123,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); + struct inet_sock *inet = inet_sk(sk); + struct inet_connection_sock *icsk = inet_csk(sk); struct ipv6_pinfo *np = inet6_sk(sk); struct tcp_sock *tp = tcp_sk(sk); struct in6_addr *saddr = NULL, *final_p = NULL, final; @@ -198,7 +199,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, */ if (addr_type == IPV6_ADDR_MAPPED) { - u32 exthdrlen = tp->ext_header_len; + u32 exthdrlen = icsk->icsk_ext_hdr_len; struct sockaddr_in sin; SOCK_DEBUG(sk, "connect: ipv4 mapped\n"); @@ -210,14 +211,14 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - inet_csk(sk)->icsk_af_ops = &ipv6_mapped; + icsk->icsk_af_ops = &ipv6_mapped; sk->sk_backlog_rcv = tcp_v4_do_rcv; err = tcp_v4_connect(sk, (struct sockaddr *)&sin, sizeof(sin)); if (err) { - tp->ext_header_len = exthdrlen; - inet_csk(sk)->icsk_af_ops = &ipv6_specific; + icsk->icsk_ext_hdr_len = exthdrlen; + icsk->icsk_af_ops = &ipv6_specific; sk->sk_backlog_rcv = tcp_v6_do_rcv; goto failure; } else { @@ -270,9 +271,10 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM | NETIF_F_TSO); - tp->ext_header_len = 0; + icsk->icsk_ext_hdr_len = 0; if (np->opt) - tp->ext_header_len = np->opt->opt_flen + np->opt->opt_nflen; + icsk->icsk_ext_hdr_len = (np->opt->opt_flen + + np->opt->opt_nflen); tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr); @@ -385,7 +387,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } else dst_hold(dst); - if (tp->pmtu_cookie > dst_mtu(dst)) { + if (inet_csk(sk)->icsk_pmtu_cookie > dst_mtu(dst)) { tcp_sync_mss(sk, dst_mtu(dst)); tcp_simple_retransmit(sk); } /* else let the usual retransmit timer handle it */ @@ -869,7 +871,7 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, worked with IPv6 icsk.icsk_af_ops. Sync it now. */ - tcp_sync_mss(newsk, newtp->pmtu_cookie); + tcp_sync_mss(newsk, inet_csk(newsk)->icsk_pmtu_cookie); return newsk; } @@ -976,10 +978,10 @@ static struct sock * tcp_v6_syn_recv_sock(struct sock *sk, struct sk_buff *skb, sock_kfree_s(sk, opt, opt->tot_len); } - newtp->ext_header_len = 0; + inet_csk(newsk)->icsk_ext_hdr_len = 0; if (newnp->opt) - newtp->ext_header_len = newnp->opt->opt_nflen + - newnp->opt->opt_flen; + inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen + + newnp->opt->opt_flen); tcp_sync_mss(newsk, dst_mtu(dst)); newtp->advmss = dst_metric(dst, RTAX_ADVMSS); @@ -1361,6 +1363,7 @@ static int tcp_v6_init_sock(struct sock *sk) icsk->icsk_af_ops = &ipv6_specific; icsk->icsk_ca_ops = &tcp_init_congestion_ops; + icsk->icsk_sync_mss = tcp_sync_mss; sk->sk_write_space = sk_stream_write_space; sock_set_flag(sk, SOCK_USE_WRITE_QUEUE); @@ -1591,7 +1594,8 @@ static struct inet_protosw tcpv6_protosw = { .ops = &inet6_stream_ops, .capability = -1, .no_check = 0, - .flags = INET_PROTOSW_PERMANENT, + .flags = INET_PROTOSW_PERMANENT | + INET_PROTOSW_ICSK, }; void __init tcpv6_init(void) -- cgit v1.1 From fbe9cc4a87030d5cad5f944ffaef6af7efd119e4 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 13 Dec 2005 23:26:29 -0800 Subject: [AF_UNIX]: Use spinlock for unix_table_lock This lock is actually taken mostly as a writer, so using a rwlock actually just makes performance worse especially on chips like the Intel P4. Signed-off-by: David S. Miller --- include/net/af_unix.h | 2 +- net/unix/af_unix.c | 34 +++++++++++++++++----------------- net/unix/garbage.c | 4 ++-- 3 files changed, 20 insertions(+), 20 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index b5d785a..3f302ae 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -13,7 +13,7 @@ extern void unix_gc(void); #define UNIX_HASH_SIZE 256 extern struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -extern rwlock_t unix_table_lock; +extern spinlock_t unix_table_lock; extern atomic_t unix_tot_inflight; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1dc3685..04e850e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -121,7 +121,7 @@ int sysctl_unix_max_dgram_qlen = 10; struct hlist_head unix_socket_table[UNIX_HASH_SIZE + 1]; -DEFINE_RWLOCK(unix_table_lock); +DEFINE_SPINLOCK(unix_table_lock); static atomic_t unix_nr_socks = ATOMIC_INIT(0); #define unix_sockets_unbound (&unix_socket_table[UNIX_HASH_SIZE]) @@ -130,7 +130,7 @@ static atomic_t unix_nr_socks = ATOMIC_INIT(0); /* * SMP locking strategy: - * hash table is protected with rwlock unix_table_lock + * hash table is protected with spinlock unix_table_lock * each socket state is protected by separate rwlock. */ @@ -214,16 +214,16 @@ static void __unix_insert_socket(struct hlist_head *list, struct sock *sk) static inline void unix_remove_socket(struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_remove_socket(sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static inline void unix_insert_socket(struct hlist_head *list, struct sock *sk) { - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); __unix_insert_socket(list, sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static struct sock *__unix_find_socket_byname(struct sockaddr_un *sunname, @@ -250,11 +250,11 @@ static inline struct sock *unix_find_socket_byname(struct sockaddr_un *sunname, { struct sock *s; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); s = __unix_find_socket_byname(sunname, len, type, hash); if (s) sock_hold(s); - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -263,7 +263,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) struct sock *s; struct hlist_node *node; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); sk_for_each(s, node, &unix_socket_table[i->i_ino & (UNIX_HASH_SIZE - 1)]) { struct dentry *dentry = unix_sk(s)->dentry; @@ -276,7 +276,7 @@ static struct sock *unix_find_socket_byinode(struct inode *i) } s = NULL; found: - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); return s; } @@ -642,12 +642,12 @@ retry: addr->len = sprintf(addr->name->sun_path+1, "%05x", ordernum) + 1 + sizeof(short); addr->hash = unix_hash_fold(csum_partial((void*)addr->name, addr->len, 0)); - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); ordernum = (ordernum+1)&0xFFFFF; if (__unix_find_socket_byname(addr->name, addr->len, sock->type, addr->hash)) { - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* Sanity yield. It is unusual case, but yet... */ if (!(ordernum&0xFF)) yield(); @@ -658,7 +658,7 @@ retry: __unix_remove_socket(sk); u->addr = addr; __unix_insert_socket(&unix_socket_table[addr->hash], sk); - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); err = 0; out: up(&u->readsem); @@ -791,7 +791,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) addr->hash = UNIX_HASH_SIZE; } - write_lock(&unix_table_lock); + spin_lock(&unix_table_lock); if (!sunaddr->sun_path[0]) { err = -EADDRINUSE; @@ -814,7 +814,7 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) __unix_insert_socket(list, sk); out_unlock: - write_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); out_up: up(&u->readsem); out: @@ -1916,7 +1916,7 @@ static struct sock *unix_seq_idx(int *iter, loff_t pos) static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); return *pos ? unix_seq_idx(seq->private, *pos - 1) : ((void *) 1); } @@ -1931,7 +1931,7 @@ static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) static void unix_seq_stop(struct seq_file *seq, void *v) { - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); } static int unix_seq_show(struct seq_file *seq, void *v) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index 6ffc64e..411802b 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -182,7 +182,7 @@ void unix_gc(void) if (down_trylock(&unix_gc_sem)) return; - read_lock(&unix_table_lock); + spin_lock(&unix_table_lock); forall_unix_sockets(i, s) { @@ -301,7 +301,7 @@ void unix_gc(void) } u->gc_tree = GC_ORPHAN; } - read_unlock(&unix_table_lock); + spin_unlock(&unix_table_lock); /* * Here we are. Hitlist is filled. Die. -- cgit v1.1 From 4505a3ef720845b5db3ddb440de13cd4800fd508 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 18:51:49 -0800 Subject: [BRIDGE]: allow setting hardware address of bridge pseudo-dev Some people are using bridging to hide multiple machines from an ISP that restricts by MAC address. So in that case allow the bridge mac address to be set to any of the existing interfaces. I don't want to allow any arbitrary value and confuse STP. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 28 ++++++++++++++++++++++++++-- net/bridge/br_private.h | 1 + net/bridge/br_stp_if.c | 3 +-- 3 files changed, 28 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f564ee9..f7a66ab 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -15,7 +15,8 @@ #include #include -#include +#include + #include #include "br_private.h" @@ -82,6 +83,29 @@ static int br_change_mtu(struct net_device *dev, int new_mtu) return 0; } +/* Allow setting mac address of pseudo-bridge to be same as + * any of the bound interfaces + */ +static int br_set_mac_address(struct net_device *dev, void *p) +{ + struct net_bridge *br = netdev_priv(dev); + struct sockaddr *addr = p; + struct net_bridge_port *port; + int err = -EADDRNOTAVAIL; + + spin_lock_bh(&br->lock); + list_for_each_entry(port, &br->port_list, list) { + if (!compare_ether_addr(port->dev->dev_addr, addr->sa_data)) { + br_stp_change_bridge_id(br, addr->sa_data); + err = 0; + break; + } + } + spin_unlock_bh(&br->lock); + + return err; +} + void br_dev_setup(struct net_device *dev) { memset(dev->dev_addr, 0, ETH_ALEN); @@ -98,6 +122,6 @@ void br_dev_setup(struct net_device *dev) SET_MODULE_OWNER(dev); dev->stop = br_dev_stop; dev->tx_queue_len = 0; - dev->set_mac_address = NULL; + dev->set_mac_address = br_set_mac_address; dev->priv_flags = IFF_EBRIDGE; } diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index bdf95a7..2c24948 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -201,6 +201,7 @@ extern void br_stp_disable_bridge(struct net_bridge *br); extern void br_stp_enable_port(struct net_bridge_port *p); extern void br_stp_disable_port(struct net_bridge_port *p); extern void br_stp_recalculate_bridge_id(struct net_bridge *br); +extern void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *a); extern void br_stp_set_bridge_priority(struct net_bridge *br, u16 newprio); extern void br_stp_set_port_priority(struct net_bridge_port *p, diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index ac09b6a..2d2e969 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -120,8 +120,7 @@ void br_stp_disable_port(struct net_bridge_port *p) } /* called under bridge lock */ -static void br_stp_change_bridge_id(struct net_bridge *br, - const unsigned char *addr) +void br_stp_change_bridge_id(struct net_bridge *br, const unsigned char *addr) { unsigned char oldaddr[6]; struct net_bridge_port *p; -- cgit v1.1 From 4433f420e57afae0ab308b1e2b979f09c86bc115 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 20 Dec 2005 15:19:51 -0800 Subject: [BRIDGE]: handle speed detection after carrier changes Speed of a interface may not be available until carrier is detected in the case of autonegotiation. To get the correct value we need to recheck speed after carrier event. But the check needs to be done in a context that is similar to normal ethtool interface (can sleep). Also, delay check for 1ms to try avoid any carrier bounce transitions. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_if.c | 49 ++++++++++++++++++++++++++++++++++++++----------- net/bridge/br_notify.c | 14 +++----------- net/bridge/br_private.h | 3 +++ 3 files changed, 44 insertions(+), 22 deletions(-) diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index 975abe2..e6b8bb5 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -32,9 +32,8 @@ * ethtool, use ethtool_ops. Also, since driver might sleep need to * not be holding any locks. */ -static int br_initial_port_cost(struct net_device *dev) +static int port_cost(struct net_device *dev) { - struct ethtool_cmd ecmd = { ETHTOOL_GSET }; struct ifreq ifr; mm_segment_t old_fs; @@ -58,10 +57,6 @@ static int br_initial_port_cost(struct net_device *dev) return 2; case SPEED_10: return 100; - default: - pr_info("bridge: can't decode speed from %s: %d\n", - dev->name, ecmd.speed); - return 100; } } @@ -75,6 +70,35 @@ static int br_initial_port_cost(struct net_device *dev) return 100; /* assume old 10Mbps */ } + +/* + * Check for port carrier transistions. + * Called from work queue to allow for calling functions that + * might sleep (such as speed check), and to debounce. + */ +static void port_carrier_check(void *arg) +{ + struct net_bridge_port *p = arg; + + rtnl_lock(); + if (netif_carrier_ok(p->dev)) { + u32 cost = port_cost(p->dev); + + spin_lock_bh(&p->br->lock); + if (p->state == BR_STATE_DISABLED) { + p->path_cost = cost; + br_stp_enable_port(p); + } + spin_unlock_bh(&p->br->lock); + } else { + spin_lock_bh(&p->br->lock); + if (p->state != BR_STATE_DISABLED) + br_stp_disable_port(p); + spin_unlock_bh(&p->br->lock); + } + rtnl_unlock(); +} + static void destroy_nbp(struct net_bridge_port *p) { struct net_device *dev = p->dev; @@ -102,6 +126,9 @@ static void del_nbp(struct net_bridge_port *p) dev->br_port = NULL; dev_set_promiscuity(dev, -1); + cancel_delayed_work(&p->carrier_check); + flush_scheduled_work(); + spin_lock_bh(&br->lock); br_stp_disable_port(p); spin_unlock_bh(&br->lock); @@ -195,10 +222,9 @@ static int find_portno(struct net_bridge *br) return (index >= BR_MAX_PORTS) ? -EXFULL : index; } -/* called with RTNL */ +/* called with RTNL but without bridge lock */ static struct net_bridge_port *new_nbp(struct net_bridge *br, - struct net_device *dev, - unsigned long cost) + struct net_device *dev) { int index; struct net_bridge_port *p; @@ -215,12 +241,13 @@ static struct net_bridge_port *new_nbp(struct net_bridge *br, p->br = br; dev_hold(dev); p->dev = dev; - p->path_cost = cost; + p->path_cost = port_cost(dev); p->priority = 0x8000 >> BR_PORT_BITS; dev->br_port = p; p->port_no = index; br_init_port(p); p->state = BR_STATE_DISABLED; + INIT_WORK(&p->carrier_check, port_carrier_check, p); kobject_init(&p->kobj); return p; @@ -351,7 +378,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev) if (dev->br_port != NULL) return -EBUSY; - if (IS_ERR(p = new_nbp(br, dev, br_initial_port_cost(dev)))) + if (IS_ERR(p = new_nbp(br, dev))) return PTR_ERR(p); if ((err = br_fdb_insert(br, p, dev->dev_addr))) diff --git a/net/bridge/br_notify.c b/net/bridge/br_notify.c index 917311c..a43a9c1 100644 --- a/net/bridge/br_notify.c +++ b/net/bridge/br_notify.c @@ -52,17 +52,9 @@ static int br_device_event(struct notifier_block *unused, unsigned long event, v br_stp_recalculate_bridge_id(br); break; - case NETDEV_CHANGE: /* device is up but carrier changed */ - if (!(br->dev->flags & IFF_UP)) - break; - - if (netif_carrier_ok(dev)) { - if (p->state == BR_STATE_DISABLED) - br_stp_enable_port(p); - } else { - if (p->state != BR_STATE_DISABLED) - br_stp_disable_port(p); - } + case NETDEV_CHANGE: + if (br->dev->flags & IFF_UP) + schedule_delayed_work(&p->carrier_check, BR_PORT_DEBOUNCE); break; case NETDEV_FEAT_CHANGE: diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 2c24948..7ad53c2 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -27,6 +27,8 @@ #define BR_PORT_BITS 10 #define BR_MAX_PORTS (1< Date: Wed, 21 Dec 2005 19:00:18 -0800 Subject: [BRIDGE]: filter packets in learning state While in the learning state, run filters but drop the result. This prevents us from acquiring bad fdb entries in learning state. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_input.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c index b88220a..c387852 100644 --- a/net/bridge/br_input.c +++ b/net/bridge/br_input.c @@ -53,6 +53,11 @@ int br_handle_frame_finish(struct sk_buff *skb) /* insert into forwarding database after filtering to avoid spoofing */ br_fdb_update(p->br, p, eth_hdr(skb)->h_source); + if (p->state == BR_STATE_LEARNING) { + kfree_skb(skb); + goto out; + } + if (br->dev->flags & IFF_PROMISC) { struct sk_buff *skb2; @@ -107,9 +112,6 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) if (!is_valid_ether_addr(eth_hdr(skb)->h_source)) goto err; - if (p->state == BR_STATE_LEARNING) - br_fdb_update(p->br, p, eth_hdr(skb)->h_source); - if (p->br->stp_enabled && !memcmp(dest, bridge_ula, 5) && !(dest[5] & 0xF0)) { @@ -118,9 +120,10 @@ int br_handle_frame(struct net_bridge_port *p, struct sk_buff **pskb) NULL, br_stp_handle_bpdu); return 1; } + goto err; } - else if (p->state == BR_STATE_FORWARDING) { + if (p->state == BR_STATE_FORWARDING || p->state == BR_STATE_LEARNING) { if (br_should_route_hook) { if (br_should_route_hook(pskb)) return 0; -- cgit v1.1 From edb5e46fc03d0a45f2b41e3717631f7af7e9fc19 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:00:58 -0800 Subject: [BRIDGE]: limited ethtool support Add limited ethtool support to bridge to allow disabling features. Note: if underlying device does not support a feature (like checksum offload), then the bridge device won't inherit it. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_device.c | 63 +++++++++++++++++++++++++++++++++++++++++++++++++ net/bridge/br_if.c | 6 ++--- net/bridge/br_private.h | 1 + 3 files changed, 67 insertions(+), 3 deletions(-) diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c index f7a66ab..0b33a7b 100644 --- a/net/bridge/br_device.c +++ b/net/bridge/br_device.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include "br_private.h" @@ -106,6 +107,64 @@ static int br_set_mac_address(struct net_device *dev, void *p) return err; } +static void br_getinfo(struct net_device *dev, struct ethtool_drvinfo *info) +{ + strcpy(info->driver, "bridge"); + strcpy(info->version, BR_VERSION); + strcpy(info->fw_version, "N/A"); + strcpy(info->bus_info, "N/A"); +} + +static int br_set_sg(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_SG; + else + br->feature_mask &= ~NETIF_F_SG; + + br_features_recompute(br); + return 0; +} + +static int br_set_tso(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_TSO; + else + br->feature_mask &= ~NETIF_F_TSO; + + br_features_recompute(br); + return 0; +} + +static int br_set_tx_csum(struct net_device *dev, u32 data) +{ + struct net_bridge *br = netdev_priv(dev); + + if (data) + br->feature_mask |= NETIF_F_IP_CSUM; + else + br->feature_mask &= ~NETIF_F_IP_CSUM; + + br_features_recompute(br); + return 0; +} + +static struct ethtool_ops br_ethtool_ops = { + .get_drvinfo = br_getinfo, + .get_link = ethtool_op_get_link, + .get_sg = ethtool_op_get_sg, + .set_sg = br_set_sg, + .get_tx_csum = ethtool_op_get_tx_csum, + .set_tx_csum = br_set_tx_csum, + .get_tso = ethtool_op_get_tso, + .set_tso = br_set_tso, +}; + void br_dev_setup(struct net_device *dev) { memset(dev->dev_addr, 0, ETH_ALEN); @@ -120,8 +179,12 @@ void br_dev_setup(struct net_device *dev) dev->change_mtu = br_change_mtu; dev->destructor = free_netdev; SET_MODULE_OWNER(dev); + SET_ETHTOOL_OPS(dev, &br_ethtool_ops); dev->stop = br_dev_stop; dev->tx_queue_len = 0; dev->set_mac_address = br_set_mac_address; dev->priv_flags = IFF_EBRIDGE; + + dev->features = NETIF_F_SG | NETIF_F_FRAGLIST + | NETIF_F_HIGHDMA | NETIF_F_TSO | NETIF_F_IP_CSUM; } diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index e6b8bb5..1132119 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -182,6 +182,7 @@ static struct net_device *new_bridge_dev(const char *name) br->bridge_id.prio[1] = 0x00; memset(br->bridge_id.addr, 0, ETH_ALEN); + br->feature_mask = dev->features; br->stp_enabled = 0; br->designated_root = br->bridge_id; br->root_path_cost = 0; @@ -349,9 +350,8 @@ void br_features_recompute(struct net_bridge *br) struct net_bridge_port *p; unsigned long features, checksum; - features = NETIF_F_SG | NETIF_F_FRAGLIST - | NETIF_F_HIGHDMA | NETIF_F_TSO; - checksum = NETIF_F_IP_CSUM; /* least commmon subset */ + features = br->feature_mask &~ NETIF_F_IP_CSUM; + checksum = br->feature_mask & NETIF_F_IP_CSUM; list_for_each_entry(p, &br->port_list, list) { if (!(p->dev->features diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index 7ad53c2..db7b26b 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -93,6 +93,7 @@ struct net_bridge spinlock_t hash_lock; struct hlist_head hash[BR_HASH_SIZE]; struct list_head age_list; + unsigned long feature_mask; /* STP */ bridge_id designated_root; -- cgit v1.1 From 8cbb512e50fb702b5b1d444f76ebcdb53577b2ec Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:01:30 -0800 Subject: [BRIDGE]: add version number Add version info to bridge module. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br.c | 1 + net/bridge/br_private.h | 2 ++ 2 files changed, 3 insertions(+) diff --git a/net/bridge/br.c b/net/bridge/br.c index f8f1849..188cc1a 100644 --- a/net/bridge/br.c +++ b/net/bridge/br.c @@ -67,3 +67,4 @@ EXPORT_SYMBOL(br_should_route_hook); module_init(br_init) module_exit(br_deinit) MODULE_LICENSE("GPL"); +MODULE_VERSION(BR_VERSION); diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h index db7b26b..c5bd631 100644 --- a/net/bridge/br_private.h +++ b/net/bridge/br_private.h @@ -29,6 +29,8 @@ #define BR_PORT_DEBOUNCE (HZ/10) +#define BR_VERSION "2.1" + typedef struct bridge_id bridge_id; typedef struct mac_addr mac_addr; typedef __u16 port_id; -- cgit v1.1 From c865e5d99e25a171e8262fc0f7ba608568633c64 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:03:44 -0800 Subject: [PKT_SCHED] netem: packet corruption option Here is a new feature for netem in 2.6.16. It adds the ability to randomly corrupt packets with netem. A version was done by Hagen Paul Pfeifer, but I redid it to handle the cases of backwards compatibility with netlink interface and presence of hardware checksum offload. It is useful for testing hardware offload in devices. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/pkt_sched.h | 7 +++++++ net/sched/sch_netem.c | 49 ++++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 53 insertions(+), 3 deletions(-) diff --git a/include/linux/pkt_sched.h b/include/linux/pkt_sched.h index e87b233..d10f353 100644 --- a/include/linux/pkt_sched.h +++ b/include/linux/pkt_sched.h @@ -429,6 +429,7 @@ enum TCA_NETEM_CORR, TCA_NETEM_DELAY_DIST, TCA_NETEM_REORDER, + TCA_NETEM_CORRUPT, __TCA_NETEM_MAX, }; @@ -457,6 +458,12 @@ struct tc_netem_reorder __u32 correlation; }; +struct tc_netem_corrupt +{ + __u32 probability; + __u32 correlation; +}; + #define NETEM_DIST_SCALE 8192 #endif diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index 82fb07a..ba52832 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -25,7 +25,7 @@ #include -#define VERSION "1.1" +#define VERSION "1.2" /* Network Emulation Queuing algorithm. ==================================== @@ -65,11 +65,12 @@ struct netem_sched_data { u32 jitter; u32 duplicate; u32 reorder; + u32 corrupt; struct crndstate { unsigned long last; unsigned long rho; - } delay_cor, loss_cor, dup_cor, reorder_cor; + } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor; struct disttable { u32 size; @@ -183,6 +184,23 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch) q->duplicate = dupsave; } + /* + * Randomized packet corruption. + * Make copy if needed since we are modifying + * If packet is going to be hardware checksummed, then + * do it now in software before we mangle it. + */ + if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) { + if (!(skb = skb_unshare(skb, GFP_ATOMIC)) + || (skb->ip_summed == CHECKSUM_HW + && skb_checksum_help(skb, 0))) { + sch->qstats.drops++; + return NET_XMIT_DROP; + } + + skb->data[net_random() % skb_headlen(skb)] ^= 1<<(net_random() % 8); + } + if (q->gap == 0 /* not doing reordering */ || q->counter < q->gap /* inside last reordering gap */ || q->reorder < get_crandom(&q->reorder_cor)) { @@ -382,6 +400,20 @@ static int get_reorder(struct Qdisc *sch, const struct rtattr *attr) return 0; } +static int get_corrupt(struct Qdisc *sch, const struct rtattr *attr) +{ + struct netem_sched_data *q = qdisc_priv(sch); + const struct tc_netem_corrupt *r = RTA_DATA(attr); + + if (RTA_PAYLOAD(attr) != sizeof(*r)) + return -EINVAL; + + q->corrupt = r->probability; + init_crandom(&q->corrupt_cor, r->correlation); + return 0; +} + +/* Parse netlink message to set options */ static int netem_change(struct Qdisc *sch, struct rtattr *opt) { struct netem_sched_data *q = qdisc_priv(sch); @@ -432,13 +464,19 @@ static int netem_change(struct Qdisc *sch, struct rtattr *opt) if (ret) return ret; } + if (tb[TCA_NETEM_REORDER-1]) { ret = get_reorder(sch, tb[TCA_NETEM_REORDER-1]); if (ret) return ret; } - } + if (tb[TCA_NETEM_CORRUPT-1]) { + ret = get_corrupt(sch, tb[TCA_NETEM_CORRUPT-1]); + if (ret) + return ret; + } + } return 0; } @@ -564,6 +602,7 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) struct tc_netem_qopt qopt; struct tc_netem_corr cor; struct tc_netem_reorder reorder; + struct tc_netem_corrupt corrupt; qopt.latency = q->latency; qopt.jitter = q->jitter; @@ -582,6 +621,10 @@ static int netem_dump(struct Qdisc *sch, struct sk_buff *skb) reorder.correlation = q->reorder_cor.rho; RTA_PUT(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder); + corrupt.probability = q->corrupt; + corrupt.correlation = q->corrupt_cor.rho; + RTA_PUT(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt); + rta->rta_len = skb->tail - b; return skb->len; -- cgit v1.1 From 3821af2fe13700cab6fd67367128fa180e43f8b8 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:30:53 -0800 Subject: [FLS64]: generic version Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/asm-alpha/bitops.h | 1 + include/asm-arm/bitops.h | 2 ++ include/asm-arm26/bitops.h | 1 + include/asm-cris/bitops.h | 1 + include/asm-frv/bitops.h | 1 + include/asm-generic/bitops.h | 1 + include/asm-h8300/bitops.h | 1 + include/asm-i386/bitops.h | 1 + include/asm-ia64/bitops.h | 1 + include/asm-m32r/bitops.h | 1 + include/asm-m68k/bitops.h | 1 + include/asm-m68knommu/bitops.h | 1 + include/asm-mips/bitops.h | 2 +- include/asm-parisc/bitops.h | 1 + include/asm-powerpc/bitops.h | 1 + include/asm-s390/bitops.h | 1 + include/asm-sh/bitops.h | 1 + include/asm-sh64/bitops.h | 1 + include/asm-sparc/bitops.h | 1 + include/asm-sparc64/bitops.h | 1 + include/asm-v850/bitops.h | 1 + include/asm-x86_64/bitops.h | 1 + include/asm-xtensa/bitops.h | 1 + include/linux/bitops.h | 9 +++++++++ 24 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/asm-alpha/bitops.h b/include/asm-alpha/bitops.h index 578ed3f..302201f 100644 --- a/include/asm-alpha/bitops.h +++ b/include/asm-alpha/bitops.h @@ -321,6 +321,7 @@ static inline int fls(int word) #else #define fls generic_fls #endif +#define fls64 generic_fls64 /* Compute powers of two for the given integer. */ static inline long floor_log2(unsigned long word) diff --git a/include/asm-arm/bitops.h b/include/asm-arm/bitops.h index 7399d43..d02de72 100644 --- a/include/asm-arm/bitops.h +++ b/include/asm-arm/bitops.h @@ -332,6 +332,7 @@ static inline unsigned long __ffs(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * ffs: find first bit set. This is defined the same way as @@ -351,6 +352,7 @@ static inline unsigned long __ffs(unsigned long word) #define fls(x) \ ( __builtin_constant_p(x) ? generic_fls(x) : \ ({ int __r; asm("clz\t%0, %1" : "=r"(__r) : "r"(x) : "cc"); 32-__r; }) ) +#define fls64(x) generic_fls64(x) #define ffs(x) ({ unsigned long __t = (x); fls(__t & -__t); }) #define __ffs(x) (ffs(x) - 1) #define ffz(x) __ffs( ~(x) ) diff --git a/include/asm-arm26/bitops.h b/include/asm-arm26/bitops.h index 7d062fb..15cc6f2 100644 --- a/include/asm-arm26/bitops.h +++ b/include/asm-arm26/bitops.h @@ -259,6 +259,7 @@ static inline unsigned long __ffs(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * ffs: find first bit set. This is defined the same way as diff --git a/include/asm-cris/bitops.h b/include/asm-cris/bitops.h index 1bddb3f..d3eb0f1 100644 --- a/include/asm-cris/bitops.h +++ b/include/asm-cris/bitops.h @@ -240,6 +240,7 @@ static inline int test_bit(int nr, const volatile unsigned long *addr) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * hweightN - returns the hamming weight of a N-bit word diff --git a/include/asm-frv/bitops.h b/include/asm-frv/bitops.h index b664bd5..02be7b3 100644 --- a/include/asm-frv/bitops.h +++ b/include/asm-frv/bitops.h @@ -228,6 +228,7 @@ found_middle: \ bit ? 33 - bit : bit; \ }) +#define fls64(x) generic_fls64(x) /* * Every architecture must define this function. It's the fastest diff --git a/include/asm-generic/bitops.h b/include/asm-generic/bitops.h index ce31b73..0e6d985 100644 --- a/include/asm-generic/bitops.h +++ b/include/asm-generic/bitops.h @@ -56,6 +56,7 @@ extern __inline__ int test_bit(int nr, const unsigned long * addr) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-h8300/bitops.h b/include/asm-h8300/bitops.h index 5036f59..c0411ec 100644 --- a/include/asm-h8300/bitops.h +++ b/include/asm-h8300/bitops.h @@ -406,5 +406,6 @@ found_middle: #endif /* __KERNEL__ */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* _H8300_BITOPS_H */ diff --git a/include/asm-i386/bitops.h b/include/asm-i386/bitops.h index ddf1739..4807aa1 100644 --- a/include/asm-i386/bitops.h +++ b/include/asm-i386/bitops.h @@ -372,6 +372,7 @@ static inline unsigned long ffz(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-ia64/bitops.h b/include/asm-ia64/bitops.h index 7232528..36d0fb9 100644 --- a/include/asm-ia64/bitops.h +++ b/include/asm-ia64/bitops.h @@ -345,6 +345,7 @@ fls (int t) x |= x >> 16; return ia64_popcnt(x); } +#define fls64(x) generic_fls64(x) /* * ffs: find first bit set. This is defined the same way as the libc and compiler builtin diff --git a/include/asm-m32r/bitops.h b/include/asm-m32r/bitops.h index e784439..abea2fd 100644 --- a/include/asm-m32r/bitops.h +++ b/include/asm-m32r/bitops.h @@ -465,6 +465,7 @@ static __inline__ unsigned long __ffs(unsigned long word) * fls: find last bit set. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-m68k/bitops.h b/include/asm-m68k/bitops.h index b1bcf7c66..13f4c00 100644 --- a/include/asm-m68k/bitops.h +++ b/include/asm-m68k/bitops.h @@ -310,6 +310,7 @@ static inline int fls(int x) return 32 - cnt; } +#define fls64(x) generic_fls64(x) /* * Every architecture must define this function. It's the fastest diff --git a/include/asm-m68knommu/bitops.h b/include/asm-m68knommu/bitops.h index c42f88a..4058dd0 100644 --- a/include/asm-m68knommu/bitops.h +++ b/include/asm-m68knommu/bitops.h @@ -499,5 +499,6 @@ found_middle: * fls: find last bit set. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* _M68KNOMMU_BITOPS_H */ diff --git a/include/asm-mips/bitops.h b/include/asm-mips/bitops.h index 5496f90..3b0c8aa 100644 --- a/include/asm-mips/bitops.h +++ b/include/asm-mips/bitops.h @@ -695,7 +695,7 @@ static inline unsigned long fls(unsigned long word) return flz(~word) + 1; } - +#define fls64(x) generic_fls64(x) /* * find_next_zero_bit - find the first zero bit in a memory region diff --git a/include/asm-parisc/bitops.h b/include/asm-parisc/bitops.h index 55b98c6..15d8c2b 100644 --- a/include/asm-parisc/bitops.h +++ b/include/asm-parisc/bitops.h @@ -263,6 +263,7 @@ static __inline__ int fls(int x) return ret; } +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-powerpc/bitops.h b/include/asm-powerpc/bitops.h index 5727229..1996eaa 100644 --- a/include/asm-powerpc/bitops.h +++ b/include/asm-powerpc/bitops.h @@ -310,6 +310,7 @@ static __inline__ int fls(unsigned int x) asm ("cntlzw %0,%1" : "=r" (lz) : "r" (x)); return 32 - lz; } +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-s390/bitops.h b/include/asm-s390/bitops.h index b07c578..6123276 100644 --- a/include/asm-s390/bitops.h +++ b/include/asm-s390/bitops.h @@ -839,6 +839,7 @@ static inline int sched_find_first_bit(unsigned long *b) * fls: find last bit set. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-sh/bitops.h b/include/asm-sh/bitops.h index 5163d1f..1c52608 100644 --- a/include/asm-sh/bitops.h +++ b/include/asm-sh/bitops.h @@ -470,6 +470,7 @@ found_middle: */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* __KERNEL__ */ diff --git a/include/asm-sh64/bitops.h b/include/asm-sh64/bitops.h index e1ff63e..ce9c3ad 100644 --- a/include/asm-sh64/bitops.h +++ b/include/asm-sh64/bitops.h @@ -510,6 +510,7 @@ found_middle: #define ffs(x) generic_ffs(x) #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* __KERNEL__ */ diff --git a/include/asm-sparc/bitops.h b/include/asm-sparc/bitops.h index bfbd795..41722b5 100644 --- a/include/asm-sparc/bitops.h +++ b/include/asm-sparc/bitops.h @@ -298,6 +298,7 @@ static inline int ffs(int x) * Note fls(0) = 0, fls(1) = 1, fls(0x80000000) = 32. */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) /* * hweightN: returns the hamming weight (i.e. the number diff --git a/include/asm-sparc64/bitops.h b/include/asm-sparc64/bitops.h index 6388b83..6efc016 100644 --- a/include/asm-sparc64/bitops.h +++ b/include/asm-sparc64/bitops.h @@ -119,6 +119,7 @@ static inline unsigned long __ffs(unsigned long word) */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #ifdef __KERNEL__ diff --git a/include/asm-v850/bitops.h b/include/asm-v850/bitops.h index b91e799..8955d23 100644 --- a/include/asm-v850/bitops.h +++ b/include/asm-v850/bitops.h @@ -276,6 +276,7 @@ found_middle: #define ffs(x) generic_ffs (x) #define fls(x) generic_fls (x) +#define fls64(x) generic_fls64(x) #define __ffs(x) ffs(x) diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index 05a0d37..94b52c8 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -409,6 +409,7 @@ static __inline__ int ffs(int x) /* find last set bit */ #define fls(x) generic_fls(x) +#define fls64(x) generic_fls64(x) #endif /* __KERNEL__ */ diff --git a/include/asm-xtensa/bitops.h b/include/asm-xtensa/bitops.h index e76ee88..0a2065f 100644 --- a/include/asm-xtensa/bitops.h +++ b/include/asm-xtensa/bitops.h @@ -245,6 +245,7 @@ static __inline__ int fls (unsigned int x) { return __cntlz(x); } +#define fls64(x) generic_fls64(x) static __inline__ int find_next_bit(const unsigned long *addr, int size, int offset) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index 38c2fb7..6a2a19f 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -76,6 +76,15 @@ static __inline__ int generic_fls(int x) */ #include + +static inline int generic_fls64(__u64 x) +{ + __u32 h = x >> 32; + if (h) + return fls(x) + 32; + return fls(x); +} + static __inline__ int get_bitmask_order(unsigned int count) { int order; -- cgit v1.1 From 90933fc8ba5cc9034e3c04ee19938a22b0b4fe4e Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:31:36 -0800 Subject: [FLS64]: x86_64 version Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/asm-x86_64/bitops.h | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/include/asm-x86_64/bitops.h b/include/asm-x86_64/bitops.h index 94b52c8..a4d5d09 100644 --- a/include/asm-x86_64/bitops.h +++ b/include/asm-x86_64/bitops.h @@ -340,6 +340,20 @@ static __inline__ unsigned long __ffs(unsigned long word) return word; } +/* + * __fls: find last bit set. + * @word: The word to search + * + * Undefined if no zero exists, so code should check against ~0UL first. + */ +static __inline__ unsigned long __fls(unsigned long word) +{ + __asm__("bsrq %1,%0" + :"=r" (word) + :"rm" (word)); + return word; +} + #ifdef __KERNEL__ static inline int sched_find_first_bit(const unsigned long *b) @@ -370,6 +384,19 @@ static __inline__ int ffs(int x) } /** + * fls64 - find last bit set in 64 bit word + * @x: the word to search + * + * This is defined the same way as fls. + */ +static __inline__ int fls64(__u64 x) +{ + if (x == 0) + return 0; + return __fls(x) + 1; +} + +/** * hweightN - returns the hamming weight of a N-bit word * @x: the word to weigh * @@ -409,7 +436,6 @@ static __inline__ int ffs(int x) /* find last set bit */ #define fls(x) generic_fls(x) -#define fls64(x) generic_fls64(x) #endif /* __KERNEL__ */ -- cgit v1.1 From 89b3d9aaf46791177c5a5fa07a3ed38a035b5ef5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:32:08 -0800 Subject: [TCP] cubic: precompute constants Revised version of patch to pre-compute values for TCP cubic. * d32,d64 replaced with descriptive names * cube_factor replaces srtt[scaled by count] / HZ * ((1 << (10+2*BICTCP_HZ)) / bic_scale) * beta_scale replaces 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta); Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 133 ++++++++++++++++++++++----------------------------- 1 file changed, 57 insertions(+), 76 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index bb5dc4b..44fd408 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -16,7 +16,7 @@ #include #include #include - +#include #define BICTCP_BETA_SCALE 1024 /* Scale factor beta calculation * max_cwnd = snd_cwnd * beta @@ -34,15 +34,20 @@ static int initial_ssthresh = 100; static int bic_scale = 41; static int tcp_friendliness = 1; +static u32 cube_rtt_scale; +static u32 beta_scale; +static u64 cube_factor; + +/* Note parameters that are used for precomputing scale factors are read-only */ module_param(fast_convergence, int, 0644); MODULE_PARM_DESC(fast_convergence, "turn on/off fast convergence"); module_param(max_increment, int, 0644); MODULE_PARM_DESC(max_increment, "Limit on increment allowed during binary search"); -module_param(beta, int, 0644); +module_param(beta, int, 0444); MODULE_PARM_DESC(beta, "beta for multiplicative increase"); module_param(initial_ssthresh, int, 0644); MODULE_PARM_DESC(initial_ssthresh, "initial value of slow start threshold"); -module_param(bic_scale, int, 0644); +module_param(bic_scale, int, 0444); MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_scale/1024)"); module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); @@ -151,65 +156,13 @@ static u32 cubic_root(u64 x) return (u32)end; } -static inline u32 bictcp_K(u32 dist, u32 srtt) -{ - u64 d64; - u32 d32; - u32 count; - u32 result; - - /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 - so K = cubic_root( (wmax-cwnd)*rtt/c ) - the unit of K is bictcp_HZ=2^10, not HZ - - c = bic_scale >> 10 - rtt = (tp->srtt >> 3 ) / HZ - - the following code has been designed and tested for - cwnd < 1 million packets - RTT < 100 seconds - HZ < 1,000,00 (corresponding to 10 nano-second) - - */ - - /* 1/c * 2^2*bictcp_HZ */ - d32 = (1 << (10+2*BICTCP_HZ)) / bic_scale; - d64 = (__u64)d32; - - /* srtt * 2^count / HZ - 1) to get a better accuracy of the following d32, - the larger the "count", the better the accuracy - 2) and avoid overflow of the following d64 - the larger the "count", the high possibility of overflow - 3) so find a "count" between bictcp_hz-3 and bictcp_hz - "count" may be less than bictcp_HZ, - then d64 becomes 0. that is OK - */ - d32 = srtt; - count = 0; - while (((d32 & 0x80000000)==0) && (count < BICTCP_HZ)){ - d32 = d32 << 1; - count++; - } - d32 = d32 / HZ; - - /* (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) */ - d64 = (d64 * dist * d32) >> (count+3-BICTCP_HZ); - - /* cubic root */ - d64 = cubic_root(d64); - - result = (u32)d64; - return result; -} - /* * Compute congestion window to use. */ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) { - u64 d64; - u32 d32, t, srtt, bic_target, min_cnt, max_cnt; + u64 offs; + u32 delta, t, bic_target, min_cnt, max_cnt; ca->ack_cnt++; /* count the number of ACKs */ @@ -220,8 +173,6 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->last_cwnd = cwnd; ca->last_time = tcp_time_stamp; - srtt = (HZ << 3)/10; /* use real time-based growth function */ - if (ca->epoch_start == 0) { ca->epoch_start = tcp_time_stamp; /* record the beginning of an epoch */ ca->ack_cnt = 1; /* start counting */ @@ -231,7 +182,11 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) ca->bic_K = 0; ca->bic_origin_point = cwnd; } else { - ca->bic_K = bictcp_K(ca->last_max_cwnd-cwnd, srtt); + /* Compute new K based on + * (wmax-cwnd) * (srtt>>3 / HZ) / c * 2^(3*bictcp_HZ) + */ + ca->bic_K = cubic_root(cube_factor + * (ca->last_max_cwnd - cwnd)); ca->bic_origin_point = ca->last_max_cwnd; } } @@ -239,9 +194,9 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* cubic function - calc*/ /* calculate c * time^3 / rtt, * while considering overflow in calculation of time^3 - * (so time^3 is done by using d64) + * (so time^3 is done by using 64 bit) * and without the support of division of 64bit numbers - * (so all divisions are done by using d32) + * (so all divisions are done by using 32 bit) * also NOTE the unit of those veriables * time = (t - K) / 2^bictcp_HZ * c = bic_scale >> 10 @@ -255,18 +210,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) << BICTCP_HZ) / HZ; if (t < ca->bic_K) /* t - K */ - d32 = ca->bic_K - t; + offs = ca->bic_K - t; else - d32 = t - ca->bic_K; + offs = t - ca->bic_K; - d64 = (u64)d32; - d32 = (bic_scale << 3) * HZ / srtt; /* 1024*c/rtt */ - d64 = (d32 * d64 * d64 * d64) >> (10+3*BICTCP_HZ); /* c/rtt * (t-K)^3 */ - d32 = (u32)d64; + /* c/rtt * (t-K)^3 */ + delta = (cube_rtt_scale * offs * offs * offs) >> (10+3*BICTCP_HZ); if (t < ca->bic_K) /* below origin*/ - bic_target = ca->bic_origin_point - d32; + bic_target = ca->bic_origin_point - delta; else /* above origin*/ - bic_target = ca->bic_origin_point + d32; + bic_target = ca->bic_origin_point + delta; /* cubic function - calc bictcp_cnt*/ if (bic_target > cwnd) { @@ -288,16 +241,16 @@ static inline void bictcp_update(struct bictcp *ca, u32 cwnd) /* TCP Friendly */ if (tcp_friendliness) { - u32 scale = 8*(BICTCP_BETA_SCALE+beta)/3/(BICTCP_BETA_SCALE-beta); - d32 = (cwnd * scale) >> 3; - while (ca->ack_cnt > d32) { /* update tcp cwnd */ - ca->ack_cnt -= d32; + u32 scale = beta_scale; + delta = (cwnd * scale) >> 3; + while (ca->ack_cnt > delta) { /* update tcp cwnd */ + ca->ack_cnt -= delta; ca->tcp_cwnd++; } if (ca->tcp_cwnd > cwnd){ /* if bic is slower than tcp */ - d32 = ca->tcp_cwnd - cwnd; - max_cnt = cwnd / d32; + delta = ca->tcp_cwnd - cwnd; + max_cnt = cwnd / delta; if (ca->cnt > max_cnt) ca->cnt = max_cnt; } @@ -428,6 +381,34 @@ static struct tcp_congestion_ops cubictcp = { static int __init cubictcp_register(void) { BUG_ON(sizeof(struct bictcp) > ICSK_CA_PRIV_SIZE); + + /* Precompute a bunch of the scaling factors that are used per-packet + * based on SRTT of 100ms + */ + + beta_scale = 8*(BICTCP_BETA_SCALE+beta)/ 3 / (BICTCP_BETA_SCALE - beta); + + cube_rtt_scale = (bic_scale << 3) / 10; /* 1024*c/rtt */ + + /* calculate the "K" for (wmax-cwnd) = c/rtt * K^3 + * so K = cubic_root( (wmax-cwnd)*rtt/c ) + * the unit of K is bictcp_HZ=2^10, not HZ + * + * c = bic_scale >> 10 + * rtt = 100ms + * + * the following code has been designed and tested for + * cwnd < 1 million packets + * RTT < 100 seconds + * HZ < 1,000,00 (corresponding to 10 nano-second) + */ + + /* 1/c * 2^2*bictcp_HZ * srtt */ + cube_factor = 1ull << (10+3*BICTCP_HZ); /* 2^40 */ + + /* divide by bic_scale and by constant Srtt (100ms) */ + do_div(cube_factor, bic_scale * 10); + return tcp_register_congestion_control(&cubictcp); } -- cgit v1.1 From 9eb2d627190a8afe4b9276b24615a9559504fa60 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Wed, 21 Dec 2005 19:32:36 -0800 Subject: [TCP] cubic: use Newton-Raphson Replace cube root algorithim with a faster version using Newton-Raphson. Surprisingly, doing the scaled div64_64 is faster than a true 64 bit division on 64 bit CPU's. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/tcp_cubic.c | 93 ++++++++++++++++++++++------------------------------ 1 file changed, 39 insertions(+), 54 deletions(-) diff --git a/net/ipv4/tcp_cubic.c b/net/ipv4/tcp_cubic.c index 44fd408..31a4986 100644 --- a/net/ipv4/tcp_cubic.c +++ b/net/ipv4/tcp_cubic.c @@ -52,6 +52,7 @@ MODULE_PARM_DESC(bic_scale, "scale (scaled by 1024) value for bic function (bic_ module_param(tcp_friendliness, int, 0644); MODULE_PARM_DESC(tcp_friendliness, "turn on/off tcp friendliness"); +#include /* BIC TCP Parameters */ struct bictcp { @@ -93,67 +94,51 @@ static void bictcp_init(struct sock *sk) tcp_sk(sk)->snd_ssthresh = initial_ssthresh; } -/* 65536 times the cubic root */ -static const u64 cubic_table[8] - = {0, 65536, 82570, 94519, 104030, 112063, 119087, 125367}; +/* 64bit divisor, dividend and result. dynamic precision */ +static inline u_int64_t div64_64(u_int64_t dividend, u_int64_t divisor) +{ + u_int32_t d = divisor; + + if (divisor > 0xffffffffULL) { + unsigned int shift = fls(divisor >> 32); + + d = divisor >> shift; + dividend >>= shift; + } + + /* avoid 64 bit division if possible */ + if (dividend >> 32) + do_div(dividend, d); + else + dividend = (uint32_t) dividend / d; + + return dividend; +} /* - * calculate the cubic root of x - * the basic idea is that x can be expressed as i*8^j - * so cubic_root(x) = cubic_root(i)*2^j - * in the following code, x is i, and y is 2^j - * because of integer calculation, there are errors in calculation - * so finally use binary search to find out the exact solution + * calculate the cubic root of x using Newton-Raphson */ -static u32 cubic_root(u64 x) +static u32 cubic_root(u64 a) { - u64 y, app, target, start, end, mid, start_diff, end_diff; - - if (x == 0) - return 0; + u32 x, x1; - target = x; - - /* first estimate lower and upper bound */ - y = 1; - while (x >= 8){ - x = (x >> 3); - y = (y << 1); - } - start = (y*cubic_table[x])>>16; - if (x==7) - end = (y<<1); - else - end = (y*cubic_table[x+1]+65535)>>16; - - /* binary search for more accurate one */ - while (start < end-1) { - mid = (start+end) >> 1; - app = mid*mid*mid; - if (app < target) - start = mid; - else if (app > target) - end = mid; - else - return mid; - } + /* Initial estimate is based on: + * cbrt(x) = exp(log(x) / 3) + */ + x = 1u << (fls64(a)/3); - /* find the most accurate one from start and end */ - app = start*start*start; - if (app < target) - start_diff = target - app; - else - start_diff = app - target; - app = end*end*end; - if (app < target) - end_diff = target - app; - else - end_diff = app - target; + /* + * Iteration based on: + * 2 + * x = ( 2 * x + a / x ) / 3 + * k+1 k k + */ + do { + x1 = x; + x = (2 * x + (uint32_t) div64_64(a, x*x)) / 3; + } while (abs(x1 - x) > 1); - if (start_diff < end_diff) - return (u32)start; - else - return (u32)end; + return x; } /* -- cgit v1.1 From fd9662555cc35f8bf9242cd7bba8b44ae168a68b Mon Sep 17 00:00:00 2001 From: Robert Olsson Date: Thu, 22 Dec 2005 11:25:10 -0800 Subject: [IPV4] fib_trie: Add credits. Signed-off-by: Robert Olsson Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 705e3ce..a3ed7e1 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -41,6 +41,13 @@ * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. + * + * Substantial contributions to this work comes from: + * + * David S. Miller, + * Stephen Hemminger + * Paul E. McKenney + * Patrick McHardy */ #define VERSION "0.404" -- cgit v1.1 From 52ccb8e90c0ace233b8b740f2fc5de0dbd706b27 Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Thu, 22 Dec 2005 11:36:46 -0800 Subject: [SCTP]: Update SCTP_PEER_ADDR_PARAMS socket option to the latest api draft. This patch adds support to set/get heartbeat interval, maximum number of retransmissions, pathmtu, sackdelay time for a particular transport/ association/socket as per the latest SCTP sockets api draft11. Signed-off-by: Frank Filz Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/structs.h | 76 +++++-- include/net/sctp/user.h | 16 ++ net/sctp/associola.c | 81 ++++--- net/sctp/input.c | 36 +++- net/sctp/output.c | 17 +- net/sctp/sm_sideeffect.c | 26 ++- net/sctp/sm_statefuns.c | 20 +- net/sctp/socket.c | 511 ++++++++++++++++++++++++++++++++++----------- net/sctp/transport.c | 32 +-- 9 files changed, 595 insertions(+), 220 deletions(-) diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h index 8e7794e..f5c22d7 100644 --- a/include/net/sctp/structs.h +++ b/include/net/sctp/structs.h @@ -277,6 +277,24 @@ struct sctp_sock { __u32 default_context; __u32 default_timetolive; + /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to + * the destination address every heartbeat interval. This value + * will be inherited by all new associations. + */ + __u32 hbinterval; + + /* This is the max_retrans value for new associations. */ + __u16 pathmaxrxt; + + /* The initial Path MTU to use for new associations. */ + __u32 pathmtu; + + /* The default SACK delay timeout for new associations. */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; + struct sctp_initmsg initmsg; struct sctp_rtoinfo rtoinfo; struct sctp_paddrparams paddrparam; @@ -845,9 +863,6 @@ struct sctp_transport { /* Data that has been sent, but not acknowledged. */ __u32 flight_size; - /* PMTU : The current known path MTU. */ - __u32 pmtu; - /* Destination */ struct dst_entry *dst; /* Source address. */ @@ -862,7 +877,22 @@ struct sctp_transport { /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to * the destination address every heartbeat interval. */ - int hb_interval; + __u32 hbinterval; + + /* This is the max_retrans value for the transport and will + * be initialized from the assocs value. This can be changed + * using SCTP_SET_PEER_ADDR_PARAMS socket option. + */ + __u16 pathmaxrxt; + + /* PMTU : The current known path MTU. */ + __u32 pathmtu; + + /* SACK delay timeout */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; /* When was the last time (in jiffies) that we heard from this * transport? We use this to pick new active and retran paths. @@ -882,22 +912,11 @@ struct sctp_transport { */ int state; - /* hb_allowed : The current heartbeat state of this destination, - * : i.e. ALLOW-HB, NO-HEARTBEAT, etc. - */ - int hb_allowed; - /* These are the error stats for this destination. */ /* Error count : The current error count for this destination. */ unsigned short error_count; - /* This is the max_retrans value for the transport and will - * be initialized to proto.max_retrans.path. This can be changed - * using SCTP_SET_PEER_ADDR_PARAMS socket option. - */ - int max_retrans; - /* Per : A timer used by each destination. * Destination : * Timer : @@ -1502,6 +1521,28 @@ struct sctp_association { /* The largest timeout or RTO value to use in attempting an INIT */ __u16 max_init_timeo; + /* Heartbeat interval: The endpoint sends out a Heartbeat chunk to + * the destination address every heartbeat interval. This value + * will be inherited by all new transports. + */ + __u32 hbinterval; + + /* This is the max_retrans value for new transports in the + * association. + */ + __u16 pathmaxrxt; + + /* Association : The smallest PMTU discovered for all of the + * PMTU : peer's transport addresses. + */ + __u32 pathmtu; + + /* SACK delay timeout */ + __u32 sackdelay; + + /* Flags controling Heartbeat, SACK delay, and Path MTU Discovery. */ + __u32 param_flags; + int timeouts[SCTP_NUM_TIMEOUT_TYPES]; struct timer_list timers[SCTP_NUM_TIMEOUT_TYPES]; @@ -1571,11 +1612,6 @@ struct sctp_association { */ wait_queue_head_t wait; - /* Association : The smallest PMTU discovered for all of the - * PMTU : peer's transport addresses. - */ - __u32 pmtu; - /* The message size at which SCTP fragmentation will occur. */ __u32 frag_point; diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index f1c3bc5..b9052864 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -503,11 +503,27 @@ struct sctp_setadaption { * unreachable. The following structure is used to access and modify an * address's parameters: */ +enum sctp_spp_flags { + SPP_HB_ENABLE = 1, /*Enable heartbeats*/ + SPP_HB_DISABLE = 2, /*Disable heartbeats*/ + SPP_HB = SPP_HB_ENABLE | SPP_HB_DISABLE, + SPP_HB_DEMAND = 4, /*Send heartbeat immediately*/ + SPP_PMTUD_ENABLE = 8, /*Enable PMTU discovery*/ + SPP_PMTUD_DISABLE = 16, /*Disable PMTU discovery*/ + SPP_PMTUD = SPP_PMTUD_ENABLE | SPP_PMTUD_DISABLE, + SPP_SACKDELAY_ENABLE = 32, /*Enable SACK*/ + SPP_SACKDELAY_DISABLE = 64, /*Disable SACK*/ + SPP_SACKDELAY = SPP_SACKDELAY_ENABLE | SPP_SACKDELAY_DISABLE, +}; + struct sctp_paddrparams { sctp_assoc_t spp_assoc_id; struct sockaddr_storage spp_address; __u32 spp_hbinterval; __u16 spp_pathmaxrxt; + __u32 spp_pathmtu; + __u32 spp_sackdelay; + __u32 spp_flags; } __attribute__((packed, aligned(4))); /* diff --git a/net/sctp/associola.c b/net/sctp/associola.c index dec68a6..9d05e13 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -110,7 +110,6 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->cookie_life.tv_sec = sp->assocparams.sasoc_cookie_life / 1000; asoc->cookie_life.tv_usec = (sp->assocparams.sasoc_cookie_life % 1000) * 1000; - asoc->pmtu = 0; asoc->frag_point = 0; /* Set the association max_retrans and RTO values from the @@ -123,6 +122,25 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a asoc->overall_error_count = 0; + /* Initialize the association's heartbeat interval based on the + * sock configured value. + */ + asoc->hbinterval = msecs_to_jiffies(sp->hbinterval); + + /* Initialize path max retrans value. */ + asoc->pathmaxrxt = sp->pathmaxrxt; + + /* Initialize default path MTU. */ + asoc->pathmtu = sp->pathmtu; + + /* Set association default SACK delay */ + asoc->sackdelay = msecs_to_jiffies(sp->sackdelay); + + /* Set the association default flags controlling + * Heartbeat, SACK delay, and Path MTU Discovery. + */ + asoc->param_flags = sp->param_flags; + /* Initialize the maximum mumber of new data packets that can be sent * in a burst. */ @@ -144,8 +162,7 @@ static struct sctp_association *sctp_association_init(struct sctp_association *a = 5 * asoc->rto_max; asoc->timeouts[SCTP_EVENT_TIMEOUT_HEARTBEAT] = 0; - asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = - SCTP_DEFAULT_TIMEOUT_SACK; + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = asoc->sackdelay; asoc->timeouts[SCTP_EVENT_TIMEOUT_AUTOCLOSE] = sp->autoclose * HZ; @@ -540,23 +557,46 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, sctp_transport_set_owner(peer, asoc); + /* Initialize the peer's heartbeat interval based on the + * association configured value. + */ + peer->hbinterval = asoc->hbinterval; + + /* Set the path max_retrans. */ + peer->pathmaxrxt = asoc->pathmaxrxt; + + /* Initialize the peer's SACK delay timeout based on the + * association configured value. + */ + peer->sackdelay = asoc->sackdelay; + + /* Enable/disable heartbeat, SACK delay, and path MTU discovery + * based on association setting. + */ + peer->param_flags = asoc->param_flags; + /* Initialize the pmtu of the transport. */ - sctp_transport_pmtu(peer); + if (peer->param_flags & SPP_PMTUD_ENABLE) + sctp_transport_pmtu(peer); + else if (asoc->pathmtu) + peer->pathmtu = asoc->pathmtu; + else + peer->pathmtu = SCTP_DEFAULT_MAXSEGMENT; /* If this is the first transport addr on this association, * initialize the association PMTU to the peer's PMTU. * If not and the current association PMTU is higher than the new * peer's PMTU, reset the association PMTU to the new peer's PMTU. */ - if (asoc->pmtu) - asoc->pmtu = min_t(int, peer->pmtu, asoc->pmtu); + if (asoc->pathmtu) + asoc->pathmtu = min_t(int, peer->pathmtu, asoc->pathmtu); else - asoc->pmtu = peer->pmtu; + asoc->pathmtu = peer->pathmtu; SCTP_DEBUG_PRINTK("sctp_assoc_add_peer:association %p PMTU set to " - "%d\n", asoc, asoc->pmtu); + "%d\n", asoc, asoc->pathmtu); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); /* The asoc->peer.port might not be meaningful yet, but * initialize the packet structure anyway. @@ -574,7 +614,7 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, * (for example, implementations MAY use the size of the * receiver advertised window). */ - peer->cwnd = min(4*asoc->pmtu, max_t(__u32, 2*asoc->pmtu, 4380)); + peer->cwnd = min(4*asoc->pathmtu, max_t(__u32, 2*asoc->pathmtu, 4380)); /* At this point, we may not have the receiver's advertised window, * so initialize ssthresh to the default value and it will be set @@ -585,17 +625,6 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc, peer->partial_bytes_acked = 0; peer->flight_size = 0; - /* By default, enable heartbeat for peer address. */ - peer->hb_allowed = 1; - - /* Initialize the peer's heartbeat interval based on the - * sock configured value. - */ - peer->hb_interval = msecs_to_jiffies(sp->paddrparam.spp_hbinterval); - - /* Set the path max_retrans. */ - peer->max_retrans = sp->paddrparam.spp_pathmaxrxt; - /* Set the transport's RTO.initial value */ peer->rto = asoc->rto_initial; @@ -1155,18 +1184,18 @@ void sctp_assoc_sync_pmtu(struct sctp_association *asoc) /* Get the lowest pmtu of all the transports. */ list_for_each(pos, &asoc->peer.transport_addr_list) { t = list_entry(pos, struct sctp_transport, transports); - if (!pmtu || (t->pmtu < pmtu)) - pmtu = t->pmtu; + if (!pmtu || (t->pathmtu < pmtu)) + pmtu = t->pathmtu; } if (pmtu) { struct sctp_sock *sp = sctp_sk(asoc->base.sk); - asoc->pmtu = pmtu; + asoc->pathmtu = pmtu; asoc->frag_point = sctp_frag_point(sp, pmtu); } SCTP_DEBUG_PRINTK("%s: asoc:%p, pmtu:%d, frag_point:%d\n", - __FUNCTION__, asoc, asoc->pmtu, asoc->frag_point); + __FUNCTION__, asoc, asoc->pathmtu, asoc->frag_point); } /* Should we send a SACK to update our peer? */ @@ -1179,7 +1208,7 @@ static inline int sctp_peer_needs_update(struct sctp_association *asoc) case SCTP_STATE_SHUTDOWN_SENT: if ((asoc->rwnd > asoc->a_rwnd) && ((asoc->rwnd - asoc->a_rwnd) >= - min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pmtu))) + min_t(__u32, (asoc->base.sk->sk_rcvbuf >> 1), asoc->pathmtu))) return 1; break; default: diff --git a/net/sctp/input.c b/net/sctp/input.c index b24ff2c..238f1bf 100644 --- a/net/sctp/input.c +++ b/net/sctp/input.c @@ -305,18 +305,36 @@ int sctp_backlog_rcv(struct sock *sk, struct sk_buff *skb) void sctp_icmp_frag_needed(struct sock *sk, struct sctp_association *asoc, struct sctp_transport *t, __u32 pmtu) { - if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { - printk(KERN_WARNING "%s: Reported pmtu %d too low, " - "using default minimum of %d\n", __FUNCTION__, pmtu, - SCTP_DEFAULT_MINSEGMENT); - pmtu = SCTP_DEFAULT_MINSEGMENT; - } + if (sock_owned_by_user(sk) || !t || (t->pathmtu == pmtu)) + return; - if (!sock_owned_by_user(sk) && t && (t->pmtu != pmtu)) { - t->pmtu = pmtu; + if (t->param_flags & SPP_PMTUD_ENABLE) { + if (unlikely(pmtu < SCTP_DEFAULT_MINSEGMENT)) { + printk(KERN_WARNING "%s: Reported pmtu %d too low, " + "using default minimum of %d\n", + __FUNCTION__, pmtu, + SCTP_DEFAULT_MINSEGMENT); + /* Use default minimum segment size and disable + * pmtu discovery on this transport. + */ + t->pathmtu = SCTP_DEFAULT_MINSEGMENT; + t->param_flags = (t->param_flags & ~SPP_HB) | + SPP_PMTUD_DISABLE; + } else { + t->pathmtu = pmtu; + } + + /* Update association pmtu. */ sctp_assoc_sync_pmtu(asoc); - sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } + + /* Retransmit with the new pmtu setting. + * Normally, if PMTU discovery is disabled, an ICMP Fragmentation + * Needed will never be sent, but if a message was sent before + * PMTU discovery was disabled that was larger than the PMTU, it + * would not be fragmented, so it must be re-transmitted fragmented. + */ + sctp_retransmit(&asoc->outqueue, t, SCTP_RTXR_PMTUD); } /* diff --git a/net/sctp/output.c b/net/sctp/output.c index 9313716..a40991e 100644 --- a/net/sctp/output.c +++ b/net/sctp/output.c @@ -234,8 +234,8 @@ sctp_xmit_t sctp_packet_append_chunk(struct sctp_packet *packet, goto finish; pmtu = ((packet->transport->asoc) ? - (packet->transport->asoc->pmtu) : - (packet->transport->pmtu)); + (packet->transport->asoc->pathmtu) : + (packet->transport->pathmtu)); too_big = (psize + chunk_len > pmtu); @@ -482,7 +482,9 @@ int sctp_packet_transmit(struct sctp_packet *packet) if (!dst || (dst->obsolete > 1)) { dst_release(dst); sctp_transport_route(tp, NULL, sctp_sk(sk)); - sctp_assoc_sync_pmtu(asoc); + if (asoc->param_flags & SPP_PMTUD_ENABLE) { + sctp_assoc_sync_pmtu(asoc); + } } nskb->dst = dst_clone(tp->dst); @@ -492,7 +494,10 @@ int sctp_packet_transmit(struct sctp_packet *packet) SCTP_DEBUG_PRINTK("***sctp_transmit_packet*** skb len %d\n", nskb->len); - (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + if (tp->param_flags & SPP_PMTUD_ENABLE) + (*tp->af_specific->sctp_xmit)(nskb, tp, packet->ipfragok); + else + (*tp->af_specific->sctp_xmit)(nskb, tp, 1); out: packet->size = packet->overhead; @@ -577,7 +582,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * if ((flightsize + Max.Burst * MTU) < cwnd) * cwnd = flightsize + Max.Burst * MTU */ - max_burst_bytes = asoc->max_burst * asoc->pmtu; + max_burst_bytes = asoc->max_burst * asoc->pathmtu; if ((transport->flight_size + max_burst_bytes) < transport->cwnd) { transport->cwnd = transport->flight_size + max_burst_bytes; SCTP_DEBUG_PRINTK("%s: cwnd limited by max_burst: " @@ -622,7 +627,7 @@ static sctp_xmit_t sctp_packet_append_data(struct sctp_packet *packet, * data will fit or delay in hopes of bundling a full * sized packet. */ - if (len < asoc->pmtu - packet->overhead) { + if (len < asoc->pathmtu - packet->overhead) { retval = SCTP_XMIT_NAGLE_DELAY; goto finish; } diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c index 8239471..2d7d8a5 100644 --- a/net/sctp/sm_sideeffect.c +++ b/net/sctp/sm_sideeffect.c @@ -157,9 +157,12 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, { __u32 ctsn, max_tsn_seen; struct sctp_chunk *sack; + struct sctp_transport *trans = asoc->peer.last_data_from; int error = 0; - if (force) + if (force || + (!trans && (asoc->param_flags & SPP_SACKDELAY_DISABLE)) || + (trans && (trans->param_flags & SPP_SACKDELAY_DISABLE))) asoc->peer.sack_needed = 1; ctsn = sctp_tsnmap_get_ctsn(&asoc->peer.tsn_map); @@ -189,7 +192,22 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, if (!asoc->peer.sack_needed) { /* We will need a SACK for the next packet. */ asoc->peer.sack_needed = 1; - goto out; + + /* Set the SACK delay timeout based on the + * SACK delay for the last transport + * data was received from, or the default + * for the association. + */ + if (trans) + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + trans->sackdelay; + else + asoc->timeouts[SCTP_EVENT_TIMEOUT_SACK] = + asoc->sackdelay; + + /* Restart the SACK timer. */ + sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, + SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } else { if (asoc->a_rwnd > asoc->rwnd) asoc->a_rwnd = asoc->rwnd; @@ -205,7 +223,7 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force, sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP, SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); } -out: + return error; nomem: error = -ENOMEM; @@ -415,7 +433,7 @@ static void sctp_do_8_2_transport_strike(struct sctp_association *asoc, asoc->overall_error_count++; if (transport->state != SCTP_INACTIVE && - (transport->error_count++ >= transport->max_retrans)) { + (transport->error_count++ >= transport->pathmaxrxt)) { SCTP_DEBUG_PRINTK_IPADDR("transport_strike:association %p", " transport IP: port:%d failed.\n", asoc, diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c index 475bfb4..557a7d9 100644 --- a/net/sctp/sm_statefuns.c +++ b/net/sctp/sm_statefuns.c @@ -900,7 +900,7 @@ sctp_disposition_t sctp_sf_sendbeat_8_3(const struct sctp_endpoint *ep, * HEARTBEAT is sent (see Section 8.3). */ - if (transport->hb_allowed) { + if (transport->param_flags & SPP_HB_ENABLE) { if (SCTP_DISPOSITION_NOMEM == sctp_sf_heartbeat(ep, asoc, type, arg, commands)) @@ -1051,7 +1051,7 @@ sctp_disposition_t sctp_sf_backbeat_8_3(const struct sctp_endpoint *ep, return SCTP_DISPOSITION_DISCARD; } - max_interval = link->hb_interval + link->rto; + max_interval = link->hbinterval + link->rto; /* Check if the timestamp looks valid. */ if (time_after(hbinfo->sent_at, jiffies) || @@ -2691,14 +2691,9 @@ sctp_disposition_t sctp_sf_eat_data_6_2(const struct sctp_endpoint *ep, * document allow. However, an SCTP transmitter MUST NOT be * more aggressive than the following algorithms allow. */ - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } - return SCTP_DISPOSITION_CONSUME; discard_force: @@ -2721,13 +2716,9 @@ discard_force: return SCTP_DISPOSITION_DISCARD; discard_noforce: - if (chunk->end_of_packet) { + if (chunk->end_of_packet) sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); - } return SCTP_DISPOSITION_DISCARD; consume: return SCTP_DISPOSITION_CONSUME; @@ -3442,9 +3433,6 @@ sctp_disposition_t sctp_sf_eat_fwd_tsn(const struct sctp_endpoint *ep, * send another. */ sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, SCTP_NOFORCE()); - /* Start the SACK timer. */ - sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART, - SCTP_TO(SCTP_EVENT_TIMEOUT_SACK)); return SCTP_DISPOSITION_CONSUME; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 9df888e..adb5ee6 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -1941,106 +1941,275 @@ static int sctp_setsockopt_autoclose(struct sock *sk, char __user *optval, * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ +int sctp_apply_peer_addr_params(struct sctp_paddrparams *params, + struct sctp_transport *trans, + struct sctp_association *asoc, + struct sctp_sock *sp, + int hb_change, + int pmtud_change, + int sackdelay_change) +{ + int error; + + if (params->spp_flags & SPP_HB_DEMAND && trans) { + error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); + if (error) + return error; + } + + if (params->spp_hbinterval) { + if (trans) { + trans->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else if (asoc) { + asoc->hbinterval = msecs_to_jiffies(params->spp_hbinterval); + } else { + sp->hbinterval = params->spp_hbinterval; + } + } + + if (hb_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_HB) | hb_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_HB) | hb_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_HB) | hb_change; + } + } + + if (params->spp_pathmtu) { + if (trans) { + trans->pathmtu = params->spp_pathmtu; + sctp_assoc_sync_pmtu(asoc); + } else if (asoc) { + asoc->pathmtu = params->spp_pathmtu; + sctp_frag_point(sp, params->spp_pathmtu); + } else { + sp->pathmtu = params->spp_pathmtu; + } + } + + if (pmtud_change) { + if (trans) { + int update = (trans->param_flags & SPP_PMTUD_DISABLE) && + (params->spp_flags & SPP_PMTUD_ENABLE); + trans->param_flags = + (trans->param_flags & ~SPP_PMTUD) | pmtud_change; + if (update) { + sctp_transport_pmtu(trans); + sctp_assoc_sync_pmtu(asoc); + } + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_PMTUD) | pmtud_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_PMTUD) | pmtud_change; + } + } + + if (params->spp_sackdelay) { + if (trans) { + trans->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params->spp_sackdelay); + } else { + sp->sackdelay = params->spp_sackdelay; + } + } + + if (sackdelay_change) { + if (trans) { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + sackdelay_change; + } + } + + if (params->spp_pathmaxrxt) { + if (trans) { + trans->pathmaxrxt = params->spp_pathmaxrxt; + } else if (asoc) { + asoc->pathmaxrxt = params->spp_pathmaxrxt; + } else { + sp->pathmaxrxt = params->spp_pathmaxrxt; + } + } + + return 0; +} + static int sctp_setsockopt_peer_addr_params(struct sock *sk, char __user *optval, int optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); int error; + int hb_change, pmtud_change, sackdelay_change; if (optlen != sizeof(struct sctp_paddrparams)) - return -EINVAL; + return - EINVAL; + if (copy_from_user(¶ms, optval, optlen)) return -EFAULT; - /* - * API 7. Socket Options (setting the default value for the endpoint) - * All options that support specific settings on an association by - * filling in either an association id variable or a sockaddr_storage - * SHOULD also support setting of the same value for the entire endpoint - * (i.e. future associations). To accomplish this the following logic is - * used when setting one of these options: - - * c) If neither the sockaddr_storage or association identification is - * set i.e. the sockaddr_storage is set to all 0's (INADDR_ANY) and - * the association identification is 0, the settings are a default - * and to be applied to the endpoint (all future associations). - */ + /* Validate flags and value parameters. */ + hb_change = params.spp_flags & SPP_HB; + pmtud_change = params.spp_flags & SPP_PMTUD; + sackdelay_change = params.spp_flags & SPP_SACKDELAY; + + if (hb_change == SPP_HB || + pmtud_change == SPP_PMTUD || + sackdelay_change == SPP_SACKDELAY || + params.spp_sackdelay > 500 || + (params.spp_pathmtu + && params.spp_pathmtu < SCTP_DEFAULT_MINSEGMENT)) + return -EINVAL; - /* update default value for endpoint (all future associations) */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - /* Manual heartbeat on an endpoint is invalid. */ - if (0xffffffff == params.spp_hbinterval) + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. + */ + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) return -EINVAL; - else if (params.spp_hbinterval) - sctp_sk(sk)->paddrparam.spp_hbinterval = - params.spp_hbinterval; - if (params.spp_pathmaxrxt) - sctp_sk(sk)->paddrparam.spp_pathmaxrxt = - params.spp_pathmaxrxt; - return 0; } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) return -EINVAL; - /* Applications can enable or disable heartbeats for any peer address - * of an association, modify an address's heartbeat interval, force a - * heartbeat to be sent immediately, and adjust the address's maximum - * number of retransmissions sent before an address is considered - * unreachable. - * - * The value of the heartbeat interval, in milliseconds. A value of - * UINT32_MAX (4294967295), when modifying the parameter, specifies - * that a heartbeat should be sent immediately to the peer address, - * and the current interval should remain unchanged. - */ - if (0xffffffff == params.spp_hbinterval) { - error = sctp_primitive_REQUESTHEARTBEAT (trans->asoc, trans); - if (error) - return error; - } else { - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + /* Heartbeat demand can only be sent on a transport or + * association, but not a socket. */ - if (params.spp_hbinterval) { - trans->hb_allowed = 1; - trans->hb_interval = - msecs_to_jiffies(params.spp_hbinterval); - } else - trans->hb_allowed = 0; - } + if (params.spp_flags & SPP_HB_DEMAND && !trans && !asoc) + return -EINVAL; + + /* Process parameters. */ + error = sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. + if (error) + return error; + + /* If changes are for association, also apply parameters to each + * transport. */ - if (params.spp_pathmaxrxt) - trans->max_retrans = params.spp_pathmaxrxt; + if (!trans && asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + sctp_apply_peer_addr_params(¶ms, trans, asoc, sp, + hb_change, pmtud_change, + sackdelay_change); + } + } return 0; } @@ -2334,7 +2503,7 @@ static int sctp_setsockopt_maxseg(struct sock *sk, char __user *optval, int optl /* Update the frag_point of the existing associations. */ list_for_each(pos, &(sp->ep->asocs)) { asoc = list_entry(pos, struct sctp_association, asocs); - asoc->frag_point = sctp_frag_point(sp, asoc->pmtu); + asoc->frag_point = sctp_frag_point(sp, asoc->pathmtu); } return 0; @@ -2715,8 +2884,13 @@ SCTP_STATIC int sctp_init_sock(struct sock *sk) /* Default Peer Address Parameters. These defaults can * be modified via SCTP_PEER_ADDR_PARAMS */ - sp->paddrparam.spp_hbinterval = jiffies_to_msecs(sctp_hb_interval); - sp->paddrparam.spp_pathmaxrxt = sctp_max_retrans_path; + sp->hbinterval = jiffies_to_msecs(sctp_hb_interval); + sp->pathmaxrxt = sctp_max_retrans_path; + sp->pathmtu = 0; // allow default discovery + sp->sackdelay = sctp_sack_timeout; + sp->param_flags = SPP_HB_ENABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; /* If enabled no SCTP message fragmentation will be performed. * Configure through SCTP_DISABLE_FRAGMENTS socket option. @@ -2865,7 +3039,7 @@ static int sctp_getsockopt_sctp_status(struct sock *sk, int len, status.sstat_primary.spinfo_cwnd = transport->cwnd; status.sstat_primary.spinfo_srtt = transport->srtt; status.sstat_primary.spinfo_rto = jiffies_to_msecs(transport->rto); - status.sstat_primary.spinfo_mtu = transport->pmtu; + status.sstat_primary.spinfo_mtu = transport->pathmtu; if (status.sstat_primary.spinfo_state == SCTP_UNKNOWN) status.sstat_primary.spinfo_state = SCTP_ACTIVE; @@ -2924,7 +3098,7 @@ static int sctp_getsockopt_peer_addr_info(struct sock *sk, int len, pinfo.spinfo_cwnd = transport->cwnd; pinfo.spinfo_srtt = transport->srtt; pinfo.spinfo_rto = jiffies_to_msecs(transport->rto); - pinfo.spinfo_mtu = transport->pmtu; + pinfo.spinfo_mtu = transport->pathmtu; if (pinfo.spinfo_state == SCTP_UNKNOWN) pinfo.spinfo_state = SCTP_ACTIVE; @@ -3086,69 +3260,154 @@ out: * address's parameters: * * struct sctp_paddrparams { - * sctp_assoc_t spp_assoc_id; - * struct sockaddr_storage spp_address; - * uint32_t spp_hbinterval; - * uint16_t spp_pathmaxrxt; - * }; - * - * spp_assoc_id - (UDP style socket) This is filled in the application, - * and identifies the association for this query. + * sctp_assoc_t spp_assoc_id; + * struct sockaddr_storage spp_address; + * uint32_t spp_hbinterval; + * uint16_t spp_pathmaxrxt; + * uint32_t spp_pathmtu; + * uint32_t spp_sackdelay; + * uint32_t spp_flags; + * }; + * + * spp_assoc_id - (one-to-many style socket) This is filled in the + * application, and identifies the association for + * this query. * spp_address - This specifies which address is of interest. * spp_hbinterval - This contains the value of the heartbeat interval, - * in milliseconds. A value of 0, when modifying the - * parameter, specifies that the heartbeat on this - * address should be disabled. A value of UINT32_MAX - * (4294967295), when modifying the parameter, - * specifies that a heartbeat should be sent - * immediately to the peer address, and the current - * interval should remain unchanged. + * in milliseconds. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. * spp_pathmaxrxt - This contains the maximum number of * retransmissions before this address shall be - * considered unreachable. + * considered unreachable. If a value of zero + * is present in this field then no changes are to + * be made to this parameter. + * spp_pathmtu - When Path MTU discovery is disabled the value + * specified here will be the "fixed" path mtu. + * Note that if the spp_address field is empty + * then all associations on this address will + * have this fixed path mtu set upon them. + * + * spp_sackdelay - When delayed sack is enabled, this value specifies + * the number of milliseconds that sacks will be delayed + * for. This value will apply to all addresses of an + * association if the spp_address field is empty. Note + * also, that if delayed sack is enabled and this + * value is set to 0, no change is made to the last + * recorded delayed sack timer value. + * + * spp_flags - These flags are used to control various features + * on an association. The flag field may contain + * zero or more of the following options. + * + * SPP_HB_ENABLE - Enable heartbeats on the + * specified address. Note that if the address + * field is empty all addresses for the association + * have heartbeats enabled upon them. + * + * SPP_HB_DISABLE - Disable heartbeats on the + * speicifed address. Note that if the address + * field is empty all addresses for the association + * will have their heartbeats disabled. Note also + * that SPP_HB_ENABLE and SPP_HB_DISABLE are + * mutually exclusive, only one of these two should + * be specified. Enabling both fields will have + * undetermined results. + * + * SPP_HB_DEMAND - Request a user initiated heartbeat + * to be made immediately. + * + * SPP_PMTUD_ENABLE - This field will enable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. + * + * SPP_PMTUD_DISABLE - This field will disable PMTU + * discovery upon the specified address. Note that + * if the address feild is empty then all addresses + * on the association are effected. Not also that + * SPP_PMTUD_ENABLE and SPP_PMTUD_DISABLE are mutually + * exclusive. Enabling both will have undetermined + * results. + * + * SPP_SACKDELAY_ENABLE - Setting this flag turns + * on delayed sack. The time specified in spp_sackdelay + * is used to specify the sack delay for this address. Note + * that if spp_address is empty then all addresses will + * enable delayed sack and take on the sack delay + * value specified in spp_sackdelay. + * SPP_SACKDELAY_DISABLE - Setting this flag turns + * off delayed sack. If the spp_address field is blank then + * delayed sack is disabled for the entire association. Note + * also that this field is mutually exclusive to + * SPP_SACKDELAY_ENABLE, setting both will have undefined + * results. */ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, - char __user *optval, int __user *optlen) + char __user *optval, int __user *optlen) { - struct sctp_paddrparams params; - struct sctp_transport *trans; + struct sctp_paddrparams params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); if (len != sizeof(struct sctp_paddrparams)) return -EINVAL; + if (copy_from_user(¶ms, optval, len)) return -EFAULT; - /* If no association id is specified retrieve the default value - * for the endpoint that will be used for all future associations + /* If an address other than INADDR_ANY is specified, and + * no transport is found, then the request is invalid. */ - if (!params.spp_assoc_id && - sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { - params.spp_hbinterval = sctp_sk(sk)->paddrparam.spp_hbinterval; - params.spp_pathmaxrxt = sctp_sk(sk)->paddrparam.spp_pathmaxrxt; - - goto done; + if (!sctp_is_any(( union sctp_addr *)¶ms.spp_address)) { + trans = sctp_addr_id2transport(sk, ¶ms.spp_address, + params.spp_assoc_id); + if (!trans) { + SCTP_DEBUG_PRINTK("Failed no transport\n"); + return -EINVAL; + } } - trans = sctp_addr_id2transport(sk, ¶ms.spp_address, - params.spp_assoc_id); - if (!trans) - return -EINVAL; - - /* The value of the heartbeat interval, in milliseconds. A value of 0, - * when modifying the parameter, specifies that the heartbeat on this - * address should be disabled. + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. */ - if (!trans->hb_allowed) - params.spp_hbinterval = 0; - else - params.spp_hbinterval = jiffies_to_msecs(trans->hb_interval); + asoc = sctp_id2assoc(sk, params.spp_assoc_id); + if (!asoc && params.spp_assoc_id && sctp_style(sk, UDP)) { + SCTP_DEBUG_PRINTK("Failed no association\n"); + return -EINVAL; + } - /* spp_pathmaxrxt contains the maximum number of retransmissions - * before this address shall be considered unreachable. - */ - params.spp_pathmaxrxt = trans->max_retrans; + if (trans) { + /* Fetch transport values. */ + params.spp_hbinterval = jiffies_to_msecs(trans->hbinterval); + params.spp_pathmtu = trans->pathmtu; + params.spp_pathmaxrxt = trans->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(trans->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = trans->param_flags; + } else if (asoc) { + /* Fetch association values. */ + params.spp_hbinterval = jiffies_to_msecs(asoc->hbinterval); + params.spp_pathmtu = asoc->pathmtu; + params.spp_pathmaxrxt = asoc->pathmaxrxt; + params.spp_sackdelay = jiffies_to_msecs(asoc->sackdelay); + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = asoc->param_flags; + } else { + /* Fetch socket values. */ + params.spp_hbinterval = sp->hbinterval; + params.spp_pathmtu = sp->pathmtu; + params.spp_sackdelay = sp->sackdelay; + params.spp_pathmaxrxt = sp->pathmaxrxt; + + /*draft-11 doesn't say what to return in spp_flags*/ + params.spp_flags = sp->param_flags; + } -done: if (copy_to_user(optval, ¶ms, len)) return -EFAULT; diff --git a/net/sctp/transport.c b/net/sctp/transport.c index 268ddaf..68d73e2 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -86,10 +86,13 @@ static struct sctp_transport *sctp_transport_init(struct sctp_transport *peer, peer->init_sent_count = 0; peer->state = SCTP_ACTIVE; - peer->hb_allowed = 0; + peer->param_flags = SPP_HB_DISABLE | + SPP_PMTUD_ENABLE | + SPP_SACKDELAY_ENABLE; + peer->hbinterval = 0; /* Initialize the default path max_retrans. */ - peer->max_retrans = sctp_max_retrans_path; + peer->pathmaxrxt = sctp_max_retrans_path; peer->error_count = 0; INIT_LIST_HEAD(&peer->transmitted); @@ -229,10 +232,10 @@ void sctp_transport_pmtu(struct sctp_transport *transport) dst = transport->af_specific->get_dst(NULL, &transport->ipaddr, NULL); if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); dst_release(dst); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Caches the dst entry and source address for a transport's destination @@ -254,8 +257,11 @@ void sctp_transport_route(struct sctp_transport *transport, af->get_saddr(asoc, dst, daddr, &transport->saddr); transport->dst = dst; + if ((transport->param_flags & SPP_PMTUD_DISABLE) && transport->pathmtu) { + return; + } if (dst) { - transport->pmtu = dst_mtu(dst); + transport->pathmtu = dst_mtu(dst); /* Initialize sk->sk_rcv_saddr, if the transport is the * association's active path for getsockname(). @@ -264,7 +270,7 @@ void sctp_transport_route(struct sctp_transport *transport, opt->pf->af->to_sk_saddr(&transport->saddr, asoc->base.sk); } else - transport->pmtu = SCTP_DEFAULT_MAXSEGMENT; + transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT; } /* Hold a reference to a transport. */ @@ -369,7 +375,7 @@ void sctp_transport_raise_cwnd(struct sctp_transport *transport, ssthresh = transport->ssthresh; pba = transport->partial_bytes_acked; - pmtu = transport->asoc->pmtu; + pmtu = transport->asoc->pathmtu; if (cwnd <= ssthresh) { /* RFC 2960 7.2.1, sctpimpguide-05 2.14.2 When cwnd is less @@ -441,8 +447,8 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); - transport->cwnd = transport->asoc->pmtu; + 4*transport->asoc->pathmtu); + transport->cwnd = transport->asoc->pathmtu; break; case SCTP_LOWER_CWND_FAST_RTX: @@ -459,7 +465,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, * partial_bytes_acked = 0 */ transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; break; @@ -479,7 +485,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, if ((jiffies - transport->last_time_ecne_reduced) > transport->rtt) { transport->ssthresh = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); transport->cwnd = transport->ssthresh; transport->last_time_ecne_reduced = jiffies; } @@ -496,7 +502,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, */ if ((jiffies - transport->last_time_used) > transport->rto) transport->cwnd = max(transport->cwnd/2, - 4*transport->asoc->pmtu); + 4*transport->asoc->pathmtu); break; }; @@ -511,7 +517,7 @@ void sctp_transport_lower_cwnd(struct sctp_transport *transport, unsigned long sctp_transport_timeout(struct sctp_transport *t) { unsigned long timeout; - timeout = t->hb_interval + t->rto + sctp_jitter(t->rto); + timeout = t->hbinterval + t->rto + sctp_jitter(t->rto); timeout += jiffies; return timeout; } -- cgit v1.1 From 7708610b1bff4a0ba8a73733d3c7c4bda9f94b21 Mon Sep 17 00:00:00 2001 From: Frank Filz Date: Thu, 22 Dec 2005 11:37:30 -0800 Subject: [SCTP]: Add support for SCTP_DELAYED_ACK_TIME socket option. Signed-off-by: Frank Filz Signed-off-by: Sridhar Samudrala Signed-off-by: David S. Miller --- include/net/sctp/user.h | 14 ++++ net/sctp/socket.c | 184 ++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 198 insertions(+) diff --git a/include/net/sctp/user.h b/include/net/sctp/user.h index b9052864..8a6bef6 100644 --- a/include/net/sctp/user.h +++ b/include/net/sctp/user.h @@ -93,6 +93,8 @@ enum sctp_optname { #define SCTP_STATUS SCTP_STATUS SCTP_GET_PEER_ADDR_INFO, #define SCTP_GET_PEER_ADDR_INFO SCTP_GET_PEER_ADDR_INFO + SCTP_DELAYED_ACK_TIME, +#define SCTP_DELAYED_ACK_TIME SCTP_DELAYED_ACK_TIME /* Internal Socket Options. Some of the sctp library functions are * implemented using these socket options. @@ -526,6 +528,18 @@ struct sctp_paddrparams { __u32 spp_flags; } __attribute__((packed, aligned(4))); +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + */ +struct sctp_assoc_value { + sctp_assoc_t assoc_id; + uint32_t assoc_value; +}; + /* * 7.2.2 Peer Address Information * diff --git a/net/sctp/socket.c b/net/sctp/socket.c index adb5ee6..fc04d18 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -2214,6 +2214,109 @@ static int sctp_setsockopt_peer_addr_params(struct sock *sk, return 0; } +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ + +static int sctp_setsockopt_delayed_ack_time(struct sock *sk, + char __user *optval, int optlen) +{ + struct sctp_assoc_value params; + struct sctp_transport *trans = NULL; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (optlen != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, optlen)) + return -EFAULT; + + /* Validate value parameter. */ + if (params.assoc_value > 500) + return -EINVAL; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (params.assoc_value) { + if (asoc) { + asoc->sackdelay = + msecs_to_jiffies(params.assoc_value); + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + sp->sackdelay = params.assoc_value; + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } + } else { + if (asoc) { + asoc->param_flags = + (asoc->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } else { + sp->param_flags = + (sp->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + + /* If change is for association, also apply to each transport. */ + if (asoc) { + struct list_head *pos; + + list_for_each(pos, &asoc->peer.transport_addr_list) { + trans = list_entry(pos, struct sctp_transport, + transports); + if (params.assoc_value) { + trans->sackdelay = + msecs_to_jiffies(params.assoc_value); + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_ENABLE; + } else { + trans->param_flags = + (trans->param_flags & ~SPP_SACKDELAY) | + SPP_SACKDELAY_DISABLE; + } + } + } + + return 0; +} + /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association @@ -2660,6 +2763,10 @@ SCTP_STATIC int sctp_setsockopt(struct sock *sk, int level, int optname, retval = sctp_setsockopt_peer_addr_params(sk, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_setsockopt_delayed_ack_time(sk, optval, optlen); + break; + case SCTP_INITMSG: retval = sctp_setsockopt_initmsg(sk, optval, optlen); break; @@ -3417,6 +3524,79 @@ static int sctp_getsockopt_peer_addr_params(struct sock *sk, int len, return 0; } +/* 7.1.24. Delayed Ack Timer (SCTP_DELAYED_ACK_TIME) + * + * This options will get or set the delayed ack timer. The time is set + * in milliseconds. If the assoc_id is 0, then this sets or gets the + * endpoints default delayed ack timer value. If the assoc_id field is + * non-zero, then the set or get effects the specified association. + * + * struct sctp_assoc_value { + * sctp_assoc_t assoc_id; + * uint32_t assoc_value; + * }; + * + * assoc_id - This parameter, indicates which association the + * user is preforming an action upon. Note that if + * this field's value is zero then the endpoints + * default value is changed (effecting future + * associations only). + * + * assoc_value - This parameter contains the number of milliseconds + * that the user is requesting the delayed ACK timer + * be set to. Note that this value is defined in + * the standard to be between 200 and 500 milliseconds. + * + * Note: a value of zero will leave the value alone, + * but disable SACK delay. A non-zero value will also + * enable SACK delay. + */ +static int sctp_getsockopt_delayed_ack_time(struct sock *sk, int len, + char __user *optval, + int __user *optlen) +{ + struct sctp_assoc_value params; + struct sctp_association *asoc = NULL; + struct sctp_sock *sp = sctp_sk(sk); + + if (len != sizeof(struct sctp_assoc_value)) + return - EINVAL; + + if (copy_from_user(¶ms, optval, len)) + return -EFAULT; + + /* Get association, if assoc_id != 0 and the socket is a one + * to many style socket, and an association was not found, then + * the id was invalid. + */ + asoc = sctp_id2assoc(sk, params.assoc_id); + if (!asoc && params.assoc_id && sctp_style(sk, UDP)) + return -EINVAL; + + if (asoc) { + /* Fetch association values. */ + if (asoc->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = jiffies_to_msecs( + asoc->sackdelay); + else + params.assoc_value = 0; + } else { + /* Fetch socket values. */ + if (sp->param_flags & SPP_SACKDELAY_ENABLE) + params.assoc_value = sp->sackdelay; + else + params.assoc_value = 0; + } + + if (copy_to_user(optval, ¶ms, len)) + return -EFAULT; + + if (put_user(len, optlen)) + return -EFAULT; + + return 0; +} + /* 7.1.3 Initialization Parameters (SCTP_INITMSG) * * Applications can specify protocol parameters for the default association @@ -4274,6 +4454,10 @@ SCTP_STATIC int sctp_getsockopt(struct sock *sk, int level, int optname, retval = sctp_getsockopt_peer_addr_params(sk, len, optval, optlen); break; + case SCTP_DELAYED_ACK_TIME: + retval = sctp_getsockopt_delayed_ack_time(sk, len, optval, + optlen); + break; case SCTP_INITMSG: retval = sctp_getsockopt_initmsg(sk, len, optval, optlen); break; -- cgit v1.1 From 77d76ea310b50a9c8ff15bd290fcb4ed4961adf2 Mon Sep 17 00:00:00 2001 From: Andi Kleen Date: Thu, 22 Dec 2005 12:43:42 -0800 Subject: [NET]: Small cleanup to socket initialization sock_init can be done as a core_initcall instead of calling it directly in init/main.c Also I removed an out of date #ifdef. Signed-off-by: Andi Kleen Signed-off-by: David S. Miller --- include/linux/skbuff.h | 1 - include/linux/socket.h | 1 - init/main.c | 4 ---- net/socket.c | 10 +++++----- 4 files changed, 5 insertions(+), 11 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 97f6580..9716771 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -32,7 +32,6 @@ #define HAVE_ALLOC_SKB /* For the drivers to know */ #define HAVE_ALIGNABLE_SKB /* Ditto 8) */ -#define SLAB_SKB /* Slabified skbuffs */ #define CHECKSUM_NONE 0 #define CHECKSUM_HW 1 diff --git a/include/linux/socket.h b/include/linux/socket.h index 1739c2d..9f40191 100644 --- a/include/linux/socket.h +++ b/include/linux/socket.h @@ -27,7 +27,6 @@ struct __kernel_sockaddr_storage { #include /* __user */ extern int sysctl_somaxconn; -extern void sock_init(void); #ifdef CONFIG_PROC_FS struct seq_file; extern void socket_seq_show(struct seq_file *seq); diff --git a/init/main.c b/init/main.c index 27f97f9..54aaf56 100644 --- a/init/main.c +++ b/init/main.c @@ -47,7 +47,6 @@ #include #include #include -#include #include #include @@ -614,9 +613,6 @@ static void __init do_basic_setup(void) sysctl_init(); #endif - /* Networking initialization needs a process context */ - sock_init(); - do_initcalls(); } diff --git a/net/socket.c b/net/socket.c index 3145103..98be7ef 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2036,7 +2036,7 @@ int sock_unregister(int family) return 0; } -void __init sock_init(void) +static int __init sock_init(void) { /* * Initialize sock SLAB cache. @@ -2044,12 +2044,10 @@ void __init sock_init(void) sk_init(); -#ifdef SLAB_SKB /* * Initialize skbuff SLAB cache */ skb_init(); -#endif /* * Initialize the protocols module. @@ -2058,8 +2056,8 @@ void __init sock_init(void) init_inodecache(); register_filesystem(&sock_fs_type); sock_mnt = kern_mount(&sock_fs_type); - /* The real protocol initialization is performed when - * do_initcalls is run. + + /* The real protocol initialization is performed in later initcalls. */ #ifdef CONFIG_NETFILTER @@ -2067,6 +2065,8 @@ void __init sock_init(void) #endif } +core_initcall(sock_init); /* early initcall */ + #ifdef CONFIG_PROC_FS void socket_seq_show(struct seq_file *seq) { -- cgit v1.1 From 90ddc4f0470427df306f308ad03db6b6b21644b8 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Thu, 22 Dec 2005 12:49:22 -0800 Subject: [NET]: move struct proto_ops to const I noticed that some of 'struct proto_ops' used in the kernel may share a cache line used by locks or other heavily modified data. (default linker alignement is 32 bytes, and L1_CACHE_LINE is 64 or 128 at least) This patch makes sure a 'struct proto_ops' can be declared as const, so that all cpus can share all parts of it without false sharing. This is not mandatory : a driver can still use a read/write structure if it needs to (and eventually a __read_mostly) I made a global stubstitute to change all existing occurences to make them const. This should reduce the possibility of false sharing on SMP, and speedup some socket system calls. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- include/linux/net.h | 4 ++-- include/net/inet_common.h | 4 ++-- include/net/ipv6.h | 4 ++-- include/net/protocol.h | 2 +- net/appletalk/ddp.c | 4 ++-- net/atm/pvc.c | 2 +- net/atm/svc.c | 2 +- net/ax25/af_ax25.c | 4 ++-- net/bluetooth/bnep/sock.c | 2 +- net/bluetooth/cmtp/sock.c | 2 +- net/bluetooth/hci_sock.c | 2 +- net/bluetooth/hidp/sock.c | 2 +- net/bluetooth/l2cap.c | 4 ++-- net/bluetooth/rfcomm/sock.c | 4 ++-- net/bluetooth/sco.c | 4 ++-- net/dccp/proto.c | 2 +- net/decnet/af_decnet.c | 4 ++-- net/econet/af_econet.c | 4 ++-- net/ipv4/af_inet.c | 6 +++--- net/ipv6/af_inet6.c | 6 +++--- net/ipx/af_ipx.c | 4 ++-- net/irda/af_irda.c | 16 ++++++++-------- net/key/af_key.c | 4 ++-- net/llc/af_llc.c | 4 ++-- net/netlink/af_netlink.c | 4 ++-- net/netrom/af_netrom.c | 4 ++-- net/packet/af_packet.c | 8 ++++---- net/sctp/ipv6.c | 2 +- net/sctp/protocol.c | 2 +- net/sunrpc/svcsock.c | 2 +- net/unix/af_unix.c | 6 +++--- net/wanrouter/af_wanpipe.c | 4 ++-- net/x25/af_x25.c | 4 ++-- 33 files changed, 66 insertions(+), 66 deletions(-) diff --git a/include/linux/net.h b/include/linux/net.h index d6a41e6..28195a2 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -107,7 +107,7 @@ enum sock_type { struct socket { socket_state state; unsigned long flags; - struct proto_ops *ops; + const struct proto_ops *ops; struct fasync_struct *fasync_list; struct file *file; struct sock *sk; @@ -260,7 +260,7 @@ SOCKCALL_WRAP(name, recvmsg, (struct kiocb *iocb, struct socket *sock, struct ms SOCKCALL_WRAP(name, mmap, (struct file *file, struct socket *sock, struct vm_area_struct *vma), \ (file, sock, vma)) \ \ -static struct proto_ops name##_ops = { \ +static const struct proto_ops name##_ops = { \ .family = fam, \ .owner = THIS_MODULE, \ .release = __lock_##name##_release, \ diff --git a/include/net/inet_common.h b/include/net/inet_common.h index f943306..227adcb 100644 --- a/include/net/inet_common.h +++ b/include/net/inet_common.h @@ -1,8 +1,8 @@ #ifndef _INET_COMMON_H #define _INET_COMMON_H -extern struct proto_ops inet_stream_ops; -extern struct proto_ops inet_dgram_ops; +extern const struct proto_ops inet_stream_ops; +extern const struct proto_ops inet_dgram_ops; /* * INET4 prototypes used by INET6 diff --git a/include/net/ipv6.h b/include/net/ipv6.h index e3d5d7b..11a7256 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -538,8 +538,8 @@ extern int sysctl_ip6frag_low_thresh; extern int sysctl_ip6frag_time; extern int sysctl_ip6frag_secret_interval; -extern struct proto_ops inet6_stream_ops; -extern struct proto_ops inet6_dgram_ops; +extern const struct proto_ops inet6_stream_ops; +extern const struct proto_ops inet6_dgram_ops; extern int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); diff --git a/include/net/protocol.h b/include/net/protocol.h index a29cb29..63f7db9 100644 --- a/include/net/protocol.h +++ b/include/net/protocol.h @@ -65,7 +65,7 @@ struct inet_protosw { int protocol; /* This is the L4 protocol number. */ struct proto *prot; - struct proto_ops *ops; + const struct proto_ops *ops; int capability; /* Which (if any) capability do * we need to use this socket diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 7982656..296f186 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -63,7 +63,7 @@ #include struct datalink_proto *ddp_dl, *aarp_dl; -static struct proto_ops atalk_dgram_ops; +static const struct proto_ops atalk_dgram_ops; /**************************************************************************\ * * @@ -1841,7 +1841,7 @@ static struct net_proto_family atalk_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(atalk_dgram_ops) = { .family = PF_APPLETALK, .owner = THIS_MODULE, .release = atalk_release, diff --git a/net/atm/pvc.c b/net/atm/pvc.c index 2684a92..f2c5417 100644 --- a/net/atm/pvc.c +++ b/net/atm/pvc.c @@ -102,7 +102,7 @@ static int pvc_getname(struct socket *sock,struct sockaddr *sockaddr, } -static struct proto_ops pvc_proto_ops = { +static const struct proto_ops pvc_proto_ops = { .family = PF_ATMPVC, .owner = THIS_MODULE, diff --git a/net/atm/svc.c b/net/atm/svc.c index d7b2661..3a180cfd 100644 --- a/net/atm/svc.c +++ b/net/atm/svc.c @@ -613,7 +613,7 @@ static int svc_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return error; } -static struct proto_ops svc_proto_ops = { +static const struct proto_ops svc_proto_ops = { .family = PF_ATMSVC, .owner = THIS_MODULE, diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 1b683f3..8b5d10a 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -54,7 +54,7 @@ HLIST_HEAD(ax25_list); DEFINE_SPINLOCK(ax25_list_lock); -static struct proto_ops ax25_proto_ops; +static const struct proto_ops ax25_proto_ops; static void ax25_free_sock(struct sock *sk) { @@ -1944,7 +1944,7 @@ static struct net_proto_family ax25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops ax25_proto_ops = { +static const struct proto_ops ax25_proto_ops = { .family = PF_AX25, .owner = THIS_MODULE, .release = ax25_release, diff --git a/net/bluetooth/bnep/sock.c b/net/bluetooth/bnep/sock.c index 9778c6a..ccbaf69 100644 --- a/net/bluetooth/bnep/sock.c +++ b/net/bluetooth/bnep/sock.c @@ -146,7 +146,7 @@ static int bnep_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return 0; } -static struct proto_ops bnep_sock_ops = { +static const struct proto_ops bnep_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = bnep_sock_release, diff --git a/net/bluetooth/cmtp/sock.c b/net/bluetooth/cmtp/sock.c index beb045b..5e22343 100644 --- a/net/bluetooth/cmtp/sock.c +++ b/net/bluetooth/cmtp/sock.c @@ -137,7 +137,7 @@ static int cmtp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops cmtp_sock_ops = { +static const struct proto_ops cmtp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = cmtp_sock_release, diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 1d6d0a1..84e6c93 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -575,7 +575,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, char return 0; } -static struct proto_ops hci_sock_ops = { +static const struct proto_ops hci_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hci_sock_release, diff --git a/net/bluetooth/hidp/sock.c b/net/bluetooth/hidp/sock.c index f8986f8..8f8dd93 100644 --- a/net/bluetooth/hidp/sock.c +++ b/net/bluetooth/hidp/sock.c @@ -143,7 +143,7 @@ static int hidp_sock_ioctl(struct socket *sock, unsigned int cmd, unsigned long return -EINVAL; } -static struct proto_ops hidp_sock_ops = { +static const struct proto_ops hidp_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = hidp_sock_release, diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 95f33cc..7f0781e 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -57,7 +57,7 @@ #define VERSION "2.8" -static struct proto_ops l2cap_sock_ops; +static const struct proto_ops l2cap_sock_ops; static struct bt_sock_list l2cap_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -2161,7 +2161,7 @@ static ssize_t l2cap_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(l2cap, S_IRUGO, l2cap_sysfs_show, NULL); -static struct proto_ops l2cap_sock_ops = { +static const struct proto_ops l2cap_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = l2cap_sock_release, diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 6c34261..757d2dd 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -58,7 +58,7 @@ #define BT_DBG(D...) #endif -static struct proto_ops rfcomm_sock_ops; +static const struct proto_ops rfcomm_sock_ops; static struct bt_sock_list rfcomm_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -907,7 +907,7 @@ static ssize_t rfcomm_sock_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(rfcomm, S_IRUGO, rfcomm_sock_sysfs_show, NULL); -static struct proto_ops rfcomm_sock_ops = { +static const struct proto_ops rfcomm_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = rfcomm_sock_release, diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index 6481814..6b61323 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -56,7 +56,7 @@ #define VERSION "0.5" -static struct proto_ops sco_sock_ops; +static const struct proto_ops sco_sock_ops; static struct bt_sock_list sco_sk_list = { .lock = RW_LOCK_UNLOCKED @@ -914,7 +914,7 @@ static ssize_t sco_sysfs_show(struct class *dev, char *buf) static CLASS_ATTR(sco, S_IRUGO, sco_sysfs_show, NULL); -static struct proto_ops sco_sock_ops = { +static const struct proto_ops sco_sock_ops = { .family = PF_BLUETOOTH, .owner = THIS_MODULE, .release = sco_sock_release, diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 40a4c68..e4e629e 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -680,7 +680,7 @@ void dccp_shutdown(struct sock *sk, int how) EXPORT_SYMBOL_GPL(dccp_shutdown); -static struct proto_ops inet_dccp_ops = { +static const struct proto_ops inet_dccp_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index d402e90..65e3bae 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -149,7 +149,7 @@ static void dn_keepalive(struct sock *sk); #define DN_SK_HASH_MASK (DN_SK_HASH_SIZE - 1) -static struct proto_ops dn_proto_ops; +static const struct proto_ops dn_proto_ops; static DEFINE_RWLOCK(dn_hash_lock); static struct hlist_head dn_sk_hash[DN_SK_HASH_SIZE]; static struct hlist_head dn_wild_sk; @@ -2342,7 +2342,7 @@ static struct net_proto_family dn_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops dn_proto_ops = { +static const struct proto_ops dn_proto_ops = { .family = AF_DECnet, .owner = THIS_MODULE, .release = dn_release, diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 34fdac5..ff58f49 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -45,7 +45,7 @@ #include #include -static struct proto_ops econet_ops; +static const struct proto_ops econet_ops; static struct hlist_head econet_sklist; static DEFINE_RWLOCK(econet_lock); @@ -698,7 +698,7 @@ static struct net_proto_family econet_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(econet_ops) = { .family = PF_ECONET, .owner = THIS_MODULE, .release = econet_release, diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 617e858..4ed8a81 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -785,7 +785,7 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return err; } -struct proto_ops inet_stream_ops = { +const struct proto_ops inet_stream_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -806,7 +806,7 @@ struct proto_ops inet_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet_dgram_ops = { +const struct proto_ops inet_dgram_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, @@ -831,7 +831,7 @@ struct proto_ops inet_dgram_ops = { * For SOCK_RAW sockets; should be the same as inet_dgram_ops but without * udp_poll */ -static struct proto_ops inet_sockraw_ops = { +static const struct proto_ops inet_sockraw_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 70a510f..7c9f192 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -462,7 +462,7 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return(0); } -struct proto_ops inet6_stream_ops = { +const struct proto_ops inet6_stream_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -483,7 +483,7 @@ struct proto_ops inet6_stream_ops = { .sendpage = tcp_sendpage }; -struct proto_ops inet6_dgram_ops = { +const struct proto_ops inet6_dgram_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, @@ -511,7 +511,7 @@ static struct net_proto_family inet6_family_ops = { }; /* Same as inet6_dgram_ops, sans udp_poll. */ -static struct proto_ops inet6_sockraw_ops = { +static const struct proto_ops inet6_sockraw_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 34b3bb8..6c464c1 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -75,7 +75,7 @@ static struct datalink_proto *pEII_datalink; static struct datalink_proto *p8023_datalink; static struct datalink_proto *pSNAP_datalink; -static struct proto_ops ipx_dgram_ops; +static const struct proto_ops ipx_dgram_ops; LIST_HEAD(ipx_interfaces); DEFINE_SPINLOCK(ipx_interfaces_lock); @@ -1901,7 +1901,7 @@ static struct net_proto_family ipx_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(ipx_dgram_ops) = { .family = PF_IPX, .owner = THIS_MODULE, .release = ipx_release, diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index f121f7d..e57683d 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -62,12 +62,12 @@ static int irda_create(struct socket *sock, int protocol); -static struct proto_ops irda_stream_ops; -static struct proto_ops irda_seqpacket_ops; -static struct proto_ops irda_dgram_ops; +static const struct proto_ops irda_stream_ops; +static const struct proto_ops irda_seqpacket_ops; +static const struct proto_ops irda_dgram_ops; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops irda_ultra_ops; +static const struct proto_ops irda_ultra_ops; #define ULTRA_MAX_DATA 382 #endif /* CONFIG_IRDA_ULTRA */ @@ -2464,7 +2464,7 @@ static struct net_proto_family irda_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2485,7 +2485,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_stream_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2506,7 +2506,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_seqpacket_ops) = { .sendpage = sock_no_sendpage, }; -static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, @@ -2528,7 +2528,7 @@ static struct proto_ops SOCKOPS_WRAPPED(irda_dgram_ops) = { }; #ifdef CONFIG_IRDA_ULTRA -static struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(irda_ultra_ops) = { .family = PF_IRDA, .owner = THIS_MODULE, .release = irda_release, diff --git a/net/key/af_key.c b/net/key/af_key.c index d32f779..52efd04 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -113,7 +113,7 @@ static __inline__ void pfkey_unlock_table(void) } -static struct proto_ops pfkey_ops; +static const struct proto_ops pfkey_ops; static void pfkey_insert(struct sock *sk) { @@ -3127,7 +3127,7 @@ out: return err; } -static struct proto_ops pfkey_ops = { +static const struct proto_ops pfkey_ops = { .family = PF_KEY, .owner = THIS_MODULE, /* Operations that make no sense on pfkey sockets. */ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index b6d3df5..9cf65f9 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -36,7 +36,7 @@ static u16 llc_ui_sap_last_autoport = LLC_SAP_DYN_START; static u16 llc_ui_sap_link_no_max[256]; static struct sockaddr_llc llc_ui_addrnull; -static struct proto_ops llc_ui_ops; +static const struct proto_ops llc_ui_ops; static int llc_ui_wait_for_conn(struct sock *sk, long timeout); static int llc_ui_wait_for_disc(struct sock *sk, long timeout); @@ -1098,7 +1098,7 @@ static struct net_proto_family llc_ui_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops llc_ui_ops = { +static const struct proto_ops llc_ui_ops = { .family = PF_LLC, .owner = THIS_MODULE, .release = llc_ui_release, diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index 96020d7..7849cac 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -293,7 +293,7 @@ static inline int nl_pid_hash_dilute(struct nl_pid_hash *hash, int len) return 0; } -static struct proto_ops netlink_ops; +static const struct proto_ops netlink_ops; static int netlink_insert(struct sock *sk, u32 pid) { @@ -1656,7 +1656,7 @@ int netlink_unregister_notifier(struct notifier_block *nb) return notifier_chain_unregister(&netlink_chain, nb); } -static struct proto_ops netlink_ops = { +static const struct proto_ops netlink_ops = { .family = PF_NETLINK, .owner = THIS_MODULE, .release = netlink_release, diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index e5d82d7..05b653c 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -63,7 +63,7 @@ static unsigned short circuit = 0x101; static HLIST_HEAD(nr_list); static DEFINE_SPINLOCK(nr_list_lock); -static struct proto_ops nr_proto_ops; +static const struct proto_ops nr_proto_ops; /* * Socket removal during an interrupt is now safe. @@ -1337,7 +1337,7 @@ static struct net_proto_family nr_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops nr_proto_ops = { +static const struct proto_ops nr_proto_ops = { .family = PF_NETROM, .owner = THIS_MODULE, .release = nr_release, diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 3e24627..deda6fd 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -251,10 +251,10 @@ static void packet_sock_destruct(struct sock *sk) } -static struct proto_ops packet_ops; +static const struct proto_ops packet_ops; #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt; +static const struct proto_ops packet_ops_spkt; static int packet_rcv_spkt(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { @@ -1784,7 +1784,7 @@ out: #ifdef CONFIG_SOCK_PACKET -static struct proto_ops packet_ops_spkt = { +static const struct proto_ops packet_ops_spkt = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, @@ -1806,7 +1806,7 @@ static struct proto_ops packet_ops_spkt = { }; #endif -static struct proto_ops packet_ops = { +static const struct proto_ops packet_ops = { .family = PF_PACKET, .owner = THIS_MODULE, .release = packet_release, diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index fa3be2b..15c0516 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -866,7 +866,7 @@ static int sctp_inet6_supported_addrs(const struct sctp_sock *opt, return 2; } -static struct proto_ops inet6_seqpacket_ops = { +static const struct proto_ops inet6_seqpacket_ops = { .family = PF_INET6, .owner = THIS_MODULE, .release = inet6_release, diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index f775d78..d1b0747 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -829,7 +829,7 @@ static struct notifier_block sctp_inetaddr_notifier = { }; /* Socket operations. */ -static struct proto_ops inet_seqpacket_ops = { +static const struct proto_ops inet_seqpacket_ops = { .family = PF_INET, .owner = THIS_MODULE, .release = inet_release, /* Needs to be wrapped... */ diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index c6a5191..d68eba4 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -758,7 +758,7 @@ svc_tcp_accept(struct svc_sock *svsk) struct svc_serv *serv = svsk->sk_server; struct socket *sock = svsk->sk_sock; struct socket *newsock; - struct proto_ops *ops; + const struct proto_ops *ops; struct svc_sock *newsvsk; int err, slen; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 04e850e..7d3fe6a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -473,7 +473,7 @@ static int unix_dgram_connect(struct socket *, struct sockaddr *, static int unix_seqpacket_sendmsg(struct kiocb *, struct socket *, struct msghdr *, size_t); -static struct proto_ops unix_stream_ops = { +static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -494,7 +494,7 @@ static struct proto_ops unix_stream_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_dgram_ops = { +static const struct proto_ops unix_dgram_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, @@ -515,7 +515,7 @@ static struct proto_ops unix_dgram_ops = { .sendpage = sock_no_sendpage, }; -static struct proto_ops unix_seqpacket_ops = { +static const struct proto_ops unix_seqpacket_ops = { .family = PF_UNIX, .owner = THIS_MODULE, .release = unix_release, diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 59fec59..67948bf 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -181,7 +181,7 @@ struct wanpipe_opt #endif static int sk_count; -extern struct proto_ops wanpipe_ops; +extern const struct proto_ops wanpipe_ops; static unsigned long find_free_critical; static void wanpipe_unlink_driver(struct sock *sk); @@ -2546,7 +2546,7 @@ static int wanpipe_connect(struct socket *sock, struct sockaddr *uaddr, int addr return 0; } -struct proto_ops wanpipe_ops = { +const struct proto_ops wanpipe_ops = { .family = PF_WANPIPE, .owner = THIS_MODULE, .release = wanpipe_release, diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index 020d73c..ca8b3b0 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -64,7 +64,7 @@ int sysctl_x25_ack_holdback_timeout = X25_DEFAULT_T2; HLIST_HEAD(x25_list); DEFINE_RWLOCK(x25_list_lock); -static struct proto_ops x25_proto_ops; +static const struct proto_ops x25_proto_ops; static struct x25_address null_x25_address = {" "}; @@ -1391,7 +1391,7 @@ static struct net_proto_family x25_family_ops = { .owner = THIS_MODULE, }; -static struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { +static const struct proto_ops SOCKOPS_WRAPPED(x25_proto_ops) = { .family = AF_X25, .owner = THIS_MODULE, .release = x25_release, -- cgit v1.1 From f34fbb971368c20f757f8758833a534590b16518 Mon Sep 17 00:00:00 2001 From: Jaco Kroon Date: Thu, 22 Dec 2005 12:51:46 -0800 Subject: [PKTGEN]: Deinitialise static variables. static variables should not be explicitly initialised to 0. This causes them to be placed in .data instead of .bss. This patch de-initialises 3 static variables in net/core/pktgen.c. There are approximately 800 more such variables in the source tree (2.6.15rc5). If there is more interrest I'd be willing to track down the rest of these as well and de-initialise them as well. Signed-off-by: David S. Miller --- net/core/pktgen.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 7fc3e9e..06cad2d 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -487,9 +487,9 @@ static unsigned int fmt_ip6(char *s,const char ip[16]); /* Module parameters, defaults. */ static int pg_count_d = 1000; /* 1000 pkts by default */ -static int pg_delay_d = 0; -static int pg_clone_skb_d = 0; -static int debug = 0; +static int pg_delay_d; +static int pg_clone_skb_d; +static int debug; static DECLARE_MUTEX(pktgen_sem); static struct pktgen_thread *pktgen_threads = NULL; -- cgit v1.1 From cbeb321a64af5437fbde249605b191ff0fdfa21c Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Thu, 22 Dec 2005 12:58:55 -0800 Subject: [NET]: Fix sock_init() return value. It needs to return zero now that it is an initcall. Also, net/nonet.c no longer needs a dummy sock_init(). Signed-off-by: David S. Miller --- net/nonet.c | 5 ----- net/socket.c | 2 ++ 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/net/nonet.c b/net/nonet.c index e5241dce..1230f0a 100644 --- a/net/nonet.c +++ b/net/nonet.c @@ -14,11 +14,6 @@ #include #include -void __init sock_init(void) -{ - printk(KERN_INFO "Linux NoNET1.0 for Linux 2.6\n"); -} - static int sock_no_open(struct inode *irrelevant, struct file *dontcare) { return -ENXIO; diff --git a/net/socket.c b/net/socket.c index 98be7ef..f40957a 100644 --- a/net/socket.c +++ b/net/socket.c @@ -2063,6 +2063,8 @@ static int __init sock_init(void) #ifdef CONFIG_NETFILTER netfilter_init(); #endif + + return 0; } core_initcall(sock_init); /* early initcall */ -- cgit v1.1 From ce1d4d3e88b3a69d23c3feb436a0b36b6ca0642b Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 22 Dec 2005 21:08:46 -0800 Subject: [NET]: restructure sock_aio_{read,write} / sock_{readv,writev} Mid-term I plan to restructure the file_operations so that we don't need to have all these duplicate aio and vectored versions. This patch is a small step in that direction but also a worthwile cleanup on it's own: (1) introduce a alloc_sock_iocb helper that encapsulates allocating a proper sock_iocb (2) add do_sock_read and do_sock_write helpers for common read/write code Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/socket.c | 224 +++++++++++++++++++++++++++++------------------------------ 1 file changed, 110 insertions(+), 114 deletions(-) diff --git a/net/socket.c b/net/socket.c index f40957a..0dde6da 100644 --- a/net/socket.c +++ b/net/socket.c @@ -640,154 +640,150 @@ static void sock_aio_dtor(struct kiocb *iocb) kfree(iocb->private); } -/* - * Read data from a socket. ubuf is a user mode pointer. We make sure the user - * area ubuf...ubuf+size-1 is writable before asking the protocol. - */ - -static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, - size_t size, loff_t pos) +static ssize_t sock_sendpage(struct file *file, struct page *page, + int offset, size_t size, loff_t *ppos, int more) { - struct sock_iocb *x, siocb; struct socket *sock; int flags; - if (pos != 0) - return -ESPIPE; - if (size==0) /* Match SYS5 behaviour */ - return 0; + sock = file->private_data; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; + flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + if (more) + flags |= MSG_MORE; + + return sock->ops->sendpage(sock, page, offset, size, flags); +} + +static struct sock_iocb *alloc_sock_iocb(struct kiocb *iocb, + char __user *ubuf, size_t size, struct sock_iocb *siocb) +{ + if (!is_sync_kiocb(iocb)) { + siocb = kmalloc(sizeof(*siocb), GFP_KERNEL); + if (!siocb) + return NULL; iocb->ki_dtor = sock_aio_dtor; } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_iov.iov_base = ubuf; - x->async_iov.iov_len = size; - flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; + siocb->kiocb = iocb; + siocb->async_iov.iov_base = ubuf; + siocb->async_iov.iov_len = size; - return __sock_recvmsg(iocb, sock, &x->async_msg, size, flags); + iocb->private = siocb; + return siocb; } +static ssize_t do_sock_read(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) +{ + struct socket *sock = file->private_data; + size_t size = 0; + int i; -/* - * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 - * is readable by the user process. - */ + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; -static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, - size_t size, loff_t pos) + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + + return __sock_recvmsg(iocb, sock, msg, size, msg->msg_flags); +} + +static ssize_t sock_readv(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { - struct sock_iocb *x, siocb; - struct socket *sock; - + struct kiocb iocb; + struct sock_iocb siocb; + struct msghdr msg; + int ret; + + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; + + ret = do_sock_read(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} + +static ssize_t sock_aio_read(struct kiocb *iocb, char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; + if (pos != 0) return -ESPIPE; - if(size==0) /* Match SYS5 behaviour */ + if (count == 0) /* Match SYS5 behaviour */ return 0; - if (is_sync_kiocb(iocb)) - x = &siocb; - else { - x = kmalloc(sizeof(struct sock_iocb), GFP_KERNEL); - if (!x) - return -ENOMEM; - iocb->ki_dtor = sock_aio_dtor; - } - iocb->private = x; - x->kiocb = iocb; - sock = iocb->ki_filp->private_data; - - x->async_msg.msg_name = NULL; - x->async_msg.msg_namelen = 0; - x->async_msg.msg_iov = &x->async_iov; - x->async_msg.msg_iovlen = 1; - x->async_msg.msg_control = NULL; - x->async_msg.msg_controllen = 0; - x->async_msg.msg_flags = !(iocb->ki_filp->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (sock->type == SOCK_SEQPACKET) - x->async_msg.msg_flags |= MSG_EOR; - x->async_iov.iov_base = (void __user *)ubuf; - x->async_iov.iov_len = size; - - return __sock_sendmsg(iocb, sock, &x->async_msg, size); + x = alloc_sock_iocb(iocb, ubuf, count, &siocb); + if (!x) + return -ENOMEM; + return do_sock_read(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } -static ssize_t sock_sendpage(struct file *file, struct page *page, - int offset, size_t size, loff_t *ppos, int more) +static ssize_t do_sock_write(struct msghdr *msg, struct kiocb *iocb, + struct file *file, struct iovec *iov, unsigned long nr_segs) { - struct socket *sock; - int flags; + struct socket *sock = file->private_data; + size_t size = 0; + int i; - sock = file->private_data; + for (i = 0 ; i < nr_segs ; i++) + size += iov[i].iov_len; - flags = !(file->f_flags & O_NONBLOCK) ? 0 : MSG_DONTWAIT; - if (more) - flags |= MSG_MORE; + msg->msg_name = NULL; + msg->msg_namelen = 0; + msg->msg_control = NULL; + msg->msg_controllen = 0; + msg->msg_iov = (struct iovec *) iov; + msg->msg_iovlen = nr_segs; + msg->msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + if (sock->type == SOCK_SEQPACKET) + msg->msg_flags |= MSG_EOR; - return sock->ops->sendpage(sock, page, offset, size, flags); + return __sock_sendmsg(iocb, sock, msg, size); } -static int sock_readv_writev(int type, - struct file * file, const struct iovec * iov, - long count, size_t size) +static ssize_t sock_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { struct msghdr msg; - struct socket *sock; + struct kiocb iocb; + struct sock_iocb siocb; + int ret; - sock = file->private_data; + init_sync_kiocb(&iocb, NULL); + iocb.private = &siocb; - msg.msg_name = NULL; - msg.msg_namelen = 0; - msg.msg_control = NULL; - msg.msg_controllen = 0; - msg.msg_iov = (struct iovec *) iov; - msg.msg_iovlen = count; - msg.msg_flags = (file->f_flags & O_NONBLOCK) ? MSG_DONTWAIT : 0; + ret = do_sock_write(&msg, &iocb, file, (struct iovec *)iov, nr_segs); + if (-EIOCBQUEUED == ret) + ret = wait_on_sync_kiocb(&iocb); + return ret; +} - /* read() does a VERIFY_WRITE */ - if (type == VERIFY_WRITE) - return sock_recvmsg(sock, &msg, size, msg.msg_flags); +static ssize_t sock_aio_write(struct kiocb *iocb, const char __user *ubuf, + size_t count, loff_t pos) +{ + struct sock_iocb siocb, *x; - if (sock->type == SOCK_SEQPACKET) - msg.msg_flags |= MSG_EOR; + if (pos != 0) + return -ESPIPE; + if (count == 0) /* Match SYS5 behaviour */ + return 0; - return sock_sendmsg(sock, &msg, size); -} + x = alloc_sock_iocb(iocb, (void __user *)ubuf, count, &siocb); + if (!x) + return -ENOMEM; -static ssize_t sock_readv(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_WRITE, - file, vector, count, tot_len); -} - -static ssize_t sock_writev(struct file *file, const struct iovec *vector, - unsigned long count, loff_t *ppos) -{ - size_t tot_len = 0; - int i; - for (i = 0 ; i < count ; i++) - tot_len += vector[i].iov_len; - return sock_readv_writev(VERIFY_READ, - file, vector, count, tot_len); + return do_sock_write(&x->async_msg, iocb, iocb->ki_filp, + &x->async_iov, 1); } -- cgit v1.1 From 25995ff577675b58dbd848b7758e7bad87411947 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 02:42:22 -0200 Subject: [SOCK]: Introduce sk_receive_skb Its common enough to to justify that, TCP still can't use it as it has the prequeueing stuff, still to be made generic in the not so distant future :-) Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/net/pppoe.c | 22 ++-------------------- include/net/sock.h | 23 +++++++++++++++++++++++ net/dccp/ipv4.c | 23 ++--------------------- net/dccp/ipv6.c | 17 +---------------- net/decnet/dn_nsp_in.c | 17 +---------------- 5 files changed, 29 insertions(+), 73 deletions(-) diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index a842ecc..71e303b 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -383,8 +383,6 @@ static int pppoe_rcv(struct sk_buff *skb, { struct pppoe_hdr *ph; struct pppox_sock *po; - struct sock *sk; - int ret; if (!pskb_may_pull(skb, sizeof(struct pppoe_hdr))) goto drop; @@ -395,24 +393,8 @@ static int pppoe_rcv(struct sk_buff *skb, ph = (struct pppoe_hdr *) skb->nh.raw; po = get_item((unsigned long) ph->sid, eth_hdr(skb)->h_source); - if (!po) - goto drop; - - sk = sk_pppox(po); - bh_lock_sock(sk); - - /* Socket state is unknown, must put skb into backlog. */ - if (sock_owned_by_user(sk) != 0) { - sk_add_backlog(sk, skb); - ret = NET_RX_SUCCESS; - } else { - ret = pppoe_rcv_core(sk, skb); - } - - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + if (po != NULL) + return sk_receive_skb(sk_pppox(po), skb); drop: kfree_skb(skb); out: diff --git a/include/net/sock.h b/include/net/sock.h index 91d2895..6961700 100644 --- a/include/net/sock.h +++ b/include/net/sock.h @@ -926,6 +926,29 @@ static inline void sock_put(struct sock *sk) sk_free(sk); } +static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb) +{ + int rc = NET_RX_SUCCESS; + + if (sk_filter(sk, skb, 0)) + goto discard_and_relse; + + skb->dev = NULL; + + bh_lock_sock(sk); + if (!sock_owned_by_user(sk)) + rc = sk->sk_backlog_rcv(sk, skb); + else + sk_add_backlog(sk, skb); + bh_unlock_sock(sk); +out: + sock_put(sk); + return rc; +discard_and_relse: + kfree_skb(skb); + goto out; +} + /* Detach socket from process context. * Announce socket dead, detach it from wait queue and inode. * Note that parent inode held reference count on this struct sock, diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index c363051..99e8afa 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -914,7 +914,6 @@ int dccp_v4_rcv(struct sk_buff *skb) { const struct dccp_hdr *dh; struct sock *sk; - int rc; /* Step 1: Check header basics: */ @@ -984,28 +983,10 @@ int dccp_v4_rcv(struct sk_buff *skb) goto do_time_wait; } - if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) { - dccp_pr_debug("xfrm4_policy_check failed\n"); + if (!xfrm4_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - } - - if (sk_filter(sk, skb, 0)) { - dccp_pr_debug("sk_filter failed\n"); - goto discard_and_relse; - } - - skb->dev = NULL; - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v4_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - - sock_put(sk); - return rc; + return sk_receive_skb(sk, skb); no_dccp_socket: if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 599b0be..2e194c8 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1032,7 +1032,6 @@ static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) const struct dccp_hdr *dh; struct sk_buff *skb = *pskb; struct sock *sk; - int rc; /* Step 1: Check header basics: */ @@ -1077,21 +1076,7 @@ static int dccp_v6_rcv(struct sk_buff **pskb, unsigned int *nhoffp) if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) goto discard_and_relse; - if (sk_filter(sk, skb, 0)) - goto discard_and_relse; - - skb->dev = NULL; - - bh_lock_sock(sk); - rc = 0; - if (!sock_owned_by_user(sk)) - rc = dccp_v6_do_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - - sock_put(sk); - return rc ? -1 : 0; + return sk_receive_skb(sk, skb) ? -1 : 0; no_dccp_socket: if (!xfrm6_policy_check(NULL, XFRM_POLICY_IN, skb)) diff --git a/net/decnet/dn_nsp_in.c b/net/decnet/dn_nsp_in.c index 369f25b..44bda85 100644 --- a/net/decnet/dn_nsp_in.c +++ b/net/decnet/dn_nsp_in.c @@ -793,7 +793,6 @@ static int dn_nsp_rx_packet(struct sk_buff *skb) got_it: if (sk != NULL) { struct dn_scp *scp = DN_SK(sk); - int ret; /* Reset backoff */ scp->nsp_rxtshift = 0; @@ -807,21 +806,7 @@ got_it: goto free_out; } - bh_lock_sock(sk); - ret = NET_RX_SUCCESS; - if (decnet_debug_level & 8) - printk(KERN_DEBUG "NSP: 0x%02x 0x%02x 0x%04x 0x%04x %d\n", - (int)cb->rt_flags, (int)cb->nsp_flags, - (int)cb->src_port, (int)cb->dst_port, - !!sock_owned_by_user(sk)); - if (!sock_owned_by_user(sk)) - ret = dn_nsp_backlog_rcv(sk, skb); - else - sk_add_backlog(sk, skb); - bh_unlock_sock(sk); - sock_put(sk); - - return ret; + return sk_receive_skb(sk, skb); } return dn_nsp_no_socket(skb, reason); -- cgit v1.1 From 14c850212ed8f8cbb5972ad6b8812e08a0bc901c Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 02:43:12 -0200 Subject: [INET_SOCK]: Move struct inet_sock & helper functions to net/inet_sock.h To help in reducing the number of include dependencies, several files were touched as they were getting needed headers indirectly for stuff they use. Thanks also to Alan Menegotto for pointing out that net/dccp/proto.c had linux/dccp.h include twice. Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 2 + drivers/infiniband/ulp/ipoib/ipoib_multicast.c | 2 + drivers/net/ns83820.c | 1 + drivers/net/sk98lin/skge.c | 1 + drivers/net/skge.c | 1 + drivers/net/tg3.c | 1 + fs/9p/trans_sock.c | 1 + fs/nfs/callback.c | 3 + include/linux/dccp.h | 3 +- include/linux/ip.h | 126 +--------------- include/linux/ipv6.h | 7 +- include/linux/udp.h | 6 +- include/net/atmclip.h | 2 +- include/net/dst.h | 1 + include/net/icmp.h | 9 +- include/net/ieee80211_crypt.h | 9 +- include/net/inet_connection_sock.h | 3 +- include/net/inet_ecn.h | 2 + include/net/inet_hashtables.h | 21 +-- include/net/inet_sock.h | 193 +++++++++++++++++++++++++ include/net/inet_timewait_sock.h | 2 +- include/net/ip.h | 17 ++- include/net/ip_fib.h | 2 + include/net/ip_vs.h | 12 +- include/net/ipv6.h | 3 + include/net/ndisc.h | 17 ++- include/net/neighbour.h | 2 +- include/net/pkt_act.h | 1 - include/net/raw.h | 2 + include/net/udp.h | 4 +- include/net/xfrm.h | 3 +- net/bridge/br_netfilter.c | 4 + net/bridge/netfilter/ebt_log.c | 1 + net/core/netpoll.c | 1 + net/dccp/ccid.h | 2 + net/dccp/ipv4.c | 1 + net/dccp/ipv6.c | 1 + net/dccp/output.c | 1 + net/dccp/proto.c | 3 +- net/econet/af_econet.c | 1 + net/ipv4/af_inet.c | 1 + net/ipv4/ah4.c | 1 + net/ipv4/arp.c | 1 + net/ipv4/devinet.c | 1 + net/ipv4/esp4.c | 1 + net/ipv4/fib_frontend.c | 1 + net/ipv4/fib_hash.c | 1 + net/ipv4/fib_rules.c | 1 + net/ipv4/fib_semantics.c | 2 + net/ipv4/icmp.c | 1 + net/ipv4/igmp.c | 2 + net/ipv4/ip_input.c | 1 + net/ipv4/ip_options.c | 1 + net/ipv4/ip_sockglue.c | 1 + net/ipv4/ipcomp.c | 1 + net/ipv4/ipconfig.c | 2 + net/ipv4/ipmr.c | 1 + net/ipv4/ipvs/ip_vs_conn.c | 2 + net/ipv4/ipvs/ip_vs_ctl.c | 1 + net/ipv4/ipvs/ip_vs_dh.c | 2 + net/ipv4/ipvs/ip_vs_est.c | 3 + net/ipv4/ipvs/ip_vs_lblc.c | 2 + net/ipv4/ipvs/ip_vs_lblcr.c | 2 + net/ipv4/ipvs/ip_vs_proto_ah.c | 2 + net/ipv4/ipvs/ip_vs_proto_esp.c | 2 + net/ipv4/ipvs/ip_vs_proto_udp.c | 3 + net/ipv4/ipvs/ip_vs_sh.c | 2 + net/ipv4/ipvs/ip_vs_sync.c | 2 + net/ipv4/netfilter/ip_conntrack_amanda.c | 2 + net/ipv4/netfilter/ip_conntrack_proto_gre.c | 1 + net/ipv4/netfilter/ip_conntrack_proto_udp.c | 1 + net/ipv4/netfilter/ip_conntrack_standalone.c | 1 + net/ipv4/netfilter/ip_nat_snmp_basic.c | 2 + net/ipv4/netfilter/ipt_MASQUERADE.c | 2 + net/ipv4/netfilter/ipt_physdev.c | 1 + net/ipv4/proc.c | 1 + net/ipv4/sysctl_net_ipv4.c | 1 + net/ipv4/udp.c | 1 + net/ipv6/ah6.c | 1 + net/ipv6/esp6.c | 1 + net/ipv6/ipcomp6.c | 1 + net/ipv6/netfilter/ip6_tables.c | 1 + net/ipv6/netfilter/ip6t_LOG.c | 1 + net/ipv6/netfilter/ip6t_ah.c | 1 + net/ipv6/netfilter/ip6t_esp.c | 1 + net/sched/sch_teql.c | 1 + net/sctp/protocol.c | 1 + 87 files changed, 355 insertions(+), 184 deletions(-) create mode 100644 include/net/inet_sock.h diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 475d98f..780009c 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -47,6 +47,8 @@ #include #include +#include + MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("IP-over-InfiniBand net driver"); MODULE_LICENSE("Dual BSD/GPL"); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index ef3ee03..ed0c2ea 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -43,6 +43,8 @@ #include #include +#include + #include "ipoib.h" #ifdef CONFIG_INFINIBAND_IPOIB_DEBUG diff --git a/drivers/net/ns83820.c b/drivers/net/ns83820.c index f857ae9..b0c3b6a 100644 --- a/drivers/net/ns83820.c +++ b/drivers/net/ns83820.c @@ -115,6 +115,7 @@ #include #include #include +#include #include #include diff --git a/drivers/net/sk98lin/skge.c b/drivers/net/sk98lin/skge.c index ae73439..e1a2d52 100644 --- a/drivers/net/sk98lin/skge.c +++ b/drivers/net/sk98lin/skge.c @@ -107,6 +107,7 @@ #include "h/skversion.h" +#include #include #include #include diff --git a/drivers/net/skge.c b/drivers/net/skge.c index 00d6830..d8cc3ae 100644 --- a/drivers/net/skge.c +++ b/drivers/net/skge.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 2fc9893..59d916c 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include diff --git a/fs/9p/trans_sock.c b/fs/9p/trans_sock.c index a93c2bf..6a9a75d 100644 --- a/fs/9p/trans_sock.c +++ b/fs/9p/trans_sock.c @@ -26,6 +26,7 @@ */ #include +#include #include #include #include diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index f2ca782..30cae36 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -14,6 +14,9 @@ #include #include #include + +#include + #include "nfs4_fs.h" #include "callback.h" diff --git a/include/linux/dccp.h b/include/linux/dccp.h index d0bdb49..088529f 100644 --- a/include/linux/dccp.h +++ b/include/linux/dccp.h @@ -192,10 +192,9 @@ enum { #include #include +#include #include -#include #include -#include enum dccp_state { DCCP_OPEN = TCP_ESTABLISHED, diff --git a/include/linux/ip.h b/include/linux/ip.h index 6ccc596..9e2eb9a 100644 --- a/include/linux/ip.h +++ b/include/linux/ip.h @@ -16,6 +16,7 @@ */ #ifndef _LINUX_IP_H #define _LINUX_IP_H +#include #include #define IPTOS_TOS_MASK 0x1E @@ -78,131 +79,6 @@ #define IPOPT_TS_TSANDADDR 1 /* timestamps and addresses */ #define IPOPT_TS_PRESPEC 3 /* specified modules only */ -#ifdef __KERNEL__ -#include -#include -#include -#include -#include -#include - -struct ip_options { - __u32 faddr; /* Saved first hop address */ - unsigned char optlen; - unsigned char srr; - unsigned char rr; - unsigned char ts; - unsigned char is_setbyuser:1, /* Set by setsockopt? */ - is_data:1, /* Options in __data, rather than skb */ - is_strictroute:1, /* Strict source route */ - srr_is_hit:1, /* Packet destination addr was our one */ - is_changed:1, /* IP checksum more not valid */ - rr_needaddr:1, /* Need to record addr of outgoing dev */ - ts_needtime:1, /* Need to record timestamp */ - ts_needaddr:1; /* Need to record addr of outgoing dev */ - unsigned char router_alert; - unsigned char __pad1; - unsigned char __pad2; - unsigned char __data[0]; -}; - -#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) - -struct inet_request_sock { - struct request_sock req; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - u16 inet6_rsk_offset; - /* 2 bytes hole, try to pack */ -#endif - u32 loc_addr; - u32 rmt_addr; - u16 rmt_port; - u16 snd_wscale : 4, - rcv_wscale : 4, - tstamp_ok : 1, - sack_ok : 1, - wscale_ok : 1, - ecn_ok : 1, - acked : 1; - struct ip_options *opt; -}; - -static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) -{ - return (struct inet_request_sock *)sk; -} - -struct ipv6_pinfo; - -struct inet_sock { - /* sk and pinet6 has to be the first two members of inet_sock */ - struct sock sk; -#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) - struct ipv6_pinfo *pinet6; -#endif - /* Socket demultiplex comparisons on incoming packets. */ - __u32 daddr; /* Foreign IPv4 addr */ - __u32 rcv_saddr; /* Bound local IPv4 addr */ - __u16 dport; /* Destination port */ - __u16 num; /* Local port */ - __u32 saddr; /* Sending source */ - __s16 uc_ttl; /* Unicast TTL */ - __u16 cmsg_flags; - struct ip_options *opt; - __u16 sport; /* Source port */ - __u16 id; /* ID counter for DF pkts */ - __u8 tos; /* TOS */ - __u8 mc_ttl; /* Multicasting TTL */ - __u8 pmtudisc; - unsigned recverr : 1, - is_icsk : 1, /* inet_connection_sock? */ - freebind : 1, - hdrincl : 1, - mc_loop : 1; - int mc_index; /* Multicast device index */ - __u32 mc_addr; - struct ip_mc_socklist *mc_list; /* Group array */ - /* - * Following members are used to retain the infomation to build - * an ip header on each ip fragmentation while the socket is corked. - */ - struct { - unsigned int flags; - unsigned int fragsize; - struct ip_options *opt; - struct rtable *rt; - int length; /* Total length of all frames */ - u32 addr; - struct flowi fl; - } cork; -}; - -#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ -#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ - -static inline struct inet_sock *inet_sk(const struct sock *sk) -{ - return (struct inet_sock *)sk; -} - -static inline void __inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from, - const int ancestor_size) -{ - memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, - sk_from->sk_prot->obj_size - ancestor_size); -} -#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) -static inline void inet_sk_copy_descendant(struct sock *sk_to, - const struct sock *sk_from) -{ - __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); -} -#endif -#endif - -extern int inet_sk_rebuild_header(struct sock *sk); - struct iphdr { #if defined(__LITTLE_ENDIAN_BITFIELD) __u8 ihl:4, diff --git a/include/linux/ipv6.h b/include/linux/ipv6.h index a0d0489..93bbed5 100644 --- a/include/linux/ipv6.h +++ b/include/linux/ipv6.h @@ -171,12 +171,13 @@ enum { }; #ifdef __KERNEL__ -#include /* struct sockaddr_in6 */ #include -#include /* struct ipv6_mc_socklist */ #include #include +#include /* struct ipv6_mc_socklist */ +#include + /* This structure contains results of exthdrs parsing as offsets from skb->nh. @@ -346,8 +347,6 @@ static inline void inet_sk_copy_descendant(struct sock *sk_to, #define __ipv6_only_sock(sk) (inet6_sk(sk)->ipv6only) #define ipv6_only_sock(sk) ((sk)->sk_family == PF_INET6 && __ipv6_only_sock(sk)) -#include - struct inet6_timewait_sock { struct in6_addr tw_v6_daddr; struct in6_addr tw_v6_rcv_saddr; diff --git a/include/linux/udp.h b/include/linux/udp.h index b60e0b4..85a5565 100644 --- a/include/linux/udp.h +++ b/include/linux/udp.h @@ -35,10 +35,10 @@ struct udphdr { #define UDP_ENCAP_ESPINUDP 2 /* draft-ietf-ipsec-udp-encaps-06 */ #ifdef __KERNEL__ - #include -#include -#include +#include + +#include struct udp_sock { /* inet_sock has to be the first member */ diff --git a/include/net/atmclip.h b/include/net/atmclip.h index 47048b1..90fcc98 100644 --- a/include/net/atmclip.h +++ b/include/net/atmclip.h @@ -7,7 +7,6 @@ #define _ATMCLIP_H #include -#include #include #include #include @@ -18,6 +17,7 @@ #define CLIP_VCC(vcc) ((struct clip_vcc *) ((vcc)->user_back)) #define NEIGH2ENTRY(neigh) ((struct atmarp_entry *) (neigh)->primary_key) +struct sk_buff; struct clip_vcc { struct atm_vcc *vcc; /* VCC descriptor */ diff --git a/include/net/dst.h b/include/net/dst.h index 6c196a5..bee8b84d 100644 --- a/include/net/dst.h +++ b/include/net/dst.h @@ -9,6 +9,7 @@ #define _NET_DST_H #include +#include #include #include #include diff --git a/include/net/icmp.h b/include/net/icmp.h index 6cdebee..e7c3f20 100644 --- a/include/net/icmp.h +++ b/include/net/icmp.h @@ -20,12 +20,9 @@ #include #include -#include -#include -#include +#include #include -#include struct icmp_err { int errno; @@ -38,6 +35,10 @@ DECLARE_SNMP_STAT(struct icmp_mib, icmp_statistics); #define ICMP_INC_STATS_BH(field) SNMP_INC_STATS_BH(icmp_statistics, field) #define ICMP_INC_STATS_USER(field) SNMP_INC_STATS_USER(icmp_statistics, field) +struct dst_entry; +struct net_proto_family; +struct sk_buff; + extern void icmp_send(struct sk_buff *skb_in, int type, int code, u32 info); extern int icmp_rcv(struct sk_buff *skb); extern int icmp_ioctl(struct sock *sk, int cmd, unsigned long arg); diff --git a/include/net/ieee80211_crypt.h b/include/net/ieee80211_crypt.h index 225fc751..03b766a 100644 --- a/include/net/ieee80211_crypt.h +++ b/include/net/ieee80211_crypt.h @@ -23,12 +23,17 @@ #ifndef IEEE80211_CRYPT_H #define IEEE80211_CRYPT_H -#include +#include +#include +#include enum { IEEE80211_CRYPTO_TKIP_COUNTERMEASURES = (1 << 0), }; +struct sk_buff; +struct module; + struct ieee80211_crypto_ops { const char *name; struct list_head list; @@ -87,6 +92,8 @@ struct ieee80211_crypt_data { atomic_t refcnt; }; +struct ieee80211_device; + int ieee80211_register_crypto_ops(struct ieee80211_crypto_ops *ops); int ieee80211_unregister_crypto_ops(struct ieee80211_crypto_ops *ops); struct ieee80211_crypto_ops *ieee80211_get_crypto_ops(const char *name); diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h index 9188896..50234fa 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -16,9 +16,10 @@ #define _INET_CONNECTION_SOCK_H #include -#include #include #include + +#include #include #define INET_CSK_DEBUG 1 diff --git a/include/net/inet_ecn.h b/include/net/inet_ecn.h index b0c47e2..d599c6b 100644 --- a/include/net/inet_ecn.h +++ b/include/net/inet_ecn.h @@ -3,6 +3,8 @@ #include #include + +#include #include enum { diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h index c83baa7..135d80f 100644 --- a/include/net/inet_hashtables.h +++ b/include/net/inet_hashtables.h @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -128,26 +129,6 @@ struct inet_hashinfo { kmem_cache_t *bind_bucket_cachep; }; -static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, - const __u32 faddr, const __u16 fport) -{ - unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); - h ^= h >> 16; - h ^= h >> 8; - return h; -} - -static inline int inet_sk_ehashfn(const struct sock *sk) -{ - const struct inet_sock *inet = inet_sk(sk); - const __u32 laddr = inet->rcv_saddr; - const __u16 lport = inet->num; - const __u32 faddr = inet->daddr; - const __u16 fport = inet->dport; - - return inet_ehashfn(laddr, lport, faddr, fport); -} - static inline struct inet_ehash_bucket *inet_ehash_bucket( struct inet_hashinfo *hashinfo, unsigned int hash) diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h new file mode 100644 index 0000000..883eb52 --- /dev/null +++ b/include/net/inet_sock.h @@ -0,0 +1,193 @@ +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for inet_sock + * + * Authors: Many, reorganised here by + * Arnaldo Carvalho de Melo + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _INET_SOCK_H +#define _INET_SOCK_H + +#include + +#include +#include + +#include +#include +#include + +/** struct ip_options - IP Options + * + * @faddr - Saved first hop address + * @is_setbyuser - Set by setsockopt? + * @is_data - Options in __data, rather than skb + * @is_strictroute - Strict source route + * @srr_is_hit - Packet destination addr was our one + * @is_changed - IP checksum more not valid + * @rr_needaddr - Need to record addr of outgoing dev + * @ts_needtime - Need to record timestamp + * @ts_needaddr - Need to record addr of outgoing dev + */ +struct ip_options { + __u32 faddr; + unsigned char optlen; + unsigned char srr; + unsigned char rr; + unsigned char ts; + unsigned char is_setbyuser:1, + is_data:1, + is_strictroute:1, + srr_is_hit:1, + is_changed:1, + rr_needaddr:1, + ts_needtime:1, + ts_needaddr:1; + unsigned char router_alert; + unsigned char __pad1; + unsigned char __pad2; + unsigned char __data[0]; +}; + +#define optlength(opt) (sizeof(struct ip_options) + opt->optlen) + +struct inet_request_sock { + struct request_sock req; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + u16 inet6_rsk_offset; + /* 2 bytes hole, try to pack */ +#endif + u32 loc_addr; + u32 rmt_addr; + u16 rmt_port; + u16 snd_wscale : 4, + rcv_wscale : 4, + tstamp_ok : 1, + sack_ok : 1, + wscale_ok : 1, + ecn_ok : 1, + acked : 1; + struct ip_options *opt; +}; + +static inline struct inet_request_sock *inet_rsk(const struct request_sock *sk) +{ + return (struct inet_request_sock *)sk; +} + +struct ip_mc_socklist; +struct ipv6_pinfo; +struct rtable; + +/** struct inet_sock - representation of INET sockets + * + * @sk - ancestor class + * @pinet6 - pointer to IPv6 control block + * @daddr - Foreign IPv4 addr + * @rcv_saddr - Bound local IPv4 addr + * @dport - Destination port + * @num - Local port + * @saddr - Sending source + * @uc_ttl - Unicast TTL + * @sport - Source port + * @id - ID counter for DF pkts + * @tos - TOS + * @mc_ttl - Multicasting TTL + * @is_icsk - is this an inet_connection_sock? + * @mc_index - Multicast device index + * @mc_list - Group array + * @cork - info to build ip hdr on each ip frag while socket is corked + */ +struct inet_sock { + /* sk and pinet6 has to be the first two members of inet_sock */ + struct sock sk; +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) + struct ipv6_pinfo *pinet6; +#endif + /* Socket demultiplex comparisons on incoming packets. */ + __u32 daddr; + __u32 rcv_saddr; + __u16 dport; + __u16 num; + __u32 saddr; + __s16 uc_ttl; + __u16 cmsg_flags; + struct ip_options *opt; + __u16 sport; + __u16 id; + __u8 tos; + __u8 mc_ttl; + __u8 pmtudisc; + __u8 recverr:1, + is_icsk:1, + freebind:1, + hdrincl:1, + mc_loop:1; + int mc_index; + __u32 mc_addr; + struct ip_mc_socklist *mc_list; + struct { + unsigned int flags; + unsigned int fragsize; + struct ip_options *opt; + struct rtable *rt; + int length; /* Total length of all frames */ + u32 addr; + struct flowi fl; + } cork; +}; + +#define IPCORK_OPT 1 /* ip-options has been held in ipcork.opt */ +#define IPCORK_ALLFRAG 2 /* always fragment (for ipv6 for now) */ + +static inline struct inet_sock *inet_sk(const struct sock *sk) +{ + return (struct inet_sock *)sk; +} + +static inline void __inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from, + const int ancestor_size) +{ + memcpy(inet_sk(sk_to) + 1, inet_sk(sk_from) + 1, + sk_from->sk_prot->obj_size - ancestor_size); +} +#if !(defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) +static inline void inet_sk_copy_descendant(struct sock *sk_to, + const struct sock *sk_from) +{ + __inet_sk_copy_descendant(sk_to, sk_from, sizeof(struct inet_sock)); +} +#endif + +extern int inet_sk_rebuild_header(struct sock *sk); + +static inline unsigned int inet_ehashfn(const __u32 laddr, const __u16 lport, + const __u32 faddr, const __u16 fport) +{ + unsigned int h = (laddr ^ lport) ^ (faddr ^ fport); + h ^= h >> 16; + h ^= h >> 8; + return h; +} + +static inline int inet_sk_ehashfn(const struct sock *sk) +{ + const struct inet_sock *inet = inet_sk(sk); + const __u32 laddr = inet->rcv_saddr; + const __u16 lport = inet->num; + const __u32 faddr = inet->daddr; + const __u16 fport = inet->dport; + + return inet_ehashfn(laddr, lport, faddr, fport); +} + +#endif /* _INET_SOCK_H */ diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index e396a65..1da294c 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -17,13 +17,13 @@ #include -#include #include #include #include #include #include +#include #include #include #include diff --git a/include/net/ip.h b/include/net/ip.h index 4d6294ba..f7e7fd7 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -24,14 +24,10 @@ #include #include -#include #include #include -#include -#include -#include -#include -#include + +#include #include struct sock; @@ -75,6 +71,13 @@ extern rwlock_t ip_ra_lock; #define IP_FRAG_TIME (30 * HZ) /* fragment lifetime */ +struct msghdr; +struct net_device; +struct packet_type; +struct rtable; +struct sk_buff; +struct sockaddr; + extern void ip_mc_dropsocket(struct sock *); extern void ip_mc_dropdevice(struct net_device *dev); extern int igmp_mc_proc_init(void); @@ -184,6 +187,8 @@ extern int sysctl_ip_dynaddr; extern void ipfrag_init(void); #ifdef CONFIG_INET +#include + /* The function in 2.2 was invalid, producing wrong result for * check=0xFEFF. It was noticed by Arthur Skawina _year_ ago. --ANK(000625) */ static inline diff --git a/include/net/ip_fib.h b/include/net/ip_fib.h index 14de4eb..e000fa2 100644 --- a/include/net/ip_fib.h +++ b/include/net/ip_fib.h @@ -238,6 +238,8 @@ extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, struct net_device *dev, u32 *spec_dst, u32 *itag); extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); +struct rtentry; + /* Exported by fib_semantics.c */ extern int ip_fib_check_default(u32 gw, struct net_device *dev); extern int fib_sync_down(u32 local, struct net_device *dev, int force); diff --git a/include/net/ip_vs.h b/include/net/ip_vs.h index 3b5559a..7d2674f 100644 --- a/include/net/ip_vs.h +++ b/include/net/ip_vs.h @@ -251,16 +251,15 @@ struct ip_vs_daemon_user { #include #include /* for struct list_head */ #include /* for struct rwlock_t */ -#include /* for struct sk_buff */ -#include /* for struct iphdr */ #include /* for struct atomic_t */ -#include /* for struct neighbour */ -#include /* for struct dst_entry */ -#include #include +#include +#include #ifdef CONFIG_IP_VS_DEBUG +#include + extern int ip_vs_get_debug_level(void); #define IP_VS_DBG(level, msg...) \ do { \ @@ -429,8 +428,11 @@ struct ip_vs_stats spinlock_t lock; /* spin lock */ }; +struct dst_entry; +struct iphdr; struct ip_vs_conn; struct ip_vs_app; +struct sk_buff; struct ip_vs_protocol { struct ip_vs_protocol *next; diff --git a/include/net/ipv6.h b/include/net/ipv6.h index 11a7256..860bbac 100644 --- a/include/net/ipv6.h +++ b/include/net/ipv6.h @@ -541,6 +541,9 @@ extern int sysctl_ip6frag_secret_interval; extern const struct proto_ops inet6_stream_ops; extern const struct proto_ops inet6_dgram_ops; +struct group_source_req; +struct group_filter; + extern int ip6_mc_source(int add, int omode, struct sock *sk, struct group_source_req *pgsr); extern int ip6_mc_msfilter(struct sock *sk, struct group_filter *gsf); diff --git a/include/net/ndisc.h b/include/net/ndisc.h index f85d6e4..bbac87e 100644 --- a/include/net/ndisc.h +++ b/include/net/ndisc.h @@ -35,11 +35,20 @@ enum { #ifdef __KERNEL__ -#include -#include +#include +#include #include +#include +#include + #include -#include + +struct ctl_table; +struct file; +struct inet6_dev; +struct net_device; +struct net_proto_family; +struct sk_buff; extern struct neigh_table nd_tbl; @@ -108,7 +117,7 @@ extern int igmp6_event_report(struct sk_buff *skb); extern void igmp6_cleanup(void); #ifdef CONFIG_SYSCTL -extern int ndisc_ifinfo_sysctl_change(ctl_table *ctl, +extern int ndisc_ifinfo_sysctl_change(struct ctl_table *ctl, int write, struct file * filp, void __user *buffer, diff --git a/include/net/neighbour.h b/include/net/neighbour.h index 34c0773..6fa9ae1 100644 --- a/include/net/neighbour.h +++ b/include/net/neighbour.h @@ -49,8 +49,8 @@ #ifdef __KERNEL__ #include -#include #include +#include #include #include diff --git a/include/net/pkt_act.h b/include/net/pkt_act.h index bd08964..b225d84 100644 --- a/include/net/pkt_act.h +++ b/include/net/pkt_act.h @@ -15,7 +15,6 @@ #include #include #include -#include #include #include #include diff --git a/include/net/raw.h b/include/net/raw.h index f479174..e67b28a 100644 --- a/include/net/raw.h +++ b/include/net/raw.h @@ -19,6 +19,8 @@ #include +#include + extern struct proto raw_prot; extern void raw_err(struct sock *, struct sk_buff *, u32 info); diff --git a/include/net/udp.h b/include/net/udp.h index 107b9d7..766fba1 100644 --- a/include/net/udp.h +++ b/include/net/udp.h @@ -22,9 +22,8 @@ #ifndef _UDP_H #define _UDP_H -#include -#include #include +#include #include #include #include @@ -62,6 +61,7 @@ static inline int udp_lport_inuse(u16 num) extern struct proto udp_prot; +struct sk_buff; extern void udp_err(struct sk_buff *, u32); diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 487abca..07d7b50 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -2,11 +2,12 @@ #define _NET_XFRM_H #include +#include #include #include #include #include -#include +#include #include #include #include diff --git a/net/bridge/br_netfilter.c b/net/bridge/br_netfilter.c index 23422bd..223f827 100644 --- a/net/bridge/br_netfilter.c +++ b/net/bridge/br_netfilter.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -33,8 +34,11 @@ #include #include #include + #include #include +#include + #include #include #include "br_private.h" diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c index c436e6c..9f6e019 100644 --- a/net/bridge/netfilter/ebt_log.c +++ b/net/bridge/netfilter/ebt_log.c @@ -9,6 +9,7 @@ * */ +#include #include #include #include diff --git a/net/core/netpoll.c b/net/core/netpoll.c index 49424a4..281a632 100644 --- a/net/core/netpoll.c +++ b/net/core/netpoll.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/ccid.h b/net/dccp/ccid.h index c37eeea..de681c6 100644 --- a/net/dccp/ccid.h +++ b/net/dccp/ccid.h @@ -21,6 +21,8 @@ #define CCID_MAX 255 +struct tcp_info; + struct ccid { unsigned char ccid_id; const char *ccid_name; diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 99e8afa..3f24467 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -19,6 +19,7 @@ #include #include +#include #include #include #include diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index 2e194c8..c609dc7 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -20,6 +20,7 @@ #include #include #include +#include #include #include #include diff --git a/net/dccp/output.c b/net/dccp/output.c index 95a3c2c..efd7ffb 100644 --- a/net/dccp/output.c +++ b/net/dccp/output.c @@ -15,6 +15,7 @@ #include #include +#include #include #include "ackvec.h" diff --git a/net/dccp/proto.c b/net/dccp/proto.c index e4e629e..65b11ea 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,7 +24,7 @@ #include #include -#include +#include #include #include #include @@ -34,7 +34,6 @@ #include #include #include -#include #include "ccid.h" #include "dccp.h" diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index ff58f49..70fb2b8 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 4ed8a81..36a6306 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -93,6 +93,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index 035ad2c..aed537f 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -6,6 +6,7 @@ #include #include #include +#include #include diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index b425748..3743208 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 04a6fe3..7b9bb28 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -58,6 +58,7 @@ #endif #include +#include #include #include #include diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 1b18ce6..73bfcae 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -9,6 +9,7 @@ #include #include #include +#include #include /* decapsulation data for use when post-processing */ diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 19b1b98..18f5e50 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c index 7ea0209c..e2890ec 100644 --- a/net/ipv4/fib_hash.c +++ b/net/ipv4/fib_hash.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c index 0b298bb..0dd4d06 100644 --- a/net/ipv4/fib_rules.c +++ b/net/ipv4/fib_rules.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 6d2a6ac..ef4724d 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include #include @@ -36,6 +37,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index 92e23b2..be5a519 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -73,6 +73,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 4a195c7..3475811 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -91,6 +91,8 @@ #include #include #include + +#include #include #include #include diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 473d0f2..e45846a 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -128,6 +128,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c index dbe12da..d3f6c46 100644 --- a/net/ipv4/ip_options.c +++ b/net/ipv4/ip_options.c @@ -22,6 +22,7 @@ #include #include #include +#include /* * Write options to IP header, record destination address to diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index add019c..6986e11d 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -25,6 +25,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index fc718df..d64e2ec 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -28,6 +28,7 @@ #include #include #include +#include struct ipcomp_tfms { struct list_head list; diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c index e8674ba..bb3613e 100644 --- a/net/ipv4/ipconfig.c +++ b/net/ipv4/ipconfig.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -58,6 +59,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 302b7eb..caa3b7d 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -52,6 +52,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index 2a3a8c5..d35cea3 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -24,7 +24,9 @@ * */ +#include #include +#include #include #include /* for proc_net_* */ #include diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index 9bdcf31..fe2c39d 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -35,6 +35,7 @@ #include #include +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_dh.c b/net/ipv4/ipvs/ip_vs_dh.c index f3bc320..9fee19c 100644 --- a/net/ipv4/ipvs/ip_vs_dh.c +++ b/net/ipv4/ipvs/ip_vs_dh.c @@ -37,8 +37,10 @@ * */ +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_est.c b/net/ipv4/ipvs/ip_vs_est.c index 67b3e2f..e700474 100644 --- a/net/ipv4/ipvs/ip_vs_est.c +++ b/net/ipv4/ipvs/ip_vs_est.c @@ -13,7 +13,10 @@ * Changes: * */ +#include #include +#include +#include #include #include diff --git a/net/ipv4/ipvs/ip_vs_lblc.c b/net/ipv4/ipvs/ip_vs_lblc.c index b6dad7e..6e5cb92 100644 --- a/net/ipv4/ipvs/ip_vs_lblc.c +++ b/net/ipv4/ipvs/ip_vs_lblc.c @@ -41,8 +41,10 @@ * me to write this module. */ +#include #include #include +#include /* for sysctl */ #include diff --git a/net/ipv4/ipvs/ip_vs_lblcr.c b/net/ipv4/ipvs/ip_vs_lblcr.c index 8c78ef7..32ba37b 100644 --- a/net/ipv4/ipvs/ip_vs_lblcr.c +++ b/net/ipv4/ipvs/ip_vs_lblcr.c @@ -39,8 +39,10 @@ * */ +#include #include #include +#include /* for sysctl */ #include diff --git a/net/ipv4/ipvs/ip_vs_proto_ah.c b/net/ipv4/ipvs/ip_vs_proto_ah.c index 453e94a..8b0505b 100644 --- a/net/ipv4/ipvs/ip_vs_proto_ah.c +++ b/net/ipv4/ipvs/ip_vs_proto_ah.c @@ -12,6 +12,8 @@ * */ +#include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_esp.c b/net/ipv4/ipvs/ip_vs_proto_esp.c index 478e5c7..c36ccf0 100644 --- a/net/ipv4/ipvs/ip_vs_proto_esp.c +++ b/net/ipv4/ipvs/ip_vs_proto_esp.c @@ -12,6 +12,8 @@ * */ +#include +#include #include #include #include diff --git a/net/ipv4/ipvs/ip_vs_proto_udp.c b/net/ipv4/ipvs/ip_vs_proto_udp.c index 8ae5f2e..89d9175 100644 --- a/net/ipv4/ipvs/ip_vs_proto_udp.c +++ b/net/ipv4/ipvs/ip_vs_proto_udp.c @@ -15,8 +15,11 @@ * */ +#include +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_sh.c b/net/ipv4/ipvs/ip_vs_sh.c index 6f7c50e..7775e6c 100644 --- a/net/ipv4/ipvs/ip_vs_sh.c +++ b/net/ipv4/ipvs/ip_vs_sh.c @@ -34,8 +34,10 @@ * */ +#include #include #include +#include #include diff --git a/net/ipv4/ipvs/ip_vs_sync.c b/net/ipv4/ipvs/ip_vs_sync.c index 2e5ced3..1bca714 100644 --- a/net/ipv4/ipvs/ip_vs_sync.c +++ b/net/ipv4/ipvs/ip_vs_sync.c @@ -21,12 +21,14 @@ #include #include +#include #include #include #include #include #include #include /* for ip_mc_join_group */ +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_amanda.c b/net/ipv4/netfilter/ip_conntrack_amanda.c index e52847f..0366eed 100644 --- a/net/ipv4/netfilter/ip_conntrack_amanda.c +++ b/net/ipv4/netfilter/ip_conntrack_amanda.c @@ -18,11 +18,13 @@ * */ +#include #include #include #include #include #include +#include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_proto_gre.c b/net/ipv4/netfilter/ip_conntrack_proto_gre.c index 744abb9..57956de 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_gre.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_gre.c @@ -31,6 +31,7 @@ #include #include #include +#include static DEFINE_RWLOCK(ip_ct_gre_lock); #define ASSERT_READ_LOCK(x) diff --git a/net/ipv4/netfilter/ip_conntrack_proto_udp.c b/net/ipv4/netfilter/ip_conntrack_proto_udp.c index f2dcac7..46becbe 100644 --- a/net/ipv4/netfilter/ip_conntrack_proto_udp.c +++ b/net/ipv4/netfilter/ip_conntrack_proto_udp.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ip_conntrack_standalone.c b/net/ipv4/netfilter/ip_conntrack_standalone.c index dd476b1..a88bcc5 100644 --- a/net/ipv4/netfilter/ip_conntrack_standalone.c +++ b/net/ipv4/netfilter/ip_conntrack_standalone.c @@ -27,6 +27,7 @@ #endif #include #include +#include #define ASSERT_READ_LOCK(x) #define ASSERT_WRITE_LOCK(x) diff --git a/net/ipv4/netfilter/ip_nat_snmp_basic.c b/net/ipv4/netfilter/ip_nat_snmp_basic.c index 8acb7ed..4f95d47 100644 --- a/net/ipv4/netfilter/ip_nat_snmp_basic.c +++ b/net/ipv4/netfilter/ip_nat_snmp_basic.c @@ -44,6 +44,7 @@ * */ #include +#include #include #include #include @@ -53,6 +54,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ipt_MASQUERADE.c b/net/ipv4/netfilter/ipt_MASQUERADE.c index 275a174..2786051 100644 --- a/net/ipv4/netfilter/ipt_MASQUERADE.c +++ b/net/ipv4/netfilter/ipt_MASQUERADE.c @@ -11,6 +11,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/netfilter/ipt_physdev.c b/net/ipv4/netfilter/ipt_physdev.c index 1a53924..03f5548 100644 --- a/net/ipv4/netfilter/ipt_physdev.c +++ b/net/ipv4/netfilter/ipt_physdev.c @@ -9,6 +9,7 @@ */ #include +#include #include #include #include diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0d7dc66..39d49dc 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -38,6 +38,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index dbf8295..16984d4 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 67c0363..223abaa 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -86,6 +86,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index f362973..13cc7f8 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -33,6 +33,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8bfbe99..6de8ee1 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -36,6 +36,7 @@ #include #include #include +#include #include static int esp6_output(struct xfrm_state *x, struct sk_buff *skb) diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 55917fb..626dd39 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -47,6 +47,7 @@ #include #include #include +#include #include #include diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c index dd80020..ea43ef1 100644 --- a/net/ipv6/netfilter/ip6_tables.c +++ b/net/ipv6/netfilter/ip6_tables.c @@ -15,6 +15,7 @@ * - new extension header parser code */ #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_LOG.c b/net/ipv6/netfilter/ip6t_LOG.c index 0cd1d1b..ae4653b 100644 --- a/net/ipv6/netfilter/ip6t_LOG.c +++ b/net/ipv6/netfilter/ip6t_LOG.c @@ -13,6 +13,7 @@ #include #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_ah.c b/net/ipv6/netfilter/ip6t_ah.c index dde3779..268918d 100644 --- a/net/ipv6/netfilter/ip6t_ah.c +++ b/net/ipv6/netfilter/ip6t_ah.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/net/ipv6/netfilter/ip6t_esp.c b/net/ipv6/netfilter/ip6t_esp.c index 24bc0cd..65937de 100644 --- a/net/ipv6/netfilter/ip6t_esp.c +++ b/net/ipv6/netfilter/ip6t_esp.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include diff --git a/net/sched/sch_teql.c b/net/sched/sch_teql.c index 6cf0342..c4a2a8c 100644 --- a/net/sched/sch_teql.c +++ b/net/sched/sch_teql.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c index d1b0747..de693b4 100644 --- a/net/sctp/protocol.c +++ b/net/sctp/protocol.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From 8639a11e23d9eb0a6ceac2feed27acdfbb158f95 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Tue, 27 Dec 2005 15:17:57 -0200 Subject: [TCP]: Don't use __constant_htonl for a non const arg Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- include/net/tcp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 176221c..3699304 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -994,11 +994,11 @@ static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; int this_sack; - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + - (tp->rx_opt.eff_sacks * TCPOLEN_SACK_PERBLOCK))); + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { *ptr++ = htonl(sp[this_sack].start_seq); *ptr++ = htonl(sp[this_sack].end_seq); -- cgit v1.1 From 17ba15fb6264f27374bc87f4c3f8519b80289d85 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 27 Dec 2005 20:57:40 -0800 Subject: [PPPOX]: Fix assignment into const proto_ops. And actually, with this, the whole pppox layer can basically be removed and subsumed into pppoe.c, no other pppox sub-protocol implementation exists and we've had this thing for at least 4 years. Signed-off-by: David S. Miller --- drivers/net/pppoe.c | 9 ++++----- drivers/net/pppox.c | 10 +++------- include/linux/if_pppox.h | 3 +-- 3 files changed, 8 insertions(+), 14 deletions(-) diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index 71e303b..9369f81 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -85,7 +85,7 @@ static int pppoe_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) static int pppoe_xmit(struct ppp_channel *chan, struct sk_buff *skb); static int __pppoe_xmit(struct sock *sk, struct sk_buff *skb); -static struct proto_ops pppoe_ops; +static const struct proto_ops pppoe_ops; static DEFINE_RWLOCK(pppoe_hash_lock); static struct ppp_channel_ops pppoe_chan_ops; @@ -1063,9 +1063,7 @@ static int __init pppoe_proc_init(void) static inline int pppoe_proc_init(void) { return 0; } #endif /* CONFIG_PROC_FS */ -/* ->ioctl are set at pppox_create */ - -static struct proto_ops pppoe_ops = { +static const struct proto_ops pppoe_ops = { .family = AF_PPPOX, .owner = THIS_MODULE, .release = pppoe_release, @@ -1081,7 +1079,8 @@ static struct proto_ops pppoe_ops = { .getsockopt = sock_no_getsockopt, .sendmsg = pppoe_sendmsg, .recvmsg = pppoe_recvmsg, - .mmap = sock_no_mmap + .mmap = sock_no_mmap, + .ioctl = pppox_ioctl, }; static struct pppox_proto pppoe_proto = { diff --git a/drivers/net/pppox.c b/drivers/net/pppox.c index 0c1e114..9315046 100644 --- a/drivers/net/pppox.c +++ b/drivers/net/pppox.c @@ -68,8 +68,7 @@ EXPORT_SYMBOL(register_pppox_proto); EXPORT_SYMBOL(unregister_pppox_proto); EXPORT_SYMBOL(pppox_unbind_sock); -static int pppox_ioctl(struct socket* sock, unsigned int cmd, - unsigned long arg) +int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; struct pppox_sock *po = pppox_sk(sk); @@ -105,6 +104,7 @@ static int pppox_ioctl(struct socket* sock, unsigned int cmd, return rc; } +EXPORT_SYMBOL(pppox_ioctl); static int pppox_create(struct socket *sock, int protocol) { @@ -119,11 +119,7 @@ static int pppox_create(struct socket *sock, int protocol) goto out; rc = pppox_protos[protocol]->create(sock); - if (!rc) { - /* We get to set the ioctl handler. */ - /* For everything else, pppox is just a shell. */ - sock->ops->ioctl = pppox_ioctl; - } + module_put(pppox_protos[protocol]->owner); out: return rc; diff --git a/include/linux/if_pppox.h b/include/linux/if_pppox.h index e677f73..4fab3d0 100644 --- a/include/linux/if_pppox.h +++ b/include/linux/if_pppox.h @@ -157,8 +157,7 @@ struct pppox_proto { extern int register_pppox_proto(int proto_num, struct pppox_proto *pp); extern void unregister_pppox_proto(int proto_num); extern void pppox_unbind_sock(struct sock *sk);/* delete ppp-channel binding */ -extern int pppox_channel_ioctl(struct ppp_channel *pc, unsigned int cmd, - unsigned long arg); +extern int pppox_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg); /* PPPoX socket states */ enum { -- cgit v1.1 From 4947d3ef8de7b4f42aed6ea9ba689dc8fb45b5a5 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 3 Jan 2006 14:06:50 -0800 Subject: [NET]: Speed up __alloc_skb() From: Benjamin LaHaise In __alloc_skb(), the use of skb_shinfo() which casts a u8 * to the shared info structure results in gcc being forced to do a reload of the pointer since it has no information on possible aliasing. Fix this by using a pointer to refer to skb_shared_info. By initializing skb_shared_info sequentially, the write combining buffers can reduce the number of memory transactions to a single write. Reorder the initialization in __alloc_skb() to match the structure definition. There is also an alignment issue on 64 bit systems with skb_shared_info by converting nr_frags to a short everything packs up nicely. Also, pass the slab cache pointer according to the fclone flag instead of using two almost identical function calls. This raises bw_unix performance up to a peak of 707KB/s when combined with the spinlock patch. It should help other networking protocols, too. Signed-off-by: David S. Miller --- include/linux/skbuff.h | 2 +- net/core/skbuff.c | 27 +++++++++++++-------------- 2 files changed, 14 insertions(+), 15 deletions(-) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 9716771..483cfc4 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -133,7 +133,7 @@ struct skb_frag_struct { */ struct skb_shared_info { atomic_t dataref; - unsigned int nr_frags; + unsigned short nr_frags; unsigned short tso_size; unsigned short tso_segs; unsigned short ufo_size; diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 83fee37..070f91c 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -135,17 +135,13 @@ void skb_under_panic(struct sk_buff *skb, int sz, void *here) struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, int fclone) { + struct skb_shared_info *shinfo; struct sk_buff *skb; u8 *data; /* Get the HEAD */ - if (fclone) - skb = kmem_cache_alloc(skbuff_fclone_cache, - gfp_mask & ~__GFP_DMA); - else - skb = kmem_cache_alloc(skbuff_head_cache, - gfp_mask & ~__GFP_DMA); - + skb = kmem_cache_alloc(fclone ? skbuff_fclone_cache : skbuff_head_cache, + gfp_mask & ~__GFP_DMA); if (!skb) goto out; @@ -162,6 +158,16 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, skb->data = data; skb->tail = data; skb->end = data + size; + /* make sure we initialize shinfo sequentially */ + shinfo = skb_shinfo(skb); + atomic_set(&shinfo->dataref, 1); + shinfo->nr_frags = 0; + shinfo->tso_size = 0; + shinfo->tso_segs = 0; + shinfo->ufo_size = 0; + shinfo->ip6_frag_id = 0; + shinfo->frag_list = NULL; + if (fclone) { struct sk_buff *child = skb + 1; atomic_t *fclone_ref = (atomic_t *) (child + 1); @@ -171,13 +177,6 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, child->fclone = SKB_FCLONE_UNAVAILABLE; } - atomic_set(&(skb_shinfo(skb)->dataref), 1); - skb_shinfo(skb)->nr_frags = 0; - skb_shinfo(skb)->tso_size = 0; - skb_shinfo(skb)->tso_segs = 0; - skb_shinfo(skb)->frag_list = NULL; - skb_shinfo(skb)->ufo_size = 0; - skb_shinfo(skb)->ip6_frag_id = 0; out: return skb; nodata: -- cgit v1.1 From fd19f329a32bdc4eb07885e0b3889567cfe00aa7 Mon Sep 17 00:00:00 2001 From: Benjamin LaHaise Date: Tue, 3 Jan 2006 14:10:46 -0800 Subject: [AF_UNIX]: Convert to use a spinlock instead of rwlock From: Benjamin LaHaise In af_unix, a rwlock is used to protect internal state. At least on my P4 with HT it is faster to use a spinlock due to the simpler memory barrier used to unlock. This patch raises bw_unix to ~690K/s. Signed-off-by: David S. Miller --- include/net/af_unix.h | 10 +++++----- net/unix/af_unix.c | 2 +- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/net/af_unix.h b/include/net/af_unix.h index 3f302ae..bfc1779 100644 --- a/include/net/af_unix.h +++ b/include/net/af_unix.h @@ -58,10 +58,10 @@ struct unix_skb_parms { #define UNIXCB(skb) (*(struct unix_skb_parms*)&((skb)->cb)) #define UNIXCREDS(skb) (&UNIXCB((skb)).creds) -#define unix_state_rlock(s) read_lock(&unix_sk(s)->lock) -#define unix_state_runlock(s) read_unlock(&unix_sk(s)->lock) -#define unix_state_wlock(s) write_lock(&unix_sk(s)->lock) -#define unix_state_wunlock(s) write_unlock(&unix_sk(s)->lock) +#define unix_state_rlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_runlock(s) spin_unlock(&unix_sk(s)->lock) +#define unix_state_wlock(s) spin_lock(&unix_sk(s)->lock) +#define unix_state_wunlock(s) spin_unlock(&unix_sk(s)->lock) #ifdef __KERNEL__ /* The AF_UNIX socket */ @@ -76,7 +76,7 @@ struct unix_sock { struct sock *other; struct sock *gc_tree; atomic_t inflight; - rwlock_t lock; + spinlock_t lock; wait_queue_head_t peer_wait; }; #define unix_sk(__sk) ((struct unix_sock *)__sk) diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 7d3fe6a..1ddd36d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -564,7 +564,7 @@ static struct sock * unix_create1(struct socket *sock) u = unix_sk(sk); u->dentry = NULL; u->mnt = NULL; - rwlock_init(&u->lock); + spin_lock_init(&u->lock); atomic_set(&u->inflight, sock ? 0 : -1); init_MUTEX(&u->readsem); /* single task reading lock */ init_waitqueue_head(&u->peer_wait); -- cgit v1.1 From b461d2f2188c1c578ed651e4cdf608be7a993cd4 Mon Sep 17 00:00:00 2001 From: Per Liden Date: Tue, 3 Jan 2006 14:13:29 -0800 Subject: [NETLINK] genetlink: fix cmd type in genl_ops to be consistent to u8 Signed-off-by: Per Liden ACKed-by: Jamal Hadi Salim Signed-off-by: David S. Miller --- include/net/genetlink.h | 2 +- net/netlink/genetlink.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/net/genetlink.h b/include/net/genetlink.h index 52d8b1a..c5b96b2 100644 --- a/include/net/genetlink.h +++ b/include/net/genetlink.h @@ -60,7 +60,7 @@ struct genl_info */ struct genl_ops { - unsigned int cmd; + u8 cmd; unsigned int flags; struct nla_policy *policy; int (*doit)(struct sk_buff *skb, diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 287cfcc..3b13784 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -441,7 +441,7 @@ errout: } static struct sk_buff *ctrl_build_msg(struct genl_family *family, u32 pid, - int seq, int cmd) + int seq, u8 cmd) { struct sk_buff *skb; int err; -- cgit v1.1 From 5ff7630e4aa6c3969094dc30ff1cdaa6f52b0ed0 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jan 2006 14:14:46 -0800 Subject: [NETROM]: Remove unessecary lock_sock calls in netrom_ioctl() lock_sock is needed only in very few cases, so do it there instead of around the switch statement. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/netrom/af_netrom.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 05b653c..9ee672e 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1166,10 +1166,11 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) void __user *argp = (void __user *)arg; int ret; - lock_sock(sk); switch (cmd) { case TIOCOUTQ: { long amount; + + lock_sock(sk); amount = sk->sk_sndbuf - atomic_read(&sk->sk_wmem_alloc); if (amount < 0) amount = 0; @@ -1180,6 +1181,8 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case TIOCINQ: { struct sk_buff *skb; long amount = 0L; + + lock_sock(sk); /* These two are safe on a single CPU system as only user tasks fiddle here */ if ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) amount = skb->len; @@ -1188,6 +1191,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } case SIOCGSTAMP: + lock_sock(sk); ret = sock_get_timestamp(sk, argp); release_sock(sk); return ret; @@ -1202,21 +1206,17 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFNETMASK: case SIOCGIFMETRIC: case SIOCSIFMETRIC: - release_sock(sk); return -EINVAL; case SIOCADDRT: case SIOCDELRT: case SIOCNRDECOBS: - release_sock(sk); if (!capable(CAP_NET_ADMIN)) return -EPERM; return nr_rt_ioctl(cmd, argp); default: - release_sock(sk); return dev_ioctl(cmd, argp); } - release_sock(sk); return 0; } -- cgit v1.1 From b5e5fa5e093e42cab4ee3d6dcbc4f450ad29a723 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Jan 2006 14:18:33 -0800 Subject: [NET]: Add a dev_ioctl() fallback to sock_ioctl() Currently all network protocols need to call dev_ioctl as the default fallback in their ioctl implementations. This patch adds a fallback to dev_ioctl to sock_ioctl if the protocol returned -ENOIOCTLCMD. This way all the procotol ioctl handlers can be simplified and we don't need to export dev_ioctl. Signed-off-by: Christoph Hellwig Signed-off-by: David S. Miller --- net/appletalk/ddp.c | 19 +------------------ net/ax25/af_ax25.c | 2 +- net/core/dev.c | 1 - net/decnet/af_decnet.c | 2 +- net/econet/af_econet.c | 2 +- net/ipv4/af_inet.c | 8 ++++---- net/ipv6/af_inet6.c | 8 +++----- net/ipx/af_ipx.c | 2 +- net/irda/af_irda.c | 2 +- net/llc/af_llc.c | 2 +- net/netrom/af_netrom.c | 2 +- net/packet/af_packet.c | 2 +- net/rose/af_rose.c | 2 +- net/socket.c | 7 +++++++ net/unix/af_unix.c | 2 +- net/wanrouter/af_wanpipe.c | 2 +- net/x25/af_x25.c | 2 +- 17 files changed, 27 insertions(+), 40 deletions(-) diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c index 296f186..a5144e4 100644 --- a/net/appletalk/ddp.c +++ b/net/appletalk/ddp.c @@ -1763,7 +1763,7 @@ static int atalk_recvmsg(struct kiocb *iocb, struct socket *sock, struct msghdr */ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - int rc = -EINVAL; + int rc = -ENOIOCTLCMD; struct sock *sk = sock->sk; void __user *argp = (void __user *)arg; @@ -1813,23 +1813,6 @@ static int atalk_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = atif_ioctl(cmd, argp); rtnl_unlock(); break; - /* Physical layer ioctl calls */ - case SIOCSIFLINK: - case SIOCGIFHWADDR: - case SIOCSIFHWADDR: - case SIOCGIFFLAGS: - case SIOCSIFFLAGS: - case SIOCGIFTXQLEN: - case SIOCSIFTXQLEN: - case SIOCGIFMTU: - case SIOCGIFCONF: - case SIOCADDMULTI: - case SIOCDELMULTI: - case SIOCGIFCOUNT: - case SIOCGIFINDEX: - case SIOCGIFNAME: - rc = dev_ioctl(cmd, argp); - break; } return rc; diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c index 8b5d10a..e8753c7 100644 --- a/net/ax25/af_ax25.c +++ b/net/ax25/af_ax25.c @@ -1827,7 +1827,7 @@ static int ax25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - res = dev_ioctl(cmd, argp); + res = -ENOIOCTLCMD; break; } release_sock(sk); diff --git a/net/core/dev.c b/net/core/dev.c index a5efc9a..29ba109 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -3276,7 +3276,6 @@ EXPORT_SYMBOL(dev_close); EXPORT_SYMBOL(dev_get_by_flags); EXPORT_SYMBOL(dev_get_by_index); EXPORT_SYMBOL(dev_get_by_name); -EXPORT_SYMBOL(dev_ioctl); EXPORT_SYMBOL(dev_open); EXPORT_SYMBOL(dev_queue_xmit); EXPORT_SYMBOL(dev_remove_pack); diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c index 65e3bae..78ec534 100644 --- a/net/decnet/af_decnet.c +++ b/net/decnet/af_decnet.c @@ -1252,7 +1252,7 @@ static int dn_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) break; default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 70fb2b8..4fc2452 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -687,7 +687,7 @@ static int econet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg break; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } /*NOTREACHED*/ return 0; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 36a6306..966a071 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -777,10 +777,10 @@ int inet_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) err = devinet_ioctl(cmd, (void __user *)arg); break; default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == - -ENOIOCTLCMD) - err = dev_ioctl(cmd, (void __user *)arg); + if (sk->sk_prot->ioctl) + err = sk->sk_prot->ioctl(sk, cmd, arg); + else + err = -ENOIOCTLCMD; break; } return err; diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 7c9f192..68afc53 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -434,7 +434,6 @@ int inet6_getname(struct socket *sock, struct sockaddr *uaddr, int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { struct sock *sk = sock->sk; - int err = -EINVAL; switch(cmd) { @@ -453,10 +452,9 @@ int inet6_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCSIFDSTADDR: return addrconf_set_dstaddr((void __user *) arg); default: - if (!sk->sk_prot->ioctl || - (err = sk->sk_prot->ioctl(sk, cmd, arg)) == -ENOIOCTLCMD) - return(dev_ioctl(cmd,(void __user *) arg)); - return err; + if (!sk->sk_prot->ioctl) + return -ENOIOCTLCMD; + return sk->sk_prot->ioctl(sk, cmd, arg); } /*NOTREACHED*/ return(0); diff --git a/net/ipx/af_ipx.c b/net/ipx/af_ipx.c index 6c464c1..0dc519b 100644 --- a/net/ipx/af_ipx.c +++ b/net/ipx/af_ipx.c @@ -1884,7 +1884,7 @@ static int ipx_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) rc = -EINVAL; break; default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index e57683d..fbfa967 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -1822,7 +1822,7 @@ static int irda_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return -EINVAL; default: IRDA_DEBUG(1, "%s(), doing device ioctl!\n", __FUNCTION__); - return dev_ioctl(cmd, (void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c index 9cf65f9..8171c53 100644 --- a/net/llc/af_llc.c +++ b/net/llc/af_llc.c @@ -959,7 +959,7 @@ out: static int llc_ui_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) { - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } /** diff --git a/net/netrom/af_netrom.c b/net/netrom/af_netrom.c index 9ee672e..63b0e4a 100644 --- a/net/netrom/af_netrom.c +++ b/net/netrom/af_netrom.c @@ -1215,7 +1215,7 @@ static int nr_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return nr_rt_ioctl(cmd, argp); default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } return 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index deda6fd..f69e5ed 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1521,7 +1521,7 @@ static int packet_ioctl(struct socket *sock, unsigned int cmd, #endif default: - return dev_ioctl(cmd, (void __user *)arg); + return -ENOIOCTLCMD; } return 0; } diff --git a/net/rose/af_rose.c b/net/rose/af_rose.c index 829fdbc..63090be 100644 --- a/net/rose/af_rose.c +++ b/net/rose/af_rose.c @@ -1320,7 +1320,7 @@ static int rose_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) return 0; default: - return dev_ioctl(cmd, argp); + return -ENOIOCTLCMD; } return 0; diff --git a/net/socket.c b/net/socket.c index 0dde6da..06fa217 100644 --- a/net/socket.c +++ b/net/socket.c @@ -900,6 +900,13 @@ static long sock_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; default: err = sock->ops->ioctl(sock, cmd, arg); + + /* + * If this ioctl is unknown try to hand it down + * to the NIC driver. + */ + if (err == -ENOIOCTLCMD) + err = dev_ioctl(cmd, argp); break; } return err; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 1ddd36d..5f6ae79 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1859,7 +1859,7 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - err = dev_ioctl(cmd, (void __user *)arg); + err = -ENOIOCTLCMD; break; } return err; diff --git a/net/wanrouter/af_wanpipe.c b/net/wanrouter/af_wanpipe.c index 67948bf..7a43ae4 100644 --- a/net/wanrouter/af_wanpipe.c +++ b/net/wanrouter/af_wanpipe.c @@ -1839,7 +1839,7 @@ static int wanpipe_ioctl(struct socket *sock, unsigned int cmd, unsigned long ar #endif default: - return dev_ioctl(cmd,(void __user *) arg); + return -ENOIOCTLCMD; } /*NOTREACHED*/ } diff --git a/net/x25/af_x25.c b/net/x25/af_x25.c index ca8b3b0..16459c7 100644 --- a/net/x25/af_x25.c +++ b/net/x25/af_x25.c @@ -1378,7 +1378,7 @@ static int x25_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) } default: - rc = dev_ioctl(cmd, argp); + rc = -ENOIOCTLCMD; break; } -- cgit v1.1 From fd30333d0fab9e870af89e112454996c188655e9 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 3 Jan 2006 14:19:25 -0800 Subject: [TG3]: fixup tot_len calculation Turning struct iphdr::tot_len into __be16 added sparse warning. Signed-off-by: Alexey Dobriyan Signed-off-by: David S. Miller --- drivers/net/tg3.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/tg3.c b/drivers/net/tg3.c index 59d916c..eb86b05 100644 --- a/drivers/net/tg3.c +++ b/drivers/net/tg3.c @@ -3651,7 +3651,7 @@ static int tg3_start_xmit(struct sk_buff *skb, struct net_device *dev) TXD_FLAG_CPU_POST_DMA); skb->nh.iph->check = 0; - skb->nh.iph->tot_len = ntohs(mss + ip_tcp_len + tcp_opt_len); + skb->nh.iph->tot_len = htons(mss + ip_tcp_len + tcp_opt_len); if (tp->tg3_flags2 & TG3_FLG2_HW_TSO) { skb->h.th->check = 0; base_flags &= ~TXD_FLAG_TCPUDP_CSUM; -- cgit v1.1 From 4b5bdf5cc3695dc5caba011b9c616b40e6299638 Mon Sep 17 00:00:00 2001 From: Roberto Nibali Date: Tue, 3 Jan 2006 14:22:59 -0800 Subject: [IPVS]: Cleanup IP_VS_DBG statements. From: Roberto Nibali The attached patch (against current -GIT) is a cleanup patch which does following: o lookup debug messages shifted back to 9 o added more informational value to flags and refcnt since those entries can be in multiple referenced structures o cleanup 80 char violation It's the prepatch to the session pool implementation and helps very much to debug and monitor important variables and structures regarding the threshold limitation and persistency without the thousands of lookup messages which noone is interested in. Signed-off-by: Horms Signed-off-by: David S. Miller --- net/ipv4/ipvs/ip_vs_conn.c | 18 ++++++++++-------- net/ipv4/ipvs/ip_vs_core.c | 2 +- net/ipv4/ipvs/ip_vs_ctl.c | 9 +++++---- net/ipv4/ipvs/ip_vs_proto_tcp.c | 2 +- 4 files changed, 17 insertions(+), 14 deletions(-) diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index d35cea3..ed5bd56 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -221,7 +221,7 @@ struct ip_vs_conn *ip_vs_conn_in_get if (!cp && atomic_read(&ip_vs_conn_no_cport_cnt)) cp = __ip_vs_conn_in_get(protocol, s_addr, 0, d_addr, d_port); - IP_VS_DBG(7, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -256,7 +256,7 @@ struct ip_vs_conn *ip_vs_ct_in_get out: ct_read_unlock(hash); - IP_VS_DBG(7, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "template lookup/in %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -297,7 +297,7 @@ struct ip_vs_conn *ip_vs_conn_out_get ct_read_unlock(hash); - IP_VS_DBG(7, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", + IP_VS_DBG(9, "lookup/out %s %u.%u.%u.%u:%d->%u.%u.%u.%u:%d %s\n", ip_vs_proto_name(protocol), NIPQUAD(s_addr), ntohs(s_port), NIPQUAD(d_addr), ntohs(d_port), @@ -393,8 +393,9 @@ ip_vs_bind_dest(struct ip_vs_conn *cp, struct ip_vs_dest *dest) cp->flags |= atomic_read(&dest->conn_flags); cp->dest = dest; - IP_VS_DBG(9, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Bind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -432,8 +433,9 @@ static inline void ip_vs_unbind_dest(struct ip_vs_conn *cp) if (!dest) return; - IP_VS_DBG(9, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " - "d:%u.%u.%u.%u:%d fwd:%c s:%u flg:%X cnt:%d destcnt:%d\n", + IP_VS_DBG(7, "Unbind-dest %s c:%u.%u.%u.%u:%d v:%u.%u.%u.%u:%d " + "d:%u.%u.%u.%u:%d fwd:%c s:%u conn->flags:%X conn->refcnt:%d " + "dest->refcnt:%d\n", ip_vs_proto_name(cp->protocol), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), @@ -573,7 +575,7 @@ static void ip_vs_conn_expire(unsigned long data) ip_vs_conn_hash(cp); expire_later: - IP_VS_DBG(7, "delayed: refcnt-1=%d conn.n_control=%d\n", + IP_VS_DBG(7, "delayed: conn->refcnt-1=%d conn->n_control=%d\n", atomic_read(&cp->refcnt)-1, atomic_read(&cp->n_control)); diff --git a/net/ipv4/ipvs/ip_vs_core.c b/net/ipv4/ipvs/ip_vs_core.c index 1a0843c..1aca94a 100644 --- a/net/ipv4/ipvs/ip_vs_core.c +++ b/net/ipv4/ipvs/ip_vs_core.c @@ -426,7 +426,7 @@ ip_vs_schedule(struct ip_vs_service *svc, const struct sk_buff *skb) return NULL; IP_VS_DBG(6, "Schedule fwd:%c c:%u.%u.%u.%u:%u v:%u.%u.%u.%u:%u " - "d:%u.%u.%u.%u:%u flg:%X cnt:%d\n", + "d:%u.%u.%u.%u:%u conn->flags:%X conn->refcnt:%d\n", ip_vs_fwd_tag(cp), NIPQUAD(cp->caddr), ntohs(cp->cport), NIPQUAD(cp->vaddr), ntohs(cp->vport), diff --git a/net/ipv4/ipvs/ip_vs_ctl.c b/net/ipv4/ipvs/ip_vs_ctl.c index fe2c39d..c935c50 100644 --- a/net/ipv4/ipvs/ip_vs_ctl.c +++ b/net/ipv4/ipvs/ip_vs_ctl.c @@ -448,7 +448,7 @@ ip_vs_service_get(__u32 fwmark, __u16 protocol, __u32 vaddr, __u16 vport) out: read_unlock(&__ip_vs_svc_lock); - IP_VS_DBG(6, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", + IP_VS_DBG(9, "lookup service: fwm %u %s %u.%u.%u.%u:%u %s\n", fwmark, ip_vs_proto_name(protocol), NIPQUAD(vaddr), ntohs(vport), svc?"hit":"not hit"); @@ -598,7 +598,7 @@ ip_vs_trash_get_dest(struct ip_vs_service *svc, __u32 daddr, __u16 dport) */ list_for_each_entry_safe(dest, nxt, &ip_vs_dest_trash, n_list) { IP_VS_DBG(3, "Destination %u/%u.%u.%u.%u:%u still in trash, " - "refcnt=%d\n", + "dest->refcnt=%d\n", dest->vfwmark, NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); @@ -805,7 +805,7 @@ ip_vs_add_dest(struct ip_vs_service *svc, struct ip_vs_dest_user *udest) dest = ip_vs_trash_get_dest(svc, daddr, dport); if (dest != NULL) { IP_VS_DBG(3, "Get destination %u.%u.%u.%u:%u from trash, " - "refcnt=%d, service %u/%u.%u.%u.%u:%u\n", + "dest->refcnt=%d, service %u/%u.%u.%u.%u:%u\n", NIPQUAD(daddr), ntohs(dport), atomic_read(&dest->refcnt), dest->vfwmark, @@ -950,7 +950,8 @@ static void __ip_vs_del_dest(struct ip_vs_dest *dest) atomic_dec(&dest->svc->refcnt); kfree(dest); } else { - IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, refcnt=%d\n", + IP_VS_DBG(3, "Moving dest %u.%u.%u.%u:%u into trash, " + "dest->refcnt=%d\n", NIPQUAD(dest->addr), ntohs(dest->port), atomic_read(&dest->refcnt)); list_add(&dest->n_list, &ip_vs_dest_trash); diff --git a/net/ipv4/ipvs/ip_vs_proto_tcp.c b/net/ipv4/ipvs/ip_vs_proto_tcp.c index 638ba8c..bc28b11 100644 --- a/net/ipv4/ipvs/ip_vs_proto_tcp.c +++ b/net/ipv4/ipvs/ip_vs_proto_tcp.c @@ -426,7 +426,7 @@ set_tcp_state(struct ip_vs_protocol *pp, struct ip_vs_conn *cp, struct ip_vs_dest *dest = cp->dest; IP_VS_DBG(8, "%s %s [%c%c%c%c] %u.%u.%u.%u:%d->" - "%u.%u.%u.%u:%d state: %s->%s cnt:%d\n", + "%u.%u.%u.%u:%d state: %s->%s conn->refcnt:%d\n", pp->name, (state_off==TCP_DIR_OUTPUT)?"output ":"input ", th->syn? 'S' : '.', -- cgit v1.1 From 5062430c5cc526655e3d10c670fc9c263656f66c Mon Sep 17 00:00:00 2001 From: Patrick Caulfield Date: Tue, 3 Jan 2006 14:24:02 -0800 Subject: [DECNET]: Only use local routers The attached patch makes DECnet routing only use routers from the same area - rather than the highest rated router seen. In theory there should not be an out-of-area router on a local network but some networks are bridged rather than properly routed. VMS seems to behave similarly: if I bring up a VMS node with no router then it can't see anything else on the global network. Signed-off-by: Patrick Caulfield Signed-off-by: David S. Miller --- net/decnet/dn_neigh.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c index 8d0cc3c..33ab256 100644 --- a/net/decnet/dn_neigh.c +++ b/net/decnet/dn_neigh.c @@ -408,11 +408,14 @@ int dn_neigh_router_hello(struct sk_buff *skb) } } - if (!dn_db->router) { - dn_db->router = neigh_clone(neigh); - } else { - if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) - neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + /* Only use routers in our area */ + if ((dn_ntohs(src)>>10) == dn_ntohs((decnet_address)>>10)) { + if (!dn_db->router) { + dn_db->router = neigh_clone(neigh); + } else { + if (msg->priority > ((struct dn_neigh *)dn_db->router)->priority) + neigh_release(xchg(&dn_db->router, neigh_clone(neigh))); + } } write_unlock(&neigh->lock); neigh_release(neigh); -- cgit v1.1 From 709dd3aaf5304993083c2297c73f5531c36fba5a Mon Sep 17 00:00:00 2001 From: Andrea Bittau Date: Tue, 3 Jan 2006 14:25:17 -0800 Subject: [DCCP]: Do not process a packet twice when it's not in state DCCP_OPEN. When packets are received, the connection is either in DCCP_OPEN [fast-path] or it isn't. If it's not [e.g. DCCP_PARTOPEN] upper layers will perform sanity checks and parse options. If it is in DCCP_OPEN, dccp_rcv_established() will do it. It is important not to re-parse options in dccp_rcv_established() when it is not called from the fast-path. Else, fore example, the ack vector will be added twice and the CCID will see the packet twice. The solution is to always enfore sanity checks from the upper layers. When packets arrive in the fast-path, sanity checks will be performed before calling dccp_rcv_established(). Note(acme): I rewrote the patch to achieve the same result but keeping dccp_rcv_established with the previous semantics and having it split into __dccp_rcv_established, that doesn't does do any sanity check, code in state != DCCP_OPEN use this lighter version as they already do the sanity checks. Signed-off-by: Andrea Bittau Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/input.c | 56 ++++++++++++++++++++++++++++++++++---------------------- 1 file changed, 34 insertions(+), 22 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index 55e921b..5e312b0 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -151,29 +151,12 @@ static int dccp_check_seqno(struct sock *sk, struct sk_buff *skb) return 0; } -int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, - const struct dccp_hdr *dh, const unsigned len) +static inline int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, + const unsigned len) { struct dccp_sock *dp = dccp_sk(sk); - if (dccp_check_seqno(sk, skb)) - goto discard; - - if (dccp_parse_options(sk, skb)) - goto discard; - - if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) - dccp_event_ack_recv(sk, skb); - - if (dp->dccps_options.dccpo_send_ack_vector && - dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, - DCCP_SKB_CB(skb)->dccpd_seq, - DCCP_ACKVEC_STATE_RECEIVED)) - goto discard; - - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - switch (dccp_hdr(skb)->dccph_type) { case DCCP_PKT_DATAACK: case DCCP_PKT_DATA: @@ -250,6 +233,35 @@ discard: return 0; } +int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, + const struct dccp_hdr *dh, const unsigned len) +{ + struct dccp_sock *dp = dccp_sk(sk); + + if (dccp_check_seqno(sk, skb)) + goto discard; + + if (dccp_parse_options(sk, skb)) + goto discard; + + if (DCCP_SKB_CB(skb)->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) + dccp_event_ack_recv(sk, skb); + + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); + + return __dccp_rcv_established(sk, skb, dh, len); +discard: + __kfree_skb(skb); + return 0; +} + EXPORT_SYMBOL_GPL(dccp_rcv_established); static int dccp_rcv_request_sent_state_process(struct sock *sk, @@ -400,9 +412,9 @@ static int dccp_rcv_respond_partopen_state_process(struct sock *sk, if (dh->dccph_type == DCCP_PKT_DATAACK || dh->dccph_type == DCCP_PKT_DATA) { - dccp_rcv_established(sk, skb, dh, len); + __dccp_rcv_established(sk, skb, dh, len); queued = 1; /* packet was queued - (by dccp_rcv_established) */ + (by __dccp_rcv_established) */ } break; } -- cgit v1.1 From 9e377202d2c968dde8efd6121d94c7f0a77787aa Mon Sep 17 00:00:00 2001 From: Andrea Bittau Date: Tue, 3 Jan 2006 14:25:49 -0800 Subject: [DCCP]: Send an ACK vector when ACKing a response packet If ACK vectors are used, each packet with an ACK should contain an ACK vector. The only exception currently is response packets. It probably is not a good idea to store ACK vector state before the connection is completed (to help protect from syn floods). Signed-off-by: Andrea Bittau Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/input.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/dccp/input.c b/net/dccp/input.c index 5e312b0..cb0f5c9 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -300,6 +300,12 @@ static int dccp_rcv_request_sent_state_process(struct sock *sk, goto out_invalid_packet; } + if (dp->dccps_options.dccpo_send_ack_vector && + dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, + DCCP_SKB_CB(skb)->dccpd_seq, + DCCP_ACKVEC_STATE_RECEIVED)) + goto out_invalid_packet; /* FIXME: change error code */ + dp->dccps_isr = DCCP_SKB_CB(skb)->dccpd_seq; dccp_update_gsr(sk, dp->dccps_isr); /* -- cgit v1.1 From e84a9f5e9cd2b229dda24002334bc3cd36c1109d Mon Sep 17 00:00:00 2001 From: Andrea Bittau Date: Tue, 3 Jan 2006 14:26:15 -0800 Subject: [DCCP]: Notify CCID only after ACK vectors have been processed. The CCID should be notified of packet reception only when a packet is valid. Therefore, the ACK vector needs to be processed before notifying the CCID. Also, the CCID might need information provided by the ACK vector. Signed-off-by: Andrea Bittau Signed-off-by: Arnaldo Carvalho de Melo Signed-off-by: David S. Miller --- net/dccp/input.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/dccp/input.c b/net/dccp/input.c index cb0f5c9..b6cba72 100644 --- a/net/dccp/input.c +++ b/net/dccp/input.c @@ -492,14 +492,14 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, if (dcb->dccpd_ack_seq != DCCP_PKT_WITHOUT_ACK_SEQ) dccp_event_ack_recv(sk, skb); - ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); - ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); - if (dp->dccps_options.dccpo_send_ack_vector && dccp_ackvec_add(dp->dccps_hc_rx_ackvec, sk, DCCP_SKB_CB(skb)->dccpd_seq, DCCP_ACKVEC_STATE_RECEIVED)) goto discard; + + ccid_hc_rx_packet_recv(dp->dccps_hc_rx_ccid, sk, skb); + ccid_hc_tx_packet_recv(dp->dccps_hc_tx_ccid, sk, skb); } /* -- cgit v1.1 From 554c9a8ec37729bff69951cb740074abbae21afa Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jan 2006 14:35:54 -0800 Subject: [BRIDGE]: Fix faulty check in br_stp_recalculate_bridge_id() One of the conversions from memcmp to compare_ether_addr is incorrect. We need to do relative comparison to determine min MAC address to use in bridge id. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/bridge/br_stp_if.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c index 2d2e969..cc047f7 100644 --- a/net/bridge/br_stp_if.c +++ b/net/bridge/br_stp_if.c @@ -157,7 +157,7 @@ void br_stp_recalculate_bridge_id(struct net_bridge *br) list_for_each_entry(p, &br->port_list, list) { if (addr == br_mac_zero || - compare_ether_addr(p->dev->dev_addr, addr) < 0) + memcmp(p->dev->dev_addr, addr, ETH_ALEN) < 0) addr = p->dev->dev_addr; } -- cgit v1.1 From cd8787ab04d23f925f440b712b43a6fd5cb31ece Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jan 2006 14:38:34 -0800 Subject: [IPV4] fib_trie: build fix Need this to fix build of fib_trie in net-2.6.16 (rebased) tree. The code needs the new inet_make_mask inline. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- net/ipv4/fib_trie.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index a3ed7e1..e320b32 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -66,6 +66,7 @@ #include #include #include +#include #include #include #include -- cgit v1.1 From 88df8ef59a3eb54b1e2412765ff2736d2376d1ca Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jan 2006 15:25:45 -0800 Subject: [NET]: Don't exclude broadcast addresses from is_multicast_ether_addr() The check for multicast shouldn't exclude broadcast type addresses. This reverts the incorrect change done in 2.6.13. The broadcast address is a multicast address and should be excluded from being a valid_ether_address for use in bridging or device address. Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/linux/etherdevice.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h index 5f49a30..745c988 100644 --- a/include/linux/etherdevice.h +++ b/include/linux/etherdevice.h @@ -63,10 +63,11 @@ static inline int is_zero_ether_addr(const u8 *addr) * @addr: Pointer to a six-byte array containing the Ethernet address * * Return true if the address is a multicast address. + * By definition the broadcast address is also a multicast address. */ static inline int is_multicast_ether_addr(const u8 *addr) { - return ((addr[0] != 0xff) && (0x01 & addr[0])); + return (0x01 & addr[0]); } /** -- cgit v1.1 From 3c19065a1e2c862becc576bc65e54f2bc1cbffe6 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jan 2006 15:27:38 -0800 Subject: [IEEE80211] ipw2200: Simplify multicast checks. From: Stephen Hemminger is_multicast_ether_addr() accepts broadcast too, so the is_broadcast_ether_addr() calls are redundant. Signed-off-by: David S. Miller --- drivers/net/wireless/ipw2200.c | 15 +++++---------- net/ieee80211/ieee80211_rx.c | 5 ++--- 2 files changed, 7 insertions(+), 13 deletions(-) diff --git a/drivers/net/wireless/ipw2200.c b/drivers/net/wireless/ipw2200.c index 5e7c7e9..64f6d1f 100644 --- a/drivers/net/wireless/ipw2200.c +++ b/drivers/net/wireless/ipw2200.c @@ -7456,8 +7456,7 @@ static void ipw_handle_data_packet(struct ipw_priv *priv, /* HW decrypt will not clear the WEP bit, MIC, PN, etc. */ hdr = (struct ieee80211_hdr_4addr *)rxb->skb->data; if (priv->ieee->iw_mode != IW_MODE_MONITOR && - ((is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr1)) ? + (is_multicast_ether_addr(hdr->addr1) ? !priv->ieee->host_mc_decrypt : !priv->ieee->host_decrypt)) ipw_rebuild_decrypted_skb(priv, rxb->skb); @@ -7648,8 +7647,7 @@ static inline int is_network_packet(struct ipw_priv *priv, return 0; /* {broad,multi}cast packets to our BSSID go through */ - if (is_multicast_ether_addr(header->addr1) || - is_broadcast_ether_addr(header->addr1)) + if (is_multicast_ether_addr(header->addr1)) return !memcmp(header->addr3, priv->bssid, ETH_ALEN); /* packets to our adapter go through */ @@ -7662,8 +7660,7 @@ static inline int is_network_packet(struct ipw_priv *priv, return 0; /* {broad,multi}cast packets to our BSS go through */ - if (is_multicast_ether_addr(header->addr1) || - is_broadcast_ether_addr(header->addr1)) + if (is_multicast_ether_addr(header->addr1)) return !memcmp(header->addr2, priv->bssid, ETH_ALEN); /* packets to our adapter go through */ @@ -9657,8 +9654,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb, switch (priv->ieee->iw_mode) { case IW_MODE_ADHOC: hdr_len = IEEE80211_3ADDR_LEN; - unicast = !(is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr1)); + unicast = !is_multicast_ether_addr(hdr->addr1); id = ipw_find_station(priv, hdr->addr1); if (id == IPW_INVALID_STATION) { id = ipw_add_station(priv, hdr->addr1); @@ -9673,8 +9669,7 @@ static inline int ipw_tx_skb(struct ipw_priv *priv, struct ieee80211_txb *txb, case IW_MODE_INFRA: default: - unicast = !(is_multicast_ether_addr(hdr->addr3) || - is_broadcast_ether_addr(hdr->addr3)); + unicast = !is_multicast_ether_addr(hdr->addr3); hdr_len = IEEE80211_3ADDR_LEN; id = 0; break; diff --git a/net/ieee80211/ieee80211_rx.c b/net/ieee80211/ieee80211_rx.c index 03efaac..4cc6f41 100644 --- a/net/ieee80211/ieee80211_rx.c +++ b/net/ieee80211/ieee80211_rx.c @@ -410,9 +410,8 @@ int ieee80211_rx(struct ieee80211_device *ieee, struct sk_buff *skb, return 1; } - if ((is_multicast_ether_addr(hdr->addr1) || - is_broadcast_ether_addr(hdr->addr2)) ? ieee->host_mc_decrypt : - ieee->host_decrypt) { + if (is_multicast_ether_addr(hdr->addr1) + ? ieee->host_mc_decrypt : ieee->host_decrypt) { int idx = 0; if (skb->len >= hdrlen + 3) idx = skb->data[hdrlen + 3] >> 6; -- cgit v1.1 From 40efc6fa179f440a008333ea98f701bc35a1f97f Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Tue, 3 Jan 2006 16:03:49 -0800 Subject: [TCP]: less inline's TCP inline usage cleanup: * get rid of inline in several places * replace __inline__ with inline where possible * move functions used in one file out of tcp.h * let compiler decide on used once cases On x86_64: text data bss dec hex filename 3594701 648348 567400 4810449 4966d1 vmlinux.orig 3593133 648580 567400 4809113 496199 vmlinux On sparc64: text data bss dec hex filename 2538278 406152 530392 3474822 350586 vmlinux.ORIG 2536382 406384 530392 3473158 34ff06 vmlinux Signed-off-by: Stephen Hemminger Signed-off-by: David S. Miller --- include/net/tcp.h | 193 +++++++------------------------------------------- net/ipv4/tcp_cong.c | 28 ++++++++ net/ipv4/tcp_input.c | 82 +++++++++++++++------ net/ipv4/tcp_ipv4.c | 9 ++- net/ipv4/tcp_output.c | 87 ++++++++++++++++++++--- 5 files changed, 198 insertions(+), 201 deletions(-) diff --git a/include/net/tcp.h b/include/net/tcp.h index 3699304..77f21c6 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -445,34 +445,16 @@ typedef int (*sk_read_actor_t)(read_descriptor_t *, struct sk_buff *, extern int tcp_read_sock(struct sock *sk, read_descriptor_t *desc, sk_read_actor_t recv_actor); -/* Initialize RCV_MSS value. - * RCV_MSS is an our guess about MSS used by the peer. - * We haven't any direct information about the MSS. - * It's better to underestimate the RCV_MSS rather than overestimate. - * Overestimations make us ACKing less frequently than needed. - * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). - */ +extern void tcp_initialize_rcv_mss(struct sock *sk); -static inline void tcp_initialize_rcv_mss(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); - - hint = min(hint, tp->rcv_wnd/2); - hint = min(hint, TCP_MIN_RCVMSS); - hint = max(hint, TCP_MIN_MSS); - - inet_csk(sk)->icsk_ack.rcv_mss = hint; -} - -static __inline__ void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) +static inline void __tcp_fast_path_on(struct tcp_sock *tp, u32 snd_wnd) { tp->pred_flags = htonl((tp->tcp_header_len << 26) | ntohl(TCP_FLAG_ACK) | snd_wnd); } -static __inline__ void tcp_fast_path_on(struct tcp_sock *tp) +static inline void tcp_fast_path_on(struct tcp_sock *tp) { __tcp_fast_path_on(tp, tp->snd_wnd >> tp->rx_opt.snd_wscale); } @@ -490,7 +472,7 @@ static inline void tcp_fast_path_check(struct sock *sk, struct tcp_sock *tp) * Rcv_nxt can be after the window if our peer push more data * than the offered window. */ -static __inline__ u32 tcp_receive_window(const struct tcp_sock *tp) +static inline u32 tcp_receive_window(const struct tcp_sock *tp) { s32 win = tp->rcv_wup + tp->rcv_wnd - tp->rcv_nxt; @@ -662,6 +644,7 @@ extern void tcp_cleanup_congestion_control(struct sock *sk); extern int tcp_set_default_congestion_control(const char *name); extern void tcp_get_default_congestion_control(char *name); extern int tcp_set_congestion_control(struct sock *sk, const char *name); +extern void tcp_slow_start(struct tcp_sock *tp); extern struct tcp_congestion_ops tcp_init_congestion_ops; extern u32 tcp_reno_ssthresh(struct sock *sk); @@ -701,7 +684,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) * "Packets left network, but not honestly ACKed yet" PLUS * "Packets fast retransmitted" */ -static __inline__ unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) +static inline unsigned int tcp_packets_in_flight(const struct tcp_sock *tp) { return (tp->packets_out - tp->left_out + tp->retrans_out); } @@ -721,33 +704,6 @@ static inline __u32 tcp_current_ssthresh(const struct sock *sk) (tp->snd_cwnd >> 2))); } -/* - * Linear increase during slow start - */ -static inline void tcp_slow_start(struct tcp_sock *tp) -{ - if (sysctl_tcp_abc) { - /* RFC3465: Slow Start - * TCP sender SHOULD increase cwnd by the number of - * previously unacknowledged bytes ACKed by each incoming - * acknowledgment, provided the increase is not more than L - */ - if (tp->bytes_acked < tp->mss_cache) - return; - - /* We MAY increase by 2 if discovered delayed ack */ - if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; - } - } - tp->bytes_acked = 0; - - if (tp->snd_cwnd < tp->snd_cwnd_clamp) - tp->snd_cwnd++; -} - - static inline void tcp_sync_left_out(struct tcp_sock *tp) { if (tp->rx_opt.sack_ok && @@ -756,34 +712,7 @@ static inline void tcp_sync_left_out(struct tcp_sock *tp) tp->left_out = tp->sacked_out + tp->lost_out; } -/* Set slow start threshold and cwnd not falling to slow start */ -static inline void __tcp_enter_cwr(struct sock *sk) -{ - const struct inet_connection_sock *icsk = inet_csk(sk); - struct tcp_sock *tp = tcp_sk(sk); - - tp->undo_marker = 0; - tp->snd_ssthresh = icsk->icsk_ca_ops->ssthresh(sk); - tp->snd_cwnd = min(tp->snd_cwnd, - tcp_packets_in_flight(tp) + 1U); - tp->snd_cwnd_cnt = 0; - tp->high_seq = tp->snd_nxt; - tp->snd_cwnd_stamp = tcp_time_stamp; - TCP_ECN_queue_cwr(tp); -} - -static inline void tcp_enter_cwr(struct sock *sk) -{ - struct tcp_sock *tp = tcp_sk(sk); - - tp->prior_ssthresh = 0; - tp->bytes_acked = 0; - if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { - __tcp_enter_cwr(sk); - tcp_set_ca_state(sk, TCP_CA_CWR); - } -} - +extern void tcp_enter_cwr(struct sock *sk); extern __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst); /* Slow start with delack produces 3 packets of burst, so that @@ -815,14 +744,14 @@ static inline int tcp_is_cwnd_limited(const struct sock *sk, u32 in_flight) return left <= tcp_max_burst(tp); } -static __inline__ void tcp_minshall_update(struct tcp_sock *tp, int mss, - const struct sk_buff *skb) +static inline void tcp_minshall_update(struct tcp_sock *tp, int mss, + const struct sk_buff *skb) { if (skb->len < mss) tp->snd_sml = TCP_SKB_CB(skb)->end_seq; } -static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *tp) { const struct inet_connection_sock *icsk = inet_csk(sk); if (!tp->packets_out && !icsk->icsk_pending) @@ -830,18 +759,18 @@ static __inline__ void tcp_check_probe_timer(struct sock *sk, struct tcp_sock *t icsk->icsk_rto, TCP_RTO_MAX); } -static __inline__ void tcp_push_pending_frames(struct sock *sk, - struct tcp_sock *tp) +static inline void tcp_push_pending_frames(struct sock *sk, + struct tcp_sock *tp) { __tcp_push_pending_frames(sk, tp, tcp_current_mss(sk, 1), tp->nonagle); } -static __inline__ void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_init_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; } -static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) +static inline void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) { tp->snd_wl1 = seq; } @@ -849,19 +778,19 @@ static __inline__ void tcp_update_wl(struct tcp_sock *tp, u32 ack, u32 seq) /* * Calculate(/check) TCP checksum */ -static __inline__ u16 tcp_v4_check(struct tcphdr *th, int len, - unsigned long saddr, unsigned long daddr, - unsigned long base) +static inline u16 tcp_v4_check(struct tcphdr *th, int len, + unsigned long saddr, unsigned long daddr, + unsigned long base) { return csum_tcpudp_magic(saddr,daddr,len,IPPROTO_TCP,base); } -static __inline__ int __tcp_checksum_complete(struct sk_buff *skb) +static inline int __tcp_checksum_complete(struct sk_buff *skb) { return __skb_checksum_complete(skb); } -static __inline__ int tcp_checksum_complete(struct sk_buff *skb) +static inline int tcp_checksum_complete(struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete(skb); @@ -869,7 +798,7 @@ static __inline__ int tcp_checksum_complete(struct sk_buff *skb) /* Prequeue for VJ style copy to user, combined with checksumming. */ -static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) +static inline void tcp_prequeue_init(struct tcp_sock *tp) { tp->ucopy.task = NULL; tp->ucopy.len = 0; @@ -885,7 +814,7 @@ static __inline__ void tcp_prequeue_init(struct tcp_sock *tp) * * NOTE: is this not too big to inline? */ -static __inline__ int tcp_prequeue(struct sock *sk, struct sk_buff *skb) +static inline int tcp_prequeue(struct sock *sk, struct sk_buff *skb) { struct tcp_sock *tp = tcp_sk(sk); @@ -926,7 +855,7 @@ static const char *statename[]={ }; #endif -static __inline__ void tcp_set_state(struct sock *sk, int state) +static inline void tcp_set_state(struct sock *sk, int state) { int oldstate = sk->sk_state; @@ -960,7 +889,7 @@ static __inline__ void tcp_set_state(struct sock *sk, int state) #endif } -static __inline__ void tcp_done(struct sock *sk) +static inline void tcp_done(struct sock *sk) { tcp_set_state(sk, TCP_CLOSE); tcp_clear_xmit_timers(sk); @@ -973,81 +902,13 @@ static __inline__ void tcp_done(struct sock *sk) inet_csk_destroy_sock(sk); } -static __inline__ void tcp_sack_reset(struct tcp_options_received *rx_opt) +static inline void tcp_sack_reset(struct tcp_options_received *rx_opt) { rx_opt->dsack = 0; rx_opt->eff_sacks = 0; rx_opt->num_sacks = 0; } -static __inline__ void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, __u32 tstamp) -{ - if (tp->rx_opt.tstamp_ok) { - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | - TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); - *ptr++ = htonl(tp->rx_opt.ts_recent); - } - if (tp->rx_opt.eff_sacks) { - struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; - int this_sack; - - *ptr++ = htonl((TCPOPT_NOP << 24) | - (TCPOPT_NOP << 16) | - (TCPOPT_SACK << 8) | - (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * - TCPOLEN_SACK_PERBLOCK))); - for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { - *ptr++ = htonl(sp[this_sack].start_seq); - *ptr++ = htonl(sp[this_sack].end_seq); - } - if (tp->rx_opt.dsack) { - tp->rx_opt.dsack = 0; - tp->rx_opt.eff_sacks--; - } - } -} - -/* Construct a tcp options header for a SYN or SYN_ACK packet. - * If this is every changed make sure to change the definition of - * MAX_SYN_SIZE to match the new maximum number of options that you - * can generate. - */ -static inline void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, - int offer_wscale, int wscale, __u32 tstamp, __u32 ts_recent) -{ - /* We always get an MSS option. - * The option bytes which will be seen in normal data - * packets should timestamps be used, must be in the MSS - * advertised. But we subtract them from tp->mss_cache so - * that calculations in tcp_sendmsg are simpler etc. - * So account for this fact here if necessary. If we - * don't do this correctly, as a receiver we won't - * recognize data packets as being full sized when we - * should, and thus we won't abide by the delayed ACK - * rules correctly. - * SACKs don't matter, we never delay an ACK when we - * have any of those going out. - */ - *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); - if (ts) { - if(sack) - *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - else - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); - *ptr++ = htonl(tstamp); /* TSVAL */ - *ptr++ = htonl(ts_recent); /* TSECR */ - } else if(sack) - *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | - (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); - if (offer_wscale) - *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); -} - /* Determine a window scaling and initial window to offer. */ extern void tcp_select_initial_window(int __space, __u32 mss, __u32 *rcv_wnd, __u32 *window_clamp, @@ -1072,9 +933,9 @@ static inline int tcp_full_space(const struct sock *sk) return tcp_win_from_space(sk->sk_rcvbuf); } -static __inline__ void tcp_openreq_init(struct request_sock *req, - struct tcp_options_received *rx_opt, - struct sk_buff *skb) +static inline void tcp_openreq_init(struct request_sock *req, + struct tcp_options_received *rx_opt, + struct sk_buff *skb) { struct inet_request_sock *ireq = inet_rsk(req); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c index c7cc62c..e688c68 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -174,6 +174,34 @@ int tcp_set_congestion_control(struct sock *sk, const char *name) return err; } + +/* + * Linear increase during slow start + */ +void tcp_slow_start(struct tcp_sock *tp) +{ + if (sysctl_tcp_abc) { + /* RFC3465: Slow Start + * TCP sender SHOULD increase cwnd by the number of + * previously unacknowledged bytes ACKed by each incoming + * acknowledgment, provided the increase is not more than L + */ + if (tp->bytes_acked < tp->mss_cache) + return; + + /* We MAY increase by 2 if discovered delayed ack */ + if (sysctl_tcp_abc > 1 && tp->bytes_acked > 2*tp->mss_cache) { + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; + } + } + tp->bytes_acked = 0; + + if (tp->snd_cwnd < tp->snd_cwnd_clamp) + tp->snd_cwnd++; +} +EXPORT_SYMBOL_GPL(tcp_slow_start); + /* * TCP Reno congestion control * This is special case used for fallback as well. diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 981d120..0a46123 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -115,8 +115,8 @@ int sysctl_tcp_abc = 1; /* Adapt the MSS value used to make delayed ack decision to the * real world. */ -static inline void tcp_measure_rcv_mss(struct sock *sk, - const struct sk_buff *skb) +static void tcp_measure_rcv_mss(struct sock *sk, + const struct sk_buff *skb) { struct inet_connection_sock *icsk = inet_csk(sk); const unsigned int lss = icsk->icsk_ack.last_seg_size; @@ -246,8 +246,8 @@ static int __tcp_grow_window(const struct sock *sk, struct tcp_sock *tp, return 0; } -static inline void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void tcp_grow_window(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { /* Check #1 */ if (tp->rcv_ssthresh < tp->window_clamp && @@ -341,6 +341,26 @@ static void tcp_clamp_window(struct sock *sk, struct tcp_sock *tp) tp->rcv_ssthresh = min(tp->window_clamp, 2U*tp->advmss); } + +/* Initialize RCV_MSS value. + * RCV_MSS is an our guess about MSS used by the peer. + * We haven't any direct information about the MSS. + * It's better to underestimate the RCV_MSS rather than overestimate. + * Overestimations make us ACKing less frequently than needed. + * Underestimations are more easy to detect and fix by tcp_measure_rcv_mss(). + */ +void tcp_initialize_rcv_mss(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + unsigned int hint = min_t(unsigned int, tp->advmss, tp->mss_cache); + + hint = min(hint, tp->rcv_wnd/2); + hint = min(hint, TCP_MIN_RCVMSS); + hint = max(hint, TCP_MIN_MSS); + + inet_csk(sk)->icsk_ack.rcv_mss = hint; +} + /* Receiver "autotuning" code. * * The algorithm for RTT estimation w/o timestamps is based on @@ -735,6 +755,27 @@ __u32 tcp_init_cwnd(struct tcp_sock *tp, struct dst_entry *dst) return min_t(__u32, cwnd, tp->snd_cwnd_clamp); } +/* Set slow start threshold and cwnd not falling to slow start */ +void tcp_enter_cwr(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + tp->prior_ssthresh = 0; + tp->bytes_acked = 0; + if (inet_csk(sk)->icsk_ca_state < TCP_CA_CWR) { + tp->undo_marker = 0; + tp->snd_ssthresh = inet_csk(sk)->icsk_ca_ops->ssthresh(sk); + tp->snd_cwnd = min(tp->snd_cwnd, + tcp_packets_in_flight(tp) + 1U); + tp->snd_cwnd_cnt = 0; + tp->high_seq = tp->snd_nxt; + tp->snd_cwnd_stamp = tcp_time_stamp; + TCP_ECN_queue_cwr(tp); + + tcp_set_ca_state(sk, TCP_CA_CWR); + } +} + /* Initialize metrics on socket. */ static void tcp_init_metrics(struct sock *sk) @@ -2070,8 +2111,8 @@ static inline void tcp_ack_update_rtt(struct sock *sk, const int flag, tcp_ack_no_tstamp(sk, seq_rtt, flag); } -static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, - u32 in_flight, int good) +static void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, + u32 in_flight, int good) { const struct inet_connection_sock *icsk = inet_csk(sk); icsk->icsk_ca_ops->cong_avoid(sk, ack, rtt, in_flight, good); @@ -2082,7 +2123,7 @@ static inline void tcp_cong_avoid(struct sock *sk, u32 ack, u32 rtt, * RFC2988 recommends to restart timer to now+rto. */ -static inline void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) +static void tcp_ack_packets_out(struct sock *sk, struct tcp_sock *tp) { if (!tp->packets_out) { inet_csk_clear_xmit_timer(sk, ICSK_TIME_RETRANS); @@ -2147,7 +2188,7 @@ static int tcp_tso_acked(struct sock *sk, struct sk_buff *skb, return acked; } -static inline u32 tcp_usrtt(const struct sk_buff *skb) +static u32 tcp_usrtt(const struct sk_buff *skb) { struct timeval tv, now; @@ -2583,8 +2624,8 @@ void tcp_parse_options(struct sk_buff *skb, struct tcp_options_received *opt_rx, /* Fast parse options. This hopes to only see timestamps. * If it is wrong it falls back on tcp_parse_options(). */ -static inline int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, - struct tcp_sock *tp) +static int tcp_fast_parse_options(struct sk_buff *skb, struct tcphdr *th, + struct tcp_sock *tp) { if (th->doff == sizeof(struct tcphdr)>>2) { tp->rx_opt.saw_tstamp = 0; @@ -2804,8 +2845,7 @@ static void tcp_fin(struct sk_buff *skb, struct sock *sk, struct tcphdr *th) } } -static __inline__ int -tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) +static inline int tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) { if (!after(seq, sp->end_seq) && !after(sp->start_seq, end_seq)) { if (before(seq, sp->start_seq)) @@ -2817,7 +2857,7 @@ tcp_sack_extend(struct tcp_sack_block *sp, u32 seq, u32 end_seq) return 0; } -static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (tp->rx_opt.sack_ok && sysctl_tcp_dsack) { if (before(seq, tp->rcv_nxt)) @@ -2832,7 +2872,7 @@ static inline void tcp_dsack_set(struct tcp_sock *tp, u32 seq, u32 end_seq) } } -static inline void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) +static void tcp_dsack_extend(struct tcp_sock *tp, u32 seq, u32 end_seq) { if (!tp->rx_opt.dsack) tcp_dsack_set(tp, seq, end_seq); @@ -2890,7 +2930,7 @@ static void tcp_sack_maybe_coalesce(struct tcp_sock *tp) } } -static __inline__ void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) +static inline void tcp_sack_swap(struct tcp_sack_block *sack1, struct tcp_sack_block *sack2) { __u32 tmp; @@ -3455,7 +3495,7 @@ void tcp_cwnd_application_limited(struct sock *sk) tp->snd_cwnd_stamp = tcp_time_stamp; } -static inline int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) +static int tcp_should_expand_sndbuf(struct sock *sk, struct tcp_sock *tp) { /* If the user specified a specific send buffer setting, do * not modify it. @@ -3502,7 +3542,7 @@ static void tcp_new_space(struct sock *sk) sk->sk_write_space(sk); } -static inline void tcp_check_space(struct sock *sk) +static void tcp_check_space(struct sock *sk) { if (sock_flag(sk, SOCK_QUEUE_SHRUNK)) { sock_reset_flag(sk, SOCK_QUEUE_SHRUNK); @@ -3512,7 +3552,7 @@ static inline void tcp_check_space(struct sock *sk) } } -static __inline__ void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) +static inline void tcp_data_snd_check(struct sock *sk, struct tcp_sock *tp) { tcp_push_pending_frames(sk, tp); tcp_check_space(sk); @@ -3544,7 +3584,7 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) } } -static __inline__ void tcp_ack_snd_check(struct sock *sk) +static inline void tcp_ack_snd_check(struct sock *sk) { if (!inet_csk_ack_scheduled(sk)) { /* We sent a data segment already. */ @@ -3692,8 +3732,7 @@ static int __tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) return result; } -static __inline__ int -tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) +static inline int tcp_checksum_complete_user(struct sock *sk, struct sk_buff *skb) { return skb->ip_summed != CHECKSUM_UNNECESSARY && __tcp_checksum_complete_user(sk, skb); @@ -4474,3 +4513,4 @@ EXPORT_SYMBOL(sysctl_tcp_abc); EXPORT_SYMBOL(tcp_parse_options); EXPORT_SYMBOL(tcp_rcv_established); EXPORT_SYMBOL(tcp_rcv_state_process); +EXPORT_SYMBOL(tcp_initialize_rcv_mss); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 9b62d80..5c70493 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -270,8 +270,7 @@ failure: /* * This routine does path mtu discovery as defined in RFC1191. */ -static inline void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, - u32 mtu) +static void do_pmtu_discovery(struct sock *sk, struct iphdr *iph, u32 mtu) { struct dst_entry *dst; struct inet_sock *inet = inet_sk(sk); @@ -662,7 +661,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } -static inline void syn_flood_warning(struct sk_buff *skb) +static void syn_flood_warning(struct sk_buff *skb) { static unsigned long warntime; @@ -677,8 +676,8 @@ static inline void syn_flood_warning(struct sk_buff *skb) /* * Save and compile IPv4 options into the request_sock if needed. */ -static inline struct ip_options *tcp_v4_save_options(struct sock *sk, - struct sk_buff *skb) +static struct ip_options *tcp_v4_save_options(struct sock *sk, + struct sk_buff *skb) { struct ip_options *opt = &(IPCB(skb)->opt); struct ip_options *dopt = NULL; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 3a0a914..a7623ea 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -51,8 +51,8 @@ int sysctl_tcp_retrans_collapse = 1; */ int sysctl_tcp_tso_win_divisor = 3; -static inline void update_send_head(struct sock *sk, struct tcp_sock *tp, - struct sk_buff *skb) +static void update_send_head(struct sock *sk, struct tcp_sock *tp, + struct sk_buff *skb) { sk->sk_send_head = skb->next; if (sk->sk_send_head == (struct sk_buff *)&sk->sk_write_queue) @@ -124,8 +124,8 @@ static void tcp_cwnd_restart(struct sock *sk, struct dst_entry *dst) tp->snd_cwnd_used = 0; } -static inline void tcp_event_data_sent(struct tcp_sock *tp, - struct sk_buff *skb, struct sock *sk) +static void tcp_event_data_sent(struct tcp_sock *tp, + struct sk_buff *skb, struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); const u32 now = tcp_time_stamp; @@ -142,7 +142,7 @@ static inline void tcp_event_data_sent(struct tcp_sock *tp, icsk->icsk_ack.pingpong = 1; } -static __inline__ void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) +static inline void tcp_event_ack_sent(struct sock *sk, unsigned int pkts) { tcp_dec_quickack_mode(sk, pkts); inet_csk_clear_xmit_timer(sk, ICSK_TIME_DACK); @@ -212,7 +212,7 @@ void tcp_select_initial_window(int __space, __u32 mss, * value can be stuffed directly into th->window for an outgoing * frame. */ -static __inline__ u16 tcp_select_window(struct sock *sk) +static u16 tcp_select_window(struct sock *sk) { struct tcp_sock *tp = tcp_sk(sk); u32 cur_win = tcp_receive_window(tp); @@ -250,6 +250,75 @@ static __inline__ u16 tcp_select_window(struct sock *sk) return new_win; } +static void tcp_build_and_update_options(__u32 *ptr, struct tcp_sock *tp, + __u32 tstamp) +{ + if (tp->rx_opt.tstamp_ok) { + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); + *ptr++ = htonl(tp->rx_opt.ts_recent); + } + if (tp->rx_opt.eff_sacks) { + struct tcp_sack_block *sp = tp->rx_opt.dsack ? tp->duplicate_sack : tp->selective_acks; + int this_sack; + + *ptr++ = htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_SACK << 8) | + (TCPOLEN_SACK_BASE + (tp->rx_opt.eff_sacks * + TCPOLEN_SACK_PERBLOCK))); + for(this_sack = 0; this_sack < tp->rx_opt.eff_sacks; this_sack++) { + *ptr++ = htonl(sp[this_sack].start_seq); + *ptr++ = htonl(sp[this_sack].end_seq); + } + if (tp->rx_opt.dsack) { + tp->rx_opt.dsack = 0; + tp->rx_opt.eff_sacks--; + } + } +} + +/* Construct a tcp options header for a SYN or SYN_ACK packet. + * If this is every changed make sure to change the definition of + * MAX_SYN_SIZE to match the new maximum number of options that you + * can generate. + */ +static void tcp_syn_build_options(__u32 *ptr, int mss, int ts, int sack, + int offer_wscale, int wscale, __u32 tstamp, + __u32 ts_recent) +{ + /* We always get an MSS option. + * The option bytes which will be seen in normal data + * packets should timestamps be used, must be in the MSS + * advertised. But we subtract them from tp->mss_cache so + * that calculations in tcp_sendmsg are simpler etc. + * So account for this fact here if necessary. If we + * don't do this correctly, as a receiver we won't + * recognize data packets as being full sized when we + * should, and thus we won't abide by the delayed ACK + * rules correctly. + * SACKs don't matter, we never delay an ACK when we + * have any of those going out. + */ + *ptr++ = htonl((TCPOPT_MSS << 24) | (TCPOLEN_MSS << 16) | mss); + if (ts) { + if(sack) + *ptr++ = __constant_htonl((TCPOPT_SACK_PERM << 24) | (TCPOLEN_SACK_PERM << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + else + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | TCPOLEN_TIMESTAMP); + *ptr++ = htonl(tstamp); /* TSVAL */ + *ptr++ = htonl(ts_recent); /* TSECR */ + } else if(sack) + *ptr++ = __constant_htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16) | + (TCPOPT_SACK_PERM << 8) | TCPOLEN_SACK_PERM); + if (offer_wscale) + *ptr++ = htonl((TCPOPT_NOP << 24) | (TCPOPT_WINDOW << 16) | (TCPOLEN_WINDOW << 8) | (wscale)); +} /* This routine actually transmits TCP packets queued in by * tcp_do_sendmsg(). This is used by both the initial @@ -724,7 +793,7 @@ unsigned int tcp_current_mss(struct sock *sk, int large_allowed) /* Congestion window validation. (RFC2861) */ -static inline void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) +static void tcp_cwnd_validate(struct sock *sk, struct tcp_sock *tp) { __u32 packets_out = tp->packets_out; @@ -773,7 +842,7 @@ static inline unsigned int tcp_cwnd_test(struct tcp_sock *tp, struct sk_buff *sk /* This must be invoked the first time we consider transmitting * SKB onto the wire. */ -static inline int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) +static int tcp_init_tso_segs(struct sock *sk, struct sk_buff *skb, unsigned int mss_now) { int tso_segs = tcp_skb_pcount(skb); @@ -1794,7 +1863,7 @@ struct sk_buff * tcp_make_synack(struct sock *sk, struct dst_entry *dst, /* * Do all connect socket setups that can be done AF independent. */ -static inline void tcp_connect_init(struct sock *sk) +static void tcp_connect_init(struct sock *sk) { struct dst_entry *dst = __sk_dst_get(sk); struct tcp_sock *tp = tcp_sk(sk); -- cgit v1.1 From 6742bbcbb8a0959e1dff0ce055768e3217d9967a Mon Sep 17 00:00:00 2001 From: Andrea Bittau Date: Wed, 4 Jan 2006 01:45:17 -0200 Subject: [DCCP] ackvec: Fix spelling of "throw" Signed-off-by: Andrea Bittau Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ackvec.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index c9a62cc..a979f4e 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -291,7 +291,7 @@ void dccp_ackvec_print(const struct dccp_ackvec *av) } #endif -static void dccp_ackvec_trow_away_ack_record(struct dccp_ackvec *av) +static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av) { /* * As we're keeping track of the ack vector size (dccpav_vec_len) and @@ -326,7 +326,7 @@ void dccp_ackvec_check_rcv_ackno(struct dccp_ackvec *av, struct sock *sk, debug_prefix, 1, (unsigned long long)av->dccpav_ack_seqno, (unsigned long long)av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); av->dccpav_ack_seqno = DCCP_MAX_SEQNO + 1; } } @@ -389,7 +389,7 @@ static void dccp_ackvec_check_rcv_ackvector(struct dccp_ackvec *av, av->dccpav_ack_seqno, (unsigned long long) av->dccpav_ack_ackno); - dccp_ackvec_trow_away_ack_record(av); + dccp_ackvec_throw_away_ack_record(av); } /* * If dccpav_ack_seqno was not received, no problem -- cgit v1.1 From e4dfd449c80a41bb615b23d0fc198ba08360a1f8 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Jan 2006 01:46:34 -0200 Subject: [DCCP] ackvec: use u8 for the buf offsets Signed-off-by: Arnaldo Carvalho de Melo --- net/dccp/ackvec.c | 27 +++++++++++++++++---------- net/dccp/ackvec.h | 12 ++++++------ 2 files changed, 23 insertions(+), 16 deletions(-) diff --git a/net/dccp/ackvec.c b/net/dccp/ackvec.c index a979f4e..ce9cb77 100644 --- a/net/dccp/ackvec.c +++ b/net/dccp/ackvec.c @@ -55,8 +55,8 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) from = av->dccpav_buf + av->dccpav_buf_head; /* Check if buf_head wraps */ - if (av->dccpav_buf_head + len > av->dccpav_vec_len) { - const u32 tailsize = (av->dccpav_vec_len - av->dccpav_buf_head); + if ((int)av->dccpav_buf_head + len > av->dccpav_vec_len) { + const u32 tailsize = av->dccpav_vec_len - av->dccpav_buf_head; memcpy(to, from, tailsize); to += tailsize; @@ -93,8 +93,14 @@ int dccp_insert_option_ackvec(struct sock *sk, struct sk_buff *skb) struct dccp_ackvec *dccp_ackvec_alloc(const unsigned int len, const gfp_t priority) { - struct dccp_ackvec *av = kmalloc(sizeof(*av) + len, priority); + struct dccp_ackvec *av; + BUG_ON(len == 0); + + if (len > DCCP_MAX_ACKVEC_LEN) + return NULL; + + av = kmalloc(sizeof(*av) + len, priority); if (av != NULL) { av->dccpav_buf_len = len; av->dccpav_buf_head = @@ -117,13 +123,13 @@ void dccp_ackvec_free(struct dccp_ackvec *av) } static inline u8 dccp_ackvec_state(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_STATE_MASK; } static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, - const unsigned int index) + const u8 index) { return av->dccpav_buf[index] & DCCP_ACKVEC_LEN_MASK; } @@ -135,7 +141,7 @@ static inline u8 dccp_ackvec_len(const struct dccp_ackvec *av, */ static inline int dccp_ackvec_set_buf_head_state(struct dccp_ackvec *av, const unsigned int packets, - const unsigned char state) + const unsigned char state) { unsigned int gap; signed long new_head; @@ -223,7 +229,7 @@ int dccp_ackvec_add(struct dccp_ackvec *av, const struct sock *sk, * could reduce the complexity of this scan.) */ u64 delta = dccp_delta_seqno(ackno, av->dccpav_buf_ackno); - unsigned int index = av->dccpav_buf_head; + u8 index = av->dccpav_buf_head; while (1) { const u8 len = dccp_ackvec_len(av, index); @@ -301,9 +307,10 @@ static void dccp_ackvec_throw_away_ack_record(struct dccp_ackvec *av) * draft-ietf-dccp-spec-11.txt Appendix A. -acme */ #if 0 - av->dccpav_buf_tail = av->dccpav_ack_ptr + 1; - if (av->dccpav_buf_tail >= av->dccpav_vec_len) - av->dccpav_buf_tail -= av->dccpav_vec_len; + u32 new_buf_tail = av->dccpav_ack_ptr + 1; + if (new_buf_tail >= av->dccpav_vec_len) + new_buf_tail -= av->dccpav_vec_len; + av->dccpav_buf_tail = new_buf_tail; #endif av->dccpav_vec_len -= av->dccpav_sent_len; } diff --git a/net/dccp/ackvec.h b/net/dccp/ackvec.h index d0fd6c6..f7dfb5f 100644 --- a/net/dccp/ackvec.h +++ b/net/dccp/ackvec.h @@ -54,16 +54,16 @@ * @dccpav_buf - circular buffer of acknowledgeable packets */ struct dccp_ackvec { - unsigned int dccpav_buf_head; - unsigned int dccpav_buf_tail; u64 dccpav_buf_ackno; u64 dccpav_ack_seqno; u64 dccpav_ack_ackno; - unsigned int dccpav_ack_ptr; - unsigned int dccpav_sent_len; - unsigned int dccpav_vec_len; - unsigned int dccpav_buf_len; struct timeval dccpav_time; + u8 dccpav_buf_head; + u8 dccpav_buf_tail; + u8 dccpav_ack_ptr; + u8 dccpav_sent_len; + u8 dccpav_vec_len; + u8 dccpav_buf_len; u8 dccpav_buf_nonce; u8 dccpav_ack_nonce; u8 dccpav_buf[0]; -- cgit v1.1 From 80e40daa4797a156781d1594642b654eb1c461df Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Jan 2006 01:58:06 -0200 Subject: [TCP]: syn_flood_warning is only needed if CONFIG_SYN_COOKIES is selected CC net/ipv4/tcp_ipv4.o /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/tcp_ipv4.c:665: warning: 'syn_flood_warning' defined but not used Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/tcp_ipv4.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 5c70493..e9f83e5 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -661,6 +661,7 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req) kfree(inet_rsk(req)->opt); } +#ifdef CONFIG_SYN_COOKIES static void syn_flood_warning(struct sk_buff *skb) { static unsigned long warntime; @@ -672,6 +673,7 @@ static void syn_flood_warning(struct sk_buff *skb) ntohs(skb->h.th->dest)); } } +#endif /* * Save and compile IPv4 options into the request_sock if needed. -- cgit v1.1 From f190055ff5c08a877d3e1ac2e0300fd92c264b06 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Wed, 4 Jan 2006 02:02:20 -0200 Subject: [IPVS]: Add missing include CC [M] net/ipv4/ipvs/ip_vs_conn.o /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c: In function 'ip_vs_conn_new': /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c:606: warning: implicit declaration of function 'net_ratelimit' /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c: In function 'ip_vs_random_dropentry': /pub/scm/linux/kernel/git/acme/net-2.6/net/ipv4/ipvs/ip_vs_conn.c:810: warning: implicit declaration of function 'net_random' Signed-off-by: Arnaldo Carvalho de Melo --- net/ipv4/ipvs/ip_vs_conn.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/ipv4/ipvs/ip_vs_conn.c b/net/ipv4/ipvs/ip_vs_conn.c index ed5bd56..81d90354 100644 --- a/net/ipv4/ipvs/ip_vs_conn.c +++ b/net/ipv4/ipvs/ip_vs_conn.c @@ -25,6 +25,7 @@ */ #include +#include #include #include #include -- cgit v1.1 From ca4033024858fcbd392465ba9cbf4c838aedfb58 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 4 Jan 2006 13:56:08 -0800 Subject: [ECONET]: Use macro for spinlock_t definition. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/econet/af_econet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/econet/af_econet.c b/net/econet/af_econet.c index 4fc2452..c792994 100644 --- a/net/econet/af_econet.c +++ b/net/econet/af_econet.c @@ -57,7 +57,7 @@ static struct net_device *net2dev_map[256]; #define EC_PORT_IP 0xd2 #ifdef CONFIG_ECONET_AUNUDP -static spinlock_t aun_queue_lock; +static DEFINE_SPINLOCK(aun_queue_lock); static struct socket *udpsock; #define AUN_PORT 0x8000 -- cgit v1.1 From 196433c5b788eb732fdcf92449274e302f089ce4 Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 4 Jan 2006 13:56:31 -0800 Subject: [IPV6]: Use macro for rwlock_t initialization. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/mcast.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index f829a4a..1cf305a 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -224,7 +224,7 @@ int ipv6_sock_mc_join(struct sock *sk, int ifindex, struct in6_addr *addr) mc_lst->ifindex = dev->ifindex; mc_lst->sfmode = MCAST_EXCLUDE; - mc_lst->sflock = RW_LOCK_UNLOCKED; + rwlock_init(&mc_lst->sflock); mc_lst->sflist = NULL; /* -- cgit v1.1 From 181a46a56e9f852060c54247209e93740329b6eb Mon Sep 17 00:00:00 2001 From: YOSHIFUJI Hideaki Date: Wed, 4 Jan 2006 13:56:54 -0800 Subject: [NETFILTER]: Use macro for spinlock_t/rwlock_t initializations/definition. Signed-off-by: YOSHIFUJI Hideaki Signed-off-by: David S. Miller --- net/ipv6/netfilter/nf_conntrack_reasm.c | 4 ++-- net/netfilter/nfnetlink_log.c | 2 +- net/netfilter/nfnetlink_queue.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index c2c52af..f3e5ffb 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -98,7 +98,7 @@ struct nf_ct_frag6_queue #define FRAG6Q_HASHSZ 64 static struct nf_ct_frag6_queue *nf_ct_frag6_hash[FRAG6Q_HASHSZ]; -static rwlock_t nf_ct_frag6_lock = RW_LOCK_UNLOCKED; +static DEFINE_RWLOCK(nf_ct_frag6_lock); static u32 nf_ct_frag6_hash_rnd; static LIST_HEAD(nf_ct_frag6_lru_list); int nf_ct_frag6_nqueues = 0; @@ -371,7 +371,7 @@ nf_ct_frag6_create(unsigned int hash, u32 id, struct in6_addr *src, struct init_timer(&fq->timer); fq->timer.function = nf_ct_frag6_expire; fq->timer.data = (long) fq; - fq->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&fq->lock); atomic_set(&fq->refcnt, 1); return nf_ct_frag6_intern(hash, fq); diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c index cba6372..e10512e 100644 --- a/net/netfilter/nfnetlink_log.c +++ b/net/netfilter/nfnetlink_log.c @@ -151,7 +151,7 @@ instance_create(u_int16_t group_num, int pid) goto out_unlock; INIT_HLIST_NODE(&inst->hlist); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c index f28460b6..55afdda 100644 --- a/net/netfilter/nfnetlink_queue.c +++ b/net/netfilter/nfnetlink_queue.c @@ -148,7 +148,7 @@ instance_create(u_int16_t queue_num, int pid) atomic_set(&inst->id_sequence, 0); /* needs to be two, since we _put() after creation */ atomic_set(&inst->use, 2); - inst->lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&inst->lock); INIT_LIST_HEAD(&inst->queue_list); if (!try_module_get(THIS_MODULE)) -- cgit v1.1 From 9369986306d4692f37b61302d4e1ce3054d8833e Mon Sep 17 00:00:00 2001 From: Kris Katterjohn Date: Wed, 4 Jan 2006 13:58:36 -0800 Subject: [NET]: More instruction checks fornet/core/filter.c Signed-off-by: Kris Katterjohn Signed-off-by: David S. Miller --- net/core/filter.c | 112 ++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 80 insertions(+), 32 deletions(-) diff --git a/net/core/filter.c b/net/core/filter.c index 3a10e0b..8964d34 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -13,6 +13,7 @@ * 2 of the License, or (at your option) any later version. * * Andi Kleen - Fix a few bad bugs and races. + * Kris Katterjohn - Added many additional checks in sk_chk_filter() */ #include @@ -250,7 +251,7 @@ load_b: mem[fentry->k] = X; continue; default: - /* Invalid instruction counts as RET */ + WARN_ON(1); return 0; } @@ -283,8 +284,8 @@ load_b: * * Check the user's filter code. If we let some ugly * filter code slip through kaboom! The filter must contain - * no references or jumps that are out of range, no illegal instructions - * and no backward jumps. It must end with a RET instruction + * no references or jumps that are out of range, no illegal + * instructions, and must end with a RET instruction. * * Returns 0 if the rule set is legal or a negative errno code if not. */ @@ -300,38 +301,85 @@ int sk_chk_filter(struct sock_filter *filter, int flen) for (pc = 0; pc < flen; pc++) { /* all jumps are forward as they are not signed */ ftest = &filter[pc]; - if (BPF_CLASS(ftest->code) == BPF_JMP) { - /* but they mustn't jump off the end */ - if (BPF_OP(ftest->code) == BPF_JA) { - /* - * Note, the large ftest->k might cause loops. - * Compare this with conditional jumps below, - * where offsets are limited. --ANK (981016) - */ - if (ftest->k >= (unsigned)(flen-pc-1)) - return -EINVAL; - } else { - /* for conditionals both must be safe */ - if (pc + ftest->jt +1 >= flen || - pc + ftest->jf +1 >= flen) - return -EINVAL; - } - } - /* check for division by zero -Kris Katterjohn 2005-10-30 */ - if (ftest->code == (BPF_ALU|BPF_DIV|BPF_K) && ftest->k == 0) - return -EINVAL; + /* Only allow valid instructions */ + switch (ftest->code) { + case BPF_ALU|BPF_ADD|BPF_K: + case BPF_ALU|BPF_ADD|BPF_X: + case BPF_ALU|BPF_SUB|BPF_K: + case BPF_ALU|BPF_SUB|BPF_X: + case BPF_ALU|BPF_MUL|BPF_K: + case BPF_ALU|BPF_MUL|BPF_X: + case BPF_ALU|BPF_DIV|BPF_X: + case BPF_ALU|BPF_AND|BPF_K: + case BPF_ALU|BPF_AND|BPF_X: + case BPF_ALU|BPF_OR|BPF_K: + case BPF_ALU|BPF_OR|BPF_X: + case BPF_ALU|BPF_LSH|BPF_K: + case BPF_ALU|BPF_LSH|BPF_X: + case BPF_ALU|BPF_RSH|BPF_K: + case BPF_ALU|BPF_RSH|BPF_X: + case BPF_ALU|BPF_NEG: + case BPF_LD|BPF_W|BPF_ABS: + case BPF_LD|BPF_H|BPF_ABS: + case BPF_LD|BPF_B|BPF_ABS: + case BPF_LD|BPF_W|BPF_LEN: + case BPF_LD|BPF_W|BPF_IND: + case BPF_LD|BPF_H|BPF_IND: + case BPF_LD|BPF_B|BPF_IND: + case BPF_LD|BPF_IMM: + case BPF_LDX|BPF_W|BPF_LEN: + case BPF_LDX|BPF_B|BPF_MSH: + case BPF_LDX|BPF_IMM: + case BPF_MISC|BPF_TAX: + case BPF_MISC|BPF_TXA: + case BPF_RET|BPF_K: + case BPF_RET|BPF_A: + break; + + /* Some instructions need special checks */ - /* check that memory operations use valid addresses. */ - if (ftest->k >= BPF_MEMWORDS) { - /* but it might not be a memory operation... */ - switch (ftest->code) { - case BPF_ST: - case BPF_STX: - case BPF_LD|BPF_MEM: - case BPF_LDX|BPF_MEM: + case BPF_ALU|BPF_DIV|BPF_K: + /* check for division by zero */ + if (ftest->k == 0) return -EINVAL; - } + break; + + case BPF_LD|BPF_MEM: + case BPF_LDX|BPF_MEM: + case BPF_ST: + case BPF_STX: + /* check for invalid memory addresses */ + if (ftest->k >= BPF_MEMWORDS) + return -EINVAL; + break; + + case BPF_JMP|BPF_JA: + /* + * Note, the large ftest->k might cause loops. + * Compare this with conditional jumps below, + * where offsets are limited. --ANK (981016) + */ + if (ftest->k >= (unsigned)(flen-pc-1)) + return -EINVAL; + break; + + case BPF_JMP|BPF_JEQ|BPF_K: + case BPF_JMP|BPF_JEQ|BPF_X: + case BPF_JMP|BPF_JGE|BPF_K: + case BPF_JMP|BPF_JGE|BPF_X: + case BPF_JMP|BPF_JGT|BPF_K: + case BPF_JMP|BPF_JGT|BPF_X: + case BPF_JMP|BPF_JSET|BPF_K: + case BPF_JMP|BPF_JSET|BPF_X: + /* for conditionals both must be safe */ + if (pc + ftest->jt + 1 >= flen || + pc + ftest->jf + 1 >= flen) + return -EINVAL; + break; + + default: + return -EINVAL; } } -- cgit v1.1 From 74cb8798222bb7d1aecb0acb91e6eeedf5feb948 Mon Sep 17 00:00:00 2001 From: Thomas Young Date: Wed, 4 Jan 2006 13:59:32 -0800 Subject: [TCP] tcp_vegas: Fix slow start Vegas' slow start was only adding one MSS per RTT rather than one for every ack. Slow start behavior should now match Reno. Signed-off-by: Thomas Young Signed-off-by: David S. Miller --- net/ipv4/tcp_vegas.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp_vegas.c b/net/ipv4/tcp_vegas.c index 13e7e6e..3b74034 100644 --- a/net/ipv4/tcp_vegas.c +++ b/net/ipv4/tcp_vegas.c @@ -330,6 +330,10 @@ static void tcp_vegas_cong_avoid(struct sock *sk, u32 ack, vegas->cntRTT = 0; vegas->minRTT = 0x7fffffff; } + /* Use normal slow start */ + else if (tp->snd_cwnd <= tp->snd_ssthresh) + tcp_slow_start(tp); + } /* Extract info for Tcp socket info provided via netlink. */ -- cgit v1.1