diff options
-rw-r--r-- | Documentation/networking/ipvlan.txt | 107 | ||||
-rw-r--r-- | drivers/net/Kconfig | 18 | ||||
-rw-r--r-- | drivers/net/Makefile | 1 | ||||
-rw-r--r-- | drivers/net/ipvlan/Makefile | 7 | ||||
-rw-r--r-- | drivers/net/ipvlan/ipvlan.h | 130 | ||||
-rw-r--r-- | drivers/net/ipvlan/ipvlan_core.c | 607 | ||||
-rw-r--r-- | drivers/net/ipvlan/ipvlan_main.c | 789 | ||||
-rw-r--r-- | include/linux/netdevice.h | 4 | ||||
-rw-r--r-- | include/uapi/linux/if_link.h | 15 |
9 files changed, 1678 insertions, 0 deletions
diff --git a/Documentation/networking/ipvlan.txt b/Documentation/networking/ipvlan.txt new file mode 100644 index 0000000..cf99639 --- /dev/null +++ b/Documentation/networking/ipvlan.txt @@ -0,0 +1,107 @@ + + IPVLAN Driver HOWTO + +Initial Release: + Mahesh Bandewar <maheshb AT google.com> + +1. Introduction: + This is conceptually very similar to the macvlan driver with one major +exception of using L3 for mux-ing /demux-ing among slaves. This property makes +the master device share the L2 with it's slave devices. I have developed this +driver in conjuntion with network namespaces and not sure if there is use case +outside of it. + + +2. Building and Installation: + In order to build the driver, please select the config item CONFIG_IPVLAN. +The driver can be built into the kernel (CONFIG_IPVLAN=y) or as a module +(CONFIG_IPVLAN=m). + + +3. Configuration: + There are no module parameters for this driver and it can be configured +using IProute2/ip utility. + + ip link add link <master-dev> <slave-dev> type ipvlan mode { l2 | L3 } + + e.g. ip link add link ipvl0 eth0 type ipvlan mode l2 + + +4. Operating modes: + IPvlan has two modes of operation - L2 and L3. For a given master device, +you can select one of these two modes and all slaves on that master will +operate in the same (selected) mode. The RX mode is almost identical except +that in L3 mode the slaves wont receive any multicast / broadcast traffic. +L3 mode is more restrictive since routing is controlled from the other (mostly) +default namespace. + +4.1 L2 mode: + In this mode TX processing happens on the stack instance attached to the +slave device and packets are switched and queued to the master device to send +out. In this mode the slaves will RX/TX multicast and broadcast (if applicable) +as well. + +4.2 L3 mode: + In this mode TX processing upto L3 happens on the stack instance attached +to the slave device and packets are switched to the stack instance of the +master device for the L2 processing and routing from that instance will be +used before packets are queued on the outbound device. In this mode the slaves +will not receive nor can send multicast / broadcast traffic. + + +5. What to choose (macvlan vs. ipvlan)? + These two devices are very similar in many regards and the specific use +case could very well define which device to choose. if one of the following +situations defines your use case then you can choose to use ipvlan - + (a) The Linux host that is connected to the external switch / router has +policy configured that allows only one mac per port. + (b) No of virtual devices created on a master exceed the mac capacity and +puts the NIC in promiscous mode and degraded performance is a concern. + (c) If the slave device is to be put into the hostile / untrusted network +namespace where L2 on the slave could be changed / misused. + + +6. Example configuration: + + +=============================================================+ + | Host: host1 | + | | + | +----------------------+ +----------------------+ | + | | NS:ns0 | | NS:ns1 | | + | | | | | | + | | | | | | + | | ipvl0 | | ipvl1 | | + | +----------#-----------+ +-----------#----------+ | + | # # | + | ################################ | + | # eth0 | + +==============================#==============================+ + + + (a) Create two network namespaces - ns0, ns1 + ip netns add ns0 + ip netns add ns1 + + (b) Create two ipvlan slaves on eth0 (master device) + ip link add link eth0 ipvl0 type ipvlan mode l2 + ip link add link eth0 ipvl1 type ipvlan mode l2 + + (c) Assign slaves to the respective network namespaces + ip link set dev ipvl0 netns ns0 + ip link set dev ipvl1 netns ns1 + + (d) Now switch to the namespace (ns0 or ns1) to configure the slave devices + - For ns0 + (1) ip netns exec ns0 bash + (2) ip link set dev ipvl0 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl0 + (6) ip -4 route add default via $ROUTER dev ipvl0 + - For ns1 + (1) ip netns exec ns1 bash + (2) ip link set dev ipvl1 up + (3) ip link set dev lo up + (4) ip -4 addr add 127.0.0.1 dev lo + (5) ip -4 addr add $IPADDR dev ipvl1 + (6) ip -4 route add default via $ROUTER dev ipvl1 diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index f9009be..b6d64f5 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -145,6 +145,24 @@ config MACVTAP To compile this driver as a module, choose M here: the module will be called macvtap. + +config IPVLAN + tristate "IP-VLAN support" + ---help--- + This allows one to create virtual devices off of a main interface + and packets will be delivered based on the dest L3 (IPv6/IPv4 addr) + on packets. All interfaces (including the main interface) share L2 + making it transparent to the connected L2 switch. + + Ipvlan devices can be added using the "ip" command from the + iproute2 package starting with the iproute2-X.Y.ZZ release: + + "ip link add link <main-dev> [ NAME ] type ipvlan" + + To compile this driver as a module, choose M here: the module + will be called ipvlan. + + config VXLAN tristate "Virtual eXtensible Local Area Network (VXLAN)" depends on INET diff --git a/drivers/net/Makefile b/drivers/net/Makefile index 61aefdd..e25fdd7 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -6,6 +6,7 @@ # Networking Core Drivers # obj-$(CONFIG_BONDING) += bonding/ +obj-$(CONFIG_IPVLAN) += ipvlan/ obj-$(CONFIG_DUMMY) += dummy.o obj-$(CONFIG_EQUALIZER) += eql.o obj-$(CONFIG_IFB) += ifb.o diff --git a/drivers/net/ipvlan/Makefile b/drivers/net/ipvlan/Makefile new file mode 100644 index 0000000..df79910 --- /dev/null +++ b/drivers/net/ipvlan/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the Ethernet Ipvlan driver +# + +obj-$(CONFIG_IPVLAN) += ipvlan.o + +ipvlan-objs := ipvlan_core.o ipvlan_main.o diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h new file mode 100644 index 0000000..ab3e761 --- /dev/null +++ b/drivers/net/ipvlan/ipvlan.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ +#ifndef __IPVLAN_H +#define __IPVLAN_H + +#include <linux/kernel.h> +#include <linux/types.h> +#include <linux/module.h> +#include <linux/init.h> +#include <linux/rculist.h> +#include <linux/notifier.h> +#include <linux/netdevice.h> +#include <linux/etherdevice.h> +#include <linux/if_arp.h> +#include <linux/if_link.h> +#include <linux/if_vlan.h> +#include <linux/ip.h> +#include <linux/inetdevice.h> +#include <net/rtnetlink.h> +#include <net/gre.h> +#include <net/route.h> +#include <net/addrconf.h> + +#define IPVLAN_DRV "ipvlan" +#define IPV_DRV_VER "0.1" + +#define IPVLAN_HASH_SIZE (1 << BITS_PER_BYTE) +#define IPVLAN_HASH_MASK (IPVLAN_HASH_SIZE - 1) + +#define IPVLAN_MAC_FILTER_BITS 8 +#define IPVLAN_MAC_FILTER_SIZE (1 << IPVLAN_MAC_FILTER_BITS) +#define IPVLAN_MAC_FILTER_MASK (IPVLAN_MAC_FILTER_SIZE - 1) + +typedef enum { + IPVL_IPV6 = 0, + IPVL_ICMPV6, + IPVL_IPV4, + IPVL_ARP, +} ipvl_hdr_type; + +struct ipvl_pcpu_stats { + u64 rx_pkts; + u64 rx_bytes; + u64 rx_mcast; + u64 tx_pkts; + u64 tx_bytes; + struct u64_stats_sync syncp; + u32 rx_errs; + u32 tx_drps; +}; + +struct ipvl_port; + +struct ipvl_dev { + struct net_device *dev; + struct list_head pnode; + struct ipvl_port *port; + struct net_device *phy_dev; + struct list_head addrs; + int ipv4cnt; + int ipv6cnt; + struct ipvl_pcpu_stats *pcpu_stats; + DECLARE_BITMAP(mac_filters, IPVLAN_MAC_FILTER_SIZE); + netdev_features_t sfeatures; + u32 msg_enable; + u16 mtu_adj; +}; + +struct ipvl_addr { + struct ipvl_dev *master; /* Back pointer to master */ + union { + struct in6_addr ip6; /* IPv6 address on logical interface */ + struct in_addr ip4; /* IPv4 address on logical interface */ + } ipu; +#define ip6addr ipu.ip6 +#define ip4addr ipu.ip4 + struct hlist_node hlnode; /* Hash-table linkage */ + struct list_head anode; /* logical-interface linkage */ + struct rcu_head rcu; + ipvl_hdr_type atype; +}; + +struct ipvl_port { + struct net_device *dev; + struct hlist_head hlhead[IPVLAN_HASH_SIZE]; + struct list_head ipvlans; + struct rcu_head rcu; + int count; + u16 mode; +}; + +static inline struct ipvl_port *ipvlan_port_get_rcu(const struct net_device *d) +{ + return rcu_dereference(d->rx_handler_data); +} + +static inline struct ipvl_port *ipvlan_port_get_rtnl(const struct net_device *d) +{ + return rtnl_dereference(d->rx_handler_data); +} + +static inline bool ipvlan_dev_master(struct net_device *d) +{ + return d->priv_flags & IFF_IPVLAN_MASTER; +} + +static inline bool ipvlan_dev_slave(struct net_device *d) +{ + return d->priv_flags & IFF_IPVLAN_SLAVE; +} + +void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev); +void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval); +void ipvlan_init_secret(void); +unsigned int ipvlan_mac_hash(const unsigned char *addr); +rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb); +int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev); +void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr); +bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6); +struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, + const void *iaddr, bool is_v6); +void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync); +#endif /* __IPVLAN_H */ diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c new file mode 100644 index 0000000..a14d877 --- /dev/null +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -0,0 +1,607 @@ +/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ + +#include "ipvlan.h" + +static u32 ipvlan_jhash_secret; + +void ipvlan_init_secret(void) +{ + net_get_random_once(&ipvlan_jhash_secret, sizeof(ipvlan_jhash_secret)); +} + +static void ipvlan_count_rx(const struct ipvl_dev *ipvlan, + unsigned int len, bool success, bool mcast) +{ + if (!ipvlan) + return; + + if (likely(success)) { + struct ipvl_pcpu_stats *pcptr; + + pcptr = this_cpu_ptr(ipvlan->pcpu_stats); + u64_stats_update_begin(&pcptr->syncp); + pcptr->rx_pkts++; + pcptr->rx_bytes += len; + if (mcast) + pcptr->rx_mcast++; + u64_stats_update_end(&pcptr->syncp); + } else { + this_cpu_inc(ipvlan->pcpu_stats->rx_errs); + } +} + +static u8 ipvlan_get_v6_hash(const void *iaddr) +{ + const struct in6_addr *ip6_addr = iaddr; + + return __ipv6_addr_jhash(ip6_addr, ipvlan_jhash_secret) & + IPVLAN_HASH_MASK; +} + +static u8 ipvlan_get_v4_hash(const void *iaddr) +{ + const struct in_addr *ip4_addr = iaddr; + + return jhash_1word(ip4_addr->s_addr, ipvlan_jhash_secret) & + IPVLAN_HASH_MASK; +} + +struct ipvl_addr *ipvlan_ht_addr_lookup(const struct ipvl_port *port, + const void *iaddr, bool is_v6) +{ + struct ipvl_addr *addr; + u8 hash; + + hash = is_v6 ? ipvlan_get_v6_hash(iaddr) : + ipvlan_get_v4_hash(iaddr); + hlist_for_each_entry_rcu(addr, &port->hlhead[hash], hlnode) { + if (is_v6 && addr->atype == IPVL_IPV6 && + ipv6_addr_equal(&addr->ip6addr, iaddr)) + return addr; + else if (!is_v6 && addr->atype == IPVL_IPV4 && + addr->ip4addr.s_addr == + ((struct in_addr *)iaddr)->s_addr) + return addr; + } + return NULL; +} + +void ipvlan_ht_addr_add(struct ipvl_dev *ipvlan, struct ipvl_addr *addr) +{ + struct ipvl_port *port = ipvlan->port; + u8 hash; + + hash = (addr->atype == IPVL_IPV6) ? + ipvlan_get_v6_hash(&addr->ip6addr) : + ipvlan_get_v4_hash(&addr->ip4addr); + hlist_add_head_rcu(&addr->hlnode, &port->hlhead[hash]); +} + +void ipvlan_ht_addr_del(struct ipvl_addr *addr, bool sync) +{ + hlist_del_rcu(&addr->hlnode); + if (sync) + synchronize_rcu(); +} + +bool ipvlan_addr_busy(struct ipvl_dev *ipvlan, void *iaddr, bool is_v6) +{ + struct ipvl_port *port = ipvlan->port; + struct ipvl_addr *addr; + + list_for_each_entry(addr, &ipvlan->addrs, anode) { + if ((is_v6 && addr->atype == IPVL_IPV6 && + ipv6_addr_equal(&addr->ip6addr, iaddr)) || + (!is_v6 && addr->atype == IPVL_IPV4 && + addr->ip4addr.s_addr == ((struct in_addr *)iaddr)->s_addr)) + return true; + } + + if (ipvlan_ht_addr_lookup(port, iaddr, is_v6)) + return true; + + return false; +} + +static void *ipvlan_get_L3_hdr(struct sk_buff *skb, int *type) +{ + void *lyr3h = NULL; + + switch (skb->protocol) { + case htons(ETH_P_ARP): { + struct arphdr *arph; + + if (unlikely(!pskb_may_pull(skb, sizeof(*arph)))) + return NULL; + + arph = arp_hdr(skb); + *type = IPVL_ARP; + lyr3h = arph; + break; + } + case htons(ETH_P_IP): { + u32 pktlen; + struct iphdr *ip4h; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip4h)))) + return NULL; + + ip4h = ip_hdr(skb); + pktlen = ntohs(ip4h->tot_len); + if (ip4h->ihl < 5 || ip4h->version != 4) + return NULL; + if (skb->len < pktlen || pktlen < (ip4h->ihl * 4)) + return NULL; + + *type = IPVL_IPV4; + lyr3h = ip4h; + break; + } + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h; + + if (unlikely(!pskb_may_pull(skb, sizeof(*ip6h)))) + return NULL; + + ip6h = ipv6_hdr(skb); + if (ip6h->version != 6) + return NULL; + + *type = IPVL_IPV6; + lyr3h = ip6h; + /* Only Neighbour Solicitation pkts need different treatment */ + if (ipv6_addr_any(&ip6h->saddr) && + ip6h->nexthdr == NEXTHDR_ICMP) { + *type = IPVL_ICMPV6; + lyr3h = ip6h + 1; + } + break; + } + default: + return NULL; + } + + return lyr3h; +} + +unsigned int ipvlan_mac_hash(const unsigned char *addr) +{ + u32 hash = jhash_1word(__get_unaligned_cpu32(addr+2), + ipvlan_jhash_secret); + + return hash & IPVLAN_MAC_FILTER_MASK; +} + +static void ipvlan_multicast_frame(struct ipvl_port *port, struct sk_buff *skb, + const struct ipvl_dev *in_dev, bool local) +{ + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_dev *ipvlan; + struct sk_buff *nskb; + unsigned int len; + unsigned int mac_hash; + int ret; + + if (skb->protocol == htons(ETH_P_PAUSE)) + return; + + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + if (local && (ipvlan == in_dev)) + continue; + + mac_hash = ipvlan_mac_hash(eth->h_dest); + if (!test_bit(mac_hash, ipvlan->mac_filters)) + continue; + + ret = NET_RX_DROP; + len = skb->len + ETH_HLEN; + nskb = skb_clone(skb, GFP_ATOMIC); + if (!nskb) + goto mcast_acct; + + if (ether_addr_equal(eth->h_dest, ipvlan->phy_dev->broadcast)) + nskb->pkt_type = PACKET_BROADCAST; + else + nskb->pkt_type = PACKET_MULTICAST; + + nskb->dev = ipvlan->dev; + if (local) + ret = dev_forward_skb(ipvlan->dev, nskb); + else + ret = netif_rx(nskb); +mcast_acct: + ipvlan_count_rx(ipvlan, len, ret == NET_RX_SUCCESS, true); + } + + /* Locally generated? ...Forward a copy to the main-device as + * well. On the RX side we'll ignore it (wont give it to any + * of the virtual devices. + */ + if (local) { + nskb = skb_clone(skb, GFP_ATOMIC); + if (nskb) { + if (ether_addr_equal(eth->h_dest, port->dev->broadcast)) + nskb->pkt_type = PACKET_BROADCAST; + else + nskb->pkt_type = PACKET_MULTICAST; + + dev_forward_skb(port->dev, nskb); + } + } +} + +static int ipvlan_rcv_frame(struct ipvl_addr *addr, struct sk_buff *skb, + bool local) +{ + struct ipvl_dev *ipvlan = addr->master; + struct net_device *dev = ipvlan->dev; + unsigned int len; + rx_handler_result_t ret = RX_HANDLER_CONSUMED; + bool success = false; + + len = skb->len + ETH_HLEN; + if (unlikely(!(dev->flags & IFF_UP))) { + kfree_skb(skb); + goto out; + } + + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + goto out; + + skb->dev = dev; + skb->pkt_type = PACKET_HOST; + + if (local) { + if (dev_forward_skb(ipvlan->dev, skb) == NET_RX_SUCCESS) + success = true; + } else { + ret = RX_HANDLER_ANOTHER; + success = true; + } + +out: + ipvlan_count_rx(ipvlan, len, success, false); + return ret; +} + +static struct ipvl_addr *ipvlan_addr_lookup(struct ipvl_port *port, + void *lyr3h, int addr_type, + bool use_dest) +{ + struct ipvl_addr *addr = NULL; + + if (addr_type == IPVL_IPV6) { + struct ipv6hdr *ip6h; + struct in6_addr *i6addr; + + ip6h = (struct ipv6hdr *)lyr3h; + i6addr = use_dest ? &ip6h->daddr : &ip6h->saddr; + addr = ipvlan_ht_addr_lookup(port, i6addr, true); + } else if (addr_type == IPVL_ICMPV6) { + struct nd_msg *ndmh; + struct in6_addr *i6addr; + + /* Make sure that the NeighborSolicitation ICMPv6 packets + * are handled to avoid DAD issue. + */ + ndmh = (struct nd_msg *)lyr3h; + if (ndmh->icmph.icmp6_type == NDISC_NEIGHBOUR_SOLICITATION) { + i6addr = &ndmh->target; + addr = ipvlan_ht_addr_lookup(port, i6addr, true); + } + } else if (addr_type == IPVL_IPV4) { + struct iphdr *ip4h; + __be32 *i4addr; + + ip4h = (struct iphdr *)lyr3h; + i4addr = use_dest ? &ip4h->daddr : &ip4h->saddr; + addr = ipvlan_ht_addr_lookup(port, i4addr, false); + } else if (addr_type == IPVL_ARP) { + struct arphdr *arph; + unsigned char *arp_ptr; + __be32 dip; + + arph = (struct arphdr *)lyr3h; + arp_ptr = (unsigned char *)(arph + 1); + if (use_dest) + arp_ptr += (2 * port->dev->addr_len) + 4; + else + arp_ptr += port->dev->addr_len; + + memcpy(&dip, arp_ptr, 4); + addr = ipvlan_ht_addr_lookup(port, &dip, false); + } + + return addr; +} + +static int ipvlan_process_v4_outbound(struct sk_buff *skb) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net_device *dev = skb->dev; + struct rtable *rt; + int err, ret = NET_XMIT_DROP; + struct flowi4 fl4 = { + .flowi4_oif = dev->iflink, + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(dev_net(dev), &fl4, NULL); + if (IS_ERR(rt)) + goto err; + + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto err; + } + skb_dst_drop(skb); + skb_dst_set(skb, &rt->dst); + err = ip_local_out(skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} + +static int ipvlan_process_v6_outbound(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net_device *dev = skb->dev; + struct dst_entry *dst; + int err, ret = NET_XMIT_DROP; + struct flowi6 fl6 = { + .flowi6_iif = skb->dev->ifindex, + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_output(dev_net(dev), NULL, &fl6); + if (IS_ERR(dst)) + goto err; + + skb_dst_drop(skb); + skb_dst_set(skb, dst); + err = ip6_local_out(skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} + +static int ipvlan_process_outbound(struct sk_buff *skb, + const struct ipvl_dev *ipvlan) +{ + struct ethhdr *ethh = eth_hdr(skb); + int ret = NET_XMIT_DROP; + + /* In this mode we dont care about multicast and broadcast traffic */ + if (is_multicast_ether_addr(ethh->h_dest)) { + pr_warn_ratelimited("Dropped {multi|broad}cast of type= [%x]\n", + ntohs(skb->protocol)); + kfree_skb(skb); + goto out; + } + + /* The ipvlan is a pseudo-L2 device, so the packets that we receive + * will have L2; which need to discarded and processed further + * in the net-ns of the main-device. + */ + if (skb_mac_header_was_set(skb)) { + skb_pull(skb, sizeof(*ethh)); + skb->mac_header = (typeof(skb->mac_header))~0U; + skb_reset_network_header(skb); + } + + if (skb->protocol == htons(ETH_P_IPV6)) + ret = ipvlan_process_v6_outbound(skb); + else if (skb->protocol == htons(ETH_P_IP)) + ret = ipvlan_process_v4_outbound(skb); + else { + pr_warn_ratelimited("Dropped outbound packet type=%x\n", + ntohs(skb->protocol)); + kfree_skb(skb); + } +out: + return ret; +} + +static int ipvlan_xmit_mode_l3(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + void *lyr3h; + struct ipvl_addr *addr; + int addr_type; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) + return ipvlan_rcv_frame(addr, skb, true); + +out: + skb->dev = ipvlan->phy_dev; + return ipvlan_process_outbound(skb, ipvlan); +} + +static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; + + if (ether_addr_equal(eth->h_dest, eth->h_source)) { + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (lyr3h) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, addr_type, true); + if (addr) + return ipvlan_rcv_frame(addr, skb, true); + } + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + /* Packet definitely does not belong to any of the + * virtual devices, but the dest is local. So forward + * the skb for the main-dev. At the RX side we just return + * RX_PASS for it to be processed further on the stack. + */ + return dev_forward_skb(ipvlan->phy_dev, skb); + + } else if (is_multicast_ether_addr(eth->h_dest)) { + u8 ip_summed = skb->ip_summed; + + skb->ip_summed = CHECKSUM_UNNECESSARY; + ipvlan_multicast_frame(ipvlan->port, skb, ipvlan, true); + skb->ip_summed = ip_summed; + } + + skb->dev = ipvlan->phy_dev; + return dev_queue_xmit(skb); +} + +int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan_port_get_rcu(ipvlan->phy_dev); + + if (!port) + goto out; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct ethhdr)))) + goto out; + + switch(port->mode) { + case IPVLAN_MODE_L2: + return ipvlan_xmit_mode_l2(skb, dev); + case IPVLAN_MODE_L3: + return ipvlan_xmit_mode_l3(skb, dev); + } + + /* Should not reach here */ + WARN_ONCE(true, "ipvlan_queue_xmit() called for mode = [%hx]\n", + port->mode); +out: + kfree_skb(skb); + return NET_XMIT_DROP; +} + +static bool ipvlan_external_frame(struct sk_buff *skb, struct ipvl_port *port) +{ + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; + + if (ether_addr_equal(eth->h_source, skb->dev->dev_addr)) { + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + return true; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, false); + if (addr) + return false; + } + + return true; +} + +static rx_handler_result_t ipvlan_handle_mode_l3(struct sk_buff **pskb, + struct ipvl_port *port) +{ + void *lyr3h; + int addr_type; + struct ipvl_addr *addr; + struct sk_buff *skb = *pskb; + rx_handler_result_t ret = RX_HANDLER_PASS; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + goto out; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); + if (addr) + ret = ipvlan_rcv_frame(addr, skb, false); + +out: + return ret; +} + +static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, + struct ipvl_port *port) +{ + struct sk_buff *skb = *pskb; + struct ethhdr *eth = eth_hdr(skb); + rx_handler_result_t ret = RX_HANDLER_PASS; + void *lyr3h; + int addr_type; + + if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_external_frame(skb, port)) + ipvlan_multicast_frame(port, skb, NULL, false); + } else { + struct ipvl_addr *addr; + + lyr3h = ipvlan_get_L3_hdr(skb, &addr_type); + if (!lyr3h) + return ret; + + addr = ipvlan_addr_lookup(port, lyr3h, addr_type, true); + if (addr) + ret = ipvlan_rcv_frame(addr, skb, false); + } + + return ret; +} + +rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + struct ipvl_port *port = ipvlan_port_get_rcu(skb->dev); + + if (!port) + return RX_HANDLER_PASS; + + switch (port->mode) { + case IPVLAN_MODE_L2: + return ipvlan_handle_mode_l2(pskb, port); + case IPVLAN_MODE_L3: + return ipvlan_handle_mode_l3(pskb, port); + } + + /* Should not reach here */ + WARN_ONCE(true, "ipvlan_handle_frame() called for mode = [%hx]\n", + port->mode); + kfree_skb(skb); + return NET_RX_DROP; +} diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c new file mode 100644 index 0000000..c3df84b --- /dev/null +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -0,0 +1,789 @@ +/* Copyright (c) 2014 Mahesh Bandewar <maheshb@google.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of + * the License, or (at your option) any later version. + * + */ + +#include "ipvlan.h" + +void ipvlan_adjust_mtu(struct ipvl_dev *ipvlan, struct net_device *dev) +{ + ipvlan->dev->mtu = dev->mtu - ipvlan->mtu_adj; +} + +void ipvlan_set_port_mode(struct ipvl_port *port, u32 nval) +{ + struct ipvl_dev *ipvlan; + + if (port->mode != nval) { + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + if (nval == IPVLAN_MODE_L3) + ipvlan->dev->flags |= IFF_NOARP; + else + ipvlan->dev->flags &= ~IFF_NOARP; + } + port->mode = nval; + } +} + +static int ipvlan_port_create(struct net_device *dev) +{ + struct ipvl_port *port; + int err, idx; + + if (dev->type != ARPHRD_ETHER || dev->flags & IFF_LOOPBACK) { + netdev_err(dev, "Master is either lo or non-ether device\n"); + return -EINVAL; + } + port = kzalloc(sizeof(struct ipvl_port), GFP_KERNEL); + if (!port) + return -ENOMEM; + + port->dev = dev; + port->mode = IPVLAN_MODE_L3; + INIT_LIST_HEAD(&port->ipvlans); + for (idx = 0; idx < IPVLAN_HASH_SIZE; idx++) + INIT_HLIST_HEAD(&port->hlhead[idx]); + + err = netdev_rx_handler_register(dev, ipvlan_handle_frame, port); + if (err) + goto err; + + dev->priv_flags |= IFF_IPVLAN_MASTER; + return 0; + +err: + kfree_rcu(port, rcu); + return err; +} + +static void ipvlan_port_destroy(struct net_device *dev) +{ + struct ipvl_port *port = ipvlan_port_get_rtnl(dev); + + dev->priv_flags &= ~IFF_IPVLAN_MASTER; + netdev_rx_handler_unregister(dev); + kfree_rcu(port, rcu); +} + +/* ipvlan network devices have devices nesting below it and are a special + * "super class" of normal network devices; split their locks off into a + * separate class since they always nest. + */ +static struct lock_class_key ipvlan_netdev_xmit_lock_key; +static struct lock_class_key ipvlan_netdev_addr_lock_key; + +#define IPVLAN_FEATURES \ + (NETIF_F_SG | NETIF_F_ALL_CSUM | NETIF_F_HIGHDMA | NETIF_F_FRAGLIST | \ + NETIF_F_GSO | NETIF_F_TSO | NETIF_F_UFO | NETIF_F_GSO_ROBUST | \ + NETIF_F_TSO_ECN | NETIF_F_TSO6 | NETIF_F_GRO | NETIF_F_RXCSUM | \ + NETIF_F_HW_VLAN_CTAG_FILTER | NETIF_F_HW_VLAN_STAG_FILTER) + +#define IPVLAN_STATE_MASK \ + ((1<<__LINK_STATE_NOCARRIER) | (1<<__LINK_STATE_DORMANT)) + +static void ipvlan_set_lockdep_class_one(struct net_device *dev, + struct netdev_queue *txq, + void *_unused) +{ + lockdep_set_class(&txq->_xmit_lock, &ipvlan_netdev_xmit_lock_key); +} + +static void ipvlan_set_lockdep_class(struct net_device *dev) +{ + lockdep_set_class(&dev->addr_list_lock, &ipvlan_netdev_addr_lock_key); + netdev_for_each_tx_queue(dev, ipvlan_set_lockdep_class_one, NULL); +} + +static int ipvlan_init(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + const struct net_device *phy_dev = ipvlan->phy_dev; + + dev->state = (dev->state & ~IPVLAN_STATE_MASK) | + (phy_dev->state & IPVLAN_STATE_MASK); + dev->features = phy_dev->features & IPVLAN_FEATURES; + dev->features |= NETIF_F_LLTX; + dev->gso_max_size = phy_dev->gso_max_size; + dev->iflink = phy_dev->ifindex; + dev->hard_header_len = phy_dev->hard_header_len; + + ipvlan_set_lockdep_class(dev); + + ipvlan->pcpu_stats = alloc_percpu(struct ipvl_pcpu_stats); + if (!ipvlan->pcpu_stats) + return -ENOMEM; + + return 0; +} + +static void ipvlan_uninit(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan->port; + + if (ipvlan->pcpu_stats) + free_percpu(ipvlan->pcpu_stats); + + port->count -= 1; + if (!port->count) + ipvlan_port_destroy(port->dev); +} + +static int ipvlan_open(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + struct ipvl_addr *addr; + + if (ipvlan->port->mode == IPVLAN_MODE_L3) + dev->flags |= IFF_NOARP; + else + dev->flags &= ~IFF_NOARP; + + if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) { + list_for_each_entry(addr, &ipvlan->addrs, anode) + ipvlan_ht_addr_add(ipvlan, addr); + } + return dev_uc_add(phy_dev, phy_dev->dev_addr); +} + +static int ipvlan_stop(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + struct ipvl_addr *addr; + + dev_uc_unsync(phy_dev, dev); + dev_mc_unsync(phy_dev, dev); + + dev_uc_del(phy_dev, phy_dev->dev_addr); + + if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) { + list_for_each_entry(addr, &ipvlan->addrs, anode) + ipvlan_ht_addr_del(addr, !dev->dismantle); + } + return 0; +} + +netdev_tx_t ipvlan_start_xmit(struct sk_buff *skb, struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + int skblen = skb->len; + int ret; + + ret = ipvlan_queue_xmit(skb, dev); + if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) { + struct ipvl_pcpu_stats *pcptr; + + pcptr = this_cpu_ptr(ipvlan->pcpu_stats); + + u64_stats_update_begin(&pcptr->syncp); + pcptr->tx_pkts++; + pcptr->tx_bytes += skblen; + u64_stats_update_end(&pcptr->syncp); + } else { + this_cpu_inc(ipvlan->pcpu_stats->tx_drps); + } + return ret; +} + +static netdev_features_t ipvlan_fix_features(struct net_device *dev, + netdev_features_t features) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + return features & (ipvlan->sfeatures | ~IPVLAN_FEATURES); +} + +static void ipvlan_change_rx_flags(struct net_device *dev, int change) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + if (change & IFF_ALLMULTI) + dev_set_allmulti(phy_dev, dev->flags & IFF_ALLMULTI? 1 : -1); +} + +static void ipvlan_set_broadcast_mac_filter(struct ipvl_dev *ipvlan, bool set) +{ + struct net_device *dev = ipvlan->dev; + unsigned int hashbit = ipvlan_mac_hash(dev->broadcast); + + if (set && !test_bit(hashbit, ipvlan->mac_filters)) + __set_bit(hashbit, ipvlan->mac_filters); + else if (!set && test_bit(hashbit, ipvlan->mac_filters)) + __clear_bit(hashbit, ipvlan->mac_filters); +} + +static void ipvlan_set_multicast_mac_filter(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (dev->flags & (IFF_PROMISC | IFF_ALLMULTI)) { + bitmap_fill(ipvlan->mac_filters, IPVLAN_MAC_FILTER_SIZE); + } else { + struct netdev_hw_addr *ha; + DECLARE_BITMAP(mc_filters, IPVLAN_MAC_FILTER_SIZE); + + bitmap_zero(mc_filters, IPVLAN_MAC_FILTER_SIZE); + netdev_for_each_mc_addr(ha, dev) + __set_bit(ipvlan_mac_hash(ha->addr), mc_filters); + + bitmap_copy(ipvlan->mac_filters, mc_filters, + IPVLAN_MAC_FILTER_SIZE); + } + dev_uc_sync(ipvlan->phy_dev, dev); + dev_mc_sync(ipvlan->phy_dev, dev); +} + +static struct rtnl_link_stats64 *ipvlan_get_stats64(struct net_device *dev, + struct rtnl_link_stats64 *s) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (ipvlan->pcpu_stats) { + struct ipvl_pcpu_stats *pcptr; + u64 rx_pkts, rx_bytes, rx_mcast, tx_pkts, tx_bytes; + u32 rx_errs = 0, tx_drps = 0; + u32 strt; + int idx; + + for_each_possible_cpu(idx) { + pcptr = per_cpu_ptr(ipvlan->pcpu_stats, idx); + do { + strt= u64_stats_fetch_begin_irq(&pcptr->syncp); + rx_pkts = pcptr->rx_pkts; + rx_bytes = pcptr->rx_bytes; + rx_mcast = pcptr->rx_mcast; + tx_pkts = pcptr->tx_pkts; + tx_bytes = pcptr->tx_bytes; + } while (u64_stats_fetch_retry_irq(&pcptr->syncp, + strt)); + + s->rx_packets += rx_pkts; + s->rx_bytes += rx_bytes; + s->multicast += rx_mcast; + s->tx_packets += tx_pkts; + s->tx_bytes += tx_bytes; + + /* u32 values are updated without syncp protection. */ + rx_errs += pcptr->rx_errs; + tx_drps += pcptr->tx_drps; + } + s->rx_errors = rx_errs; + s->rx_dropped = rx_errs; + s->tx_dropped = tx_drps; + } + return s; +} + +static int ipvlan_vlan_rx_add_vid(struct net_device *dev, __be16 proto, u16 vid) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + return vlan_vid_add(phy_dev, proto, vid); +} + +static int ipvlan_vlan_rx_kill_vid(struct net_device *dev, __be16 proto, + u16 vid) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + vlan_vid_del(phy_dev, proto, vid); + return 0; +} + +static const struct net_device_ops ipvlan_netdev_ops = { + .ndo_init = ipvlan_init, + .ndo_uninit = ipvlan_uninit, + .ndo_open = ipvlan_open, + .ndo_stop = ipvlan_stop, + .ndo_start_xmit = ipvlan_start_xmit, + .ndo_fix_features = ipvlan_fix_features, + .ndo_change_rx_flags = ipvlan_change_rx_flags, + .ndo_set_rx_mode = ipvlan_set_multicast_mac_filter, + .ndo_get_stats64 = ipvlan_get_stats64, + .ndo_vlan_rx_add_vid = ipvlan_vlan_rx_add_vid, + .ndo_vlan_rx_kill_vid = ipvlan_vlan_rx_kill_vid, +}; + +static int ipvlan_hard_header(struct sk_buff *skb, struct net_device *dev, + unsigned short type, const void *daddr, + const void *saddr, unsigned len) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + struct net_device *phy_dev = ipvlan->phy_dev; + + /* TODO Probably use a different field than dev_addr so that the + * mac-address on the virtual device is portable and can be carried + * while the packets use the mac-addr on the physical device. + */ + return dev_hard_header(skb, phy_dev, type, daddr, + saddr ? : dev->dev_addr, len); +} + +static const struct header_ops ipvlan_header_ops = { + .create = ipvlan_hard_header, + .rebuild = eth_rebuild_header, + .parse = eth_header_parse, + .cache = eth_header_cache, + .cache_update = eth_header_cache_update, +}; + +static int ipvlan_ethtool_get_settings(struct net_device *dev, + struct ethtool_cmd *cmd) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + + return __ethtool_get_settings(ipvlan->phy_dev, cmd); +} + +static void ipvlan_ethtool_get_drvinfo(struct net_device *dev, + struct ethtool_drvinfo *drvinfo) +{ + strlcpy(drvinfo->driver, IPVLAN_DRV, sizeof(drvinfo->driver)); + strlcpy(drvinfo->version, IPV_DRV_VER, sizeof(drvinfo->version)); +} + +static u32 ipvlan_ethtool_get_msglevel(struct net_device *dev) +{ + const struct ipvl_dev *ipvlan = netdev_priv(dev); + + return ipvlan->msg_enable; +} + +static void ipvlan_ethtool_set_msglevel(struct net_device *dev, u32 value) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + ipvlan->msg_enable = value; +} + +static const struct ethtool_ops ipvlan_ethtool_ops = { + .get_link = ethtool_op_get_link, + .get_settings = ipvlan_ethtool_get_settings, + .get_drvinfo = ipvlan_ethtool_get_drvinfo, + .get_msglevel = ipvlan_ethtool_get_msglevel, + .set_msglevel = ipvlan_ethtool_set_msglevel, +}; + +static int ipvlan_nl_changelink(struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + + if (data && data[IFLA_IPVLAN_MODE]) { + u16 nmode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + ipvlan_set_port_mode(port, nmode); + } + return 0; +} + +static size_t ipvlan_nl_getsize(const struct net_device *dev) +{ + return (0 + + nla_total_size(2) /* IFLA_IPVLAN_MODE */ + ); +} + +static int ipvlan_nl_validate(struct nlattr *tb[], struct nlattr *data[]) +{ + if (data && data[IFLA_IPVLAN_MODE]) { + u16 mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + if (mode < IPVLAN_MODE_L2 || mode >= IPVLAN_MODE_MAX) + return -EINVAL; + } + return 0; +} + +static int ipvlan_nl_fillinfo(struct sk_buff *skb, + const struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port = ipvlan_port_get_rtnl(ipvlan->phy_dev); + int ret = -EINVAL; + + if (!port) + goto err; + + ret = -EMSGSIZE; + if (nla_put_u16(skb, IFLA_IPVLAN_MODE, port->mode)) + goto err; + + return 0; + +err: + return ret; +} + +static int ipvlan_link_new(struct net *src_net, struct net_device *dev, + struct nlattr *tb[], struct nlattr *data[]) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_port *port; + struct net_device *phy_dev; + int err; + + if (!tb[IFLA_LINK]) + return -EINVAL; + + phy_dev = __dev_get_by_index(src_net, nla_get_u32(tb[IFLA_LINK])); + if (!phy_dev) + return -ENODEV; + + if (ipvlan_dev_slave(phy_dev)) { + struct ipvl_dev *tmp = netdev_priv(phy_dev); + + phy_dev = tmp->phy_dev; + } else if (!ipvlan_dev_master(phy_dev)) { + err = ipvlan_port_create(phy_dev); + if (err < 0) + return err; + } + + port = ipvlan_port_get_rtnl(phy_dev); + if (data && data[IFLA_IPVLAN_MODE]) + port->mode = nla_get_u16(data[IFLA_IPVLAN_MODE]); + + ipvlan->phy_dev = phy_dev; + ipvlan->dev = dev; + ipvlan->port = port; + ipvlan->sfeatures = IPVLAN_FEATURES; + INIT_LIST_HEAD(&ipvlan->addrs); + ipvlan->ipv4cnt = 0; + ipvlan->ipv6cnt = 0; + + /* TODO Probably put random address here to be presented to the + * world but keep using the physical-dev address for the outgoing + * packets. + */ + memcpy(dev->dev_addr, phy_dev->dev_addr, ETH_ALEN); + + dev->priv_flags |= IFF_IPVLAN_SLAVE; + + port->count += 1; + err = register_netdevice(dev); + if (err < 0) + goto ipvlan_destroy_port; + + err = netdev_upper_dev_link(phy_dev, dev); + if (err) + goto ipvlan_destroy_port; + + list_add_tail_rcu(&ipvlan->pnode, &port->ipvlans); + netif_stacked_transfer_operstate(phy_dev, dev); + return 0; + +ipvlan_destroy_port: + port->count -= 1; + if (!port->count) + ipvlan_port_destroy(phy_dev); + + return err; +} + +static void ipvlan_link_delete(struct net_device *dev, struct list_head *head) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ipvl_addr *addr, *next; + + if (ipvlan->ipv6cnt > 0 || ipvlan->ipv4cnt > 0) { + list_for_each_entry_safe(addr, next, &ipvlan->addrs, anode) { + ipvlan_ht_addr_del(addr, !dev->dismantle); + list_del_rcu(&addr->anode); + } + } + list_del_rcu(&ipvlan->pnode); + unregister_netdevice_queue(dev, head); + netdev_upper_dev_unlink(ipvlan->phy_dev, dev); +} + +static void ipvlan_link_setup(struct net_device *dev) +{ + ether_setup(dev); + + dev->priv_flags &= ~(IFF_XMIT_DST_RELEASE | IFF_TX_SKB_SHARING); + dev->priv_flags |= IFF_UNICAST_FLT; + dev->netdev_ops = &ipvlan_netdev_ops; + dev->destructor = free_netdev; + dev->header_ops = &ipvlan_header_ops; + dev->ethtool_ops = &ipvlan_ethtool_ops; + dev->tx_queue_len = 0; +} + +static const struct nla_policy ipvlan_nl_policy[IFLA_IPVLAN_MAX + 1] = +{ + [IFLA_IPVLAN_MODE] = { .type = NLA_U16 }, +}; + +static struct rtnl_link_ops ipvlan_link_ops = { + .kind = "ipvlan", + .priv_size = sizeof(struct ipvl_dev), + + .get_size = ipvlan_nl_getsize, + .policy = ipvlan_nl_policy, + .validate = ipvlan_nl_validate, + .fill_info = ipvlan_nl_fillinfo, + .changelink = ipvlan_nl_changelink, + .maxtype = IFLA_IPVLAN_MAX, + + .setup = ipvlan_link_setup, + .newlink = ipvlan_link_new, + .dellink = ipvlan_link_delete, +}; + +int ipvlan_link_register(struct rtnl_link_ops *ops) +{ + return rtnl_link_register(ops); +} + +static int ipvlan_device_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct net_device *dev = netdev_notifier_info_to_dev(ptr); + struct ipvl_dev *ipvlan, *next; + struct ipvl_port *port; + LIST_HEAD(lst_kill); + + if (!ipvlan_dev_master(dev)) + return NOTIFY_DONE; + + port = ipvlan_port_get_rtnl(dev); + + switch (event) { + case NETDEV_CHANGE: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + netif_stacked_transfer_operstate(ipvlan->phy_dev, + ipvlan->dev); + break; + + case NETDEV_UNREGISTER: + if (dev->reg_state != NETREG_UNREGISTERING) + break; + + list_for_each_entry_safe(ipvlan, next, &port->ipvlans, + pnode) + ipvlan->dev->rtnl_link_ops->dellink(ipvlan->dev, + &lst_kill); + unregister_netdevice_many(&lst_kill); + break; + + case NETDEV_FEAT_CHANGE: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) { + ipvlan->dev->features = dev->features & IPVLAN_FEATURES; + ipvlan->dev->gso_max_size = dev->gso_max_size; + netdev_features_change(ipvlan->dev); + } + break; + + case NETDEV_CHANGEMTU: + list_for_each_entry(ipvlan, &port->ipvlans, pnode) + ipvlan_adjust_mtu(ipvlan, dev); + break; + + case NETDEV_PRE_TYPE_CHANGE: + /* Forbid underlying device to change its type. */ + return NOTIFY_BAD; + } + return NOTIFY_DONE; +} + +static int ipvlan_add_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) +{ + struct ipvl_addr *addr; + + if (ipvlan_addr_busy(ipvlan, ip6_addr, true)) { + netif_err(ipvlan, ifup, ipvlan->dev, + "Failed to add IPv6=%pI6c addr for %s intf\n", + ip6_addr, ipvlan->dev->name); + return -EINVAL; + } + addr = kzalloc(sizeof(struct ipvl_addr), GFP_ATOMIC); + if (!addr) + return -ENOMEM; + + addr->master = ipvlan; + memcpy(&addr->ip6addr, ip6_addr, sizeof(struct in6_addr)); + addr->atype = IPVL_IPV6; + list_add_tail_rcu(&addr->anode, &ipvlan->addrs); + ipvlan->ipv6cnt++; + ipvlan_ht_addr_add(ipvlan, addr); + + return 0; +} + +static void ipvlan_del_addr6(struct ipvl_dev *ipvlan, struct in6_addr *ip6_addr) +{ + struct ipvl_addr *addr; + + addr = ipvlan_ht_addr_lookup(ipvlan->port, ip6_addr, true); + if (!addr) + return; + + ipvlan_ht_addr_del(addr, true); + list_del_rcu(&addr->anode); + ipvlan->ipv6cnt--; + WARN_ON(ipvlan->ipv6cnt < 0); + kfree_rcu(addr, rcu); + + return; +} + +static int ipvlan_addr6_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct inet6_ifaddr *if6 = (struct inet6_ifaddr *)ptr; + struct net_device *dev = (struct net_device *)if6->idev->dev; + struct ipvl_dev *ipvlan = netdev_priv(dev); + + if (!ipvlan_dev_slave(dev)) + return NOTIFY_DONE; + + if (!ipvlan || !ipvlan->port) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + if (ipvlan_add_addr6(ipvlan, &if6->addr)) + return NOTIFY_BAD; + break; + + case NETDEV_DOWN: + ipvlan_del_addr6(ipvlan, &if6->addr); + break; + } + + return NOTIFY_OK; +} + +static int ipvlan_add_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) +{ + struct ipvl_addr *addr; + + if (ipvlan_addr_busy(ipvlan, ip4_addr, false)) { + netif_err(ipvlan, ifup, ipvlan->dev, + "Failed to add IPv4=%pI4 on %s intf.\n", + ip4_addr, ipvlan->dev->name); + return -EINVAL; + } + addr = kzalloc(sizeof(struct ipvl_addr), GFP_KERNEL); + if (!addr) + return -ENOMEM; + + addr->master = ipvlan; + memcpy(&addr->ip4addr, ip4_addr, sizeof(struct in_addr)); + addr->atype = IPVL_IPV4; + list_add_tail_rcu(&addr->anode, &ipvlan->addrs); + ipvlan->ipv4cnt++; + ipvlan_ht_addr_add(ipvlan, addr); + ipvlan_set_broadcast_mac_filter(ipvlan, true); + + return 0; +} + +static void ipvlan_del_addr4(struct ipvl_dev *ipvlan, struct in_addr *ip4_addr) +{ + struct ipvl_addr *addr; + + addr = ipvlan_ht_addr_lookup(ipvlan->port, ip4_addr, false); + if (!addr) + return; + + ipvlan_ht_addr_del(addr, true); + list_del_rcu(&addr->anode); + ipvlan->ipv4cnt--; + WARN_ON(ipvlan->ipv4cnt < 0); + if (!ipvlan->ipv4cnt) + ipvlan_set_broadcast_mac_filter(ipvlan, false); + kfree_rcu(addr, rcu); + + return; +} + +static int ipvlan_addr4_event(struct notifier_block *unused, + unsigned long event, void *ptr) +{ + struct in_ifaddr *if4 = (struct in_ifaddr *)ptr; + struct net_device *dev = (struct net_device *)if4->ifa_dev->dev; + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct in_addr ip4_addr; + + if (!ipvlan_dev_slave(dev)) + return NOTIFY_DONE; + + if (!ipvlan || !ipvlan->port) + return NOTIFY_DONE; + + switch (event) { + case NETDEV_UP: + ip4_addr.s_addr = if4->ifa_address; + if (ipvlan_add_addr4(ipvlan, &ip4_addr)) + return NOTIFY_BAD; + break; + + case NETDEV_DOWN: + ip4_addr.s_addr = if4->ifa_address; + ipvlan_del_addr4(ipvlan, &ip4_addr); + break; + } + + return NOTIFY_OK; +} + +static struct notifier_block ipvlan_addr4_notifier_block __read_mostly = { + .notifier_call = ipvlan_addr4_event, +}; + +static struct notifier_block ipvlan_notifier_block __read_mostly = { + .notifier_call = ipvlan_device_event, +}; + +static struct notifier_block ipvlan_addr6_notifier_block __read_mostly = { + .notifier_call = ipvlan_addr6_event, +}; + +static int __init ipvlan_init_module(void) +{ + int err; + + ipvlan_init_secret(); + register_netdevice_notifier(&ipvlan_notifier_block); + register_inet6addr_notifier(&ipvlan_addr6_notifier_block); + register_inetaddr_notifier(&ipvlan_addr4_notifier_block); + + err = ipvlan_link_register(&ipvlan_link_ops); + if (err < 0) + goto error; + + return 0; +error: + unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); + unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); + unregister_netdevice_notifier(&ipvlan_notifier_block); + return err; +} + +static void __exit ipvlan_cleanup_module(void) +{ + rtnl_link_unregister(&ipvlan_link_ops); + unregister_netdevice_notifier(&ipvlan_notifier_block); + unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); + unregister_inet6addr_notifier(&ipvlan_addr6_notifier_block); +} + +module_init(ipvlan_init_module); +module_exit(ipvlan_cleanup_module); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Mahesh Bandewar <maheshb@google.com>"); +MODULE_DESCRIPTION("Driver for L3 (IPv6/IPv4) based VLANs"); +MODULE_ALIAS_RTNL_LINK("ipvlan"); diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 5cd5087..2cb7724 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1230,6 +1230,8 @@ enum netdev_priv_flags { IFF_LIVE_ADDR_CHANGE = 1<<20, IFF_MACVLAN = 1<<21, IFF_XMIT_DST_RELEASE_PERM = 1<<22, + IFF_IPVLAN_MASTER = 1<<23, + IFF_IPVLAN_SLAVE = 1<<24, }; #define IFF_802_1Q_VLAN IFF_802_1Q_VLAN @@ -1255,6 +1257,8 @@ enum netdev_priv_flags { #define IFF_LIVE_ADDR_CHANGE IFF_LIVE_ADDR_CHANGE #define IFF_MACVLAN IFF_MACVLAN #define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM +#define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER +#define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE /** * struct net_device - The DEVICE structure. diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index 7072d83..36bddc2 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -330,6 +330,21 @@ enum macvlan_macaddr_mode { #define MACVLAN_FLAG_NOPROMISC 1 +/* IPVLAN section */ +enum { + IFLA_IPVLAN_UNSPEC, + IFLA_IPVLAN_MODE, + __IFLA_IPVLAN_MAX +}; + +#define IFLA_IPVLAN_MAX (__IFLA_IPVLAN_MAX - 1) + +enum ipvlan_mode { + IPVLAN_MODE_L2 = 0, + IPVLAN_MODE_L3, + IPVLAN_MODE_MAX +}; + /* VXLAN section */ enum { IFLA_VXLAN_UNSPEC, |