diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2009-07-30 16:45:03 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2009-07-30 16:45:03 -0700 |
commit | 6ae7d6f0195a0ec7e5d07821e62c79898cd33fdc (patch) | |
tree | ed7975b5ae042e16500c1f5cb8b5756a6bf8d643 | |
parent | ec30c5f3a18722f8fcf8c83146a10b03ac4d9ff1 (diff) | |
parent | 1842f23c05b6a866be831aa60bc8a8731c58ddd0 (diff) | |
download | op-kernel-dev-6ae7d6f0195a0ec7e5d07821e62c79898cd33fdc.zip op-kernel-dev-6ae7d6f0195a0ec7e5d07821e62c79898cd33fdc.tar.gz |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus
* git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux-2.6-for-linus:
lguest and virtio: cleanup struct definitions to Linux style.
lguest: update commentry
lguest: fix comment style
virtio: refactor find_vqs
virtio: delete vq from list
virtio: fix memory leak on device removal
lguest: fix descriptor corruption in example launcher
lguest: dereferencing freed mem in add_eventfd()
-rw-r--r-- | Documentation/lguest/lguest.c | 721 | ||||
-rw-r--r-- | arch/x86/include/asm/lguest.h | 3 | ||||
-rw-r--r-- | arch/x86/include/asm/lguest_hcall.h | 18 | ||||
-rw-r--r-- | arch/x86/lguest/boot.c | 509 | ||||
-rw-r--r-- | arch/x86/lguest/i386_head.S | 112 | ||||
-rw-r--r-- | drivers/lguest/core.c | 119 | ||||
-rw-r--r-- | drivers/lguest/hypercalls.c | 145 | ||||
-rw-r--r-- | drivers/lguest/interrupts_and_traps.c | 288 | ||||
-rw-r--r-- | drivers/lguest/lg.h | 32 | ||||
-rw-r--r-- | drivers/lguest/lguest_device.c | 160 | ||||
-rw-r--r-- | drivers/lguest/lguest_user.c | 232 | ||||
-rw-r--r-- | drivers/lguest/page_tables.c | 489 | ||||
-rw-r--r-- | drivers/lguest/segments.c | 106 | ||||
-rw-r--r-- | drivers/lguest/x86/core.c | 374 | ||||
-rw-r--r-- | drivers/lguest/x86/switcher_32.S | 22 | ||||
-rw-r--r-- | drivers/virtio/virtio_pci.c | 240 | ||||
-rw-r--r-- | include/linux/lguest.h | 39 | ||||
-rw-r--r-- | include/linux/lguest_launcher.h | 18 | ||||
-rw-r--r-- | include/linux/virtio_blk.h | 6 | ||||
-rw-r--r-- | include/linux/virtio_config.h | 3 | ||||
-rw-r--r-- | include/linux/virtio_net.h | 6 | ||||
-rw-r--r-- | include/linux/virtio_ring.h | 12 |
22 files changed, 2422 insertions, 1232 deletions
diff --git a/Documentation/lguest/lguest.c b/Documentation/lguest/lguest.c index 9ebcd6e..950cde6 100644 --- a/Documentation/lguest/lguest.c +++ b/Documentation/lguest/lguest.c @@ -1,7 +1,9 @@ -/*P:100 This is the Launcher code, a simple program which lays out the - * "physical" memory for the new Guest by mapping the kernel image and - * the virtual devices, then opens /dev/lguest to tell the kernel - * about the Guest and control it. :*/ +/*P:100 + * This is the Launcher code, a simple program which lays out the "physical" + * memory for the new Guest by mapping the kernel image and the virtual + * devices, then opens /dev/lguest to tell the kernel about the Guest and + * control it. +:*/ #define _LARGEFILE64_SOURCE #define _GNU_SOURCE #include <stdio.h> @@ -46,13 +48,15 @@ #include "linux/virtio_rng.h" #include "linux/virtio_ring.h" #include "asm/bootparam.h" -/*L:110 We can ignore the 39 include files we need for this program, but I do - * want to draw attention to the use of kernel-style types. +/*L:110 + * We can ignore the 42 include files we need for this program, but I do want + * to draw attention to the use of kernel-style types. * * As Linus said, "C is a Spartan language, and so should your naming be." I * like these abbreviations, so we define them here. Note that u64 is always * unsigned long long, which works on all Linux systems: this means that we can - * use %llu in printf for any u64. */ + * use %llu in printf for any u64. + */ typedef unsigned long long u64; typedef uint32_t u32; typedef uint16_t u16; @@ -69,8 +73,10 @@ typedef uint8_t u8; /* This will occupy 3 pages: it must be a power of 2. */ #define VIRTQUEUE_NUM 256 -/*L:120 verbose is both a global flag and a macro. The C preprocessor allows - * this, and although I wouldn't recommend it, it works quite nicely here. */ +/*L:120 + * verbose is both a global flag and a macro. The C preprocessor allows + * this, and although I wouldn't recommend it, it works quite nicely here. + */ static bool verbose; #define verbose(args...) \ do { if (verbose) printf(args); } while(0) @@ -87,8 +93,7 @@ static int lguest_fd; static unsigned int __thread cpu_id; /* This is our list of devices. */ -struct device_list -{ +struct device_list { /* Counter to assign interrupt numbers. */ unsigned int next_irq; @@ -100,8 +105,7 @@ struct device_list /* A single linked list of devices. */ struct device *dev; - /* And a pointer to the last device for easy append and also for - * configuration appending. */ + /* And a pointer to the last device for easy append. */ struct device *lastdev; }; @@ -109,8 +113,7 @@ struct device_list static struct device_list devices; /* The device structure describes a single device. */ -struct device -{ +struct device { /* The linked-list pointer. */ struct device *next; @@ -135,8 +138,7 @@ struct device }; /* The virtqueue structure describes a queue attached to a device. */ -struct virtqueue -{ +struct virtqueue { struct virtqueue *next; /* Which device owns me. */ @@ -168,20 +170,24 @@ static char **main_args; /* The original tty settings to restore on exit. */ static struct termios orig_term; -/* We have to be careful with barriers: our devices are all run in separate +/* + * We have to be careful with barriers: our devices are all run in separate * threads and so we need to make sure that changes visible to the Guest happen - * in precise order. */ + * in precise order. + */ #define wmb() __asm__ __volatile__("" : : : "memory") #define mb() __asm__ __volatile__("" : : : "memory") -/* Convert an iovec element to the given type. +/* + * Convert an iovec element to the given type. * * This is a fairly ugly trick: we need to know the size of the type and * alignment requirement to check the pointer is kosher. It's also nice to * have the name of the type in case we report failure. * * Typing those three things all the time is cumbersome and error prone, so we - * have a macro which sets them all up and passes to the real function. */ + * have a macro which sets them all up and passes to the real function. + */ #define convert(iov, type) \ ((type *)_convert((iov), sizeof(type), __alignof__(type), #type)) @@ -198,8 +204,10 @@ static void *_convert(struct iovec *iov, size_t size, size_t align, /* Wrapper for the last available index. Makes it easier to change. */ #define lg_last_avail(vq) ((vq)->last_avail_idx) -/* The virtio configuration space is defined to be little-endian. x86 is - * little-endian too, but it's nice to be explicit so we have these helpers. */ +/* + * The virtio configuration space is defined to be little-endian. x86 is + * little-endian too, but it's nice to be explicit so we have these helpers. + */ #define cpu_to_le16(v16) (v16) #define cpu_to_le32(v32) (v32) #define cpu_to_le64(v64) (v64) @@ -241,11 +249,12 @@ static u8 *get_feature_bits(struct device *dev) + dev->num_vq * sizeof(struct lguest_vqconfig); } -/*L:100 The Launcher code itself takes us out into userspace, that scary place - * where pointers run wild and free! Unfortunately, like most userspace - * programs, it's quite boring (which is why everyone likes to hack on the - * kernel!). Perhaps if you make up an Lguest Drinking Game at this point, it - * will get you through this section. Or, maybe not. +/*L:100 + * The Launcher code itself takes us out into userspace, that scary place where + * pointers run wild and free! Unfortunately, like most userspace programs, + * it's quite boring (which is why everyone likes to hack on the kernel!). + * Perhaps if you make up an Lguest Drinking Game at this point, it will get + * you through this section. Or, maybe not. * * The Launcher sets up a big chunk of memory to be the Guest's "physical" * memory and stores it in "guest_base". In other words, Guest physical == @@ -253,7 +262,8 @@ static u8 *get_feature_bits(struct device *dev) * * This can be tough to get your head around, but usually it just means that we * use these trivial conversion functions when the Guest gives us it's - * "physical" addresses: */ + * "physical" addresses: + */ static void *from_guest_phys(unsigned long addr) { return guest_base + addr; @@ -268,7 +278,8 @@ static unsigned long to_guest_phys(const void *addr) * Loading the Kernel. * * We start with couple of simple helper routines. open_or_die() avoids - * error-checking code cluttering the callers: */ + * error-checking code cluttering the callers: + */ static int open_or_die(const char *name, int flags) { int fd = open(name, flags); @@ -283,12 +294,19 @@ static void *map_zeroed_pages(unsigned int num) int fd = open_or_die("/dev/zero", O_RDONLY); void *addr; - /* We use a private mapping (ie. if we write to the page, it will be - * copied). */ + /* + * We use a private mapping (ie. if we write to the page, it will be + * copied). + */ addr = mmap(NULL, getpagesize() * num, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, fd, 0); if (addr == MAP_FAILED) err(1, "Mmaping %u pages of /dev/zero", num); + + /* + * One neat mmap feature is that you can close the fd, and it + * stays mapped. + */ close(fd); return addr; @@ -305,20 +323,24 @@ static void *get_pages(unsigned int num) return addr; } -/* This routine is used to load the kernel or initrd. It tries mmap, but if +/* + * This routine is used to load the kernel or initrd. It tries mmap, but if * that fails (Plan 9's kernel file isn't nicely aligned on page boundaries), - * it falls back to reading the memory in. */ + * it falls back to reading the memory in. + */ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) { ssize_t r; - /* We map writable even though for some segments are marked read-only. + /* + * We map writable even though for some segments are marked read-only. * The kernel really wants to be writable: it patches its own * instructions. * * MAP_PRIVATE means that the page won't be copied until a write is * done to it. This allows us to share untouched memory between - * Guests. */ + * Guests. + */ if (mmap(addr, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_FIXED|MAP_PRIVATE, fd, offset) != MAP_FAILED) return; @@ -329,7 +351,8 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) err(1, "Reading offset %lu len %lu gave %zi", offset, len, r); } -/* This routine takes an open vmlinux image, which is in ELF, and maps it into +/* + * This routine takes an open vmlinux image, which is in ELF, and maps it into * the Guest memory. ELF = Embedded Linking Format, which is the format used * by all modern binaries on Linux including the kernel. * @@ -337,23 +360,28 @@ static void map_at(int fd, void *addr, unsigned long offset, unsigned long len) * address. We use the physical address; the Guest will map itself to the * virtual address. * - * We return the starting address. */ + * We return the starting address. + */ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) { Elf32_Phdr phdr[ehdr->e_phnum]; unsigned int i; - /* Sanity checks on the main ELF header: an x86 executable with a - * reasonable number of correctly-sized program headers. */ + /* + * Sanity checks on the main ELF header: an x86 executable with a + * reasonable number of correctly-sized program headers. + */ if (ehdr->e_type != ET_EXEC || ehdr->e_machine != EM_386 || ehdr->e_phentsize != sizeof(Elf32_Phdr) || ehdr->e_phnum < 1 || ehdr->e_phnum > 65536U/sizeof(Elf32_Phdr)) errx(1, "Malformed elf header"); - /* An ELF executable contains an ELF header and a number of "program" + /* + * An ELF executable contains an ELF header and a number of "program" * headers which indicate which parts ("segments") of the program to - * load where. */ + * load where. + */ /* We read in all the program headers at once: */ if (lseek(elf_fd, ehdr->e_phoff, SEEK_SET) < 0) @@ -361,8 +389,10 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) if (read(elf_fd, phdr, sizeof(phdr)) != sizeof(phdr)) err(1, "Reading program headers"); - /* Try all the headers: there are usually only three. A read-only one, - * a read-write one, and a "note" section which we don't load. */ + /* + * Try all the headers: there are usually only three. A read-only one, + * a read-write one, and a "note" section which we don't load. + */ for (i = 0; i < ehdr->e_phnum; i++) { /* If this isn't a loadable segment, we ignore it */ if (phdr[i].p_type != PT_LOAD) @@ -380,13 +410,15 @@ static unsigned long map_elf(int elf_fd, const Elf32_Ehdr *ehdr) return ehdr->e_entry; } -/*L:150 A bzImage, unlike an ELF file, is not meant to be loaded. You're - * supposed to jump into it and it will unpack itself. We used to have to - * perform some hairy magic because the unpacking code scared me. +/*L:150 + * A bzImage, unlike an ELF file, is not meant to be loaded. You're supposed + * to jump into it and it will unpack itself. We used to have to perform some + * hairy magic because the unpacking code scared me. * * Fortunately, Jeremy Fitzhardinge convinced me it wasn't that hard and wrote * a small patch to jump over the tricky bits in the Guest, so now we just read - * the funky header so we know where in the file to load, and away we go! */ + * the funky header so we know where in the file to load, and away we go! + */ static unsigned long load_bzimage(int fd) { struct boot_params boot; @@ -394,8 +426,10 @@ static unsigned long load_bzimage(int fd) /* Modern bzImages get loaded at 1M. */ void *p = from_guest_phys(0x100000); - /* Go back to the start of the file and read the header. It should be - * a Linux boot header (see Documentation/x86/i386/boot.txt) */ + /* + * Go back to the start of the file and read the header. It should be + * a Linux boot header (see Documentation/x86/i386/boot.txt) + */ lseek(fd, 0, SEEK_SET); read(fd, &boot, sizeof(boot)); @@ -414,9 +448,11 @@ static unsigned long load_bzimage(int fd) return boot.hdr.code32_start; } -/*L:140 Loading the kernel is easy when it's a "vmlinux", but most kernels +/*L:140 + * Loading the kernel is easy when it's a "vmlinux", but most kernels * come wrapped up in the self-decompressing "bzImage" format. With a little - * work, we can load those, too. */ + * work, we can load those, too. + */ static unsigned long load_kernel(int fd) { Elf32_Ehdr hdr; @@ -433,24 +469,28 @@ static unsigned long load_kernel(int fd) return load_bzimage(fd); } -/* This is a trivial little helper to align pages. Andi Kleen hated it because +/* + * This is a trivial little helper to align pages. Andi Kleen hated it because * it calls getpagesize() twice: "it's dumb code." * * Kernel guys get really het up about optimization, even when it's not - * necessary. I leave this code as a reaction against that. */ + * necessary. I leave this code as a reaction against that. + */ static inline unsigned long page_align(unsigned long addr) { /* Add upwards and truncate downwards. */ return ((addr + getpagesize()-1) & ~(getpagesize()-1)); } -/*L:180 An "initial ram disk" is a disk image loaded into memory along with - * the kernel which the kernel can use to boot from without needing any - * drivers. Most distributions now use this as standard: the initrd contains - * the code to load the appropriate driver modules for the current machine. +/*L:180 + * An "initial ram disk" is a disk image loaded into memory along with the + * kernel which the kernel can use to boot from without needing any drivers. + * Most distributions now use this as standard: the initrd contains the code to + * load the appropriate driver modules for the current machine. * * Importantly, James Morris works for RedHat, and Fedora uses initrds for its - * kernels. He sent me this (and tells me when I break it). */ + * kernels. He sent me this (and tells me when I break it). + */ static unsigned long load_initrd(const char *name, unsigned long mem) { int ifd; @@ -462,12 +502,16 @@ static unsigned long load_initrd(const char *name, unsigned long mem) if (fstat(ifd, &st) < 0) err(1, "fstat() on initrd '%s'", name); - /* We map the initrd at the top of memory, but mmap wants it to be - * page-aligned, so we round the size up for that. */ + /* + * We map the initrd at the top of memory, but mmap wants it to be + * page-aligned, so we round the size up for that. + */ len = page_align(st.st_size); map_at(ifd, from_guest_phys(mem - len), 0, st.st_size); - /* Once a file is mapped, you can close the file descriptor. It's a - * little odd, but quite useful. */ + /* + * Once a file is mapped, you can close the file descriptor. It's a + * little odd, but quite useful. + */ close(ifd); verbose("mapped initrd %s size=%lu @ %p\n", name, len, (void*)mem-len); @@ -476,8 +520,10 @@ static unsigned long load_initrd(const char *name, unsigned long mem) } /*:*/ -/* Simple routine to roll all the commandline arguments together with spaces - * between them. */ +/* + * Simple routine to roll all the commandline arguments together with spaces + * between them. + */ static void concat(char *dst, char *args[]) { unsigned int i, len = 0; @@ -494,10 +540,12 @@ static void concat(char *dst, char *args[]) dst[len] = '\0'; } -/*L:185 This is where we actually tell the kernel to initialize the Guest. We +/*L:185 + * This is where we actually tell the kernel to initialize the Guest. We * saw the arguments it expects when we looked at initialize() in lguest_user.c: * the base of Guest "physical" memory, the top physical page to allow and the - * entry point for the Guest. */ + * entry point for the Guest. + */ static void tell_kernel(unsigned long start) { unsigned long args[] = { LHREQ_INITIALIZE, @@ -511,7 +559,7 @@ static void tell_kernel(unsigned long start) } /*:*/ -/* +/*L:200 * Device Handling. * * When the Guest gives us a buffer, it sends an array of addresses and sizes. @@ -522,20 +570,26 @@ static void tell_kernel(unsigned long start) static void *_check_pointer(unsigned long addr, unsigned int size, unsigned int line) { - /* We have to separately check addr and addr+size, because size could - * be huge and addr + size might wrap around. */ + /* + * We have to separately check addr and addr+size, because size could + * be huge and addr + size might wrap around. + */ if (addr >= guest_limit || addr + size >= guest_limit) errx(1, "%s:%i: Invalid address %#lx", __FILE__, line, addr); - /* We return a pointer for the caller's convenience, now we know it's - * safe to use. */ + /* + * We return a pointer for the caller's convenience, now we know it's + * safe to use. + */ return from_guest_phys(addr); } /* A macro which transparently hands the line number to the real function. */ #define check_pointer(addr,size) _check_pointer(addr, size, __LINE__) -/* Each buffer in the virtqueues is actually a chain of descriptors. This +/* + * Each buffer in the virtqueues is actually a chain of descriptors. This * function returns the next descriptor in the chain, or vq->vring.num if we're - * at the end. */ + * at the end. + */ static unsigned next_desc(struct vring_desc *desc, unsigned int i, unsigned int max) { @@ -556,7 +610,10 @@ static unsigned next_desc(struct vring_desc *desc, return next; } -/* This actually sends the interrupt for this virtqueue */ +/* + * This actually sends the interrupt for this virtqueue, if we've used a + * buffer. + */ static void trigger_irq(struct virtqueue *vq) { unsigned long buf[] = { LHREQ_IRQ, vq->config.irq }; @@ -576,12 +633,14 @@ static void trigger_irq(struct virtqueue *vq) err(1, "Triggering irq %i", vq->config.irq); } -/* This looks in the virtqueue and for the first available buffer, and converts +/* + * This looks in the virtqueue for the first available buffer, and converts * it to an iovec for convenient access. Since descriptors consist of some * number of output then some number of input descriptors, it's actually two * iovecs, but we pack them into one and note how many of each there were. * - * This function returns the descriptor number found. */ + * This function waits if necessary, and returns the descriptor number found. + */ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct iovec iov[], unsigned int *out_num, unsigned int *in_num) @@ -590,17 +649,23 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, struct vring_desc *desc; u16 last_avail = lg_last_avail(vq); + /* There's nothing available? */ while (last_avail == vq->vring.avail->idx) { u64 event; - /* OK, tell Guest about progress up to now. */ + /* + * Since we're about to sleep, now is a good time to tell the + * Guest about what we've used up to now. + */ trigger_irq(vq); /* OK, now we need to know about added descriptors. */ vq->vring.used->flags &= ~VRING_USED_F_NO_NOTIFY; - /* They could have slipped one in as we were doing that: make - * sure it's written, then check again. */ + /* + * They could have slipped one in as we were doing that: make + * sure it's written, then check again. + */ mb(); if (last_avail != vq->vring.avail->idx) { vq->vring.used->flags |= VRING_USED_F_NO_NOTIFY; @@ -620,8 +685,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, errx(1, "Guest moved used index from %u to %u", last_avail, vq->vring.avail->idx); - /* Grab the next descriptor number they're advertising, and increment - * the index we've seen. */ + /* + * Grab the next descriptor number they're advertising, and increment + * the index we've seen. + */ head = vq->vring.avail->ring[last_avail % vq->vring.num]; lg_last_avail(vq)++; @@ -636,8 +703,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, desc = vq->vring.desc; i = head; - /* If this is an indirect entry, then this buffer contains a descriptor - * table which we handle as if it's any normal descriptor chain. */ + /* + * If this is an indirect entry, then this buffer contains a descriptor + * table which we handle as if it's any normal descriptor chain. + */ if (desc[i].flags & VRING_DESC_F_INDIRECT) { if (desc[i].len % sizeof(struct vring_desc)) errx(1, "Invalid size for indirect buffer table"); @@ -656,8 +725,10 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, if (desc[i].flags & VRING_DESC_F_WRITE) (*in_num)++; else { - /* If it's an output descriptor, they're all supposed - * to come before any input descriptors. */ + /* + * If it's an output descriptor, they're all supposed + * to come before any input descriptors. + */ if (*in_num) errx(1, "Descriptor has out after in"); (*out_num)++; @@ -671,14 +742,19 @@ static unsigned wait_for_vq_desc(struct virtqueue *vq, return head; } -/* After we've used one of their buffers, we tell them about it. We'll then - * want to send them an interrupt, using trigger_irq(). */ +/* + * After we've used one of their buffers, we tell the Guest about it. Sometime + * later we'll want to send them an interrupt using trigger_irq(); note that + * wait_for_vq_desc() does that for us if it has to wait. + */ static void add_used(struct virtqueue *vq, unsigned int head, int len) { struct vring_used_elem *used; - /* The virtqueue contains a ring of used buffers. Get a pointer to the - * next entry in that used ring. */ + /* + * The virtqueue contains a ring of used buffers. Get a pointer to the + * next entry in that used ring. + */ used = &vq->vring.used->ring[vq->vring.used->idx % vq->vring.num]; used->id = head; used->len = len; @@ -698,9 +774,9 @@ static void add_used_and_trigger(struct virtqueue *vq, unsigned head, int len) /* * The Console * - * We associate some data with the console for our exit hack. */ -struct console_abort -{ + * We associate some data with the console for our exit hack. + */ +struct console_abort { /* How many times have they hit ^C? */ int count; /* When did they start? */ @@ -715,30 +791,35 @@ static void console_input(struct virtqueue *vq) struct console_abort *abort = vq->dev->priv; struct iovec iov[vq->vring.num]; - /* Make sure there's a descriptor waiting. */ + /* Make sure there's a descriptor available. */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); if (out_num) errx(1, "Output buffers in console in queue?"); - /* Read it in. */ + /* Read into it. This is where we usually wait. */ len = readv(STDIN_FILENO, iov, in_num); if (len <= 0) { /* Ran out of input? */ warnx("Failed to get console input, ignoring console."); - /* For simplicity, dying threads kill the whole Launcher. So - * just nap here. */ + /* + * For simplicity, dying threads kill the whole Launcher. So + * just nap here. + */ for (;;) pause(); } + /* Tell the Guest we used a buffer. */ add_used_and_trigger(vq, head, len); - /* Three ^C within one second? Exit. + /* + * Three ^C within one second? Exit. * * This is such a hack, but works surprisingly well. Each ^C has to * be in a buffer by itself, so they can't be too fast. But we check * that we get three within about a second, so they can't be too - * slow. */ + * slow. + */ if (len != 1 || ((char *)iov[0].iov_base)[0] != 3) { abort->count = 0; return; @@ -763,15 +844,23 @@ static void console_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here, for the Guest to give us something. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in console output queue?"); + + /* writev can return a partial write, so we loop here. */ while (!iov_empty(iov, out)) { int len = writev(STDOUT_FILENO, iov, out); if (len <= 0) err(1, "Write to stdout gave %i", len); iov_consume(iov, out, len); } + + /* + * We're finished with that buffer: if we're going to sleep, + * wait_for_vq_desc() will prod the Guest with an interrupt. + */ add_used(vq, head, 0); } @@ -791,15 +880,30 @@ static void net_output(struct virtqueue *vq) unsigned int head, out, in; struct iovec iov[vq->vring.num]; + /* We usually wait in here for the Guest to give us a packet. */ head = wait_for_vq_desc(vq, iov, &out, &in); if (in) errx(1, "Input buffers in net output queue?"); + /* + * Send the whole thing through to /dev/net/tun. It expects the exact + * same format: what a coincidence! + */ if (writev(net_info->tunfd, iov, out) < 0) errx(1, "Write to tun failed?"); + + /* + * Done with that one; wait_for_vq_desc() will send the interrupt if + * all packets are processed. + */ add_used(vq, head, 0); } -/* Will reading from this file descriptor block? */ +/* + * Handling network input is a bit trickier, because I've tried to optimize it. + * + * First we have a helper routine which tells is if from this file descriptor + * (ie. the /dev/net/tun device) will block: + */ static bool will_block(int fd) { fd_set fdset; @@ -809,8 +913,11 @@ static bool will_block(int fd) return select(fd+1, &fdset, NULL, NULL, &zero) != 1; } -/* This is where we handle packets coming in from the tun device to our - * Guest. */ +/* + * This handles packets coming in from the tun device to our Guest. Like all + * service routines, it gets called again as soon as it returns, so you don't + * see a while(1) loop here. + */ static void net_input(struct virtqueue *vq) { int len; @@ -818,21 +925,38 @@ static void net_input(struct virtqueue *vq) struct iovec iov[vq->vring.num]; struct net_info *net_info = vq->dev->priv; + /* + * Get a descriptor to write an incoming packet into. This will also + * send an interrupt if they're out of descriptors. + */ head = wait_for_vq_desc(vq, iov, &out, &in); if (out) errx(1, "Output buffers in net input queue?"); - /* Deliver interrupt now, since we're about to sleep. */ + /* + * If it looks like we'll block reading from the tun device, send them + * an interrupt. + */ if (vq->pending_used && will_block(net_info->tunfd)) trigger_irq(vq); + /* + * Read in the packet. This is where we normally wait (when there's no + * incoming network traffic). + */ len = readv(net_info->tunfd, iov, in); if (len <= 0) err(1, "Failed to read from tun."); + + /* + * Mark that packet buffer as used, but don't interrupt here. We want + * to wait until we've done as much work as we can. + */ add_used(vq, head, len); } +/*:*/ -/* This is the helper to create threads. */ +/* This is the helper to create threads: run the service routine in a loop. */ static int do_thread(void *_vq) { struct virtqueue *vq = _vq; @@ -842,8 +966,10 @@ static int do_thread(void *_vq) return 0; } -/* When a child dies, we kill our entire process group with SIGTERM. This - * also has the side effect that the shell restores the console for us! */ +/* + * When a child dies, we kill our entire process group with SIGTERM. This + * also has the side effect that the shell restores the console for us! + */ static void kill_launcher(int signal) { kill(0, SIGTERM); @@ -878,11 +1004,15 @@ static void reset_device(struct device *dev) signal(SIGCHLD, (void *)kill_launcher); } +/*L:216 + * This actually creates the thread which services the virtqueue for a device. + */ static void create_thread(struct virtqueue *vq) { - /* Create stack for thread and run it. Since stack grows - * upwards, we point the stack pointer to the end of this - * region. */ + /* + * Create stack for thread. Since the stack grows upwards, we point + * the stack pointer to the end of this region. + */ char *stack = malloc(32768); unsigned long args[] = { LHREQ_EVENTFD, vq->config.pfn*getpagesize(), 0 }; @@ -893,17 +1023,22 @@ static void create_thread(struct virtqueue *vq) err(1, "Creating eventfd"); args[2] = vq->eventfd; - /* Attach an eventfd to this virtqueue: it will go off - * when the Guest does an LHCALL_NOTIFY for this vq. */ + /* + * Attach an eventfd to this virtqueue: it will go off when the Guest + * does an LHCALL_NOTIFY for this vq. + */ if (write(lguest_fd, &args, sizeof(args)) != 0) err(1, "Attaching eventfd"); - /* CLONE_VM: because it has to access the Guest memory, and - * SIGCHLD so we get a signal if it dies. */ + /* + * CLONE_VM: because it has to access the Guest memory, and SIGCHLD so + * we get a signal if it dies. + */ vq->thread = clone(do_thread, stack + 32768, CLONE_VM | SIGCHLD, vq); if (vq->thread == (pid_t)-1) err(1, "Creating clone"); - /* We close our local copy, now the child has it. */ + + /* We close our local copy now the child has it. */ close(vq->eventfd); } @@ -955,7 +1090,10 @@ static void update_device_status(struct device *dev) } } -/* This is the generic routine we call when the Guest uses LHCALL_NOTIFY. */ +/*L:215 + * This is the generic routine we call when the Guest uses LHCALL_NOTIFY. In + * particular, it's used to notify us of device status changes during boot. + */ static void handle_output(unsigned long addr) { struct device *i; @@ -964,25 +1102,42 @@ static void handle_output(unsigned long addr) for (i = devices.dev; i; i = i->next) { struct virtqueue *vq; - /* Notifications to device descriptors update device status. */ + /* + * Notifications to device descriptors mean they updated the + * device status. + */ if (from_guest_phys(addr) == i->desc) { update_device_status(i); return; } - /* Devices *can* be used before status is set to DRIVER_OK. */ + /* + * Devices *can* be used before status is set to DRIVER_OK. + * The original plan was that they would never do this: they + * would always finish setting up their status bits before + * actually touching the virtqueues. In practice, we allowed + * them to, and they do (eg. the disk probes for partition + * tables as part of initialization). + * + * If we see this, we start the device: once it's running, we + * expect the device to catch all the notifications. + */ for (vq = i->vq; vq; vq = vq->next) { if (addr != vq->config.pfn*getpagesize()) continue; if (i->running) errx(1, "Notification on running %s", i->name); + /* This just calls create_thread() for each virtqueue */ start_device(i); return; } } - /* Early console write is done using notify on a nul-terminated string - * in Guest memory. */ + /* + * Early console write is done using notify on a nul-terminated string + * in Guest memory. It's also great for hacking debugging messages + * into a Guest. + */ if (addr >= guest_limit) errx(1, "Bad NOTIFY %#lx", addr); @@ -998,10 +1153,12 @@ static void handle_output(unsigned long addr) * routines to allocate and manage them. */ -/* The layout of the device page is a "struct lguest_device_desc" followed by a +/* + * The layout of the device page is a "struct lguest_device_desc" followed by a * number of virtqueue descriptors, then two sets of feature bits, then an * array of configuration bytes. This routine returns the configuration - * pointer. */ + * pointer. + */ static u8 *device_config(const struct device *dev) { return (void *)(dev->desc + 1) @@ -1009,9 +1166,11 @@ static u8 *device_config(const struct device *dev) + dev->feature_len * 2; } -/* This routine allocates a new "struct lguest_device_desc" from descriptor +/* + * This routine allocates a new "struct lguest_device_desc" from descriptor * table page just above the Guest's normal memory. It returns a pointer to - * that descriptor. */ + * that descriptor. + */ static struct lguest_device_desc *new_dev_desc(u16 type) { struct lguest_device_desc d = { .type = type }; @@ -1032,8 +1191,10 @@ static struct lguest_device_desc *new_dev_desc(u16 type) return memcpy(p, &d, sizeof(d)); } -/* Each device descriptor is followed by the description of its virtqueues. We - * specify how many descriptors the virtqueue is to have. */ +/* + * Each device descriptor is followed by the description of its virtqueues. We + * specify how many descriptors the virtqueue is to have. + */ static void add_virtqueue(struct device *dev, unsigned int num_descs, void (*service)(struct virtqueue *)) { @@ -1050,6 +1211,11 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, vq->next = NULL; vq->last_avail_idx = 0; vq->dev = dev; + + /* + * This is the routine the service thread will run, and its Process ID + * once it's running. + */ vq->service = service; vq->thread = (pid_t)-1; @@ -1061,10 +1227,12 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, /* Initialize the vring. */ vring_init(&vq->vring, num_descs, p, LGUEST_VRING_ALIGN); - /* Append virtqueue to this device's descriptor. We use + /* + * Append virtqueue to this device's descriptor. We use * device_config() to get the end of the device's current virtqueues; * we check that we haven't added any config or feature information - * yet, otherwise we'd be overwriting them. */ + * yet, otherwise we'd be overwriting them. + */ assert(dev->desc->config_len == 0 && dev->desc->feature_len == 0); memcpy(device_config(dev), &vq->config, sizeof(vq->config)); dev->num_vq++; @@ -1072,14 +1240,18 @@ static void add_virtqueue(struct device *dev, unsigned int num_descs, verbose("Virtqueue page %#lx\n", to_guest_phys(p)); - /* Add to tail of list, so dev->vq is first vq, dev->vq->next is - * second. */ + /* + * Add to tail of list, so dev->vq is first vq, dev->vq->next is + * second. + */ for (i = &dev->vq; *i; i = &(*i)->next); *i = vq; } -/* The first half of the feature bitmask is for us to advertise features. The - * second half is for the Guest to accept features. */ +/* + * The first half of the feature bitmask is for us to advertise features. The + * second half is for the Guest to accept features. + */ static void add_feature(struct device *dev, unsigned bit) { u8 *features = get_feature_bits(dev); @@ -1093,9 +1265,11 @@ static void add_feature(struct device *dev, unsigned bit) features[bit / CHAR_BIT] |= (1 << (bit % CHAR_BIT)); } -/* This routine sets the configuration fields for an existing device's +/* + * This routine sets the configuration fields for an existing device's * descriptor. It only works for the last device, but that's OK because that's - * how we use it. */ + * how we use it. + */ static void set_config(struct device *dev, unsigned len, const void *conf) { /* Check we haven't overflowed our single page. */ @@ -1105,12 +1279,18 @@ static void set_config(struct device *dev, unsigned len, const void *conf) /* Copy in the config information, and store the length. */ memcpy(device_config(dev), conf, len); dev->desc->config_len = len; + + /* Size must fit in config_len field (8 bits)! */ + assert(dev->desc->config_len == len); } -/* This routine does all the creation and setup of a new device, including - * calling new_dev_desc() to allocate the descriptor and device memory. +/* + * This routine does all the creation and setup of a new device, including + * calling new_dev_desc() to allocate the descriptor and device memory. We + * don't actually start the service threads until later. * - * See what I mean about userspace being boring? */ + * See what I mean about userspace being boring? + */ static struct device *new_device(const char *name, u16 type) { struct device *dev = malloc(sizeof(*dev)); @@ -1123,10 +1303,12 @@ static struct device *new_device(const char *name, u16 type) dev->num_vq = 0; dev->running = false; - /* Append to device list. Prepending to a single-linked list is + /* + * Append to device list. Prepending to a single-linked list is * easier, but the user expects the devices to be arranged on the bus * in command-line order. The first network device on the command line - * is eth0, the first block device /dev/vda, etc. */ + * is eth0, the first block device /dev/vda, etc. + */ if (devices.lastdev) devices.lastdev->next = dev; else @@ -1136,8 +1318,10 @@ static struct device *new_device(const char *name, u16 type) return dev; } -/* Our first setup routine is the console. It's a fairly simple device, but - * UNIX tty handling makes it uglier than it could be. */ +/* + * Our first setup routine is the console. It's a fairly simple device, but + * UNIX tty handling makes it uglier than it could be. + */ static void setup_console(void) { struct device *dev; @@ -1145,8 +1329,10 @@ static void setup_console(void) /* If we can save the initial standard input settings... */ if (tcgetattr(STDIN_FILENO, &orig_term) == 0) { struct termios term = orig_term; - /* Then we turn off echo, line buffering and ^C etc. We want a - * raw input stream to the Guest. */ + /* + * Then we turn off echo, line buffering and ^C etc: We want a + * raw input stream to the Guest. + */ term.c_lflag &= ~(ISIG|ICANON|ECHO); tcsetattr(STDIN_FILENO, TCSANOW, &term); } @@ -1157,10 +1343,12 @@ static void setup_console(void) dev->priv = malloc(sizeof(struct console_abort)); ((struct console_abort *)dev->priv)->count = 0; - /* The console needs two virtqueues: the input then the output. When + /* + * The console needs two virtqueues: the input then the output. When * they put something the input queue, we make sure we're listening to * stdin. When they put something in the output queue, we write it to - * stdout. */ + * stdout. + */ add_virtqueue(dev, VIRTQUEUE_NUM, console_input); add_virtqueue(dev, VIRTQUEUE_NUM, console_output); @@ -1168,7 +1356,8 @@ static void setup_console(void) } /*:*/ -/*M:010 Inter-guest networking is an interesting area. Simplest is to have a +/*M:010 + * Inter-guest networking is an interesting area. Simplest is to have a * --sharenet=<name> option which opens or creates a named pipe. This can be * used to send packets to another guest in a 1:1 manner. * @@ -1182,7 +1371,8 @@ static void setup_console(void) * multiple inter-guest channels behind one interface, although it would * require some manner of hotplugging new virtio channels. * - * Finally, we could implement a virtio network switch in the kernel. :*/ + * Finally, we could implement a virtio network switch in the kernel. +:*/ static u32 str2ip(const char *ipaddr) { @@ -1207,11 +1397,13 @@ static void str2mac(const char *macaddr, unsigned char mac[6]) mac[5] = m[5]; } -/* This code is "adapted" from libbridge: it attaches the Host end of the +/* + * This code is "adapted" from libbridge: it attaches the Host end of the * network device to the bridge device specified by the command line. * * This is yet another James Morris contribution (I'm an IP-level guy, so I - * dislike bridging), and I just try not to break it. */ + * dislike bridging), and I just try not to break it. + */ static void add_to_bridge(int fd, const char *if_name, const char *br_name) { int ifidx; @@ -1231,9 +1423,11 @@ static void add_to_bridge(int fd, const char *if_name, const char *br_name) err(1, "can't add %s to bridge %s", if_name, br_name); } -/* This sets up the Host end of the network device with an IP address, brings +/* + * This sets up the Host end of the network device with an IP address, brings * it up so packets will flow, the copies the MAC address into the hwaddr - * pointer. */ + * pointer. + */ static void configure_device(int fd, const char *tapif, u32 ipaddr) { struct ifreq ifr; @@ -1260,10 +1454,12 @@ static int get_tun_device(char tapif[IFNAMSIZ]) /* Start with this zeroed. Messy but sure. */ memset(&ifr, 0, sizeof(ifr)); - /* We open the /dev/net/tun device and tell it we want a tap device. A + /* + * We open the /dev/net/tun device and tell it we want a tap device. A * tap device is like a tun device, only somehow different. To tell * the truth, I completely blundered my way through this code, but it - * works now! */ + * works now! + */ netfd = open_or_die("/dev/net/tun", O_RDWR); ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_VNET_HDR; strcpy(ifr.ifr_name, "tap%d"); @@ -1274,18 +1470,22 @@ static int get_tun_device(char tapif[IFNAMSIZ]) TUN_F_CSUM|TUN_F_TSO4|TUN_F_TSO6|TUN_F_TSO_ECN) != 0) err(1, "Could not set features for tun device"); - /* We don't need checksums calculated for packets coming in this - * device: trust us! */ + /* + * We don't need checksums calculated for packets coming in this + * device: trust us! + */ ioctl(netfd, TUNSETNOCSUM, 1); memcpy(tapif, ifr.ifr_name, IFNAMSIZ); return netfd; } -/*L:195 Our network is a Host<->Guest network. This can either use bridging or +/*L:195 + * Our network is a Host<->Guest network. This can either use bridging or * routing, but the principle is the same: it uses the "tun" device to inject * packets into the Host as if they came in from a normal network card. We - * just shunt packets between the Guest and the tun device. */ + * just shunt packets between the Guest and the tun device. + */ static void setup_tun_net(char *arg) { struct device *dev; @@ -1302,13 +1502,14 @@ static void setup_tun_net(char *arg) dev = new_device("net", VIRTIO_ID_NET); dev->priv = net_info; - /* Network devices need a receive and a send queue, just like - * console. */ + /* Network devices need a recv and a send queue, just like console. */ add_virtqueue(dev, VIRTQUEUE_NUM, net_input); add_virtqueue(dev, VIRTQUEUE_NUM, net_output); - /* We need a socket to perform the magic network ioctls to bring up the - * tap interface, connect to the bridge etc. Any socket will do! */ + /* + * We need a socket to perform the magic network ioctls to bring up the + * tap interface, connect to the bridge etc. Any socket will do! + */ ipfd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); if (ipfd < 0) err(1, "opening IP socket"); @@ -1362,39 +1563,31 @@ static void setup_tun_net(char *arg) verbose("device %u: tun %s: %s\n", devices.device_num, tapif, arg); } - -/* Our block (disk) device should be really simple: the Guest asks for a block - * number and we read or write that position in the file. Unfortunately, that - * was amazingly slow: the Guest waits until the read is finished before - * running anything else, even if it could have been doing useful work. - * - * We could use async I/O, except it's reputed to suck so hard that characters - * actually go missing from your code when you try to use it. - * - * So we farm the I/O out to thread, and communicate with it via a pipe. */ +/*:*/ /* This hangs off device->priv. */ -struct vblk_info -{ +struct vblk_info { /* The size of the file. */ off64_t len; /* The file descriptor for the file. */ int fd; - /* IO thread listens on this file descriptor [0]. */ - int workpipe[2]; - - /* IO thread writes to this file descriptor to mark it done, then - * Launcher triggers interrupt to Guest. */ - int done_fd; }; /*L:210 * The Disk * - * Remember that the block device is handled by a separate I/O thread. We head - * straight into the core of that thread here: + * The disk only has one virtqueue, so it only has one thread. It is really + * simple: the Guest asks for a block number and we read or write that position + * in the file. + * + * Before we serviced each virtqueue in a separate thread, that was unacceptably + * slow: the Guest waits until the read is finished before running anything + * else, even if it could have been doing useful work. + * + * We could have used async I/O, except it's reputed to suck so hard that + * characters actually go missing from your code when you try to use it. */ static void blk_request(struct virtqueue *vq) { @@ -1406,47 +1599,64 @@ static void blk_request(struct virtqueue *vq) struct iovec iov[vq->vring.num]; off64_t off; - /* Get the next request. */ + /* + * Get the next request, where we normally wait. It triggers the + * interrupt to acknowledge previously serviced requests (if any). + */ head = wait_for_vq_desc(vq, iov, &out_num, &in_num); - /* Every block request should contain at least one output buffer + /* + * Every block request should contain at least one output buffer * (detailing the location on disk and the type of request) and one - * input buffer (to hold the result). */ + * input buffer (to hold the result). + */ if (out_num == 0 || in_num == 0) errx(1, "Bad virtblk cmd %u out=%u in=%u", head, out_num, in_num); out = convert(&iov[0], struct virtio_blk_outhdr); in = convert(&iov[out_num+in_num-1], u8); + /* + * For historical reasons, block operations are expressed in 512 byte + * "sectors". + */ off = out->sector * 512; - /* The block device implements "barriers", where the Guest indicates + /* + * The block device implements "barriers", where the Guest indicates * that it wants all previous writes to occur before this write. We * don't have a way of asking our kernel to do a barrier, so we just - * synchronize all the data in the file. Pretty poor, no? */ + * synchronize all the data in the file. Pretty poor, no? + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); - /* In general the virtio block driver is allowed to try SCSI commands. - * It'd be nice if we supported eject, for example, but we don't. */ + /* + * In general the virtio block driver is allowed to try SCSI commands. + * It'd be nice if we supported eject, for example, but we don't. + */ if (out->type & VIRTIO_BLK_T_SCSI_CMD) { fprintf(stderr, "Scsi commands unsupported\n"); *in = VIRTIO_BLK_S_UNSUPP; wlen = sizeof(*in); } else if (out->type & VIRTIO_BLK_T_OUT) { - /* Write */ - - /* Move to the right location in the block file. This can fail - * if they try to write past end. */ + /* + * Write + * + * Move to the right location in the block file. This can fail + * if they try to write past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); ret = writev(vblk->fd, iov+1, out_num-1); verbose("WRITE to sector %llu: %i\n", out->sector, ret); - /* Grr... Now we know how long the descriptor they sent was, we + /* + * Grr... Now we know how long the descriptor they sent was, we * make sure they didn't try to write over the end of the block - * file (possibly extending it). */ + * file (possibly extending it). + */ if (ret > 0 && off + ret > vblk->len) { /* Trim it back to the correct length */ ftruncate64(vblk->fd, vblk->len); @@ -1456,10 +1666,12 @@ static void blk_request(struct virtqueue *vq) wlen = sizeof(*in); *in = (ret >= 0 ? VIRTIO_BLK_S_OK : VIRTIO_BLK_S_IOERR); } else { - /* Read */ - - /* Move to the right location in the block file. This can fail - * if they try to read past end. */ + /* + * Read + * + * Move to the right location in the block file. This can fail + * if they try to read past end. + */ if (lseek64(vblk->fd, off, SEEK_SET) != off) err(1, "Bad seek to sector %llu", out->sector); @@ -1474,13 +1686,16 @@ static void blk_request(struct virtqueue *vq) } } - /* OK, so we noted that it was pretty poor to use an fdatasync as a + /* + * OK, so we noted that it was pretty poor to use an fdatasync as a * barrier. But Christoph Hellwig points out that we need a sync * *afterwards* as well: "Barriers specify no reordering to the front - * or the back." And Jens Axboe confirmed it, so here we are: */ + * or the back." And Jens Axboe confirmed it, so here we are: + */ if (out->type & VIRTIO_BLK_T_BARRIER) fdatasync(vblk->fd); + /* Finished that request. */ add_used(vq, head, wlen); } @@ -1491,7 +1706,7 @@ static void setup_block_file(const char *filename) struct vblk_info *vblk; struct virtio_blk_config conf; - /* The device responds to return from I/O thread. */ + /* Creat the device. */ dev = new_device("block", VIRTIO_ID_BLOCK); /* The device has one virtqueue, where the Guest places requests. */ @@ -1510,27 +1725,32 @@ static void setup_block_file(const char *filename) /* Tell Guest how many sectors this device has. */ conf.capacity = cpu_to_le64(vblk->len / 512); - /* Tell Guest not to put in too many descriptors at once: two are used - * for the in and out elements. */ + /* + * Tell Guest not to put in too many descriptors at once: two are used + * for the in and out elements. + */ add_feature(dev, VIRTIO_BLK_F_SEG_MAX); conf.seg_max = cpu_to_le32(VIRTQUEUE_NUM - 2); - set_config(dev, sizeof(conf), &conf); + /* Don't try to put whole struct: we have 8 bit limit. */ + set_config(dev, offsetof(struct virtio_blk_config, geometry), &conf); verbose("device %u: virtblock %llu sectors\n", ++devices.device_num, le64_to_cpu(conf.capacity)); } -struct rng_info { - int rfd; -}; - -/* Our random number generator device reads from /dev/random into the Guest's +/*L:211 + * Our random number generator device reads from /dev/random into the Guest's * input buffers. The usual case is that the Guest doesn't want random numbers * and so has no buffers although /dev/random is still readable, whereas * console is the reverse. * - * The same logic applies, however. */ + * The same logic applies, however. + */ +struct rng_info { + int rfd; +}; + static void rng_input(struct virtqueue *vq) { int len; @@ -1543,9 +1763,10 @@ static void rng_input(struct virtqueue *vq) if (out_num) errx(1, "Output buffers in rng?"); - /* This is why we convert to iovecs: the readv() call uses them, and so - * it reads straight into the Guest's buffer. We loop to make sure we - * fill it. */ + /* + * Just like the console write, we loop to cover the whole iovec. + * In this case, short reads actually happen quite a bit. + */ while (!iov_empty(iov, in_num)) { len = readv(rng_info->rfd, iov, in_num); if (len <= 0) @@ -1558,15 +1779,18 @@ static void rng_input(struct virtqueue *vq) add_used(vq, head, totlen); } -/* And this creates a "hardware" random number device for the Guest. */ +/*L:199 + * This creates a "hardware" random number device for the Guest. + */ static void setup_rng(void) { struct device *dev; struct rng_info *rng_info = malloc(sizeof(*rng_info)); + /* Our device's privat info simply contains the /dev/random fd. */ rng_info->rfd = open_or_die("/dev/random", O_RDONLY); - /* The device responds to return from I/O thread. */ + /* Create the new device. */ dev = new_device("rng", VIRTIO_ID_RNG); dev->priv = rng_info; @@ -1582,8 +1806,10 @@ static void __attribute__((noreturn)) restart_guest(void) { unsigned int i; - /* Since we don't track all open fds, we simply close everything beyond - * stderr. */ + /* + * Since we don't track all open fds, we simply close everything beyond + * stderr. + */ for (i = 3; i < FD_SETSIZE; i++) close(i); @@ -1594,8 +1820,10 @@ static void __attribute__((noreturn)) restart_guest(void) err(1, "Could not exec %s", main_args[0]); } -/*L:220 Finally we reach the core of the Launcher which runs the Guest, serves - * its input and output, and finally, lays it to rest. */ +/*L:220 + * Finally we reach the core of the Launcher which runs the Guest, serves + * its input and output, and finally, lays it to rest. + */ static void __attribute__((noreturn)) run_guest(void) { for (;;) { @@ -1630,7 +1858,7 @@ static void __attribute__((noreturn)) run_guest(void) * * Are you ready? Take a deep breath and join me in the core of the Host, in * "make Host". - :*/ +:*/ static struct option opts[] = { { "verbose", 0, NULL, 'v' }, @@ -1651,8 +1879,7 @@ static void usage(void) /*L:105 The main routine is where the real work begins: */ int main(int argc, char *argv[]) { - /* Memory, top-level pagetable, code startpoint and size of the - * (optional) initrd. */ + /* Memory, code startpoint and size of the (optional) initrd. */ unsigned long mem = 0, start, initrd_size = 0; /* Two temporaries. */ int i, c; @@ -1664,24 +1891,32 @@ int main(int argc, char *argv[]) /* Save the args: we "reboot" by execing ourselves again. */ main_args = argv; - /* First we initialize the device list. We keep a pointer to the last + /* + * First we initialize the device list. We keep a pointer to the last * device, and the next interrupt number to use for devices (1: - * remember that 0 is used by the timer). */ + * remember that 0 is used by the timer). + */ devices.lastdev = NULL; devices.next_irq = 1; + /* We're CPU 0. In fact, that's the only CPU possible right now. */ cpu_id = 0; - /* We need to know how much memory so we can set up the device + + /* + * We need to know how much memory so we can set up the device * descriptor and memory pages for the devices as we parse the command * line. So we quickly look through the arguments to find the amount - * of memory now. */ + * of memory now. + */ for (i = 1; i < argc; i++) { if (argv[i][0] != '-') { mem = atoi(argv[i]) * 1024 * 1024; - /* We start by mapping anonymous pages over all of + /* + * We start by mapping anonymous pages over all of * guest-physical memory range. This fills it with 0, * and ensures that the Guest won't be killed when it - * tries to access it. */ + * tries to access it. + */ guest_base = map_zeroed_pages(mem / getpagesize() + DEVICE_PAGES); guest_limit = mem; @@ -1714,8 +1949,10 @@ int main(int argc, char *argv[]) usage(); } } - /* After the other arguments we expect memory and kernel image name, - * followed by command line arguments for the kernel. */ + /* + * After the other arguments we expect memory and kernel image name, + * followed by command line arguments for the kernel. + */ if (optind + 2 > argc) usage(); @@ -1733,20 +1970,26 @@ int main(int argc, char *argv[]) /* Map the initrd image if requested (at top of physical memory) */ if (initrd_name) { initrd_size = load_initrd(initrd_name, mem); - /* These are the location in the Linux boot header where the - * start and size of the initrd are expected to be found. */ + /* + * These are the location in the Linux boot header where the + * start and size of the initrd are expected to be found. + */ boot->hdr.ramdisk_image = mem - initrd_size; boot->hdr.ramdisk_size = initrd_size; /* The bootloader type 0xFF means "unknown"; that's OK. */ boot->hdr.type_of_loader = 0xFF; } - /* The Linux boot header contains an "E820" memory map: ours is a - * simple, single region. */ + /* + * The Linux boot header contains an "E820" memory map: ours is a + * simple, single region. + */ boot->e820_entries = 1; boot->e820_map[0] = ((struct e820entry) { 0, mem, E820_RAM }); - /* The boot header contains a command line pointer: we put the command - * line after the boot header. */ + /* + * The boot header contains a command line pointer: we put the command + * line after the boot header. + */ boot->hdr.cmd_line_ptr = to_guest_phys(boot + 1); /* We use a simple helper to copy the arguments separated by spaces. */ concat((char *)(boot + 1), argv+optind+2); @@ -1760,11 +2003,13 @@ int main(int argc, char *argv[]) /* Tell the entry path not to try to reload segment registers. */ boot->hdr.loadflags |= KEEP_SEGMENTS; - /* We tell the kernel to initialize the Guest: this returns the open - * /dev/lguest file descriptor. */ + /* + * We tell the kernel to initialize the Guest: this returns the open + * /dev/lguest file descriptor. + */ tell_kernel(start); - /* Ensure that we terminate if a child dies. */ + /* Ensure that we terminate if a device-servicing child dies. */ signal(SIGCHLD, kill_launcher); /* If we exit via err(), this kills all the threads, restores tty. */ diff --git a/arch/x86/include/asm/lguest.h b/arch/x86/include/asm/lguest.h index 313389c..5136dad 100644 --- a/arch/x86/include/asm/lguest.h +++ b/arch/x86/include/asm/lguest.h @@ -17,8 +17,7 @@ /* Pages for switcher itself, then two pages per cpu */ #define TOTAL_SWITCHER_PAGES (SHARED_SWITCHER_PAGES + 2 * nr_cpu_ids) -/* We map at -4M (-2M when PAE is activated) for ease of mapping - * into the guest (one PTE page). */ +/* We map at -4M (-2M for PAE) for ease of mapping (one PTE page). */ #ifdef CONFIG_X86_PAE #define SWITCHER_ADDR 0xFFE00000 #else diff --git a/arch/x86/include/asm/lguest_hcall.h b/arch/x86/include/asm/lguest_hcall.h index 33600a6..ba0eed8 100644 --- a/arch/x86/include/asm/lguest_hcall.h +++ b/arch/x86/include/asm/lguest_hcall.h @@ -30,27 +30,27 @@ #include <asm/hw_irq.h> #include <asm/kvm_para.h> -/*G:030 But first, how does our Guest contact the Host to ask for privileged +/*G:030 + * But first, how does our Guest contact the Host to ask for privileged * operations? There are two ways: the direct way is to make a "hypercall", * to make requests of the Host Itself. * - * We use the KVM hypercall mechanism. Seventeen hypercalls are - * available: the hypercall number is put in the %eax register, and the - * arguments (when required) are placed in %ebx, %ecx, %edx and %esi. - * If a return value makes sense, it's returned in %eax. + * We use the KVM hypercall mechanism, though completely different hypercall + * numbers. Seventeen hypercalls are available: the hypercall number is put in + * the %eax register, and the arguments (when required) are placed in %ebx, + * %ecx, %edx and %esi. If a return value makes sense, it's returned in %eax. * * Grossly invalid calls result in Sudden Death at the hands of the vengeful * Host, rather than returning failure. This reflects Winston Churchill's - * definition of a gentleman: "someone who is only rude intentionally". */ -/*:*/ + * definition of a gentleman: "someone who is only rude intentionally". +:*/ /* Can't use our min() macro here: needs to be a constant */ #define LGUEST_IRQS (NR_IRQS < 32 ? NR_IRQS: 32) #define LHCALL_RING_SIZE 64 struct hcall_args { - /* These map directly onto eax, ebx, ecx, edx and esi - * in struct lguest_regs */ + /* These map directly onto eax/ebx/ecx/edx/esi in struct lguest_regs */ unsigned long arg0, arg1, arg2, arg3, arg4; }; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index f2bf1f7..d677fa9 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -22,7 +22,8 @@ * * So how does the kernel know it's a Guest? We'll see that later, but let's * just say that we end up here where we replace the native functions various - * "paravirt" structures with our Guest versions, then boot like normal. :*/ + * "paravirt" structures with our Guest versions, then boot like normal. +:*/ /* * Copyright (C) 2006, Rusty Russell <rusty@rustcorp.com.au> IBM Corporation. @@ -74,7 +75,8 @@ * * The Guest in our tale is a simple creature: identical to the Host but * behaving in simplified but equivalent ways. In particular, the Guest is the - * same kernel as the Host (or at least, built from the same source code). :*/ + * same kernel as the Host (or at least, built from the same source code). +:*/ struct lguest_data lguest_data = { .hcall_status = { [0 ... LHCALL_RING_SIZE-1] = 0xFF }, @@ -85,7 +87,8 @@ struct lguest_data lguest_data = { .syscall_vec = SYSCALL_VECTOR, }; -/*G:037 async_hcall() is pretty simple: I'm quite proud of it really. We have a +/*G:037 + * async_hcall() is pretty simple: I'm quite proud of it really. We have a * ring buffer of stored hypercalls which the Host will run though next time we * do a normal hypercall. Each entry in the ring has 5 slots for the hypercall * arguments, and a "hcall_status" word which is 0 if the call is ready to go, @@ -94,7 +97,8 @@ struct lguest_data lguest_data = { * If we come around to a slot which hasn't been finished, then the table is * full and we just make the hypercall directly. This has the nice side * effect of causing the Host to run all the stored calls in the ring buffer - * which empties it for next time! */ + * which empties it for next time! + */ static void async_hcall(unsigned long call, unsigned long arg1, unsigned long arg2, unsigned long arg3, unsigned long arg4) @@ -103,9 +107,11 @@ static void async_hcall(unsigned long call, unsigned long arg1, static unsigned int next_call; unsigned long flags; - /* Disable interrupts if not already disabled: we don't want an + /* + * Disable interrupts if not already disabled: we don't want an * interrupt handler making a hypercall while we're already doing - * one! */ + * one! + */ local_irq_save(flags); if (lguest_data.hcall_status[next_call] != 0xFF) { /* Table full, so do normal hcall which will flush table. */ @@ -125,8 +131,9 @@ static void async_hcall(unsigned long call, unsigned long arg1, local_irq_restore(flags); } -/*G:035 Notice the lazy_hcall() above, rather than hcall(). This is our first - * real optimization trick! +/*G:035 + * Notice the lazy_hcall() above, rather than hcall(). This is our first real + * optimization trick! * * When lazy_mode is set, it means we're allowed to defer all hypercalls and do * them as a batch when lazy_mode is eventually turned off. Because hypercalls @@ -136,7 +143,8 @@ static void async_hcall(unsigned long call, unsigned long arg1, * lguest_leave_lazy_mode(). * * So, when we're in lazy mode, we call async_hcall() to store the call for - * future processing: */ + * future processing: + */ static void lazy_hcall1(unsigned long call, unsigned long arg1) { @@ -146,6 +154,7 @@ static void lazy_hcall1(unsigned long call, async_hcall(call, arg1, 0, 0, 0); } +/* You can imagine what lazy_hcall2, 3 and 4 look like. :*/ static void lazy_hcall2(unsigned long call, unsigned long arg1, unsigned long arg2) @@ -181,8 +190,10 @@ static void lazy_hcall4(unsigned long call, } #endif -/* When lazy mode is turned off reset the per-cpu lazy mode variable and then - * issue the do-nothing hypercall to flush any stored calls. */ +/*G:036 + * When lazy mode is turned off reset the per-cpu lazy mode variable and then + * issue the do-nothing hypercall to flush any stored calls. +:*/ static void lguest_leave_lazy_mmu_mode(void) { kvm_hypercall0(LHCALL_FLUSH_ASYNC); @@ -208,9 +219,11 @@ static void lguest_end_context_switch(struct task_struct *next) * check there before it tries to deliver an interrupt. */ -/* save_flags() is expected to return the processor state (ie. "flags"). The +/* + * save_flags() is expected to return the processor state (ie. "flags"). The * flags word contains all kind of stuff, but in practice Linux only cares - * about the interrupt flag. Our "save_flags()" just returns that. */ + * about the interrupt flag. Our "save_flags()" just returns that. + */ static unsigned long save_fl(void) { return lguest_data.irq_enabled; @@ -222,13 +235,15 @@ static void irq_disable(void) lguest_data.irq_enabled = 0; } -/* Let's pause a moment. Remember how I said these are called so often? +/* + * Let's pause a moment. Remember how I said these are called so often? * Jeremy Fitzhardinge optimized them so hard early in 2009 that he had to * break some rules. In particular, these functions are assumed to save their * own registers if they need to: normal C functions assume they can trash the * eax register. To use normal C functions, we use * PV_CALLEE_SAVE_REGS_THUNK(), which pushes %eax onto the stack, calls the - * C function, then restores it. */ + * C function, then restores it. + */ PV_CALLEE_SAVE_REGS_THUNK(save_fl); PV_CALLEE_SAVE_REGS_THUNK(irq_disable); /*:*/ @@ -237,18 +252,18 @@ PV_CALLEE_SAVE_REGS_THUNK(irq_disable); extern void lg_irq_enable(void); extern void lg_restore_fl(unsigned long flags); -/*M:003 Note that we don't check for outstanding interrupts when we re-enable - * them (or when we unmask an interrupt). This seems to work for the moment, - * since interrupts are rare and we'll just get the interrupt on the next timer - * tick, but now we can run with CONFIG_NO_HZ, we should revisit this. One way - * would be to put the "irq_enabled" field in a page by itself, and have the - * Host write-protect it when an interrupt comes in when irqs are disabled. - * There will then be a page fault as soon as interrupts are re-enabled. +/*M:003 + * We could be more efficient in our checking of outstanding interrupts, rather + * than using a branch. One way would be to put the "irq_enabled" field in a + * page by itself, and have the Host write-protect it when an interrupt comes + * in when irqs are disabled. There will then be a page fault as soon as + * interrupts are re-enabled. * * A better method is to implement soft interrupt disable generally for x86: * instead of disabling interrupts, we set a flag. If an interrupt does come * in, we then disable them for real. This is uncommon, so we could simply use - * a hypercall for interrupt control and not worry about efficiency. :*/ + * a hypercall for interrupt control and not worry about efficiency. +:*/ /*G:034 * The Interrupt Descriptor Table (IDT). @@ -261,10 +276,12 @@ extern void lg_restore_fl(unsigned long flags); static void lguest_write_idt_entry(gate_desc *dt, int entrynum, const gate_desc *g) { - /* The gate_desc structure is 8 bytes long: we hand it to the Host in + /* + * The gate_desc structure is 8 bytes long: we hand it to the Host in * two 32-bit chunks. The whole 32-bit kernel used to hand descriptors * around like this; typesafety wasn't a big concern in Linux's early - * years. */ + * years. + */ u32 *desc = (u32 *)g; /* Keep the local copy up to date. */ native_write_idt_entry(dt, entrynum, g); @@ -272,9 +289,11 @@ static void lguest_write_idt_entry(gate_desc *dt, kvm_hypercall3(LHCALL_LOAD_IDT_ENTRY, entrynum, desc[0], desc[1]); } -/* Changing to a different IDT is very rare: we keep the IDT up-to-date every +/* + * Changing to a different IDT is very rare: we keep the IDT up-to-date every * time it is written, so we can simply loop through all entries and tell the - * Host about them. */ + * Host about them. + */ static void lguest_load_idt(const struct desc_ptr *desc) { unsigned int i; @@ -305,9 +324,11 @@ static void lguest_load_gdt(const struct desc_ptr *desc) kvm_hypercall3(LHCALL_LOAD_GDT_ENTRY, i, gdt[i].a, gdt[i].b); } -/* For a single GDT entry which changes, we do the lazy thing: alter our GDT, +/* + * For a single GDT entry which changes, we do the lazy thing: alter our GDT, * then tell the Host to reload the entire thing. This operation is so rare - * that this naive implementation is reasonable. */ + * that this naive implementation is reasonable. + */ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, const void *desc, int type) { @@ -317,29 +338,36 @@ static void lguest_write_gdt_entry(struct desc_struct *dt, int entrynum, dt[entrynum].a, dt[entrynum].b); } -/* OK, I lied. There are three "thread local storage" GDT entries which change +/* + * OK, I lied. There are three "thread local storage" GDT entries which change * on every context switch (these three entries are how glibc implements - * __thread variables). So we have a hypercall specifically for this case. */ + * __thread variables). So we have a hypercall specifically for this case. + */ static void lguest_load_tls(struct thread_struct *t, unsigned int cpu) { - /* There's one problem which normal hardware doesn't have: the Host + /* + * There's one problem which normal hardware doesn't have: the Host * can't handle us removing entries we're currently using. So we clear - * the GS register here: if it's needed it'll be reloaded anyway. */ + * the GS register here: if it's needed it'll be reloaded anyway. + */ lazy_load_gs(0); lazy_hcall2(LHCALL_LOAD_TLS, __pa(&t->tls_array), cpu); } -/*G:038 That's enough excitement for now, back to ploughing through each of - * the different pv_ops structures (we're about 1/3 of the way through). +/*G:038 + * That's enough excitement for now, back to ploughing through each of the + * different pv_ops structures (we're about 1/3 of the way through). * * This is the Local Descriptor Table, another weird Intel thingy. Linux only * uses this for some strange applications like Wine. We don't do anything - * here, so they'll get an informative and friendly Segmentation Fault. */ + * here, so they'll get an informative and friendly Segmentation Fault. + */ static void lguest_set_ldt(const void *addr, unsigned entries) { } -/* This loads a GDT entry into the "Task Register": that entry points to a +/* + * This loads a GDT entry into the "Task Register": that entry points to a * structure called the Task State Segment. Some comments scattered though the * kernel code indicate that this used for task switching in ages past, along * with blood sacrifice and astrology. @@ -347,19 +375,21 @@ static void lguest_set_ldt(const void *addr, unsigned entries) * Now there's nothing interesting in here that we don't get told elsewhere. * But the native version uses the "ltr" instruction, which makes the Host * complain to the Guest about a Segmentation Fault and it'll oops. So we - * override the native version with a do-nothing version. */ + * override the native version with a do-nothing version. + */ static void lguest_load_tr_desc(void) { } -/* The "cpuid" instruction is a way of querying both the CPU identity +/* + * The "cpuid" instruction is a way of querying both the CPU identity * (manufacturer, model, etc) and its features. It was introduced before the * Pentium in 1993 and keeps getting extended by both Intel, AMD and others. * As you might imagine, after a decade and a half this treatment, it is now a * giant ball of hair. Its entry in the current Intel manual runs to 28 pages. * * This instruction even it has its own Wikipedia entry. The Wikipedia entry - * has been translated into 4 languages. I am not making this up! + * has been translated into 5 languages. I am not making this up! * * We could get funky here and identify ourselves as "GenuineLguest", but * instead we just use the real "cpuid" instruction. Then I pretty much turned @@ -371,7 +401,8 @@ static void lguest_load_tr_desc(void) * Replacing the cpuid so we can turn features off is great for the kernel, but * anyone (including userspace) can just use the raw "cpuid" instruction and * the Host won't even notice since it isn't privileged. So we try not to get - * too worked up about it. */ + * too worked up about it. + */ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, unsigned int *cx, unsigned int *dx) { @@ -379,43 +410,63 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, native_cpuid(ax, bx, cx, dx); switch (function) { - case 0: /* ID and highest CPUID. Futureproof a little by sticking to - * older ones. */ + /* + * CPUID 0 gives the highest legal CPUID number (and the ID string). + * We futureproof our code a little by sticking to known CPUID values. + */ + case 0: if (*ax > 5) *ax = 5; break; - case 1: /* Basic feature request. */ - /* We only allow kernel to see SSE3, CMPXCHG16B and SSSE3 */ + + /* + * CPUID 1 is a basic feature request. + * + * CX: we only allow kernel to see SSE3, CMPXCHG16B and SSSE3 + * DX: SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU and PAE. + */ + case 1: *cx &= 0x00002201; - /* SSE, SSE2, FXSR, MMX, CMOV, CMPXCHG8B, TSC, FPU, PAE. */ *dx &= 0x07808151; - /* The Host can do a nice optimization if it knows that the + /* + * The Host can do a nice optimization if it knows that the * kernel mappings (addresses above 0xC0000000 or whatever * PAGE_OFFSET is set to) haven't changed. But Linux calls * flush_tlb_user() for both user and kernel mappings unless - * the Page Global Enable (PGE) feature bit is set. */ + * the Page Global Enable (PGE) feature bit is set. + */ *dx |= 0x00002000; - /* We also lie, and say we're family id 5. 6 or greater + /* + * We also lie, and say we're family id 5. 6 or greater * leads to a rdmsr in early_init_intel which we can't handle. - * Family ID is returned as bits 8-12 in ax. */ + * Family ID is returned as bits 8-12 in ax. + */ *ax &= 0xFFFFF0FF; *ax |= 0x00000500; break; + /* + * 0x80000000 returns the highest Extended Function, so we futureproof + * like we do above by limiting it to known fields. + */ case 0x80000000: - /* Futureproof this a little: if they ask how much extended - * processor information there is, limit it to known fields. */ if (*ax > 0x80000008) *ax = 0x80000008; break; + + /* + * PAE systems can mark pages as non-executable. Linux calls this the + * NX bit. Intel calls it XD (eXecute Disable), AMD EVP (Enhanced + * Virus Protection). We just switch turn if off here, since we don't + * support it. + */ case 0x80000001: - /* Here we should fix nx cap depending on host. */ - /* For this version of PAE, we just clear NX bit. */ *dx &= ~(1 << 20); break; } } -/* Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. +/* + * Intel has four control registers, imaginatively named cr0, cr2, cr3 and cr4. * I assume there's a cr1, but it hasn't bothered us yet, so we'll not bother * it. The Host needs to know when the Guest wants to change them, so we have * a whole series of functions like read_cr0() and write_cr0(). @@ -430,7 +481,8 @@ static void lguest_cpuid(unsigned int *ax, unsigned int *bx, * name like "FPUTRAP bit" be a little less cryptic? * * We store cr0 locally because the Host never changes it. The Guest sometimes - * wants to read it and we'd prefer not to bother the Host unnecessarily. */ + * wants to read it and we'd prefer not to bother the Host unnecessarily. + */ static unsigned long current_cr0; static void lguest_write_cr0(unsigned long val) { @@ -443,18 +495,22 @@ static unsigned long lguest_read_cr0(void) return current_cr0; } -/* Intel provided a special instruction to clear the TS bit for people too cool +/* + * Intel provided a special instruction to clear the TS bit for people too cool * to use write_cr0() to do it. This "clts" instruction is faster, because all - * the vowels have been optimized out. */ + * the vowels have been optimized out. + */ static void lguest_clts(void) { lazy_hcall1(LHCALL_TS, 0); current_cr0 &= ~X86_CR0_TS; } -/* cr2 is the virtual address of the last page fault, which the Guest only ever +/* + * cr2 is the virtual address of the last page fault, which the Guest only ever * reads. The Host kindly writes this into our "struct lguest_data", so we - * just read it out of there. */ + * just read it out of there. + */ static unsigned long lguest_read_cr2(void) { return lguest_data.cr2; @@ -463,10 +519,12 @@ static unsigned long lguest_read_cr2(void) /* See lguest_set_pte() below. */ static bool cr3_changed = false; -/* cr3 is the current toplevel pagetable page: the principle is the same as +/* + * cr3 is the current toplevel pagetable page: the principle is the same as * cr0. Keep a local copy, and tell the Host when it changes. The only * difference is that our local copy is in lguest_data because the Host needs - * to set it upon our initial hypercall. */ + * to set it upon our initial hypercall. + */ static void lguest_write_cr3(unsigned long cr3) { lguest_data.pgdir = cr3; @@ -511,7 +569,7 @@ static void lguest_write_cr4(unsigned long val) * cr3 ---> +---------+ * | --------->+---------+ * | | | PADDR1 | - * Top-level | | PADDR2 | + * Mid-level | | PADDR2 | * (PMD) page | | | * | | Lower-level | * | | (PTE) page | @@ -531,21 +589,62 @@ static void lguest_write_cr4(unsigned long val) * Index into top Index into second Offset within page * page directory page pagetable page * - * The kernel spends a lot of time changing both the top-level page directory - * and lower-level pagetable pages. The Guest doesn't know physical addresses, - * so while it maintains these page tables exactly like normal, it also needs - * to keep the Host informed whenever it makes a change: the Host will create - * the real page tables based on the Guests'. + * Now, unfortunately, this isn't the whole story: Intel added Physical Address + * Extension (PAE) to allow 32 bit systems to use 64GB of memory (ie. 36 bits). + * These are held in 64-bit page table entries, so we can now only fit 512 + * entries in a page, and the neat three-level tree breaks down. + * + * The result is a four level page table: + * + * cr3 --> [ 4 Upper ] + * [ Level ] + * [ Entries ] + * [(PUD Page)]---> +---------+ + * | --------->+---------+ + * | | | PADDR1 | + * Mid-level | | PADDR2 | + * (PMD) page | | | + * | | Lower-level | + * | | (PTE) page | + * | | | | + * .... .... + * + * + * And the virtual address is decoded as: + * + * 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 + * |<-2->|<--- 9 bits ---->|<---- 9 bits --->|<------ 12 bits ------>| + * Index into Index into mid Index into lower Offset within page + * top entries directory page pagetable page + * + * It's too hard to switch between these two formats at runtime, so Linux only + * supports one or the other depending on whether CONFIG_X86_PAE is set. Many + * distributions turn it on, and not just for people with silly amounts of + * memory: the larger PTE entries allow room for the NX bit, which lets the + * kernel disable execution of pages and increase security. + * + * This was a problem for lguest, which couldn't run on these distributions; + * then Matias Zabaljauregui figured it all out and implemented it, and only a + * handful of puppies were crushed in the process! + * + * Back to our point: the kernel spends a lot of time changing both the + * top-level page directory and lower-level pagetable pages. The Guest doesn't + * know physical addresses, so while it maintains these page tables exactly + * like normal, it also needs to keep the Host informed whenever it makes a + * change: the Host will create the real page tables based on the Guests'. */ -/* The Guest calls this to set a second-level entry (pte), ie. to map a page - * into a process' address space. We set the entry then tell the Host the - * toplevel and address this corresponds to. The Guest uses one pagetable per - * process, so we need to tell the Host which one we're changing (mm->pgd). */ +/* + * The Guest calls this after it has set a second-level entry (pte), ie. to map + * a page into a process' address space. Wetell the Host the toplevel and + * address this corresponds to. The Guest uses one pagetable per process, so + * we need to tell the Host which one we're changing (mm->pgd). + */ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, pte_t *ptep) { #ifdef CONFIG_X86_PAE + /* PAE needs to hand a 64 bit page table entry, so it uses two args. */ lazy_hcall4(LHCALL_SET_PTE, __pa(mm->pgd), addr, ptep->pte_low, ptep->pte_high); #else @@ -553,6 +652,7 @@ static void lguest_pte_update(struct mm_struct *mm, unsigned long addr, #endif } +/* This is the "set and update" combo-meal-deal version. */ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval) { @@ -560,10 +660,13 @@ static void lguest_set_pte_at(struct mm_struct *mm, unsigned long addr, lguest_pte_update(mm, addr, ptep); } -/* The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd +/* + * The Guest calls lguest_set_pud to set a top-level entry and lguest_set_pmd * to set a middle-level entry when PAE is activated. + * * Again, we set the entry then tell the Host which page we changed, - * and the index of the entry we changed. */ + * and the index of the entry we changed. + */ #ifdef CONFIG_X86_PAE static void lguest_set_pud(pud_t *pudp, pud_t pudval) { @@ -582,8 +685,7 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #else -/* The Guest calls lguest_set_pmd to set a top-level entry when PAE is not - * activated. */ +/* The Guest calls lguest_set_pmd to set a top-level entry when !PAE. */ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) { native_set_pmd(pmdp, pmdval); @@ -592,7 +694,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) } #endif -/* There are a couple of legacy places where the kernel sets a PTE, but we +/* + * There are a couple of legacy places where the kernel sets a PTE, but we * don't know the top level any more. This is useless for us, since we don't * know which pagetable is changing or what address, so we just tell the Host * to forget all of them. Fortunately, this is very rare. @@ -600,7 +703,8 @@ static void lguest_set_pmd(pmd_t *pmdp, pmd_t pmdval) * ... except in early boot when the kernel sets up the initial pagetables, * which makes booting astonishingly slow: 1.83 seconds! So we don't even tell * the Host anything changed until we've done the first page table switch, - * which brings boot back to 0.25 seconds. */ + * which brings boot back to 0.25 seconds. + */ static void lguest_set_pte(pte_t *ptep, pte_t pteval) { native_set_pte(ptep, pteval); @@ -609,6 +713,11 @@ static void lguest_set_pte(pte_t *ptep, pte_t pteval) } #ifdef CONFIG_X86_PAE +/* + * With 64-bit PTE values, we need to be careful setting them: if we set 32 + * bits at a time, the hardware could see a weird half-set entry. These + * versions ensure we update all 64 bits at once. + */ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) { native_set_pte_atomic(ptep, pte); @@ -616,19 +725,21 @@ static void lguest_set_pte_atomic(pte_t *ptep, pte_t pte) lazy_hcall1(LHCALL_FLUSH_TLB, 1); } -void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep) +static void lguest_pte_clear(struct mm_struct *mm, unsigned long addr, + pte_t *ptep) { native_pte_clear(mm, addr, ptep); lguest_pte_update(mm, addr, ptep); } -void lguest_pmd_clear(pmd_t *pmdp) +static void lguest_pmd_clear(pmd_t *pmdp) { lguest_set_pmd(pmdp, __pmd(0)); } #endif -/* Unfortunately for Lguest, the pv_mmu_ops for page tables were based on +/* + * Unfortunately for Lguest, the pv_mmu_ops for page tables were based on * native page table operations. On native hardware you can set a new page * table entry whenever you want, but if you want to remove one you have to do * a TLB flush (a TLB is a little cache of page table entries kept by the CPU). @@ -637,24 +748,29 @@ void lguest_pmd_clear(pmd_t *pmdp) * called when a valid entry is written, not when it's removed (ie. marked not * present). Instead, this is where we come when the Guest wants to remove a * page table entry: we tell the Host to set that entry to 0 (ie. the present - * bit is zero). */ + * bit is zero). + */ static void lguest_flush_tlb_single(unsigned long addr) { /* Simply set it to zero: if it was not, it will fault back in. */ lazy_hcall3(LHCALL_SET_PTE, lguest_data.pgdir, addr, 0); } -/* This is what happens after the Guest has removed a large number of entries. +/* + * This is what happens after the Guest has removed a large number of entries. * This tells the Host that any of the page table entries for userspace might - * have changed, ie. virtual addresses below PAGE_OFFSET. */ + * have changed, ie. virtual addresses below PAGE_OFFSET. + */ static void lguest_flush_tlb_user(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 0); } -/* This is called when the kernel page tables have changed. That's not very +/* + * This is called when the kernel page tables have changed. That's not very * common (unless the Guest is using highmem, which makes the Guest extremely - * slow), so it's worth separating this from the user flushing above. */ + * slow), so it's worth separating this from the user flushing above. + */ static void lguest_flush_tlb_kernel(void) { lazy_hcall1(LHCALL_FLUSH_TLB, 1); @@ -691,26 +807,38 @@ static struct irq_chip lguest_irq_controller = { .unmask = enable_lguest_irq, }; -/* This sets up the Interrupt Descriptor Table (IDT) entry for each hardware +/* + * This sets up the Interrupt Descriptor Table (IDT) entry for each hardware * interrupt (except 128, which is used for system calls), and then tells the * Linux infrastructure that each interrupt is controlled by our level-based - * lguest interrupt controller. */ + * lguest interrupt controller. + */ static void __init lguest_init_IRQ(void) { unsigned int i; for (i = FIRST_EXTERNAL_VECTOR; i < NR_VECTORS; i++) { - /* Some systems map "vectors" to interrupts weirdly. Lguest has - * a straightforward 1 to 1 mapping, so force that here. */ + /* Some systems map "vectors" to interrupts weirdly. Not us! */ __get_cpu_var(vector_irq)[i] = i - FIRST_EXTERNAL_VECTOR; if (i != SYSCALL_VECTOR) set_intr_gate(i, interrupt[i - FIRST_EXTERNAL_VECTOR]); } - /* This call is required to set up for 4k stacks, where we have - * separate stacks for hard and soft interrupts. */ + + /* + * This call is required to set up for 4k stacks, where we have + * separate stacks for hard and soft interrupts. + */ irq_ctx_init(smp_processor_id()); } +/* + * With CONFIG_SPARSE_IRQ, interrupt descriptors are allocated as-needed, so + * rather than set them in lguest_init_IRQ we are called here every time an + * lguest device needs an interrupt. + * + * FIXME: irq_to_desc_alloc_node() can fail due to lack of memory, we should + * pass that up! + */ void lguest_setup_irq(unsigned int irq) { irq_to_desc_alloc_node(irq, 0); @@ -729,31 +857,39 @@ static unsigned long lguest_get_wallclock(void) return lguest_data.time.tv_sec; } -/* The TSC is an Intel thing called the Time Stamp Counter. The Host tells us +/* + * The TSC is an Intel thing called the Time Stamp Counter. The Host tells us * what speed it runs at, or 0 if it's unusable as a reliable clock source. * This matches what we want here: if we return 0 from this function, the x86 - * TSC clock will give up and not register itself. */ + * TSC clock will give up and not register itself. + */ static unsigned long lguest_tsc_khz(void) { return lguest_data.tsc_khz; } -/* If we can't use the TSC, the kernel falls back to our lower-priority - * "lguest_clock", where we read the time value given to us by the Host. */ +/* + * If we can't use the TSC, the kernel falls back to our lower-priority + * "lguest_clock", where we read the time value given to us by the Host. + */ static cycle_t lguest_clock_read(struct clocksource *cs) { unsigned long sec, nsec; - /* Since the time is in two parts (seconds and nanoseconds), we risk + /* + * Since the time is in two parts (seconds and nanoseconds), we risk * reading it just as it's changing from 99 & 0.999999999 to 100 and 0, * and getting 99 and 0. As Linux tends to come apart under the stress - * of time travel, we must be careful: */ + * of time travel, we must be careful: + */ do { /* First we read the seconds part. */ sec = lguest_data.time.tv_sec; - /* This read memory barrier tells the compiler and the CPU that + /* + * This read memory barrier tells the compiler and the CPU that * this can't be reordered: we have to complete the above - * before going on. */ + * before going on. + */ rmb(); /* Now we read the nanoseconds part. */ nsec = lguest_data.time.tv_nsec; @@ -777,9 +913,11 @@ static struct clocksource lguest_clock = { .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -/* We also need a "struct clock_event_device": Linux asks us to set it to go +/* + * We also need a "struct clock_event_device": Linux asks us to set it to go * off some time in the future. Actually, James Morris figured all this out, I - * just applied the patch. */ + * just applied the patch. + */ static int lguest_clockevent_set_next_event(unsigned long delta, struct clock_event_device *evt) { @@ -829,8 +967,10 @@ static struct clock_event_device lguest_clockevent = { .max_delta_ns = LG_CLOCK_MAX_DELTA, }; -/* This is the Guest timer interrupt handler (hardware interrupt 0). We just - * call the clockevent infrastructure and it does whatever needs doing. */ +/* + * This is the Guest timer interrupt handler (hardware interrupt 0). We just + * call the clockevent infrastructure and it does whatever needs doing. + */ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) { unsigned long flags; @@ -841,10 +981,12 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) local_irq_restore(flags); } -/* At some point in the boot process, we get asked to set up our timing +/* + * At some point in the boot process, we get asked to set up our timing * infrastructure. The kernel doesn't expect timer interrupts before this, but * we cleverly initialized the "blocked_interrupts" field of "struct - * lguest_data" so that timer interrupts were blocked until now. */ + * lguest_data" so that timer interrupts were blocked until now. + */ static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ @@ -868,14 +1010,16 @@ static void lguest_time_init(void) * to work. They're pretty simple. */ -/* The Guest needs to tell the Host what stack it expects traps to use. For +/* + * The Guest needs to tell the Host what stack it expects traps to use. For * native hardware, this is part of the Task State Segment mentioned above in * lguest_load_tr_desc(), but to help hypervisors there's this special call. * * We tell the Host the segment we want to use (__KERNEL_DS is the kernel data * segment), the privilege level (we're privilege level 1, the Host is 0 and * will not tolerate us trying to use that), the stack pointer, and the number - * of pages in the stack. */ + * of pages in the stack. + */ static void lguest_load_sp0(struct tss_struct *tss, struct thread_struct *thread) { @@ -889,7 +1033,8 @@ static void lguest_set_debugreg(int regno, unsigned long value) /* FIXME: Implement */ } -/* There are times when the kernel wants to make sure that no memory writes are +/* + * There are times when the kernel wants to make sure that no memory writes are * caught in the cache (that they've all reached real hardware devices). This * doesn't matter for the Guest which has virtual hardware. * @@ -903,11 +1048,13 @@ static void lguest_wbinvd(void) { } -/* If the Guest expects to have an Advanced Programmable Interrupt Controller, +/* + * If the Guest expects to have an Advanced Programmable Interrupt Controller, * we play dumb by ignoring writes and returning 0 for reads. So it's no * longer Programmable nor Controlling anything, and I don't think 8 lines of * code qualifies for Advanced. It will also never interrupt anything. It - * does, however, allow us to get through the Linux boot code. */ + * does, however, allow us to get through the Linux boot code. + */ #ifdef CONFIG_X86_LOCAL_APIC static void lguest_apic_write(u32 reg, u32 v) { @@ -956,11 +1103,13 @@ static void lguest_safe_halt(void) kvm_hypercall0(LHCALL_HALT); } -/* The SHUTDOWN hypercall takes a string to describe what's happening, and +/* + * The SHUTDOWN hypercall takes a string to describe what's happening, and * an argument which says whether this to restart (reboot) the Guest or not. * * Note that the Host always prefers that the Guest speak in physical addresses - * rather than virtual addresses, so we use __pa() here. */ + * rather than virtual addresses, so we use __pa() here. + */ static void lguest_power_off(void) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa("Power down"), @@ -991,8 +1140,10 @@ static __init char *lguest_memory_setup(void) * nice to move it back to lguest_init. Patch welcome... */ atomic_notifier_chain_register(&panic_notifier_list, &paniced); - /* The Linux bootloader header contains an "e820" memory map: the - * Launcher populated the first entry with our memory limit. */ + /* + *The Linux bootloader header contains an "e820" memory map: the + * Launcher populated the first entry with our memory limit. + */ e820_add_region(boot_params.e820_map[0].addr, boot_params.e820_map[0].size, boot_params.e820_map[0].type); @@ -1001,16 +1152,17 @@ static __init char *lguest_memory_setup(void) return "LGUEST"; } -/* We will eventually use the virtio console device to produce console output, +/* + * We will eventually use the virtio console device to produce console output, * but before that is set up we use LHCALL_NOTIFY on normal memory to produce - * console output. */ + * console output. + */ static __init int early_put_chars(u32 vtermno, const char *buf, int count) { char scratch[17]; unsigned int len = count; - /* We use a nul-terminated string, so we have to make a copy. Icky, - * huh? */ + /* We use a nul-terminated string, so we make a copy. Icky, huh? */ if (len > sizeof(scratch) - 1) len = sizeof(scratch) - 1; scratch[len] = '\0'; @@ -1021,8 +1173,10 @@ static __init int early_put_chars(u32 vtermno, const char *buf, int count) return len; } -/* Rebooting also tells the Host we're finished, but the RESTART flag tells the - * Launcher to reboot us. */ +/* + * Rebooting also tells the Host we're finished, but the RESTART flag tells the + * Launcher to reboot us. + */ static void lguest_restart(char *reason) { kvm_hypercall2(LHCALL_SHUTDOWN, __pa(reason), LGUEST_SHUTDOWN_RESTART); @@ -1049,7 +1203,8 @@ static void lguest_restart(char *reason) * fit comfortably. * * First we need assembly templates of each of the patchable Guest operations, - * and these are in i386_head.S. */ + * and these are in i386_head.S. + */ /*G:060 We construct a table from the assembler templates: */ static const struct lguest_insns @@ -1060,9 +1215,11 @@ static const struct lguest_insns [PARAVIRT_PATCH(pv_irq_ops.save_fl)] = { lgstart_pushf, lgend_pushf }, }; -/* Now our patch routine is fairly simple (based on the native one in +/* + * Now our patch routine is fairly simple (based on the native one in * paravirt.c). If we have a replacement, we copy it in and return how much of - * the available space we used. */ + * the available space we used. + */ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, unsigned long addr, unsigned len) { @@ -1074,8 +1231,7 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, insn_len = lguest_insns[type].end - lguest_insns[type].start; - /* Similarly if we can't fit replacement (shouldn't happen, but let's - * be thorough). */ + /* Similarly if it can't fit (doesn't happen, but let's be thorough). */ if (len < insn_len) return paravirt_patch_default(type, clobber, ibuf, addr, len); @@ -1084,22 +1240,28 @@ static unsigned lguest_patch(u8 type, u16 clobber, void *ibuf, return insn_len; } -/*G:029 Once we get to lguest_init(), we know we're a Guest. The various +/*G:029 + * Once we get to lguest_init(), we know we're a Guest. The various * pv_ops structures in the kernel provide points for (almost) every routine we - * have to override to avoid privileged instructions. */ + * have to override to avoid privileged instructions. + */ __init void lguest_init(void) { - /* We're under lguest, paravirt is enabled, and we're running at - * privilege level 1, not 0 as normal. */ + /* We're under lguest. */ pv_info.name = "lguest"; + /* Paravirt is enabled. */ pv_info.paravirt_enabled = 1; + /* We're running at privilege level 1, not 0 as normal. */ pv_info.kernel_rpl = 1; + /* Everyone except Xen runs with this set. */ pv_info.shared_kernel_pmd = 1; - /* We set up all the lguest overrides for sensitive operations. These - * are detailed with the operations themselves. */ + /* + * We set up all the lguest overrides for sensitive operations. These + * are detailed with the operations themselves. + */ - /* interrupt-related operations */ + /* Interrupt-related operations */ pv_irq_ops.init_IRQ = lguest_init_IRQ; pv_irq_ops.save_fl = PV_CALLEE_SAVE(save_fl); pv_irq_ops.restore_fl = __PV_IS_CALLEE_SAVE(lg_restore_fl); @@ -1107,11 +1269,11 @@ __init void lguest_init(void) pv_irq_ops.irq_enable = __PV_IS_CALLEE_SAVE(lg_irq_enable); pv_irq_ops.safe_halt = lguest_safe_halt; - /* init-time operations */ + /* Setup operations */ pv_init_ops.memory_setup = lguest_memory_setup; pv_init_ops.patch = lguest_patch; - /* Intercepts of various cpu instructions */ + /* Intercepts of various CPU instructions */ pv_cpu_ops.load_gdt = lguest_load_gdt; pv_cpu_ops.cpuid = lguest_cpuid; pv_cpu_ops.load_idt = lguest_load_idt; @@ -1132,7 +1294,7 @@ __init void lguest_init(void) pv_cpu_ops.start_context_switch = paravirt_start_context_switch; pv_cpu_ops.end_context_switch = lguest_end_context_switch; - /* pagetable management */ + /* Pagetable management */ pv_mmu_ops.write_cr3 = lguest_write_cr3; pv_mmu_ops.flush_tlb_user = lguest_flush_tlb_user; pv_mmu_ops.flush_tlb_single = lguest_flush_tlb_single; @@ -1154,54 +1316,71 @@ __init void lguest_init(void) pv_mmu_ops.pte_update_defer = lguest_pte_update; #ifdef CONFIG_X86_LOCAL_APIC - /* apic read/write intercepts */ + /* APIC read/write intercepts */ set_lguest_basic_apic_ops(); #endif - /* time operations */ + /* Time operations */ pv_time_ops.get_wallclock = lguest_get_wallclock; pv_time_ops.time_init = lguest_time_init; pv_time_ops.get_tsc_khz = lguest_tsc_khz; - /* Now is a good time to look at the implementations of these functions - * before returning to the rest of lguest_init(). */ + /* + * Now is a good time to look at the implementations of these functions + * before returning to the rest of lguest_init(). + */ - /*G:070 Now we've seen all the paravirt_ops, we return to + /*G:070 + * Now we've seen all the paravirt_ops, we return to * lguest_init() where the rest of the fairly chaotic boot setup - * occurs. */ + * occurs. + */ - /* The stack protector is a weird thing where gcc places a canary + /* + * The stack protector is a weird thing where gcc places a canary * value on the stack and then checks it on return. This file is * compiled with -fno-stack-protector it, so we got this far without * problems. The value of the canary is kept at offset 20 from the * %gs register, so we need to set that up before calling C functions - * in other files. */ + * in other files. + */ setup_stack_canary_segment(0); - /* We could just call load_stack_canary_segment(), but we might as - * call switch_to_new_gdt() which loads the whole table and sets up - * the per-cpu segment descriptor register %fs as well. */ + + /* + * We could just call load_stack_canary_segment(), but we might as well + * call switch_to_new_gdt() which loads the whole table and sets up the + * per-cpu segment descriptor register %fs as well. + */ switch_to_new_gdt(0); - /* As described in head_32.S, we map the first 128M of memory. */ + /* We actually boot with all memory mapped, but let's say 128MB. */ max_pfn_mapped = (128*1024*1024) >> PAGE_SHIFT; - /* The Host<->Guest Switcher lives at the top of our address space, and + /* + * The Host<->Guest Switcher lives at the top of our address space, and * the Host told us how big it is when we made LGUEST_INIT hypercall: - * it put the answer in lguest_data.reserve_mem */ + * it put the answer in lguest_data.reserve_mem + */ reserve_top_address(lguest_data.reserve_mem); - /* If we don't initialize the lock dependency checker now, it crashes - * paravirt_disable_iospace. */ + /* + * If we don't initialize the lock dependency checker now, it crashes + * paravirt_disable_iospace. + */ lockdep_init(); - /* The IDE code spends about 3 seconds probing for disks: if we reserve + /* + * The IDE code spends about 3 seconds probing for disks: if we reserve * all the I/O ports up front it can't get them and so doesn't probe. * Other device drivers are similar (but less severe). This cuts the - * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. */ + * kernel boot time on my machine from 4.1 seconds to 0.45 seconds. + */ paravirt_disable_iospace(); - /* This is messy CPU setup stuff which the native boot code does before - * start_kernel, so we have to do, too: */ + /* + * This is messy CPU setup stuff which the native boot code does before + * start_kernel, so we have to do, too: + */ cpu_detect(&new_cpu_data); /* head.S usually sets up the first capability word, so do it here. */ new_cpu_data.x86_capability[0] = cpuid_edx(1); @@ -1218,22 +1397,28 @@ __init void lguest_init(void) acpi_ht = 0; #endif - /* We set the preferred console to "hvc". This is the "hypervisor + /* + * We set the preferred console to "hvc". This is the "hypervisor * virtual console" driver written by the PowerPC people, which we also - * adapted for lguest's use. */ + * adapted for lguest's use. + */ add_preferred_console("hvc", 0, NULL); /* Register our very early console. */ virtio_cons_early_init(early_put_chars); - /* Last of all, we set the power management poweroff hook to point to + /* + * Last of all, we set the power management poweroff hook to point to * the Guest routine to power off, and the reboot hook to our restart - * routine. */ + * routine. + */ pm_power_off = lguest_power_off; machine_ops.restart = lguest_restart; - /* Now we're set up, call i386_start_kernel() in head32.c and we proceed - * to boot as normal. It never returns. */ + /* + * Now we're set up, call i386_start_kernel() in head32.c and we proceed + * to boot as normal. It never returns. + */ i386_start_kernel(); } /* diff --git a/arch/x86/lguest/i386_head.S b/arch/x86/lguest/i386_head.S index a9c8cfe..27eac0f 100644 --- a/arch/x86/lguest/i386_head.S +++ b/arch/x86/lguest/i386_head.S @@ -5,7 +5,8 @@ #include <asm/thread_info.h> #include <asm/processor-flags.h> -/*G:020 Our story starts with the kernel booting into startup_32 in +/*G:020 + * Our story starts with the kernel booting into startup_32 in * arch/x86/kernel/head_32.S. It expects a boot header, which is created by * the bootloader (the Launcher in our case). * @@ -21,11 +22,14 @@ * data without remembering to subtract __PAGE_OFFSET! * * The .section line puts this code in .init.text so it will be discarded after - * boot. */ + * boot. + */ .section .init.text, "ax", @progbits ENTRY(lguest_entry) - /* We make the "initialization" hypercall now to tell the Host about - * us, and also find out where it put our page tables. */ + /* + * We make the "initialization" hypercall now to tell the Host about + * us, and also find out where it put our page tables. + */ movl $LHCALL_LGUEST_INIT, %eax movl $lguest_data - __PAGE_OFFSET, %ebx .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ @@ -33,13 +37,14 @@ ENTRY(lguest_entry) /* Set up the initial stack so we can run C code. */ movl $(init_thread_union+THREAD_SIZE),%esp - /* Jumps are relative, and we're running __PAGE_OFFSET too low at the - * moment. */ + /* Jumps are relative: we're running __PAGE_OFFSET too low. */ jmp lguest_init+__PAGE_OFFSET -/*G:055 We create a macro which puts the assembler code between lgstart_ and - * lgend_ markers. These templates are put in the .text section: they can't be - * discarded after boot as we may need to patch modules, too. */ +/*G:055 + * We create a macro which puts the assembler code between lgstart_ and lgend_ + * markers. These templates are put in the .text section: they can't be + * discarded after boot as we may need to patch modules, too. + */ .text #define LGUEST_PATCH(name, insns...) \ lgstart_##name: insns; lgend_##name:; \ @@ -48,83 +53,103 @@ ENTRY(lguest_entry) LGUEST_PATCH(cli, movl $0, lguest_data+LGUEST_DATA_irq_enabled) LGUEST_PATCH(pushf, movl lguest_data+LGUEST_DATA_irq_enabled, %eax) -/*G:033 But using those wrappers is inefficient (we'll see why that doesn't - * matter for save_fl and irq_disable later). If we write our routines - * carefully in assembler, we can avoid clobbering any registers and avoid - * jumping through the wrapper functions. +/*G:033 + * But using those wrappers is inefficient (we'll see why that doesn't matter + * for save_fl and irq_disable later). If we write our routines carefully in + * assembler, we can avoid clobbering any registers and avoid jumping through + * the wrapper functions. * * I skipped over our first piece of assembler, but this one is worth studying - * in a bit more detail so I'll describe in easy stages. First, the routine - * to enable interrupts: */ + * in a bit more detail so I'll describe in easy stages. First, the routine to + * enable interrupts: + */ ENTRY(lg_irq_enable) - /* The reverse of irq_disable, this sets lguest_data.irq_enabled to - * X86_EFLAGS_IF (ie. "Interrupts enabled"). */ + /* + * The reverse of irq_disable, this sets lguest_data.irq_enabled to + * X86_EFLAGS_IF (ie. "Interrupts enabled"). + */ movl $X86_EFLAGS_IF, lguest_data+LGUEST_DATA_irq_enabled - /* But now we need to check if the Host wants to know: there might have + /* + * But now we need to check if the Host wants to know: there might have * been interrupts waiting to be delivered, in which case it will have * set lguest_data.irq_pending to X86_EFLAGS_IF. If it's not zero, we - * jump to send_interrupts, otherwise we're done. */ + * jump to send_interrupts, otherwise we're done. + */ testl $0, lguest_data+LGUEST_DATA_irq_pending jnz send_interrupts - /* One cool thing about x86 is that you can do many things without using + /* + * One cool thing about x86 is that you can do many things without using * a register. In this case, the normal path hasn't needed to save or - * restore any registers at all! */ + * restore any registers at all! + */ ret send_interrupts: - /* OK, now we need a register: eax is used for the hypercall number, + /* + * OK, now we need a register: eax is used for the hypercall number, * which is LHCALL_SEND_INTERRUPTS. * * We used not to bother with this pending detection at all, which was * much simpler. Sooner or later the Host would realize it had to * send us an interrupt. But that turns out to make performance 7 * times worse on a simple tcp benchmark. So now we do this the hard - * way. */ + * way. + */ pushl %eax movl $LHCALL_SEND_INTERRUPTS, %eax - /* This is a vmcall instruction (same thing that KVM uses). Older + /* + * This is a vmcall instruction (same thing that KVM uses). Older * assembler versions might not know the "vmcall" instruction, so we - * create one manually here. */ + * create one manually here. + */ .byte 0x0f,0x01,0xc1 /* KVM_HYPERCALL */ + /* Put eax back the way we found it. */ popl %eax ret -/* Finally, the "popf" or "restore flags" routine. The %eax register holds the +/* + * Finally, the "popf" or "restore flags" routine. The %eax register holds the * flags (in practice, either X86_EFLAGS_IF or 0): if it's X86_EFLAGS_IF we're - * enabling interrupts again, if it's 0 we're leaving them off. */ + * enabling interrupts again, if it's 0 we're leaving them off. + */ ENTRY(lg_restore_fl) /* This is just "lguest_data.irq_enabled = flags;" */ movl %eax, lguest_data+LGUEST_DATA_irq_enabled - /* Now, if the %eax value has enabled interrupts and + /* + * Now, if the %eax value has enabled interrupts and * lguest_data.irq_pending is set, we want to tell the Host so it can * deliver any outstanding interrupts. Fortunately, both values will * be X86_EFLAGS_IF (ie. 512) in that case, and the "testl" * instruction will AND them together for us. If both are set, we - * jump to send_interrupts. */ + * jump to send_interrupts. + */ testl lguest_data+LGUEST_DATA_irq_pending, %eax jnz send_interrupts /* Again, the normal path has used no extra registers. Clever, huh? */ ret +/*:*/ /* These demark the EIP range where host should never deliver interrupts. */ .global lguest_noirq_start .global lguest_noirq_end -/*M:004 When the Host reflects a trap or injects an interrupt into the Guest, - * it sets the eflags interrupt bit on the stack based on - * lguest_data.irq_enabled, so the Guest iret logic does the right thing when - * restoring it. However, when the Host sets the Guest up for direct traps, - * such as system calls, the processor is the one to push eflags onto the - * stack, and the interrupt bit will be 1 (in reality, interrupts are always - * enabled in the Guest). +/*M:004 + * When the Host reflects a trap or injects an interrupt into the Guest, it + * sets the eflags interrupt bit on the stack based on lguest_data.irq_enabled, + * so the Guest iret logic does the right thing when restoring it. However, + * when the Host sets the Guest up for direct traps, such as system calls, the + * processor is the one to push eflags onto the stack, and the interrupt bit + * will be 1 (in reality, interrupts are always enabled in the Guest). * * This turns out to be harmless: the only trap which should happen under Linux * with interrupts disabled is Page Fault (due to our lazy mapping of vmalloc * regions), which has to be reflected through the Host anyway. If another * trap *does* go off when interrupts are disabled, the Guest will panic, and - * we'll never get to this iret! :*/ + * we'll never get to this iret! +:*/ -/*G:045 There is one final paravirt_op that the Guest implements, and glancing - * at it you can see why I left it to last. It's *cool*! It's in *assembler*! +/*G:045 + * There is one final paravirt_op that the Guest implements, and glancing at it + * you can see why I left it to last. It's *cool*! It's in *assembler*! * * The "iret" instruction is used to return from an interrupt or trap. The * stack looks like this: @@ -148,15 +173,18 @@ ENTRY(lg_restore_fl) * return to userspace or wherever. Our solution to this is to surround the * code with lguest_noirq_start: and lguest_noirq_end: labels. We tell the * Host that it is *never* to interrupt us there, even if interrupts seem to be - * enabled. */ + * enabled. + */ ENTRY(lguest_iret) pushl %eax movl 12(%esp), %eax lguest_noirq_start: - /* Note the %ss: segment prefix here. Normal data accesses use the + /* + * Note the %ss: segment prefix here. Normal data accesses use the * "ds" segment, but that will have already been restored for whatever * we're returning to (such as userspace): we can't trust it. The %ss: - * prefix makes sure we use the stack segment, which is still valid. */ + * prefix makes sure we use the stack segment, which is still valid. + */ movl %eax,%ss:lguest_data+LGUEST_DATA_irq_enabled popl %eax iret diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a6974e9..1e2cb84 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,6 +1,8 @@ -/*P:400 This contains run_guest() which actually calls into the Host<->Guest +/*P:400 + * This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. :*/ + * Host to do something. This file also contains useful helper routines. +:*/ #include <linux/module.h> #include <linux/stringify.h> #include <linux/stddef.h> @@ -24,7 +26,8 @@ static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); -/*H:010 We need to set up the Switcher at a high virtual address. Remember the +/*H:010 + * We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the * CPU to run the Guest, and then changes back to the Host when a trap or * interrupt happens. @@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock); * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. */ + * it's not a simple one-liner. + */ static __init int map_switcher(void) { int i, err; @@ -47,8 +51,10 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. */ + /* + * We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. + */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -56,8 +62,10 @@ static __init int map_switcher(void) goto out; } - /* Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. */ + /* + * Now we actually allocate the pages. The Guest will see these pages, + * so we make sure they're zeroed. + */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { unsigned long addr = get_zeroed_page(GFP_KERNEL); if (!addr) { @@ -67,19 +75,23 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } - /* First we check that the Switcher won't overlap the fixmap area at + /* + * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. */ + * very strange effects if it ever happened. + */ if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 + /* + * Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. */ + * allocates an extra guard page, so we need space for that. + */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); @@ -89,11 +101,13 @@ static __init int map_switcher(void) goto free_pages; } - /* This code actually sets up the pages we've allocated to appear at + /* + * This code actually sets up the pages we've allocated to appear at * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't - * care. */ + * care. + */ pagep = switcher_page; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { @@ -101,8 +115,10 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ + /* + * Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). + */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -124,8 +140,7 @@ out: } /*:*/ -/* Cleaning up the mapping when the module is unloaded is almost... - * too easy. */ +/* Cleaning up the mapping when the module is unloaded is almost... too easy. */ static void unmap_switcher(void) { unsigned int i; @@ -151,16 +166,19 @@ static void unmap_switcher(void) * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. */ + * positive by overflowing, too. + */ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len) { return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This routine copies memory from the Guest. Here we can see how useful the +/* + * This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. */ + * value (all zeroes) instead of needing to return an error. + */ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(cpu->lg, addr, bytes) @@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, } /*:*/ -/*H:030 Let's jump straight to the the main loop which runs the Guest. +/*H:030 + * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. */ + * going around and around until something interesting happens. + */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ @@ -195,10 +215,17 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. */ + /* + * It's possible the Guest did a NOTIFY hypercall to the + * Launcher. + */ if (cpu->pending_notify) { + /* + * Does it just needs to write to a registered + * eventfd (ie. the appropriate virtqueue thread)? + */ if (!send_notify_to_eventfd(cpu)) { + /* OK, we tell the main Laucher. */ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); @@ -209,29 +236,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* Check if there are any interrupts which can be delivered now: + /* + * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next - * run the Guest. */ + * run the Guest. + */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); - /* All long-lived kernel loops need to check with this horrible + /* + * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, - * it stops us. */ + * it stops us. + */ try_to_freeze(); - /* Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. */ + /* + * Just make absolutely sure the Guest is still alive. One of + * those hypercalls could have been fatal, for example. + */ if (cpu->lg->dead) break; - /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + /* + * If the Guest asked to be stopped, we sleep. The Guest's + * clock timer will wake us. + */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ + /* + * Just before we sleep, make sure no interrupt snuck in + * which we should be doing. + */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else @@ -239,8 +276,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) continue; } - /* OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: */ + /* + * OK, now we're ready to jump into the Guest. First we put up + * the "Do Not Disturb" sign: + */ local_irq_disable(); /* Actually run the Guest until something happens. */ @@ -327,8 +366,10 @@ static void __exit fini(void) } /*:*/ -/* The Host side of lguest can be a module. This is a nice way for people to - * play with it. */ +/* + * The Host side of lguest can be a module. This is a nice way for people to + * play with it. + */ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index c29ffa1..83511eb 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -1,8 +1,10 @@ -/*P:500 Just as userspace programs request kernel operations through a system +/*P:500 + * Just as userspace programs request kernel operations through a system * call, the Guest requests Host operations through a "hypercall". You might * notice this nomenclature doesn't really follow any logic, but the name has * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. :*/ + * code is basically a one big switch statement. +:*/ /* Copyright (C) 2006 Rusty Russell IBM Corporation @@ -28,30 +30,41 @@ #include <asm/pgtable.h> #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ +/*H:120 + * This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. + */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: - /* This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. */ + /* + * This call does nothing, except by breaking out of the Guest + * it makes us process all the asynchronous hypercalls. + */ break; case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ + /* + * This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. + */ break; case LHCALL_LGUEST_INIT: - /* You can't get here unless you're already initialized. Don't - * do that. */ + /* + * You can't get here unless you're already initialized. Don't + * do that. + */ kill_guest(cpu, "already have lguest_data"); break; case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four - * lines right here. */ char msg[128]; - /* If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. */ + /* + * Shutdown is such a trivial hypercall that we do it in five + * lines right here. + * + * If the lgread fails, it will call kill_guest() itself; the + * kill_guest() with the message will be ignored. + */ __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(cpu, "CRASH: %s", msg); @@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) break; } case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the - * argument: */ + /* FLUSH_TLB comes in two flavors, depending on the argument: */ if (args->arg1) guest_pagetable_clear_all(cpu); else guest_pagetable_flush_user(cpu); break; - /* All these calls simply pass the arguments through to the right - * routines. */ + /* + * All these calls simply pass the arguments through to the right + * routines. + */ case LHCALL_NEW_PGTABLE: guest_new_pagetable(cpu, args->arg1); break; @@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } -/*:*/ -/*H:124 Asynchronous hypercalls are easy: we just look in the array in the +/*H:124 + * Asynchronous hypercalls are easy: we just look in the array in the * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). */ + * checking for a normal hcall). + */ static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; @@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { struct hcall_args args; - /* We remember where we were up to from last time. This makes + /* + * We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest - * places them in the ring. */ + * places them in the ring. + */ unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) break; - /* OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. */ + /* + * OK, we have hypercall. Increment the "next_hcall" cursor, + * and wrap back to 0 if we reach the end. + */ if (++cpu->next_hcall == LHCALL_RING_SIZE) cpu->next_hcall = 0; - /* Copy the hypercall arguments into a local copy of - * the hcall_args struct. */ + /* + * Copy the hypercall arguments into a local copy of the + * hcall_args struct. + */ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { kill_guest(cpu, "Fetching async hypercalls"); @@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu) break; } - /* Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. */ + /* + * Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. + */ if (cpu->pending_notify) break; } } -/* Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: */ +/* + * Last of all, we look at what happens first of all. The very first time the + * Guest makes a hypercall, we end up here to set things up: + */ static void initialize(struct lg_cpu *cpu) { - /* You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. */ + /* + * You can't do anything until you're initialized. The Guest knows the + * rules, so we're unforgiving here. + */ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; @@ -185,32 +212,44 @@ static void initialize(struct lg_cpu *cpu) if (lguest_arch_init_hypercalls(cpu)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". */ + /* + * The Guest tells us where we're not to deliver interrupts by putting + * the range of addresses into "struct lguest_data". + */ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* We write the current time into the Guest's data page once so it can - * set its clock. */ + /* + * We write the current time into the Guest's data page once so it can + * set its clock. + */ write_timestamp(cpu); /* page_tables.c will also do some setup. */ page_table_guest_data_init(cpu); - /* This is the one case where the above accesses might have been the + /* + * This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest - * pagetable. */ + * pagetable. + */ guest_pagetable_clear_all(cpu); } /*:*/ -/*M:013 If a Guest reads from a page (so creates a mapping) that it has never +/*M:013 + * If a Guest reads from a page (so creates a mapping) that it has never * written to, and then the Launcher writes to it (ie. the output of a virtual * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. :*/ + * a similar scenario might one day bite us, so it's worth mentioning. + * + * Note that if we used a shared anonymous mapping in the Launcher instead of + * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we + * need that to switch the Launcher to processes (away from threads) anyway. +:*/ /*H:100 * Hypercalls @@ -229,17 +268,22 @@ void do_hypercalls(struct lg_cpu *cpu) return; } - /* The Guest has initialized. + /* + * The Guest has initialized. * - * Look in the hypercall ring for the async hypercalls: */ + * Look in the hypercall ring for the async hypercalls: + */ do_async_hcalls(cpu); - /* If we stopped reading the hypercall ring because the Guest did a + /* + * If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. */ + * the hypercall. + */ if (!cpu->pending_notify) { do_hcall(cpu, cpu->hcall); - /* Tricky point: we reset the hcall pointer to mark the + /* + * Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. * Normally it doesn't matter: the Guest will run again and @@ -248,13 +292,16 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. */ + * hypercall. Finding that bug sucked. + */ cpu->hcall = NULL; } } -/* This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. */ +/* + * This routine supplies the Guest with time: it's used for wallclock time at + * initial boot and as a rough time source if the TSC isn't available. + */ void write_timestamp(struct lg_cpu *cpu) { struct timespec now; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b..1864818 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -1,4 +1,5 @@ -/*P:800 Interrupts (traps) are complicated enough to earn their own file. +/*P:800 + * Interrupts (traps) are complicated enough to earn their own file. * There are three classes of interrupts: * * 1) Real hardware interrupts which occur while we're running the Guest, @@ -10,7 +11,8 @@ * just like real hardware would deliver them. Traps from the Guest can be set * up to go directly back into the Guest, but sometimes the Host wants to see * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. :*/ + * they had been delivered to it directly. +:*/ #include <linux/uaccess.h> #include <linux/interrupt.h> #include <linux/module.h> @@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); } -/* The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. */ +/* + * The "type" of the interrupt handler is a 4 bit field: we only support a + * couple of types. + */ static int idt_type(u32 lo, u32 hi) { return (hi >> 8) & 0xF; @@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) return (hi & 0x8000); } -/* We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. */ +/* + * We need a helper to "push" a value onto the Guest's stack, since that's a + * big part of what delivering an interrupt does. + */ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ @@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) lgwrite(cpu, *gstack, u32, val); } -/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or +/*H:210 + * The set_guest_interrupt() routine actually delivers the interrupt or * trap. The mechanics of delivering traps and interrupts to the Guest are the * same, except some traps have an "error code" which gets pushed onto the * stack as well: the caller tells us if this is one. @@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo - * it). */ + * it). + */ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, bool has_err) { @@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, u32 eflags, ss, irq_enable; unsigned long virtstack; - /* There are two cases for interrupts: one where the Guest is already + /* + * There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. */ + * userspace. We check the privilege level to find out. + */ if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment */ + /* + * The Guest told us their kernel stack with the SET_STACK + * hypercall: both the virtual address and the segment. + */ virtstack = cpu->esp1; ss = cpu->ss1; origstack = gstack = guest_pa(cpu, virtstack); - /* We push the old stack segment and pointer onto the new + /* + * We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege - * levels and expect these here. */ + * levels and expect these here. + */ push_guest_stack(cpu, &gstack, cpu->regs->ss); push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { @@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, origstack = gstack = guest_pa(cpu, virtstack); } - /* Remember that we never let the Guest actually disable interrupts, so + /* + * Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". */ + * copy it back in "lguest_iret". + */ eflags = cpu->regs->eflags; if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; - /* An interrupt is expected to push three things on the stack: the old + /* + * An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction - * pointer. */ + * pointer. + */ push_guest_stack(cpu, &gstack, eflags); push_guest_stack(cpu, &gstack, cpu->regs->cs); push_guest_stack(cpu, &gstack, cpu->regs->eip); @@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. */ + /* + * Now we've pushed all the old state, we change the stack, the code + * segment and the address to execute. + */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); - /* There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. */ + /* + * There are two kinds of interrupt handlers: 0xE is an "interrupt + * gate" which expects interrupts to be disabled on entry. + */ if (idt_type(lo, hi) == 0xE) if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) kill_guest(cpu, "Disabling interrupts"); @@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ + * we go to sleep when the Guest has halted itself. + */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; @@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) if (!cpu->lg->lguest_data) return LGUEST_IRQS; - /* Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". */ + /* + * Take our "irqs_pending" array and remove any interrupts the Guest + * wants blocked: the result ends up in "blk". + */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return LGUEST_IRQS; @@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) return irq; } -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ +/* + * This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). + */ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; BUG_ON(irq >= LGUEST_IRQS); - /* They may be in the middle of an iret, where they asked us never to - * deliver interrupts. */ + /* + * They may be in the middle of an iret, where they asked us never to + * deliver interrupts. + */ if (cpu->regs->eip >= cpu->lg->noirq_start && (cpu->regs->eip < cpu->lg->noirq_end)) return; @@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) } } - /* Look at the IDT entry the Guest gave us for this interrupt. The + /* + * Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. */ + * over them. + */ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); - /* set_guest_interrupt() takes the interrupt descriptor and a + /* + * set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. */ + * the stack as well: virtual interrupts never do. + */ set_guest_interrupt(cpu, idt->a, idt->b, false); } - /* Every time we deliver an interrupt, we update the timestamp in the + /* + * Every time we deliver an interrupt, we update the timestamp in the * Guest's lguest_data struct. It would be better for the Guest if we * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every - * timer interrupt. */ + * timer interrupt. + */ write_timestamp(cpu); - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ + /* + * If there are no other interrupts we want to deliver, clear + * the pending flag. + */ if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } @@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) /* And this is the routine when we want to set an interrupt for the Guest. */ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) { - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_bit(irq, cpu->irqs_pending); - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ + /* + * Make sure it sees it; it might be asleep (eg. halted), or running + * the Guest right now, in which case kick_process() will knock it out. + */ if (!wake_up_process(cpu->tsk)) kick_process(cpu->tsk); } /*:*/ -/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent +/* + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent * me a patch, so we support that too. It'd be a big step for lguest if half * the Plan 9 user base were to start using it. * * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. */ + * userbase. Oh well. + */ static bool could_be_syscall(unsigned int num) { /* Normal Linux SYSCALL_VECTOR or reserved vector? */ @@ -274,9 +316,11 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps like +/*H:220 + * Now we've got the routines to deliver interrupts, delivering traps like * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: */ + * should have error codes: + */ static bool has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); @@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) /* deliver_trap() returns true if it could deliver the trap. */ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) { - /* Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. */ + /* + * Trap numbers are always 8 bit, but we set an impossible trap number + * for traps inside the Switcher, so check that here. + */ if (num >= ARRAY_SIZE(cpu->arch.idt)) return false; - /* Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. */ + /* + * Early on the Guest hasn't set the IDT entries (or maybe it put a + * bogus one in): if we fail here, the Guest will be killed. + */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; set_guest_interrupt(cpu, cpu->arch.idt[num].a, @@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) return true; } -/*H:250 Here's the hard part: returning to the Host every time a trap happens +/*H:250 + * Here's the hard part: returning to the Host every time a trap happens * and then calling deliver_trap() and re-entering the Guest is slow. * Particularly because Guest userspace system calls are traps (usually trap * 128). @@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * the other hypervisors would beat it up at lunchtime. * * This routine indicates if a particular trap number could be delivered - * directly. */ + * directly. + */ static bool direct_trap(unsigned int num) { - /* Hardware interrupts don't go to the Guest at all (except system - * call). */ + /* + * Hardware interrupts don't go to the Guest at all (except system + * call). + */ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return false; - /* The Host needs to see page faults (for shadow paging and to save the + /* + * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. */ + * and of course, the hypercall trap. + */ return num != 14 && num != 13 && num != 7 && num != 6 && num != LGUEST_TRAP_ENTRY; } /*:*/ -/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, +/*M:005 + * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. */ + * within noirq_start and noirq_end, where it can safely disable interrupts. + */ -/*M:006 The Guests do not use the sysenter (fast system call) instruction, +/*M:006 + * The Guests do not use the sysenter (fast system call) instruction, * because it's hardcoded to enter privilege level 0 and so can't go direct. * It's about twice as fast as the older "int 0x80" system call, so it might * still be worthwhile to handle it in the Switcher and lcall down to the * Guest. The sysenter semantics are hairy tho: search for that keyword in - * entry.S :*/ + * entry.S +:*/ -/*H:260 When we make traps go directly into the Guest, we need to make sure +/*H:260 + * When we make traps go directly into the Guest, we need to make sure * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the * CPU trying to deliver the trap will fault while trying to push the interrupt * words on the stack: this is called a double fault, and it forces us to kill * the Guest. * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. + */ void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. */ + /* + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or + * two pages of stack space. + */ for (i = 0; i < cpu->lg->stack_pages; i++) - /* The stack grows *upwards*, so the address we're given is the + /* + * The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. */ + * get to the rest of the stack pages. + */ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } -/* Direct traps also mean that we need to know whenever the Guest wants to use +/* + * Direct traps also mean that we need to know whenever the Guest wants to use * a different kernel stack, so we can change the IDT entries to use that * stack. The IDT entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. */ + * change stacks on each context switch. + */ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { - /* You are not allowed have a stack segment with privilege level 0: bad - * Guest! */ + /* + * You're not allowed a stack segment with privilege level 0: bad Guest! + */ if ((seg & 0x3) != GUEST_PL) kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ @@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) pin_stack_pages(cpu); } -/* All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. */ +/* + * All this reference to mapping stacks leads us neatly into the other complex + * part of the Host: page table handling. + */ -/*H:235 This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": */ +/*H:235 + * This is the routine which actually checks the Guest's IDT entry and + * transfers it into the entry in "struct lguest": + */ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { @@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, if (type != 0xE && type != 0xF) kill_guest(cpu, "bad IDT type %i", type); - /* We only copy the handler address, present bit, privilege level and + /* + * We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. */ + * except for system calls which userspace can use. + */ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); trap->b = (hi&0xFFFFEF00); } -/*H:230 While we're here, dealing with delivering traps and interrupts to the +/*H:230 + * While we're here, dealing with delivering traps and interrupts to the * Guest, we might as well complete the picture: how the Guest tells us where * it wants them to go. This would be simple, except making traps fast * requires some tricks. * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. + */ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { - /* Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. */ + /* + * Guest never handles: NMI, doublefault, spurious interrupt or + * hypercall. We ignore when it tries to set them. + */ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) return; - /* Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. */ + /* + * Mark the IDT as changed: next time the Guest runs we'll know we have + * to copy this again. + */ cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ @@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } -/* The default entry for each interrupt points into the Switcher routines which +/* + * The default entry for each interrupt points into the Switcher routines which * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. */ + * deliver_trap() to bounce it back into the Guest. + */ static void default_idt_entry(struct desc_struct *idt, int trap, const unsigned long handler, @@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, /* A present interrupt gate. */ u32 flags = 0x8e00; - /* Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. */ + /* + * Set the privilege level on the entry for the hypercall: this allows + * the Guest to use the "int" instruction to trigger it. + */ if (trap == LGUEST_TRAP_ENTRY) flags |= (GUEST_PL << 13); else if (base) - /* Copy priv. level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. */ + /* + * Copy privilege level from what Guest asked for. This allows + * debug (int 3) traps from Guest userspace, for example. + */ flags |= (base->b & 0x6000); /* Now pack it into the IDT entry in its weird format. */ @@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, default_idt_entry(&state->guest_idt[i], i, def[i], NULL); } -/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead +/*H:240 + * We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. */ + * before we run the Guest. This routine does that copy. + */ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; - /* We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. */ + /* + * We can simply copy the direct traps, otherwise we use the default + * ones in the Switcher: they will return to the Host. + */ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { const struct desc_struct *gidt = &cpu->arch.idt[i]; @@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, if (!direct_trap(i)) continue; - /* Only trap gates (type 15) can go direct to the Guest. + /* + * Only trap gates (type 15) can go direct to the Guest. * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. * * If it can't go direct, we still need to copy the priv. level: * they might want to give userspace access to a software - * interrupt. */ + * interrupt. + */ if (idt_type(gidt->a, gidt->b) == 0xF) idt[i] = *gidt; else @@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * the next timer interrupt (in nanoseconds). We use the high-resolution timer * infrastructure to set a callback at that time. * - * 0 means "turn off the clock". */ + * 0 means "turn off the clock". + */ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; @@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) return; } - /* We use wallclock time here, so the Guest might not be running for + /* + * We use wallclock time here, so the Guest might not be running for * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. */ + * is almost always the right thing to do. + */ expires = ktime_add_ns(ktime_get_real(), delta); hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 01c5919..bc28745 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -16,15 +16,13 @@ void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); -struct pgdir -{ +struct pgdir { unsigned long gpgdir; pgd_t *pgdir; }; /* We have two pages shared with guests, per cpu. */ -struct lguest_pages -{ +struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; struct lguest_regs regs; @@ -54,13 +52,13 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - /* At end of a page shared mapped over lguest_pages in guest. */ + /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_pages *last_pages; - int cpu_pgd; /* which pgd this cpu is currently using */ + int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; @@ -89,15 +87,17 @@ struct lg_eventfd_map { }; /* The private info the thread maintains about the guest. */ -struct lguest -{ +struct lguest { struct lguest_data __user *lguest_data; struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ + + /* + * This provides the offset to the base of guest-physical memory in the + * Launcher. + */ void __user *mem_base; unsigned long kernel_address; @@ -122,11 +122,13 @@ bool lguest_address_ok(const struct lguest *lg, void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -/*H:035 Using memory-copy operations like that is usually inconvient, so we +/*H:035 + * Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * - * This reads into a variable of the given type then returns that. */ + * This reads into a variable of the given type then returns that. + */ #define lgread(cpu, addr, type) \ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) @@ -140,9 +142,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); int run_guest(struct lg_cpu *cpu, unsigned long __user *user); -/* Helper macros to obtain the first 12 or the last 20 bits, this is only the +/* + * Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. */ + * in the kernel. + */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e082cda..b6200bc 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,12 @@ -/*P:050 Lguest guests use a very simple method to describe devices. It's a +/*P:050 + * Lguest guests use a very simple method to describe devices. It's a * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. +:*/ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/lguest_launcher.h> @@ -20,8 +22,10 @@ /* The pointer to our (page) of device descriptions. */ static void *lguest_devices; -/* For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ +/* + * For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. + */ static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) { return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); @@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr) iounmap((__force void __iomem *)addr); } -/*D:100 Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. */ +/*D:100 + * Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. + */ struct lguest_device { struct virtio_device vdev; @@ -41,9 +47,11 @@ struct lguest_device { struct lguest_device_desc *desc; }; -/* Since the virtio infrastructure hands us a pointer to the virtio_device all +/* + * Since the virtio infrastructure hands us a pointer to the virtio_device all * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. */ + * lguest_device it's enclosed in. + */ #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) /*D:130 @@ -55,7 +63,8 @@ struct lguest_device { * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. */ + * immediately after the descriptor. + */ static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) { return (void *)(desc + 1); @@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } -/* The virtio core takes the features the Host offers, and copies the - * ones supported by the driver into the vdev->features array. Once - * that's all sorted out, this routine is called so we can tell the - * Host which features we understand and accept. */ +/* + * The virtio core takes the features the Host offers, and copies the ones + * supported by the driver into the vdev->features array. Once that's all + * sorted out, this routine is called so we can tell the Host which features we + * understand and accept. + */ static void lg_finalize_features(struct virtio_device *vdev) { unsigned int i, bits; @@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* The vdev->feature array is a Linux bitmask: this isn't the - * same as a the simple array of bits used by lguest devices - * for features. So we do this slow, manual conversion which is - * completely general. */ + /* + * The vdev->feature array is a Linux bitmask: this isn't the same as a + * the simple array of bits used by lguest devices for features. So we + * do this slow, manual conversion which is completely general. + */ memset(out_features, 0, desc->feature_len); bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; for (i = 0; i < bits; i++) { @@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset, memcpy(lg_config(desc) + offset, buf, len); } -/* The operations to get and set the status word just access the status field - * of the device descriptor. */ +/* + * The operations to get and set the status word just access the status field + * of the device descriptor. + */ static u8 lg_get_status(struct virtio_device *vdev) { return to_lgdev(vdev)->desc->status; } -/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". */ +/* + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the + * descriptor address of the device. A zero status means "reset". + */ static void set_status(struct virtio_device *vdev, u8 status) { unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; @@ -191,8 +207,7 @@ static void lg_reset(struct virtio_device *vdev) */ /*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info -{ +struct lguest_vq_info { /* A copy of the information contained in the device config. */ struct lguest_vqconfig config; @@ -200,13 +215,17 @@ struct lguest_vq_info void *pages; }; -/* When the virtio_ring code wants to prod the Host, it calls us here and we +/* + * When the virtio_ring code wants to prod the Host, it calls us here and we * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. */ + * knows which virtqueue we're talking about. + */ static void lg_notify(struct virtqueue *vq) { - /* We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. */ + /* + * We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. + */ struct lguest_vq_info *lvq = vq->priv; kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); @@ -215,7 +234,8 @@ static void lg_notify(struct virtqueue *vq) /* An extern declaration inside a C file is bad form. Don't do it. */ extern void lguest_setup_irq(unsigned int irq); -/* This routine finds the first virtqueue described in the configuration of +/* + * This routine finds the Nth virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -223,9 +243,7 @@ extern void lguest_setup_irq(unsigned int irq); * everyone wants to do it differently. The KVM coders want the Guest to * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. - * - * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ + */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), @@ -244,9 +262,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, if (!lvq) return ERR_PTR(-ENOMEM); - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + /* + * Make a copy of the "struct lguest_vqconfig" entry, which sits after * the descriptor. We need a copy because the config space might not - * be aligned correctly. */ + * be aligned correctly. + */ memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); printk("Mapping virtqueue %i addr %lx\n", index, @@ -261,8 +281,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto free_lvq; } - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. */ + /* + * OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. + */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, lvq->pages, lg_notify, callback, name); if (!vq) { @@ -273,18 +295,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* Make sure the interrupt is allocated. */ lguest_setup_irq(lvq->config.irq); - /* Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. */ - /* FIXME: We used to have a flag for the Host to tell us we could use + /* + * Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. + * + * FIXME: We used to have a flag for the Host to tell us we could use * the interrupt as a source of randomness: it'd be nice to have that - * back.. */ + * back. + */ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) goto destroy_vring; - /* Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. */ + /* + * Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. + */ vq->priv = lvq; return vq; @@ -358,11 +385,14 @@ static struct virtio_config_ops lguest_config_ops = { .del_vqs = lg_del_vqs, }; -/* The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +/* + * The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. + */ static struct device *lguest_root; -/*D:120 This is the core of the lguest bus: actually adding a new device. +/*D:120 + * This is the core of the lguest bus: actually adding a new device. * It's a separate function because it's neater that way, and because an * earlier version of the code supported hotplug and unplug. They were removed * early on because they were never used. @@ -371,14 +401,14 @@ static struct device *lguest_root; * * It's worth reading this carefully: we start with a pointer to the new device * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. */ + * descriptor page so we can uniquely identify it if things go badly wrong. + */ static void add_lguest_device(struct lguest_device_desc *d, unsigned int offset) { struct lguest_device *ldev; - /* Start with zeroed memory; Linux's device layer seems to count on - * it. */ + /* Start with zeroed memory; Linux's device layer counts on it. */ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) { printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", @@ -388,17 +418,25 @@ static void add_lguest_device(struct lguest_device_desc *d, /* This devices' parent is the lguest/ dir. */ ldev->vdev.dev.parent = lguest_root; - /* We have a unique device index thanks to the dev_index counter. */ + /* + * The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. + */ ldev->vdev.id.device = d->type; - /* We have a simple set of routines for querying the device's - * configuration information and setting its status. */ + /* + * We have a simple set of routines for querying the device's + * configuration information and setting its status. + */ ldev->vdev.config = &lguest_config_ops; /* And we remember the device's descriptor for lguest_config_ops. */ ldev->desc = d; - /* register_virtio_device() sets up the generic fields for the struct + /* + * register_virtio_device() sets up the generic fields for the struct * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. */ + * infrastructure look for a matching driver. + */ if (register_virtio_device(&ldev->vdev) != 0) { printk(KERN_ERR "Failed to register lguest dev %u type %u\n", offset, d->type); @@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d, } } -/*D:110 scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". */ +/*D:110 + * scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". + */ static void scan_devices(void) { unsigned int i; @@ -426,7 +466,8 @@ static void scan_devices(void) } } -/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the +/*D:105 + * Fairly early in boot, lguest_devices_init() is called to set up the * lguest device infrastructure. We check that we are a Guest by checking * pv_info.name: there are other ways of checking, but this seems most * obvious to me. @@ -437,7 +478,8 @@ static void scan_devices(void) * correct sysfs incantation). * * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. */ + * lguest_devices page. + */ static int __init lguest_devices_init(void) { if (strcmp(pv_info.name, "lguest") != 0) @@ -456,11 +498,13 @@ static int __init lguest_devices_init(void) /* We do this after core stuff, but before the drivers. */ postcore_initcall(lguest_devices_init); -/*D:150 At this point in the journey we used to now wade through the lguest +/*D:150 + * At this point in the journey we used to now wade through the lguest * devices themselves: net, block and console. Since they're all now virtio * devices rather than lguest-specific, I've decided to ignore them. Mostly, * they're kind of boring. But this does mean you'll never experience the * thrill of reading the forbidden love scene buried deep in the block driver. * * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". */ + * come from?", and "What do you do when someone asks for optimization?". + */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 9f9a295..b4d3f7c 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,9 @@ /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout, pagetable, entry point and kernel address - * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. :*/ + * tell us the Guest's memory layout and entry point. A read will run the + * Guest until something happens, such as a signal or the Guest doing a NOTIFY + * out to the Launcher. +:*/ #include <linux/uaccess.h> #include <linux/miscdevice.h> #include <linux/fs.h> @@ -11,14 +12,41 @@ #include <linux/file.h> #include "lg.h" +/*L:056 + * Before we move on, let's jump ahead and look at what the kernel does when + * it needs to look up the eventfds. That will complete our picture of how we + * use RCU. + * + * The notification value is in cpu->pending_notify: we return true if it went + * to an eventfd. + */ bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; struct lg_eventfd_map *map; - /* lg->eventfds is RCU-protected */ + /* + * This "rcu_read_lock()" helps track when someone is still looking at + * the (RCU-using) eventfds array. It's not actually a lock at all; + * indeed it's a noop in many configurations. (You didn't expect me to + * explain all the RCU secrets here, did you?) + */ rcu_read_lock(); + /* + * rcu_dereference is the counter-side of rcu_assign_pointer(); it + * makes sure we don't access the memory pointed to by + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, + * but Alpha allows this! Paul McKenney points out that a really + * aggressive compiler could have the same effect: + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html + * + * So play safe, use rcu_dereference to get the rcu-protected pointer: + */ map = rcu_dereference(cpu->lg->eventfds); + /* + * Simple array search: even if they add an eventfd while we do this, + * we'll continue to use the old array and just won't see the new one. + */ for (i = 0; i < map->num; i++) { if (map->map[i].addr == cpu->pending_notify) { eventfd_signal(map->map[i].event, 1); @@ -26,19 +54,50 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) break; } } + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ rcu_read_unlock(); + + /* If we cleared the notification, it's because we found a match. */ return cpu->pending_notify == 0; } +/*L:055 + * One of the more tricksy tricks in the Linux Kernel is a technique called + * Read Copy Update. Since one point of lguest is to teach lguest journeyers + * about kernel coding, I use it here. (In case you're curious, other purposes + * include learning about virtualization and instilling a deep appreciation for + * simplicity and puppies). + * + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we + * add new eventfds without ever blocking readers from accessing the array. + * The current Launcher only does this during boot, so that never happens. But + * Read Copy Update is cool, and adding a lock risks damaging even more puppies + * than this code does. + * + * We allocate a brand new one-larger array, copy the old one and add our new + * element. Then we make the lg eventfd pointer point to the new array. + * That's the easy part: now we need to free the old one, but we need to make + * sure no slow CPU somewhere is still looking at it. That's what + * synchronize_rcu does for us: waits until every CPU has indicated that it has + * moved on to know it's no longer using the old one. + * + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. + */ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) { struct lg_eventfd_map *new, *old = lg->eventfds; + /* + * We don't allow notifications on value 0 anyway (pending_notify of + * 0 means "nothing pending"). + */ if (!addr) return -EINVAL; - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ + /* + * Replace the old array with the new one, carefully: others can + * be accessing it at the same time. + */ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), GFP_KERNEL); if (!new) @@ -52,22 +111,41 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) new->map[new->num].addr = addr; new->map[new->num].event = eventfd_ctx_fdget(fd); if (IS_ERR(new->map[new->num].event)) { + int err = PTR_ERR(new->map[new->num].event); kfree(new); - return PTR_ERR(new->map[new->num].event); + return err; } new->num++; - /* Now put new one in place. */ + /* + * Now put new one in place: rcu_assign_pointer() is a fancy way of + * doing "lg->eventfds = new", but it uses memory barriers to make + * absolutely sure that the contents of "new" written above is nailed + * down before we actually do the assignment. + * + * We have to think about these kinds of things when we're operating on + * live data without locks. + */ rcu_assign_pointer(lg->eventfds, new); - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ + /* + * We're not in a big hurry. Wait until noone's looking at old + * version, then free it. + */ synchronize_rcu(); kfree(old); return 0; } +/*L:052 + * Receiving notifications from the Guest is usually done by attaching a + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will + * become readable when the Guest does an LHCALL_NOTIFY with that value. + * + * This is really convenient for processing each virtqueue in a separate + * thread. + */ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) { unsigned long addr, fd; @@ -79,15 +157,22 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) if (get_user(fd, input) != 0) return -EFAULT; + /* + * Just make sure two callers don't add eventfds at once. We really + * only need to lock against callers adding to the same Guest, so using + * the Big Lguest Lock is overkill. But this is setup, not a fast path. + */ mutex_lock(&lguest_lock); err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); - return 0; + return err; } -/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. */ +/*L:050 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt + * number to /dev/lguest. + */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -97,12 +182,18 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) if (irq >= LGUEST_IRQS) return -EINVAL; + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_interrupt(cpu, irq); return 0; } -/*L:040 Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. */ +/*L:040 + * Once our Guest is initialized, the Launcher makes it run by reading + * from /dev/lguest. + */ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; @@ -138,8 +229,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent I/O, - * clear the flag. */ + /* + * If we returned from read() last time because the Guest sent I/O, + * clear the flag. + */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -147,8 +240,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } -/*L:025 This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. */ +/*L:025 + * This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. + */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { /* We have a limited number the number of CPUs in the lguest struct. */ @@ -163,8 +258,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* Each CPU has a timer it can set. */ init_clockdev(cpu); - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ + /* + * We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. + */ cpu->regs_page = get_zeroed_page(GFP_KERNEL); if (!cpu->regs_page) return -ENOMEM; @@ -172,29 +269,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* We actually put the registers at the bottom of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - /* Now we initialize the Guest's registers, handing it the start - * address. */ + /* + * Now we initialize the Guest's registers, handing it the start + * address. + */ lguest_arch_setup_regs(cpu, start_ip); - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). */ + /* + * We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (eg. console input). + */ cpu->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if + /* + * We need to keep a pointer to the Launcher's memory map, because if * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ + * reference, it is destroyed before close() is called. + */ cpu->mm = get_task_mm(cpu->tsk); - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ + /* + * We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. + */ cpu->last_pages = NULL; /* No error == success. */ return 0; } -/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) - * values (in addition to the LHREQ_INITIALIZE value). These are: +/*L:020 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in + * addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. * @@ -206,14 +312,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) */ static int initialize(struct file *file, const unsigned long __user *input) { - /* "struct lguest" contains everything we (the Host) know about a - * Guest. */ + /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; unsigned long args[3]; - /* We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. */ + /* + * We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. + */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -248,8 +355,10 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_eventfds; - /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. */ + /* + * Initialize the Guest's shadow page tables, using the toplevel + * address the Launcher gave us. This allocates memory, so can fail. + */ err = init_guest_pagetable(lg); if (err) goto free_regs; @@ -274,20 +383,24 @@ unlock: return err; } -/*L:010 The first operation the Launcher does must be a write. All writes +/*L:010 + * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. + * writes of other values to send interrupts or set up receipt of notifications. * * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0, since we only + * CPU number we're dealing with. Currently this is always 0 since we only * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. */ + * here. + */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. */ + /* + * Once the Guest is initialized, we hold the "struct lguest" in the + * file private data. + */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; @@ -322,13 +435,15 @@ static ssize_t write(struct file *file, const char __user *in, } } -/*L:060 The final piece of interface code is the close() routine. It reverses +/*L:060 + * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the * Launcher exited. * * Note that the close routine returns 0 or a negative error number: it can't * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. :*/ + * letting them do it. +:*/ static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; @@ -338,8 +453,10 @@ static int close(struct inode *inode, struct file *file) if (!lg) return 0; - /* We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. */ + /* + * We need the big lock, to protect from inter-guest I/O and other + * Launchers initializing guests. + */ mutex_lock(&lguest_lock); /* Free up the shadow page tables for the Guest. */ @@ -350,8 +467,10 @@ static int close(struct inode *inode, struct file *file) hrtimer_cancel(&lg->cpus[i].hrt); /* We can free up the register page we allocated. */ free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ + /* + * Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. + */ mmput(lg->cpus[i].mm); } @@ -360,8 +479,10 @@ static int close(struct inode *inode, struct file *file) eventfd_ctx_put(lg->eventfds->map[i].event); kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). */ + /* + * If lg->dead doesn't contain an error code it will be NULL or a + * kmalloc()ed string, either of which is ok to hand to kfree(). + */ if (!IS_ERR(lg->dead)) kfree(lg->dead); /* Free the memory allocated to the lguest_struct */ @@ -385,7 +506,8 @@ static int close(struct inode *inode, struct file *file) * * We begin our understanding with the Host kernel interface which the Launcher * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: */ + * work happens in the read(), write() and close() routines: + */ static struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, @@ -393,8 +515,10 @@ static struct file_operations lguest_fops = { .read = read, }; -/* This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). */ +/* + * This is a textbook example of a "misc" character device. Populate a "struct + * miscdevice" and register it with misc_register(). + */ static struct miscdevice lguest_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "lguest", diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a6fe1ab..a8d0aee 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1,9 +1,11 @@ -/*P:700 The pagetable code, on the other hand, still shows the scars of +/*P:700 + * The pagetable code, on the other hand, still shows the scars of * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. :*/ + * converted Guest pages when running the Guest. +:*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -17,18 +19,20 @@ #include <asm/bootparam.h> #include "lg.h" -/*M:008 We hold reference to pages, which prevents them from being swapped. +/*M:008 + * We hold reference to pages, which prevents them from being swapped. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. :*/ + * could probably consider launching Guests as non-root. +:*/ /*H:300 * The Page Table Code * - * We use two-level page tables for the Guest. If you're not entirely - * comfortable with virtual addresses, physical addresses and page tables then - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with - * diagrams!). + * We use two-level page tables for the Guest, or three-level with PAE. If + * you're not entirely comfortable with virtual addresses, physical addresses + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page + * Table Handling" (with diagrams!). * * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are @@ -45,16 +49,18 @@ * (v) Flushing (throwing away) page tables, * (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. - :*/ +:*/ - -/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is - * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. */ +/* + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) + * or 512 PTE entries with PAE (2MB). + */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ +/* + * For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. + */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U @@ -64,14 +70,18 @@ #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* We actually need a separate PTE page for each CPU. Remember that after the +/* + * We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. */ + * CPU's guest to see the pages of any other CPU. + */ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -/*H:320 The page table code is curly enough to need helper functions to keep it - * clear and clean. +/*H:320 + * The page table code is curly enough to need helper functions to keep it + * clear and clean. The kernel itself provides many of them; one advantage + * of insisting that the Guest and Host use the same CONFIG_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). */ + * usually the current one). + */ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); @@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) } #ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the +/* + * This routine then takes the PGD entry given above, which contains the * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ + * given address. + */ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); @@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } #endif -/* This routine then takes the page directory entry returned above, which +/* + * This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. */ + * pointer to the PTE entry for the given address. + */ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { #ifdef CONFIG_X86_PAE @@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) return &page[pte_index(vaddr)]; } -/* These two functions just like the above two, except they access the Guest - * page tables. Hence they return a Guest address. */ +/* + * These functions are just like the above two, except they access the Guest + * page tables. Hence they return a Guest address. + */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); @@ -148,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) } #ifdef CONFIG_X86_PAE +/* Follow the PGD to the PMD. */ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; @@ -155,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) return gpage + pmd_index(vaddr) * sizeof(pmd_t); } +/* Follow the PMD to the PTE. */ static unsigned long gpte_addr(struct lg_cpu *cpu, pmd_t gpmd, unsigned long vaddr) { @@ -164,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, return gpage + pte_index(vaddr) * sizeof(pte_t); } #else +/* Follow the PGD to the PTE (no mid-level for !PAE). */ static unsigned long gpte_addr(struct lg_cpu *cpu, pgd_t gpgd, unsigned long vaddr) { @@ -175,17 +195,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). :*/ +/*M:014 + * get_pfn is slow: we could probably try to grab batches of pages here as + * an optimization (ie. pre-faulting). +:*/ -/*H:350 This routine takes a page number given by the Guest and converts it to +/*H:350 + * This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * * This holds a reference to the page, so release_pte() is careful to put that - * back. */ + * back. + */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -198,33 +222,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) return -1UL; } -/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table +/*H:340 + * Converting a Guest page table entry to a shadow (ie. real) page table * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page - * number. */ + * number. + */ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; - /* The Guest sets the global flag, because it thinks that it is using + /* + * The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. */ + * use the global bit, so throw it away. + */ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - /* We need a temporary "unsigned long" variable to hold the answer from + /* + * We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. */ + * page, given the virtual number. + */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* When we destroy the Guest, we'll go through the shadow page + /* + * When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think - * this one is valid! */ + * this one is valid! + */ flags = 0; } /* Now we assemble our shadow PTE from the page number and flags. */ @@ -234,8 +266,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) /*H:460 And to complete the chain, release_pte() looks like this: */ static void release_pte(pte_t pte) { - /* Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. */ + /* + * Remember that get_user_pages_fast() took a reference to the page, in + * get_pfn()? We have to put it back now. + */ if (pte_flags(pte) & _PAGE_PRESENT) put_page(pte_page(pte)); } @@ -273,7 +307,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. */ + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; @@ -282,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; + /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; @@ -298,22 +334,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. */ check_gpgd(cpu, gpgd); - /* And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ + /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; @@ -324,8 +364,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; @@ -334,27 +376,37 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif + + /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; - /* Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). */ + /* + * Check they're not trying to write to a page the Guest wants + * read-only (bit 2 of errcode == write). + */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; @@ -362,8 +414,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; - /* Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). */ + /* + * Check that the Guest PTE flags are OK, and the page number is below + * the pfn_limit (ie. not mapping the Launcher binary). + */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ @@ -373,29 +427,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); - /* If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. */ + + /* + * If there was a valid shadow PTE entry here before, we release it. + * This can happen with a write to a previously read-only entry. + */ release_pte(*spte); - /* If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). */ + /* + * If this is a write, we insist that the Guest page is writable (the + * final arg to gpte_to_spte()). + */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else - /* If this is a read, don't set the "writable" bit in the page + /* + * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. */ + * we can update the Guest's _PAGE_DIRTY flag. + */ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - /* Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ + /* + * Finally, we write the Guest PTE entry back: we've set the + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. + */ lgwrite(cpu, gpte_ptr, pte_t, gpte); - /* The fault is fixed, the page table is populated, the mapping + /* + * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. */ + * has that a page fault occurred at all. + */ return true; } @@ -408,7 +473,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * mapped, so it's overkill. * * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? */ + * mapped by the shadow page tables, and is it writable? + */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; @@ -428,21 +494,26 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) return false; #endif - /* Check the flags on the pte entry itself: it must be present and - * writable. */ + /* + * Check the flags on the pte entry itself: it must be present and + * writable. + */ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -/* So, when pin_stack_pages() asks us to pin a page, we check if it's already +/* + * So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). */ + * (meaning "write"). + */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) kill_guest(cpu, "bad stack page %#lx", vaddr); } +/*:*/ #ifdef CONFIG_X86_PAE static void release_pmd(pmd_t *spmd) @@ -479,15 +550,21 @@ static void release_pgd(pgd_t *spgd) } #else /* !CONFIG_X86_PAE */ -/*H:450 If we chase down the release_pgd() code, it looks like this: */ +/*H:450 + * If we chase down the release_pgd() code, the non-PAE version looks like + * this. The PAE version is almost identical, but instead of calling + * release_pte it calls release_pmd(), which looks much like this. + */ static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; - /* Converting the pfn to find the actual PTE page is easy: turn + /* + * Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). */ + * virtual address (easy for kernel pages like this one). + */ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -499,9 +576,12 @@ static void release_pgd(pgd_t *spgd) } } #endif -/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() + +/*H:445 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ + * It simply releases every PTE page from 0 up to the Guest's kernel address. + */ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; @@ -510,10 +590,12 @@ static void flush_user_mappings(struct lguest *lg, int idx) release_pgd(lg->pgdirs[idx].pgdir + i); } -/*H:440 (v) Flushing (throwing away) page tables, +/*H:440 + * (v) Flushing (throwing away) page tables, * * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. */ + * large number of mappings have been changed. + */ void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ @@ -551,9 +633,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } -/* We keep several page tables. This is a simple routine to find the page +/* + * We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given - * us. */ + * us. + */ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; @@ -563,9 +647,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) return i; } -/*H:435 And this is us, creating the new page directory. If we really do +/*H:435 + * And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. */ + * blank_pgdir. + */ static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) @@ -575,8 +661,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, pmd_t *pmd_table; #endif - /* We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. */ + /* + * We pick one entry at random to throw out. Choosing the Least + * Recently Used might be better, but this is easy. + */ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { @@ -587,8 +675,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, next = cpu->cpu_pgd; else { #ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ + /* + * In PAE mode, allocate a pmd page and populate the + * last pgd entry. + */ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); @@ -598,8 +688,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + /* + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! + */ *blank_pgdir = 1; } #else @@ -615,19 +707,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 (iv) Switching page tables +/*H:430 + * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. */ + * pgdir). This occurs on almost every context switch. + */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); - /* If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. */ + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ @@ -637,9 +733,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) pin_stack_pages(cpu); } -/*H:470 Finally, a routine which throws away everything: all PGD entries in all +/*H:470 + * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. */ + * when we destroy the Guest. + */ static void release_all_pagetables(struct lguest *lg) { unsigned int i, j; @@ -656,8 +754,10 @@ static void release_all_pagetables(struct lguest *lg) spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ + /* + * And release the pmd entries of that pmd page, + * except for the switcher pmd. + */ for (k = 0; k < SWITCHER_PMD_INDEX; k++) release_pmd(&pmdpage[k]); #endif @@ -667,10 +767,12 @@ static void release_all_pagetables(struct lguest *lg) } } -/* We also throw away everything when a Guest tells us it's changed a kernel +/* + * We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. */ + * everything faults back in, but it's rare. + */ void guest_pagetable_clear_all(struct lg_cpu *cpu) { release_all_pagetables(cpu->lg); @@ -678,15 +780,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) pin_stack_pages(cpu); } /*:*/ -/*M:009 Since we throw away all mappings when a kernel mapping changes, our + +/*M:009 + * Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. :*/ + * don't know, but the term "puissant code-fu" comes to mind. +:*/ -/*H:420 This is the routine which actually sets the page table entry for then +/*H:420 + * This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. * * Normally, we can just throw out the old entry and replace it with 0: if they @@ -715,31 +821,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, spmd = spmd_addr(cpu, *spgd, vaddr); if (pmd_flags(*spmd) & _PAGE_PRESENT) { #endif - /* Otherwise, we start by releasing - * the existing entry. */ + /* Otherwise, start by releasing the existing entry. */ pte_t *spte = spte_addr(cpu, *spgd, vaddr); release_pte(*spte); - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ + /* + * If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us in + * now. This shaves 10% off a copy-on-write + * micro-benchmark. + */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); native_set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ + } else { + /* + * Otherwise kill it and we can demand_page() + * it in later. + */ native_set_pte(spte, __pte(0)); + } #ifdef CONFIG_X86_PAE } #endif } } -/*H:410 Updating a PTE entry is a little trickier. +/*H:410 + * Updating a PTE entry is a little trickier. * * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have @@ -748,12 +859,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. */ + * the kernel mappings. This speeds up context switch immensely. + */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. */ + /* + * Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. + */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -795,19 +909,25 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } + #ifdef CONFIG_X86_PAE +/* For setting a mid-level, we just throw everything away. It's easy. */ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) { guest_pagetable_clear_all(&lg->cpus[0]); } #endif -/* Once we know how much memory we have we can construct simple identity - * (which set virtual == physical) and linear mappings - * which will get the Guest far enough into the boot to create its own. +/*H:505 + * To get through boot, we construct simple identity page mappings (which + * set virtual == physical) and linear mappings which will get the Guest far + * enough into the boot to create its own. The linear mapping means we + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, + * as you'll see. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ + * know its size here). + */ static unsigned long setup_pagetables(struct lguest *lg, unsigned long mem, unsigned long initrd_size) @@ -825,8 +945,10 @@ static unsigned long setup_pagetables(struct lguest *lg, unsigned int phys_linear; #endif - /* We have mapped_pages frames to map, so we need - * linear_pages page tables to map them. */ + /* + * We have mapped_pages frames to map, so we need linear_pages page + * tables to map them. + */ mapped_pages = mem / PAGE_SIZE; linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; @@ -837,10 +959,16 @@ static unsigned long setup_pagetables(struct lguest *lg, linear = (void *)pgdir - linear_pages * PAGE_SIZE; #ifdef CONFIG_X86_PAE + /* + * And the single mid page goes below that. We only use one, but + * that's enough to map 1G, which definitely gets us through boot. + */ pmds = (void *)linear - PAGE_SIZE; #endif - /* Linear mapping is easy: put every page's address into the - * mapping in order. */ + /* + * Linear mapping is easy: put every page's address into the + * mapping in order. + */ for (i = 0; i < mapped_pages; i++) { pte_t pte; pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); @@ -848,11 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } - /* The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. */ #ifdef CONFIG_X86_PAE + /* + * Make the Guest PMD entries point to the corresponding place in the + * linear mapping (up to one page worth of PMD). + */ for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); @@ -860,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } + /* One PGD entry, pointing to that PMD page. */ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; + /* + * And the third PGD entry (ie. addresses 3G-4G). + * + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + */ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) return -EFAULT; #else + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; + /* + * Create a PGD entry which points to the right part of the + * linear PTE pages. + */ pgd = __pgd((phys_linear + i * sizeof(pte_t)) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + /* + * Copy it into the PGD page at 0 and PAGE_OFFSET. + */ if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + i / PTRS_PER_PTE], @@ -880,15 +1029,19 @@ static unsigned long setup_pagetables(struct lguest *lg, } #endif - /* We return the top level (guest-physical) address: remember where - * this is. */ + /* + * We return the top level (guest-physical) address: we remember where + * this is to write it into lguest_data when the Guest initializes. + */ return (unsigned long)pgdir - mem_base; } -/*H:500 (vii) Setting up the page tables initially. +/*H:500 + * (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: */ + * its first page table is. We set some things up here: + */ int init_guest_pagetable(struct lguest *lg) { u64 mem; @@ -898,21 +1051,27 @@ int init_guest_pagetable(struct lguest *lg) pgd_t *pgd; pmd_t *pmd_table; #endif - /* Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). */ + /* + * Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). + */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) || get_user(initrd_size, &boot->hdr.ramdisk_size)) return -EFAULT; - /* We start on the first shadow page table, and give it a blank PGD - * page. */ + /* + * We start on the first shadow page table, and give it a blank PGD + * page. + */ lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) return lg->pgdirs[0].gpgdir; lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; + #ifdef CONFIG_X86_PAE + /* For PAE, we also create the initial mid-level. */ pgd = lg->pgdirs[0].pgdir; pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); if (!pmd_table) @@ -921,27 +1080,33 @@ int init_guest_pagetable(struct lguest *lg) set_pgd(pgd + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); #endif + + /* This is the current page table. */ lg->cpus[0].cpu_pgd = 0; return 0; } -/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. */ + /* + * We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. + */ || put_user(RESERVE_MEM * 1024 * 1024, &cpu->lg->lguest_data->reserve_mem) || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* In flush_user_mappings() we loop from 0 to + /* + * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. */ + * Switcher mappings, so check that now. + */ #ifdef CONFIG_X86_PAE if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) @@ -964,12 +1129,14 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 (vi) Mapping the Switcher when the Guest is about to run. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. */ + * Guest is about to run on this CPU. + */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); @@ -980,30 +1147,38 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pmd_t switcher_pmd; pmd_t *pmd_table; + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + /* Figure out where the pmd page is, by reading the PGD, and converting + * it to a virtual address. */ pmd_table = __va(pgd_pfn(cpu->lg-> pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); + /* Now write it into the shadow page table. */ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). */ + /* + * Make the last PGD entry for this Guest point to the Switcher's PTE + * page for this CPU (with appropriate flags). + */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; #endif - /* We also change the Switcher PTE page. When we're running the Guest, + /* + * We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out - * again. */ + * again. + */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], @@ -1019,10 +1194,12 @@ static void free_switcher_pte_pages(void) free_page((long)switcher_pte_page(i)); } -/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given +/*H:520 + * Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * - * Currently the Switcher is less than a page long, so "pages" is always 1. */ + * Currently the Switcher is less than a page long, so "pages" is always 1. + */ static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages) @@ -1043,13 +1220,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu, native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); - /* The second page contains the "struct lguest_ro_state", and is - * read-only. */ + /* + * The second page contains the "struct lguest_ro_state", and is + * read-only. + */ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } -/* We've made it through the page table code. Perhaps our tired brains are +/* + * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * * If nothing else, note that all this complexity in juggling shadow page tables @@ -1058,10 +1238,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. * - * There is just one file remaining in the Host. */ + * There is just one file remaining in the Host. + */ -/*H:510 At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. */ +/*H:510 + * At boot or module load time, init_pagetables() allocates and populates + * the Switcher PTE page for each CPU. + */ __init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 482ed5a..951c57b 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -1,4 +1,5 @@ -/*P:600 The x86 architecture has segments, which involve a table of descriptors +/*P:600 + * The x86 architecture has segments, which involve a table of descriptors * which can be used to do funky things with virtual address interpretation. * We originally used to use segments so the Guest couldn't alter the * Guest<->Host Switcher, and then we had to trim Guest segments, and restore @@ -8,7 +9,8 @@ * * In these modern times, the segment handling code consists of simple sanity * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. :*/ + * from frolicking through its parklike serenity. +:*/ #include "lg.h" /*H:600 @@ -41,10 +43,12 @@ * begin. */ -/* There are several entries we don't let the Guest set. The TSS entry is the +/* + * There are several entries we don't let the Guest set. The TSS entry is the * "Task State Segment" which controls all kinds of delicate things. The * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. */ + * the Guest can't be trusted to deal with double faults. + */ static bool ignored_gdt(unsigned int num) { return (num == GDT_ENTRY_TSS @@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num) || num == GDT_ENTRY_DOUBLEFAULT_TSS); } -/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We +/*H:630 + * Once the Guest gave us new GDT entries, we fix them up a little. We * don't care if they're invalid: the worst that can happen is a General * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". */ + * mess: the message will be "unhandled trap 256". + */ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; for (i = start; i < end; i++) { - /* We never copy these ones to real GDT, so we don't care what - * they say */ + /* + * We never copy these ones to real GDT, so we don't care what + * they say + */ if (ignored_gdt(i)) continue; - /* Segment descriptors contain a privilege level: the Guest is + /* + * Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. */ + * running at privilege level 1. If so, we fix it here. + */ if ((cpu->arch.gdt[i].b & 0x00006000) == 0) cpu->arch.gdt[i].b |= (GUEST_PL << 13); - /* Each descriptor has an "accessed" bit. If we don't set it + /* + * Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. */ + * writable by the Guest, so bad things can happen. + */ cpu->arch.gdt[i].b |= 0x00000100; } } -/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep +/*H:610 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep * a GDT for each CPU, and copy across the Guest's entries each time we want to * run the Guest on that CPU. * * This routine is called at boot or modprobe time for each CPU to set up the * constant GDT entries: the ones which are the same no matter what Guest we're - * running. */ + * running. + */ void setup_default_gdt_entries(struct lguest_ro_state *state) { struct desc_struct *gdt = state->guest_gdt; @@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - /* The TSS segment refers to the TSS entry for this particular CPU. + /* + * The TSS segment refers to the TSS entry for this particular CPU. * Forgive the magic flags: the 0x8900 means the entry is Present, it's * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ + * means Saturn is eclipsed by Mercury in the twelfth house. + */ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0x000000FF); } -/* This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). */ +/* + * This routine sets up the initial Guest GDT for booting. All entries start + * as 0 (unusable). + */ void setup_guest_gdt(struct lg_cpu *cpu) { - /* Start with full 0-4G segments... */ + /* + * Start with full 0-4G segments...except the Guest is allowed to use + * them, so set the privilege level appropriately in the flags. + */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - /* ...except the Guest is allowed to use them, so set the privilege - * level appropriately in the flags. */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } -/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. */ +/*H:650 + * An optimization of copy_gdt(), for just the three "thead-local storage" + * entries. + */ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) gdt[i] = cpu->arch.gdt[i]; } -/*H:640 When the Guest is run on a different CPU, or the GDT entries have - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this - * CPU's GDT. */ +/*H:640 + * When the Guest is run on a different CPU, or the GDT entries have changed, + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's + * GDT. + */ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; - /* The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above. */ + /* + * The default entries from setup_default_gdt_entries() are not + * replaced. See ignored_gdt() above. + */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) gdt[i] = cpu->arch.gdt[i]; } -/*H:620 This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ +/*H:620 + * This is where the Guest asks us to load a new GDT entry + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. + */ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { - /* We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ + /* + * We assume the Guest has the same number of GDT entries as the + * Host, otherwise we'd have to dynamically allocate the Guest GDT. + */ if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); @@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) cpu->arch.gdt[num].a = lo; cpu->arch.gdt[num].b = hi; fixup_gdt_table(cpu, num, num+1); - /* Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. */ + /* + * Mark that the GDT changed so the core knows it has to copy it again, + * even if the Guest is run on the same CPU. + */ cpu->changed |= CHANGED_GDT; } -/* This is the fast-track version for just changing the three TLS entries. +/* + * This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? */ + * both cases? + */ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; @@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) /* Note that just the TLS entries have changed. */ cpu->changed |= CHANGED_GDT_TLS; } -/*:*/ /*H:660 * With this, we have finished the Host. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722f..6ae3888 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). * * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include <linux/kernel.h> #include <linux/start_kernel.h> #include <linux/string.h> @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. + */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,97 +135,126 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. + */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * - * We could also try using this hooks for PGE, but that might be too expensive. + * We could also try using these hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. + */ switch (insn & 0xFE) { case 0xE4: /* in <next byte>,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). + */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. + */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. @@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. * * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. + */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 3fc1531..40634b0 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,12 +1,15 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as - * it can be made, but it's naturally very specific to x86. +/*P:900 + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride + * both the Host and Guest to do the low-level Guest<->Host switch. It is as + * simple as it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". :*/ + * can be found in "make Guest". + :*/ -/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must +/*M:012 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must * gain at least 1% more performance. Since neither LOC nor performance can be * measured beforehand, it generally means implementing a feature then deciding * if it's worth it. And once it's implemented, who can say no? @@ -31,11 +34,14 @@ * Host (which is actually really easy). * * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? :*/ + * And who would write the verse documenting it? +:*/ -/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their +/*M:011 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. :*/ + * Host when a Guest is running. +:*/ /*S:100 * Welcome to the Switcher itself! diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index bcec78f..248e00e 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,8 +52,10 @@ struct virtio_pci_device char (*msix_names)[256]; /* Number of available vectors */ unsigned msix_vectors; - /* Vectors allocated */ + /* Vectors allocated, excluding per-vq vectors if any */ unsigned msix_used_vectors; + /* Whether we have vector per vq */ + bool per_vq_vectors; }; /* Constants for MSI-X */ @@ -258,7 +260,6 @@ static void vp_free_vectors(struct virtio_device *vdev) for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(vp_dev->msix_entries[i].vector, vp_dev); - vp_dev->msix_used_vectors = 0; if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ @@ -267,80 +268,77 @@ static void vp_free_vectors(struct virtio_device *vdev) /* Flush the write out to device */ ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - vp_dev->msix_enabled = 0; pci_disable_msix(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + vp_dev->msix_vectors = 0; } -} -static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int *options, int noptions) -{ - int i; - for (i = 0; i < noptions; ++i) - if (!pci_enable_msix(dev, entries, options[i])) - return options[i]; - return -EBUSY; + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_entries); + vp_dev->msix_entries = NULL; } -static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) +static int vp_request_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned i, v; int err = -ENOMEM; - /* We want at most one vector per queue and one for config changes. - * Fallback to separate vectors for config and a shared for queues. - * Finally fall back to regular interrupts. */ - int options[] = { max_vqs + 1, 2 }; - int nvectors = max(options[0], options[1]); + + if (!nvectors) { + /* Can't allocate MSI-X vectors, use regular interrupt */ + vp_dev->msix_vectors = 0; + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, name, vp_dev); + if (err) + return err; + vp_dev->intx_enabled = 1; + return 0; + } vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); if (!vp_dev->msix_entries) - goto error_entries; + goto error; vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, GFP_KERNEL); if (!vp_dev->msix_names) - goto error_names; + goto error; for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; - err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, - options, ARRAY_SIZE(options)); - if (err < 0) { - /* Can't allocate enough MSI-X vectors, use regular interrupt */ - vp_dev->msix_vectors = 0; - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, - IRQF_SHARED, name, vp_dev); - if (err) - goto error_irq; - vp_dev->intx_enabled = 1; - } else { - vp_dev->msix_vectors = err; - vp_dev->msix_enabled = 1; - - /* Set the vector used for configuration */ - v = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, - "%s-config", name); - err = request_irq(vp_dev->msix_entries[v].vector, - vp_config_changed, 0, vp_dev->msix_names[v], - vp_dev); - if (err) - goto error_irq; - ++vp_dev->msix_used_vectors; - - iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto error_irq; - } + err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); + if (err > 0) + err = -ENOSPC; + if (err) + goto error; + vp_dev->msix_vectors = nvectors; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; } - if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { + if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, @@ -349,28 +347,25 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) - goto error_irq; + goto error; ++vp_dev->msix_used_vectors; } return 0; -error_irq: +error: vp_free_vectors(vdev); - kfree(vp_dev->msix_names); -error_names: - kfree(vp_dev->msix_entries); -error_entries: return err; } static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, + u16 vector) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq; unsigned long flags, size; - u16 num, vector; + u16 num; int err; /* Select the queue we're interested in */ @@ -389,7 +384,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; - info->vector = VIRTIO_MSI_NO_VECTOR; + info->vector = vector; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -413,22 +408,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; - /* allocate per-vq vector if available and necessary */ - if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { - vector = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, - "%s-%s", dev_name(&vp_dev->vdev.dev), name); - err = request_irq(vp_dev->msix_entries[vector].vector, - vring_interrupt, 0, - vp_dev->msix_names[vector], vq); - if (err) - goto out_request_irq; - info->vector = vector; - ++vp_dev->msix_used_vectors; - } else - vector = VP_MSIX_VQ_VECTOR; - - if (callback && vp_dev->msix_enabled) { + if (vector != VIRTIO_MSI_NO_VECTOR) { iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); if (vector == VIRTIO_MSI_NO_VECTOR) { @@ -444,11 +424,6 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, return vq; out_assign: - if (info->vector != VIRTIO_MSI_NO_VECTOR) { - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - --vp_dev->msix_used_vectors; - } -out_request_irq: vring_del_virtqueue(vq); out_activate_queue: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); @@ -462,12 +437,13 @@ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; - unsigned long size; + unsigned long flags, size; - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); - if (info->vector != VIRTIO_MSI_NO_VECTOR) - free_irq(vp_dev->msix_entries[info->vector].vector, vq); + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); if (vp_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, @@ -489,36 +465,62 @@ static void vp_del_vq(struct virtqueue *vq) /* the config->del_vqs() implementation */ static void vp_del_vqs(struct virtio_device *vdev) { + struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq, *n; + struct virtio_pci_vq_info *info; - list_for_each_entry_safe(vq, n, &vdev->vqs, list) + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + info = vq->priv; + if (vp_dev->per_vq_vectors) + free_irq(vp_dev->msix_entries[info->vector].vector, vq); vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; vp_free_vectors(vdev); } -/* the config->find_vqs() implementation */ -static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) +static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[], + int nvectors, + bool per_vq_vectors) { - int vectors = 0; - int i, err; - - /* How many vectors would we like? */ - for (i = 0; i < nvqs; ++i) - if (callbacks[i]) - ++vectors; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 vector; + int i, err, allocated_vectors; - err = vp_request_vectors(vdev, vectors); + err = vp_request_vectors(vdev, nvectors, per_vq_vectors); if (err) goto error_request; + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) + if (!callbacks[i] || !vp_dev->msix_enabled) + vector = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + vector = allocated_vectors++; + else + vector = VP_MSIX_VQ_VECTOR; + vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); goto error_find; + } + /* allocate per-vq irq if available and necessary */ + if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) { + snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, + "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(vp_dev->msix_entries[vector].vector, + vring_interrupt, 0, + vp_dev->msix_names[vector], vqs[i]); + if (err) { + vp_del_vq(vqs[i]); + goto error_find; + } + } } return 0; @@ -526,7 +528,37 @@ error_find: vp_del_vqs(vdev); error_request: - return PTR_ERR(vqs[i]); + return err; +} + +/* the config->find_vqs() implementation */ +static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + int vectors = 0; + int i, uninitialized_var(err); + + /* How many vectors would we like? */ + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++vectors; + + /* We want at most one vector per queue and one for config changes. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + vectors + 1, true); + if (!err) + return 0; + /* Fallback to separate vectors for config and a shared for queues. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 2, false); + if (!err) + return 0; + /* Finally fall back to regular interrupts. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 0, false); + return err; } static struct virtio_config_ops virtio_pci_config_ops = { diff --git a/include/linux/lguest.h b/include/linux/lguest.h index dbf2479..2fb1dcb 100644 --- a/include/linux/lguest.h +++ b/include/linux/lguest.h @@ -1,5 +1,7 @@ -/* Things the lguest guest needs to know. Note: like all lguest interfaces, - * this is subject to wild and random change between versions. */ +/* + * Things the lguest guest needs to know. Note: like all lguest interfaces, + * this is subject to wild and random change between versions. + */ #ifndef _LINUX_LGUEST_H #define _LINUX_LGUEST_H @@ -11,32 +13,41 @@ #define LG_CLOCK_MIN_DELTA 100UL #define LG_CLOCK_MAX_DELTA ULONG_MAX -/*G:031 The second method of communicating with the Host is to via "struct +/*G:031 + * The second method of communicating with the Host is to via "struct * lguest_data". Once the Guest's initialization hypercall tells the Host where - * this is, the Guest and Host both publish information in it. :*/ -struct lguest_data -{ - /* 512 == enabled (same as eflags in normal hardware). The Guest - * changes interrupts so often that a hypercall is too slow. */ + * this is, the Guest and Host both publish information in it. +:*/ +struct lguest_data { + /* + * 512 == enabled (same as eflags in normal hardware). The Guest + * changes interrupts so often that a hypercall is too slow. + */ unsigned int irq_enabled; /* Fine-grained interrupt disabling by the Guest */ DECLARE_BITMAP(blocked_interrupts, LGUEST_IRQS); - /* The Host writes the virtual address of the last page fault here, + /* + * The Host writes the virtual address of the last page fault here, * which saves the Guest a hypercall. CR2 is the native register where - * this address would normally be found. */ + * this address would normally be found. + */ unsigned long cr2; /* Wallclock time set by the Host. */ struct timespec time; - /* Interrupt pending set by the Host. The Guest should do a hypercall - * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). */ + /* + * Interrupt pending set by the Host. The Guest should do a hypercall + * if it re-enables interrupts and sees this set (to X86_EFLAGS_IF). + */ int irq_pending; - /* Async hypercall ring. Instead of directly making hypercalls, we can + /* + * Async hypercall ring. Instead of directly making hypercalls, we can * place them in here for processing the next time the Host wants. - * This batching can be quite efficient. */ + * This batching can be quite efficient. + */ /* 0xFF == done (set by Host), 0 == pending (set by Guest). */ u8 hcall_status[LHCALL_RING_SIZE]; diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bfefbdf..495203f 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -29,8 +29,10 @@ struct lguest_device_desc { __u8 type; /* The number of virtqueues (first in config array) */ __u8 num_vq; - /* The number of bytes of feature bits. Multiply by 2: one for host - * features and one for Guest acknowledgements. */ + /* + * The number of bytes of feature bits. Multiply by 2: one for host + * features and one for Guest acknowledgements. + */ __u8 feature_len; /* The number of bytes of the config array after virtqueues. */ __u8 config_len; @@ -39,8 +41,10 @@ struct lguest_device_desc { __u8 config[0]; }; -/*D:135 This is how we expect the device configuration field for a virtqueue - * to be laid out in config space. */ +/*D:135 + * This is how we expect the device configuration field for a virtqueue + * to be laid out in config space. + */ struct lguest_vqconfig { /* The number of entries in the virtio_ring */ __u16 num; @@ -61,7 +65,9 @@ enum lguest_req LHREQ_EVENTFD, /* + address, fd. */ }; -/* The alignment to use between consumer and producer parts of vring. - * x86 pagesize for historical reasons. */ +/* + * The alignment to use between consumer and producer parts of vring. + * x86 pagesize for historical reasons. + */ #define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index be7d255..8dab9f2 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -20,8 +20,7 @@ #define VIRTIO_BLK_ID_BYTES (sizeof(__u16[256])) /* IDENTIFY DATA */ -struct virtio_blk_config -{ +struct virtio_blk_config { /* The capacity (in 512-byte sectors). */ __u64 capacity; /* The maximum segment size (if VIRTIO_BLK_F_SIZE_MAX) */ @@ -50,8 +49,7 @@ struct virtio_blk_config #define VIRTIO_BLK_T_BARRIER 0x80000000 /* This is the first element of the read scatter-gather list. */ -struct virtio_blk_outhdr -{ +struct virtio_blk_outhdr { /* VIRTIO_BLK_T* */ __u32 type; /* io priority. */ diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 99f5145..e547e3c 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -79,8 +79,7 @@ * the dev->feature bits if it wants. */ typedef void vq_callback_t(struct virtqueue *); -struct virtio_config_ops -{ +struct virtio_config_ops { void (*get)(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len); void (*set)(struct virtio_device *vdev, unsigned offset, diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 9c543d6..d8dd539 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -31,8 +31,7 @@ #define VIRTIO_NET_S_LINK_UP 1 /* Link is up */ -struct virtio_net_config -{ +struct virtio_net_config { /* The config defining mac address (if VIRTIO_NET_F_MAC) */ __u8 mac[6]; /* See VIRTIO_NET_F_STATUS and VIRTIO_NET_S_* above */ @@ -41,8 +40,7 @@ struct virtio_net_config /* This is the first element of the scatter-gather list. If you don't * specify GSO or CSUM features, you can simply ignore the header. */ -struct virtio_net_hdr -{ +struct virtio_net_hdr { #define VIRTIO_NET_HDR_F_NEEDS_CSUM 1 // Use csum_start, csum_offset __u8 flags; #define VIRTIO_NET_HDR_GSO_NONE 0 // Not a GSO frame diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 693e0ec..e4d144b 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -30,8 +30,7 @@ #define VIRTIO_RING_F_INDIRECT_DESC 28 /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ -struct vring_desc -{ +struct vring_desc { /* Address (guest-physical). */ __u64 addr; /* Length. */ @@ -42,24 +41,21 @@ struct vring_desc __u16 next; }; -struct vring_avail -{ +struct vring_avail { __u16 flags; __u16 idx; __u16 ring[]; }; /* u32 is used here for ids for padding reasons. */ -struct vring_used_elem -{ +struct vring_used_elem { /* Index of start of used descriptor chain. */ __u32 id; /* Total length of the descriptor chain which was used (written to) */ __u32 len; }; -struct vring_used -{ +struct vring_used { __u16 flags; __u16 idx; struct vring_used_elem ring[]; |