summaryrefslogtreecommitdiffstats
path: root/drivers/lguest
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@linux-foundation.org> 2015-02-18 09:24:01 -0800
committer Linus Torvalds <torvalds@linux-foundation.org> 2015-02-18 09:24:01 -0800
commit 53861af9a17022898619a2ae4ead0dfc601b7c13 (patch)
tree dc11088d9e86fa1d8d8479974864153a8f976897 /drivers/lguest
parent 5c2770079fb9b8c5bfb7113d9e76de66e77a0e24 (diff)
parent 5b40a7daf51812b35cf05d1601a779a7043f8414 (diff)
download op-kernel-dev-53861af9a17022898619a2ae4ead0dfc601b7c13.zip
op-kernel-dev-53861af9a17022898619a2ae4ead0dfc601b7c13.tar.gz
Merge tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux
Pull virtio updates from Rusty Russell: "OK, this has the big virtio 1.0 implementation, as specified by OASIS. On top of that is the major rework of lguest, to use PCI and virtio 1.0, to double-check the implementation. Then comes the inevitable fixes and cleanups from that work" * tag 'virtio-next-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/rusty/linux: (80 commits) virtio: don't set VIRTIO_CONFIG_S_DRIVER_OK twice. virtio_net: unconditionally define struct virtio_net_hdr_v1. tools/lguest: don't use legacy definitions for net device in example launcher. virtio: Don't expose legacy net features when VIRTIO_NET_NO_LEGACY defined. tools/lguest: use common error macros in the example launcher. tools/lguest: give virtqueues names for better error messages tools/lguest: more documentation and checking of virtio 1.0 compliance. lguest: don't look in console features to find emerg_wr. tools/lguest: don't start devices until DRIVER_OK status set. tools/lguest: handle indirect partway through chain. tools/lguest: insert driver references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: insert device references from the 1.0 spec (4.1 Virtio Over PCI) tools/lguest: rename virtio_pci_cfg_cap field to match spec. tools/lguest: fix features_accepted logic in example launcher. tools/lguest: handle device reset correctly in example launcher. virtual: Documentation: simplify and generalize paravirt_ops.txt lguest: remove NOTIFY call and eventfd facility. lguest: remove NOTIFY facility from demonstration launcher. lguest: use the PCI console device's emerg_wr for early boot messages. lguest: always put console in PCI slot #1. ...
Diffstat (limited to 'drivers/lguest')
-rw-r--r-- drivers/lguest/Makefile 3
-rw-r--r-- drivers/lguest/core.c 29
-rw-r--r-- drivers/lguest/hypercalls.c 7
-rw-r--r-- drivers/lguest/lg.h 26
-rw-r--r-- drivers/lguest/lguest_device.c 540
-rw-r--r-- drivers/lguest/lguest_user.c 221
-rw-r--r-- drivers/lguest/page_tables.c 75
-rw-r--r-- drivers/lguest/x86/core.c 198
8 files changed, 252 insertions, 847 deletions
diff --git a/drivers/lguest/Makefile b/drivers/lguest/Makefile
index c419750..16f52ee 100644
--- a/drivers/lguest/Makefile
+++ b/drivers/lguest/Makefile
@@ -1,6 +1,3 @@
-# Guest requires the device configuration and probing code.
-obj-$(CONFIG_LGUEST_GUEST) += lguest_device.o
-
# Host requires the other files, which can be a module.
obj-$(CONFIG_LGUEST) += lg.o
lg-y = core.o hypercalls.o page_tables.o interrupts_and_traps.o \
diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c
index 6590558..7dc93aa 100644
--- a/drivers/lguest/core.c
+++ b/drivers/lguest/core.c
@@ -208,6 +208,14 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b,
*/
int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
{
+ /* If the launcher asked for a register with LHREQ_GETREG */
+ if (cpu->reg_read) {
+ if (put_user(*cpu->reg_read, user))
+ return -EFAULT;
+ cpu->reg_read = NULL;
+ return sizeof(*cpu->reg_read);
+ }
+
/* We stop running once the Guest is dead. */
while (!cpu->lg->dead) {
unsigned int irq;
@@ -217,21 +225,12 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user)
if (cpu->hcall)
do_hypercalls(cpu);
- /*
- * It's possible the Guest did a NOTIFY hypercall to the
- * Launcher.
- */
- if (cpu->pending_notify) {
- /*
- * Does it just needs to write to a registered
- * eventfd (ie. the appropriate virtqueue thread)?
- */
- if (!send_notify_to_eventfd(cpu)) {
- /* OK, we tell the main Launcher. */
- if (put_user(cpu->pending_notify, user))
- return -EFAULT;
- return sizeof(cpu->pending_notify);
- }
+ /* Do we have to tell the Launcher about a trap? */
+ if (cpu->pending.trap) {
+ if (copy_to_user(user, &cpu->pending,
+ sizeof(cpu->pending)))
+ return -EFAULT;
+ return sizeof(cpu->pending);
}
/*
diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c
index 83511eb..1219af4 100644
--- a/drivers/lguest/hypercalls.c
+++ b/drivers/lguest/hypercalls.c
@@ -117,9 +117,6 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args)
/* Similarly, this sets the halted flag for run_guest(). */
cpu->halted = 1;
break;
- case LHCALL_NOTIFY:
- cpu->pending_notify = args->arg1;
- break;
default:
/* It should be an architecture-specific hypercall. */
if (lguest_arch_do_hcall(cpu, args))
@@ -189,7 +186,7 @@ static void do_async_hcalls(struct lg_cpu *cpu)
* Stop doing hypercalls if they want to notify the Launcher:
* it needs to service this first.
*/
- if (cpu->pending_notify)
+ if (cpu->pending.trap)
break;
}
}
@@ -280,7 +277,7 @@ void do_hypercalls(struct lg_cpu *cpu)
* NOTIFY to the Launcher, we want to return now. Otherwise we do
* the hypercall.
*/
- if (!cpu->pending_notify) {
+ if (!cpu->pending.trap) {
do_hcall(cpu, cpu->hcall);
/*
* Tricky point: we reset the hcall pointer to mark the
diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h
index 2eef40b..307e8b3 100644
--- a/drivers/lguest/lg.h
+++ b/drivers/lguest/lg.h
@@ -50,7 +50,10 @@ struct lg_cpu {
/* Bitmap of what has changed: see CHANGED_* above. */
int changed;
- unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */
+ /* Pending operation. */
+ struct lguest_pending pending;
+
+ unsigned long *reg_read; /* register from LHREQ_GETREG */
/* At end of a page shared mapped over lguest_pages in guest. */
unsigned long regs_page;
@@ -78,24 +81,18 @@ struct lg_cpu {
struct lg_cpu_arch arch;
};
-struct lg_eventfd {
- unsigned long addr;
- struct eventfd_ctx *event;
-};
-
-struct lg_eventfd_map {
- unsigned int num;
- struct lg_eventfd map[];
-};
-
/* The private info the thread maintains about the guest. */
struct lguest {
struct lguest_data __user *lguest_data;
struct lg_cpu cpus[NR_CPUS];
unsigned int nr_cpus;
+ /* Valid guest memory pages must be < this. */
u32 pfn_limit;
+ /* Device memory is >= pfn_limit and < device_limit. */
+ u32 device_limit;
+
/*
* This provides the offset to the base of guest-physical memory in the
* Launcher.
@@ -110,8 +107,6 @@ struct lguest {
unsigned int stack_pages;
u32 tsc_khz;
- struct lg_eventfd_map *eventfds;
-
/* Dead? */
const char *dead;
};
@@ -197,8 +192,10 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu);
void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir,
unsigned long vaddr, pte_t val);
void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages);
-bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode);
+bool demand_page(struct lg_cpu *cpu, unsigned long cr2, int errcode,
+ unsigned long *iomem);
void pin_page(struct lg_cpu *cpu, unsigned long vaddr);
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr);
unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr);
void page_table_guest_data_init(struct lg_cpu *cpu);
@@ -210,6 +207,7 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu);
int lguest_arch_init_hypercalls(struct lg_cpu *cpu);
int lguest_arch_do_hcall(struct lg_cpu *cpu, struct hcall_args *args);
void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start);
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any);
/* <arch>/switcher.S: */
extern char start_switcher_text[], end_switcher_text[], switch_to_guest[];
diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c
deleted file mode 100644
index 89088d6..0000000
--- a/drivers/lguest/lguest_device.c
+++ /dev/null
@@ -1,540 +0,0 @@
-/*P:050
- * Lguest guests use a very simple method to describe devices. It's a
- * series of device descriptors contained just above the top of normal Guest
- * memory.
- *
- * We use the standard "virtio" device infrastructure, which provides us with a
- * console, a network and a block driver. Each one expects some configuration
- * information and a "virtqueue" or two to send and receive data.
-:*/
-#include <linux/init.h>
-#include <linux/bootmem.h>
-#include <linux/lguest_launcher.h>
-#include <linux/virtio.h>
-#include <linux/virtio_config.h>
-#include <linux/interrupt.h>
-#include <linux/virtio_ring.h>
-#include <linux/err.h>
-#include <linux/export.h>
-#include <linux/slab.h>
-#include <asm/io.h>
-#include <asm/paravirt.h>
-#include <asm/lguest_hcall.h>
-
-/* The pointer to our (page) of device descriptions. */
-static void *lguest_devices;
-
-/*
- * For Guests, device memory can be used as normal memory, so we cast away the
- * __iomem to quieten sparse.
- */
-static inline void *lguest_map(unsigned long phys_addr, unsigned long pages)
-{
- return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages);
-}
-
-static inline void lguest_unmap(void *addr)
-{
- iounmap((__force void __iomem *)addr);
-}
-
-/*D:100
- * Each lguest device is just a virtio device plus a pointer to its entry
- * in the lguest_devices page.
- */
-struct lguest_device {
- struct virtio_device vdev;
-
- /* The entry in the lguest_devices page for this device. */
- struct lguest_device_desc *desc;
-};
-
-/*
- * Since the virtio infrastructure hands us a pointer to the virtio_device all
- * the time, it helps to have a curt macro to get a pointer to the struct
- * lguest_device it's enclosed in.
- */
-#define to_lgdev(vd) container_of(vd, struct lguest_device, vdev)
-
-/*D:130
- * Device configurations
- *
- * The configuration information for a device consists of one or more
- * virtqueues, a feature bitmap, and some configuration bytes. The
- * configuration bytes don't really matter to us: the Launcher sets them up, and
- * the driver will look at them during setup.
- *
- * A convenient routine to return the device's virtqueue config array:
- * immediately after the descriptor.
- */
-static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc)
-{
- return (void *)(desc + 1);
-}
-
-/* The features come immediately after the virtqueues. */
-static u8 *lg_features(const struct lguest_device_desc *desc)
-{
- return (void *)(lg_vq(desc) + desc->num_vq);
-}
-
-/* The config space comes after the two feature bitmasks. */
-static u8 *lg_config(const struct lguest_device_desc *desc)
-{
- return lg_features(desc) + desc->feature_len * 2;
-}
-
-/* The total size of the config page used by this device (incl. desc) */
-static unsigned desc_size(const struct lguest_device_desc *desc)
-{
- return sizeof(*desc)
- + desc->num_vq * sizeof(struct lguest_vqconfig)
- + desc->feature_len * 2
- + desc->config_len;
-}
-
-/* This gets the device's feature bits. */
-static u64 lg_get_features(struct virtio_device *vdev)
-{
- unsigned int i;
- u32 features = 0;
- struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
- u8 *in_features = lg_features(desc);
-
- /* We do this the slow but generic way. */
- for (i = 0; i < min(desc->feature_len * 8, 32); i++)
- if (in_features[i / 8] & (1 << (i % 8)))
- features |= (1 << i);
-
- return features;
-}
-
-/*
- * To notify on reset or feature finalization, we (ab)use the NOTIFY
- * hypercall, with the descriptor address of the device.
- */
-static void status_notify(struct virtio_device *vdev)
-{
- unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices;
-
- hcall(LHCALL_NOTIFY, (max_pfn << PAGE_SHIFT) + offset, 0, 0, 0);
-}
-
-/*
- * The virtio core takes the features the Host offers, and copies the ones
- * supported by the driver into the vdev->features array. Once that's all
- * sorted out, this routine is called so we can tell the Host which features we
- * understand and accept.
- */
-static int lg_finalize_features(struct virtio_device *vdev)
-{
- unsigned int i, bits;
- struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
- /* Second half of bitmap is features we accept. */
- u8 *out_features = lg_features(desc) + desc->feature_len;
-
- /* Give virtio_ring a chance to accept features. */
- vring_transport_features(vdev);
-
- /* Make sure we don't have any features > 32 bits! */
- BUG_ON((u32)vdev->features != vdev->features);
-
- /*
- * Since lguest is currently x86-only, we're little-endian. That
- * means we could just memcpy. But it's not time critical, and in
- * case someone copies this code, we do it the slow, obvious way.
- */
- memset(out_features, 0, desc->feature_len);
- bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8;
- for (i = 0; i < bits; i++) {
- if (__virtio_test_bit(vdev, i))
- out_features[i / 8] |= (1 << (i % 8));
- }
-
- /* Tell Host we've finished with this device's feature negotiation */
- status_notify(vdev);
-
- return 0;
-}
-
-/* Once they've found a field, getting a copy of it is easy. */
-static void lg_get(struct virtio_device *vdev, unsigned int offset,
- void *buf, unsigned len)
-{
- struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-
- /* Check they didn't ask for more than the length of the config! */
- BUG_ON(offset + len > desc->config_len);
- memcpy(buf, lg_config(desc) + offset, len);
-}
-
-/* Setting the contents is also trivial. */
-static void lg_set(struct virtio_device *vdev, unsigned int offset,
- const void *buf, unsigned len)
-{
- struct lguest_device_desc *desc = to_lgdev(vdev)->desc;
-
- /* Check they didn't ask for more than the length of the config! */
- BUG_ON(offset + len > desc->config_len);
- memcpy(lg_config(desc) + offset, buf, len);
-}
-
-/*
- * The operations to get and set the status word just access the status field
- * of the device descriptor.
- */
-static u8 lg_get_status(struct virtio_device *vdev)
-{
- return to_lgdev(vdev)->desc->status;
-}
-
-static void lg_set_status(struct virtio_device *vdev, u8 status)
-{
- BUG_ON(!status);
- to_lgdev(vdev)->desc->status = status;
-
- /* Tell Host immediately if we failed. */
- if (status & VIRTIO_CONFIG_S_FAILED)
- status_notify(vdev);
-}
-
-static void lg_reset(struct virtio_device *vdev)
-{
- /* 0 status means "reset" */
- to_lgdev(vdev)->desc->status = 0;
- status_notify(vdev);
-}
-
-/*
- * Virtqueues
- *
- * The other piece of infrastructure virtio needs is a "virtqueue": a way of
- * the Guest device registering buffers for the other side to read from or
- * write into (ie. send and receive buffers). Each device can have multiple
- * virtqueues: for example the console driver uses one queue for sending and
- * another for receiving.
- *
- * Fortunately for us, a very fast shared-memory-plus-descriptors virtqueue
- * already exists in virtio_ring.c. We just need to connect it up.
- *
- * We start with the information we need to keep about each virtqueue.
- */
-
-/*D:140 This is the information we remember about each virtqueue. */
-struct lguest_vq_info {
- /* A copy of the information contained in the device config. */
- struct lguest_vqconfig config;
-
- /* The address where we mapped the virtio ring, so we can unmap it. */
- void *pages;
-};
-
-/*
- * When the virtio_ring code wants to prod the Host, it calls us here and we
- * make a hypercall. We hand the physical address of the virtqueue so the Host
- * knows which virtqueue we're talking about.
- */
-static bool lg_notify(struct virtqueue *vq)
-{
- /*
- * We store our virtqueue information in the "priv" pointer of the
- * virtqueue structure.
- */
- struct lguest_vq_info *lvq = vq->priv;
-
- hcall(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT, 0, 0, 0);
- return true;
-}
-
-/* An extern declaration inside a C file is bad form. Don't do it. */
-extern int lguest_setup_irq(unsigned int irq);
-
-/*
- * This routine finds the Nth virtqueue described in the configuration of
- * this device and sets it up.
- *
- * This is kind of an ugly duckling. It'd be nicer to have a standard
- * representation of a virtqueue in the configuration space, but it seems that
- * everyone wants to do it differently. The KVM coders want the Guest to
- * allocate its own pages and tell the Host where they are, but for lguest it's
- * simpler for the Host to simply tell us where the pages are.
- */
-static struct virtqueue *lg_find_vq(struct virtio_device *vdev,
- unsigned index,
- void (*callback)(struct virtqueue *vq),
- const char *name)
-{
- struct lguest_device *ldev = to_lgdev(vdev);
- struct lguest_vq_info *lvq;
- struct virtqueue *vq;
- int err;
-
- if (!name)
- return NULL;
-
- /* We must have this many virtqueues. */
- if (index >= ldev->desc->num_vq)
- return ERR_PTR(-ENOENT);
-
- lvq = kmalloc(sizeof(*lvq), GFP_KERNEL);
- if (!lvq)
- return ERR_PTR(-ENOMEM);
-
- /*
- * Make a copy of the "struct lguest_vqconfig" entry, which sits after
- * the descriptor. We need a copy because the config space might not
- * be aligned correctly.
- */
- memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config));
-
- printk("Mapping virtqueue %i addr %lx\n", index,
- (unsigned long)lvq->config.pfn << PAGE_SHIFT);
- /* Figure out how many pages the ring will take, and map that memory */
- lvq->pages = lguest_map((unsigned long)lvq->config.pfn << PAGE_SHIFT,
- DIV_ROUND_UP(vring_size(lvq->config.num,
- LGUEST_VRING_ALIGN),
- PAGE_SIZE));
- if (!lvq->pages) {
- err = -ENOMEM;
- goto free_lvq;
- }
-
- /*
- * OK, tell virtio_ring.c to set up a virtqueue now we know its size
- * and we've got a pointer to its pages. Note that we set weak_barriers
- * to 'true': the host just a(nother) SMP CPU, so we only need inter-cpu
- * barriers.
- */
- vq = vring_new_virtqueue(index, lvq->config.num, LGUEST_VRING_ALIGN, vdev,
- true, lvq->pages, lg_notify, callback, name);
- if (!vq) {
- err = -ENOMEM;
- goto unmap;
- }
-
- /* Make sure the interrupt is allocated. */
- err = lguest_setup_irq(lvq->config.irq);
- if (err)
- goto destroy_vring;
-
- /*
- * Tell the interrupt for this virtqueue to go to the virtio_ring
- * interrupt handler.
- *
- * FIXME: We used to have a flag for the Host to tell us we could use
- * the interrupt as a source of randomness: it'd be nice to have that
- * back.
- */
- err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED,
- dev_name(&vdev->dev), vq);
- if (err)
- goto free_desc;
-
- /*
- * Last of all we hook up our 'struct lguest_vq_info" to the
- * virtqueue's priv pointer.
- */
- vq->priv = lvq;
- return vq;
-
-free_desc:
- irq_free_desc(lvq->config.irq);
-destroy_vring:
- vring_del_virtqueue(vq);
-unmap:
- lguest_unmap(lvq->pages);
-free_lvq:
- kfree(lvq);
- return ERR_PTR(err);
-}
-/*:*/
-
-/* Cleaning up a virtqueue is easy */
-static void lg_del_vq(struct virtqueue *vq)
-{
- struct lguest_vq_info *lvq = vq->priv;
-
- /* Release the interrupt */
- free_irq(lvq->config.irq, vq);
- /* Tell virtio_ring.c to free the virtqueue. */
- vring_del_virtqueue(vq);
- /* Unmap the pages containing the ring. */
- lguest_unmap(lvq->pages);
- /* Free our own queue information. */
- kfree(lvq);
-}
-
-static void lg_del_vqs(struct virtio_device *vdev)
-{
- struct virtqueue *vq, *n;
-
- list_for_each_entry_safe(vq, n, &vdev->vqs, list)
- lg_del_vq(vq);
-}
-
-static int lg_find_vqs(struct virtio_device *vdev, unsigned nvqs,
- struct virtqueue *vqs[],
- vq_callback_t *callbacks[],
- const char *names[])
-{
- struct lguest_device *ldev = to_lgdev(vdev);
- int i;
-
- /* We must have this many virtqueues. */
- if (nvqs > ldev->desc->num_vq)
- return -ENOENT;
-
- for (i = 0; i < nvqs; ++i) {
- vqs[i] = lg_find_vq(vdev, i, callbacks[i], names[i]);
- if (IS_ERR(vqs[i]))
- goto error;
- }
- return 0;
-
-error:
- lg_del_vqs(vdev);
- return PTR_ERR(vqs[i]);
-}
-
-static const char *lg_bus_name(struct virtio_device *vdev)
-{
- return "";
-}
-
-/* The ops structure which hooks everything together. */
-static const struct virtio_config_ops lguest_config_ops = {
- .get_features = lg_get_features,
- .finalize_features = lg_finalize_features,
- .get = lg_get,
- .set = lg_set,
- .get_status = lg_get_status,
- .set_status = lg_set_status,
- .reset = lg_reset,
- .find_vqs = lg_find_vqs,
- .del_vqs = lg_del_vqs,
- .bus_name = lg_bus_name,
-};
-
-/*
- * The root device for the lguest virtio devices. This makes them appear as
- * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2.
- */
-static struct device *lguest_root;
-
-/*D:120
- * This is the core of the lguest bus: actually adding a new device.
- * It's a separate function because it's neater that way, and because an
- * earlier version of the code supported hotplug and unplug. They were removed
- * early on because they were never used.
- *
- * As Andrew Tridgell says, "Untested code is buggy code".
- *
- * It's worth reading this carefully: we start with a pointer to the new device
- * descriptor in the "lguest_devices" page, and the offset into the device
- * descriptor page so we can uniquely identify it if things go badly wrong.
- */
-static void add_lguest_device(struct lguest_device_desc *d,
- unsigned int offset)
-{
- struct lguest_device *ldev;
-
- /* Start with zeroed memory; Linux's device layer counts on it. */
- ldev = kzalloc(sizeof(*ldev), GFP_KERNEL);
- if (!ldev) {
- printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n",
- offset, d->type);
- return;
- }
-
- /* This devices' parent is the lguest/ dir. */
- ldev->vdev.dev.parent = lguest_root;
- /*
- * The device type comes straight from the descriptor. There's also a
- * device vendor field in the virtio_device struct, which we leave as
- * 0.
- */
- ldev->vdev.id.device = d->type;
- /*
- * We have a simple set of routines for querying the device's
- * configuration information and setting its status.
- */
- ldev->vdev.config = &lguest_config_ops;
- /* And we remember the device's descriptor for lguest_config_ops. */
- ldev->desc = d;
-
- /*
- * register_virtio_device() sets up the generic fields for the struct
- * virtio_device and calls device_register(). This makes the bus
- * infrastructure look for a matching driver.
- */
- if (register_virtio_device(&ldev->vdev) != 0) {
- printk(KERN_ERR "Failed to register lguest dev %u type %u\n",
- offset, d->type);
- kfree(ldev);
- }
-}
-
-/*D:110
- * scan_devices() simply iterates through the device page. The type 0 is
- * reserved to mean "end of devices".
- */
-static void scan_devices(void)
-{
- unsigned int i;
- struct lguest_device_desc *d;
-
- /* We start at the page beginning, and skip over each entry. */
- for (i = 0; i < PAGE_SIZE; i += desc_size(d)) {
- d = lguest_devices + i;
-
- /* Once we hit a zero, stop. */
- if (d->type == 0)
- break;
-
- printk("Device at %i has size %u\n", i, desc_size(d));
- add_lguest_device(d, i);
- }
-}
-
-/*D:105
- * Fairly early in boot, lguest_devices_init() is called to set up the
- * lguest device infrastructure. We check that we are a Guest by checking
- * pv_info.name: there are other ways of checking, but this seems most
- * obvious to me.
- *
- * So we can access the "struct lguest_device_desc"s easily, we map that memory
- * and store the pointer in the global "lguest_devices". Then we register a
- * root device from which all our devices will hang (this seems to be the
- * correct sysfs incantation).
- *
- * Finally we call scan_devices() which adds all the devices found in the
- * lguest_devices page.
- */
-static int __init lguest_devices_init(void)
-{
- if (strcmp(pv_info.name, "lguest") != 0)
- return 0;
-
- lguest_root = root_device_register("lguest");
- if (IS_ERR(lguest_root))
- panic("Could not register lguest root");
-
- /* Devices are in a single page above top of "normal" mem */
- lguest_devices = lguest_map(max_pfn<<PAGE_SHIFT, 1);
-
- scan_devices();
- return 0;
-}
-/* We do this after core stuff, but before the drivers. */
-postcore_initcall(lguest_devices_init);
-
-/*D:150
- * At this point in the journey we used to now wade through the lguest
- * devices themselves: net, block and console. Since they're all now virtio
- * devices rather than lguest-specific, I've decided to ignore them. Mostly,
- * they're kind of boring. But this does mean you'll never experience the
- * thrill of reading the forbidden love scene buried deep in the block driver.
- *
- * "make Launcher" beckons, where we answer questions like "Where do Guests
- * come from?", and "What do you do when someone asks for optimization?".
- */
diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c
index 4263f4c..c4c6113 100644
--- a/drivers/lguest/lguest_user.c
+++ b/drivers/lguest/lguest_user.c
@@ -2,175 +2,62 @@
* launcher controls and communicates with the Guest. For example,
* the first write will tell us the Guest's memory layout and entry
* point. A read will run the Guest until something happens, such as
- * a signal or the Guest doing a NOTIFY out to the Launcher. There is
- * also a way for the Launcher to attach eventfds to particular NOTIFY
- * values instead of returning from the read() call.
+ * a signal or the Guest accessing a device.
:*/
#include <linux/uaccess.h>
#include <linux/miscdevice.h>
#include <linux/fs.h>
#include <linux/sched.h>
-#include <linux/eventfd.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/export.h>
#include "lg.h"
-/*L:056
- * Before we move on, let's jump ahead and look at what the kernel does when
- * it needs to look up the eventfds. That will complete our picture of how we
- * use RCU.
- *
- * The notification value is in cpu->pending_notify: we return true if it went
- * to an eventfd.
- */
-bool send_notify_to_eventfd(struct lg_cpu *cpu)
-{
- unsigned int i;
- struct lg_eventfd_map *map;
-
- /*
- * This "rcu_read_lock()" helps track when someone is still looking at
- * the (RCU-using) eventfds array. It's not actually a lock at all;
- * indeed it's a noop in many configurations. (You didn't expect me to
- * explain all the RCU secrets here, did you?)
- */
- rcu_read_lock();
- /*
- * rcu_dereference is the counter-side of rcu_assign_pointer(); it
- * makes sure we don't access the memory pointed to by
- * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy,
- * but Alpha allows this! Paul McKenney points out that a really
- * aggressive compiler could have the same effect:
- * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html
- *
- * So play safe, use rcu_dereference to get the rcu-protected pointer:
- */
- map = rcu_dereference(cpu->lg->eventfds);
- /*
- * Simple array search: even if they add an eventfd while we do this,
- * we'll continue to use the old array and just won't see the new one.
- */
- for (i = 0; i < map->num; i++) {
- if (map->map[i].addr == cpu->pending_notify) {
- eventfd_signal(map->map[i].event, 1);
- cpu->pending_notify = 0;
- break;
- }
- }
- /* We're done with the rcu-protected variable cpu->lg->eventfds. */
- rcu_read_unlock();
-
- /* If we cleared the notification, it's because we found a match. */
- return cpu->pending_notify == 0;
-}
-
-/*L:055
- * One of the more tricksy tricks in the Linux Kernel is a technique called
- * Read Copy Update. Since one point of lguest is to teach lguest journeyers
- * about kernel coding, I use it here. (In case you're curious, other purposes
- * include learning about virtualization and instilling a deep appreciation for
- * simplicity and puppies).
- *
- * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we
- * add new eventfds without ever blocking readers from accessing the array.
- * The current Launcher only does this during boot, so that never happens. But
- * Read Copy Update is cool, and adding a lock risks damaging even more puppies
- * than this code does.
- *
- * We allocate a brand new one-larger array, copy the old one and add our new
- * element. Then we make the lg eventfd pointer point to the new array.
- * That's the easy part: now we need to free the old one, but we need to make
- * sure no slow CPU somewhere is still looking at it. That's what
- * synchronize_rcu does for us: waits until every CPU has indicated that it has
- * moved on to know it's no longer using the old one.
- *
- * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update.
- */
-static int add_eventfd(struct lguest *lg, unsigned long addr, int fd)
+/*L:052
+ The Launcher can get the registers, and also set some of them.
+*/
+static int getreg_setup(struct lg_cpu *cpu, const unsigned long __user *input)
{
- struct lg_eventfd_map *new, *old = lg->eventfds;
-
- /*
- * We don't allow notifications on value 0 anyway (pending_notify of
- * 0 means "nothing pending").
- */
- if (!addr)
- return -EINVAL;
-
- /*
- * Replace the old array with the new one, carefully: others can
- * be accessing it at the same time.
- */
- new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1),
- GFP_KERNEL);
- if (!new)
- return -ENOMEM;
+ unsigned long which;
- /* First make identical copy. */
- memcpy(new->map, old->map, sizeof(old->map[0]) * old->num);
- new->num = old->num;
-
- /* Now append new entry. */
- new->map[new->num].addr = addr;
- new->map[new->num].event = eventfd_ctx_fdget(fd);
- if (IS_ERR(new->map[new->num].event)) {
- int err = PTR_ERR(new->map[new->num].event);
- kfree(new);
- return err;
- }
- new->num++;
+ /* We re-use the ptrace structure to specify which register to read. */
+ if (get_user(which, input) != 0)
+ return -EFAULT;
/*
- * Now put new one in place: rcu_assign_pointer() is a fancy way of
- * doing "lg->eventfds = new", but it uses memory barriers to make
- * absolutely sure that the contents of "new" written above is nailed
- * down before we actually do the assignment.
+ * We set up the cpu register pointer, and their next read will
+ * actually get the value (instead of running the guest).
*
- * We have to think about these kinds of things when we're operating on
- * live data without locks.
+ * The last argument 'true' says we can access any register.
*/
- rcu_assign_pointer(lg->eventfds, new);
+ cpu->reg_read = lguest_arch_regptr(cpu, which, true);
+ if (!cpu->reg_read)
+ return -ENOENT;
- /*
- * We're not in a big hurry. Wait until no one's looking at old
- * version, then free it.
- */
- synchronize_rcu();
- kfree(old);
-
- return 0;
+ /* And because this is a write() call, we return the length used. */
+ return sizeof(unsigned long) * 2;
}
-/*L:052
- * Receiving notifications from the Guest is usually done by attaching a
- * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will
- * become readable when the Guest does an LHCALL_NOTIFY with that value.
- *
- * This is really convenient for processing each virtqueue in a separate
- * thread.
- */
-static int attach_eventfd(struct lguest *lg, const unsigned long __user *input)
+static int setreg(struct lg_cpu *cpu, const unsigned long __user *input)
{
- unsigned long addr, fd;
- int err;
+ unsigned long which, value, *reg;
- if (get_user(addr, input) != 0)
+ /* We re-use the ptrace structure to specify which register to read. */
+ if (get_user(which, input) != 0)
return -EFAULT;
input++;
- if (get_user(fd, input) != 0)
+ if (get_user(value, input) != 0)
return -EFAULT;
- /*
- * Just make sure two callers don't add eventfds at once. We really
- * only need to lock against callers adding to the same Guest, so using
- * the Big Lguest Lock is overkill. But this is setup, not a fast path.
- */
- mutex_lock(&lguest_lock);
- err = add_eventfd(lg, addr, fd);
- mutex_unlock(&lguest_lock);
+ /* The last argument 'false' means we can't access all registers. */
+ reg = lguest_arch_regptr(cpu, which, false);
+ if (!reg)
+ return -ENOENT;
- return err;
+ *reg = value;
+
+ /* And because this is a write() call, we return the length used. */
+ return sizeof(unsigned long) * 3;
}
/*L:050
@@ -194,6 +81,23 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input)
return 0;
}
+/*L:053
+ * Deliver a trap: this is used by the Launcher if it can't emulate
+ * an instruction.
+ */
+static int trap(struct lg_cpu *cpu, const unsigned long __user *input)
+{
+ unsigned long trapnum;
+
+ if (get_user(trapnum, input) != 0)
+ return -EFAULT;
+
+ if (!deliver_trap(cpu, trapnum))
+ return -EINVAL;
+
+ return 0;
+}
+
/*L:040
* Once our Guest is initialized, the Launcher makes it run by reading
* from /dev/lguest.
@@ -237,8 +141,8 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o)
* If we returned from read() last time because the Guest sent I/O,
* clear the flag.
*/
- if (cpu->pending_notify)
- cpu->pending_notify = 0;
+ if (cpu->pending.trap)
+ cpu->pending.trap = 0;
/* Run the Guest until something interesting happens. */
return run_guest(cpu, (unsigned long __user *)user);
@@ -319,7 +223,7 @@ static int initialize(struct file *file, const unsigned long __user *input)
/* "struct lguest" contains all we (the Host) know about a Guest. */
struct lguest *lg;
int err;
- unsigned long args[3];
+ unsigned long args[4];
/*
* We grab the Big Lguest lock, which protects against multiple
@@ -343,21 +247,15 @@ static int initialize(struct file *file, const unsigned long __user *input)
goto unlock;
}
- lg->eventfds = kmalloc(sizeof(*lg->eventfds), GFP_KERNEL);
- if (!lg->eventfds) {
- err = -ENOMEM;
- goto free_lg;
- }
- lg->eventfds->num = 0;
-
/* Populate the easy fields of our "struct lguest" */
lg->mem_base = (void __user *)args[0];
lg->pfn_limit = args[1];
+ lg->device_limit = args[3];
/* This is the first cpu (cpu 0) and it will start booting at args[2] */
err = lg_cpu_start(&lg->cpus[0], 0, args[2]);
if (err)
- goto free_eventfds;
+ goto free_lg;
/*
* Initialize the Guest's shadow page tables. This allocates
@@ -378,8 +276,6 @@ static int initialize(struct file *file, const unsigned long __user *input)
free_regs:
/* FIXME: This should be in free_vcpu */
free_page(lg->cpus[0].regs_page);
-free_eventfds:
- kfree(lg->eventfds);
free_lg:
kfree(lg);
unlock:
@@ -432,8 +328,12 @@ static ssize_t write(struct file *file, const char __user *in,
return initialize(file, input);
case LHREQ_IRQ:
return user_send_irq(cpu, input);
- case LHREQ_EVENTFD:
- return attach_eventfd(lg, input);
+ case LHREQ_GETREG:
+ return getreg_setup(cpu, input);
+ case LHREQ_SETREG:
+ return setreg(cpu, input);
+ case LHREQ_TRAP:
+ return trap(cpu, input);
default:
return -EINVAL;
}
@@ -478,11 +378,6 @@ static int close(struct inode *inode, struct file *file)
mmput(lg->cpus[i].mm);
}
- /* Release any eventfds they registered. */
- for (i = 0; i < lg->eventfds->num; i++)
- eventfd_ctx_put(lg->eventfds->map[i].event);
- kfree(lg->eventfds);
-
/*
* If lg->dead doesn't contain an error code it will be NULL or a
* kmalloc()ed string, either of which is ok to hand to kfree().
diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c
index e8b55c3..e3abebc9 100644
--- a/drivers/lguest/page_tables.c
+++ b/drivers/lguest/page_tables.c
@@ -250,6 +250,16 @@ static void release_pte(pte_t pte)
}
/*:*/
+static bool gpte_in_iomem(struct lg_cpu *cpu, pte_t gpte)
+{
+ /* We don't handle large pages. */
+ if (pte_flags(gpte) & _PAGE_PSE)
+ return false;
+
+ return (pte_pfn(gpte) >= cpu->lg->pfn_limit
+ && pte_pfn(gpte) < cpu->lg->device_limit);
+}
+
static bool check_gpte(struct lg_cpu *cpu, pte_t gpte)
{
if ((pte_flags(gpte) & _PAGE_PSE) ||
@@ -374,8 +384,14 @@ static pte_t *find_spte(struct lg_cpu *cpu, unsigned long vaddr, bool allocate,
*
* If we fixed up the fault (ie. we mapped the address), this routine returns
* true. Otherwise, it was a real fault and we need to tell the Guest.
+ *
+ * There's a corner case: they're trying to access memory between
+ * pfn_limit and device_limit, which is I/O memory. In this case, we
+ * return false and set @iomem to the physical address, so the
+ * Launcher can handle the instruction manually.
*/
-bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
+bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode,
+ unsigned long *iomem)
{
unsigned long gpte_ptr;
pte_t gpte;
@@ -383,6 +399,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
pmd_t gpmd;
pgd_t gpgd;
+ *iomem = 0;
+
/* We never demand page the Switcher, so trying is a mistake. */
if (vaddr >= switcher_addr)
return false;
@@ -459,6 +477,12 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode)
if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER))
return false;
+ /* If they're accessing io memory, we expect a fault. */
+ if (gpte_in_iomem(cpu, gpte)) {
+ *iomem = (pte_pfn(gpte) << PAGE_SHIFT) | (vaddr & ~PAGE_MASK);
+ return false;
+ }
+
/*
* Check that the Guest PTE flags are OK, and the page number is below
* the pfn_limit (ie. not mapping the Launcher binary).
@@ -553,7 +577,9 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr)
*/
void pin_page(struct lg_cpu *cpu, unsigned long vaddr)
{
- if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2))
+ unsigned long iomem;
+
+ if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2, &iomem))
kill_guest(cpu, "bad stack page %#lx", vaddr);
}
/*:*/
@@ -647,7 +673,7 @@ void guest_pagetable_flush_user(struct lg_cpu *cpu)
/*:*/
/* We walk down the guest page tables to get a guest-physical address */
-unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
+bool __guest_pa(struct lg_cpu *cpu, unsigned long vaddr, unsigned long *paddr)
{
pgd_t gpgd;
pte_t gpte;
@@ -656,31 +682,47 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
#endif
/* Still not set up? Just map 1:1. */
- if (unlikely(cpu->linear_pages))
- return vaddr;
+ if (unlikely(cpu->linear_pages)) {
+ *paddr = vaddr;
+ return true;
+ }
/* First step: get the top-level Guest page table entry. */
gpgd = lgread(cpu, gpgd_addr(cpu, vaddr), pgd_t);
/* Toplevel not present? We can't map it in. */
- if (!(pgd_flags(gpgd) & _PAGE_PRESENT)) {
- kill_guest(cpu, "Bad address %#lx", vaddr);
- return -1UL;
- }
+ if (!(pgd_flags(gpgd) & _PAGE_PRESENT))
+ goto fail;
#ifdef CONFIG_X86_PAE
gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t);
- if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) {
- kill_guest(cpu, "Bad address %#lx", vaddr);
- return -1UL;
- }
+ if (!(pmd_flags(gpmd) & _PAGE_PRESENT))
+ goto fail;
gpte = lgread(cpu, gpte_addr(cpu, gpmd, vaddr), pte_t);
#else
gpte = lgread(cpu, gpte_addr(cpu, gpgd, vaddr), pte_t);
#endif
if (!(pte_flags(gpte) & _PAGE_PRESENT))
- kill_guest(cpu, "Bad address %#lx", vaddr);
+ goto fail;
+
+ *paddr = pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+ return true;
+
+fail:
+ *paddr = -1UL;
+ return false;
+}
- return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK);
+/*
+ * This is the version we normally use: kills the Guest if it uses a
+ * bad address
+ */
+unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr)
+{
+ unsigned long paddr;
+
+ if (!__guest_pa(cpu, vaddr, &paddr))
+ kill_guest(cpu, "Bad address %#lx", vaddr);
+ return paddr;
}
/*
@@ -912,7 +954,8 @@ static void __guest_set_pte(struct lg_cpu *cpu, int idx,
* now. This shaves 10% off a copy-on-write
* micro-benchmark.
*/
- if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) {
+ if ((pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED))
+ && !gpte_in_iomem(cpu, gpte)) {
if (!check_gpte(cpu, gpte))
return;
set_pte(spte,
diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c
index 6adfd7b..30f2aef 100644
--- a/drivers/lguest/x86/core.c
+++ b/drivers/lguest/x86/core.c
@@ -182,6 +182,52 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages)
}
/*:*/
+unsigned long *lguest_arch_regptr(struct lg_cpu *cpu, size_t reg_off, bool any)
+{
+ switch (reg_off) {
+ case offsetof(struct pt_regs, bx):
+ return &cpu->regs->ebx;
+ case offsetof(struct pt_regs, cx):
+ return &cpu->regs->ecx;
+ case offsetof(struct pt_regs, dx):
+ return &cpu->regs->edx;
+ case offsetof(struct pt_regs, si):
+ return &cpu->regs->esi;
+ case offsetof(struct pt_regs, di):
+ return &cpu->regs->edi;
+ case offsetof(struct pt_regs, bp):
+ return &cpu->regs->ebp;
+ case offsetof(struct pt_regs, ax):
+ return &cpu->regs->eax;
+ case offsetof(struct pt_regs, ip):
+ return &cpu->regs->eip;
+ case offsetof(struct pt_regs, sp):
+ return &cpu->regs->esp;
+ }
+
+ /* Launcher can read these, but we don't allow any setting. */
+ if (any) {
+ switch (reg_off) {
+ case offsetof(struct pt_regs, ds):
+ return &cpu->regs->ds;
+ case offsetof(struct pt_regs, es):
+ return &cpu->regs->es;
+ case offsetof(struct pt_regs, fs):
+ return &cpu->regs->fs;
+ case offsetof(struct pt_regs, gs):
+ return &cpu->regs->gs;
+ case offsetof(struct pt_regs, cs):
+ return &cpu->regs->cs;
+ case offsetof(struct pt_regs, flags):
+ return &cpu->regs->eflags;
+ case offsetof(struct pt_regs, ss):
+ return &cpu->regs->ss;
+ }
+ }
+
+ return NULL;
+}
+
/*M:002
* There are hooks in the scheduler which we can register to tell when we
* get kicked off the CPU (preempt_notifier_register()). This would allow us
@@ -269,110 +315,73 @@ void lguest_arch_run_guest(struct lg_cpu *cpu)
* usually attached to a PC.
*
* When the Guest uses one of these instructions, we get a trap (General
- * Protection Fault) and come here. We see if it's one of those troublesome
- * instructions and skip over it. We return true if we did.
+ * Protection Fault) and come here. We queue this to be sent out to the
+ * Launcher to handle.
*/
-static int emulate_insn(struct lg_cpu *cpu)
-{
- u8 insn;
- unsigned int insnlen = 0, in = 0, small_operand = 0;
- /*
- * The eip contains the *virtual* address of the Guest's instruction:
- * walk the Guest's page tables to find the "physical" address.
- */
- unsigned long physaddr = guest_pa(cpu, cpu->regs->eip);
-
- /*
- * This must be the Guest kernel trying to do something, not userspace!
- * The bottom two bits of the CS segment register are the privilege
- * level.
- */
- if ((cpu->regs->cs & 3) != GUEST_PL)
- return 0;
-
- /* Decoding x86 instructions is icky. */
- insn = lgread(cpu, physaddr, u8);
- /*
- * Around 2.6.33, the kernel started using an emulation for the
- * cmpxchg8b instruction in early boot on many configurations. This
- * code isn't paravirtualized, and it tries to disable interrupts.
- * Ignore it, which will Mostly Work.
- */
- if (insn == 0xfa) {
- /* "cli", or Clear Interrupt Enable instruction. Skip it. */
- cpu->regs->eip++;
- return 1;
+/*
+ * The eip contains the *virtual* address of the Guest's instruction:
+ * we copy the instruction here so the Launcher doesn't have to walk
+ * the page tables to decode it. We handle the case (eg. in a kernel
+ * module) where the instruction is over two pages, and the pages are
+ * virtually but not physically contiguous.
+ *
+ * The longest possible x86 instruction is 15 bytes, but we don't handle
+ * anything that strange.
+ */
+static void copy_from_guest(struct lg_cpu *cpu,
+ void *dst, unsigned long vaddr, size_t len)
+{
+ size_t to_page_end = PAGE_SIZE - (vaddr % PAGE_SIZE);
+ unsigned long paddr;
+
+ BUG_ON(len > PAGE_SIZE);
+
+ /* If it goes over a page, copy in two parts. */
+ if (len > to_page_end) {
+ /* But make sure the next page is mapped! */
+ if (__guest_pa(cpu, vaddr + to_page_end, &paddr))
+ copy_from_guest(cpu, dst + to_page_end,
+ vaddr + to_page_end,
+ len - to_page_end);
+ else
+ /* Otherwise fill with zeroes. */
+ memset(dst + to_page_end, 0, len - to_page_end);
+ len = to_page_end;
}
- /*
- * 0x66 is an "operand prefix". It means a 16, not 32 bit in/out.
- */
- if (insn == 0x66) {
- small_operand = 1;
- /* The instruction is 1 byte so far, read the next byte. */
- insnlen = 1;
- insn = lgread(cpu, physaddr + insnlen, u8);
- }
+ /* This will kill the guest if it isn't mapped, but that
+ * shouldn't happen. */
+ __lgread(cpu, dst, guest_pa(cpu, vaddr), len);
+}
- /*
- * We can ignore the lower bit for the moment and decode the 4 opcodes
- * we need to emulate.
- */
- switch (insn & 0xFE) {
- case 0xE4: /* in <next byte>,%al */
- insnlen += 2;
- in = 1;
- break;
- case 0xEC: /* in (%dx),%al */
- insnlen += 1;
- in = 1;
- break;
- case 0xE6: /* out %al,<next byte> */
- insnlen += 2;
- break;
- case 0xEE: /* out %al,(%dx) */
- insnlen += 1;
- break;
- default:
- /* OK, we don't know what this is, can't emulate. */
- return 0;
- }
- /*
- * If it was an "IN" instruction, they expect the result to be read
- * into %eax, so we change %eax. We always return all-ones, which
- * traditionally means "there's nothing there".
- */
- if (in) {
- /* Lower bit tells means it's a 32/16 bit access */
- if (insn & 0x1) {
- if (small_operand)
- cpu->regs->eax |= 0xFFFF;
- else
- cpu->regs->eax = 0xFFFFFFFF;
- } else
- cpu->regs->eax |= 0xFF;
- }
- /* Finally, we've "done" the instruction, so move past it. */
- cpu->regs->eip += insnlen;
- /* Success! */
- return 1;
+static void setup_emulate_insn(struct lg_cpu *cpu)
+{
+ cpu->pending.trap = 13;
+ copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
+ sizeof(cpu->pending.insn));
+}
+
+static void setup_iomem_insn(struct lg_cpu *cpu, unsigned long iomem_addr)
+{
+ cpu->pending.trap = 14;
+ cpu->pending.addr = iomem_addr;
+ copy_from_guest(cpu, cpu->pending.insn, cpu->regs->eip,
+ sizeof(cpu->pending.insn));
}
/*H:050 Once we've re-enabled interrupts, we look at why the Guest exited. */
void lguest_arch_handle_trap(struct lg_cpu *cpu)
{
+ unsigned long iomem_addr;
+
switch (cpu->regs->trapnum) {
case 13: /* We've intercepted a General Protection Fault. */
- /*
- * Check if this was one of those annoying IN or OUT
- * instructions which we need to emulate. If so, we just go
- * back into the Guest after we've done it.
- */
+ /* Hand to Launcher to emulate those pesky IN and OUT insns */
if (cpu->regs->errcode == 0) {
- if (emulate_insn(cpu))
- return;
+ setup_emulate_insn(cpu);
+ return;
}
break;
case 14: /* We've intercepted a Page Fault. */
@@ -387,9 +396,16 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu)
* whether kernel or userspace code.
*/
if (demand_page(cpu, cpu->arch.last_pagefault,
- cpu->regs->errcode))
+ cpu->regs->errcode, &iomem_addr))
return;
+ /* Was this an access to memory mapped IO? */
+ if (iomem_addr) {
+ /* Tell Launcher, let it handle it. */
+ setup_iomem_insn(cpu, iomem_addr);
+ return;
+ }
+
/*
* OK, it's really not there (or not OK): the Guest needs to
* know. We write out the cr2 value so it knows where the
OpenPOWER on IntegriCloud