summaryrefslogtreecommitdiffstats
path: root/lib/libvmmapi
diff options
context:
space:
mode:
authorneel <neel@FreeBSD.org>2015-06-18 06:00:17 +0000
committerneel <neel@FreeBSD.org>2015-06-18 06:00:17 +0000
commit8c70d6c7af8325d488ec8dff7b0bbe7b6dfc32f9 (patch)
tree34972705799ce1ec23ee45b1e6c0c7631ad7ed98 /lib/libvmmapi
parent07553194d65f7f87d0eea74f0ad0f786e9a189bb (diff)
downloadFreeBSD-src-8c70d6c7af8325d488ec8dff7b0bbe7b6dfc32f9.zip
FreeBSD-src-8c70d6c7af8325d488ec8dff7b0bbe7b6dfc32f9.tar.gz
Restructure memory allocation in bhyve to support "devmem".
devmem is used to represent MMIO devices like the boot ROM or a VESA framebuffer where doing a trap-and-emulate for every access is impractical. devmem is a hybrid of system memory (sysmem) and emulated device models. devmem is mapped in the guest address space via nested page tables similar to sysmem. However the address range where devmem is mapped may be changed by the guest at runtime (e.g. by reprogramming a PCI BAR). Also devmem is usually mapped RO or RW as compared to RWX mappings for sysmem. Each devmem segment is named (e.g. "bootrom") and this name is used to create a device node for the devmem segment (e.g. /dev/vmm/testvm.bootrom). The device node supports mmap(2) and this decouples the host mapping of devmem from its mapping in the guest address space (which can change). Reviewed by: tychon Discussed with: grehan Differential Revision: https://reviews.freebsd.org/D2762 MFC after: 4 weeks
Diffstat (limited to 'lib/libvmmapi')
-rw-r--r--lib/libvmmapi/vmmapi.c320
-rw-r--r--lib/libvmmapi/vmmapi.h52
2 files changed, 311 insertions, 61 deletions
diff --git a/lib/libvmmapi/vmmapi.c b/lib/libvmmapi/vmmapi.c
index 1e6e627..d5f387a 100644
--- a/lib/libvmmapi/vmmapi.c
+++ b/lib/libvmmapi/vmmapi.c
@@ -58,15 +58,23 @@ __FBSDID("$FreeBSD$");
#define MB (1024 * 1024UL)
#define GB (1024 * 1024 * 1024UL)
+/*
+ * Size of the guard region before and after the virtual address space
+ * mapping the guest physical memory. This must be a multiple of the
+ * superpage size for performance reasons.
+ */
+#define VM_MMAP_GUARD_SIZE (4 * MB)
+
+#define PROT_RW (PROT_READ | PROT_WRITE)
+#define PROT_ALL (PROT_READ | PROT_WRITE | PROT_EXEC)
+
struct vmctx {
int fd;
uint32_t lowmem_limit;
- enum vm_mmap_style vms;
int memflags;
size_t lowmem;
- char *lowmem_addr;
size_t highmem;
- char *highmem_addr;
+ char *baseaddr;
char *name;
};
@@ -157,22 +165,6 @@ vm_parse_memsize(const char *optarg, size_t *ret_memsize)
return (error);
}
-int
-vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
- int *wired)
-{
- int error;
- struct vm_memory_segment seg;
-
- bzero(&seg, sizeof(seg));
- seg.gpa = gpa;
- error = ioctl(ctx->fd, VM_GET_MEMORY_SEG, &seg);
- *ret_len = seg.len;
- if (wired != NULL)
- *wired = seg.wired;
- return (error);
-}
-
uint32_t
vm_get_lowmem_limit(struct vmctx *ctx)
{
@@ -194,39 +186,184 @@ vm_set_memflags(struct vmctx *ctx, int flags)
ctx->memflags = flags;
}
+int
+vm_get_memflags(struct vmctx *ctx)
+{
+
+ return (ctx->memflags);
+}
+
+/*
+ * Map segment 'segid' starting at 'off' into guest address range [gpa,gpa+len).
+ */
+int
+vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid, vm_ooffset_t off,
+ size_t len, int prot)
+{
+ struct vm_memmap memmap;
+ int error, flags;
+
+ memmap.gpa = gpa;
+ memmap.segid = segid;
+ memmap.segoff = off;
+ memmap.len = len;
+ memmap.prot = prot;
+ memmap.flags = 0;
+
+ if (ctx->memflags & VM_MEM_F_WIRED)
+ memmap.flags |= VM_MEMMAP_F_WIRED;
+
+ /*
+ * If this mapping already exists then don't create it again. This
+ * is the common case for SYSMEM mappings created by bhyveload(8).
+ */
+ error = vm_mmap_getnext(ctx, &gpa, &segid, &off, &len, &prot, &flags);
+ if (error == 0 && gpa == memmap.gpa) {
+ if (segid != memmap.segid || off != memmap.segoff ||
+ prot != memmap.prot || flags != memmap.flags) {
+ errno = EEXIST;
+ return (-1);
+ } else {
+ return (0);
+ }
+ }
+
+ error = ioctl(ctx->fd, VM_MMAP_MEMSEG, &memmap);
+ return (error);
+}
+
+int
+vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+ vm_ooffset_t *segoff, size_t *len, int *prot, int *flags)
+{
+ struct vm_memmap memmap;
+ int error;
+
+ bzero(&memmap, sizeof(struct vm_memmap));
+ memmap.gpa = *gpa;
+ error = ioctl(ctx->fd, VM_MMAP_GETNEXT, &memmap);
+ if (error == 0) {
+ *gpa = memmap.gpa;
+ *segid = memmap.segid;
+ *segoff = memmap.segoff;
+ *len = memmap.len;
+ *prot = memmap.prot;
+ *flags = memmap.flags;
+ }
+ return (error);
+}
+
+/*
+ * Return 0 if the segments are identical and non-zero otherwise.
+ *
+ * This is slightly complicated by the fact that only device memory segments
+ * are named.
+ */
+static int
+cmpseg(size_t len, const char *str, size_t len2, const char *str2)
+{
+
+ if (len == len2) {
+ if ((!str && !str2) || (str && str2 && !strcmp(str, str2)))
+ return (0);
+ }
+ return (-1);
+}
+
static int
-setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char **addr)
+vm_alloc_memseg(struct vmctx *ctx, int segid, size_t len, const char *name)
{
- int error, mmap_flags;
- struct vm_memory_segment seg;
+ struct vm_memseg memseg;
+ size_t n;
+ int error;
/*
- * Create and optionally map 'len' bytes of memory at guest
- * physical address 'gpa'
+ * If the memory segment has already been created then just return.
+ * This is the usual case for the SYSMEM segment created by userspace
+ * loaders like bhyveload(8).
*/
- bzero(&seg, sizeof(seg));
- seg.gpa = gpa;
- seg.len = len;
- error = ioctl(ctx->fd, VM_MAP_MEMORY, &seg);
- if (error == 0 && addr != NULL) {
- mmap_flags = MAP_SHARED;
- if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
- mmap_flags |= MAP_NOCORE;
- *addr = mmap(NULL, len, PROT_READ | PROT_WRITE, mmap_flags,
- ctx->fd, gpa);
+ error = vm_get_memseg(ctx, segid, &memseg.len, memseg.name,
+ sizeof(memseg.name));
+ if (error)
+ return (error);
+
+ if (memseg.len != 0) {
+ if (cmpseg(len, name, memseg.len, VM_MEMSEG_NAME(&memseg))) {
+ errno = EINVAL;
+ return (-1);
+ } else {
+ return (0);
+ }
+ }
+
+ bzero(&memseg, sizeof(struct vm_memseg));
+ memseg.segid = segid;
+ memseg.len = len;
+ if (name != NULL) {
+ n = strlcpy(memseg.name, name, sizeof(memseg.name));
+ if (n >= sizeof(memseg.name)) {
+ errno = ENAMETOOLONG;
+ return (-1);
+ }
}
+
+ error = ioctl(ctx->fd, VM_ALLOC_MEMSEG, &memseg);
return (error);
}
int
-vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+vm_get_memseg(struct vmctx *ctx, int segid, size_t *lenp, char *namebuf,
+ size_t bufsize)
{
- char **addr;
+ struct vm_memseg memseg;
+ size_t n;
int error;
- /* XXX VM_MMAP_SPARSE not implemented yet */
- assert(vms == VM_MMAP_NONE || vms == VM_MMAP_ALL);
- ctx->vms = vms;
+ memseg.segid = segid;
+ error = ioctl(ctx->fd, VM_GET_MEMSEG, &memseg);
+ if (error == 0) {
+ *lenp = memseg.len;
+ n = strlcpy(namebuf, memseg.name, bufsize);
+ if (n >= bufsize) {
+ errno = ENAMETOOLONG;
+ error = -1;
+ }
+ }
+ return (error);
+}
+
+static int
+setup_memory_segment(struct vmctx *ctx, vm_paddr_t gpa, size_t len, char *base)
+{
+ char *ptr;
+ int error, flags;
+
+ /* Map 'len' bytes starting at 'gpa' in the guest address space */
+ error = vm_mmap_memseg(ctx, gpa, VM_SYSMEM, gpa, len, PROT_ALL);
+ if (error)
+ return (error);
+
+ flags = MAP_SHARED | MAP_FIXED;
+ if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+ flags |= MAP_NOCORE;
+
+ /* mmap into the process address space on the host */
+ ptr = mmap(base + gpa, len, PROT_RW, flags, ctx->fd, gpa);
+ if (ptr == MAP_FAILED)
+ return (-1);
+
+ return (0);
+}
+
+int
+vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
+{
+ size_t objsize, len;
+ vm_paddr_t gpa;
+ char *baseaddr, *ptr;
+ int error, flags;
+
+ assert(vms == VM_MMAP_ALL);
/*
* If 'memsize' cannot fit entirely in the 'lowmem' segment then
@@ -234,46 +371,63 @@ vm_setup_memory(struct vmctx *ctx, size_t memsize, enum vm_mmap_style vms)
*/
if (memsize > ctx->lowmem_limit) {
ctx->lowmem = ctx->lowmem_limit;
- ctx->highmem = memsize - ctx->lowmem;
+ ctx->highmem = memsize - ctx->lowmem_limit;
+ objsize = 4*GB + ctx->highmem;
} else {
ctx->lowmem = memsize;
ctx->highmem = 0;
+ objsize = ctx->lowmem;
}
- if (ctx->lowmem > 0) {
- addr = (vms == VM_MMAP_ALL) ? &ctx->lowmem_addr : NULL;
- error = setup_memory_segment(ctx, 0, ctx->lowmem, addr);
+ error = vm_alloc_memseg(ctx, VM_SYSMEM, objsize, NULL);
+ if (error)
+ return (error);
+
+ /*
+ * Stake out a contiguous region covering the guest physical memory
+ * and the adjoining guard regions.
+ */
+ len = VM_MMAP_GUARD_SIZE + objsize + VM_MMAP_GUARD_SIZE;
+ flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+ ptr = mmap(NULL, len, PROT_NONE, flags, -1, 0);
+ if (ptr == MAP_FAILED)
+ return (-1);
+
+ baseaddr = ptr + VM_MMAP_GUARD_SIZE;
+ if (ctx->highmem > 0) {
+ gpa = 4*GB;
+ len = ctx->highmem;
+ error = setup_memory_segment(ctx, gpa, len, baseaddr);
if (error)
return (error);
}
- if (ctx->highmem > 0) {
- addr = (vms == VM_MMAP_ALL) ? &ctx->highmem_addr : NULL;
- error = setup_memory_segment(ctx, 4*GB, ctx->highmem, addr);
+ if (ctx->lowmem > 0) {
+ gpa = 0;
+ len = ctx->lowmem;
+ error = setup_memory_segment(ctx, gpa, len, baseaddr);
if (error)
return (error);
}
+ ctx->baseaddr = baseaddr;
+
return (0);
}
void *
vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len)
{
+ vm_paddr_t start, end, mapend;
- /* XXX VM_MMAP_SPARSE not implemented yet */
- assert(ctx->vms == VM_MMAP_ALL);
-
- if (gaddr < ctx->lowmem && gaddr + len <= ctx->lowmem)
- return ((void *)(ctx->lowmem_addr + gaddr));
-
- if (gaddr >= 4*GB) {
- gaddr -= 4*GB;
- if (gaddr < ctx->highmem && gaddr + len <= ctx->highmem)
- return ((void *)(ctx->highmem_addr + gaddr));
- }
+ start = gaddr;
+ end = gaddr + len;
+ mapend = ctx->highmem ? 4*GB + ctx->highmem : ctx->lowmem;
- return (NULL);
+ if (start <= end && end <= mapend)
+ return (ctx->baseaddr + start);
+ else
+ return (NULL);
}
size_t
@@ -290,6 +444,56 @@ vm_get_highmem_size(struct vmctx *ctx)
return (ctx->highmem);
}
+void *
+vm_create_devmem(struct vmctx *ctx, int segid, const char *name, size_t len)
+{
+ char pathname[MAXPATHLEN];
+ size_t len2;
+ char *base, *ptr;
+ int fd, error, flags;
+
+ fd = -1;
+ ptr = MAP_FAILED;
+ if (name == NULL || strlen(name) == 0) {
+ errno = EINVAL;
+ goto done;
+ }
+
+ error = vm_alloc_memseg(ctx, segid, len, name);
+ if (error)
+ goto done;
+
+ strlcpy(pathname, "/dev/vmm/", sizeof(pathname));
+ strlcat(pathname, ctx->name, sizeof(pathname));
+ strlcat(pathname, ".", sizeof(pathname));
+ strlcat(pathname, name, sizeof(pathname));
+
+ fd = open(pathname, O_RDWR);
+ if (fd < 0)
+ goto done;
+
+ /*
+ * Stake out a contiguous region covering the device memory and the
+ * adjoining guard regions.
+ */
+ len2 = VM_MMAP_GUARD_SIZE + len + VM_MMAP_GUARD_SIZE;
+ flags = MAP_PRIVATE | MAP_ANON | MAP_NOCORE | MAP_ALIGNED_SUPER;
+ base = mmap(NULL, len2, PROT_NONE, flags, -1, 0);
+ if (base == MAP_FAILED)
+ goto done;
+
+ flags = MAP_SHARED | MAP_FIXED;
+ if ((ctx->memflags & VM_MEM_F_INCORE) == 0)
+ flags |= MAP_NOCORE;
+
+ /* mmap the devmem region in the host address space */
+ ptr = mmap(base + VM_MMAP_GUARD_SIZE, len, PROT_RW, flags, fd, 0);
+done:
+ if (fd >= 0)
+ close(fd);
+ return (ptr);
+}
+
int
vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
uint64_t base, uint32_t limit, uint32_t access)
diff --git a/lib/libvmmapi/vmmapi.h b/lib/libvmmapi/vmmapi.h
index d3ecdc4..57f8c56 100644
--- a/lib/libvmmapi/vmmapi.h
+++ b/lib/libvmmapi/vmmapi.h
@@ -36,7 +36,7 @@
* API version for out-of-tree consumers like grub-bhyve for making compile
* time decisions.
*/
-#define VMMAPI_VERSION 0101 /* 2 digit major followed by 2 digit minor */
+#define VMMAPI_VERSION 0102 /* 2 digit major followed by 2 digit minor */
struct iovec;
struct vmctx;
@@ -52,14 +52,59 @@ enum vm_mmap_style {
VM_MMAP_SPARSE, /* mappings created on-demand */
};
+/*
+ * 'flags' value passed to 'vm_set_memflags()'.
+ */
#define VM_MEM_F_INCORE 0x01 /* include guest memory in core file */
+#define VM_MEM_F_WIRED 0x02 /* guest memory is wired */
+
+/*
+ * Identifiers for memory segments:
+ * - vm_setup_memory() uses VM_SYSMEM for the system memory segment.
+ * - the remaining identifiers can be used to create devmem segments.
+ */
+enum {
+ VM_SYSMEM,
+ VM_BOOTROM,
+ VM_FRAMEBUFFER,
+};
+
+/*
+ * Get the length and name of the memory segment identified by 'segid'.
+ * Note that system memory segments are identified with a nul name.
+ *
+ * Returns 0 on success and non-zero otherwise.
+ */
+int vm_get_memseg(struct vmctx *ctx, int ident, size_t *lenp, char *name,
+ size_t namesiz);
+
+/*
+ * Iterate over the guest address space. This function finds an address range
+ * that starts at an address >= *gpa.
+ *
+ * Returns 0 if the next address range was found and non-zero otherwise.
+ */
+int vm_mmap_getnext(struct vmctx *ctx, vm_paddr_t *gpa, int *segid,
+ vm_ooffset_t *segoff, size_t *len, int *prot, int *flags);
+/*
+ * Create a device memory segment identified by 'segid'.
+ *
+ * Returns a pointer to the memory segment on success and MAP_FAILED otherwise.
+ */
+void *vm_create_devmem(struct vmctx *ctx, int segid, const char *name,
+ size_t len);
+
+/*
+ * Map the memory segment identified by 'segid' into the guest address space
+ * at [gpa,gpa+len) with protection 'prot'.
+ */
+int vm_mmap_memseg(struct vmctx *ctx, vm_paddr_t gpa, int segid,
+ vm_ooffset_t segoff, size_t len, int prot);
int vm_create(const char *name);
struct vmctx *vm_open(const char *name);
void vm_destroy(struct vmctx *ctx);
int vm_parse_memsize(const char *optarg, size_t *memsize);
-int vm_get_memory_seg(struct vmctx *ctx, vm_paddr_t gpa, size_t *ret_len,
- int *wired);
int vm_setup_memory(struct vmctx *ctx, size_t len, enum vm_mmap_style s);
void *vm_map_gpa(struct vmctx *ctx, vm_paddr_t gaddr, size_t len);
int vm_get_gpa_pmap(struct vmctx *, uint64_t gpa, uint64_t *pte, int *num);
@@ -68,6 +113,7 @@ int vm_gla2gpa(struct vmctx *, int vcpuid, struct vm_guest_paging *paging,
uint32_t vm_get_lowmem_limit(struct vmctx *ctx);
void vm_set_lowmem_limit(struct vmctx *ctx, uint32_t limit);
void vm_set_memflags(struct vmctx *ctx, int flags);
+int vm_get_memflags(struct vmctx *ctx);
size_t vm_get_lowmem_size(struct vmctx *ctx);
size_t vm_get_highmem_size(struct vmctx *ctx);
int vm_set_desc(struct vmctx *ctx, int vcpu, int reg,
OpenPOWER on IntegriCloud