diff options
author | Timothy Pearson <tpearson@raptorengineering.com> | 2019-05-11 15:12:49 -0500 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineering.com> | 2019-05-11 15:12:49 -0500 |
commit | 9e80202352dd49bdd9e67b8b906d86f058431505 (patch) | |
tree | 5673c17aad6e3833da8c4ff21b5a11f666ec9fbe /src/docs | |
download | hqemu-9e80202352dd49bdd9e67b8b906d86f058431505.zip hqemu-9e80202352dd49bdd9e67b8b906d86f058431505.tar.gz |
Diffstat (limited to 'src/docs')
60 files changed, 13151 insertions, 0 deletions
diff --git a/src/docs/aio_notify.promela b/src/docs/aio_notify.promela new file mode 100644 index 0000000..fccc7ee --- /dev/null +++ b/src/docs/aio_notify.promela @@ -0,0 +1,93 @@ +/* + * This model describes the interaction between ctx->notify_me + * and aio_notify(). + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To simulate it: + * spin -p docs/aio_notify.promela + * + * To verify it: + * spin -a docs/aio_notify.promela + * gcc -O2 pan.c + * ./a.out -a + * + * To verify it (with a bug planted in the model): + * spin -a -DBUG docs/aio_notify.promela + * gcc -O2 pan.c + * ./a.out -a + */ + +#define MAX 4 +#define LAST (1 << (MAX - 1)) +#define FINAL ((LAST << 1) - 1) + +bool notify_me; +bool event; + +int req; +int done; + +active proctype waiter() +{ + int fetch; + + do + :: true -> { + notify_me++; + + if +#ifndef BUG + :: (req > 0) -> skip; +#endif + :: else -> + // Wait for a nudge from the other side + do + :: event == 1 -> { event = 0; break; } + od; + fi; + + notify_me--; + + atomic { fetch = req; req = 0; } + done = done | fetch; + } + od +} + +active proctype notifier() +{ + int next = 1; + + do + :: next <= LAST -> { + // generate a request + req = req | next; + next = next << 1; + + // aio_notify + if + :: notify_me == 1 -> event = 1; + :: else -> printf("Skipped event_notifier_set\n"); skip; + fi; + + // Test both synchronous and asynchronous delivery + if + :: 1 -> do + :: req == 0 -> break; + od; + :: 1 -> skip; + fi; + } + od; +} + +never { /* [] done < FINAL */ +accept_init: + do + :: done < FINAL -> skip; + od; +} diff --git a/src/docs/aio_notify_accept.promela b/src/docs/aio_notify_accept.promela new file mode 100644 index 0000000..9cef2c9 --- /dev/null +++ b/src/docs/aio_notify_accept.promela @@ -0,0 +1,152 @@ +/* + * This model describes the interaction between ctx->notified + * and ctx->notifier. 
+ * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify the buggy version: + * spin -a -DBUG1 docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * (or -DBUG2) + * + * To verify the fixed version: + * spin -a docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * Add -DCHECK_REQ to test an alternative invariant and the + * "notify_me" optimization. + */ + +int notify_me; +bool notified; +bool event; +bool req; +bool notifier_done; + +#ifdef CHECK_REQ +#define USE_NOTIFY_ME 1 +#else +#define USE_NOTIFY_ME 0 +#endif + +#ifdef BUG +#error Please define BUG1 or BUG2 instead. +#endif + +active proctype notifier() +{ + do + :: true -> { + req = 1; + if + :: !USE_NOTIFY_ME || notify_me -> +#if defined BUG1 + /* CHECK_REQ does not detect this bug! */ + notified = 1; + event = 1; +#elif defined BUG2 + if + :: !notified -> event = 1; + :: else -> skip; + fi; + notified = 1; +#else + event = 1; + notified = 1; +#endif + :: else -> skip; + fi + } + :: true -> break; + od; + notifier_done = 1; +} + +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + atomic { old = notified; notified = 0; } \ + if \ + :: old -> event = 0; \ + :: else -> skip; \ + fi; \ + \ + req = 0; + +active proctype waiter() +{ + bool old; + + do + :: true -> AIO_POLL; + od; +} + +/* Same as waiter(), but disappears after a while. 
*/ +active proctype temporary_waiter() +{ + bool old; + + do + :: true -> AIO_POLL; + :: true -> break; + od; +} + +#ifdef CHECK_REQ +never { + do + :: req -> goto accept_if_req_not_eventually_false; + :: true -> skip; + od; + +accept_if_req_not_eventually_false: + if + :: req -> goto accept_if_req_not_eventually_false; + fi; + assert(0); +} + +#else +/* There must be infinitely many transitions of event as long + * as the notifier does not exit. + * + * If event stayed always true, the waiters would be busy looping. + * If event stayed always false, the waiters would be sleeping + * forever. + */ +never { + do + :: !event -> goto accept_if_event_not_eventually_true; + :: event -> goto accept_if_event_not_eventually_false; + :: true -> skip; + od; + +accept_if_event_not_eventually_true: + if + :: !event && notifier_done -> do :: true -> skip; od; + :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; + fi; + assert(0); + +accept_if_event_not_eventually_false: + if + :: event -> goto accept_if_event_not_eventually_false; + fi; + assert(0); +} +#endif diff --git a/src/docs/aio_notify_bug.promela b/src/docs/aio_notify_bug.promela new file mode 100644 index 0000000..b3bfca1 --- /dev/null +++ b/src/docs/aio_notify_bug.promela @@ -0,0 +1,140 @@ +/* + * This model describes a bug in aio_notify. If ctx->notifier is + * cleared too late, a wakeup could be lost. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do. + * + * To verify the buggy version: + * spin -a -DBUG docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * To verify the fixed version: + * spin -a docs/aio_notify_bug.promela + * gcc -O2 pan.c + * ./a.out -a -f + * + * Add -DCHECK_REQ to test an alternative invariant and the + * "notify_me" optimization. 
+ */ + +int notify_me; +bool event; +bool req; +bool notifier_done; + +#ifdef CHECK_REQ +#define USE_NOTIFY_ME 1 +#else +#define USE_NOTIFY_ME 0 +#endif + +active proctype notifier() +{ + do + :: true -> { + req = 1; + if + :: !USE_NOTIFY_ME || notify_me -> event = 1; + :: else -> skip; + fi + } + :: true -> break; + od; + notifier_done = 1; +} + +#ifdef BUG +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + req = 0; \ + event = 0; +#else +#define AIO_POLL \ + notify_me++; \ + if \ + :: !req -> { \ + if \ + :: event -> skip; \ + fi; \ + } \ + :: else -> skip; \ + fi; \ + notify_me--; \ + \ + event = 0; \ + req = 0; +#endif + +active proctype waiter() +{ + do + :: true -> AIO_POLL; + od; +} + +/* Same as waiter(), but disappears after a while. */ +active proctype temporary_waiter() +{ + do + :: true -> AIO_POLL; + :: true -> break; + od; +} + +#ifdef CHECK_REQ +never { + do + :: req -> goto accept_if_req_not_eventually_false; + :: true -> skip; + od; + +accept_if_req_not_eventually_false: + if + :: req -> goto accept_if_req_not_eventually_false; + fi; + assert(0); +} + +#else +/* There must be infinitely many transitions of event as long + * as the notifier does not exit. + * + * If event stayed always true, the waiters would be busy looping. + * If event stayed always false, the waiters would be sleeping + * forever. 
+ */ +never { + do + :: !event -> goto accept_if_event_not_eventually_true; + :: event -> goto accept_if_event_not_eventually_false; + :: true -> skip; + od; + +accept_if_event_not_eventually_true: + if + :: !event && notifier_done -> do :: true -> skip; od; + :: !event && !notifier_done -> goto accept_if_event_not_eventually_true; + fi; + assert(0); + +accept_if_event_not_eventually_false: + if + :: event -> goto accept_if_event_not_eventually_false; + fi; + assert(0); +} +#endif diff --git a/src/docs/atomics.txt b/src/docs/atomics.txt new file mode 100644 index 0000000..ef285e3 --- /dev/null +++ b/src/docs/atomics.txt @@ -0,0 +1,352 @@ +CPUs perform independent memory operations effectively in random order, +but this can be a problem for CPU-CPU interaction (including interactions +between QEMU and the guest). Multi-threaded programs use various tools +to instruct the compiler and the CPU to restrict the order to something +that is consistent with the expectations of the programmer. + +The most basic tool is locking. Mutexes, condition variables and +semaphores are used in QEMU, and should be the default approach to +synchronization. Anything else is considerably harder, but it's +also justified more often than one would like. The two tools that +are provided by qemu/atomic.h are memory barriers and atomic operations. + +Macros defined by qemu/atomic.h fall in three camps: + +- compiler barriers: barrier(); + +- weak atomic access and manual memory barriers: atomic_read(), + atomic_set(), smp_rmb(), smp_wmb(), smp_mb(), smp_read_barrier_depends(); + +- sequentially consistent atomic access: everything else. + + +COMPILER MEMORY BARRIER +======================= + +barrier() prevents the compiler from moving the memory accesses on either +side of it to the other side. The compiler barrier has no direct effect +on the CPU, which may then reorder things however it wishes. + +barrier() is mostly used within qemu/atomic.h itself. 
On some +architectures, CPU guarantees are strong enough that blocking compiler +optimizations already ensures the correct order of execution. In this +case, qemu/atomic.h will reduce stronger memory barriers to simple +compiler barriers. + +Still, barrier() can be useful when writing code that can be interrupted +by signal handlers. + + +SEQUENTIALLY CONSISTENT ATOMIC ACCESS +===================================== + +Most of the operations in the qemu/atomic.h header ensure *sequential +consistency*, where "the result of any execution is the same as if the +operations of all the processors were executed in some sequential order, +and the operations of each individual processor appear in this sequence +in the order specified by its program". + +qemu/atomic.h provides the following set of atomic read-modify-write +operations: + + void atomic_inc(ptr) + void atomic_dec(ptr) + void atomic_add(ptr, val) + void atomic_sub(ptr, val) + void atomic_and(ptr, val) + void atomic_or(ptr, val) + + typeof(*ptr) atomic_fetch_inc(ptr) + typeof(*ptr) atomic_fetch_dec(ptr) + typeof(*ptr) atomic_fetch_add(ptr, val) + typeof(*ptr) atomic_fetch_sub(ptr, val) + typeof(*ptr) atomic_fetch_and(ptr, val) + typeof(*ptr) atomic_fetch_or(ptr, val) + typeof(*ptr) atomic_xchg(ptr, val) + typeof(*ptr) atomic_cmpxchg(ptr, old, new) + +all of which return the old value of *ptr. These operations are +polymorphic; they operate on any type that is as wide as an int. + +Sequentially consistent loads and stores can be done using: + + atomic_fetch_add(ptr, 0) for loads + atomic_xchg(ptr, val) for stores + +However, they are quite expensive on some platforms, notably POWER and +ARM. Therefore, qemu/atomic.h provides two primitives with slightly +weaker constraints: + + typeof(*ptr) atomic_mb_read(ptr) + void atomic_mb_set(ptr, val) + +The semantics of these primitives map to Java volatile variables, +and are strongly related to memory barriers as used in the Linux +kernel (see below). 
+ +As long as you use atomic_mb_read and atomic_mb_set, accesses cannot +be reordered with each other, and it is also not possible to reorder +"normal" accesses around them. + +However, and this is the important difference between +atomic_mb_read/atomic_mb_set and sequential consistency, it is important +for both threads to access the same volatile variable. It is not the +case that everything visible to thread A when it writes volatile field f +becomes visible to thread B after it reads volatile field g. The store +and load have to "match" (i.e., be performed on the same volatile +field) to achieve the right semantics. + + +These operations operate on any type that is as wide as an int or smaller. + + +WEAK ATOMIC ACCESS AND MANUAL MEMORY BARRIERS +============================================= + +Compared to sequentially consistent atomic access, programming with +weaker consistency models can be considerably more complicated. +In general, if the algorithm you are writing includes both writes +and reads on the same side, it is generally simpler to use sequentially +consistent primitives. + +When using this model, variables are accessed with atomic_read() and +atomic_set(), and restrictions on the ordering of accesses are enforced +using the smp_rmb(), smp_wmb(), smp_mb() and smp_read_barrier_depends() +memory barriers. + +atomic_read() and atomic_set() prevent the compiler from using +optimizations that might otherwise optimize accesses out of existence +on the one hand, or that might create unsolicited accesses on the other. +In general this should not have any effect, because the same compiler +barriers are already implied by memory barriers. However, it is useful +to do so, because it tells readers which variables are shared with +other threads, and which are local to the current thread or protected +by other, more mundane means. + +Memory barriers control the order of references to shared memory. 
+They come in four kinds: + +- smp_rmb() guarantees that all the LOAD operations specified before + the barrier will appear to happen before all the LOAD operations + specified after the barrier with respect to the other components of + the system. + + In other words, smp_rmb() puts a partial ordering on loads, but is not + required to have any effect on stores. + +- smp_wmb() guarantees that all the STORE operations specified before + the barrier will appear to happen before all the STORE operations + specified after the barrier with respect to the other components of + the system. + + In other words, smp_wmb() puts a partial ordering on stores, but is not + required to have any effect on loads. + +- smp_mb() guarantees that all the LOAD and STORE operations specified + before the barrier will appear to happen before all the LOAD and + STORE operations specified after the barrier with respect to the other + components of the system. + + smp_mb() puts a partial ordering on both loads and stores. It is + stronger than both a read and a write memory barrier; it implies both + smp_rmb() and smp_wmb(), but it also prevents STOREs coming before the + barrier from overtaking LOADs coming after the barrier and vice versa. + +- smp_read_barrier_depends() is a weaker kind of read barrier. On + most processors, whenever two loads are performed such that the + second depends on the result of the first (e.g., the first load + retrieves the address to which the second load will be directed), + the processor will guarantee that the first LOAD will appear to happen + before the second with respect to the other components of the system. + However, this is not always true---for example, it was not true on + Alpha processors. Whenever this kind of access happens to shared + memory (that is not protected by a lock), a read barrier is needed, + and smp_read_barrier_depends() can be used instead of smp_rmb(). 
+ + Note that the first load really has to have a _data_ dependency and not + a control dependency. If the address for the second load is dependent + on the first load, but the dependency is through a conditional rather + than actually loading the address itself, then it's a _control_ + dependency and a full read barrier or better is required. + + +This is the set of barriers that is required *between* two atomic_read() +and atomic_set() operations to achieve sequential consistency: + + | 2nd operation | + |-----------------------------------------| + 1st operation | (after last) | atomic_read | atomic_set | + ---------------+--------------+-------------+------------| + (before first) | | none | smp_wmb() | + ---------------+--------------+-------------+------------| + atomic_read | smp_rmb() | smp_rmb()* | ** | + ---------------+--------------+-------------+------------| + atomic_set | none | smp_mb()*** | smp_wmb() | + ---------------+--------------+-------------+------------| + + * Or smp_read_barrier_depends(). + + ** This requires a load-store barrier. How to achieve this varies + depending on the machine, but in practice smp_rmb()+smp_wmb() + should have the desired effect. For example, on PowerPC the + lwsync instruction is a combined load-load, load-store and + store-store barrier. + + *** This requires a store-load barrier. On most machines, the only + way to achieve this is a full barrier. + + +You can see that the two possible definitions of atomic_mb_read() +and atomic_mb_set() are the following: + + 1) atomic_mb_read(p) = atomic_read(p); smp_rmb() + atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); smp_mb() + + 2) atomic_mb_read(p) = smp_mb(); atomic_read(p); smp_rmb() + atomic_mb_set(p, v) = smp_wmb(); atomic_set(p, v); + +Usually the former is used, because smp_mb() is expensive and a program +normally has more reads than writes. Therefore it makes more sense to +make atomic_mb_set() the more expensive operation. 
+ +There are two common cases in which atomic_mb_read and atomic_mb_set +generate too many memory barriers, and thus it can be useful to manually +place barriers instead: + +- when a data structure has one thread that is always a writer + and one thread that is always a reader, manual placement of + memory barriers makes the write side faster. Furthermore, + correctness is easy to check for in this case using the "pairing" + trick that is explained below: + + thread 1 thread 1 + ------------------------- ------------------------ + (other writes) + smp_wmb() + atomic_mb_set(&a, x) atomic_set(&a, x) + smp_wmb() + atomic_mb_set(&b, y) atomic_set(&b, y) + + => + thread 2 thread 2 + ------------------------- ------------------------ + y = atomic_mb_read(&b) y = atomic_read(&b) + smp_rmb() + x = atomic_mb_read(&a) x = atomic_read(&a) + smp_rmb() + +- sometimes, a thread is accessing many variables that are otherwise + unrelated to each other (for example because, apart from the current + thread, exactly one other thread will read or write each of these + variables). In this case, it is possible to "hoist" the implicit + barriers provided by atomic_mb_read() and atomic_mb_set() outside + a loop. For example, the above definition atomic_mb_read() gives + the following transformation: + + n = 0; n = 0; + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) + n += atomic_mb_read(&a[i]); n += atomic_read(&a[i]); + smp_rmb(); + + Similarly, atomic_mb_set() can be transformed as follows: + smp_mb(): + + smp_wmb(); + for (i = 0; i < 10; i++) => for (i = 0; i < 10; i++) + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); + smp_mb(); + + +The two tricks can be combined. 
In this case, splitting a loop in +two lets you hoist the barriers out of the loops _and_ eliminate the +expensive smp_mb(): + + smp_wmb(); + for (i = 0; i < 10; i++) { => for (i = 0; i < 10; i++) + atomic_mb_set(&a[i], false); atomic_set(&a[i], false); + atomic_mb_set(&b[i], false); smp_wmb(); + } for (i = 0; i < 10; i++) + atomic_set(&b[i], false); + smp_mb(); + + The other thread can still use atomic_mb_read()/atomic_mb_set(). + + +Memory barrier pairing +---------------------- + +A useful rule of thumb is that memory barriers should always, or almost +always, be paired with another barrier. In the case of QEMU, however, +note that the other barrier may actually be in a driver that runs in +the guest! + +For the purposes of pairing, smp_read_barrier_depends() and smp_rmb() +both count as read barriers. A read barrier shall pair with a write +barrier or a full barrier; a write barrier shall pair with a read +barrier or a full barrier. A full barrier can pair with anything. +For example: + + thread 1 thread 2 + =============== =============== + a = 1; + smp_wmb(); + b = 2; x = b; + smp_rmb(); + y = a; + +Note that the "writing" thread is accessing the variables in the +opposite order as the "reading" thread. This is expected: stores +before the write barrier will normally match the loads after the +read barrier, and vice versa. The same is true for more than 2 +accesses and for data dependency barriers: + + thread 1 thread 2 + =============== =============== + b[2] = 1; + smp_wmb(); + x->i = 2; + smp_wmb(); + a = x; x = a; + smp_read_barrier_depends(); + y = x->i; + smp_read_barrier_depends(); + z = b[y]; + +smp_wmb() also pairs with atomic_mb_read(), and smp_rmb() also pairs +with atomic_mb_set(). 
+ + +COMPARISON WITH LINUX KERNEL MEMORY BARRIERS +============================================ + +Here is a list of differences between Linux kernel atomic operations +and memory barriers, and the equivalents in QEMU: + +- atomic operations in Linux are always on a 32-bit int type and + use a boxed atomic_t type; atomic operations in QEMU are polymorphic + and use normal C types. + +- atomic_read and atomic_set in Linux give no guarantee at all; + atomic_read and atomic_set in QEMU include a compiler barrier + (similar to the ACCESS_ONCE macro in Linux). + +- most atomic read-modify-write operations in Linux return void; + in QEMU, all of them return the old value of the variable. + +- different atomic read-modify-write operations in Linux imply + a different set of memory barriers; in QEMU, all of them enforce + sequential consistency, which means they imply full memory barriers + before and after the operation. + +- Linux does not have an equivalent of atomic_mb_read() and + atomic_mb_set(). In particular, note that set_mb() is a little + weaker than atomic_mb_set(). + + +SOURCES +======= + +* Documentation/memory-barriers.txt from the Linux kernel + +* "The JSR-133 Cookbook for Compiler Writers", available at + http://g.oswego.edu/dl/jmm/cookbook.html diff --git a/src/docs/bitmaps.md b/src/docs/bitmaps.md new file mode 100644 index 0000000..a2e8d51 --- /dev/null +++ b/src/docs/bitmaps.md @@ -0,0 +1,505 @@ +<!-- +Copyright 2015 John Snow <jsnow@redhat.com> and Red Hat, Inc. +All rights reserved. + +This file is licensed via The FreeBSD Documentation License, the full text of +which is included at the end of this document. +--> + +# Dirty Bitmaps and Incremental Backup + +* Dirty Bitmaps are objects that track which data needs to be backed up for the + next incremental backup. + +* Dirty bitmaps can be created at any time and attached to any node + (not just complete drives.) 
+ +## Dirty Bitmap Names + +* A dirty bitmap's name is unique to the node, but bitmaps attached to different + nodes can share the same name. + +* Dirty bitmaps created for internal use by QEMU may be anonymous and have no + name, but any user-created bitmaps may not be. There can be any number of + anonymous bitmaps per node. + +* The name of a user-created bitmap must not be empty (""). + +## Bitmap Modes + +* A Bitmap can be "frozen," which means that it is currently in-use by a backup + operation and cannot be deleted, renamed, written to, reset, + etc. + +* The normal operating mode for a bitmap is "active." + +## Basic QMP Usage + +### Supported Commands ### + +* block-dirty-bitmap-add +* block-dirty-bitmap-remove +* block-dirty-bitmap-clear + +### Creation + +* To create a new bitmap, enabled, on the drive with id=drive0: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +* This bitmap will have a default granularity that matches the cluster size of + its associated drive, if available, clamped to between [4KiB, 64KiB]. + The current default for qcow2 is 64KiB. + +* To create a new bitmap that tracks changes in 32KiB segments: + +```json +{ "execute": "block-dirty-bitmap-add", + "arguments": { + "node": "drive0", + "name": "bitmap0", + "granularity": 32768 + } +} +``` + +### Deletion + +* Bitmaps that are frozen cannot be deleted. + +* Deleting the bitmap does not impact any other bitmaps attached to the same + node, nor does it affect any backups already created from this node. + +* Because bitmaps are only unique to the node to which they are attached, + you must specify the node/drive name here, too. + +```json +{ "execute": "block-dirty-bitmap-remove", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +### Resetting + +* Resetting a bitmap will clear all information it holds. 
+ +* An incremental backup created from an empty bitmap will copy no data, + as if nothing has changed. + +```json +{ "execute": "block-dirty-bitmap-clear", + "arguments": { + "node": "drive0", + "name": "bitmap0" + } +} +``` + +## Transactions + +### Justification + +Bitmaps can be safely modified when the VM is paused or halted by using +the basic QMP commands. For instance, you might perform the following actions: + +1. Boot the VM in a paused state. +2. Create a full drive backup of drive0. +3. Create a new bitmap attached to drive0. +4. Resume execution of the VM. +5. Incremental backups are ready to be created. + +At this point, the bitmap and drive backup would be correctly in sync, +and incremental backups made from this point forward would be correctly aligned +to the full drive backup. + +This is not particularly useful if we decide we want to start incremental +backups after the VM has been running for a while, for which we will need to +perform actions such as the following: + +1. Boot the VM and begin execution. +2. Using a single transaction, perform the following operations: + * Create bitmap0. + * Create a full drive backup of drive0. +3. Incremental backups are now ready to be created. + +### Supported Bitmap Transactions + +* block-dirty-bitmap-add +* block-dirty-bitmap-clear + +The usages are identical to their respective QMP commands, but see below +for examples. + +### Example: New Incremental Backup + +As outlined in the justification, perhaps we want to create a new incremental +backup chain attached to a drive. 
+ +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-add", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +### Example: New Incremental Backup Anchor Point + +Maybe we just want to create a new full backup with an existing bitmap and +want to reset the bitmap to track the new chain. + +```json +{ "execute": "transaction", + "arguments": { + "actions": [ + {"type": "block-dirty-bitmap-clear", + "data": {"node": "drive0", "name": "bitmap0"} }, + {"type": "drive-backup", + "data": {"device": "drive0", "target": "/path/to/new_full_backup.img", + "sync": "full", "format": "qcow2"} } + ] + } +} +``` + +## Incremental Backups + +The star of the show. + +**Nota Bene!** Only incremental backups of entire drives are supported for now. +So despite the fact that you can attach a bitmap to any arbitrary node, they are +only currently useful when attached to the root node. This is because +drive-backup only supports drives/devices instead of arbitrary nodes. + +### Example: First Incremental Backup + +1. Create a full backup and sync it to the dirty bitmap, as in the transactional +examples above; or with the VM offline, manually create a full copy and then +create a new bitmap before the VM begins execution. + + * Let's assume the full backup is named 'full_backup.img'. + * Let's assume the bitmap you created is 'bitmap0' attached to 'drive0'. + +2. Create a destination image for the incremental backup that utilizes the +full backup as a backing image. + + * Let's assume it is named 'incremental.0.img'. + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +3. 
Issue the incremental backup command: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +### Example: Second Incremental Backup + +1. Create a new destination image for the incremental backup that points to the + previous one, e.g.: 'incremental.1.img' + + ```sh + # qemu-img create -f qcow2 incremental.1.img -b incremental.0.img -F qcow2 + ``` + +2. Issue a new incremental backup command. The only difference here is that we + have changed the target image below. + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.1.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +## Errors + +* In the event of an error that occurs after a backup job is successfully + launched, either by a direct QMP command or a QMP transaction, the user + will receive a BLOCK_JOB_COMPLETED event with a failure message, accompanied + by a BLOCK_JOB_ERROR event. + +* In the case of a job being cancelled, the user will receive a + BLOCK_JOB_CANCELLED event instead of a pair of COMPLETED and ERROR events. + +* In either case, the incremental backup data contained within the bitmap is + safely rolled back, and the data within the bitmap is not lost. The image + file created for the failed attempt can be safely deleted. + +* Once the underlying problem is fixed (e.g. more storage space is freed up), + you can simply retry the incremental backup command with the same bitmap. + +### Example + +1. Create a target image: + + ```sh + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +2. 
Attempt to create an incremental backup via QMP: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +3. Receive an event notifying us of failure: + + ```json + { "timestamp": { "seconds": 1424709442, "microseconds": 844524 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "No space left on device", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +4. Delete the failed incremental, and re-create the image. + + ```sh + # rm incremental.0.img + # qemu-img create -f qcow2 incremental.0.img -b full_backup.img -F qcow2 + ``` + +5. Retry the command after fixing the underlying problem, + such as freeing up space on the backup volume: + + ```json + { "execute": "drive-backup", + "arguments": { + "device": "drive0", + "bitmap": "bitmap0", + "target": "incremental.0.img", + "format": "qcow2", + "sync": "incremental", + "mode": "existing" + } + } + ``` + +6. Receive confirmation that the job completed successfully: + + ```json + { "timestamp": { "seconds": 1424709668, "microseconds": 526525 }, + "data": { "device": "drive1", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864}, + "event": "BLOCK_JOB_COMPLETED" } + ``` + +### Partial Transactional Failures + +* Sometimes, a transaction will succeed in launching and return success, + but then later the backup jobs themselves may fail. It is possible that + a management application may have to deal with a partial backup failure + after a successful transaction. + +* If multiple backup jobs are specified in a single transaction, when one of + them fails, it will not interact with the other backup jobs in any way. + +* The job(s) that succeeded will clear the dirty bitmap associated with the + operation, but the job(s) that failed will not. 
It is not "safe" to delete + any incremental backups that were created successfully in this scenario, + even though others failed. + +#### Example + +* QMP example highlighting two backup jobs: + + ```json + { "execute": "transaction", + "arguments": { + "actions": [ + { "type": "drive-backup", + "data": { "device": "drive0", "bitmap": "bitmap0", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d0-incr-1.qcow2" } }, + { "type": "drive-backup", + "data": { "device": "drive1", "bitmap": "bitmap1", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d1-incr-1.qcow2" } }, + ] + } + } + ``` + +* QMP example response, highlighting one success and one failure: + * Acknowledgement that the Transaction was accepted and jobs were launched: + ```json + { "return": {} } + ``` + + * Later, QEMU sends notice that the first job was completed: + ```json + { "timestamp": { "seconds": 1447192343, "microseconds": 615698 }, + "data": { "device": "drive0", "type": "backup", + "speed": 0, "len": 67108864, "offset": 67108864 }, + "event": "BLOCK_JOB_COMPLETED" + } + ``` + + * Later yet, QEMU sends notice that the second job has failed: + ```json + { "timestamp": { "seconds": 1447192399, "microseconds": 683015 }, + "data": { "device": "drive1", "action": "report", + "operation": "read" }, + "event": "BLOCK_JOB_ERROR" } + ``` + + ```json + { "timestamp": { "seconds": 1447192399, "microseconds": 685853 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "Input/output error", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + +* In the above example, "d0-incr-1.qcow2" is valid and must be kept, + but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide + incremental backup of all drives at a point-in-time is to be made, + new backups for both drives will need to be made, taking into account + that a new incremental backup for drive0 needs to be based on top of + "d0-incr-1.qcow2." 
+ +### Grouped Completion Mode + +* While jobs launched by transactions normally complete or fail on their own, + it is possible to instruct them to complete or fail together as a group. + +* QMP transactions take an optional properties structure that can affect + the semantics of the transaction. + +* The "completion-mode" transaction property can be either "individual" + which is the default, legacy behavior described above, or "grouped," + a new behavior detailed below. + +* Delayed Completion: In grouped completion mode, no jobs will report + success until all jobs are ready to report success. + +* Grouped failure: If any job fails in grouped completion mode, all remaining + jobs will be cancelled. Any incremental backups will restore their dirty + bitmap objects as if no backup command was ever issued. + + * Regardless of whether QEMU reports a particular incremental backup job as + CANCELLED or as an ERROR, the in-memory bitmap will be restored. + +#### Example + +* Here's the same example scenario from above with the new property: + + ```json + { "execute": "transaction", + "arguments": { + "actions": [ + { "type": "drive-backup", + "data": { "device": "drive0", "bitmap": "bitmap0", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d0-incr-1.qcow2" } }, + { "type": "drive-backup", + "data": { "device": "drive1", "bitmap": "bitmap1", + "format": "qcow2", "mode": "existing", + "sync": "incremental", "target": "d1-incr-1.qcow2" } } + ], + "properties": { + "completion-mode": "grouped" + } + } + } + ``` + +* QMP example response, highlighting a failure for drive1: + * Acknowledgement that the Transaction was accepted and jobs were launched: + ```json + { "return": {} } + ``` + + * Later, QEMU sends notice that the second job has errored out, + but that the first job was also cancelled: + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 632377 }, + "data": { "device": "drive1", "action": "report", + "operation": "read" }, + 
"event": "BLOCK_JOB_ERROR" } + ``` + + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 640074 }, + "data": { "speed": 0, "offset": 0, "len": 67108864, + "error": "Input/output error", + "device": "drive1", "type": "backup" }, + "event": "BLOCK_JOB_COMPLETED" } + ``` + + ```json + { "timestamp": { "seconds": 1447193702, "microseconds": 640163 }, + "data": { "device": "drive0", "type": "backup", "speed": 0, + "len": 67108864, "offset": 16777216 }, + "event": "BLOCK_JOB_CANCELLED" } + ``` + +<!-- +The FreeBSD Documentation License + +Redistribution and use in source (Markdown) and 'compiled' forms (SGML, HTML, +PDF, PostScript, RTF and so forth) with or without modification, are permitted +provided that the following conditions are met: + +Redistributions of source code (Markdown) must retain the above copyright +notice, this list of conditions and the following disclaimer of this file +unmodified. + +Redistributions in compiled form (transformed to other DTDs, converted to PDF, +PostScript, RTF and other formats) must reproduce the above copyright notice, +this list of conditions and the following disclaimer in the documentation and/or +other materials provided with the distribution. + +THIS DOCUMENTATION IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF +THIS DOCUMENTATION, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+--> diff --git a/src/docs/blkdebug.txt b/src/docs/blkdebug.txt new file mode 100644 index 0000000..b67a36d --- /dev/null +++ b/src/docs/blkdebug.txt @@ -0,0 +1,161 @@ +Block I/O error injection using blkdebug +---------------------------------------- +Copyright (C) 2014 Red Hat Inc + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + +The blkdebug block driver is a rule-based error injection engine. It can be +used to exercise error code paths in block drivers including ENOSPC (out of +space) and EIO. + +This document gives an overview of the features available in blkdebug. + +Background +---------- +Block drivers have many error code paths that handle I/O errors. Image formats +are especially complex since metadata I/O errors during cluster allocation or +while updating tables happen halfway through request processing and require +discipline to keep image files consistent. + +Error injection allows test cases to trigger I/O errors at specific points. +This way, all error paths can be tested to make sure they are correct. + +Rules +----- +The blkdebug block driver takes a list of "rules" that tell the error injection +engine when to fail an I/O request. + +Each I/O request is evaluated against the rules. If a rule matches the request +then its "action" is executed. + +Rules can be placed in a configuration file; the configuration file +follows the same .ini-like format used by QEMU's -readconfig option, and +each section of the file represents a rule. + +The following configuration file defines a single rule: + + $ cat blkdebug.conf + [inject-error] + event = "read_aio" + errno = "28" + +This rule fails all aio read requests with ENOSPC (28). Note that the errno +value depends on the host. On Linux, see +/usr/include/asm-generic/errno-base.h for errno values. 
+ +Invoke QEMU as follows: + + $ qemu-system-x86_64 \ + -drive if=none,cache=none,file=blkdebug:blkdebug.conf:test.img,id=drive0 \ + -device virtio-blk-pci,drive=drive0,id=virtio-blk-pci0 + +Rules support the following attributes: + + event - which type of operation to match (e.g. read_aio, write_aio, + flush_to_os, flush_to_disk). See the "Events" section for + information on events. + + state - (optional) the engine must be in this state number in order for this + rule to match. See the "State transitions" section for information + on states. + + errno - the numeric errno value to return when a request matches this rule. + The errno values depend on the host since the numeric values are not + standardized in the POSIX specification. + + sector - (optional) a sector number that the request must overlap in order to + match this rule + + once - (optional, default "off") only execute this action on the first + matching request + + immediately - (optional, default "off") return a NULL BlockAIOCB + pointer and fail without an errno instead. This + exercises the code path where BlockAIOCB fails and the + caller's BlockCompletionFunc is not invoked. + +Events +------ +Block drivers provide information about the type of I/O request they are about +to make so rules can match specific types of requests. For example, the qcow2 +block driver tells blkdebug when it accesses the L1 table so rules can match +only L1 table accesses and not other metadata or guest data requests. + +The core events are: + + read_aio - guest data read + + write_aio - guest data write + + flush_to_os - write out unwritten block driver state (e.g. cached metadata) + + flush_to_disk - flush the host block device's disk cache + +See block/blkdebug.c:event_names[] for the full list of events. You may need +to grep block driver source code to understand the meaning of specific events. 
+ +State transitions +----------------- +There are cases where more power is needed to match a particular I/O request in +a longer sequence of requests. For example: + + write_aio + flush_to_disk + write_aio + +How do we match the 2nd write_aio but not the first? This is where state +transitions come in. + +The error injection engine has an integer called the "state" that always starts +initialized to 1. The state integer is internal to blkdebug and cannot be +observed from outside but rules can interact with it for powerful matching +behavior. + +Rules can be conditional on the current state and they can transition to a new +state. + +When a rule's "state" attribute is non-zero then the current state must equal +the attribute in order for the rule to match. + +For example, to match the 2nd write_aio: + + [set-state] + event = "write_aio" + state = "1" + new_state = "2" + + [inject-error] + event = "write_aio" + state = "2" + errno = "5" + +The first write_aio request matches the set-state rule and transitions from +state 1 to state 2. Once state 2 has been entered, the set-state rule no +longer matches since it requires state 1. But the inject-error rule now +matches the next write_aio request and injects EIO (5). + +State transition rules support the following attributes: + + event - which type of operation to match (e.g. read_aio, write_aio, + flush_to_os, flush_to_disk). See the "Events" section for + information on events. + + state - (optional) the engine must be in this state number in order for this + rule to match + + new_state - transition to this state number + +Suspend and resume +------------------ +Exercising code paths in block drivers may require specific ordering amongst +concurrent requests. The "breakpoint" feature allows requests to be halted on +a blkdebug event and resumed later. This makes it possible to achieve +deterministic ordering when multiple requests are in flight. 
+ +Breakpoints on blkdebug events are associated with a user-defined "tag" string. +This tag serves as an identifier by which the request can be resumed at a later +point. + +See the qemu-io(1) break, resume, remove_break, and wait_break commands for +details. diff --git a/src/docs/blkverify.txt b/src/docs/blkverify.txt new file mode 100644 index 0000000..d556dc4 --- /dev/null +++ b/src/docs/blkverify.txt @@ -0,0 +1,69 @@ += Block driver correctness testing with blkverify = + +== Introduction == + +This document describes how to use the blkverify protocol to test that a block +driver is operating correctly. + +It is difficult to test and debug block drivers against real guests. Often +processes inside the guest will crash because corrupt sectors were read as part +of the executable. Other times obscure errors are raised by a program inside +the guest. These issues are extremely hard to trace back to bugs in the block +driver. + +Blkverify solves this problem by catching data corruption inside QEMU the first +time bad data is read and reporting the disk sector that is corrupted. + +== How it works == + +The blkverify protocol has two child block devices, the "test" device and the +"raw" device. Read/write operations are mirrored to both devices so their +state should always be in sync. + +The "raw" device is a raw image, a flat file, that has identical starting +contents to the "test" image. The idea is that the "raw" device will handle +read/write operations correctly and not corrupt data. It can be used as a +reference for comparison against the "test" device. + +After a mirrored read operation completes, blkverify will compare the data and +raise an error if it is not identical. This makes it possible to catch the +first instance where corrupt data is read. + +== Example == + +Imagine raw.img has 0xcd repeated throughout its first sector: + + $ ./qemu-io -c 'read -v 0 512' raw.img + 00000000: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ 
+ 00000010: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + [...] + 000001e0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + 000001f0: cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd cd ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (97.656 MiB/sec and 200000.0000 ops/sec) + +And test.img is corrupt: its first sector is zeroed when it shouldn't be: + + $ ./qemu-io -c 'read -v 0 512' test.img + 00000000: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 00000010: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + [...] + 000001e0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + 000001f0: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ + read 512/512 bytes at offset 0 + 512.000000 bytes, 1 ops; 0.0000 sec (81.380 MiB/sec and 166666.6667 ops/sec) + +This error is caught by blkverify: + + $ ./qemu-io -c 'read 0 512' blkverify:raw.img:test.img + blkverify: read sector_num=0 nb_sectors=4 contents mismatch in sector 0 + +A more realistic scenario is verifying the installation of a guest OS: + + $ ./qemu-img create raw.img 16G + $ ./qemu-img create -f qcow2 test.qcow2 16G + $ x86_64-softmmu/qemu-system-x86_64 -cdrom debian.iso \ + -drive file=blkverify:raw.img:test.qcow2 + +If the installation is aborted when blkverify detects corruption, use qemu-io +to explore the contents of the disk image at the sector in question. diff --git a/src/docs/bootindex.txt b/src/docs/bootindex.txt new file mode 100644 index 0000000..f84fac7 --- /dev/null +++ b/src/docs/bootindex.txt @@ -0,0 +1,43 @@ += Bootindex property = + +Block and net devices have a bootindex property. This property is used to +determine the order in which firmware will consider devices for booting +the guest OS. If the bootindex property is not set for a device, it gets +the lowest boot priority. 
There is no particular order in which devices with +unset bootindex property will be considered for booting, but they will +still be bootable. + +== Example == + +Let's assume we have a QEMU machine with two NICs (virtio, e1000) and two +disks (IDE, virtio): + +qemu -drive file=disk1.img,if=none,id=disk1 + -device ide-drive,drive=disk1,bootindex=4 + -drive file=disk2.img,if=none,id=disk2 + -device virtio-blk-pci,drive=disk2,bootindex=3 + -netdev type=user,id=net0 -device virtio-net-pci,netdev=net0,bootindex=2 + -netdev type=user,id=net1 -device e1000,netdev=net1,bootindex=1 + +Given the command above, firmware should try to boot from the e1000 NIC +first. If this fails, it should try the virtio NIC next; if this fails +too, it should try the virtio disk, and then the IDE disk. + +== Limitations == + +1. Some firmware has limitations on which devices can be considered for +booting. For instance, the PC BIOS boot specification allows only one +disk to be bootable. If boot from disk fails for some reason, the BIOS +won't retry booting from another disk. It can still try to boot from +floppy or net, though. + +2. Sometimes, firmware cannot map the device path QEMU wants firmware to +boot from to a boot method. It doesn't happen for devices the firmware +can natively boot from, but if firmware relies on an option ROM for +booting, and the same option ROM is used for booting from more than one +device, the firmware may not be able to ask the option ROM to boot from +a particular device reliably. For instance with the PC BIOS, if a SCSI HBA +has three bootable devices target1, target3, target5 connected to it, +the option ROM will have a boot method for each of them, but it is not +possible to map from boot method back to a specific target. This is a +shortcoming of the PC BIOS boot specification. 
diff --git a/src/docs/build-system.txt b/src/docs/build-system.txt new file mode 100644 index 0000000..5ddddea --- /dev/null +++ b/src/docs/build-system.txt @@ -0,0 +1,507 @@ + The QEMU build system architecture + ================================== + +This document aims to help developers understand the architecture of the +QEMU build system. As with projects using GNU autotools, the QEMU build +system has two stages: first the developer runs the "configure" script +to determine the local build environment characteristics, then they run +"make" to build the project. That is about where the similarities with +GNU autotools end, so try to forget what you know about them. + + +Stage 1: configure +================== + +The QEMU configure script is written directly in shell, and should be +compatible with any POSIX shell, hence it uses #!/bin/sh. An important +implication of this is that it is important to avoid using bash-isms on +development platforms where bash is the primary host. + +In contrast to autoconf scripts, QEMU's configure is expected to be +silent while it is checking for features. It will only display output +when an error occurs, or to show the final feature enablement summary +on completion. + +Adding new checks to the configure script usually comprises the +following tasks: + + - Initialize one or more variables with the default feature state. + + Ideally features should auto-detect whether they are present, + so try to avoid hardcoding the initial state to either enabled + or disabled, as that forces the user to pass a --enable-XXX + / --disable-XXX flag on every invocation of configure. + + - Add support to the command line arg parser to handle any new + --enable-XXX / --disable-XXX flags required by the feature XXX. + + - Add information to the help output message to report on the new + feature flag. + + - Add code to perform the actual feature check. As noted above, try to + be fully dynamic in checking enablement/disablement. 
+ + - Add code to print out the feature status in the configure summary + upon completion. + + - Add any new makefile variables to $config_host_mak on completion. + + +Taking (a simplified version of) the probe for gnutls from configure, +we have the following pieces: + + # Initial variable state + gnutls="" + + ..snip.. + + # Configure flag processing + --disable-gnutls) gnutls="no" + ;; + --enable-gnutls) gnutls="yes" + ;; + + ..snip.. + + # Help output feature message + gnutls GNUTLS cryptography support + + ..snip.. + + # Test for gnutls + if test "$gnutls" != "no"; then + if $pkg_config --exists "gnutls"; then + gnutls_cflags=`$pkg_config --cflags gnutls` + gnutls_libs=`$pkg_config --libs gnutls` + libs_softmmu="$gnutls_libs $libs_softmmu" + libs_tools="$gnutls_libs $libs_tools" + QEMU_CFLAGS="$QEMU_CFLAGS $gnutls_cflags" + gnutls="yes" + elif test "$gnutls" = "yes"; then + feature_not_found "gnutls" "Install gnutls devel" + else + gnutls="no" + fi + fi + + ..snip.. + + # Completion feature summary + echo "GNUTLS support $gnutls" + + ..snip.. + + # Define make variables + if test "$gnutls" = "yes" ; then + echo "CONFIG_GNUTLS=y" >> $config_host_mak + fi + + +Helper functions +---------------- + +The configure script provides a variety of helper functions to assist +developers in checking for system features: + + - do_cc $ARGS... + + Attempt to run the system C compiler passing it $ARGS... + + - do_cxx $ARGS... + + Attempt to run the system C++ compiler passing it $ARGS... + + - compile_object $CFLAGS + + Attempt to compile a test program with the system C compiler using + $CFLAGS. The test program must have been previously written to a file + called $TMPC. + + - compile_prog $CFLAGS $LDFLAGS + + Attempt to compile a test program with the system C compiler using + $CFLAGS and link it with the system linker using $LDFLAGS. The test + program must have been previously written to a file called $TMPC. 
+ + - has $COMMAND + + Determine if $COMMAND exists in the current environment, either as a + shell builtin, or executable binary, returning 0 on success. + + - path_of $COMMAND + + Return the fully qualified path of $COMMAND, printing it to stdout, + and returning 0 on success. + + - check_define $NAME + + Determine if the macro $NAME is defined by the system C compiler + + - check_include $NAME + + Determine if the include $NAME file is available to the system C + compiler + + - write_c_skeleton + + Write a minimal C program main() function to the temporary file + indicated by $TMPC + + - feature_not_found $NAME $REMEDY + + Print a message to stderr that the feature $NAME was not available + on the system, suggesting the user try $REMEDY to address the + problem. + + - error_exit $MESSAGE $MORE... + + Print $MESSAGE to stderr, followed by $MORE... and then exit from the + configure script with non-zero status + + - query_pkg_config $ARGS... + + Run pkg-config passing it $ARGS. If QEMU is doing a static build, + then --static will be automatically added to $ARGS + + +Stage 2: makefiles +================== + +The use of GNU make is required with the QEMU build system. + +Although the source code is spread across multiple subdirectories, the +build system should be considered largely non-recursive in nature, in +contrast to common practices seen with automake. There is some recursive +invocation of make, but this is related to the things being built, +rather than the source directory structure. + +QEMU currently supports both VPATH and non-VPATH builds, so there are +three general ways to invoke configure & perform a build. 
+ + - VPATH, build artifacts outside of QEMU source tree entirely + + cd ../ + mkdir build + cd build + ../qemu/configure + make + + - VPATH, build artifacts in a subdir of QEMU source tree + + mkdir build + cd build + ../configure + make + + - non-VPATH, build artifacts everywhere + + ./configure + make + +The QEMU maintainers generally recommend that a VPATH build is used by +developers. Patches to QEMU are expected to ensure VPATH build still +works. + + +Module structure +---------------- + +There are a number of key outputs of the QEMU build system: + + - Tools - qemu-img, qemu-nbd, qga (guest agent), etc + - System emulators - qemu-system-$ARCH + - Userspace emulators - qemu-$ARCH + - Unit tests + +The source code is highly modularized, split across many files to +facilitate building of all of these components with as little duplicated +compilation as possible. There can be considered to be two distinct +groups of files, those which are independent of the QEMU emulation +target and those which are dependent on the QEMU emulation target. + +In the target-independent set lives various general purpose helper code, +such as error handling infrastructure, standard data structures, +platform portability wrapper functions, etc. This code can be compiled +once only and the .o files linked into all output binaries. + +In the target-dependent set lives CPU emulation, device emulation and +much glue code. This sometimes also has to be compiled multiple times, +once for each target being built. + +The utility code that is used by all binaries is built into a +static archive called libqemuutil.a, which is then linked to all the +binaries. In order to provide hooks that are only needed by some of the +binaries, code in libqemuutil.a may depend on other functions that are +not fully implemented by all QEMU binaries. To deal with this there is a +second library called libqemustub.a which provides dummy stubs for all +these functions. 
These will get lazy linked into the binary if the real +implementation is not present. In this way, the libqemustub.a static +library can be thought of as a portable implementation of the weak +symbols concept. All binaries should link to both libqemuutil.a and +libqemustub.a. e.g. + + qemu-img$(EXESUF): qemu-img.o ..snip.. libqemuutil.a libqemustub.a + + +Windows platform portability +---------------------------- + +On Windows, all binaries have the suffix '.exe', so all Makefile rules +which create binaries must include the $(EXESUF) variable on the binary +name. e.g. + + qemu-img$(EXESUF): qemu-img.o ..snip.. + +This expands to '.exe' on Windows, or '' on other platforms. + +A further complication for the system emulator binaries is that +two separate binaries need to be generated. + +The main binary (e.g. qemu-system-x86_64.exe) is linked against the +Windows console runtime subsystem. These are expected to be run from a +command prompt window, and so will print stderr to the console that +launched them. + +The second binary generated has a 'w' on the end of its name (e.g. +qemu-system-x86_64w.exe) and is linked against the Windows graphical +runtime subsystem. These are expected to be run directly from the +desktop and will open up a dedicated console window for stderr output. + +The Makefile.target will generate the binary for the graphical subsystem +first, and then use objcopy to relink it against the console subsystem +to generate the second binary. + + +Object variable naming +---------------------- + +The QEMU convention is to define variables to list different groups of +object files. These are named with the convention $PREFIX-obj-y. For +example the libqemuutil.a file will be linked with all objects listed +in a variable 'util-obj-y'. 
So, for example, util/Makefile.objs will +contain a set of definitions looking like + + util-obj-y += bitmap.o bitops.o hbitmap.o + util-obj-y += fifo8.o + util-obj-y += acl.o + util-obj-y += error.o qemu-error.o + +When there is an object file which needs to be conditionally built based +on some characteristic of the host system, the configure script will +define a variable for the conditional. For example, on Windows it will +define $(CONFIG_POSIX) with a value of 'n' and $(CONFIG_WIN32) with a +value of 'y'. It is now possible to use the config variables when +listing object files. For example, + + util-obj-$(CONFIG_WIN32) += oslib-win32.o qemu-thread-win32.o + util-obj-$(CONFIG_POSIX) += oslib-posix.o qemu-thread-posix.o + +On Windows this expands to + + util-obj-y += oslib-win32.o qemu-thread-win32.o + util-obj-n += oslib-posix.o qemu-thread-posix.o + +Since libqemuutil.a links in $(util-obj-y), the POSIX specific files +listed against $(util-obj-n) are ignored on the Windows platform builds. + + +CFLAGS / LDFLAGS / LIBS handling +-------------------------------- + +There are many different binaries being built with differing purposes, +and some of them might even be 3rd party libraries pulled in via git +submodules. As such the use of the global CFLAGS variable is generally +avoided in QEMU, since it would apply to too many build targets. + +Flags that are needed by any QEMU code (i.e. everything *except* GIT +submodule projects) are put in the $(QEMU_CFLAGS) variable. For linker +flags the $(LIBS) variable is sometimes used, but a couple of more +targeted variables are preferred. $(libs_softmmu) is used for +libraries that must be linked to system emulator targets, $(LIBS_TOOLS) +is used for tools like qemu-img, qemu-nbd, etc and $(LIBS_QGA) is used +for the QEMU guest agent. There is currently no specific variable for +the userspace emulator targets as the global $(LIBS), or more targeted +variables shown below, are sufficient. 
+ +In addition to these variables, it is possible to provide cflags and +libs against individual source code files, by defining variables of the +form $FILENAME-cflags and $FILENAME-libs. For example, the curl block +driver needs to link to the libcurl library, so block/Makefile.objs defines +some variables: + + curl.o-cflags := $(CURL_CFLAGS) + curl.o-libs := $(CURL_LIBS) + +The scope is a little different between the two variables. The libs get +used when linking any target binary that includes the curl.o object +file, while the cflags get used when compiling the curl.c file only. + + +Statically defined files +------------------------ + +The following key files are statically defined in the source tree, with +the rules needed to build QEMU. Their behaviour is influenced by a +number of dynamically created files listed later. + +- Makefile + +The main entry point used when invoking make to build all the components +of QEMU. The default 'all' target will naturally result in the build of +every component. The various tools and helper binaries are built +directly via a non-recursive set of rules. + +Each system/userspace emulation target needs to have a slightly +different set of make rules / variables. Thus, make will be recursively +invoked for each of the emulation targets. + +The recursive invocation will end up processing the toplevel +Makefile.target file (more on that later). + + +- */Makefile.objs + +Since the source code is spread across multiple directories, the rules +for each file are similarly modularized. Thus each subdirectory +containing .c files will usually also contain a Makefile.objs file. +These files are not directly invoked by a recursive make, but instead +they are imported by the top level Makefile and/or Makefile.target. + +Each Makefile.objs usually just declares a set of variables listing the +.o files that need building from the source files in the directory. They +will also define any custom linker or compiler flags. 
For example in +block/Makefile.objs + + block-obj-$(CONFIG_LIBISCSI) += iscsi.o + block-obj-$(CONFIG_CURL) += curl.o + + ..snip... + + iscsi.o-cflags := $(LIBISCSI_CFLAGS) + iscsi.o-libs := $(LIBISCSI_LIBS) + curl.o-cflags := $(CURL_CFLAGS) + curl.o-libs := $(CURL_LIBS) + +If there are any rules defined in the Makefile.objs file, they should +all use $(obj) as a prefix to the target, e.g. + + $(obj)/generated-tcg-tracers.h: $(obj)/generated-tcg-tracers.h-timestamp + + +- Makefile.target + +This file provides the entry point used to build each individual system +or userspace emulator target. Each enabled target has its own +subdirectory. For example if configure is run with the argument +'--target-list=x86_64-softmmu', then a sub-directory 'x86_64-softmmu' +will be created, containing a 'Makefile' which symlinks back to +Makefile.target. + +So when the recursive '$(MAKE) -C x86_64-softmmu' is invoked, it ends up +using Makefile.target for the build rules. + + +- rules.mak + +This file provides the generic helper rules for invoking build tools, in +particular the compiler and linker. This also contains the magic (hairy) +'unnest-vars' function which is used to merge the variable definitions +from all Makefile.objs in the source tree down into the main Makefile +context. + + +- default-configs/*.mak + +The files under default-configs/ control what emulated hardware is built +into each QEMU system and userspace emulator target. They merely +contain a long list of config variable definitions. For example, +default-configs/x86_64-softmmu.mak has: + + include pci.mak + include sound.mak + include usb.mak + CONFIG_QXL=$(CONFIG_SPICE) + CONFIG_VGA_ISA=y + CONFIG_VGA_CIRRUS=y + CONFIG_VMWARE_VGA=y + CONFIG_VIRTIO_VGA=y + ...snip... + +These files rarely need changing unless new devices / hardware need to +be enabled for a particular system/userspace emulation target. + + +- tests/Makefile + +Rules for building the unit tests. 
This file is included directly by the +top level Makefile, so anything defined in this file will influence the +entire build system. Care needs to be taken when writing rules for tests +to ensure they only apply to the unit test execution / build. + + +- po/Makefile + +Rules for building and installing the binary message catalogs from the +text .po file sources. This almost never needs changing for any reason. + + +Dynamically created files +------------------------- + +The following files are generated dynamically by configure in order to +control the behaviour of the statically defined makefiles. This avoids +the need for QEMU makefiles to go through any pre-processing as seen +with autotools, where Makefile.am generates Makefile.in which generates +Makefile. + + +- config-host.mak + +When configure has determined the characteristics of the build host it +will write a long list of variables to config-host.mak file. This +provides the various install directories, compiler / linker flags and a +variety of CONFIG_* variables related to optionally enabled features. +This is imported by the top level Makefile in order to tailor the build +output. + +The variables defined here are those which are applicable to all QEMU +build outputs. Variables which are potentially different for each +emulator target are defined by the next file... + +It is also used as a dependency checking mechanism. If make sees that +the modification timestamp on configure is newer than that on +config-host.mak, then configure will be re-run. + + +- config-host.h + +The config-host.h file is used by source code to determine what features +are enabled. It is generated from the contents of config-host.mak using +the scripts/create_config program. This extracts all the CONFIG_* variables, +most of the HOST_* variables and a few other misc variables from +config-host.mak, formatting them as C preprocessor macros. 
+ + +- $TARGET-NAME/config-target.mak + +TARGET-NAME is the name of a system or userspace emulator, for example, +x86_64-softmmu denotes the system emulator for the x86_64 architecture. +This file contains the variables which need to vary on a per-target +basis. For example, it will indicate whether KVM or Xen are enabled for +the target and any other potential custom libraries needed for linking +the target. + + +- $TARGET-NAME/config-devices.mak + +TARGET-NAME is again the name of a system or userspace emulator. The +config-devices.mak file is automatically generated by make using the +scripts/make_device_config.sh program, feeding it the +default-configs/$TARGET-NAME file as input. + + +- $TARGET-NAME/Makefile + +This is the entrypoint used when make recurses to build a single system +or userspace emulator target. It is merely a symlink back to the +Makefile.target in the top level. diff --git a/src/docs/ccid.txt b/src/docs/ccid.txt new file mode 100644 index 0000000..c7fda6d --- /dev/null +++ b/src/docs/ccid.txt @@ -0,0 +1,181 @@ +QEMU CCID Device Documentation. + +Contents +1. USB CCID device +2. Building +3. Using ccid-card-emulated with hardware +4. Using ccid-card-emulated with certificates +5. Using ccid-card-passthru with client side hardware +6. Using ccid-card-passthru with client side certificates +7. Passthrough protocol scenario +8. libcacard + +1. USB CCID device + +The USB CCID device is a USB device implementing the CCID specification, which +lets one connect smart card readers that implement the same spec. For more +information see the specification: + + Universal Serial Bus + Device Class: Smart Card + CCID + Specification for + Integrated Circuit(s) Cards Interface Devices + Revision 1.1 + April 22nd, 2005 + +Smartcards are used for authentication, single sign on, decryption in +public/private schemes and digital signatures.
A smartcard reader on the client +cannot be used on a guest with simple usb passthrough since it will then not be +available on the client, possibly locking the computer when it is "removed". On +the other hand this device can let you use the smartcard on both the client and +the guest machine. It is also possible to have a completely virtual smart card +reader and smart card (i.e. not backed by a physical device) using this device. + +2. Building + +The cryptographic functions and access to the physical card are done via NSS. + +Installing NSS: + +In redhat/fedora: + yum install nss-devel +In ubuntu/debian: + apt-get install libnss3-dev + (not tested on ubuntu) + +Configuring and building: + ./configure --enable-smartcard && make + + +3. Using ccid-card-emulated with hardware + +Assuming you have a working smartcard on the host with the current +user, using NSS, qemu acts as another NSS client using ccid-card-emulated: + + qemu -usb -device usb-ccid -device ccid-card-emulated + + +4. Using ccid-card-emulated with certificates stored in files + +You must create the CA and card certificates. This is a one time process. +We use NSS certificates: + + mkdir fake-smartcard + cd fake-smartcard + certutil -N -d sql:$PWD + certutil -S -d sql:$PWD -s "CN=Fake Smart Card CA" -x -t TC,TC,TC -n fake-smartcard-ca + certutil -S -d sql:$PWD -t ,, -s "CN=John Doe" -n id-cert -c fake-smartcard-ca + certutil -S -d sql:$PWD -t ,, -s "CN=John Doe (signing)" --nsCertType smime -n signing-cert -c fake-smartcard-ca + certutil -S -d sql:$PWD -t ,, -s "CN=John Doe (encryption)" --nsCertType sslClient -n encryption-cert -c fake-smartcard-ca + +Note: you must have exactly three certificates.
+ +You can use the emulated card type with the certificates backend: + + qemu -usb -device usb-ccid -device ccid-card-emulated,backend=certificates,db=sql:$PWD,cert1=id-cert,cert2=signing-cert,cert3=encryption-cert + +To use the certificates in the guest, export the CA certificate: + + certutil -L -r -d sql:$PWD -o fake-smartcard-ca.cer -n fake-smartcard-ca + +and import it in the guest: + + certutil -A -d /etc/pki/nssdb -i fake-smartcard-ca.cer -t TC,TC,TC -n fake-smartcard-ca + +In a Linux guest you can then use the CoolKey PKCS #11 module to access +the card: + + certutil -d /etc/pki/nssdb -L -h all + +It will prompt you for the PIN (which is the password you assigned to the +certificate database early on), and then show you all three certificates +together with the manually imported CA cert: + + Certificate Nickname Trust Attributes + fake-smartcard-ca CT,C,C + John Doe:CAC ID Certificate u,u,u + John Doe:CAC Email Signature Certificate u,u,u + John Doe:CAC Email Encryption Certificate u,u,u + +If this does not happen, CoolKey is not installed or not registered with +NSS. Registration can be done from Firefox or the command line: + + modutil -dbdir /etc/pki/nssdb -add "CAC Module" -libfile /usr/lib64/pkcs11/libcoolkeypk11.so + modutil -dbdir /etc/pki/nssdb -list + + +5. Using ccid-card-passthru with client side hardware + +on the host specify the ccid-card-passthru device with a suitable chardev: + + qemu -chardev socket,server,host=0.0.0.0,port=2001,id=ccid,nowait -usb -device usb-ccid -device ccid-card-passthru,chardev=ccid + +on the client run vscclient, built when you built QEMU: + + vscclient <qemu-host> 2001 + + +6. Using ccid-card-passthru with client side certificates + +This case is not particularly useful, but you can use it to debug +your setup if #4 works but #5 does not. 
+ +Follow instructions as per #4, except run QEMU and vscclient as follows: +Run qemu as per #5, and run vscclient from the "fake-smartcard" +directory as follows: + + qemu -chardev socket,server,host=0.0.0.0,port=2001,id=ccid,nowait -usb -device usb-ccid -device ccid-card-passthru,chardev=ccid + vscclient -e "db=\"sql:$PWD\" use_hw=no soft=(,Test,CAC,,id-cert,signing-cert,encryption-cert)" <qemu-host> 2001 + + +7. Passthrough protocol scenario + +This is a typical interchange of messages when using the passthru card device. +usb-ccid is a usb device. It defaults to an unattached usb device on startup. +usb-ccid expects a chardev and expects the protocol defined in +cac_card/vscard_common.h to be passed over that. +The usb-ccid device can be in one of three modes: + * detached + * attached with no card + * attached with card + +A typical interchange is: (the arrow shows who started each exchange, it can be client +originated or guest originated) + +client event | vscclient | passthru | usb-ccid | guest event +---------------------------------------------------------------------------------------------- + | VSC_Init | | | + | VSC_ReaderAdd | | attach | + | | | | sees new usb device. +card inserted -> | | | | + | VSC_ATR | insert | insert | see new card + | | | | + | VSC_APDU | VSC_APDU | | <- guest sends APDU +client<->physical | | | | +card APDU exchange| | | | +client response ->| VSC_APDU | VSC_APDU | | receive APDU response + ... + [APDU<->APDU repeats several times] + ... +card removed -> | | | | + | VSC_CardRemove | remove | remove | card removed + ... + [(card insert, apdu's, card remove) repeat] + ... +kill/quit | | | | + vscclient | | | | + | VSC_ReaderRemove | | detach | + | | | | usb device removed. + + +8. libcacard + +Both ccid-card-emulated and vscclient use libcacard as the card emulator. +libcacard implements a completely virtual CAC (DoD standard for smart +cards) compliant card and uses NSS to retrieve certificates and do +any encryption. 
The backend can then be a real reader and card, or +certificates stored in files. + +For documentation of the library see docs/libcacard.txt. + diff --git a/src/docs/ich9-ehci-uhci.cfg b/src/docs/ich9-ehci-uhci.cfg new file mode 100644 index 0000000..a0e9b96 --- /dev/null +++ b/src/docs/ich9-ehci-uhci.cfg @@ -0,0 +1,37 @@ +########################################################################### +# +# You can pass this file directly to qemu using the -readconfig +# command line switch. +# +# This config file creates a EHCI adapter with companion UHCI +# controllers as multifunction device in PCI slot "1d". +# +# Specify "bus=ehci.0" when creating usb devices to hook them up +# there. +# + +[device "ehci"] + driver = "ich9-usb-ehci1" + addr = "1d.7" + multifunction = "on" + +[device "uhci-1"] + driver = "ich9-usb-uhci1" + addr = "1d.0" + multifunction = "on" + masterbus = "ehci.0" + firstport = "0" + +[device "uhci-2"] + driver = "ich9-usb-uhci2" + addr = "1d.1" + multifunction = "on" + masterbus = "ehci.0" + firstport = "2" + +[device "uhci-3"] + driver = "ich9-usb-uhci3" + addr = "1d.2" + multifunction = "on" + masterbus = "ehci.0" + firstport = "4" diff --git a/src/docs/image-fuzzer.txt b/src/docs/image-fuzzer.txt new file mode 100644 index 0000000..3e23ebe --- /dev/null +++ b/src/docs/image-fuzzer.txt @@ -0,0 +1,239 @@ +# Specification for the fuzz testing tool +# +# Copyright (C) 2014 Maria Kustova <maria.k@catit.be> +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. 
+# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + + +Image fuzzer +============ + +Description +----------- + +The goal of the image fuzzer is to catch crashes of qemu-io/qemu-img +by providing them with randomly corrupted images. +Test images are generated from scratch and have valid inner structure with some +elements, e.g. L1/L2 tables, having random invalid values. + + +Test runner +----------- + +The test runner generates test images, executes tests utilizing generated +images, indicates their results and collects all test related artifacts (logs, +core dumps, test images, backing files). +The test means execution of all available commands under test with the same +generated test image. +By default, the test runner generates new tests and executes them until +keyboard interruption. But if a test seed is specified via the '--seed' runner +parameter, then only one test with this seed will be executed; after it finishes, +the runner will exit. + +The runner uses an external image fuzzer to generate test images. An image +generator should be specified as a mandatory parameter of the test runner. +For details about interactions between the runner and fuzzers see "Module +interfaces". + +The runner activates generation of core dumps during test executions, but it +assumes that core dumps will be generated in the current working directory. +For comprehensive test results, please, set up your test environment +properly. + +Paths to binaries under test (SUTs) qemu-img and qemu-io are retrieved from +environment variables. If the environment check fails the runner will +use SUTs installed in system paths. +qemu-img is required for creation of backing files, so it's mandatory to set +the related environment variable if it's not installed in the system path. +For details about environment variables see qemu-iotests/check.
+ +The runner accepts a JSON array of fields expected to be fuzzed via the +'--config' argument, e.g. + + '[["feature_name_table"], ["header", "l1_table_offset"]]' + +Each sublist can have one or two strings defining image structure elements. +In the latter case a parent element should be placed on the first position, +and a field name on the second one. + +The runner accepts a list of commands under test as a JSON array via +the '--command' argument. Each command is a list containing a SUT and all its +arguments, e.g. + + runner.py -c '[["qemu-io", "$test_img", "-c", "write $off $len"]]' + /tmp/test ../qcow2 + +For variable arguments next aliases can be used: + - $test_img for a fuzzed img + - $off for an offset in the fuzzed image + - $len for a data size + +Values for last two aliases will be generated based on a size of a virtual +disk of the generated image. +In case when no commands are specified the runner will execute commands from +the default list: + - qemu-img check + - qemu-img info + - qemu-img convert + - qemu-io -c read + - qemu-io -c write + - qemu-io -c aio_read + - qemu-io -c aio_write + - qemu-io -c flush + - qemu-io -c discard + - qemu-io -c truncate + + +Qcow2 image generator +--------------------- + +The 'qcow2' generator is a Python package providing 'create_image' method as +a single public API. See details in 'Test runner/image fuzzer' chapter of +'Module interfaces'. + +Qcow2 contains two submodules: fuzz.py and layout.py. + +'fuzz.py' contains all fuzzing functions, one per image field. It's assumed +that after code analysis every field will have own constraints for its value. +For now only universal potentially dangerous values are used, e.g. type limits +for integers or unsafe symbols as '%s' for strings. For bitmasks random amount +of bits are set to ones. All fuzzed values are checked on non-equality to the +current valid value of the field. In case of equality the value will be +regenerated. 
+ +'layout.py' creates a random valid image, fuzzes a random subset of the image +fields by 'fuzz.py' module and writes a fuzzed image to the file specified. +If a fuzzer configuration is specified, then it has the next interpretation: + + 1. If a list contains a parent image element only, then some random portion + of fields of this element will be fuzzed every test. + The same behavior is applied for the entire image if no configuration is + used. This case is useful for the test specialization. + + 2. If a list contains a parent element and a field name, then a field + will be always fuzzed for every test. This case is useful for regression + testing. + +The generator can create header fields, header extensions, L1/L2 tables and +refcount table and blocks. + +Module interfaces +----------------- + +* Test runner/image fuzzer + +The runner calls an image generator specifying the path to a test image file, +path to a backing file and its format and a fuzzer configuration. +An image generator is expected to provide a + + 'create_image(test_img_path, backing_file_path=None, + backing_file_format=None, fuzz_config=None)' + +method that creates a test image, writes it to the specified file and returns +the size of the virtual disk. +The file should be created if it doesn't exist or overwritten otherwise. +fuzz_config has a form of a list of lists. Every sublist can have one +or two elements: first element is a name of a parent image element, second one +if exists is a name of a field in this element. +Example, + [['header', 'l1_table_offset'], + ['header', 'nb_snapshots'], + ['feature_name_table']] + +Random seed is set by the runner at every test execution for the regression +purpose, so an image generator is not recommended to modify it internally. 
+ + +Overall fuzzer requirements +=========================== + +Input data: +---------- + + - image template (generator) + - work directory + - action vector (optional) + - seed (optional) + - SUT and its arguments (optional) + + +Fuzzer requirements: +------------------- + +1. Should be able to inject random data +2. Should be able to select a random value from the manually pregenerated + vector (boundary values, e.g. max/min cluster size) +3. Image template should describe a general structure invariant for all + test images (image format description) +4. Image template should be autonomous and other fuzzer parts should not + rely on it +5. Image template should contain reference rules (not only block+size + description) +6. Should generate the test image with the correct structure based on an image + template +7. Should accept a seed as an argument (for regression purpose) +8. Should generate a seed if it is not specified as an input parameter. +9. The same seed should generate the same image for the same action vector, + specified or generated. +10. Should accept a vector of actions as an argument (for test reproducing and + for test case specification, e.g. group of tests for header structure, + group of test for snapshots, etc) +11. Action vector should be randomly generated from the pool of available + actions, if it is not specified as an input parameter +12. Pool of actions should be defined automatically based on an image template +13. Should accept a SUT and its call parameters as an argument or select them + randomly otherwise. As far as it's expected to be rarely changed, the list + of all possible test commands can be available in the test runner + internally. +14. Should support an external cancellation of a test run +15. Seed should be logged (for regression purpose) +16. All files related to a test result should be collected: a test image, + SUT logs, fuzzer logs and crash dumps +17. Should be compatible with python version 2.4-2.7 +18. 
Usage of external libraries should be limited as much as possible. + + +Image formats: +------------- + +Main target image format is qcow2, but support of image templates should +provide an ability to add any other image format. + + +Effectiveness: +------------- + +The fuzzer can be controlled via template, seed and action vector; +it makes the fuzzer itself invariant to an image format and test logic. +It should be able to perform rather complex and precise tests, that can be +specified via an action vector. Otherwise, knowledge about an image structure +allows the fuzzer to generate the pool of all available areas that can be fuzzed +and randomly select some of them and so compose its own action vector. +Also complexity of a template defines complexity of the fuzzer, so its +functionality can be varied from simple model-independent fuzzing to smart +model-based one. + + +Glossary: +-------- + +Action vector is a sequence of structure elements retrieved from an image +format, each of which will be fuzzed for the test image. It's a subset of +elements of the action pool. Example: header, refcount table, etc. +Action pool is all available elements of an image structure that are generated +automatically from an image template. +Image template is a formal description of an image structure and relations +between image blocks. +Test image is an output image of the fuzzer defined by the current seed and +action vector. diff --git a/src/docs/live-block-ops.txt b/src/docs/live-block-ops.txt new file mode 100644 index 0000000..a257087 --- /dev/null +++ b/src/docs/live-block-ops.txt @@ -0,0 +1,58 @@ +LIVE BLOCK OPERATIONS +===================== + +High level description of live block operations. Note these are not +supported for use with the raw format at the moment.
+ +Snapshot live merge +=================== + +Given a snapshot chain, described in this document in the following +format: + +[A] -> [B] -> [C] -> [D] + +Where the rightmost object ([D] in the example) described is the current +image which the guest OS has write access to. To the left of it is its base +image, and so on accordingly until the leftmost image, which has no +base. + +The snapshot live merge operation transforms such a chain into a +smaller one with fewer elements, such as this transformation relative +to the first example: + +[A] -> [D] + +Currently only forward merge with target being the active image is +supported, that is, data copy is performed in the right direction with +destination being the rightmost image. + +The operation is implemented in QEMU through image streaming facilities. + +The basic idea is to execute 'block_stream virtio0' while the guest is +running. Progress can be monitored using 'info block-jobs'. When the +streaming operation completes it raises a QMP event. 'block_stream' +copies data from the backing file(s) into the active image. When finished, +it adjusts the backing file pointer. + +The 'base' parameter specifies an image which data need not be streamed from. +This image will be used as the backing file for the active image when the +operation is finished. + +In the example above, the command would be: + +(qemu) block_stream virtio0 A + + +Live block copy +=============== + +To copy an in use image to another destination in the filesystem, one +should create a live snapshot in the desired destination, then stream +into that image. 
Example: + +(qemu) snapshot_blkdev ide0-hd0 /new-path/disk.img qcow2 + +(qemu) block_stream ide0-hd0 + + diff --git a/src/docs/memory-hotplug.txt b/src/docs/memory-hotplug.txt new file mode 100644 index 0000000..56bdd0a --- /dev/null +++ b/src/docs/memory-hotplug.txt @@ -0,0 +1,93 @@ +QEMU memory hotplug +=================== + +This document explains how to use the memory hotplug feature in QEMU, +which is present since v2.1.0. + +Guest support is required for memory hotplug to work. + +Basic RAM hotplug +----------------- + +In order to be able to hotplug memory, QEMU has to be told how many +hotpluggable memory slots to create and what is the maximum amount of +memory the guest can grow to. This is done at startup time by means of +the -m command-line option, which has the following format: + + -m [size=]megs[,slots=n,maxmem=size] + +Where, + + - "megs" is the startup RAM. It is the RAM the guest will boot with + - "slots" is the number of hotpluggable memory slots + - "maxmem" is the maximum RAM size the guest can have + +For example, the following command-line: + + qemu [...] -m 1G,slots=3,maxmem=4G + +Creates a guest with 1GB of memory and three hotpluggable memory slots. +The hotpluggable memory slots are empty when the guest is booted, so all +memory the guest will see after boot is 1GB. The maximum memory the +guest can reach is 4GB. This means that three additional gigabytes can be +hotplugged by using any combination of the available memory slots.
+ +Two monitor commands are used to hotplug memory: + + - "object_add": creates a memory backend object + - "device_add": creates a front-end pc-dimm device and inserts it + into the first empty slot + +For example, the following commands add another 1GB to the guest +discussed earlier: + + (qemu) object_add memory-backend-ram,id=mem1,size=1G + (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 + +Using the file backend +---------------------- + +Besides basic RAM hotplug, QEMU also supports using files as a memory +backend. This is useful for using hugetlbfs in Linux, which provides +access to bigger page sizes. + +For example, assuming that the host has 1GB hugepages available in +the /mnt/hugepages-1GB directory, a 1GB hugepage could be hotplugged +into the guest from the previous section with the following commands: + + (qemu) object_add memory-backend-file,id=mem1,size=1G,mem-path=/mnt/hugepages-1GB + (qemu) device_add pc-dimm,id=dimm1,memdev=mem1 + +It's also possible to start a guest with memory cold-plugged into the +hotpluggable memory slots. This might seem counterintuitive at first, +but this allows for a lot of flexibility when using the file backend. + +In the following command-line example, a 8GB guest is created where 6GB +comes from regular RAM, 1GB is a 1GB hugepage page and 256MB is from +2MB pages. Also, the guest has additional memory slots to hotplug more +2GB if needed: + + qemu [...] -m 6GB,slots=4,maxmem=10G \ + -object memory-backend-file,id=mem1,size=1G,mem-path=/mnt/hugepages-1G \ + -device pc-dimm,id=dimm1,memdev=mem1 \ + -object memory-backend-file,id=mem2,size=256M,mem-path=/mnt/hugepages-2MB \ + -device pc-dimm,id=dimm2,memdev=mem2 + + +RAM hot-unplug +--------------- + +In order to be able to hot unplug pc-dimm device, QEMU has to be told the ids +of pc-dimm device and memory backend object. The ids were assigned when you hot +plugged memory. 
+ +Two monitor commands are used to hot unplug memory: + + - "device_del": deletes a front-end pc-dimm device + - "object_del": deletes a memory backend object + +For example, assuming that the pc-dimm device with id "dimm1" exists, and its memory +backend is "mem1", the following commands try to remove it: + + (qemu) device_del dimm1 + (qemu) object_del mem1 diff --git a/src/docs/memory.txt b/src/docs/memory.txt new file mode 100644 index 0000000..2ceb348 --- /dev/null +++ b/src/docs/memory.txt @@ -0,0 +1,286 @@ +The memory API +============== + +The memory API models the memory and I/O buses and controllers of a QEMU +machine. It attempts to allow modelling of: + + - ordinary RAM + - memory-mapped I/O (MMIO) + - memory controllers that can dynamically reroute physical memory regions + to different destinations + +The memory model provides support for + + - tracking RAM changes by the guest + - setting up coalesced memory for kvm + - setting up ioeventfd regions for kvm + +Memory is modelled as an acyclic graph of MemoryRegion objects. Sinks +(leaves) are RAM and MMIO regions, while other nodes represent +buses, memory controllers, and memory regions that have been rerouted. + +In addition to MemoryRegion objects, the memory API provides AddressSpace +objects for every root and possibly for intermediate MemoryRegions too. +These represent memory as seen from the CPU or a device's viewpoint. + +Types of regions +---------------- + +There are four types of memory regions (all represented by a single C type +MemoryRegion): + +- RAM: a RAM region is simply a range of host memory that can be made available + to the guest. + +- MMIO: a range of guest memory that is implemented by host callbacks; + each read or write causes a callback to be called on the host. + +- container: a container simply includes other memory regions, each at + a different offset. Containers are useful for grouping several regions + into one unit.
For example, a PCI BAR may be composed of a RAM region + and an MMIO region. + + A container's subregions are usually non-overlapping. In some cases it is + useful to have overlapping regions; for example a memory controller that + can overlay a subregion of RAM with MMIO or ROM, or a PCI controller + that does not prevent card from claiming overlapping BARs. + +- alias: a subsection of another region. Aliases allow a region to be + split apart into discontiguous regions. Examples of uses are memory banks + used when the guest address space is smaller than the amount of RAM + addressed, or a memory controller that splits main memory to expose a "PCI + hole". Aliases may point to any type of region, including other aliases, + but an alias may not point back to itself, directly or indirectly. + +It is valid to add subregions to a region which is not a pure container +(that is, to an MMIO, RAM or ROM region). This means that the region +will act like a container, except that any addresses within the container's +region which are not claimed by any subregion are handled by the +container itself (ie by its MMIO callbacks or RAM backing). However +it is generally possible to achieve the same effect with a pure container +one of whose subregions is a low priority "background" region covering +the whole address range; this is often clearer and is preferred. +Subregions cannot be added to an alias region. + +Region names +------------ + +Regions are assigned names by the constructor. For most regions these are +only used for debugging purposes, but RAM regions also use the name to identify +live migration sections. This means that RAM region names need to have ABI +stability. + +Region lifecycle +---------------- + +A region is created by one of the memory_region_init*() functions and +attached to an object, which acts as its owner or parent. 
QEMU ensures +that the owner object remains alive as long as the region is visible to +the guest, or as long as the region is in use by a virtual CPU or another +device. For example, the owner object will not die between an +address_space_map operation and the corresponding address_space_unmap. + +After creation, a region can be added to an address space or a +container with memory_region_add_subregion(), and removed using +memory_region_del_subregion(). + +Various region attributes (read-only, dirty logging, coalesced mmio, +ioeventfd) can be changed during the region lifecycle. They take effect +as soon as the region is made visible. This can be immediately, later, +or never. + +Destruction of a memory region happens automatically when the owner +object dies. + +If however the memory region is part of a dynamically allocated data +structure, you should call object_unparent() to destroy the memory region +before the data structure is freed. For an example see VFIOMSIXInfo +and VFIOQuirk in hw/vfio/pci.c. + +You must not destroy a memory region as long as it may be in use by a +device or CPU. In order to do this, as a general rule do not create or +destroy memory regions dynamically during a device's lifetime, and only +call object_unparent() in the memory region owner's instance_finalize +callback. The dynamically allocated data structure that contains the +memory region then should obviously be freed in the instance_finalize +callback as well. + +If you break this rule, the following situation can happen: + +- the memory region's owner had a reference taken via memory_region_ref + (for example by address_space_map) + +- the region is unparented, and has no owner anymore + +- when address_space_unmap is called, the reference to the memory region's + owner is leaked. + + +There is an exception to the above rule: it is okay to call +object_unparent at any time for an alias or a container region. 
It is +therefore also okay to create or destroy alias and container regions +dynamically during a device's lifetime. + +This exceptional usage is valid because aliases and containers only help +QEMU build the guest's memory map; they are never accessed directly. +memory_region_ref and memory_region_unref are never called on aliases +or containers, and the above situation then cannot happen. Exploiting +this exception is rarely necessary, and therefore it is discouraged, +but nevertheless it is used in a few places. + +For regions that "have no owner" (NULL is passed at creation time), the +machine object is actually used as the owner. Since instance_finalize is +never called for the machine object, you must never call object_unparent +on regions that have no owner, unless they are aliases or containers. + + +Overlapping regions and priority +-------------------------------- +Usually, regions may not overlap each other; a memory address decodes into +exactly one target. In some cases it is useful to allow regions to overlap, +and sometimes to control which of the overlapping regions is visible to the +guest. This is done with memory_region_add_subregion_overlap(), which +allows the region to overlap any other region in the same container, and +specifies a priority that allows the core to decide which of two regions at +the same address is visible (highest wins). +Priority values are signed, and the default value is zero. This means that +you can use memory_region_add_subregion_overlap() both to specify a region +that must sit 'above' any others (with a positive priority) and also a +background region that sits 'below' others (with a negative priority). + +If the higher priority region in an overlap is a container or alias, then +the lower priority region will appear in any "holes" that the higher priority +region has left by not mapping subregions to that area of its address range.
+(This applies recursively -- if the subregions are themselves containers or +aliases that leave holes then the lower priority region will appear in these +holes too.) + +For example, suppose we have a container A of size 0x8000 with two subregions +B and C. B is a container mapped at 0x2000, size 0x4000, priority 1; C is +an MMIO region mapped at 0x0, size 0x6000, priority 2. B currently has two +of its own subregions: D of size 0x1000 at offset 0 and E of size 0x1000 at +offset 0x2000. As a diagram: + + 0 1000 2000 3000 4000 5000 6000 7000 8000 + |------|------|------|------|------|------|------|-------| + A: [ ] + C: [CCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCC] + B: [ ] + D: [DDDDD] + E: [EEEEE] + +The regions that will be seen within this address range then are: + [CCCCCCCCCCCC][DDDDD][CCCCC][EEEEE][CCCCC] + +Since B has higher priority than C, its subregions appear in the flat map +even where they overlap with C. In ranges where B has not mapped anything +C's region appears. + +If B had provided its own MMIO operations (ie it was not a pure container) +then these would be used for any addresses in its range not handled by +D or E, and the result would be: + [CCCCCCCCCCCC][DDDDD][BBBBB][EEEEE][BBBBB] + +Priority values are local to a container, because the priorities of two +regions are only compared when they are both children of the same container. +This means that the device in charge of the container (typically modelling +a bus or a memory controller) can use them to manage the interaction of +its child regions without any side effects on other parts of the system. +In the example above, the priorities of D and E are unimportant because +they do not overlap each other. It is the relative priority of B and C +that causes D and E to appear on top of C: D and E's priorities are never +compared against the priority of C. 
+ +Visibility +---------- +The memory core uses the following rules to select a memory region when the +guest accesses an address: + +- all direct subregions of the root region are matched against the address, in + descending priority order + - if the address lies outside the region offset/size, the subregion is + discarded + - if the subregion is a leaf (RAM or MMIO), the search terminates, returning + this leaf region + - if the subregion is a container, the same algorithm is used within the + subregion (after the address is adjusted by the subregion offset) + - if the subregion is an alias, the search is continued at the alias target + (after the address is adjusted by the subregion offset and alias offset) + - if a recursive search within a container or alias subregion does not + find a match (because of a "hole" in the container's coverage of its + address range), then if this is a container with its own MMIO or RAM + backing the search terminates, returning the container itself. Otherwise + we continue with the next subregion in priority order +- if none of the subregions match the address then the search terminates + with no match found + +Example memory map +------------------ + +system_memory: container@0-2^48-1 + | + +---- lomem: alias@0-0xdfffffff ---> #ram (0-0xdfffffff) + | + +---- himem: alias@0x100000000-0x11fffffff ---> #ram (0xe0000000-0xffffffff) + | + +---- vga-window: alias@0xa0000-0xbffff ---> #pci (0xa0000-0xbffff) + | (prio 1) + | + +---- pci-hole: alias@0xe0000000-0xffffffff ---> #pci (0xe0000000-0xffffffff) + +pci (0-2^32-1) + | + +--- vga-area: container@0xa0000-0xbffff + | | + | +--- alias@0x00000-0x7fff ---> #vram (0x010000-0x017fff) + | | + | +--- alias@0x08000-0xffff ---> #vram (0x020000-0x027fff) + | + +---- vram: ram@0xe1000000-0xe1ffffff + | + +---- vga-mmio: mmio@0xe2000000-0xe200ffff + +ram: ram@0x00000000-0xffffffff + +This is a (simplified) PC memory map. 
The 4GB RAM block is mapped into the +system address space via two aliases: "lomem" is a 1:1 mapping of the first +3.5GB; "himem" maps the last 0.5GB at address 4GB. This leaves 0.5GB for the +so-called PCI hole, that allows a 32-bit PCI bus to exist in a system with +4GB of memory. + +The memory controller diverts addresses in the range 640K-768K to the PCI +address space. This is modelled using the "vga-window" alias, mapped at a +higher priority so it obscures the RAM at the same addresses. The vga window +can be removed by programming the memory controller; this is modelled by +removing the alias and exposing the RAM underneath. + +The pci address space is not a direct child of the system address space, since +we only want parts of it to be visible (we accomplish this using aliases). +It has two subregions: vga-area models the legacy vga window and is occupied +by two 32K memory banks pointing at two sections of the framebuffer. +In addition the vram is mapped as a BAR at address e1000000, and an additional +BAR containing MMIO registers is mapped after it. + +Note that if the guest maps a BAR outside the PCI hole, it would not be +visible as the pci-hole alias clips it to a 0.5GB range. + +MMIO Operations +--------------- + +MMIO regions are provided with ->read() and ->write() callbacks; in addition +various constraints can be supplied to control how these callbacks are called: + + - .valid.min_access_size, .valid.max_access_size define the access sizes + (in bytes) which the device accepts; accesses outside this range will + have device and bus specific behaviour (ignored, or machine check) + - .valid.aligned specifies that the device only accepts naturally aligned + accesses. Unaligned accesses invoke device and bus specific behaviour. + - .impl.min_access_size, .impl.max_access_size define the access sizes + (in bytes) supported by the *implementation*; other access sizes will be + emulated using the ones available. 
For example a 4-byte write will be + emulated using four 1-byte writes, if .impl.max_access_size = 1. + - .impl.unaligned specifies that the *implementation* supports unaligned + accesses; if false, unaligned accesses will be emulated by two aligned + accesses. + - .old_mmio can be used to ease porting from code using + cpu_register_io_memory(). It should not be used in new code. diff --git a/src/docs/migration.txt b/src/docs/migration.txt new file mode 100644 index 0000000..fda8d61 --- /dev/null +++ b/src/docs/migration.txt @@ -0,0 +1,484 @@ += Migration = + +QEMU has code to load/save the state of the guest that it is running. +These are two complementary operations. Saving the state just does +that, saves the state for each device that the guest is running. +Restoring a guest is just the opposite operation: we need to load the +state of each device. + +For this to work, QEMU has to be launched with the same arguments the +two times. I.e. it can only restore the state in one guest that has +the same devices that the one it was saved (this last requirement can +be relaxed a bit, but for now we can consider that configuration has +to be exactly the same). + +Once that we are able to save/restore a guest, a new functionality is +requested: migration. This means that QEMU is able to start in one +machine and being "migrated" to another machine. I.e. being moved to +another machine. + +Next was the "live migration" functionality. This is important +because some guests run with a lot of state (specially RAM), and it +can take a while to move all state from one machine to another. Live +migration allows the guest to continue running while the state is +transferred. Only while the last part of the state is transferred has +the guest to be stopped. Typically the time that the guest is +unresponsive during live migration is the low hundred of milliseconds +(notice that this depends on a lot of things). 
+ +=== Types of migration === + +Now that we have talked about live migration, there are several ways +to do migration: + +- tcp migration: do the migration using tcp sockets +- unix migration: do the migration using unix sockets +- exec migration: do the migration using the stdin/stdout through a process. +- fd migration: do the migration using a file descriptor that is + passed to QEMU. QEMU doesn't care how this file descriptor is opened. + +All these four migration protocols use the same infrastructure to +save/restore device state. This infrastructure is shared with the +savevm/loadvm functionality. + +=== State Live Migration === + +This is used for RAM and block devices. It is not yet ported to vmstate. +<Fill more information here> + +=== What is the common infrastructure === + +QEMU uses a QEMUFile abstraction to be able to do migration. Any type +of migration that wants to use QEMU infrastructure has to create a +QEMUFile with: + +QEMUFile *qemu_fopen_ops(void *opaque, + QEMUFilePutBufferFunc *put_buffer, + QEMUFileGetBufferFunc *get_buffer, + QEMUFileCloseFunc *close); + +The functions have the following functionality: + +This function writes a chunk of data to a file at the given position. +The pos argument can be ignored if the file is only used for +streaming. The handler should try to write all of the data it can. + +typedef int (QEMUFilePutBufferFunc)(void *opaque, const uint8_t *buf, + int64_t pos, int size); + +Read a chunk of data from a file at the given position. The pos argument +can be ignored if the file is only used for streaming. The number of +bytes actually read should be returned. + +typedef int (QEMUFileGetBufferFunc)(void *opaque, uint8_t *buf, + int64_t pos, int size); + +Close a file and return an error code. + +typedef int (QEMUFileCloseFunc)(void *opaque); + +You can use any internal state that you need using the opaque void * +pointer that is passed to all functions. 
+ +The important functions for us are put_buffer()/get_buffer() that +allow to write/read a buffer into the QEMUFile. + +=== How to save the state of one device === + +The state of a device is saved using intermediate buffers. There are +some helper functions to assist this saving. + +There is a new concept that we have to explain here: device state +version. When we migrate a device, we save/load the state as a series +of fields. Some times, due to bugs or new functionality, we need to +change the state to store more/different information. We use the +version to identify each time that we do a change. Each version is +associated with a series of fields saved. The save_state always saves +the state as the newer version. But load_state sometimes is able to +load state from an older version. + +=== Legacy way === + +This way is going to disappear as soon as all current users are ported to VMSTATE. + +Each device has to register two functions, one to save the state and +another to load the state back. + +int register_savevm(DeviceState *dev, + const char *idstr, + int instance_id, + int version_id, + SaveStateHandler *save_state, + LoadStateHandler *load_state, + void *opaque); + +typedef void SaveStateHandler(QEMUFile *f, void *opaque); +typedef int LoadStateHandler(QEMUFile *f, void *opaque, int version_id); + +The important functions for the device state format are the save_state +and load_state. Notice that load_state receives a version_id +parameter to know what state format is receiving. save_state doesn't +have a version_id parameter because it always uses the latest version. + +=== VMState === + +The legacy way of saving/loading state of the device had the problem +that we have to maintain two functions in sync. If we did one change +in one of them and not in the other, we would get a failed migration. + +VMState changed the way that state is saved/loaded. 
Instead of using +a function to save the state and another to load it, it was changed to +a declarative way of what the state consisted of. Now VMState is able +to interpret that definition to be able to load/save the state. As +the state is declared only once, it can't go out of sync in the +save/load functions. + +An example (from hw/input/pckbd.c) + +static const VMStateDescription vmstate_kbd = { + .name = "pckbd", + .version_id = 3, + .minimum_version_id = 3, + .fields = (VMStateField[]) { + VMSTATE_UINT8(write_cmd, KBDState), + VMSTATE_UINT8(status, KBDState), + VMSTATE_UINT8(mode, KBDState), + VMSTATE_UINT8(pending, KBDState), + VMSTATE_END_OF_LIST() + } +}; + +We are declaring the state with name "pckbd". +The version_id is 3, and the fields are 4 uint8_t in a KBDState structure. +We registered this with: + + vmstate_register(NULL, 0, &vmstate_kbd, s); + +Note: talk about how vmstate <-> qdev interact, and what the instance ids mean. + +You can search for VMSTATE_* macros for lots of types used in QEMU in +include/hw/hw.h. + +=== More about versions === + +You can see that there are several version fields: + +- version_id: the maximum version_id supported by VMState for that device. +- minimum_version_id: the minimum version_id that VMState is able to understand + for that device. +- minimum_version_id_old: For devices that were not able to port to vmstate, we can + assign a function that knows how to read this old state. This field is + ignored if there is no load_state_old handler. + +So, VMState is able to read versions from minimum_version_id to +version_id. And the function load_state_old() (if present) is able to +load state from minimum_version_id_old to minimum_version_id. This +function is deprecated and will be removed when no more users are left. + +=== Massaging functions === + +Sometimes, it is not enough to be able to save the state directly +from one structure, we need to fill the correct values there. One +example is when we are using kvm. 
Before saving the cpu state, we +need to ask kvm to copy to QEMU the state that it is using. And the +opposite when we are loading the state, we need a way to tell kvm to +load the state for the cpu that we have just loaded from the QEMUFile. + +The functions to do that are inside a vmstate definition, and are called: + +- int (*pre_load)(void *opaque); + + This function is called before we load the state of one device. + +- int (*post_load)(void *opaque, int version_id); + + This function is called after we load the state of one device. + +- void (*pre_save)(void *opaque); + + This function is called before we save the state of one device. + +Example: You can look at hpet.c, which uses the three functions to + massage the state that is transferred. + +If you use memory API functions that update memory layout outside +initialization (i.e., in response to a guest action), this is a strong +indication that you need to call these functions in a post_load callback. +Examples of such memory API functions are: + + - memory_region_add_subregion() + - memory_region_del_subregion() + - memory_region_set_readonly() + - memory_region_set_enabled() + - memory_region_set_address() + - memory_region_set_alias_offset() + +=== Subsections === + +The use of version_id allows us to migrate from older versions +to newer versions of a device, but not the other way around. This +makes it very complicated to fix bugs in stable branches. If we need to +add anything to the state to fix a bug, we have to disable migration +to older versions that don't have that bug-fix (i.e. a new field). + +But sometimes, that bug-fix is only needed sometimes, not always. For +instance, if the device is in the middle of a DMA operation, it is +using a specific functionality, .... + +It is impossible to create a way to make migration from any version to +any other version work. But we can do better than only allowing +migration from older versions to newer ones. 
For those fields that are +only needed sometimes, we add the idea of subsections. A subsection +is "like" a device vmstate, but with a particularity: it has a Boolean +function that tells whether those values need to be sent or not. If +this function returns false, the subsection is not sent. + +On the receiving side, if we find a subsection for a device that we +don't understand, we just fail the migration. If we understand all +the subsections, then we load the state with success. + +One important note is that the post_load() function is called "after" +loading all subsections, because a newer subsection could change the same +value that it uses. + +Example: + +static bool ide_drive_pio_state_needed(void *opaque) +{ + IDEState *s = opaque; + + return ((s->status & DRQ_STAT) != 0) + || (s->bus->error_status & BM_STATUS_PIO_RETRY); +} + +const VMStateDescription vmstate_ide_drive_pio_state = { + .name = "ide_drive/pio_state", + .version_id = 1, + .minimum_version_id = 1, + .pre_save = ide_drive_pio_pre_save, + .post_load = ide_drive_pio_post_load, + .needed = ide_drive_pio_state_needed, + .fields = (VMStateField[]) { + VMSTATE_INT32(req_nb_sectors, IDEState), + VMSTATE_VARRAY_INT32(io_buffer, IDEState, io_buffer_total_len, 1, + vmstate_info_uint8, uint8_t), + VMSTATE_INT32(cur_io_buffer_offset, IDEState), + VMSTATE_INT32(cur_io_buffer_len, IDEState), + VMSTATE_UINT8(end_transfer_fn_idx, IDEState), + VMSTATE_INT32(elementary_transfer_size, IDEState), + VMSTATE_INT32(packet_transfer_size, IDEState), + VMSTATE_END_OF_LIST() + } +}; + +const VMStateDescription vmstate_ide_drive = { + .name = "ide_drive", + .version_id = 3, + .minimum_version_id = 0, + .post_load = ide_drive_post_load, + .fields = (VMStateField[]) { + .... several fields .... + VMSTATE_END_OF_LIST() + }, + .subsections = (const VMStateDescription*[]) { + &vmstate_ide_drive_pio_state, + NULL + } +}; + +Here we have a subsection for the pio state. 
We only need to +save/send this state when we are in the middle of a pio operation +(that is what ide_drive_pio_state_needed() checks). If DRQ_STAT is +not enabled, the values on that fields are garbage and don't need to +be sent. + += Return path = + +In most migration scenarios there is only a single data path that runs +from the source VM to the destination, typically along a single fd (although +possibly with another fd or similar for some fast way of throwing pages across). + +However, some uses need two way communication; in particular the Postcopy +destination needs to be able to request pages on demand from the source. + +For these scenarios there is a 'return path' from the destination to the source; +qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return +path. + + Source side + Forward path - written by migration thread + Return path - opened by main thread, read by return-path thread + + Destination side + Forward path - read by main thread + Return path - opened by main thread, written by main thread AND postcopy + thread (protected by rp_mutex) + += Postcopy = +'Postcopy' migration is a way to deal with migrations that refuse to converge +(or take too long to converge) its plus side is that there is an upper bound on +the amount of migration traffic and time it takes, the down side is that during +the postcopy phase, a failure of *either* side or the network connection causes +the guest to be lost. + +In postcopy the destination CPUs are started before all the memory has been +transferred, and accesses to pages that are yet to be transferred cause +a fault that's translated by QEMU into a request to the source QEMU. + +Postcopy can be combined with precopy (i.e. normal migration) so that if precopy +doesn't finish in a given time the switch is made to postcopy. 
+ +=== Enabling postcopy === + +To enable postcopy, issue this command on the monitor prior to the +start of migration: + +migrate_set_capability x-postcopy-ram on + +The normal commands are then used to start a migration, which is still +started in precopy mode. Issuing: + +migrate_start_postcopy + +will now cause the transition from precopy to postcopy. +It can be issued immediately after migration is started or any +time later on. Issuing it after the end of a migration is harmless. + +Note: During the postcopy phase, the bandwidth limits set using +migrate_set_speed are ignored (to avoid delaying requested pages that +the destination is waiting for). + +=== Postcopy device transfer === + +Loading of device data may cause the device emulation to access guest RAM +that may trigger faults that have to be resolved by the source, as such +the migration stream has to be able to respond with page data *during* the +device load, and hence the device data has to be read from the stream completely +before the device load begins to free the stream up. This is achieved by +'packaging' the device data into a blob that's read in one go. + +Source behaviour + +Until postcopy is entered the migration stream is identical to normal +precopy, except for the addition of a 'postcopy advise' command at +the beginning, to tell the destination that postcopy might happen. +When postcopy starts the source sends the page discard data and then +forms the 'package' containing: + + Command: 'postcopy listen' + The device state + A series of sections, identical to the precopy stream's device state stream + containing everything except postcopiable devices (i.e. RAM) + Command: 'postcopy run' + +The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the +contents are formatted in the same way as the main migration stream. 
+ +During postcopy the source scans the list of dirty pages and sends them +to the destination without being requested (in much the same way as precopy), +however when a page request is received from the destination, the dirty page +scanning restarts from the requested location. This causes requested pages +to be sent quickly, and also causes pages directly after the requested page +to be sent quickly in the hope that those pages are likely to be used +by the destination soon. + +Destination behaviour + +Initially the destination looks the same as precopy, with a single thread +reading the migration stream; the 'postcopy advise' and 'discard' commands +are processed to change the way RAM is managed, but don't affect the stream +processing. + +------------------------------------------------------------------------------ + 1 2 3 4 5 6 7 +main -----DISCARD-CMD_PACKAGED ( LISTEN DEVICE DEVICE DEVICE RUN ) +thread | | + | (page request) + | \___ + v \ +listen thread: --- page -- page -- page -- page -- page -- + + a b c +------------------------------------------------------------------------------ + +On receipt of CMD_PACKAGED (1) + All the data associated with the package - the ( ... ) section in the +diagram - is read into memory (into a QEMUSizedBuffer), and the main thread +recurses into qemu_loadvm_state_main to process the contents of the package (2) +which contains commands (3,6) and devices (4...) + +On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package) +a new thread (a) is started that takes over servicing the migration stream, +while the main thread carries on loading the package. It loads normal +background page data (b) but if during a device load a fault happens (5) the +returned page (c) is loaded by the listen thread allowing the main threads +device load to carry on. + +The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination +CPUs start running. 
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour +and is no longer used by migration, while the listen thread carries +on servicing page data until the end of migration. + +=== Postcopy states === + +Postcopy moves through a series of states (see postcopy_state) from +ADVISE->DISCARD->LISTEN->RUNNING->END + + Advise: Set at the start of migration if postcopy is enabled, even + if it hasn't had the start command; here the destination + checks that its OS has the support needed for postcopy, and performs + setup to ensure the RAM mappings are suitable for later postcopy. + The destination will fail early in migration at this point if the + required OS support is not present. + (Triggered by reception of POSTCOPY_ADVISE command) + + Discard: Entered on receipt of the first 'discard' command; prior to + the first Discard being performed, hugepages are switched off + (using madvise) to ensure that no new huge pages are created + during the postcopy phase, and to cause any huge pages that + have discards on them to be broken. + + Listen: The first command in the package, POSTCOPY_LISTEN, switches + the destination state to Listen, and starts a new thread + (the 'listen thread') which takes over the job of receiving + pages off the migration stream, while the main thread carries + on processing the blob. With this thread able to process page + reception, the destination now 'sensitises' the RAM to detect + any access to missing pages (on Linux using the 'userfault' + system). + + Running: POSTCOPY_RUN causes the destination to synchronise all + state and start the CPUs and IO devices running. The main + thread now finishes processing the migration package and + now carries on as it would for normal precopy migration + (although it can't do the cleanup it would do as it + finishes a normal migration). + + End: The listen thread can now quit, and perform the cleanup of migration + state, the migration is now complete. 
+ +=== Source side page maps === + +The source side keeps two bitmaps during postcopy; 'the migration bitmap' +and 'unsent map'. The 'migration bitmap' is basically the same as in +the precopy case, and holds a bit to indicate that page is 'dirty' - +i.e. needs sending. During the precopy phase this is updated as the CPU +dirties pages, however during postcopy the CPUs are stopped and nothing +should dirty anything any more. + +The 'unsent map' is used for the transition to postcopy. It is a bitmap that +has a bit cleared whenever a page is sent to the destination, however during +the transition to postcopy mode it is combined with the migration bitmap +to form a set of pages that: + a) Have been sent but then redirtied (which must be discarded) + b) Have not yet been sent - which also must be discarded to cause any + transparent huge pages built during precopy to be broken. + +Note that the contents of the unsentmap are sacrificed during the calculation +of the discard set and thus aren't valid once in postcopy. The dirtymap +is still valid and is used to ensure that no page is sent more than once. Any +request for a page that has already been sent is ignored. Duplicate requests +such as this can happen as a page is sent at about the same time the +destination accesses it. + diff --git a/src/docs/multi-thread-compression.txt b/src/docs/multi-thread-compression.txt new file mode 100644 index 0000000..3d477c3 --- /dev/null +++ b/src/docs/multi-thread-compression.txt @@ -0,0 +1,149 @@ +Use multiple thread (de)compression in live migration +===================================================== +Copyright (C) 2015 Intel Corporation +Author: Liang Li <liang.z.li@intel.com> + +This work is licensed under the terms of the GNU GPLv2 or later. See +the COPYING file in the top-level directory. 
+ +Contents: +========= +* Introduction +* When to use +* Performance +* Usage +* TODO + +Introduction +============ +Instead of sending the guest memory directly, this solution will +compress the RAM page before sending; after receiving, the data will +be decompressed. Using compression in live migration can help +to reduce the data transferred about 60%, this is very useful when the +bandwidth is limited, and the total migration time can also be reduced +about 70% in a typical case. In addition to this, the VM downtime can be +reduced about 50%. The benefit depends on data's compressibility in VM. + +The process of compression will consume additional CPU cycles, and the +extra CPU cycles will increase the migration time. On the other hand, +the amount of data transferred will decrease; this factor can reduce +the total migration time. If the process of the compression is quick +enough, then the total migration time can be reduced, and multiple +thread compression can be used to accelerate the compression process. + +The decompression speed of Zlib is at least 4 times as quick as +compression, if the source and destination CPU have equal speed, +keeping the compression thread count 4 times the decompression +thread count can avoid resource waste. + +Compression level can be used to control the compression speed and the +compression ratio. High compression ratio will take more time, level 0 +stands for no compression, level 1 stands for the best compression +speed, and level 9 stands for the best compression ratio. Users can +select a level number between 0 and 9. + + +When to use the multiple thread compression in live migration +============================================================= +Compression of data will consume extra CPU cycles; so in a system with +high overhead of CPU, avoid using this feature. When the network +bandwidth is very limited and the CPU resource is adequate, use of +multiple thread compression will be very helpful. 
If both the CPU and +the network bandwidth are adequate, use of multiple thread compression +can still help to reduce the migration time. + +Performance +=========== +Test environment: + +CPU: Intel(R) Xeon(R) CPU E5-2680 0 @ 2.70GHz +Socket Count: 2 +RAM: 128G +NIC: Intel I350 (10/100/1000Mbps) +Host OS: CentOS 7 64-bit +Guest OS: RHEL 6.5 64-bit +Parameter: qemu-system-x86_64 -enable-kvm -smp 4 -m 4096 + /share/ia32e_rhel6u5.qcow -monitor stdio + +There is no additional application running on the guest when doing +the test. + + +Speed limit: 1000Gb/s +--------------------------------------------------------------- + | original | compress thread: 8 + | way | decompress thread: 2 + | | compression level: 1 +--------------------------------------------------------------- +total time(msec): | 3333 | 1833 +--------------------------------------------------------------- +downtime(msec): | 100 | 27 +--------------------------------------------------------------- +transferred ram(kB):| 363536 | 107819 +--------------------------------------------------------------- +throughput(mbps): | 893.73 | 482.22 +--------------------------------------------------------------- +total ram(kB): | 4211524 | 4211524 +--------------------------------------------------------------- + +There is an application running on the guest which writes random numbers +to RAM block areas periodically. 
+ +Speed limit: 1000Gb/s +--------------------------------------------------------------- + | original | compress thread: 8 + | way | decompress thread: 2 + | | compression level: 1 +--------------------------------------------------------------- +total time(msec): | 37369 | 15989 +--------------------------------------------------------------- +downtime(msec): | 337 | 173 +--------------------------------------------------------------- +transferred ram(kB):| 4274143 | 1699824 +--------------------------------------------------------------- +throughput(mbps): | 936.99 | 870.95 +--------------------------------------------------------------- +total ram(kB): | 4211524 | 4211524 +--------------------------------------------------------------- + +Usage +===== +1. Verify both the source and destination QEMU are able +to support the multiple thread compression migration: + {qemu} info_migrate_capabilities + {qemu} ... compress: off ... + +2. Activate compression on the source: + {qemu} migrate_set_capability compress on + +3. Set the compression thread count on source: + {qemu} migrate_set_parameter compress_threads 12 + +4. Set the compression level on the source: + {qemu} migrate_set_parameter compress_level 1 + +5. Set the decompression thread count on destination: + {qemu} migrate_set_parameter decompress_threads 3 + +6. Start outgoing migration: + {qemu} migrate -d tcp:destination.host:4444 + {qemu} info migrate + Capabilities: ... compress: on + ... + +The following are the default settings: + compress: off + compress_threads: 8 + decompress_threads: 2 + compress_level: 1 (which means best speed) + +So, only the first two steps are required to use the multiple +thread compression in migration. You can do more if the default +settings are not appropriate. + +TODO +==== +Some faster (de)compression method such as LZ4 and Quicklz can help +to reduce the CPU consumption when doing (de)compression. 
If using +these faster (de)compression method, less (de)compression threads +are needed when doing the migration. diff --git a/src/docs/multiple-iothreads.txt b/src/docs/multiple-iothreads.txt new file mode 100644 index 0000000..40b8419 --- /dev/null +++ b/src/docs/multiple-iothreads.txt @@ -0,0 +1,134 @@ +Copyright (c) 2014 Red Hat Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + + +This document explains the IOThread feature and how to write code that runs +outside the QEMU global mutex. + +The main loop and IOThreads +--------------------------- +QEMU is an event-driven program that can do several things at once using an +event loop. The VNC server and the QMP monitor are both processed from the +same event loop, which monitors their file descriptors until they become +readable and then invokes a callback. + +The default event loop is called the main loop (see main-loop.c). It is +possible to create additional event loop threads using -object +iothread,id=my-iothread. + +Side note: The main loop and IOThread are both event loops but their code is +not shared completely. Sometimes it is useful to remember that although they +are conceptually similar they are currently not interchangeable. + +Why IOThreads are useful +------------------------ +IOThreads allow the user to control the placement of work. The main loop is a +scalability bottleneck on hosts with many CPUs. Work can be spread across +several IOThreads instead of just one main loop. When set up correctly this +can improve I/O latency and reduce jitter seen by the guest. + +The main loop is also deeply associated with the QEMU global mutex, which is a +scalability bottleneck in itself. vCPU threads and the main loop use the QEMU +global mutex to serialize execution of QEMU code. This mutex is necessary +because a lot of QEMU's code historically was not thread-safe. 
+ +The fact that all I/O processing is done in a single main loop and that the +QEMU global mutex is contended by all vCPU threads and the main loop explain +why it is desirable to place work into IOThreads. + +The experimental virtio-blk data-plane implementation has been benchmarked and +shows these effects: +ftp://public.dhe.ibm.com/linux/pdfs/KVM_Virtualized_IO_Performance_Paper.pdf + +How to program for IOThreads +---------------------------- +The main difference between legacy code and new code that can run in an +IOThread is dealing explicitly with the event loop object, AioContext +(see include/block/aio.h). Code that only works in the main loop +implicitly uses the main loop's AioContext. Code that supports running +in IOThreads must be aware of its AioContext. + +AioContext supports the following services: + * File descriptor monitoring (read/write/error on POSIX hosts) + * Event notifiers (inter-thread signalling) + * Timers + * Bottom Halves (BH) deferred callbacks + +There are several old APIs that use the main loop AioContext: + * LEGACY qemu_aio_set_fd_handler() - monitor a file descriptor + * LEGACY qemu_aio_set_event_notifier() - monitor an event notifier + * LEGACY timer_new_ms() - create a timer + * LEGACY qemu_bh_new() - create a BH + * LEGACY qemu_aio_wait() - run an event loop iteration + +Since they implicitly work on the main loop they cannot be used in code that +runs in an IOThread. They might cause a crash or deadlock if called from an +IOThread since the QEMU global mutex is not held. + +Instead, use the AioContext functions directly (see include/block/aio.h): + * aio_set_fd_handler() - monitor a file descriptor + * aio_set_event_notifier() - monitor an event notifier + * aio_timer_new() - create a timer + * aio_bh_new() - create a BH + * aio_poll() - run an event loop iteration + +The AioContext can be obtained from the IOThread using +iothread_get_aio_context() or for the main loop using qemu_get_aio_context(). 
+Code that takes an AioContext argument works both in IOThreads and in the main +loop, depending on which AioContext instance the caller passes in. + +How to synchronize with an IOThread +----------------------------------- +AioContext is not thread-safe so some rules must be followed when using file +descriptors, event notifiers, timers, or BHs across threads: + +1. AioContext functions can be called safely from file descriptor, event +notifier, timer, or BH callbacks invoked by the AioContext. No locking is +necessary. + +2. Other threads wishing to access the AioContext must use +aio_context_acquire()/aio_context_release() for mutual exclusion. Once the +context is acquired no other thread can access it or run event loop iterations +in this AioContext. + +aio_context_acquire()/aio_context_release() calls may be nested. This +means you can call them if you're not sure whether #1 applies. + +There is currently no lock ordering rule if a thread needs to acquire multiple +AioContexts simultaneously. Therefore, it is only safe for code holding the +QEMU global mutex to acquire other AioContexts. + +Side note: the best way to schedule a function call across threads is to create +a BH in the target AioContext beforehand and then call qemu_bh_schedule(). No +acquire/release or locking is needed for the qemu_bh_schedule() call. But be +sure to acquire the AioContext for aio_bh_new() if necessary. + +The relationship between AioContext and the block layer +------------------------------------------------------- +The AioContext originates from the QEMU block layer because it provides a +scoped way of running event loop iterations until all work is done. This +feature is used to complete all in-flight block I/O requests (see +bdrv_drain_all()). Nowadays AioContext is a generic event loop that can be +used by any QEMU subsystem. + +The block layer has support for AioContext integrated.
Each BlockDriverState +is associated with an AioContext using bdrv_set_aio_context() and +bdrv_get_aio_context(). This allows block layer code to process I/O inside the +right AioContext. Other subsystems may wish to follow a similar approach. + +Block layer code must therefore expect to run in an IOThread and avoid using +old APIs that implicitly use the main loop. See the "How to program for +IOThreads" section above for information on how to do that. + +If main loop code such as a QMP function wishes to access a BlockDriverState it +must first call aio_context_acquire(bdrv_get_aio_context(bs)) to ensure the +IOThread does not run in parallel. + +Long-running jobs (usually in the form of coroutines) are best scheduled in the +BlockDriverState's AioContext to avoid the need to acquire/release around each +bdrv_*() call. Be aware that there is currently no mechanism to get notified +when bdrv_set_aio_context() moves this BlockDriverState to a different +AioContext (see bdrv_detach_aio_context()/bdrv_attach_aio_context()), so you +may need to add this if you want to support long-running jobs. diff --git a/src/docs/multiseat.txt b/src/docs/multiseat.txt new file mode 100644 index 0000000..807518c --- /dev/null +++ b/src/docs/multiseat.txt @@ -0,0 +1,145 @@ + +multiseat howto (with some multihead coverage) +============================================== + +host devices +------------ + +First you must compile qemu with a user interface supporting +multihead/multiseat and input event routing. Right now this +list includes sdl2, gtk (both 2+3) and vnc: + + ./configure --enable-sdl --with-sdlabi=2.0 + +or + + ./configure --enable-gtk + + +Next put together the qemu command line (sdl/gtk): + +qemu -enable-kvm -usb $memory $disk $whatever \ + -display [ sdl | gtk ] \ + -vga std \ + -device usb-tablet + +That is it for the first seat, which will use the standard vga, the +standard ps/2 keyboard (implicitly there) and the usb-tablet.
Now the +additional switches for the second seat: + + -device pci-bridge,addr=12.0,chassis_nr=2,id=head.2 \ + -device secondary-vga,bus=head.2,addr=02.0,id=video.2 \ + -device nec-usb-xhci,bus=head.2,addr=0f.0,id=usb.2 \ + -device usb-kbd,bus=usb.2.0,port=1,display=video.2 \ + -device usb-tablet,bus=usb.2.0,port=2,display=video.2 + +This places a pci bridge in slot 12, connects a display adapter and +xhci (usb) controller to the bridge. Then it adds a usb keyboard and +usb tablet, both connected to the xhci and linked to the display. + +The "display=video.2" sets up the input routing. Any input coming from +the window which belongs to the video.2 display adapter will be routed +to these input devices. + +Starting with qemu 2.4 and linux kernel 4.1 you can also use virtio +for the input devices, using this ... + + -device pci-bridge,addr=12.0,chassis_nr=2,id=head.2 \ + -device secondary-vga,bus=head.2,addr=02.0,id=video.2 \ + -device virtio-keyboard-pci,bus=head.2,addr=03.0,display=video.2 \ + -device virtio-tablet-pci,bus=head.2,addr=04.0,display=video.2 + +... instead of xhci and usb hid devices. + +host ui +------- + +The sdl2 ui will start up with two windows, one for each display +device. The gtk ui will start with a single window and each display +in a separate tab. You can either simply switch tabs to switch heads, +or use the "View / Detach tab" menu item to move one of the displays +to its own window so you can see both display devices side-by-side. + +For vnc some additional configuration on the command line is needed. +We'll create two vnc server instances, and bind the second one to the +second seat, similar to input devices: + + -display vnc=:1,id=primary \ + -display vnc=:2,id=secondary,display=video.2 + +Connecting to vnc display :1 gives you access to the first seat, and +likewise connecting to vnc display :2 shows the second seat. + +Note on spice: Spice handles multihead just fine. But it can't do +multiseat.
For tablet events the event source is sent to the spice +agent. But qemu can't figure it out, so it can't do input routing. +Fixing this needs a new or extended input interface between +libspice-server and qemu. For keyboard events it is even worse: The +event source isn't included in the spice protocol, so the wire +protocol must be extended to support this. + + +guest side +---------- + +You need a pretty recent linux guest. systemd with loginctl. kernel +3.14+ with CONFIG_DRM_BOCHS enabled. Fedora 20 will do. Must be +fully updated for the new kernel though, i.e. the live iso doesn't cut +it. + +Now we'll have to configure the guest. Boot and log in. "lspci -vt" +should list the pci bridge with the display adapter and usb controller: + + [root@fedora ~]# lspci -vt + -[0000:00]-+-00.0 Intel Corporation 440FX - 82441FX PMC [Natoma] + [ ... ] + \-12.0-[01]--+-02.0 Device 1234:1111 + \-0f.0 NEC Corporation USB 3.0 Host Controller + +Good. Now let's tell the system that the pci bridge and all devices +below it belong to a separate seat by dropping a file into +/etc/udev/rules.d: + + [root@fedora ~]# cat /etc/udev/rules.d/70-qemu-autoseat.rules + SUBSYSTEMS=="pci", DEVPATH=="*/0000:00:12.0", TAG+="seat", ENV{ID_AUTOSEAT}="1" + +Reboot. System should come up with two seats. With loginctl you can +check the configuration: + + [root@fedora ~]# loginctl list-seats + SEAT + seat0 + seat-pci-pci-0000_00_12_0 + + 2 seats listed. + +You can use "loginctl seat-status seat-pci-pci-0000_00_12_0" to list +the devices attached to the seat. + +Background info is here: + http://www.freedesktop.org/wiki/Software/systemd/multiseat/ + + +guest side with pci-bridge-seat +------------------------------- + +Qemu version 2.4 and newer has a new pci-bridge-seat device which +can be used instead of pci-bridge. Just swap the device name in the +qemu command line above. The only difference between the two devices +is the pci id.
We can match the pci id instead of the device path +with a nice generic rule now, which simplifies the guest +configuration: + + [root@fedora ~]# cat /etc/udev/rules.d/70-qemu-pci-bridge-seat.rules + SUBSYSTEM=="pci", ATTR{vendor}=="0x1b36", ATTR{device}=="0x000a", \ + TAG+="seat", ENV{ID_AUTOSEAT}="1" + +A patch with this rule has been submitted to upstream udev/systemd, was +accepted and should be included in the next systemd release (222). +So, if your guest has this or a newer version, multiseat will work just +fine without any manual guest configuration. + +Enjoy! + +-- +Gerd Hoffmann <kraxel@redhat.com> diff --git a/src/docs/pci_expander_bridge.txt b/src/docs/pci_expander_bridge.txt new file mode 100644 index 0000000..d7913fb --- /dev/null +++ b/src/docs/pci_expander_bridge.txt @@ -0,0 +1,58 @@ +PCI EXPANDER BRIDGE (PXB) +========================= + +Description +=========== +PXB is a "light-weight" host bridge in the same PCI domain +as the main host bridge whose purpose is to enable +the main host bridge to support multiple PCI root buses. +It is implemented only for i440fx and can be placed only +on bus 0 (pci.0). + +As opposed to a PCI-2-PCI bridge's secondary bus, PXB's bus +is a primary bus and can be associated with a NUMA node +(different from the main host bridge) allowing the guest OS +to recognize the proximity of a pass-through device to +other resources such as RAM and CPUs.
+ +Usage +===== +A detailed command line would be: + +[qemu-bin + storage options] +-m 2G +-object memory-backend-ram,size=1024M,policy=bind,host-nodes=0,id=ram-node0 -numa node,nodeid=0,cpus=0,memdev=ram-node0 +-object memory-backend-ram,size=1024M,policy=bind,host-nodes=1,id=ram-node1 -numa node,nodeid=1,cpus=1,memdev=ram-node1 +-device pxb,id=bridge1,bus=pci.0,numa_node=1,bus_nr=4 -netdev user,id=nd -device e1000,bus=bridge1,addr=0x4,netdev=nd +-device pxb,id=bridge2,bus=pci.0,numa_node=0,bus_nr=8,bus=pci.0 -device e1000,bus=bridge2,addr=0x3 +-device pxb,id=bridge3,bus=pci.0,bus_nr=40,bus=pci.0 -drive if=none,id=drive0,file=[img] -device virtio-blk-pci,drive=drive0,scsi=off,bus=bridge3,addr=1 + +Here you have: + - 2 NUMA nodes for the guest, 0 and 1. (both mapped to the same NUMA node in host, but you can and should put them in different host NUMA nodes) + - a pxb host bridge attached to NUMA 1 with an e1000 behind it + - a pxb host bridge attached to NUMA 0 with an e1000 behind it + - a pxb host bridge not attached to any NUMA with a hard drive behind it. + +Limitations +=========== +Please observe that we specified the bus "pci.0" for the second and third pxb. +This is because when no bus is given, another pxb can be selected by QEMU as the default bus; +however, PXBs can be placed only under the root bus. + +Implementation +============== +The PXB is composed of: +- HostBridge (TYPE_PXB_HOST) + The host bridge allows registering and querying the PXB's PCI root bus in QEMU. +- PXBDev(TYPE_PXB_DEVICE) + It is a regular PCI Device that resides on the piix host-bridge bus and its bus uses the same PCI domain. + However, the bus behind is exposed through ACPI as a primary PCI bus and starts a new PCI hierarchy. + The interrupts from devices behind the PXB are routed through this device the same as if it were a + PCI-2-PCI bridge. The _PRT follows the i440fx model. +- PCIBridgeDev(TYPE_PCI_BRIDGE_DEV) + Created automatically as part of init sequence.
+ When adding a device to PXB it is attached to the bridge for two reasons: + - Using the bridge will enable hotplug support + - All the devices behind the bridge will use the bridge's IO/MEM windows, compacting + the PCI address space. + diff --git a/src/docs/q35-chipset.cfg b/src/docs/q35-chipset.cfg new file mode 100644 index 0000000..e4ddb7d --- /dev/null +++ b/src/docs/q35-chipset.cfg @@ -0,0 +1,152 @@ +################################################################ +# +# qemu -M q35 creates a bare machine with just the very essential +# chipset devices being present: +# +# 00.0 - Host bridge +# 1f.0 - ISA bridge / LPC +# 1f.2 - SATA (AHCI) controller +# 1f.3 - SMBus controller +# +# This config file documents the other devices and how they are +# created. You can simply use "-readconfig $thisfile" to create +# them all. Here is an overview: +# +# 19.0 - Ethernet controller (not created, our e1000 emulation +# doesn't emulate the ich9 device). +# 1a.* - USB Controller #2 (ehci + uhci companions) +# 1b.0 - HD Audio Controller +# 1c.* - PCI Express Ports +# 1d.* - USB Controller #1 (ehci + uhci companions, +# "qemu -M q35 -usb" creates these too) +# 1e.0 - PCI Bridge +# + +[device "ich9-ehci-2"] + driver = "ich9-usb-ehci2" + multifunction = "on" + bus = "pcie.0" + addr = "1a.7" + +[device "ich9-uhci-4"] + driver = "ich9-usb-uhci4" + multifunction = "on" + bus = "pcie.0" + addr = "1a.0" + masterbus = "ich9-ehci-2.0" + firstport = "0" + +[device "ich9-uhci-5"] + driver = "ich9-usb-uhci5" + multifunction = "on" + bus = "pcie.0" + addr = "1a.1" + masterbus = "ich9-ehci-2.0" + firstport = "2" + +[device "ich9-uhci-6"] + driver = "ich9-usb-uhci6" + multifunction = "on" + bus = "pcie.0" + addr = "1a.2" + masterbus = "ich9-ehci-2.0" + firstport = "4" + + +[device "ich9-hda-audio"] + driver = "ich9-intel-hda" + bus = "pcie.0" + addr = "1b.0" + + +[device "ich9-pcie-port-1"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.0" + port = "1" + chassis =
"1" + +[device "ich9-pcie-port-2"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.1" + port = "2" + chassis = "2" + +[device "ich9-pcie-port-3"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.2" + port = "3" + chassis = "3" + +[device "ich9-pcie-port-4"] + driver = "ioh3420" + multifunction = "on" + bus = "pcie.0" + addr = "1c.3" + port = "4" + chassis = "4" + +## +# Example PCIe switch with two downstream ports +# +#[device "pcie-switch-upstream-port-1"] +# driver = "x3130-upstream" +# bus = "ich9-pcie-port-4" +# addr = "00.0" +# +#[device "pcie-switch-downstream-port-1-1"] +# driver = "xio3130-downstream" +# multifunction = "on" +# bus = "pcie-switch-upstream-port-1" +# addr = "00.0" +# port = "1" +# chassis = "5" +# +#[device "pcie-switch-downstream-port-1-2"] +# driver = "xio3130-downstream" +# multifunction = "on" +# bus = "pcie-switch-upstream-port-1" +# addr = "00.1" +# port = "1" +# chassis = "6" + +[device "ich9-ehci-1"] + driver = "ich9-usb-ehci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.7" + +[device "ich9-uhci-1"] + driver = "ich9-usb-uhci1" + multifunction = "on" + bus = "pcie.0" + addr = "1d.0" + masterbus = "ich9-ehci-1.0" + firstport = "0" + +[device "ich9-uhci-2"] + driver = "ich9-usb-uhci2" + multifunction = "on" + bus = "pcie.0" + addr = "1d.1" + masterbus = "ich9-ehci-1.0" + firstport = "2" + +[device "ich9-uhci-3"] + driver = "ich9-usb-uhci3" + multifunction = "on" + bus = "pcie.0" + addr = "1d.2" + masterbus = "ich9-ehci-1.0" + firstport = "4" + + +[device "ich9-pci-bridge"] + driver = "i82801b11-bridge" + bus = "pcie.0" + addr = "1e.0" diff --git a/src/docs/qapi-code-gen.txt b/src/docs/qapi-code-gen.txt new file mode 100644 index 0000000..ceb9a78 --- /dev/null +++ b/src/docs/qapi-code-gen.txt @@ -0,0 +1,1112 @@ += How to use the QAPI code generator = + +Copyright IBM Corp. 2011 +Copyright (C) 2012-2015 Red Hat, Inc. 
+ +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +== Introduction == + +QAPI is a native C API within QEMU which provides management-level +functionality to internal and external users. For external +users/processes, this interface is made available by a JSON-based wire +format for the QEMU Monitor Protocol (QMP) for controlling qemu, as +well as the QEMU Guest Agent (QGA) for communicating with the guest. +The remainder of this document uses "Client JSON Protocol" when +referring to the wire contents of a QMP or QGA connection. + +To map Client JSON Protocol interfaces to the native C QAPI +implementations, a JSON-based schema is used to define types and +function signatures, and a set of scripts is used to generate types, +signatures, and marshaling/dispatch code. This document will describe +how the schemas, scripts, and resulting code are used. + + +== QMP/Guest agent schema == + +A QAPI schema file is designed to be loosely based on JSON +(http://www.ietf.org/rfc/rfc7159.txt) with changes for quoting style +and the use of comments; a QAPI schema file is then parsed by a python +code generation program. A valid QAPI schema consists of a series of +top-level expressions, with no commas between them. Where +dictionaries (JSON objects) are used, they are parsed as python +OrderedDicts so that ordering is preserved (for predictable layout of +generated C structs and parameter lists). Ordering doesn't matter +between top-level expressions or the keys within an expression, but +does matter within dictionary values for 'data' and 'returns' members +of a single expression. QAPI schema input is written using 'single +quotes' instead of JSON's "double quotes" (in contrast, Client JSON +Protocol uses no comments, and while input accepts 'single quotes' as +an extension, output is strict JSON using only "double quotes"). As +in JSON, trailing commas are not permitted in arrays or dictionaries. 
+Input must be ASCII (although QMP supports full Unicode strings, the +QAPI parser does not). At present, there is no place where a QAPI +schema requires the use of JSON numbers or null. + +Comments are allowed; anything between an unquoted # and the following +newline is ignored. Although there is not yet a documentation +generator, a form of stylized comments has developed for consistently +documenting details about an expression and when it was added to the +schema. The documentation is delimited between two lines of ##, then +the first line names the expression, an optional overview is provided, +then individual documentation about each member of 'data' is provided, +and finally, a 'Since: x.y.z' tag lists the release that introduced +the expression. Optional fields are tagged with the phrase +'#optional', often with their default value; and extensions added +after the expression was first released are also given a '(since +x.y.z)' comment. For example: + + ## + # @BlockStats: + # + # Statistics of a virtual block device or a block backing device. + # + # @device: #optional If the stats are for a virtual block device, the name + # corresponding to the virtual block device. + # + # @stats: A @BlockDeviceStats for the device. + # + # @parent: #optional This describes the file block device if it has one. + # + # @backing: #optional This describes the backing block device if it has one. + # (Since 2.0) + # + # Since: 0.14.0 + ## + { 'struct': 'BlockStats', + 'data': {'*device': 'str', 'stats': 'BlockDeviceStats', + '*parent': 'BlockStats', + '*backing': 'BlockStats'} } + +The schema sets up a series of types, as well as commands and events +that will use those types. Forward references are allowed: the parser +scans in two passes, where the first pass learns all type names, and +the second validates the schema and generates the code. 
This allows +the definition of complex structs that can have mutually recursive +types, and allows for indefinite nesting of Client JSON Protocol that +satisfies the schema. A type name should not be defined more than +once. It is permissible for the schema to contain additional types +not used by any commands or events in the Client JSON Protocol, for +the side effect of generated C code used internally. + +There are seven top-level expressions recognized by the parser: +'include', 'command', 'struct', 'enum', 'union', 'alternate', and +'event'. There are several groups of types: simple types (a number of +built-in types, such as 'int' and 'str'; as well as enumerations), +complex types (structs and two flavors of unions), and alternate types +(a choice between other types). The 'command' and 'event' expressions +can refer to existing types by name, or list an anonymous type as a +dictionary. Listing a type name inside an array refers to a +single-dimension array of that type; multi-dimension arrays are not +directly supported (although an array of a complex struct that +contains an array member is possible). + +Types, commands, and events share a common namespace. Therefore, +generally speaking, type definitions should always use CamelCase for +user-defined type names, while built-in types are lowercase. Type +definitions should not end in 'Kind', as this namespace is used for +creating implicit C enums for visiting union types, or in 'List', as +this namespace is used for creating array types. Command names, +and field names within a type, should be all lower case with words +separated by a hyphen. However, some existing older commands and +complex types use underscore; when extending such expressions, +consistency is preferred over blindly avoiding underscore. Event +names should be ALL_CAPS with words separated by underscore. Field +names cannot start with 'has-' or 'has_', as this is reserved for +tracking optional fields. 
+ +Any name (command, event, type, field, or enum value) beginning with +"x-" is marked experimental, and may be withdrawn or changed +incompatibly in a future release. Downstream vendors may add +extensions; such extensions should begin with a prefix matching +"__RFQDN_" (for the reverse-fully-qualified-domain-name of the +vendor), even if the rest of the name uses dash (example: +__com.redhat_drive-mirror). Other than downstream extensions (with +leading underscore and the use of dots), all names should begin with a +letter, and contain only ASCII letters, digits, dash, and underscore. +Names beginning with 'q_' are reserved for the generator: QMP names +that resemble C keywords or other problematic strings will be munged +in C to use this prefix. For example, a field named "default" in +qapi becomes "q_default" in the generated C code. + +In the rest of this document, usage lines are given for each +expression type, with literal strings written in lower case and +placeholders written in capitals. If a literal string includes a +prefix of '*', that key/value pair can be omitted from the expression. +For example, a usage statement that includes '*base':STRUCT-NAME +means that an expression has an optional key 'base', which if present +must have a value that forms a struct name. 
+ + +=== Built-in Types === + +The following types are predefined, and map to C as follows: + + Schema C JSON + str char * any JSON string, UTF-8 + number double any JSON number + int int64_t a JSON number without fractional part + that fits into the C integer type + int8 int8_t likewise + int16 int16_t likewise + int32 int32_t likewise + int64 int64_t likewise + uint8 uint8_t likewise + uint16 uint16_t likewise + uint32 uint32_t likewise + uint64 uint64_t likewise + size uint64_t like uint64_t, except StringInputVisitor + accepts size suffixes + bool bool JSON true or false + any QObject * any JSON value + + +=== Includes === + +Usage: { 'include': STRING } + +The QAPI schema definitions can be modularized using the 'include' directive: + + { 'include': 'path/to/file.json' } + +The directive is evaluated recursively, and include paths are relative to the +file using the directive. Multiple includes of the same file are +idempotent. No other keys should appear in the expression, and the include +value should be a string. + +As a matter of style, it is a good idea to have all files be +self-contained, but at the moment, nothing prevents an included file +from making a forward reference to a type that is only introduced by +an outer file. The parser may be made stricter in the future to +prevent incomplete include files. + + +=== Struct types === + +Usage: { 'struct': STRING, 'data': DICT, '*base': STRUCT-NAME } + +A struct is a dictionary containing a single 'data' key whose +value is a dictionary. This corresponds to a struct in C or an Object +in JSON. Each value of the 'data' dictionary must be the name of a +type, or a one-element array containing a type name. An example of a +struct is: + + { 'struct': 'MyType', + 'data': { 'member1': 'str', 'member2': 'int', '*member3': 'str' } } + +The use of '*' as a prefix to the name means the member is optional in +the corresponding JSON protocol usage. 
+ +The default initialization value of an optional argument should not be changed +between versions of QEMU unless the new default maintains backward +compatibility to the user-visible behavior of the old default. + +With proper documentation, this policy still allows some flexibility; for +example, documenting that a default of 0 picks an optimal buffer size allows +one release to declare the optimal size at 512 while another release declares +the optimal size at 4096 - the user-visible behavior is not the bytes used by +the buffer, but the fact that the buffer was optimal size. + +On input structures (only mentioned in the 'data' side of a command), changing +from mandatory to optional is safe (older clients will supply the option, and +newer clients can benefit from the default); changing from optional to +mandatory is backwards incompatible (older clients may be omitting the option, +and must continue to work). + +On output structures (only mentioned in the 'returns' side of a command), +changing from mandatory to optional is in general unsafe (older clients may be +expecting the field, and could crash if it is missing), although it can be done +if the only way that the optional argument will be omitted is when it is +triggered by the presence of a new input flag to the command that older clients +don't know to send. Changing from optional to mandatory is safe. + +A structure that is used in both input and output of various commands +must consider the backwards compatibility constraints of both directions +of use. + +A struct definition can specify another struct as its base. +In this case, the fields of the base type are included as top-level fields +of the new struct's dictionary in the Client JSON Protocol wire +format. 
An example definition is: + + { 'struct': 'BlockdevOptionsGenericFormat', 'data': { 'file': 'str' } } + { 'struct': 'BlockdevOptionsGenericCOWFormat', + 'base': 'BlockdevOptionsGenericFormat', + 'data': { '*backing': 'str' } } + +An example BlockdevOptionsGenericCOWFormat object on the wire could use +both fields like this: + + { "file": "/some/place/my-image", + "backing": "/some/place/my-backing-file" } + + +=== Enumeration types === + +Usage: { 'enum': STRING, 'data': ARRAY-OF-STRING } + { 'enum': STRING, '*prefix': STRING, 'data': ARRAY-OF-STRING } + +An enumeration type is a dictionary containing a single 'data' key +whose value is a list of strings. An example enumeration is: + + { 'enum': 'MyEnum', 'data': [ 'value1', 'value2', 'value3' ] } + +Nothing prevents an empty enumeration, although it is probably not +useful. The list of strings should be lower case; if an enum name +represents multiple words, use '-' between words. The string 'max' is +not allowed as an enum value, and values should not be repeated. + +The enum constants will be named by using a heuristic to turn the +type name into a set of underscore separated words. For the example +above, 'MyEnum' will turn into 'MY_ENUM' giving a constant name +of 'MY_ENUM_VALUE1' for the first value. If the default heuristic +does not result in a desirable name, the optional 'prefix' field +can be used when defining the enum. + +The enumeration values are passed as strings over the Client JSON +Protocol, but are encoded as C enum integral values in generated code. +While the C code starts numbering at 0, it is better to use explicit +comparisons to enum values than implicit comparisons to 0; the C code +will also include a generated enum member ending in _MAX for tracking +the size of the enum, useful when using common functions for +converting between strings and enum values. 
Since the wire format +always passes by name, it is acceptable to reorder or add new +enumeration members in any location without breaking clients of Client +JSON Protocol; however, removing enum values would break +compatibility. For any struct that has a field that will only contain +a finite set of string values, using an enum type for that field is +better than open-coding the field to be type 'str'. + + +=== Union types === + +Usage: { 'union': STRING, 'data': DICT } +or: { 'union': STRING, 'data': DICT, 'base': STRUCT-NAME, + 'discriminator': ENUM-MEMBER-OF-BASE } + +Union types are used to let the user choose between several different +variants for an object. There are two flavors: simple (no +discriminator or base) and flat (both discriminator and base). A union +type is defined using a data dictionary as explained in the following +paragraphs. + +A simple union type defines a mapping from automatic discriminator +values to data types like in this example: + + { 'struct': 'FileOptions', 'data': { 'filename': 'str' } } + { 'struct': 'Qcow2Options', + 'data': { 'backing-file': 'str', 'lazy-refcounts': 'bool' } } + + { 'union': 'BlockdevOptions', + 'data': { 'file': 'FileOptions', + 'qcow2': 'Qcow2Options' } } + +In the Client JSON Protocol, a simple union is represented by a +dictionary that contains the 'type' field as a discriminator, and a +'data' field that is of the specified data type corresponding to the +discriminator value, as in these examples: + + { "type": "file", "data" : { "filename": "/some/place/my-image" } } + { "type": "qcow2", "data" : { "backing-file": "/some/place/my-image", + "lazy-refcounts": true } } + +The generated C code uses a struct containing a union. Additionally, +an implicit C enum 'NameKind' is created, corresponding to the union +'Name', for accessing the various branches of the union. No branch of +the union can be named 'max', as this would collide with the implicit +enum. The value for each branch can be of any type.
+ +A flat union definition specifies a struct as its base, and +avoids nesting on the wire. All branches of the union must be +complex types, and the top-level fields of the union dictionary on +the wire will be a combination of fields from both the base type and the +appropriate branch type (when merging two dictionaries, there must be +no keys in common). The 'discriminator' field must be the name of an +enum-typed member of the base struct. + +The following example enhances the above simple union example by +adding a common field 'readonly', renaming the discriminator to +something more applicable, and reducing the number of {} required on +the wire: + + { 'enum': 'BlockdevDriver', 'data': [ 'file', 'qcow2' ] } + { 'struct': 'BlockdevCommonOptions', + 'data': { 'driver': 'BlockdevDriver', 'readonly': 'bool' } } + { 'union': 'BlockdevOptions', + 'base': 'BlockdevCommonOptions', + 'discriminator': 'driver', + 'data': { 'file': 'FileOptions', + 'qcow2': 'Qcow2Options' } } + +Resulting in these JSON objects: + + { "driver": "file", "readonly": true, + "filename": "/some/place/my-image" } + { "driver": "qcow2", "readonly": false, + "backing-file": "/some/place/my-image", "lazy-refcounts": true } + +Notice that in a flat union, the discriminator name is controlled by +the user, but because it must map to a base member with enum type, the +code generator can ensure that branches exist for all values of the +enum (although the order of the keys need not match the declaration of +the enum). In the resulting generated C data types, a flat union is +represented as a struct with the base member fields included directly, +and then a union of structures for each branch of the union. + +A simple union can always be re-written as a flat union where the base +class has a single member named 'type', and where each branch of the +union has a struct with a single member named 'data'.
That is, + + { 'union': 'Simple', 'data': { 'one': 'str', 'two': 'int' } } + +is identical on the wire to: + + { 'enum': 'Enum', 'data': ['one', 'two'] } + { 'struct': 'Base', 'data': { 'type': 'Enum' } } + { 'struct': 'Branch1', 'data': { 'data': 'str' } } + { 'struct': 'Branch2', 'data': { 'data': 'int' } } + { 'union': 'Flat', 'base': 'Base', 'discriminator': 'type', + 'data': { 'one': 'Branch1', 'two': 'Branch2' } } + + +=== Alternate types === + +Usage: { 'alternate': STRING, 'data': DICT } + +An alternate type is one that allows a choice between two or more JSON +data types (string, integer, number, or object, but currently not +array) on the wire. The definition is similar to a simple union type, +where each branch of the union names a QAPI type. For example: + + { 'alternate': 'BlockRef', + 'data': { 'definition': 'BlockdevOptions', + 'reference': 'str' } } + +Just like for a simple union, an implicit C enum 'NameKind' is created +to enumerate the branches for the alternate 'Name'. + +Unlike a union, the discriminator string is never passed on the wire +for the Client JSON Protocol. Instead, the value's JSON type serves +as an implicit discriminator, which in turn means that an alternate +can only express a choice between types represented differently in +JSON. If a branch is typed as the 'bool' built-in, the alternate +accepts true and false; if it is typed as any of the various numeric +built-ins, it accepts a JSON number; if it is typed as a 'str' +built-in or named enum type, it accepts a JSON string; and if it is +typed as a complex type (struct or union), it accepts a JSON object. +Two different complex types, for instance, aren't permitted, because +both are represented as a JSON object. 
+ +The example alternate declaration above allows using both of the +following example objects: + + { "file": "my_existing_block_device_id" } + { "file": { "driver": "file", + "readonly": false, + "filename": "/tmp/mydisk.qcow2" } } + + +=== Commands === + +Usage: { 'command': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT, + '*returns': TYPE-NAME, + '*gen': false, '*success-response': false } + +Commands are defined by using a dictionary containing several members, +where three members are most common. The 'command' member is a +mandatory string, and determines the "execute" value passed in a +Client JSON Protocol command exchange. + +The 'data' argument maps to the "arguments" dictionary passed in as +part of a Client JSON Protocol command. The 'data' member is optional +and defaults to {} (an empty dictionary). If present, it must be the +string name of a complex type, or a dictionary that declares an +anonymous type with the same semantics as a 'struct' expression, with +one exception noted below when 'gen' is used. + +The 'returns' member describes what will appear in the "return" field +of a Client JSON Protocol reply on successful completion of a command. +The member is optional from the command declaration; if absent, the +"return" field will be an empty dictionary. If 'returns' is present, +it must be the string name of a complex or built-in type, or a +one-element array containing the name of a complex or built-in type, +with one exception noted below when 'gen' is used. Although it is +permitted to have the 'returns' member name a built-in type or an +array of built-in types, any command that does this cannot be extended +to return additional information in the future; thus, new commands +should strongly consider returning a dictionary-based type or an array +of dictionaries, even if the dictionary only contains one field at the +present. + +All commands in Client JSON Protocol use a dictionary to report +failure, with no way to specify that in QAPI.
Where the error return +is different than the usual GenericError class in order to help the +client react differently to certain error conditions, it is worth +documenting this in the comments before the command declaration. + +Some example commands: + + { 'command': 'my-first-command', + 'data': { 'arg1': 'str', '*arg2': 'str' } } + { 'struct': 'MyType', 'data': { '*value': 'str' } } + { 'command': 'my-second-command', + 'returns': [ 'MyType' ] } + +which would validate this Client JSON Protocol transaction: + + => { "execute": "my-first-command", + "arguments": { "arg1": "hello" } } + <= { "return": { } } + => { "execute": "my-second-command" } + <= { "return": [ { "value": "one" }, { } ] } + +In rare cases, QAPI cannot express a type-safe representation of a +corresponding Client JSON Protocol command. You then have to suppress +generation of a marshalling function by including a key 'gen' with +boolean value false, and instead write your own function. Please try +to avoid adding new commands that rely on this, and instead use +type-safe unions. For an example of this usage: + + { 'command': 'netdev_add', + 'data': {'type': 'str', 'id': 'str'}, + 'gen': false } + +Normally, the QAPI schema is used to describe synchronous exchanges, +where a response is expected. But in some cases, the action of a +command is expected to change state in a way that a successful +response is not possible (although the command will still return a +normal dictionary error on failure). When a successful reply is not +possible, the command expression should include the optional key +'success-response' with boolean value false. So far, only QGA makes +use of this field. + + +=== Events === + +Usage: { 'event': STRING, '*data': COMPLEX-TYPE-NAME-OR-DICT } + +Events are defined with the keyword 'event'. It is not allowed to +name an event 'MAX', since the generator also produces a C enumeration +of all event names with a generated _MAX value at the end. 
When +'data' is also specified, additional info will be included in the +event, with similar semantics to a 'struct' expression. Finally there +will be C API generated in qapi-event.h; when called by QEMU code, a +message with timestamp will be emitted on the wire. + +An example event is: + +{ 'event': 'EVENT_C', + 'data': { '*a': 'int', 'b': 'str' } } + +Resulting in this JSON object: + +{ "event": "EVENT_C", + "data": { "b": "test string" }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + + +== Client JSON Protocol introspection == + +Clients of a Client JSON Protocol commonly need to figure out what +exactly the server (QEMU) supports. + +For this purpose, QMP provides introspection via command +query-qmp-schema. QGA currently doesn't support introspection. + +While Client JSON Protocol wire compatibility should be maintained +between qemu versions, we cannot make the same guarantees for +introspection stability. For example, one version of qemu may provide +a non-variant optional member of a struct, and a later version rework +the member to instead be non-optional and associated with a variant. +Likewise, one version of qemu may list a member with open-ended type +'str', and a later version could convert it to a finite set of strings +via an enum type; or a member may be converted from a specific type to +an alternate that represents a choice between the original type and +something else. + +query-qmp-schema returns a JSON array of SchemaInfo objects. These +objects together describe the wire ABI, as defined in the QAPI schema. +There is no specified order to the SchemaInfo objects returned; a +client must search for a particular name throughout the entire array +to learn more about that name, but is at least guaranteed that there +will be no collisions between type, command, and event names. + +However, the SchemaInfo can't reflect all the rules and restrictions +that apply to QMP. 
It's interface introspection (figuring out what's +there), not interface specification. The specification is in the QAPI +schema. To understand how QMP is to be used, you need to study the +QAPI schema. + +Like any other command, query-qmp-schema is itself defined in the QAPI +schema, along with the SchemaInfo type. This text attempts to give an +overview of how things work. For details you need to consult the QAPI +schema. + +SchemaInfo objects have common members "name" and "meta-type", and +additional variant members depending on the value of meta-type. + +Each SchemaInfo object describes a wire ABI entity of a certain +meta-type: a command, event or one of several kinds of type. + +SchemaInfo for commands and events have the same name as in the QAPI +schema. + +Command and event names are part of the wire ABI, but type names are +not. Therefore, the SchemaInfo for types have auto-generated +meaningless names. For readability, the examples in this section use +meaningful type names instead. + +To examine a type, start with a command or event using it, then follow +references by name. + +QAPI schema definitions not reachable that way are omitted. + +The SchemaInfo for a command has meta-type "command", and variant +members "arg-type" and "ret-type". On the wire, the "arguments" +member of a client's "execute" command must conform to the object type +named by "arg-type". The "return" member that the server passes in a +success response conforms to the type named by "ret-type". + +If the command takes no arguments, "arg-type" names an object type +without members. Likewise, if the command returns nothing, "ret-type" +names an object type without members. + +Example: the SchemaInfo for command query-qmp-schema + + { "name": "query-qmp-schema", "meta-type": "command", + "arg-type": ":empty", "ret-type": "SchemaInfoList" } + + Type ":empty" is an object type without members, and type + "SchemaInfoList" is the array of SchemaInfo type.
+ +The SchemaInfo for an event has meta-type "event", and variant member +"arg-type". On the wire, a "data" member that the server passes in an +event conforms to the object type named by "arg-type". + +If the event carries no additional information, "arg-type" names an +object type without members. The event may not have a data member on +the wire then. + +Each command or event defined with dictionary-valued 'data' in the +QAPI schema implicitly defines an object type. + +Example: the SchemaInfo for EVENT_C from section Events + + { "name": "EVENT_C", "meta-type": "event", + "arg-type": ":obj-EVENT_C-arg" } + + Type ":obj-EVENT_C-arg" is an implicitly defined object type with + the two members from the event's definition. + +The SchemaInfo for struct and union types has meta-type "object". + +The SchemaInfo for a struct type has variant member "members". + +The SchemaInfo for a union type additionally has variant members "tag" +and "variants". + +"members" is a JSON array describing the object's common members, if +any. Each element is a JSON object with members "name" (the member's +name), "type" (the name of its type), and optionally "default". The +member is optional if "default" is present. Currently, "default" can +only have value null. Other values are reserved for future +extensions. The "members" array is in no particular order; clients +must search the entire object when learning whether a particular +member is supported. + +Example: the SchemaInfo for MyType from section Struct types + + { "name": "MyType", "meta-type": "object", + "members": [ + { "name": "member1", "type": "str" }, + { "name": "member2", "type": "int" }, + { "name": "member3", "type": "str", "default": null } ] } + +"tag" is the name of the common member serving as type tag. +"variants" is a JSON array describing the object's variant members. 
+Each element is a JSON object with members "case" (the value of type +tag this element applies to) and "type" (the name of an object type +that provides the variant members for this type tag value). The +"variants" array is in no particular order, and is not guaranteed to +list cases in the same order as the corresponding "tag" enum type. + +Example: the SchemaInfo for flat union BlockdevOptions from section +Union types + + { "name": "BlockdevOptions", "meta-type": "object", + "members": [ + { "name": "driver", "type": "BlockdevDriver" }, + { "name": "readonly", "type": "bool"} ], + "tag": "driver", + "variants": [ + { "case": "file", "type": "FileOptions" }, + { "case": "qcow2", "type": "Qcow2Options" } ] } + +Note that base types are "flattened": their members are included in the +"members" array. + +A simple union implicitly defines an enumeration type for its implicit +discriminator (called "type" on the wire, see section Union types). + +A simple union implicitly defines an object type for each of its +variants. + +Example: the SchemaInfo for simple union BlockdevOptions from section +Union types + + { "name": "BlockdevOptions", "meta-type": "object", + "members": [ + { "name": "type", "type": "BlockdevOptionsKind" } ], + "tag": "type", + "variants": [ + { "case": "file", "type": ":obj-FileOptions-wrapper" }, + { "case": "qcow2", "type": ":obj-Qcow2Options-wrapper" } ] } + + Enumeration type "BlockdevOptionsKind" and the object types + ":obj-FileOptions-wrapper", ":obj-Qcow2Options-wrapper" are + implicitly defined. + +The SchemaInfo for an alternate type has meta-type "alternate", and +variant member "members". "members" is a JSON array. Each element is +a JSON object with member "type", which names a type. Values of the +alternate type conform to exactly one of its member types. There is +no guarantee on the order in which "members" will be listed.
+ +Example: the SchemaInfo for BlockRef from section Alternate types + + { "name": "BlockRef", "meta-type": "alternate", + "members": [ + { "type": "BlockdevOptions" }, + { "type": "str" } ] } + +The SchemaInfo for an array type has meta-type "array", and variant +member "element-type", which names the array's element type. Array +types are implicitly defined. For convenience, the array's name may +resemble the element type; however, clients should examine member +"element-type" instead of making assumptions based on parsing member +"name". + +Example: the SchemaInfo for ['str'] + + { "name": "[str]", "meta-type": "array", + "element-type": "str" } + +The SchemaInfo for an enumeration type has meta-type "enum" and +variant member "values". The values are listed in no particular +order; clients must search the entire enum when learning whether a +particular value is supported. + +Example: the SchemaInfo for MyEnum from section Enumeration types + + { "name": "MyEnum", "meta-type": "enum", + "values": [ "value1", "value2", "value3" ] } + +The SchemaInfo for a built-in type has the same name as the type in +the QAPI schema (see section Built-in Types), with one exception +detailed below. It has variant member "json-type" that shows how +values of this type are encoded on the wire. + +Example: the SchemaInfo for str + + { "name": "str", "meta-type": "builtin", "json-type": "string" } + +The QAPI schema supports a number of integer types that only differ in +how they map to C. They are identical as far as SchemaInfo is +concerned. Therefore, they get all mapped to a single type "int" in +SchemaInfo. + +As explained above, type names are not part of the wire ABI. Not even +the names of built-in types. Clients should examine member +"json-type" instead of hard-coding names of built-in types. 
+ + +== Code generation == + +Schemas are fed into five scripts to generate all the code/files that, +paired with the core QAPI libraries, comprise everything required to +take JSON commands read in by a Client JSON Protocol server, unmarshal +the arguments into the underlying C types, call into the corresponding +C function, and map the response back to a Client JSON Protocol +response to be returned to the user. + +As an example, we'll use the following schema, which describes a single +complex user-defined type (which will produce a C struct, along with a list +node structure that can be used to chain together a list of such types in +case we want to accept/return a list of this type with a command), and a +command which takes that type as a parameter and returns the same type: + + $ cat example-schema.json + { 'struct': 'UserDefOne', + 'data': { 'integer': 'int', 'string': 'str' } } + + { 'command': 'my-command', + 'data': {'arg1': 'UserDefOne'}, + 'returns': 'UserDefOne' } + + { 'event': 'MY_EVENT' } + +=== scripts/qapi-types.py === + +Used to generate the C types defined by a schema. The following files are +created: + +$(prefix)qapi-types.h - C types corresponding to types defined in + the schema you pass in +$(prefix)qapi-types.c - Cleanup functions for the above C types + +The $(prefix) is an optional parameter used as a namespace to keep the +generated code from one schema/code-generation separated from others so code +can be generated/used from multiple schemas without clobbering previously +created code. + +Example: + + $ python scripts/qapi-types.py --output-dir="qapi-generated" \ + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-types.c +[Uninteresting stuff omitted...]
+ + void qapi_free_UserDefOne(UserDefOne *obj) + { + QapiDeallocVisitor *qdv; + Visitor *v; + + if (!obj) { + return; + } + + qdv = qapi_dealloc_visitor_new(); + v = qapi_dealloc_get_visitor(qdv); + visit_type_UserDefOne(v, &obj, NULL, NULL); + qapi_dealloc_visitor_cleanup(qdv); + } + + void qapi_free_UserDefOneList(UserDefOneList *obj) + { + QapiDeallocVisitor *qdv; + Visitor *v; + + if (!obj) { + return; + } + + qdv = qapi_dealloc_visitor_new(); + v = qapi_dealloc_get_visitor(qdv); + visit_type_UserDefOneList(v, &obj, NULL, NULL); + qapi_dealloc_visitor_cleanup(qdv); + } + $ cat qapi-generated/example-qapi-types.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_TYPES_H + #define EXAMPLE_QAPI_TYPES_H + +[Built-in types omitted...] + + typedef struct UserDefOne UserDefOne; + + typedef struct UserDefOneList UserDefOneList; + + struct UserDefOne { + int64_t integer; + char *string; + }; + + void qapi_free_UserDefOne(UserDefOne *obj); + + struct UserDefOneList { + union { + UserDefOne *value; + uint64_t padding; + }; + UserDefOneList *next; + }; + + void qapi_free_UserDefOneList(UserDefOneList *obj); + + #endif + +=== scripts/qapi-visit.py === + +Used to generate the visitor functions used to walk through and convert +a QObject (as provided by QMP) to a native C data structure and +vice-versa, as well as the visitor function used to dealloc a complex +schema-defined C type. + +The following files are generated: + +$(prefix)qapi-visit.c: visitor function for a particular C type, used + to automagically convert QObjects into the + corresponding C type and vice-versa, as well + as for deallocating memory for an existing C + type + +$(prefix)qapi-visit.h: declarations for previously mentioned visitor + functions + +Example: + + $ python scripts/qapi-visit.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-visit.c +[Uninteresting stuff omitted...] 
+ + static void visit_type_UserDefOne_fields(Visitor *v, UserDefOne **obj, Error **errp) + { + Error *err = NULL; + + visit_type_int(v, &(*obj)->integer, "integer", &err); + if (err) { + goto out; + } + visit_type_str(v, &(*obj)->string, "string", &err); + if (err) { + goto out; + } + + out: + error_propagate(errp, err); + } + + void visit_type_UserDefOne(Visitor *v, UserDefOne **obj, const char *name, Error **errp) + { + Error *err = NULL; + + visit_start_struct(v, (void **)obj, "UserDefOne", name, sizeof(UserDefOne), &err); + if (!err) { + if (*obj) { + visit_type_UserDefOne_fields(v, obj, errp); + } + visit_end_struct(v, &err); + } + error_propagate(errp, err); + } + + void visit_type_UserDefOneList(Visitor *v, UserDefOneList **obj, const char *name, Error **errp) + { + Error *err = NULL; + GenericList *i, **prev; + + visit_start_list(v, name, &err); + if (err) { + goto out; + } + + for (prev = (GenericList **)obj; + !err && (i = visit_next_list(v, prev, &err)) != NULL; + prev = &i) { + UserDefOneList *native_i = (UserDefOneList *)i; + visit_type_UserDefOne(v, &native_i->value, NULL, &err); + } + + error_propagate(errp, err); + err = NULL; + visit_end_list(v, &err); + out: + error_propagate(errp, err); + } + $ cat qapi-generated/example-qapi-visit.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_VISIT_H + #define EXAMPLE_QAPI_VISIT_H + +[Visitors for built-in types omitted...] + + void visit_type_UserDefOne(Visitor *v, UserDefOne **obj, const char *name, Error **errp); + void visit_type_UserDefOneList(Visitor *v, UserDefOneList **obj, const char *name, Error **errp); + + #endif + +=== scripts/qapi-commands.py === + +Used to generate the marshaling/dispatch functions for the commands defined +in the schema. The following files are generated: + +$(prefix)qmp-marshal.c: command marshal/dispatch functions for each + QMP command defined in the schema. 
Functions + generated by qapi-visit.py are used to + convert QObjects received from the wire into + function parameters, and the same + visitor functions are used to convert native C return + values to QObjects for transmission back + over the wire. + +$(prefix)qmp-commands.h: Function prototypes for the QMP commands + specified in the schema. + +Example: + + $ python scripts/qapi-commands.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qmp-marshal.c +[Uninteresting stuff omitted...] + + static void qmp_marshal_output_UserDefOne(UserDefOne *ret_in, QObject **ret_out, Error **errp) + { + Error *err = NULL; + QmpOutputVisitor *qov = qmp_output_visitor_new(); + QapiDeallocVisitor *qdv; + Visitor *v; + + v = qmp_output_get_visitor(qov); + visit_type_UserDefOne(v, &ret_in, "unused", &err); + if (err) { + goto out; + } + *ret_out = qmp_output_get_qobject(qov); + + out: + error_propagate(errp, err); + qmp_output_visitor_cleanup(qov); + qdv = qapi_dealloc_visitor_new(); + v = qapi_dealloc_get_visitor(qdv); + visit_type_UserDefOne(v, &ret_in, "unused", NULL); + qapi_dealloc_visitor_cleanup(qdv); + } + + static void qmp_marshal_my_command(QDict *args, QObject **ret, Error **errp) + { + Error *err = NULL; + UserDefOne *retval; + QmpInputVisitor *qiv = qmp_input_visitor_new_strict(QOBJECT(args)); + QapiDeallocVisitor *qdv; + Visitor *v; + UserDefOne *arg1 = NULL; + + v = qmp_input_get_visitor(qiv); + visit_type_UserDefOne(v, &arg1, "arg1", &err); + if (err) { + goto out; + } + + retval = qmp_my_command(arg1, &err); + if (err) { + goto out; + } + + qmp_marshal_output_UserDefOne(retval, ret, &err); + + out: + error_propagate(errp, err); + qmp_input_visitor_cleanup(qiv); + qdv = qapi_dealloc_visitor_new(); + v = qapi_dealloc_get_visitor(qdv); + visit_type_UserDefOne(v, &arg1, "arg1", NULL); + qapi_dealloc_visitor_cleanup(qdv); + } + + static void qmp_init_marshal(void) + { + qmp_register_command("my-command",
qmp_marshal_my_command, QCO_NO_OPTIONS); + } + + qapi_init(qmp_init_marshal); + $ cat qapi-generated/example-qmp-commands.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QMP_COMMANDS_H + #define EXAMPLE_QMP_COMMANDS_H + + #include "example-qapi-types.h" + #include "qapi/qmp/qdict.h" + #include "qapi/error.h" + + UserDefOne *qmp_my_command(UserDefOne *arg1, Error **errp); + + #endif + +=== scripts/qapi-event.py === + +Used to generate the event-related C code defined by a schema. The +following files are created: + +$(prefix)qapi-event.h - Function prototypes for each event type, plus an + enumeration of all event names +$(prefix)qapi-event.c - Implementation of functions to send an event + +Example: + + $ python scripts/qapi-event.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qapi-event.c +[Uninteresting stuff omitted...] + + void qapi_event_send_my_event(Error **errp) + { + QDict *qmp; + Error *err = NULL; + QMPEventFuncEmit emit; + emit = qmp_event_get_func_emit(); + if (!emit) { + return; + } + + qmp = qmp_event_build_dict("MY_EVENT"); + + emit(EXAMPLE_QAPI_EVENT_MY_EVENT, qmp, &err); + + error_propagate(errp, err); + QDECREF(qmp); + } + + const char *const example_QAPIEvent_lookup[] = { + [EXAMPLE_QAPI_EVENT_MY_EVENT] = "MY_EVENT", + [EXAMPLE_QAPI_EVENT_MAX] = NULL, + }; + $ cat qapi-generated/example-qapi-event.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QAPI_EVENT_H + #define EXAMPLE_QAPI_EVENT_H + + #include "qapi/error.h" + #include "qapi/qmp/qdict.h" + #include "example-qapi-types.h" + + + void qapi_event_send_my_event(Error **errp); + + typedef enum example_QAPIEvent { + EXAMPLE_QAPI_EVENT_MY_EVENT = 0, + EXAMPLE_QAPI_EVENT_MAX = 1, + } example_QAPIEvent; + + extern const char *const example_QAPIEvent_lookup[]; + + #endif + +=== scripts/qapi-introspect.py === + +Used to generate the introspection C code for a schema. 
The following +files are created: + +$(prefix)qmp-introspect.c - Defines a string holding a JSON + description of the schema. +$(prefix)qmp-introspect.h - Declares the above string. + +Example: + + $ python scripts/qapi-introspect.py --output-dir="qapi-generated" + --prefix="example-" example-schema.json + $ cat qapi-generated/example-qmp-introspect.c +[Uninteresting stuff omitted...] + + const char example_qmp_schema_json[] = "[" + "{\"arg-type\": \"0\", \"meta-type\": \"event\", \"name\": \"MY_EVENT\"}, " + "{\"arg-type\": \"1\", \"meta-type\": \"command\", \"name\": \"my-command\", \"ret-type\": \"2\"}, " + "{\"members\": [], \"meta-type\": \"object\", \"name\": \"0\"}, " + "{\"members\": [{\"name\": \"arg1\", \"type\": \"2\"}], \"meta-type\": \"object\", \"name\": \"1\"}, " + "{\"members\": [{\"name\": \"integer\", \"type\": \"int\"}, {\"name\": \"string\", \"type\": \"str\"}], \"meta-type\": \"object\", \"name\": \"2\"}, " + "{\"json-type\": \"int\", \"meta-type\": \"builtin\", \"name\": \"int\"}, " + "{\"json-type\": \"string\", \"meta-type\": \"builtin\", \"name\": \"str\"}]"; + $ cat qapi-generated/example-qmp-introspect.h +[Uninteresting stuff omitted...] + + #ifndef EXAMPLE_QMP_INTROSPECT_H + #define EXAMPLE_QMP_INTROSPECT_H + + extern const char example_qmp_schema_json[]; + + #endif diff --git a/src/docs/qcow2-cache.txt b/src/docs/qcow2-cache.txt new file mode 100644 index 0000000..5bb0607 --- /dev/null +++ b/src/docs/qcow2-cache.txt @@ -0,0 +1,164 @@ +qcow2 L2/refcount cache configuration +===================================== +Copyright (C) 2015 Igalia, S.L. +Author: Alberto Garcia <berto@igalia.com> + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +Introduction +------------ +The QEMU qcow2 driver has two caches that can improve the I/O +performance significantly. However, setting the right cache sizes is +not a straightforward operation. 
+ +This document attempts to give an overview of the L2 and refcount +caches, and how to configure them. + +Please refer to the docs/specs/qcow2.txt file for an in-depth +technical description of the qcow2 file format. + + +Clusters +-------- +A qcow2 file is organized in units of constant size called clusters. + +The cluster size is configurable, but it must be a power of two and +its value 512 bytes or higher. QEMU currently defaults to 64 KB +clusters, and it does not support sizes larger than 2MB. + +The 'qemu-img create' command supports specifying the size using the +cluster_size option: + + qemu-img create -f qcow2 -o cluster_size=128K hd.qcow2 4G + + +The L2 tables +------------- +The qcow2 format uses a two-level structure to map the virtual disk as +seen by the guest to the disk image in the host. These structures are +called the L1 and L2 tables. + +There is one single L1 table per disk image. The table is small and is +always kept in memory. + +There can be many L2 tables, depending on how much space has been +allocated in the image. Each table is one cluster in size. In order to +read or write data from the virtual disk, QEMU needs to read its +corresponding L2 table to find out where that data is located. Since +reading the table for each I/O operation can be expensive, QEMU keeps +an L2 cache in memory to speed up disk access. + +The size of the L2 cache can be configured, and setting the right +value can improve the I/O performance significantly. + + +The refcount blocks +------------------- +The qcow2 format also maintains a reference count for each cluster. +Reference counts are used for cluster allocation and internal +snapshots. The data is stored in a two-level structure similar to the +L1/L2 tables described above. + +The second level structures are called refcount blocks, are also one +cluster in size and the number is also variable and dependent on the +amount of allocated space. + +Each block contains a number of refcount entries.
Their size (in bits) +is a power of two and must not be higher than 64. It defaults to 16 +bits, but a different value can be set using the refcount_bits option: + + qemu-img create -f qcow2 -o refcount_bits=8 hd.qcow2 4G + +QEMU keeps a refcount cache to speed up I/O much like the +aforementioned L2 cache, and its size can also be configured. + + +Choosing the right cache sizes +------------------------------ +In order to choose the cache sizes we need to know how they relate to +the amount of allocated space. + +The amount of virtual disk that can be mapped by the L2 and refcount +caches (in bytes) is: + + disk_size = l2_cache_size * cluster_size / 8 + disk_size = refcount_cache_size * cluster_size * 8 / refcount_bits + +With the default values for cluster_size (64KB) and refcount_bits +(16), that is + + disk_size = l2_cache_size * 8192 + disk_size = refcount_cache_size * 32768 + +So in order to cover n GB of disk space with the default values we +need: + + l2_cache_size = disk_size_GB * 131072 + refcount_cache_size = disk_size_GB * 32768 + +QEMU has a default L2 cache of 1MB (1048576 bytes) and a refcount +cache of 256KB (262144 bytes), so using the formulas we've just seen +we have + + 1048576 / 131072 = 8 GB of virtual disk covered by that cache + 262144 / 32768 = 8 GB + + +How to configure the cache sizes +-------------------------------- +Cache sizes can be configured using the -drive option in the +command-line, or the 'blockdev-add' QMP command. + +There are three options available, and all of them take bytes: + +"l2-cache-size": maximum size of the L2 table cache +"refcount-cache-size": maximum size of the refcount block cache +"cache-size": maximum size of both caches combined + +There are two things that need to be taken into account: + + - Both caches must have a size that is a multiple of the cluster + size. 
+ + - If you only set one of the options above, QEMU will automatically + adjust the others so that the L2 cache is 4 times bigger than the + refcount cache. + +This means that these options are equivalent: + + -drive file=hd.qcow2,l2-cache-size=2097152 + -drive file=hd.qcow2,refcount-cache-size=524288 + -drive file=hd.qcow2,cache-size=2621440 + +The reason for this 1/4 ratio is to ensure that both caches cover the +same amount of disk space. Note however that this is only valid with +the default value of refcount_bits (16). If you are using a different +value you might want to calculate both cache sizes yourself since QEMU +will always use the same 1/4 ratio. + +It's also worth mentioning that there's no strict need for both caches +to cover the same amount of disk space. The refcount cache is used +much less often than the L2 cache, so it's perfectly reasonable to +keep it small. + + +Reducing the memory usage +------------------------- +It is possible to clean unused cache entries in order to reduce the +memory usage during periods of low I/O activity. + +The parameter "cache-clean-interval" defines an interval (in seconds). +All cache entries that haven't been accessed during that interval are +removed from memory. + +This example removes all unused cache entries every 15 minutes: + + -drive file=hd.qcow2,cache-clean-interval=900 + +If unset, the default value for this parameter is 0 and it disables +this feature. + +Note that this functionality currently relies on the MADV_DONTNEED +argument for madvise() to actually free the memory, so it is not +useful in systems that don't follow that behavior. diff --git a/src/docs/qdev-device-use.txt b/src/docs/qdev-device-use.txt new file mode 100644 index 0000000..136d271 --- /dev/null +++ b/src/docs/qdev-device-use.txt @@ -0,0 +1,416 @@ += How to convert to -device & friends = + +=== Specifying Bus and Address on Bus === + +In qdev, each device has a parent bus. Some devices provide one or +more buses for children. 
You can specify a device's parent bus with +-device parameter bus. + +A device typically has a device address on its parent bus. For buses +where this address can be configured, devices provide a bus-specific +property. Examples: + + bus property name value format + PCI addr %x.%x (dev.fn, .fn optional) + I2C address %u + SCSI scsi-id %u + IDE unit %u + HDA cad %u + virtio-serial-bus nr %u + ccid-bus slot %u + USB port %d(.%d)* (port.port...) + +Example: device i440FX-pcihost is on the root bus, and provides a PCI +bus named pci.0. To put a FOO device into its slot 4, use -device +FOO,bus=/i440FX-pcihost/pci.0,addr=4. The abbreviated form bus=pci.0 +also works as long as the bus name is unique. + +=== Block Devices === + +A QEMU block device (drive) has a host and a guest part. + +In the general case, the guest device is connected to a controller +device. For instance, the IDE controller provides two IDE buses, each +of which can have up to two ide-drive devices, and each ide-drive +device is a guest part, and is connected to a host part. + +Except we sometimes lump controller, bus(es) and drive device(s) all +together into a single device. For instance, the ISA floppy +controller is connected to up to two host drives. + +The old ways to define block devices define host and guest part +together. Sometimes, they can even define a controller device in +addition to the block device. + +The new way keeps the parts separate: you create the host part with +-drive, and guest device(s) with -device. + +The various old ways to define drives all boil down to the common form + + -drive if=TYPE,bus=BUS,unit=UNIT,OPTS... + +TYPE, BUS and UNIT identify the controller device, which of its buses +to use, and the drive's address on that bus. Details depend on TYPE. + +Instead of bus=BUS,unit=UNIT, you can also say index=IDX. + +In the new way, this becomes something like + + -drive if=none,id=DRIVE-ID,HOST-OPTS... + -device DEVNAME,drive=DRIVE-ID,DEV-OPTS... 
+ +The old OPTS get split into HOST-OPTS and DEV-OPTS as follows: + +* file, format, snapshot, cache, aio, readonly, rerror, werror go into + HOST-OPTS. + +* cyls, head, secs and trans go into HOST-OPTS. Future work: they + should go into DEV-OPTS instead. + +* serial goes into DEV-OPTS, for devices supporting serial numbers. + For other devices, it goes nowhere. + +* media is special. In the old way, it selects disk vs. CD-ROM with + if=ide, if=scsi and if=xen. The new way uses DEVNAME for that. + Additionally, readonly=on goes into HOST-OPTS. + +* addr is special, see if=virtio below. + +The -device argument differs in detail for each type of drive: + +* if=ide + + -device DEVNAME,drive=DRIVE-ID,bus=IDE-BUS,unit=UNIT + + where DEVNAME is either ide-hd or ide-cd, IDE-BUS identifies an IDE + bus, normally either ide.0 or ide.1, and UNIT is either 0 or 1. + +* if=scsi + + The old way implicitly creates SCSI controllers as needed. The new + way makes that explicit: + + -device lsi53c895a,id=ID + + As for all PCI devices, you can add bus=PCI-BUS,addr=DEVFN to + control the PCI device address. + + This SCSI controller provides a single SCSI bus, named ID.0. Put a + disk on it: + + -device DEVNAME,drive=DRIVE-ID,bus=ID.0,scsi-id=UNIT + + where DEVNAME is either scsi-hd, scsi-cd or scsi-generic. + +* if=floppy + + -global isa-fdc.driveA=DRIVE-ID + -global isa-fdc.driveB=DRIVE-ID + + This is -global instead of -device, because the floppy controller is + created automatically, and we want to configure that one, not create + a second one (which isn't possible anyway). + + Without any -global isa-fdc,... you get an empty driveA and no + driveB. You can use -nodefaults to suppress the default driveA, see + "Default Devices". + +* if=virtio + + -device virtio-blk-pci,drive=DRIVE-ID,class=C,vectors=V,ioeventfd=IOEVENTFD + + This lets you control PCI device class and MSI-X vectors. + + IOEVENTFD controls whether or not ioeventfd is used for virtqueue + notify. 
It can be set to on (default) or off. + + As for all PCI devices, you can add bus=PCI-BUS,addr=DEVFN to + control the PCI device address. This replaces option addr available + with -drive if=virtio. + +* if=pflash, if=mtd, if=sd, if=xen are not yet available with -device + +For USB devices, the old way is actually different: + + -usbdevice disk:format=FMT:FILENAME + +Provides much less control than -drive's OPTS... The new way fixes +that: + + -device usb-storage,drive=DRIVE-ID,removable=RMB + +The removable parameter gives control over the SCSI INQUIRY removable +(RMB) bit. USB thumbdrives usually set removable=on, while USB hard +disks set removable=off. + +Bug: usb-storage pretends to be a block device, but it's really a SCSI +controller that can serve only a single device, which it creates +automatically. The automatic creation guesses what kind of guest part +to create from the host part, like -drive if=scsi. Host and guest +part are not cleanly separated. + +=== Character Devices === + +A QEMU character device has a host and a guest part. + +The old ways to define character devices define host and guest part +together. + +The new way keeps the parts separate: you create the host part with +-chardev, and the guest device with -device. + +The various old ways to define a character device are all of the +general form + + -FOO FOO-OPTS...,LEGACY-CHARDEV + +where FOO-OPTS... is specific to -FOO, and the host part +LEGACY-CHARDEV is the same everywhere. + +In the new way, this becomes + + -chardev HOST-OPTS...,id=CHR-ID + -device DEVNAME,chardev=CHR-ID,DEV-OPTS... + +The appropriate DEVNAME depends on the machine type. For type "pc": + +* -serial becomes -device isa-serial,iobase=IOADDR,irq=IRQ,index=IDX + + This lets you control I/O ports and IRQs. + +* -parallel becomes -device isa-parallel,iobase=IOADDR,irq=IRQ,index=IDX + + This lets you control I/O ports and IRQs. 
+ +* -usbdevice serial:vendorid=VID,productid=PRID becomes + -device usb-serial,vendorid=VID,productid=PRID + +* -usbdevice braille doesn't support LEGACY-CHARDEV syntax. It always + uses "braille". With -device, this useful default is gone, so you + have to use something like + + -device usb-braille,chardev=braille,vendorid=VID,productid=PRID + -chardev braille,id=braille + +* -virtioconsole becomes + -device virtio-serial-pci,class=C,vectors=V,ioeventfd=IOEVENTFD,max_ports=N + -device virtconsole,is_console=NUM,nr=NR,name=NAME + +LEGACY-CHARDEV translates to -chardev HOST-OPTS... as follows: + +* null becomes -chardev null + +* pty, msmouse, braille, stdio likewise + +* vc:WIDTHxHEIGHT becomes -chardev vc,width=WIDTH,height=HEIGHT + +* vc:<COLS>Cx<ROWS>C becomes -chardev vc,cols=<COLS>,rows=<ROWS> + +* con: becomes -chardev console + +* COM<NUM> becomes -chardev serial,path=COM<NUM> + +* file:FNAME becomes -chardev file,path=FNAME + +* pipe:FNAME becomes -chardev pipe,path=FNAME + +* tcp:HOST:PORT,OPTS... becomes -chardev socket,host=HOST,port=PORT,OPTS... + +* telnet:HOST:PORT,OPTS... becomes + -chardev socket,host=HOST,port=PORT,OPTS...,telnet=on + +* udp:HOST:PORT@LOCALADDR:LOCALPORT becomes + -chardev udp,host=HOST,port=PORT,localaddr=LOCALADDR,localport=LOCALPORT + +* unix:FNAME becomes -chardev socket,path=FNAME + +* /dev/parportN becomes -chardev parport,file=/dev/parportN + +* /dev/ppiN likewise + +* Any other /dev/FNAME becomes -chardev tty,path=/dev/FNAME + +* mon:LEGACY-CHARDEV is special: it multiplexes the monitor onto the + character device defined by LEGACY-CHARDEV. -chardev provides more + general multiplexing instead: you can connect up to four users to a + single host part. You need to pass mux=on to -chardev to enable + switching the input focus. + +QEMU uses LEGACY-CHARDEV syntax not just to set up guest devices, but +also in various other places such as -monitor or -net +user,guestfwd=... 
You can use chardev:CHR-ID in place of +LEGACY-CHARDEV to refer to a host part defined with -chardev. + +=== Network Devices === + +Host and guest part of network devices have always been separate. + +The old way to define the guest part looks like this: + + -net nic,netdev=NET-ID,macaddr=MACADDR,model=MODEL,name=ID,addr=STR,vectors=V + +Except for USB it looks like this: + + -usbdevice net:netdev=NET-ID,macaddr=MACADDR,name=ID + +The new way is -device: + + -device DEVNAME,netdev=NET-ID,mac=MACADDR,DEV-OPTS... + +DEVNAME equals MODEL, except for virtio you have to name the virtio +device appropriate for the bus (virtio-net-pci for PCI), and for USB +you have to use usb-net. + +The old name=ID parameter becomes the usual id=ID with -device. + +For PCI devices, you can add bus=PCI-BUS,addr=DEVFN to control the PCI +device address, as usual. The old -net nic provides parameter addr +for that, which is silently ignored when the NIC is not a PCI device. + +For virtio-net-pci, you can control whether or not ioeventfd is used for +virtqueue notify by setting ioeventfd= to on or off (default). + +-net nic accepts vectors=V for all models, but it's silently ignored +except for virtio-net-pci (model=virtio). With -device, only devices +that support it accept it. + +Not all devices are available with -device at this time. All PCI +devices and ne2k_isa are. + +Some PCI devices aren't available with -net nic, e.g. i82558a. + +To connect to a VLAN instead of an ordinary host part, replace +netdev=NET-ID by vlan=VLAN. + +=== Graphics Devices === + +Host and guest part of graphics devices have always been separate. + +The old way to define the guest graphics device is -vga VGA. Not all +machines support all -vga options. + +The new way is -device. The mapping from -vga argument to -device +depends on the machine type. 
For machine "pc", it's: + + std -device VGA + cirrus -device cirrus-vga + vmware -device vmware-svga + qxl -device qxl-vga + none -nodefaults + disables more than just VGA, see "Default Devices" + +As for all PCI devices, you can add bus=PCI-BUS,addr=DEVFN to control +the PCI device address. + +-device VGA supports properties bios-offset and bios-size, but they +aren't used with machine type "pc". + +For machine "isapc", it's + + std -device isa-vga + cirrus not yet available with -device + none -nodefaults + disables more than just VGA, see "Default Devices" + +Bug: the new way doesn't work for machine types "pc" and "isapc", +because it violates obscure device initialization ordering +constraints. + +=== Audio Devices === + +Host and guest part of audio devices have always been separate. + +The old way to define guest audio devices is -soundhw C1,... + +The new way is to define each guest audio device separately with +-device. + +Map from -soundhw sound card name to -device: + + ac97 -device AC97 + cs4231a -device cs4231a,iobase=IOADDR,irq=IRQ,dma=DMA + es1370 -device ES1370 + gus -device gus,iobase=IOADDR,irq=IRQ,dma=DMA,freq=F + hda -device intel-hda,msi=MSI -device hda-duplex + sb16 -device sb16,iobase=IOADDR,irq=IRQ,dma=DMA,dma16=DMA16,version=V + adlib not yet available with -device + pcspk not yet available with -device + +For PCI devices, you can add bus=PCI-BUS,addr=DEVFN to control the PCI +device address, as usual. + +=== USB Devices === + +The old way to define a virtual USB device is -usbdevice DRIVER:OPTS... + +The new way is -device DEVNAME,DEV-OPTS... Details depend on DRIVER: + +* ccid -device usb-ccid +* keyboard -device usb-kbd +* mouse -device usb-mouse +* tablet -device usb-tablet +* wacom-tablet -device usb-wacom-tablet +* host:... See "Host Device Assignment" +* disk:... See "Block Devices" +* serial:... See "Character Devices" +* braille See "Character Devices" +* net:... See "Network Devices" +* bt:... 
not yet available with -device + +=== Watchdog Devices === + +Host and guest part of watchdog devices have always been separate. + +The old way to define a guest watchdog device is -watchdog DEVNAME. +The new way is -device DEVNAME. For PCI devices, you can add +bus=PCI-BUS,addr=DEVFN to control the PCI device address, as usual. + +=== Host Device Assignment === + +QEMU supports assigning host PCI devices (qemu-kvm only at this time) +and host USB devices. + +The old way to assign a host PCI device is + + -pcidevice host=ADDR,dma=none,id=ID + +The new way is + + -device pci-assign,host=ADDR,iommu=IOMMU,id=ID + +The old dma=none becomes iommu=off with -device. + +The old way to assign a host USB device is + + -usbdevice host:auto:BUS.ADDR:VID:PRID + +where any of BUS, ADDR, VID, PRID can be the wildcard *. + +The new way is + + -device usb-host,hostbus=BUS,hostaddr=ADDR,vendorid=VID,productid=PRID + +Omitted options match anything, just like the old way's wildcard. + +=== Default Devices === + +QEMU creates a number of devices by default, depending on the machine +type. + +-device DEVNAME... and -global DEVNAME... suppress default devices for +some DEVNAMEs: + + default device suppressing DEVNAMEs + CD-ROM ide-cd, ide-drive, scsi-cd + isa-fdc's driveA isa-fdc + parallel isa-parallel + serial isa-serial + VGA VGA, cirrus-vga, vmware-svga + virtioconsole virtio-serial-pci, virtio-serial-s390, virtio-serial + +The default NIC is connected to a default part created along with it. +It is *not* suppressed by configuring a NIC with -device (you may call +that a bug). -net and -netdev suppress the default NIC. + +-nodefaults suppresses all the default devices mentioned above, plus a +few other things such as default SD-Card drive and default monitor.
diff --git a/src/docs/qemupciserial.inf b/src/docs/qemupciserial.inf new file mode 100644 index 0000000..6f7eef4 --- /dev/null +++ b/src/docs/qemupciserial.inf @@ -0,0 +1,102 @@ +; qemupciserial.inf for QEMU, based on MSPORTS.INF + +; The driver itself is shipped with Windows (serial.sys). This is +; just a inf file to tell windows which pci id the serial pci card +; emulated by qemu has, and to apply a name tag to it which windows +; will show in the device manager. + +; Installing the driver: Go to device manager. You should find a "pci +; serial card" tagged with a yellow question mark. Open properties. +; Pick "update driver". Then "select driver manually". Pick "Ports +; (Com+Lpt)" from the list. Click "Have a disk". Select this file. +; Procedure may vary a bit depending on the windows version. + +; This file covers all options: pci-serial, pci-serial-2x, pci-serial-4x +; for both 32 and 64 bit platforms. + +[Version] +Signature="$Windows NT$" +Class=MultiFunction +ClassGUID={4d36e971-e325-11ce-bfc1-08002be10318} +Provider=%QEMU% +DriverVer=12/29/2013,1.3.0 +[ControlFlags] +ExcludeFromSelect=* +[Manufacturer] +%QEMU%=QEMU,NTx86,NTAMD64 + +[QEMU.NTx86] +%QEMU-PCI_SERIAL_1_PORT%=ComPort_inst1, PCI\VEN_1B36&DEV_0002 +%QEMU-PCI_SERIAL_2_PORT%=ComPort_inst2, PCI\VEN_1B36&DEV_0003 +%QEMU-PCI_SERIAL_4_PORT%=ComPort_inst4, PCI\VEN_1B36&DEV_0004 + +[QEMU.NTAMD64] +%QEMU-PCI_SERIAL_1_PORT%=ComPort_inst1, PCI\VEN_1B36&DEV_0002 +%QEMU-PCI_SERIAL_2_PORT%=ComPort_inst2, PCI\VEN_1B36&DEV_0003 +%QEMU-PCI_SERIAL_4_PORT%=ComPort_inst4, PCI\VEN_1B36&DEV_0004 + +[ComPort_inst1] +Include=mf.inf +Needs=MFINSTALL.mf + +[ComPort_inst2] +Include=mf.inf +Needs=MFINSTALL.mf + +[ComPort_inst4] +Include=mf.inf +Needs=MFINSTALL.mf + +[ComPort_inst1.HW] +AddReg=ComPort_inst1.RegHW + +[ComPort_inst2.HW] +AddReg=ComPort_inst2.RegHW + +[ComPort_inst4.HW] +AddReg=ComPort_inst4.RegHW + +[ComPort_inst1.Services] +Include=mf.inf +Needs=MFINSTALL.mf.Services + +[ComPort_inst2.Services] 
+Include=mf.inf +Needs=MFINSTALL.mf.Services + +[ComPort_inst4.Services] +Include=mf.inf +Needs=MFINSTALL.mf.Services + +[ComPort_inst1.RegHW] +HKR,Child0000,HardwareID,,*PNP0501 +HKR,Child0000,VaryingResourceMap,1,00, 00,00,00,00, 08,00,00,00 +HKR,Child0000,ResourceMap,1,02 + +[ComPort_inst2.RegHW] +HKR,Child0000,HardwareID,,*PNP0501 +HKR,Child0000,VaryingResourceMap,1,00, 00,00,00,00, 08,00,00,00 +HKR,Child0000,ResourceMap,1,02 +HKR,Child0001,HardwareID,,*PNP0501 +HKR,Child0001,VaryingResourceMap,1,00, 08,00,00,00, 08,00,00,00 +HKR,Child0001,ResourceMap,1,02 + +[ComPort_inst4.RegHW] +HKR,Child0000,HardwareID,,*PNP0501 +HKR,Child0000,VaryingResourceMap,1,00, 00,00,00,00, 08,00,00,00 +HKR,Child0000,ResourceMap,1,02 +HKR,Child0001,HardwareID,,*PNP0501 +HKR,Child0001,VaryingResourceMap,1,00, 08,00,00,00, 08,00,00,00 +HKR,Child0001,ResourceMap,1,02 +HKR,Child0002,HardwareID,,*PNP0501 +HKR,Child0002,VaryingResourceMap,1,00, 10,00,00,00, 08,00,00,00 +HKR,Child0002,ResourceMap,1,02 +HKR,Child0003,HardwareID,,*PNP0501 +HKR,Child0003,VaryingResourceMap,1,00, 18,00,00,00, 08,00,00,00 +HKR,Child0003,ResourceMap,1,02 + +[Strings] +QEMU="QEMU" +QEMU-PCI_SERIAL_1_PORT="1x QEMU PCI Serial Card" +QEMU-PCI_SERIAL_2_PORT="2x QEMU PCI Serial Card" +QEMU-PCI_SERIAL_4_PORT="4x QEMU PCI Serial Card" diff --git a/src/docs/qmp-events.txt b/src/docs/qmp-events.txt new file mode 100644 index 0000000..d2f1ce4 --- /dev/null +++ b/src/docs/qmp-events.txt @@ -0,0 +1,676 @@ + QEMU Machine Protocol Events + ============================ + +ACPI_DEVICE_OST +--------------- + +Emitted when guest executes ACPI _OST method. + + - data: ACPIOSTInfo type as described in qapi-schema.json + +{ "event": "ACPI_DEVICE_OST", + "data": { "device": "d1", "slot": "0", "slot-type": "DIMM", "source": 1, "status": 0 } } + +BALLOON_CHANGE +-------------- + +Emitted when the guest changes the actual BALLOON level. 
This +value is equivalent to the 'actual' field returned by the +'query-balloon' command. + +Data: + +- "actual": actual level of the guest memory balloon in bytes (json-number) + +Example: + +{ "event": "BALLOON_CHANGE", + "data": { "actual": 944766976 }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + +Note: this event is rate-limited. + +BLOCK_IMAGE_CORRUPTED +--------------------- + +Emitted when a disk image is being marked corrupt. The image can be +identified by its device or node name. The 'device' field is always +present for compatibility reasons, but it can be empty ("") if the +image does not have a device name associated. + +Data: + +- "device": Device name (json-string) +- "node-name": Node name (json-string, optional) +- "msg": Informative message (e.g., reason for the corruption) + (json-string) +- "offset": If the corruption resulted from an image access, this + is the host's access offset into the image + (json-int, optional) +- "size": If the corruption resulted from an image access, this + is the access size (json-int, optional) + +Example: + +{ "event": "BLOCK_IMAGE_CORRUPTED", + "data": { "device": "ide0-hd0", "node-name": "node0", + "msg": "Prevented active L1 table overwrite", "offset": 196608, + "size": 65536 }, + "timestamp": { "seconds": 1378126126, "microseconds": 966463 } } + +BLOCK_IO_ERROR +-------------- + +Emitted when a disk I/O error occurs.
+ +Data: + +- "device": device name (json-string) +- "operation": I/O operation (json-string, "read" or "write") +- "action": action that has been taken, it's one of the following (json-string): + "ignore": error has been ignored + "report": error has been reported to the device + "stop": the VM is going to stop because of the error + +Example: + +{ "event": "BLOCK_IO_ERROR", + "data": { "device": "ide0-hd1", + "operation": "write", + "action": "stop" }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +Note: If action is "stop", a STOP event will eventually follow the +BLOCK_IO_ERROR event. + +BLOCK_JOB_CANCELLED +------------------- + +Emitted when a block job has been cancelled. + +Data: + +- "type": Job type (json-string; "stream" for image streaming + "commit" for block commit) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) + +Example: + +{ "event": "BLOCK_JOB_CANCELLED", + "data": { "type": "stream", "device": "virtio-disk0", + "len": 10737418240, "offset": 134217728, + "speed": 0 }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + +BLOCK_JOB_COMPLETED +------------------- + +Emitted when a block job has completed. + +Data: + +- "type": Job type (json-string; "stream" for image streaming + "commit" for block commit) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) +- "error": Error message (json-string, optional) + Only present on failure. This field contains a human-readable + error message. There are no semantics other than that streaming + has failed and clients should not try to interpret the error + string. 
+ +Example: + +{ "event": "BLOCK_JOB_COMPLETED", + "data": { "type": "stream", "device": "virtio-disk0", + "len": 10737418240, "offset": 10737418240, + "speed": 0 }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + +BLOCK_JOB_ERROR +--------------- + +Emitted when a block job encounters an error. + +Data: + +- "device": device name (json-string) +- "operation": I/O operation (json-string, "read" or "write") +- "action": action that has been taken, it's one of the following (json-string): + "ignore": error has been ignored, the job may fail later + "report": error will be reported and the job canceled + "stop": error caused job to be paused + +Example: + +{ "event": "BLOCK_JOB_ERROR", + "data": { "device": "ide0-hd1", + "operation": "write", + "action": "stop" }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +BLOCK_JOB_READY +--------------- + +Emitted when a block job is ready to complete. + +Data: + +- "type": Job type (json-string; "stream" for image streaming + "commit" for block commit) +- "device": Device name (json-string) +- "len": Maximum progress value (json-int) +- "offset": Current progress value (json-int) + On success this is equal to len. + On failure this is less than len. +- "speed": Rate limit, bytes per second (json-int) + +Example: + +{ "event": "BLOCK_JOB_READY", + "data": { "device": "drive0", "type": "mirror", "speed": 0, + "len": 2097152, "offset": 2097152 }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +Note: The "ready to complete" status is always reset by a BLOCK_JOB_ERROR +event. + +DEVICE_DELETED +-------------- + +Emitted whenever the device removal completion is acknowledged +by the guest. +At this point, it's safe to reuse the specified device ID. +Device removal can be initiated by the guest or by HMP/QMP commands.
+ +Data: + +- "device": device name (json-string, optional) +- "path": device path (json-string) + +{ "event": "DEVICE_DELETED", + "data": { "device": "virtio-net-pci-0", + "path": "/machine/peripheral/virtio-net-pci-0" }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +DEVICE_TRAY_MOVED +----------------- + +It's emitted whenever the tray of a removable device is moved by the guest +or by HMP/QMP commands. + +Data: + +- "device": device name (json-string) +- "tray-open": true if the tray has been opened or false if it has been closed + (json-bool) + +{ "event": "DEVICE_TRAY_MOVED", + "data": { "device": "ide1-cd0", + "tray-open": true + }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +GUEST_PANICKED +-------------- + +Emitted when guest OS panic is detected. + +Data: + +- "action": Action that has been taken (json-string, currently always "pause"). + +Example: + +{ "event": "GUEST_PANICKED", + "data": { "action": "pause" } } + +MEM_UNPLUG_ERROR +-------------------- +Emitted when a memory hot unplug error occurs. + +Data: + +- "device": device name (json-string) +- "msg": Informative message (e.g., reason for the error) (json-string) + +Example: + +{ "event": "MEM_UNPLUG_ERROR", + "data": { "device": "dimm1", + "msg": "acpi: device unplug for unsupported device" + }, + "timestamp": { "seconds": 1265044230, "microseconds": 450486 } } + +NIC_RX_FILTER_CHANGED +--------------------- + +The event is emitted once until the query command is executed, +the first event will always be emitted. + +Data: + +- "name": net client name (json-string) +- "path": device path (json-string) + +{ "event": "NIC_RX_FILTER_CHANGED", + "data": { "name": "vnet0", + "path": "/machine/peripheral/vnet0/virtio-backend" }, + "timestamp": { "seconds": 1368697518, "microseconds": 326866 } } + + +POWERDOWN +--------- + +Emitted when the Virtual Machine is powered down through the power +control system, such as via ACPI. + +Data: None.
+ +Example: + +{ "event": "POWERDOWN", + "timestamp": { "seconds": 1267040730, "microseconds": 682951 } } + +QUORUM_FAILURE +-------------- + +Emitted by the Quorum block driver if it fails to establish a quorum. + +Data: + +- "reference": device name if defined else node name. +- "sector-num": Number of the first sector of the failed read operation. +- "sectors-count": Failed read operation sector count. + +Example: + +{ "event": "QUORUM_FAILURE", + "data": { "reference": "usr1", "sector-num": 345435, "sectors-count": 5 }, + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +Note: this event is rate-limited. + +QUORUM_REPORT_BAD +----------------- + +Emitted to report a corruption of a Quorum file. + +Data: + +- "error": Error message (json-string, optional) + Only present on failure. This field contains a human-readable + error message. There are no semantics other than that the + block layer reported an error and clients should not try to + interpret the error string. +- "node-name": The graph node name of the block driver state. +- "sector-num": Number of the first sector of the failed read operation. +- "sectors-count": Failed read operation sector count. + +Example: + +{ "event": "QUORUM_REPORT_BAD", + "data": { "node-name": "1.raw", "sector-num": 345435, "sectors-count": 5 }, + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +Note: this event is rate-limited. + +RESET +----- + +Emitted when the Virtual Machine is reset. + +Data: None. + +Example: + +{ "event": "RESET", + "timestamp": { "seconds": 1267041653, "microseconds": 9518 } } + +RESUME +------ + +Emitted when the Virtual Machine resumes execution. + +Data: None. + +Example: + +{ "event": "RESUME", + "timestamp": { "seconds": 1271770767, "microseconds": 582542 } } + +RTC_CHANGE +---------- + +Emitted when the guest changes the RTC time. 
+ +Data: + +- "offset": Offset between base RTC clock (as specified by -rtc base), and +new RTC clock value (json-number) + +Example: + +{ "event": "RTC_CHANGE", + "data": { "offset": 78 }, + "timestamp": { "seconds": 1267020223, "microseconds": 435656 } } + +Note: this event is rate-limited. + +SHUTDOWN +-------- + +Emitted when the Virtual Machine has shut down, indicating that qemu +is about to exit. + +Data: None. + +Example: + +{ "event": "SHUTDOWN", + "timestamp": { "seconds": 1267040730, "microseconds": 682951 } } + +Note: If the command-line option "-no-shutdown" has been specified, a STOP +event will eventually follow the SHUTDOWN event. + +SPICE_CONNECTED +--------------- + +Emitted when a SPICE client connects. + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") +- "client": Client information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + +Example: + +{ "timestamp": {"seconds": 1290688046, "microseconds": 388707}, + "event": "SPICE_CONNECTED", + "data": { + "server": { "port": "5920", "family": "ipv4", "host": "127.0.0.1"}, + "client": {"port": "52873", "family": "ipv4", "host": "127.0.0.1"} +}} + +SPICE_DISCONNECTED +------------------ + +Emitted when a SPICE client disconnects. 
+ +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") +- "client": Client information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + +Example: + +{ "timestamp": {"seconds": 1290688046, "microseconds": 388707}, + "event": "SPICE_DISCONNECTED", + "data": { + "server": { "port": "5920", "family": "ipv4", "host": "127.0.0.1"}, + "client": {"port": "52873", "family": "ipv4", "host": "127.0.0.1"} +}} + +SPICE_INITIALIZED +----------------- + +Emitted after initial handshake and authentication takes place (if any) +and the SPICE channel is up and running + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "port": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "connection-id": spice connection id. All channels with the same id + belong to the same spice session (json-int) + - "channel-type": channel type. "1" is the main control channel, filter for + this one if you want track spice sessions only (json-int) + - "channel-id": channel id. 
Usually "0", might be different when + multiple channels of the same type exist, such as multiple + display channels in a multihead setup (json-int) + - "tls": whether the channel is encrypted (json-bool) + +Example: + +{ "timestamp": {"seconds": 1290688046, "microseconds": 417172}, + "event": "SPICE_INITIALIZED", + "data": {"server": {"auth": "spice", "port": "5921", + "family": "ipv4", "host": "127.0.0.1"}, + "client": {"port": "49004", "family": "ipv4", "channel-type": 3, + "connection-id": 1804289383, "host": "127.0.0.1", + "channel-id": 0, "tls": true} +}} + +SPICE_MIGRATE_COMPLETED +----------------------- + +Emitted when SPICE migration has completed + +Data: None. + +Example: + +{ "timestamp": {"seconds": 1290688046, "microseconds": 417172}, + "event": "SPICE_MIGRATE_COMPLETED" } + +MIGRATION +--------- + +Emitted when a migration event happens + +Data: + + - "status": migration status + See MigrationStatus in ~/qapi-schema.json for possible values + +Example: + +{"timestamp": {"seconds": 1432121972, "microseconds": 744001}, + "event": "MIGRATION", "data": {"status": "completed"}} + +STOP +---- + +Emitted when the Virtual Machine is stopped. + +Data: None. + +Example: + +{ "event": "STOP", + "timestamp": { "seconds": 1267041730, "microseconds": 281295 } } + +SUSPEND +------- + +Emitted when guest enters S3 state. + +Data: None. + +Example: + +{ "event": "SUSPEND", + "timestamp": { "seconds": 1344456160, "microseconds": 309119 } } + +SUSPEND_DISK +------------ + +Emitted when the guest makes a request to enter S4 state. + +Data: None. + +Example: + +{ "event": "SUSPEND_DISK", + "timestamp": { "seconds": 1344456160, "microseconds": 309119 } } + +Note: QEMU shuts down when entering S4 state. + +VNC_CONNECTED +------------- + +Emitted when a VNC client establishes a connection.
+ +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + +Example: + +{ "event": "VNC_CONNECTED", + "data": { + "server": { "auth": "sasl", "family": "ipv4", + "service": "5901", "host": "0.0.0.0" }, + "client": { "family": "ipv4", "service": "58425", + "host": "127.0.0.1" } }, + "timestamp": { "seconds": 1262976601, "microseconds": 975795 } } + + +Note: This event is emitted before any authentication takes place, thus +the authentication ID is not provided. + +VNC_DISCONNECTED +---------------- + +Emitted when the connection is closed. + +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "x509_dname": TLS dname (json-string, optional) + - "sasl_username": SASL username (json-string, optional) + +Example: + +{ "event": "VNC_DISCONNECTED", + "data": { + "server": { "auth": "sasl", "family": "ipv4", + "service": "5901", "host": "0.0.0.0" }, + "client": { "family": "ipv4", "service": "58425", + "host": "127.0.0.1", "sasl_username": "luiz" } }, + "timestamp": { "seconds": 1262976601, "microseconds": 975795 } } + +VNC_INITIALIZED +--------------- + +Emitted after authentication takes place (if any) and the VNC session is +made active. 
+ +Data: + +- "server": Server information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "auth": authentication method (json-string, optional) +- "client": Client information (json-object) + - "host": IP address (json-string) + - "service": port number (json-string) + - "family": address family (json-string, "ipv4" or "ipv6") + - "x509_dname": TLS dname (json-string, optional) + - "sasl_username": SASL username (json-string, optional) + +Example: + +{ "event": "VNC_INITIALIZED", + "data": { + "server": { "auth": "sasl", "family": "ipv4", + "service": "5901", "host": "0.0.0.0"}, + "client": { "family": "ipv4", "service": "46089", + "host": "127.0.0.1", "sasl_username": "luiz" } }, + "timestamp": { "seconds": 1263475302, "microseconds": 150772 } } + +VSERPORT_CHANGE +--------------- + +Emitted when the guest opens or closes a virtio-serial port. + +Data: + +- "id": device identifier of the virtio-serial port (json-string) +- "open": true if the guest has opened the virtio-serial port (json-bool) + +Example: + +{ "event": "VSERPORT_CHANGE", + "data": { "id": "channel0", "open": true }, + "timestamp": { "seconds": 1401385907, "microseconds": 422329 } } + +Note: this event is rate-limited separately for each "id". + +WAKEUP +------ + +Emitted when the guest has woken up from S3 and is running. + +Data: None. + +Example: + +{ "event": "WAKEUP", + "timestamp": { "seconds": 1344522075, "microseconds": 745528 } } + +WATCHDOG +-------- + +Emitted when the watchdog device's timer is expired. 
+ +Data: + +- "action": Action that has been taken, it's one of the following (json-string): + "reset", "shutdown", "poweroff", "pause", "debug", or "none" + +Example: + +{ "event": "WATCHDOG", + "data": { "action": "reset" }, + "timestamp": { "seconds": 1267061043, "microseconds": 959568 } } + +Note: If action is "reset", "shutdown", or "pause" the WATCHDOG event is +followed respectively by the RESET, SHUTDOWN, or STOP events. + +Note: this event is rate-limited. diff --git a/src/docs/qmp-intro.txt b/src/docs/qmp-intro.txt new file mode 100644 index 0000000..f6a3a03 --- /dev/null +++ b/src/docs/qmp-intro.txt @@ -0,0 +1,87 @@ + QEMU Machine Protocol + ===================== + +Introduction +------------ + +The QEMU Machine Protocol (QMP) allows applications to operate a +QEMU instance. + +QMP is JSON[1] based and features the following: + +- Lightweight, text-based, easy to parse data format +- Asynchronous messages support (ie. events) +- Capabilities Negotiation + +For detailed information on QMP's usage, please, refer to the following files: + +o qmp-spec.txt QEMU Machine Protocol current specification +o qmp-commands.txt QMP supported commands (auto-generated at build-time) +o qmp-events.txt List of available asynchronous events + +[1] http://www.json.org + +Usage +----- + +You can use the -qmp option to enable QMP. For example, the following +makes QMP available on localhost port 4444: + +$ qemu [...] -qmp tcp:localhost:4444,server,nowait + +However, for more flexibility and to make use of more options, the -mon +command-line option should be used. For instance, the following example +creates one HMP instance (human monitor) on stdio and one QMP instance +on localhost port 4444: + +$ qemu [...] -chardev stdio,id=mon0 -mon chardev=mon0,mode=readline \ + -chardev socket,id=mon1,host=localhost,port=4444,server,nowait \ + -mon chardev=mon1,mode=control,pretty=on + +Please, refer to QEMU's manpage for more information. 
+ +Simple Testing +-------------- + +To manually test QMP one can connect with telnet and issue commands by hand: + +$ telnet localhost 4444 +Trying 127.0.0.1... +Connected to localhost. +Escape character is '^]'. +{ + "QMP": { + "version": { + "qemu": { + "micro": 50, + "minor": 6, + "major": 1 + }, + "package": "" + }, + "capabilities": [ + ] + } +} + +{ "execute": "qmp_capabilities" } +{ + "return": { + } +} + +{ "execute": "query-status" } +{ + "return": { + "status": "prelaunch", + "singlestep": false, + "running": false + } +} + +Please, refer to the qapi-schema.json file for a complete command reference. + +QMP wiki page +------------- + +http://wiki.qemu-project.org/QMP diff --git a/src/docs/qmp-spec.txt b/src/docs/qmp-spec.txt new file mode 100644 index 0000000..4fb10a5 --- /dev/null +++ b/src/docs/qmp-spec.txt @@ -0,0 +1,341 @@ + QEMU Machine Protocol Specification + +0. About This Document +====================== + +Copyright (C) 2009-2015 Red Hat, Inc. + +This work is licensed under the terms of the GNU GPL, version 2 or +later. See the COPYING file in the top-level directory. + +1. Introduction +=============== + +This document specifies the QEMU Machine Protocol (QMP), a JSON-based +protocol which is available for applications to operate QEMU at the +machine-level. It is also in use by the QEMU Guest Agent (QGA), which +is available for host applications to interact with the guest +operating system. + +2. Protocol Specification +========================= + +This section details the protocol format. For the purpose of this document +"Client" is any application which is using QMP to communicate with QEMU and +"Server" is QEMU itself. 
+ +JSON data structures, when mentioned in this document, are always in the +following format: + + json-DATA-STRUCTURE-NAME + +Where DATA-STRUCTURE-NAME is any valid JSON data structure, as defined +by the JSON standard: + +http://www.ietf.org/rfc/rfc7159.txt + +The protocol is always encoded in UTF-8 except for synchronization +bytes (documented below); although thanks to json-string escape +sequences, the server will reply using only the strict ASCII subset. + +For convenience, json-object members mentioned in this document will +be in a certain order. However, in real protocol usage they can be in +ANY order, thus no particular order should be assumed. On the other +hand, use of json-array elements presumes that preserving order is +important unless specifically documented otherwise. Repeating a key +within a json-object gives unpredictable results. + +Also for convenience, the server will accept an extension of +'single-quoted' strings in place of the usual "double-quoted" +json-string, and both input forms of strings understand an additional +escape sequence of "\'" for a single quote. The server will only use +double quoting on output. + +2.1 General Definitions +----------------------- + +2.1.1 All interactions transmitted by the Server are json-objects, always + terminating with CRLF + +2.1.2 All json-objects members are mandatory when not specified otherwise + +2.2 Server Greeting +------------------- + +Right when connected the Server will issue a greeting message, which signals +that the connection has been successfully established and that the Server is +ready for capabilities negotiation (for more information refer to section +'4. Capabilities Negotiation'). 
+ +The greeting message format is: + +{ "QMP": { "version": json-object, "capabilities": json-array } } + + Where, + +- The "version" member contains the Server's version information (the format + is the same of the query-version command) +- The "capabilities" member specify the availability of features beyond the + baseline specification; the order of elements in this array has no + particular significance, so a client must search the entire array + when looking for a particular capability + +2.2.1 Capabilities +------------------ + +As of the date this document was last revised, no server or client +capability strings have been defined. + + +2.3 Issuing Commands +-------------------- + +The format for command execution is: + +{ "execute": json-string, "arguments": json-object, "id": json-value } + + Where, + +- The "execute" member identifies the command to be executed by the Server +- The "arguments" member is used to pass any arguments required for the + execution of the command, it is optional when no arguments are + required. Each command documents what contents will be considered + valid when handling the json-argument +- The "id" member is a transaction identification associated with the + command execution, it is optional and will be part of the response if + provided. The "id" member can be any json-value, although most + clients merely use a json-number incremented for each successive + command + +2.4 Commands Responses +---------------------- + +There are two possible responses which the Server will issue as the result +of a command execution: success or error. 
+ +2.4.1 success +------------- + +The format of a success response is: + +{ "return": json-value, "id": json-value } + + Where, + +- The "return" member contains the data returned by the command, which + is defined on a per-command basis (usually a json-object or + json-array of json-objects, but sometimes a json-number, json-string, + or json-array of json-strings); it is an empty json-object if the + command does not return data +- The "id" member contains the transaction identification associated + with the command execution if issued by the Client + +2.4.2 error +----------- + +The format of an error response is: + +{ "error": { "class": json-string, "desc": json-string }, "id": json-value } + + Where, + +- The "class" member contains the error class name (eg. "GenericError") +- The "desc" member is a human-readable error message. Clients should + not attempt to parse this message. +- The "id" member contains the transaction identification associated with + the command execution if issued by the Client + +NOTE: Some errors can occur before the Server is able to read the "id" member, +in these cases the "id" member will not be part of the error response, even +if provided by the client. + +2.5 Asynchronous events +----------------------- + +As a result of state changes, the Server may send messages unilaterally +to the Client at any time, when not in the middle of any other +response. They are called "asynchronous events". + +The format of asynchronous events is: + +{ "event": json-string, "data": json-object, + "timestamp": { "seconds": json-number, "microseconds": json-number } } + + Where, + +- The "event" member contains the event's name +- The "data" member contains event specific data, which is defined in a + per-event basis, it is optional +- The "timestamp" member contains the exact time of when the event + occurred in the Server. 
It is a fixed json-object with time in + seconds and microseconds relative to the Unix Epoch (1 Jan 1970); if + there is a failure to retrieve host time, both members of the + timestamp will be set to -1. + +For a listing of supported asynchronous events, please, refer to the +qmp-events.txt file. + +Some events are rate-limited to at most one per second. If additional +"similar" events arrive within one second, all but the last one are +dropped, and the last one is delayed. "Similar" normally means same +event type. See qmp-events.txt for details. + +2.6 QGA Synchronization +----------------------- + +When using QGA, an additional synchronization feature is built into +the protocol. If the Client sends a raw 0xFF sentinel byte (not valid +JSON), then the Server will reset its state and discard all pending +data prior to the sentinel. Conversely, if the Client makes use of +the 'guest-sync-delimited' command, the Server will send a raw 0xFF +sentinel byte prior to its response, to aid the Client in discarding +any data prior to the sentinel. + + +3. QMP Examples +=============== + +This section provides some examples of real QMP usage, in all of them +"C" stands for "Client" and "S" stands for "Server". 
+ +3.1 Server greeting +------------------- + +S: { "QMP": { "version": { "qemu": { "micro": 50, "minor": 6, "major": 1 }, + "package": ""}, "capabilities": []}} + +3.2 Client QMP negotiation +-------------------------- +C: { "execute": "qmp_capabilities" } +S: { "return": {}} + +3.3 Simple 'stop' execution +--------------------------- + +C: { "execute": "stop" } +S: { "return": {} } + +3.4 KVM information +------------------- + +C: { "execute": "query-kvm", "id": "example" } +S: { "return": { "enabled": true, "present": true }, "id": "example"} + +3.5 Parsing error +------------------ + +C: { "execute": } +S: { "error": { "class": "GenericError", "desc": "Invalid JSON syntax" } } + +3.6 Powerdown event +------------------- + +S: { "timestamp": { "seconds": 1258551470, "microseconds": 802384 }, + "event": "POWERDOWN" } + +4. Capabilities Negotiation +=========================== + +When a Client successfully establishes a connection, the Server is in +Capabilities Negotiation mode. + +In this mode only the qmp_capabilities command is allowed to run, all +other commands will return the CommandNotFound error. Asynchronous +messages are not delivered either. + +Clients should use the qmp_capabilities command to enable capabilities +advertised in the Server's greeting (section '2.2 Server Greeting') they +support. + +When the qmp_capabilities command is issued, and if it does not return an +error, the Server enters in Command mode where capabilities changes take +effect, all commands (except qmp_capabilities) are allowed and asynchronous +messages are delivered. + +5 Compatibility Considerations +============================== + +All protocol changes or new features which modify the protocol format in an +incompatible way are disabled by default and will be advertised by the +capabilities array (section '2.2 Server Greeting'). Thus, Clients can check +that array and enable the capabilities they support. 
+ +The QMP Server performs a type check on the arguments to a command. It +generates an error if a value does not have the expected type for its +key, or if it does not understand a key that the Client included. The +strictness of the Server catches wrong assumptions of Clients about +the Server's schema. Clients can assume that, when such validation +errors occur, they will be reported before the command generated any +side effect. + +However, Clients must not assume any particular: + +- Length of json-arrays +- Size of json-objects; in particular, future versions of QEMU may add + new keys and Clients should be able to ignore them. +- Order of json-object members or json-array elements +- Amount of errors generated by a command, that is, new errors can be added + to any existing command in newer versions of the Server + +Any command or field name beginning with "x-" is deemed experimental, +and may be withdrawn or changed in an incompatible manner in a future +release. + +Of course, the Server does guarantee to send valid JSON. But apart from +this, a Client should be "conservative in what they send, and liberal in +what they accept". + +6. Downstream extension of QMP +============================== + +We recommend that downstream consumers of QEMU do *not* modify QMP. +Management tools should be able to support both upstream and downstream +versions of QMP without special logic, and downstream extensions are +inherently at odds with that. + +However, we recognize that it is sometimes impossible for downstreams to +avoid modifying QMP. Both upstream and downstream need to take care to +preserve long-term compatibility and interoperability. + +To help with that, QMP reserves JSON object member names beginning with +'__' (double underscore) for downstream use ("downstream names"). This +means upstream will never use any downstream names for its commands, +arguments, errors, asynchronous events, and so forth. 
+ +Any new names downstream wishes to add must begin with '__'. To +ensure compatibility with other downstreams, it is strongly +recommended that you prefix your downstream names with '__RFQDN_' where +RFQDN is a valid, reverse fully qualified domain name which you +control. For example, a qemu-kvm specific monitor command would be: + + (qemu) __org.linux-kvm_enable_irqchip + +Downstream must not change the server greeting (section 2.2) other than +to offer additional capabilities. But see below for why even that is +discouraged. + +Section '5 Compatibility Considerations' applies to downstream as well +as to upstream, obviously. It follows that downstream must behave +exactly like upstream for any input not containing members with +downstream names ("downstream members"), except it may add members +with downstream names to its output. + +Thus, a client should not be able to distinguish downstream from +upstream as long as it doesn't send input with downstream members, and +properly ignores any downstream members in the output it receives. + +Advice on downstream modifications: + +1. Introducing new commands is okay. If you want to extend an existing + command, consider introducing a new one with the new behaviour + instead. + +2. Introducing new asynchronous messages is okay. If you want to extend + an existing message, consider adding a new one instead. + +3. Introducing new errors for use in new commands is okay. Adding new + errors to existing commands counts as extension, so 1. applies. + +4. New capabilities are strongly discouraged. Capabilities are for + evolving the basic protocol, and multiple diverging basic protocol + dialects are most undesirable. 
diff --git a/src/docs/rcu.txt b/src/docs/rcu.txt new file mode 100644 index 0000000..2f70954 --- /dev/null +++ b/src/docs/rcu.txt @@ -0,0 +1,390 @@ +Using RCU (Read-Copy-Update) for synchronization +================================================ + +Read-copy update (RCU) is a synchronization mechanism that is used to +protect read-mostly data structures. RCU is very efficient and scalable +on the read side (it is wait-free), and thus can make the read paths +extremely fast. + +RCU supports concurrency between a single writer and multiple readers, +thus it is not used alone. Typically, the write-side will use a lock to +serialize multiple updates, but other approaches are possible (e.g., +restricting updates to a single task). In QEMU, when a lock is used, +this will often be the "iothread mutex", also known as the "big QEMU +lock" (BQL). Also, restricting updates to a single task is done in +QEMU using the "bottom half" API. + +RCU is fundamentally a "wait-to-finish" mechanism. The read side marks +sections of code with "critical sections", and the update side will wait +for the execution of all *currently running* critical sections before +proceeding, or before asynchronously executing a callback. + +The key point here is that only the currently running critical sections +are waited for; critical sections that are started _after_ the beginning +of the wait do not extend the wait, despite running concurrently with +the updater. This is the reason why RCU is more scalable than, +for example, reader-writer locks. It is so much more scalable that +the system will have a single instance of the RCU mechanism; a single +mechanism can be used for an arbitrary number of "things", without +having to worry about things such as contention or deadlocks. + +How is this possible? The basic idea is to split updates in two phases, +"removal" and "reclamation". During removal, we ensure that subsequent +readers will not be able to get a reference to the old data. 
After +removal has completed, a critical section will not be able to access +the old data. Therefore, critical sections that begin after removal +do not matter; as soon as all previous critical sections have finished, +there cannot be any readers who hold references to the data structure, +and these can now be safely reclaimed (e.g., freed or unref'ed). + +Here is a picture: + + thread 1 thread 2 thread 3 + ------------------- ------------------------ ------------------- + enter RCU crit.sec. + | finish removal phase + | begin wait + | | enter RCU crit.sec. + exit RCU crit.sec | | + complete wait | + begin reclamation phase | + exit RCU crit.sec. + + +Note how thread 3 is still executing its critical section when thread 2 +starts reclaiming data. This is possible, because the old version of the +data structure was not accessible at the time thread 3 began executing +that critical section. + + +RCU API +======= + +The core RCU API is small: + + void rcu_read_lock(void); + + Used by a reader to inform the reclaimer that the reader is + entering an RCU read-side critical section. + + void rcu_read_unlock(void); + + Used by a reader to inform the reclaimer that the reader is + exiting an RCU read-side critical section. Note that RCU + read-side critical sections may be nested and/or overlapping. + + void synchronize_rcu(void); + + Blocks until all pre-existing RCU read-side critical sections + on all threads have completed. This marks the end of the removal + phase and the beginning of reclamation phase. + + Note that it would be valid for another update to come while + synchronize_rcu is running. Because of this, it is better that + the updater releases any locks it may hold before calling + synchronize_rcu. If this is not possible (for example, because + the updater is protected by the BQL), you can use call_rcu. 
+ + void call_rcu1(struct rcu_head * head, + void (*func)(struct rcu_head *head)); + + This function invokes func(head) after all pre-existing RCU + read-side critical sections on all threads have completed. This + marks the end of the removal phase, with func taking care + asynchronously of the reclamation phase. + + The foo struct needs to have an rcu_head structure added, + perhaps as follows: + + struct foo { + struct rcu_head rcu; + int a; + char b; + long c; + }; + + so that the reclaimer function can fetch the struct foo address + and free it: + + call_rcu1(&foo.rcu, foo_reclaim); + + void foo_reclaim(struct rcu_head *rp) + { + struct foo *fp = container_of(rp, struct foo, rcu); + g_free(fp); + } + + For the common case where the rcu_head member is the first of the + struct, you can use the following macros. + + void call_rcu(T *p, + void (*func)(T *p), + field-name); + void g_free_rcu(T *p, + field-name); + + call_rcu1 is typically used through these macros, in the common case + where the "struct rcu_head" is the first field in the struct. If + the callback function is g_free, in particular, g_free_rcu can be + used. In the above case, one could have written simply: + + g_free_rcu(&foo, rcu); + + typeof(*p) atomic_rcu_read(p); + + atomic_rcu_read() is similar to atomic_mb_read(), but it makes + some assumptions on the code that calls it. This allows a more + optimized implementation. + + atomic_rcu_read assumes that whenever a single RCU critical + section reads multiple shared data, these reads are either + data-dependent or need no ordering. This is almost always the + case when using RCU, because read-side critical sections typically + navigate one or more pointers (the pointers that are changed on + every update) until reaching a data structure of interest, + and then read from there. + + RCU read-side critical sections must use atomic_rcu_read() to + read data, unless concurrent writes are prevented by another + synchronization mechanism. 
+ + Furthermore, RCU read-side critical sections should traverse the + data structure in a single direction, opposite to the direction + in which the updater initializes it. + + void atomic_rcu_set(p, typeof(*p) v); + + atomic_rcu_set() is also similar to atomic_mb_set(), and it also + makes assumptions on the code that calls it in order to allow a more + optimized implementation. + + In particular, atomic_rcu_set() suffices for synchronization + with readers, if the updater never mutates a field within a + data item that is already accessible to readers. This is the + case when initializing a new copy of the RCU-protected data + structure; just ensure that initialization of *p is carried out + before atomic_rcu_set() makes the data item visible to readers. + If this rule is observed, writes will happen in the opposite + order as reads in the RCU read-side critical sections (or if + there is just one update), and there will be no need for other + synchronization mechanism to coordinate the accesses. + +The following APIs must be used before RCU is used in a thread: + + void rcu_register_thread(void); + + Mark a thread as taking part in the RCU mechanism. Such a thread + will have to report quiescent points regularly, either manually + or through the QemuCond/QemuSemaphore/QemuEvent APIs. + + void rcu_unregister_thread(void); + + Mark a thread as not taking part anymore in the RCU mechanism. + It is not a problem if such a thread reports quiescent points, + either manually or by using the QemuCond/QemuSemaphore/QemuEvent + APIs. + +Note that these APIs are relatively heavyweight, and should _not_ be +nested. + + +DIFFERENCES WITH LINUX +====================== + +- Waiting on a mutex is possible, though discouraged, within an RCU critical + section. This is because spinlocks are rarely (if ever) used in userspace + programming; not allowing this would prevent upgrading an RCU read-side + critical section to become an updater. 
+ +- atomic_rcu_read and atomic_rcu_set replace rcu_dereference and + rcu_assign_pointer. They take a _pointer_ to the variable being accessed. + +- call_rcu is a macro that has an extra argument (the name of the first + field in the struct, which must be a struct rcu_head), and expects the + type of the callback's argument to be the type of the first argument. + call_rcu1 is the same as Linux's call_rcu. + + +RCU PATTERNS +============ + +Many patterns using read-writer locks translate directly to RCU, with +the advantages of higher scalability and deadlock immunity. + +In general, RCU can be used whenever it is possible to create a new +"version" of a data structure every time the updater runs. This may +sound like a very strict restriction, however: + +- the updater does not mean "everything that writes to a data structure", + but rather "everything that involves a reclamation step". See the + array example below + +- in some cases, creating a new version of a data structure may actually + be very cheap. For example, modifying the "next" pointer of a singly + linked list is effectively creating a new version of the list. + +Here are some frequently-used RCU idioms that are worth noting. + + +RCU list processing +------------------- + +TBD (not yet used in QEMU) + + +RCU reference counting +---------------------- + +Because grace periods are not allowed to complete while there is an RCU +read-side critical section in progress, the RCU read-side primitives +may be used as a restricted reference-counting mechanism. For example, +consider the following code fragment: + + rcu_read_lock(); + p = atomic_rcu_read(&foo); + /* do something with p. */ + rcu_read_unlock(); + +The RCU read-side critical section ensures that the value of "p" remains +valid until after the rcu_read_unlock(). In some sense, it is acquiring +a reference to p that is later released when the critical section ends. 
+The write side looks simply like this (with appropriate locking): + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + synchronize_rcu(); + free(old); + +If the processing cannot be done purely within the critical section, it +is possible to combine this idiom with a "real" reference count: + + rcu_read_lock(); + p = atomic_rcu_read(&foo); + foo_ref(p); + rcu_read_unlock(); + /* do something with p. */ + foo_unref(p); + +The write side can be like this: + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + synchronize_rcu(); + foo_unref(old); + +or with call_rcu: + + qemu_mutex_lock(&foo_mutex); + old = foo; + atomic_rcu_set(&foo, new); + qemu_mutex_unlock(&foo_mutex); + call_rcu(old, foo_unref, rcu); + +In both cases, the write side only performs removal. Reclamation +happens when the last reference to a "foo" object is dropped. +Using synchronize_rcu() is undesirably expensive, because the +last reference may be dropped on the read side. Hence you can +use call_rcu() instead: + + foo_unref(struct foo *p) { + if (atomic_fetch_dec(&p->refcount) == 1) { + call_rcu(p, foo_destroy, rcu); + } + } + + +Note that the same idioms would be possible with reader/writer +locks: + + read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); + p = foo; p = foo; + /* do something with p. */ foo = new; + read_unlock(&foo_rwlock); free(p); + write_mutex_unlock(&foo_rwlock); + free(p); + + ------------------------------------------------------------------ + + read_lock(&foo_rwlock); write_mutex_lock(&foo_rwlock); + p = foo; old = foo; + foo_ref(p); foo = new; + read_unlock(&foo_rwlock); foo_unref(old); + /* do something with p. */ write_mutex_unlock(&foo_rwlock); + read_lock(&foo_rwlock); + foo_unref(p); + read_unlock(&foo_rwlock); + +foo_unref could use a mechanism such as bottom halves to move deallocation +out of the write-side critical section. 
+ + +RCU resizable arrays +-------------------- + +Resizable arrays can be used with RCU. The expensive RCU synchronization +(or call_rcu) only needs to take place when the array is resized. +The two items to take care of are: + +- ensuring that the old version of the array is available between removal + and reclamation; + +- avoiding mismatches in the read side between the array data and the + array size. + +The first problem is avoided simply by not using realloc. Instead, +each resize will allocate a new array and copy the old data into it. +The second problem would arise if the size and the data pointers were +two members of a larger struct: + + struct mystuff { + ... + int data_size; + int data_alloc; + T *data; + ... + }; + +Instead, we store the size of the array with the array itself: + + struct arr { + int size; + int alloc; + T data[]; + }; + struct arr *global_array; + + read side: + rcu_read_lock(); + struct arr *array = atomic_rcu_read(&global_array); + x = i < array->size ? array->data[i] : -1; + rcu_read_unlock(); + return x; + + write side (running under a lock): + if (global_array->size == global_array->alloc) { + /* Creating a new version. */ + new_array = g_malloc(sizeof(struct arr) + + global_array->alloc * 2 * sizeof(T)); + new_array->size = global_array->size; + new_array->alloc = global_array->alloc * 2; + memcpy(new_array->data, global_array->data, + global_array->alloc * sizeof(T)); + + /* Removal phase. */ + old_array = global_array; + atomic_rcu_set(&global_array, new_array); + synchronize_rcu(); + + /* Reclamation phase. 
*/ + free(old_array); + } + + +SOURCES +======= + +* Documentation/RCU/ from the Linux kernel diff --git a/src/docs/rdma.txt b/src/docs/rdma.txt new file mode 100644 index 0000000..2bdd0a5 --- /dev/null +++ b/src/docs/rdma.txt @@ -0,0 +1,420 @@ +(RDMA: Remote Direct Memory Access) +RDMA Live Migration Specification, Version # 1 +============================================== +Wiki: http://wiki.qemu-project.org/Features/RDMALiveMigration +Github: git@github.com:hinesmr/qemu.git, 'rdma' branch + +Copyright (C) 2013 Michael R. Hines <mrhines@us.ibm.com> + +An *exhaustive* paper (2010) shows additional performance details +linked on the QEMU wiki above. + +Contents: +========= +* Introduction +* Before running +* Running +* Performance +* RDMA Migration Protocol Description +* Versioning and Capabilities +* QEMUFileRDMA Interface +* Migration of VM's ram +* Error handling +* TODO + +Introduction: +============= + +RDMA helps make your migration more deterministic under heavy load because +of the significantly lower latency and higher throughput over TCP/IP. This is +because the RDMA I/O architecture reduces the number of interrupts and +data copies by bypassing the host networking stack. In particular, a TCP-based +migration, under certain types of memory-bound workloads, may take a more +unpredictable amount of time to complete the migration if the amount of +memory tracked during each live migration iteration round cannot keep pace +with the rate of dirty memory produced by the workload. + +RDMA currently comes in two flavors: both Ethernet based (RoCE, or RDMA +over Converged Ethernet) as well as Infiniband-based. This implementation of +migration using RDMA is capable of using both technologies because of +the use of the OpenFabrics OFED software stack that abstracts out the +programming model irrespective of the underlying hardware. 
+ +Refer to openfabrics.org or your respective RDMA hardware vendor for +an understanding on how to verify that you have the OFED software stack +installed in your environment. You should be able to successfully link +against the "librdmacm" and "libibverbs" libraries and development headers +for a working build of QEMU to run successfully using RDMA Migration. + +BEFORE RUNNING: +=============== + +Use of RDMA during migration requires pinning and registering memory +with the hardware. This means that memory must be physically resident +before the hardware can transmit that memory to another machine. +If this is not acceptable for your application or product, then the use +of RDMA migration may in fact be harmful to co-located VMs or other +software on the machine if there is not sufficient memory available to +relocate the entire footprint of the virtual machine. If so, then the +use of RDMA is discouraged and it is recommended to use standard TCP migration. + +Experimental: Next, decide if you want dynamic page registration. +For example, if you have an 8GB RAM virtual machine, but only 1GB +is in active use, then enabling this feature will cause all 8GB to +be pinned and resident in memory. This feature mostly affects the +bulk-phase round of the migration and can be enabled for extremely +high-performance RDMA hardware using the following command: + +QEMU Monitor Command: +$ migrate_set_capability rdma-pin-all on # disabled by default + +Performing this action will cause all 8GB to be pinned, so if that's +not what you want, then please ignore this step altogether. + +On the other hand, this will also significantly speed up the bulk round +of the migration, which can greatly reduce the "total" time of your migration. +Example performance of this using an idle VM in the previous example +can be found in the "Performance" section. 
+ +Note: for very large virtual machines (hundreds of GBs), pinning +*all* of the memory of your virtual machine in the kernel is very expensive +and may extend the initial bulk iteration time by many seconds, +thus extending the total migration time. However, this will not +affect the determinism or predictability of your migration; you will +still gain from the benefits of advanced pinning with RDMA. + +RUNNING: +======== + +First, set the migration speed to match your hardware's capabilities: + +QEMU Monitor Command: +$ migrate_set_speed 40g # or whatever is the MAX of your RDMA device + +Next, on the destination machine, add the following to the QEMU command line: + +qemu ..... -incoming rdma:host:port + +Finally, perform the actual migration on the source machine: + +QEMU Monitor Command: +$ migrate -d rdma:host:port + +PERFORMANCE +=========== + +Here is a brief summary of total migration time and downtime using RDMA: +Using a 40gbps infiniband link performing a worst-case stress test, +using an 8GB RAM virtual machine: + +Using the following command: +$ apt-get install stress +$ stress --vm-bytes 7500M --vm 1 --vm-keep + +1. Migration throughput: 26 gigabits/second. +2. Downtime (stop time) varies between 15 and 100 milliseconds. + +EFFECTS of memory registration on bulk phase round: + +For example, in the same 8GB RAM example with all 8GB of memory in +active use and the VM itself is completely idle using the same 40 gbps +infiniband link: + +1. rdma-pin-all disabled total time: approximately 7.5 seconds @ 9.5 Gbps +2. rdma-pin-all enabled total time: approximately 4 seconds @ 26 Gbps + +These numbers would of course scale up to whatever size virtual machine +you have to migrate using RDMA. + +Enabling this feature does *not* have any measurable effect on +migration *downtime*. 
This is because, without this feature, all of the +memory will have already been registered already in advance during +the bulk round and does not need to be re-registered during the successive +iteration rounds. + +RDMA Protocol Description: +========================== + +Migration with RDMA is separated into two parts: + +1. The transmission of the pages using RDMA +2. Everything else (a control channel is introduced) + +"Everything else" is transmitted using a formal +protocol now, consisting of infiniband SEND messages. + +An infiniband SEND message is the standard ibverbs +message used by applications of infiniband hardware. +The only difference between a SEND message and an RDMA +message is that SEND messages cause notifications +to be posted to the completion queue (CQ) on the +infiniband receiver side, whereas RDMA messages (used +for VM's ram) do not (to behave like an actual DMA). + +Messages in infiniband require two things: + +1. registration of the memory that will be transmitted +2. (SEND only) work requests to be posted on both + sides of the network before the actual transmission + can occur. + +RDMA messages are much easier to deal with. Once the memory +on the receiver side is registered and pinned, we're +basically done. All that is required is for the sender +side to start dumping bytes onto the link. + +(Memory is not released from pinning until the migration +completes, given that RDMA migrations are very fast.) + +SEND messages require more coordination because the +receiver must have reserved space (using a receive +work request) on the receive queue (RQ) before QEMUFileRDMA +can start using them to carry all the bytes as +a control transport for migration of device state. + +To begin the migration, the initial connection setup is +as follows (migration-rdma.c): + +1. Receiver and Sender are started (command line or libvirt): +2. Both sides post two RQ work requests +3. Receiver does listen() +4. Sender does connect() +5. 
Receiver accept() +6. Check versioning and capabilities (described later) + +At this point, we define a control channel on top of SEND messages +which is described by a formal protocol. Each SEND message has a +header portion and a data portion (but together are transmitted +as a single SEND message). + +Header: + * Length (of the data portion, uint32, network byte order) + * Type (what command to perform, uint32, network byte order) + * Repeat (Number of commands in data portion, same type only) + +The 'Repeat' field is here to support future multiple page registrations +in a single message without any need to change the protocol itself +so that the protocol is compatible against multiple versions of QEMU. +Version #1 requires that all server implementations of the protocol must +check this field and register all requests found in the array of commands located +in the data portion and return an equal number of results in the response. +The maximum number of repeats is hard-coded to 4096. This is a conservative +limit based on the maximum size of a SEND message along with empirical +observations on the maximum future benefit of simultaneous page registrations. + +The 'type' field has 12 different command values: + 1. Unused + 2. Error (sent to the source during bad things) + 3. Ready (control-channel is available) + 4. QEMU File (for sending non-live device state) + 5. RAM Blocks request (used right after connection setup) + 6. RAM Blocks result (used right after connection setup) + 7. Compress page (zap zero page and skip registration) + 8. Register request (dynamic chunk registration) + 9. Register result ('rkey' to be used by sender) + 10. Register finished (registration for current iteration finished) + 11. Unregister request (unpin previously registered memory) + 12. Unregister finished (confirmation that unpin completed) + +A single control message, as hinted above, can contain within the data +portion an array of many commands of the same type. 
If there is more than +one command, then the 'repeat' field will be greater than 1. + +After connection setup, message 5 & 6 are used to exchange ram block +information and optionally pin all the memory if requested by the user. + +After ram block exchange is completed, we have two protocol-level +functions, responsible for communicating control-channel commands +using the above list of values: + +Logically: + +qemu_rdma_exchange_recv(header, expected command type) + +1. We transmit a READY command to let the sender know that + we are *ready* to receive some data bytes on the control channel. +2. Before attempting to receive the expected command, we post another + RQ work request to replace the one we just used up. +3. Block on a CQ event channel and wait for the SEND to arrive. +4. When the send arrives, librdmacm will unblock us. +5. Verify that the command-type and version received match the ones we expected. + +qemu_rdma_exchange_send(header, data, optional response header & data): + +1. Block on the CQ event channel waiting for a READY command + from the receiver to tell us that the receiver + is *ready* for us to transmit some new bytes. +2. Optionally: if we are expecting a response from the command + (that we have not yet transmitted), let's post an RQ + work request to receive that data a few moments later. +3. When the READY arrives, librdmacm will + unblock us and we immediately post a RQ work request + to replace the one we just used up. +4. Now, we can actually post the work request to SEND + the requested command type of the header we were asked for. +5. Optionally, if we are expecting a response (as before), + we block again and wait for that response using the additional + work request we previously posted. (This is used to carry + 'Register result' commands #9 back to the sender which + hold the rkey needed to perform RDMA. 
Note that the virtual address + corresponding to this rkey was already exchanged at the beginning + of the connection (described below).) + +All of the remaining command types (not including 'ready') +described above all use the aforementioned two functions to do the hard work: + +1. After connection setup, RAMBlock information is exchanged using + this protocol before the actual migration begins. This information includes + a description of each RAMBlock on the server side as well as the virtual addresses + and lengths of each RAMBlock. This is used by the client to determine the + start and stop locations of chunks and how to register them dynamically + before performing the RDMA operations. +2. During runtime, once a 'chunk' becomes full of pages ready to + be sent with RDMA, the registration commands are used to ask the + other side to register the memory for this chunk and respond + with the result (rkey) of the registration. +3. The QEMUFile interfaces also call these functions (described below) + when transmitting non-live state, such as devices or to send + its own protocol information during the migration process. +4. Finally, zero pages are only checked if a page has not yet been registered + using chunk registration (or not checked at all and unconditionally + written if chunk registration is disabled). This is accomplished using + the "Compress" command listed above. If the page *has* been registered + then we check the entire chunk for zero. Only if the entire chunk is + zero, then we send a compress command to zap the page on the other side. + +Versioning and Capabilities +=========================== +Current version of the protocol is version #1. + +The same version applies to both protocol traffic and capabilities +negotiation. (i.e. There is only one version number that is referred to +by all communication). 
+ +librdmacm provides the user with a 'private data' area to be exchanged +at connection-setup time before any infiniband traffic is generated. + +Header: + * Version (protocol version validated before send/recv occurs), + uint32, network byte order + * Flags (bitwise OR of each capability), + uint32, network byte order + +There is no data portion of this header right now, so there is +no length field. The maximum size of the 'private data' section +is only 192 bytes per the Infiniband specification, so it's not +very useful for data anyway. This structure needs to remain small. + +This private data area is a convenient place to check for protocol +versioning because the user does not need to register memory to +transmit a few bytes of version information. + +This is also a convenient place to negotiate capabilities +(like dynamic page registration). + +If the version is invalid, we throw an error. + +If the version is new, we only negotiate the capabilities that the +requested version is able to perform and ignore the rest. + +Currently there is only one capability in Version #1: dynamic page registration + +Finally: Negotiation happens with the Flags field: If the primary-VM +sets a flag, but the destination does not support this capability, it +will return a zero-bit for that flag and the primary-VM will understand +that as not being an available capability and will thus disable that +capability on the primary-VM side. + +QEMUFileRDMA Interface: +======================= + +QEMUFileRDMA introduces a couple of new functions: + +1. qemu_rdma_get_buffer() (QEMUFileOps rdma_read_ops) +2. qemu_rdma_put_buffer() (QEMUFileOps rdma_write_ops) + +These two functions are very short and simply use the protocol +described above to deliver bytes without changing the upper-level +users of QEMUFile that depend on a bytestream abstraction. + +Finally, how do we hand off the actual bytes to get_buffer()? 
+ +Again, because we're trying to "fake" a bytestream abstraction +using an analogy not unlike individual UDP frames, we have +to hold on to the bytes received from control-channel's SEND +messages in memory. + +Each time we receive a complete "QEMU File" control-channel +message, the bytes from SEND are copied into a small local holding area. + +Then, we return the number of bytes requested by get_buffer() +and leave the remaining bytes in the holding area until get_buffer() +comes around for another pass. + +If the buffer is empty, then we follow the same steps +listed above and issue another "QEMU File" protocol command, +asking for a new SEND message to re-fill the buffer. + +Migration of VM's ram: +==================== + +At the beginning of the migration, (migration-rdma.c), +the sender and the receiver populate the list of RAMBlocks +to be registered with each other into a structure. +Then, using the aforementioned protocol, they exchange a +description of these blocks with each other, to be used later +during the iteration of main memory. This description includes +a list of all the RAMBlocks, their offsets and lengths, virtual +addresses and possibly includes pre-registered RDMA keys in case dynamic +page registration was disabled on the server-side, otherwise not. + +Main memory is not migrated with the aforementioned protocol, +but is instead migrated with normal RDMA Write operations. + +Pages are migrated in "chunks" (hard-coded to 1 Megabyte right now). +Chunk size is not dynamic, but it could be in a future implementation. +There's nothing to indicate that this is useful right now. + +When a chunk is full (or a flush() occurs), the memory backed by +the chunk is registered with librdmacm and pinned in memory on +both sides using the aforementioned protocol. +After pinning, an RDMA Write is generated and transmitted +for the entire chunk. 
+ +Chunks are also transmitted in batches: This means that we +do not request that the hardware signal the completion queue +for the completion of *every* chunk. The current batch size +is about 64 chunks (corresponding to 64 MB of memory). +Only the last chunk in a batch must be signaled. +This helps keep everything as asynchronous as possible +and helps keep the hardware busy performing RDMA operations. + +Error-handling: +=============== + +Infiniband has what is called a "Reliable, Connected" +link (one of 4 choices). This is the mode that +we use for RDMA migration. + +If a *single* message fails, +the decision is to abort the migration entirely and +clean up all the RDMA descriptors and unregister all +the memory. + +After cleanup, the Virtual Machine is returned to normal +operation the same way that would happen if the TCP +socket is broken during a non-RDMA based migration. + +TODO: +===== +1. Currently, 'ulimit -l' mlock() limits as well as cgroups swap limits + are not compatible with infiniband memory pinning and will result in + an aborted migration (but with the source VM left unaffected). +2. Use of the recent /proc/<pid>/pagemap would likely speed up + the use of KSM and ballooning while using RDMA. +3. Also, some form of balloon-device usage tracking would also + help alleviate some issues. +4. Use LRU to provide more fine-grained direction of UNREGISTER + requests for unpinning memory in an overcommitted environment. +5. Expose UNREGISTER support to the user by way of workload-specific + hints about application behavior. diff --git a/src/docs/replay.txt b/src/docs/replay.txt new file mode 100644 index 0000000..149727e --- /dev/null +++ b/src/docs/replay.txt @@ -0,0 +1,168 @@ +Copyright (c) 2010-2015 Institute for System Programming + of the Russian Academy of Sciences. + +This work is licensed under the terms of the GNU GPL, version 2 or later. +See the COPYING file in the top-level directory. 
+ +Record/replay +------------- + +Record/replay functions are used for the reverse execution and deterministic +replay of qemu execution. This implementation of deterministic replay can +be used for deterministic debugging of guest code through a gdb remote +interface. + +Execution recording writes a non-deterministic events log, which can be later +used for replaying the execution anywhere and for unlimited number of times. +It also supports checkpointing for faster rewinding during reverse debugging. +Execution replaying reads the log and replays all non-deterministic events +including external input, hardware clocks, and interrupts. + +Deterministic replay has the following features: + * Deterministically replays whole system execution and all contents of + the memory, state of the hardware devices, clocks, and screen of the VM. + * Writes execution log into the file for later replaying for multiple times + on different machines. + * Supports i386, x86_64, and ARM hardware platforms. + * Performs deterministic replay of all operations with keyboard and mouse + input devices. + +Usage of the record/replay: + * First, record the execution, by adding the following arguments to the command line: + '-icount shift=7,rr=record,rrfile=replay.bin -net none'. + Block devices' images are not actually changed in the recording mode, + because all of the changes are written to the temporary overlay file. + * Then you can replay it by using another command + line option: '-icount shift=7,rr=replay,rrfile=replay.bin -net none' + * '-net none' option should also be specified if network replay patches + are not applied. + +Papers with description of deterministic replay implementation: +http://www.computer.org/csdl/proceedings/csmr/2012/4666/00/4666a553-abs.html +http://dl.acm.org/citation.cfm?id=2786805.2803179 + +Modifications of qemu include: + * wrappers for clock and time functions to save their return values in the log + * saving different asynchronous events (e.g. 
system shutdown) into the log + * synchronization of the bottom halves execution + * synchronization of the threads from thread pool + * recording/replaying user input (mouse and keyboard) + * adding internal checkpoints for cpu and io synchronization + +Non-deterministic events +------------------------ + +Our record/replay system is based on saving and replaying non-deterministic +events (e.g. keyboard input) and simulating deterministic ones (e.g. reading +from HDD or memory of the VM). Saving only non-deterministic events makes +log file smaller, simulation faster, and allows using reverse debugging even +for realtime applications. + +The following non-deterministic data from peripheral devices is saved into +the log: mouse and keyboard input, network packets, audio controller input, +USB packets, serial port input, and hardware clocks (they are non-deterministic +too, because their values are taken from the host machine). Inputs from +simulated hardware, memory of VM, software interrupts, and execution of +instructions are not saved into the log, because they are deterministic and +can be replayed by simulating the behavior of virtual machine starting from +initial state. + +We had to solve three tasks to implement deterministic replay: recording +non-deterministic events, replaying non-deterministic events, and checking +that there is no divergence between record and replay modes. + +We changed several parts of QEMU to make event log recording and replaying. +Devices' models that have non-deterministic input from external devices were +changed to write every external event into the execution log immediately. +E.g. network packets are written into the log when they arrive into the virtual +network adapter. + +All non-deterministic events are coming from these devices. But to +replay them we need to know at which moments they occur. We specify +these moments by counting the number of instructions executed between +every pair of consecutive events. 
+ +Instruction counting +-------------------- + +QEMU should work in icount mode to use record/replay feature. icount was +designed to allow deterministic execution in absence of external inputs +of the virtual machine. We also use icount to control the occurrence of the +non-deterministic events. The number of instructions elapsed from the last event +is written to the log while recording the execution. In replay mode we +can predict when to inject that event using the instruction counter. + +Timers +------ + +Timers are used to execute callbacks from different subsystems of QEMU +at the specified moments of time. There are several kinds of timers: + * Real time clock. Based on host time and used only for callbacks that + do not change the virtual machine state. For this reason real time + clock and timers do not affect deterministic replay at all. + * Virtual clock. These timers run only during the emulation. In icount + mode virtual clock value is calculated using executed instructions counter. + That is why it is completely deterministic and does not have to be recorded. + * Host clock. This clock is used by device models that simulate real time + sources (e.g. real time clock chip). Host clock is one of the sources + of non-determinism. Host clock read operations should be logged to + make the execution deterministic. + * Real time clock for icount. This clock is similar to real time clock but + it is used only for increasing virtual clock while virtual machine is + sleeping. Due to its nature it is also non-deterministic as the host clock + and has to be logged too. + +Checkpoints +----------- + +Replaying of the execution of virtual machine is bound by sources of +non-determinism. These are inputs from clock and peripheral devices, +and QEMU thread scheduling. Thread scheduling affects the processing of events +from timers, asynchronous input-output, and bottom halves. 
+ +Invocations of timers are coupled with clock reads and changing the state +of the virtual machine. Reads produce non-deterministic data taken from +host clock. And VM state changes should preserve their order. Their relative +order in replay mode must replicate the order of callbacks in record mode. +To preserve this order we use checkpoints. When a specific clock is processed +in record mode we save to the log special "checkpoint" event. +Checkpoints here do not refer to virtual machine snapshots. They are just +record/replay events used for synchronization. + +QEMU in replay mode will try to invoke timers processing in random moment +of time. That's why we do not process a group of timers until the checkpoint +event will be read from the log. Such an event allows synchronizing CPU +execution and timer events. + +Another checkpoints application in record/replay is instruction counting +while the virtual machine is idle. This function (qemu_clock_warp) is called +from the wait loop. It changes virtual machine state and must be deterministic +then. That is why we added checkpoint to this function to prevent its +operation in replay mode when it does not correspond to record mode. + +Bottom halves +------------- + +Disk I/O events are completely deterministic in our model, because +in both record and replay modes we start virtual machine from the same +disk state. But callbacks that virtual disk controller uses for reading and +writing the disk may occur at different moments of time in record and replay +modes. + +Reading and writing requests are created by CPU thread of QEMU. Later these +requests proceed to block layer which creates "bottom halves". Bottom +halves consist of callback and its parameters. They are processed when +main loop locks the global mutex. These locks are not synchronized with +replaying process because main loop also processes the events that do not +affect the virtual machine state (like user interaction with monitor). 
+ +That is why we had to implement saving and replaying bottom halves callbacks +synchronously to the CPU execution. When the callback is about to execute +it is added to the queue in the replay module. This queue is written to the +log when its callbacks are executed. In replay mode callbacks are not processed +until the corresponding event is read from the events log file. + +Sometimes the block layer uses asynchronous callbacks for its internal purposes +(like reading or writing VM snapshots or disk image cluster tables). In this +case bottom halves are not marked as "replayable" and are not saved +into the log. diff --git a/src/docs/specs/acpi_cpu_hotplug.txt b/src/docs/specs/acpi_cpu_hotplug.txt new file mode 100644 index 0000000..340b751 --- /dev/null +++ b/src/docs/specs/acpi_cpu_hotplug.txt @@ -0,0 +1,24 @@ +QEMU<->ACPI BIOS CPU hotplug interface +-------------------------------------- + +QEMU supports CPU hotplug via ACPI. This document +describes the interface between QEMU and the ACPI BIOS. + +ACPI GPE block (IO ports 0xafe0-0xafe3, byte access): +----------------------------------------- + +Generic ACPI GPE block. Bit 2 (GPE.2) used to notify CPU +hot-add/remove event to ACPI BIOS, via SCI interrupt. + +CPU present bitmap for: + ICH9-LPC (IO port 0x0cd8-0xcf7, 1-byte access) + PIIX-PM (IO port 0xaf00-0xaf1f, 1-byte access) +--------------------------------------------------------------- +One bit per CPU. Bit position reflects corresponding CPU APIC ID. +Read-only. + +CPU hot-add/remove notification: +----------------------------------------------------- +QEMU sets/clears corresponding CPU bit on hot-add/remove event. +CPU present map read by ACPI BIOS GPE.2 handler to notify OS of CPU +hot-(un)plug events. 
diff --git a/src/docs/specs/acpi_mem_hotplug.txt b/src/docs/specs/acpi_mem_hotplug.txt new file mode 100644 index 0000000..3df3620 --- /dev/null +++ b/src/docs/specs/acpi_mem_hotplug.txt @@ -0,0 +1,94 @@ +QEMU<->ACPI BIOS memory hotplug interface +-------------------------------------- + +ACPI BIOS GPE.3 handler is dedicated for notifying OS about memory hot-add +and hot-remove events. + +Memory hot-plug interface (IO port 0xa00-0xa17, 1-4 byte access): +--------------------------------------------------------------- +0xa00: + read access: + [0x0-0x3] Lo part of memory device phys address + [0x4-0x7] Hi part of memory device phys address + [0x8-0xb] Lo part of memory device size in bytes + [0xc-0xf] Hi part of memory device size in bytes + [0x10-0x13] Memory device proximity domain + [0x14] Memory device status fields + bits: + 0: Device is enabled and may be used by guest + 1: Device insert event, used to distinguish device for which + no device check event to OSPM was issued. + It's valid only when bit 1 is set. + 2: Device remove event, used to distinguish device for which + no device eject request to OSPM was issued. + 3-7: reserved and should be ignored by OSPM + [0x15-0x17] reserved + + write access: + [0x0-0x3] Memory device slot selector, selects active memory device. + All following accesses to other registers in 0xa00-0xa17 + region will read/store data from/to selected memory device. + [0x4-0x7] OST event code reported by OSPM + [0x8-0xb] OST status code reported by OSPM + [0xc-0x13] reserved, writes into it are ignored + [0x14] Memory device control fields + bits: + 0: reserved, OSPM must clear it before writing to register. + Due to BUG in versions prior 2.4 that field isn't cleared + when other fields are written. Keep it reserved and don't + try to reuse it. 
+ 1: if set to 1 clears device insert event, set by OSPM + after it has emitted device check event for the + selected memory device + 2: if set to 1 clears device remove event, set by OSPM + after it has emitted device eject request for the + selected memory device + 3: if set to 1 initiates device eject, set by OSPM when it + triggers memory device removal and calls _EJ0 method + 4-7: reserved, OSPM must clear them before writing to register + +Selecting memory device slot beyond present range has no effect on platform: + - write accesses to memory hot-plug registers not documented above are + ignored + - read accesses to memory hot-plug registers not documented above return + all bits set to 1. + +Memory hot remove process diagram: +---------------------------------- + +-------------+     +-----------------------+      +------------------+      + |  1. QEMU    |     | 2. QEMU               |      |3. QEMU           |      + |  device_del +---->+ device unplug request +----->+Send SCI to guest,|      + |             |     |         cb            |      |return control to |      + +-------------+     +-----------------------+      |management        |      +                                                    +------------------+      +                                                                              + +---------------------------------------------------------------------+      +                                                                              + +---------------------+              +-------------------------+             + | OSPM:               | remove event | OSPM:                   |             + | send Eject Request, |              | Scan memory devices     |             + | clear remove event  +<-------------+ for event flags         |             + |                     |              |                         |             + +---------------------+              +-------------------------+             +           |                      
                                            +           |                                                                  + +---------v--------+            +-----------------------+                    + | Guest OS:        |  success   | OSPM:                 |                    + | process Ejection +----------->+ Execute _EJ0 method,  |                    + | request          |            | set eject bit in flags|                    + +------------------+            +-----------------------+                    +           |failure                         |                                 +           v                                v                                 + +------------------------+      +-----------------------+                    + | OSPM:                  |      | QEMU:                 |                    + | set OST event & status |      | call device unplug cb |                    + | fields                 |      |                       |                    + +------------------------+      +-----------------------+                    +          |                                  |                                +          v                                  v                                + +------------------+              +-------------------+                      + |QEMU:             |              |QEMU:              |                      + |Send OST QMP event|              |Send device deleted|                      + |                  |              |QMP event          |                      + +------------------+              |                   |                      +                                   +-------------------+ diff --git a/src/docs/specs/acpi_pci_hotplug.txt b/src/docs/specs/acpi_pci_hotplug.txt new file mode 100644 index 0000000..a839434 --- /dev/null +++ b/src/docs/specs/acpi_pci_hotplug.txt @@ -0,0 +1,45 @@ +QEMU<->ACPI BIOS PCI hotplug interface +-------------------------------------- + +QEMU supports PCI hotplug via 
ACPI, for PCI bus 0. This document +describes the interface between QEMU and the ACPI BIOS. + +ACPI GPE block (IO ports 0xafe0-0xafe3, byte access): +----------------------------------------- + +Generic ACPI GPE block. Bit 1 (GPE.1) used to notify PCI hotplug/eject +event to ACPI BIOS, via SCI interrupt. + +PCI slot injection notification pending (IO port 0xae00-0xae03, 4-byte access): +--------------------------------------------------------------- +Slot injection notification pending. One bit per slot. + +Read by ACPI BIOS GPE.1 handler to notify OS of injection +events. Read-only. + +PCI slot removal notification (IO port 0xae04-0xae07, 4-byte access): +----------------------------------------------------- +Slot removal notification pending. One bit per slot. + +Read by ACPI BIOS GPE.1 handler to notify OS of removal +events. Read-only. + +PCI device eject (IO port 0xae08-0xae0b, 4-byte access): +---------------------------------------- + +Write: Used by ACPI BIOS _EJ0 method to request device removal. +One bit per slot. + +Read: Hotplug features register. Used by platform to identify features +available. Current base feature set (no bits set): + - Read-only "up" register @0xae00, 4-byte access, bit per slot + - Read-only "down" register @0xae04, 4-byte access, bit per slot + - Read/write "eject" register @0xae08, 4-byte access, + write: bit per slot eject, read: hotplug feature set + - Read-only hotplug capable register @0xae0c, 4-byte access, bit per slot + +PCI removability status (IO port 0xae0c-0xae0f, 4-byte access): +----------------------------------------------- + +Used by ACPI BIOS _RMV method to indicate removability status to OS. One +bit per slot. Read-only diff --git a/src/docs/specs/edu.txt b/src/docs/specs/edu.txt new file mode 100644 index 0000000..7f81467 --- /dev/null +++ b/src/docs/specs/edu.txt @@ -0,0 +1,110 @@ + +EDU device +========== + +Copyright (c) 2014-2015 Jiri Slaby + +This document is licensed under the GPLv2 (or later). 
+ +This is an educational device for writing (kernel) drivers. Its original +intention was to support the Linux kernel lectures taught at the Masaryk +University. Students are given this virtual device and are expected to write a +driver with I/Os, IRQs, DMAs and such. + +The device behaves very similarly to the PCI bridge present in the COMBO6 cards +developed under the Liberouter wings. Both PCI device ID and PCI space are +inherited from that device. + +Command line switches: + -device edu[,dma_mask=mask] + + dma_mask makes the virtual device work with DMA addresses with the given + mask. For educational purposes, the device supports only 28 bits (256 MiB) + by default. Students shall set dma_mask for the device in the OS driver + properly. + +PCI specs +--------- + +PCI ID: 1234:11e8 + +PCI Region 0: + I/O memory, 1 MB in size. Users are supposed to communicate with the card + through this memory. + +MMIO area spec +-------------- + +Only size == 4 accesses are allowed for addresses < 0x80. size == 4 or +size == 8 for the rest. + +0x00 (RO) : identification (0xRRrr00edu) + RR -- major version + rr -- minor version + +0x04 (RW) : card liveness check + It is a simple value inversion (~ C operator). + +0x08 (RW) : factorial computation + The stored value is taken and factorial of it is put back here. + This happens only after factorial bit in the status register (0x20 + below) is cleared. + +0x20 (RW) : status register, bitwise OR + 0x01 -- computing factorial (RO) + 0x80 -- raise interrupt 0x01 after finishing factorial computation + +0x24 (RO) : interrupt status register + It contains values which raised the interrupt (see interrupt raise + register below). + +0x60 (WO) : interrupt raise register + Raise an interrupt. The value will be put to the interrupt status + register (using bitwise OR). + +0x64 (WO) : interrupt acknowledge register + Clear an interrupt. The value will be cleared from the interrupt + status register.
This needs to be done from the ISR to stop + generating interrupts. + +0x80 (RW) : DMA source address + Where to perform the DMA from. + +0x88 (RW) : DMA destination address + Where to perform the DMA to. + +0x90 (RW) : DMA transfer count + The size of the area to perform the DMA on. + +0x98 (RW) : DMA command register, bitwise OR + 0x01 -- start transfer + 0x02 -- direction (0: from RAM to EDU, 1: from EDU to RAM) + 0x04 -- raise interrupt 0x100 after finishing the DMA + +IRQ controller +-------------- +An IRQ is generated when written to the interrupt raise register. The value +appears in interrupt status register when the interrupt is raised and has to +be written to the interrupt acknowledge register to lower it. + +DMA controller +-------------- +One has to specify, source, destination, size, and start the transfer. One +4096 bytes long buffer at offset 0x40000 is available in the EDU device. I.e. +one can perform DMA to/from this space when programmed properly. + +Example of transferring a 100 byte block to and from the buffer using a given +PCI address 'addr': +addr -> DMA source address +0x40000 -> DMA destination address +100 -> DMA transfer count +1 -> DMA command register +while (DMA command register & 1) + ; + +0x40000 -> DMA source address +addr+100 -> DMA destination address +100 -> DMA transfer count +3 -> DMA command register +while (DMA command register & 1) + ; diff --git a/src/docs/specs/fw_cfg.txt b/src/docs/specs/fw_cfg.txt new file mode 100644 index 0000000..b8c794f --- /dev/null +++ b/src/docs/specs/fw_cfg.txt @@ -0,0 +1,312 @@ +QEMU Firmware Configuration (fw_cfg) Device +=========================================== + += Guest-side Hardware Interface = + +This hardware interface allows the guest to retrieve various data items +(blobs) that can influence how the firmware configures itself, or may +contain tables to be installed for the guest OS. 
Examples include device +boot order, ACPI and SMBIOS tables, virtual machine UUID, SMP and NUMA +information, kernel/initrd images for direct (Linux) kernel booting, etc. + +== Selector (Control) Register == + +* Write only +* Location: platform dependent (IOport or MMIO) +* Width: 16-bit +* Endianness: little-endian (if IOport), or big-endian (if MMIO) + +A write to this register sets the index of a firmware configuration +item which can subsequently be accessed via the data register. + +Setting the selector register will cause the data offset to be set +to zero. The data offset impacts which data is accessed via the data +register, and is explained below. + +Bit14 of the selector register indicates whether the configuration +setting is being written. A value of 0 means the item is only being +read, and all write access to the data port will be ignored. A value +of 1 means the item's data can be overwritten by writes to the data +register. In other words, configuration write mode is enabled when +the selector value is between 0x4000-0x7fff or 0xc000-0xffff. + +NOTE: As of QEMU v2.4, writes to the fw_cfg data register are no + longer supported, and will be ignored (treated as no-ops)! + +Bit15 of the selector register indicates whether the configuration +setting is architecture specific. A value of 0 means the item is a +generic configuration item. A value of 1 means the item is specific +to a particular architecture. In other words, generic configuration +items are accessed with a selector value between 0x0000-0x7fff, and +architecture specific configuration items are accessed with a selector +value between 0x8000-0xffff. 
+ +== Data Register == + +* Read/Write (writes ignored as of QEMU v2.4) +* Location: platform dependent (IOport [*] or MMIO) +* Width: 8-bit (if IOport), 8/16/32/64-bit (if MMIO) +* Endianness: string-preserving + +[*] On platforms where the data register is exposed as an IOport, its +port number will always be one greater than the port number of the +selector register. In other words, the two ports overlap, and can not +be mapped separately. + +The data register allows access to an array of bytes for each firmware +configuration data item. The specific item is selected by writing to +the selector register, as described above. + +Initially following a write to the selector register, the data offset +will be set to zero. Each successful access to the data register will +increment the data offset by the appropriate access width. + +Each firmware configuration item has a maximum length of data +associated with the item. After the data offset has passed the +end of this maximum data length, then any reads will return a data +value of 0x00, and all writes will be ignored. + +An N-byte wide read of the data register will return the next available +N bytes of the selected firmware configuration item, as a substring, in +increasing address order, similar to memcpy(). + +== Register Locations == + +=== x86, x86_64 Register Locations === + +Selector Register IOport: 0x510 +Data Register IOport: 0x511 +DMA Address IOport: 0x514 + +=== ARM Register Locations === + +Selector Register address: Base + 8 (2 bytes) +Data Register address: Base + 0 (8 bytes) +DMA Address address: Base + 16 (8 bytes) + +== Firmware Configuration Items == + +=== Signature (Key 0x0000, FW_CFG_SIGNATURE) === + +The presence of the fw_cfg selector and data registers can be verified +by selecting the "signature" item using key 0x0000 (FW_CFG_SIGNATURE), +and reading four bytes from the data register. If the fw_cfg device is +present, the four bytes read will contain the characters "QEMU". 
+ +If the DMA interface is available, then reading the DMA Address +Register returns 0x51454d5520434647 ("QEMU CFG" in big-endian format). + +=== Revision / feature bitmap (Key 0x0001, FW_CFG_ID) === + +A 32-bit little-endian unsigned int, this item is used to check for enabled +features. + - Bit 0: traditional interface. Always set. + - Bit 1: DMA interface. + +=== File Directory (Key 0x0019, FW_CFG_FILE_DIR) === + +Firmware configuration items stored at selector keys 0x0020 or higher +(FW_CFG_FILE_FIRST or higher) have an associated entry in a directory +structure, which makes it easier for guest-side firmware to identify +and retrieve them. The format of this file directory (from fw_cfg.h in +the QEMU source tree) is shown here, slightly annotated for clarity: + +struct FWCfgFiles { /* the entire file directory fw_cfg item */ + uint32_t count; /* number of entries, in big-endian format */ + struct FWCfgFile f[]; /* array of file entries, see below */ +}; + +struct FWCfgFile { /* an individual file entry, 64 bytes total */ + uint32_t size; /* size of referenced fw_cfg item, big-endian */ + uint16_t select; /* selector key of fw_cfg item, big-endian */ + uint16_t reserved; + char name[56]; /* fw_cfg item name, NUL-terminated ascii */ +}; + +=== All Other Data Items === + +Please consult the QEMU source for the most up-to-date and authoritative +list of selector keys and their respective items' purpose and format. + +=== Ranges === + +Theoretically, there may be up to 0x4000 generic firmware configuration +items, and up to 0x4000 architecturally specific ones. + +Selector Reg. Range Usage +--------------- ----------- +0x0000 - 0x3fff Generic (0x0000 - 0x3fff, RO) +0x4000 - 0x7fff Generic (0x0000 - 0x3fff, RW, ignored in QEMU v2.4+) +0x8000 - 0xbfff Arch. Specific (0x0000 - 0x3fff, RO) +0xc000 - 0xffff Arch. 
Specific (0x0000 - 0x3fff, RW, ignored in v2.4+) + +In practice, the number of allowed firmware configuration items is given +by the value of FW_CFG_MAX_ENTRY (see fw_cfg.h). + += Guest-side DMA Interface = + +If bit 1 of the feature bitmap is set, the DMA interface is present. This does +not replace the existing fw_cfg interface, it is an add-on. This interface +can be used through the 64-bit wide address register. + +The address register is in big-endian format. The value for the register is 0 +at startup and after an operation. A write to the least significant half (at +offset 4) triggers an operation. This means that operations with 32-bit +addresses can be triggered with just one write, whereas operations with +64-bit addresses can be triggered with one 64-bit write or two 32-bit writes, +starting with the most significant half (at offset 0). + +In this register, the physical address of a FWCfgDmaAccess structure in RAM +should be written. This is the format of the FWCfgDmaAccess structure: + +typedef struct FWCfgDmaAccess { + uint32_t control; + uint32_t length; + uint64_t address; +} FWCfgDmaAccess; + +The fields of the structure are in big endian mode, and the field at the lowest +address is the "control" field. + +The "control" field has the following bits: + - Bit 0: Error + - Bit 1: Read + - Bit 2: Skip + - Bit 3: Select. The upper 16 bits are the selected index. + +When an operation is triggered, if the "control" field has bit 3 set, the +upper 16 bits are interpreted as an index of a firmware configuration item. +This has the same effect as writing the selector register. + +If the "control" field has bit 1 set, a read operation will be performed. +"length" bytes for the current selector and offset will be copied into the +physical RAM address specified by the "address" field. + +If the "control" field has bit 2 set (and not bit 1), a skip operation will be +performed. The offset for the current selector will be advanced "length" bytes. 
+ +To check the result, read the "control" field: + error bit set -> something went wrong. + all bits cleared -> transfer finished successfully. + otherwise -> transfer still in progress (doesn't happen + today due to implementation not being async, + but may in the future). + += Host-side API = + +The following functions are available to the QEMU programmer for adding +data to a fw_cfg device during guest initialization (see fw_cfg.h for +each function's complete prototype): + +== fw_cfg_add_bytes() == + +Given a selector key value, starting pointer, and size, create an item +as a raw "blob" of the given size, available by selecting the given key. +The data referenced by the starting pointer is only linked, NOT copied, +into the data structure of the fw_cfg device. + +== fw_cfg_add_string() == + +Instead of a starting pointer and size, this function accepts a pointer +to a NUL-terminated ascii string, and inserts a newly allocated copy of +the string (including the NUL terminator) into the fw_cfg device data +structure. + +== fw_cfg_add_iXX() == + +Insert an XX-bit item, where XX may be 16, 32, or 64. These functions +will convert a 16-, 32-, or 64-bit integer to little-endian, then add +a dynamically allocated copy of the appropriately sized item to fw_cfg +under the given selector key value. + +== fw_cfg_modify_iXX() == + +Modify the value of an XX-bit item (where XX may be 16, 32, or 64). +Similarly to the corresponding fw_cfg_add_iXX() function set, convert +a 16-, 32-, or 64-bit integer to little endian, create a dynamically +allocated copy of the required size, and replace the existing item at +the given selector key value with the newly allocated one. The previous +item, assumed to have been allocated during an earlier call to +fw_cfg_add_iXX() or fw_cfg_modify_iXX() (of the same width XX), is freed +before the function returns. 
+ +== fw_cfg_add_file() == + +Given a filename (i.e., fw_cfg item name), starting pointer, and size, +create an item as a raw "blob" of the given size. Unlike fw_cfg_add_bytes() +above, the next available selector key (above 0x0020, FW_CFG_FILE_FIRST) +will be used, and a new entry will be added to the file directory structure +(at key 0x0019), containing the item name, blob size, and automatically +assigned selector key value. The data referenced by the starting pointer +is only linked, NOT copied, into the fw_cfg data structure. + +== fw_cfg_add_file_callback() == + +Like fw_cfg_add_file(), but additionally sets pointers to a callback +function (and opaque argument), which will be executed host-side by +QEMU each time a byte is read by the guest from this particular item. + +NOTE: The callback function is given the opaque argument set by +fw_cfg_add_file_callback(), but also the current data offset, +allowing it the option of only acting upon specific offset values +(e.g., 0, before the first data byte of the selected item is +returned to the guest). + +== fw_cfg_modify_file() == + +Given a filename (i.e., fw_cfg item name), starting pointer, and size, +completely replace the configuration item referenced by the given item +name with the new given blob. If an existing blob is found, its +callback information is removed, and a pointer to the old data is +returned to allow the caller to free it, helping avoid memory leaks. +If a configuration item does not already exist under the given item +name, a new item will be created as with fw_cfg_add_file(), and NULL +is returned to the caller. In any case, the data referenced by the +starting pointer is only linked, NOT copied, into the fw_cfg data +structure. 
+ +== fw_cfg_add_callback() == + +Like fw_cfg_add_bytes(), but additionally sets pointers to a callback +function (and opaque argument), which will be executed host-side by +QEMU each time a guest-side write operation to this particular item +completes fully overwriting the item's data. + +NOTE: This function is deprecated, and will be completely removed +starting with QEMU v2.4. + +== Externally Provided Items == + +As of v2.4, "file" fw_cfg items (i.e., items with selector keys above +FW_CFG_FILE_FIRST, and with a corresponding entry in the fw_cfg file +directory structure) may be inserted via the QEMU command line, using +the following syntax: + + -fw_cfg [name=]<item_name>,file=<path> + +where <item_name> is the fw_cfg item name, and <path> is the location +on the host file system of a file containing the data to be inserted. + +Small enough items may be provided directly as strings on the command +line, using the syntax: + + -fw_cfg [name=]<item_name>,string=<string> + +The terminating NUL character of the content <string> will NOT be +included as part of the fw_cfg item data, which is consistent with +the absence of a NUL terminator for items inserted via the file option. + +Both <item_name> and, if applicable, the content <string> are passed +through by QEMU without any interpretation, expansion, or further +processing. Any such processing (potentially performed e.g., by the shell) +is outside of QEMU's responsibility; as such, using plain ASCII characters +is recommended. + +NOTE: Users *SHOULD* choose item names beginning with the prefix "opt/" +when using the "-fw_cfg" command line option, to avoid conflicting with +item names used internally by QEMU. For instance: + + -fw_cfg name=opt/my_item_name,file=./my_blob.bin + +Similarly, QEMU developers *SHOULD NOT* use item names prefixed with +"opt/" when inserting items programmatically, e.g. via fw_cfg_add_file(). 
diff --git a/src/docs/specs/ivshmem_device_spec.txt b/src/docs/specs/ivshmem_device_spec.txt new file mode 100644 index 0000000..d318d65 --- /dev/null +++ b/src/docs/specs/ivshmem_device_spec.txt @@ -0,0 +1,161 @@ + +Device Specification for Inter-VM shared memory device +------------------------------------------------------ + +The Inter-VM shared memory device is designed to share a memory region (created +on the host via the POSIX shared memory API) between multiple QEMU processes +running different guests. In order for all guests to be able to pick up the +shared memory area, it is modeled by QEMU as a PCI device exposing said memory +to the guest as a PCI BAR. +The memory region does not belong to any guest, but is a POSIX memory object on +the host. The host can access this shared memory if needed. + +The device also provides an optional communication mechanism between guests +sharing the same memory object. More details about that are given in the 'Guest to +guest communication' section. + + +The Inter-VM PCI device +----------------------- + +From the VM point of view, the ivshmem PCI device supports three BARs. + +- BAR0 is a 1 Kbyte MMIO region to support registers and interrupts when MSI is + not used. +- BAR1 is used for MSI-X when it is enabled in the device. +- BAR2 is used to access the shared memory object. + +It is your choice how to use the device but you must choose between two +behaviors: + +- basically, if you only need the shared memory part, you will map BAR2. + This way, you have access to the shared memory in guest and can use it as you + see fit (memnic, for example, uses it in userland + http://dpdk.org/browse/memnic). + +- BAR0 and BAR1 are used to implement an optional communication mechanism + through interrupts in the guests. If you need an event mechanism between the + guests accessing the shared memory, you will most likely want to write a + kernel driver that will handle interrupts.
See details in the 'Guest + to guest communication' section. + +The behavior is chosen when starting your QEMU processes: +- no communication mechanism needed, the first QEMU to start creates the shared + memory on the host, subsequent QEMU processes will use it. + +- communication mechanism needed, an ivshmem server must be started before any + QEMU processes, then each QEMU process connects to the server unix socket. + +For more details on the QEMU ivshmem parameters, see qemu-doc documentation. + + +Guest to guest communication +---------------------------- + +This section details the communication mechanism between the guests accessing +the ivshmem shared memory. + +*ivshmem server* + +This server code is available in qemu.git/contrib/ivshmem-server. + +The server must be started on the host before any guest. +It creates a shared memory object then waits for clients to connect on a unix +socket. All the messages are little-endian int64_t integers. + +For each client (QEMU process) that connects to the server: +- the server sends a protocol version, if client does not support it, the client + closes the communication, +- the server assigns an ID for this client and sends this ID to it as the first + message, +- the server sends a fd to the shared memory object to this client, +- the server creates a new set of host eventfds associated to the new client and + sends this set to all already connected clients, +- finally, the server sends all the eventfds sets for all clients to the new + client. + +The server signals all clients when one of them disconnects. + +The client IDs are limited to 16 bits because of the current implementation (see +Doorbell register in 'PCI device registers' subsection). Hence only 65536 +clients are supported. + +All the file descriptors (fd to the shared memory, eventfds for each client) +are passed to clients using SCM_RIGHTS over the server unix socket.
+ +Apart from the current ivshmem implementation in QEMU, an ivshmem client has +been provided in qemu.git/contrib/ivshmem-client for debug. + +*QEMU as an ivshmem client* + +At initialisation, when creating the ivshmem device, QEMU first receives a +protocol version and closes communication with server if it does not match. +Then, QEMU gets its ID from the server then makes it available through BAR0 +IVPosition register for the VM to use (see 'PCI device registers' subsection). +QEMU then uses the fd to the shared memory to map it to BAR2. +eventfds for all other clients received from the server are stored to implement +BAR0 Doorbell register (see 'PCI device registers' subsection). +Finally, eventfds assigned to this QEMU process are used to send interrupts in +this VM. + +*PCI device registers* + +From the VM point of view, the ivshmem PCI device supports 4 registers of +32-bits each. + +enum ivshmem_registers { + IntrMask = 0, + IntrStatus = 4, + IVPosition = 8, + Doorbell = 12 +}; + +The first two registers are the interrupt mask and status registers. Mask and +status are only used with pin-based interrupts. They are unused with MSI +interrupts. + +Status Register: The status register is set to 1 when an interrupt occurs. + +Mask Register: The mask register is bitwise ANDed with the interrupt status +and the result will raise an interrupt if it is non-zero. However, since 1 is +the only value the status will be set to, it is only the first bit of the mask +that has any effect. Therefore interrupts can be masked by setting the first +bit to 0 and unmasked by setting the first bit to 1. + +IVPosition Register: The IVPosition register is read-only and reports the +guest's ID number. The guest IDs are non-negative integers. When using the +server, since the server is a separate process, the VM ID will only be set when +the device is ready (shared memory is received from the server and accessible +via the device). 
If the device is not ready, the IVPosition will return -1. +Applications should ensure that they have a valid VM ID before accessing the +shared memory. + +Doorbell Register: To interrupt another guest, a guest must write to the +Doorbell register. The doorbell register is 32-bits, logically divided into +two 16-bit fields. The high 16-bits are the guest ID to interrupt and the low +16-bits are the interrupt vector to trigger. The semantics of the value +written to the doorbell depends on whether the device is using MSI or a regular +pin-based interrupt. In short, MSI uses vectors while regular interrupts set +the status register. + +Regular Interrupts + +If regular interrupts are used (due to either a guest not supporting MSI or the +user specifying not to use them on startup) then the value written to the lower +16-bits of the Doorbell register results is arbitrary and will trigger an +interrupt in the destination guest. + +Message Signalled Interrupts + +An ivshmem device may support multiple MSI vectors. If so, the lower 16-bits +written to the Doorbell register must be between 0 and the maximum number of +vectors the guest supports. The lower 16 bits written to the doorbell is the +MSI vector that will be raised in the destination guest. The number of MSI +vectors is configurable but it is set when the VM is started. + +The important thing to remember with MSI is that it is only a signal, no status +is set (since MSI interrupts are not shared). All information other than the +interrupt itself should be communicated via the shared memory region. Devices +supporting multiple MSI vectors can use different vectors to indicate different +events have occurred. The semantics of interrupt vectors are left to the +user's discretion. diff --git a/src/docs/specs/pci-ids.txt b/src/docs/specs/pci-ids.txt new file mode 100644 index 0000000..0adcb89 --- /dev/null +++ b/src/docs/specs/pci-ids.txt @@ -0,0 +1,54 @@ + +PCI IDs for qemu +================ + +Red Hat, Inc. 
donates a part of its device ID range to qemu, to be used for +virtual devices. The vendor IDs are 1af4 (formerly Qumranet ID) and 1b36. + +Contact Gerd Hoffmann <kraxel@redhat.com> to get a device ID assigned +for your devices. + +1af4 vendor ID +-------------- + +The 1000 -> 10ff device ID range is used as follows for virtio-pci devices. +Note that this allocation is separate from the virtio device IDs, which are +maintained as part of the virtio specification. + +1af4:1000 network device +1af4:1001 block device +1af4:1002 balloon device +1af4:1003 console device +1af4:1004 SCSI host bus adapter device +1af4:1005 entropy generator device +1af4:1009 9p filesystem device + +1af4:10f0 Available for experimental usage without registration. Must get + to official ID when the code leaves the test lab (i.e. when seeking +1af4:10ff upstream merge or shipping a distro/product) to avoid conflicts. + +1af4:1100 Used as PCI Subsystem ID for existing hardware devices emulated + by qemu. + +1af4:1110 ivshmem device (shared memory, docs/specs/ivshmem_device_spec.txt) + +All other device IDs are reserved. + +1b36 vendor ID +-------------- + +The 0000 -> 00ff device ID range is used as follows for QEMU-specific +PCI devices (other than virtio): + +1b36:0001 PCI-PCI bridge +1b36:0002 PCI serial port (16550A) adapter (docs/specs/pci-serial.txt) +1b36:0003 PCI Dual-port 16550A adapter (docs/specs/pci-serial.txt) +1b36:0004 PCI Quad-port 16550A adapter (docs/specs/pci-serial.txt) +1b36:0005 PCI test device (docs/specs/pci-testdev.txt) +1b36:0006 PCI Rocker Ethernet switch device +1b36:0007 PCI SD Card Host Controller Interface (SDHCI) +1b36:000a PCI-PCI bridge (multiseat) + +All these devices are documented in docs/specs. + +The 0100 device ID is used for the QXL video card device.
diff --git a/src/docs/specs/pci-serial.txt b/src/docs/specs/pci-serial.txt new file mode 100644 index 0000000..66c761f --- /dev/null +++ b/src/docs/specs/pci-serial.txt @@ -0,0 +1,34 @@ + +QEMU pci serial devices +======================= + +There is one single-port variant and two multiport variants. Linux +guests work out-of-the-box with all cards. There is a Windows inf file +(docs/qemupciserial.inf) to set up the single-port card in Windows +guests. + + +single-port card +---------------- + +Name: pci-serial +PCI ID: 1b36:0002 + +PCI Region 0: + IO bar, 8 bytes long, with the 16550 uart mapped to it. + Interrupt is wired to pin A. + + +multiport cards +--------------- + +Name: pci-serial-2x +PCI ID: 1b36:0003 + +Name: pci-serial-4x +PCI ID: 1b36:0004 + +PCI Region 0: + IO bar, with two/four 16550 uart mapped after each other. + The first is at offset 0, second at offset 8, ... + Interrupt is wired to pin A. diff --git a/src/docs/specs/pci-testdev.txt b/src/docs/specs/pci-testdev.txt new file mode 100644 index 0000000..128ae22 --- /dev/null +++ b/src/docs/specs/pci-testdev.txt @@ -0,0 +1,26 @@ +pci-test is a device used for testing low level IO + +device implements up to two BARs: BAR0 and BAR1. +Each BAR can be memory or IO. Guests must detect +BAR type and act accordingly. + +Each BAR size is up to 4K bytes. +Each BAR starts with the following header: + +typedef struct PCITestDevHdr { + uint8_t test; <- write-only, starts a given test number + uint8_t width_type; <- read-only, type and width of access for a given test. + 1,2,4 for byte,word or long write. + any other value if test not supported on this BAR + uint8_t pad0[2]; + uint32_t offset; <- read-only, offset in this BAR for a given test + uint32_t data; <- read-only, data to use for a given test + uint32_t count; <- for debugging. number of writes detected. + uint8_t name[]; <- for debugging. 0-terminated ASCII string. +} PCITestDevHdr; + +All registers are little endian.
+ +device is expected to always implement tests 0 to N on each BAR, and to add new +tests with higher numbers. In this way a guest can scan test numbers until it +detects an access type that it does not support on this BAR, then stop. diff --git a/src/docs/specs/ppc-spapr-hcalls.txt b/src/docs/specs/ppc-spapr-hcalls.txt new file mode 100644 index 0000000..5bd8eab --- /dev/null +++ b/src/docs/specs/ppc-spapr-hcalls.txt @@ -0,0 +1,78 @@ +When used with the "pseries" machine type, QEMU-system-ppc64 implements +a set of hypervisor calls using a subset of the server "PAPR" specification +(IBM internal at this point), which is also what IBM's proprietary hypervisor +adheres to. + +The subset is selected based on the requirements of Linux as a guest. + +In addition to those calls, we have added our own private hypervisor +calls which are mostly used as a private interface between the firmware +running in the guest and QEMU. + +All those hypercalls start at hcall number 0xf000 which corresponds +to an implementation-specific range in PAPR. + +- H_RTAS (0xf000) + +RTAS is a set of runtime services generally provided by the firmware +inside the guest to the operating system. It predates the existence +of hypervisors (it was originally an extension to Open Firmware) and +is still used by PAPR to provide various services that aren't performance +sensitive. + +We currently implement the RTAS services in QEMU itself. The actual RTAS +"firmware" blob in the guest is a small stub of a few instructions which +calls our private H_RTAS hypervisor call to pass the RTAS calls to QEMU.
+ +Arguments: + + r3 : H_RTAS (0xf000) + r4 : Guest physical address of RTAS parameter block + +Returns: + + H_SUCCESS : Successfully called the RTAS function (RTAS result + will have been stored in the parameter block) + H_PARAMETER : Unknown token + +- H_LOGICAL_MEMOP (0xf001) + +When the guest runs in "real mode" (in powerpc lingua this means +with MMU disabled, ie guest effective == guest physical), it only +has access to a subset of memory and no IOs. + +PAPR provides a set of hypervisor calls to perform cacheable or +non-cacheable accesses to any guest physical addresses that the +guest can use in order to access IO devices while in real mode. + +This is typically used by the firmware running in the guest. + +However, doing a hypercall for each access is extremely inefficient +(even more so when running KVM) when accessing the frame buffer. In +that case, things like scrolling become unusably slow. + +This hypercall allows the guest to request a "memory op" to be applied +to memory. The supported memory ops at this point are to copy a range +of memory (supports overlap of source and destination) and XOR which +is used by our SLOF firmware to invert the screen. 
+ +Arguments: + + r3: H_LOGICAL_MEMOP (0xf001) + r4: Guest physical address of destination + r5: Guest physical address of source + r6: Individual element size + 0 = 1 byte + 1 = 2 bytes + 2 = 4 bytes + 3 = 8 bytes + r7: Number of elements + r8: Operation + 0 = copy + 1 = xor + +Returns: + + H_SUCCESS : Success + H_PARAMETER : Invalid argument + diff --git a/src/docs/specs/ppc-spapr-hotplug.txt b/src/docs/specs/ppc-spapr-hotplug.txt new file mode 100644 index 0000000..631b0ca --- /dev/null +++ b/src/docs/specs/ppc-spapr-hotplug.txt @@ -0,0 +1,353 @@ += sPAPR Dynamic Reconfiguration = + +sPAPR/"pseries" guests make use of a facility called dynamic-reconfiguration +to handle hotplugging of dynamic "physical" resources like PCI cards, or +"logical"/paravirtual resources like memory, CPUs, and "physical" +host-bridges, which are generally managed by the host/hypervisor and provided +to guests as virtualized resources. The specifics of dynamic-reconfiguration +are documented extensively in PAPR+ v2.7, Section 13.1. This document +provides a summary of that information as it applies to the implementation +within QEMU. + +== Dynamic-reconfiguration Connectors == + +To manage hotplug/unplug of these resources, a firmware abstraction known as +a Dynamic Resource Connector (DRC) is used to assign a particular dynamic +resource to the guest, and provide an interface for the guest to manage +configuration/removal of the resource associated with it. + +== Device-tree description of DRCs == + +A set of 4 Open Firmware device tree array properties are used to describe +the name/index/power-domain/type of each DRC allocated to a guest at +boot-time. There may be multiple sets of these arrays, rooted at different +paths in the device tree depending on the type of resource the DRCs manage. + +In some cases, the DRCs themselves may be provided by a dynamic resource, +such as the DRCs managing PCI slots on a hotplugged PHB. 
In this case the +arrays would be fetched as part of the device tree retrieval interfaces +for hotplugged resources described under "Guest->Host interface". + +The array properties are described below. Each entry/element in an array +describes the DRC identified by the element in the corresponding position +of ibm,drc-indexes: + +ibm,drc-names: + first 4-bytes: BE-encoded integer denoting the number of entries + each entry: a NULL-terminated <name> string encoded as a byte array + + <name> values for logical/virtual resources are defined in PAPR+ v2.7, + Section 13.5.2.4, and basically consist of the type of the resource + followed by a space and a numerical value that's unique across resources + of that type. + + <name> values for "physical" resources such as PCI or VIO devices are + defined as being "location codes", which are the "location labels" of + each encapsulating device, starting from the chassis down to the + individual slot for the device, concatenated by a hyphen. This provides + a mapping of resources to a physical location in a chassis for debugging + purposes. For QEMU, this mapping is less important, so we assign a + location code that conforms to naming specifications, but is simply a + location label for the slot by itself to simplify the implementation. + The naming convention for location labels is documented in detail in + PAPR+ v2.7, Section 12.3.1.5, and in our case amounts to using "C<n>" + for PCI/VIO device slots, where <n> is unique across all PCI/VIO + device slots. 
+ +ibm,drc-indexes: + first 4-bytes: BE-encoded integer denoting the number of entries + each 4-byte entry: BE-encoded <index> integer that is unique across all DRCs + in the machine + + <index> is arbitrary, but in the case of QEMU we try to maintain the + convention used to assign them to pSeries guests on pHyp: + + bit[31:28]: integer encoding of <type>, where <type> is: + 1 for CPU resource + 2 for PHB resource + 3 for VIO resource + 4 for PCI resource + 8 for Memory resource + bit[27:0]: integer encoding of <id>, where <id> is unique across + all resources of specified type + +ibm,drc-power-domains: + first 4-bytes: BE-encoded integer denoting the number of entries + each 4-byte entry: 32-bit, BE-encoded <index> integer that specifies the + power domain the resource will be assigned to. In the case of QEMU + we associated all resources with a "live insertion" domain, where the + power is assumed to be managed automatically. The integer value for + this domain is a special value of -1. + + +ibm,drc-types: + first 4-bytes: BE-encoded integer denoting the number of entries + each entry: a NULL-terminated <type> string encoded as a byte array + + <type> is assigned as follows: + "CPU" for a CPU + "PHB" for a physical host-bridge + "SLOT" for a VIO slot + "28" for a PCI slot + "MEM" for memory resource + +== Guest->Host interface to manage dynamic resources == + +Each DRC is given a globally unique DRC Index, and resources associated with +a particular DRC are configured/managed by the guest via a number of RTAS +calls which reference individual DRCs based on the DRC index. This can be +considered the guest->host interface. 
+ +rtas-set-power-level: + arg[0]: integer identifying power domain + arg[1]: new power level for the domain, 0-100 + output[0]: status, 0 on success + output[1]: power level after command + + Set the power level for a specified power domain + +rtas-get-power-level: + arg[0]: integer identifying power domain + output[0]: status, 0 on success + output[1]: current power level + + Get the power level for a specified power domain + +rtas-set-indicator: + arg[0]: integer identifying sensor/indicator type + arg[1]: index of sensor, for DR-related sensors this is generally the + DRC index + arg[2]: desired sensor value + output[0]: status, 0 on success + + Set the state of an indicator or sensor. For the purpose of this document we + focus on the indicator/sensor types associated with a DRC. The types are: + + 9001: isolation-state, controls/indicates whether a device has been made + accessible to a guest + + supported sensor values: + 0: isolate, device is made inaccessible to guest OS + 1: unisolate, device is made available to guest OS + + 9002: dr-indicator, controls "visual" indicator associated with device + + supported sensor values: + 0: inactive, resource may be safely removed + 1: active, resource is in use and cannot be safely removed + 2: identify, used to visually identify slot for interactive hotplug + 3: action, in most cases, used in the same manner as identify + + 9003: allocation-state, generally only used for "logical" DR resources to + request the allocation/deallocation of a resource prior to acquiring + it via isolation-state->unisolate, or after releasing it via + isolation-state->isolate, respectively. For "physical" DR (like PCI + hotplug/unplug) the pre-allocation of the resource is implied and + this sensor is unused.
+ + supported sensor values: + 0: unusable, tell firmware/system the resource can be + unallocated/reclaimed and added back to the system resource pool + 1: usable, request the resource be allocated/reserved for use by + guest OS + 2: exchange, used to allocate a spare resource to use for fail-over + in certain situations. unused in QEMU + 3: recover, used to reclaim a previously allocated resource that's + not currently allocated to the guest OS. unused in QEMU + +rtas-get-sensor-state: + arg[0]: integer identifying sensor/indicator type + arg[1]: index of sensor, for DR-related sensors this is generally the + DRC index + output[0]: status, 0 on success + + Used to read an indicator or sensor value. + + For DR-related operations, the only noteworthy sensor is dr-entity-sense, + which has a type value of 9003, as allocation-state does in the case of + rtas-set-indicator. The semantics/encodings of the sensor values are distinct + however: + + supported sensor values for dr-entity-sense (9003) sensor: + 0: empty, + for physical resources: DRC/slot is empty + for logical resources: unused + 1: present, + for physical resources: DRC/slot is populated with a device/resource + for logical resources: resource has been allocated to the DRC + 2: unusable, + for physical resources: unused + for logical resources: DRC has no resource allocated to it + 3: exchange, + for physical resources: unused + for logical resources: resource available for exchange (see + allocation-state sensor semantics above) + 4: recovery, + for physical resources: unused + for logical resources: resource available for recovery (see + allocation-state sensor semantics above) + +rtas-ibm-configure-connector: + arg[0]: guest physical address of 4096-byte work area buffer + arg[1]: 0, or address of additional 4096-byte work area buffer. 
only non-zero + if a prior RTAS response indicated a need for additional memory + output[0]: status: + 0: completed transmittal of device-tree node + 1: instruct guest to prepare for next DT sibling node + 2: instruct guest to prepare for next DT child node + 3: instruct guest to prepare for next DT property + 4: instruct guest to ascend to parent DT node + 5: instruct guest to provide additional work-area buffer + via arg[1] + 990x: instruct guest that operation took too long and to try + again later + + Used to fetch an OF device-tree description of the resource associated with + a particular DRC. The DRC index is encoded in the first 4-bytes of the first + work area buffer. + + Work area layout, using 4-byte offsets: + wa[0]: DRC index of the DRC to fetch device-tree nodes from + wa[1]: 0 (hard-coded) + wa[2]: for next-sibling/next-child response: + wa offset of null-terminated string denoting the new node's name + for next-property response: + wa offset of null-terminated string denoting new property's name + wa[3]: for next-property response (unused otherwise): + byte-length of new property's value + wa[4]: for next-property response (unused otherwise): + new property's value, encoded as an OFDT-compatible byte array + +== hotplug/unplug events == + +For most DR operations, the hypervisor will issue host->guest add/remove events +using the EPOW/check-exception notification framework, where the host issues a +check-exception interrupt, then provides an RTAS event log via an +rtas-check-exception call issued by the guest in response. This framework is +documented by PAPR+ v2.7, and already in use by QEMU for generating powerdown +requests via EPOW events. + +For DR, this framework has been extended to include hotplug events, which were +previously unneeded due to direct manipulation of DR-related guest userspace +tools by host-level management such as an HMC.
This level of management is not +applicable to PowerKVM, hence the reason for extending the notification +framework to support hotplug events. + +Note that these events are not yet formally part of the PAPR+ specification, +but support for this format has already been implemented in DR-related +guest tools such as powerpc-utils/librtas, as well as kernel patches that have +been submitted to handle in-kernel processing of memory/cpu-related hotplug +events[1], and is planned for formal inclusion in the PAPR+ specification. The +hotplug-specific payload in QEMU is implemented as follows (with all values +encoded in big-endian format): + +struct rtas_event_log_v6_hp { +#define SECTION_ID_HOTPLUG 0x4850 /* HP */ + struct section_header { + uint16_t section_id; /* set to SECTION_ID_HOTPLUG */ + uint16_t section_length; /* sizeof(rtas_event_log_v6_hp), + * plus the length of the DRC name + * if a DRC name identifier is + * specified for hotplug_identifier + */ + uint8_t section_version; /* version 1 */ + uint8_t section_subtype; /* unused */ + uint16_t creator_component_id; /* unused */ + } hdr; +#define RTAS_LOG_V6_HP_TYPE_CPU 1 +#define RTAS_LOG_V6_HP_TYPE_MEMORY 2 +#define RTAS_LOG_V6_HP_TYPE_SLOT 3 +#define RTAS_LOG_V6_HP_TYPE_PHB 4 +#define RTAS_LOG_V6_HP_TYPE_PCI 5 + uint8_t hotplug_type; /* type of resource/device */ +#define RTAS_LOG_V6_HP_ACTION_ADD 1 +#define RTAS_LOG_V6_HP_ACTION_REMOVE 2 + uint8_t hotplug_action; /* action (add/remove) */ +#define RTAS_LOG_V6_HP_ID_DRC_NAME 1 +#define RTAS_LOG_V6_HP_ID_DRC_INDEX 2 +#define RTAS_LOG_V6_HP_ID_DRC_COUNT 3 + uint8_t hotplug_identifier; /* type of the resource identifier, + * which serves as the discriminator + * for the 'drc' union field below + */ + uint8_t reserved; + union { + uint32_t index; /* DRC index of resource to take action + * on + */ + uint32_t count; /* number of DR resources to take + * action on (guest chooses which) + */ + char name[1]; /* string representing the name of the + * DRC to take action on +
*/ + } drc; +} QEMU_PACKED; + +== ibm,lrdr-capacity == + +ibm,lrdr-capacity is a property in the /rtas device tree node that identifies +the dynamic reconfiguration capabilities of the guest. It consists of a triple +consisting of <phys>, <size> and <maxcpus>. + + <phys>, encoded in BE format represents the maximum address in bytes and + hence the maximum memory that can be allocated to the guest. + + <size>, encoded in BE format represents the size increments in which + memory can be hot-plugged to the guest. + + <maxcpus>, a BE-encoded integer, represents the maximum number of + processors that the guest can have. + +pseries guests use this property to note the maximum allowed CPUs for the +guest. + +== ibm,dynamic-reconfiguration-memory == + +ibm,dynamic-reconfiguration-memory is a device tree node that represents +dynamically reconfigurable logical memory blocks (LMB). This node +is generated only when the guest advertises the support for it via +ibm,client-architecture-support call. Memory that is not dynamically +reconfigurable is represented by /memory nodes. The properties of this +node that are of interest to the sPAPR memory hotplug implementation +in QEMU are described here. + +ibm,lmb-size + +This 64bit integer defines the size of each dynamically reconfigurable LMB. + +ibm,associativity-lookup-arrays + +This property defines a lookup array in which the NUMA associativity +information for each LMB can be found. It is a property encoded array +that begins with an integer M, the number of associativity lists followed +by an integer N, the number of entries per associativity list and terminated +by M associativity lists each of length N integers. + +This property provides the same information as given by ibm,associativity +property in a /memory node. Each assigned LMB has an index value between +0 and M-1 which is used as an index into this table to select which +associativity list to use for the LMB. 
This index value for each LMB +is defined in ibm,dynamic-memory property. + +ibm,dynamic-memory + +This property describes the dynamically reconfigurable memory. It is a +property encoded array that has an integer N, the number of LMBs followed +by N LMB list entries. + +Each LMB list entry consists of the following elements: + +- Logical address of the start of the LMB encoded as a 64bit integer. This + corresponds to reg property in /memory node. +- DRC index of the LMB that corresponds to ibm,my-drc-index property + in a /memory node. +- Four bytes reserved for expansion. +- Associativity list index for the LMB that is used as an index into + ibm,associativity-lookup-arrays property described earlier. This + is used to retrieve the right associativity list to be used for this + LMB. +- A 32bit flags word. The bit at bit position 0x00000008 defines whether + the LMB is assigned to the partition as of boot time. + +[1] http://thread.gmane.org/gmane.linux.ports.ppc.embedded/75350/focus=106867 diff --git a/src/docs/specs/pvpanic.txt b/src/docs/specs/pvpanic.txt new file mode 100644 index 0000000..c7bbacc --- /dev/null +++ b/src/docs/specs/pvpanic.txt @@ -0,0 +1,39 @@ +PVPANIC DEVICE +============== + +pvpanic device is a simulated ISA device, through which a guest panic +event is sent to qemu, and a QMP event is generated. This allows +management apps (e.g. libvirt) to be notified and respond to the event. + +The management app has the option of waiting for GUEST_PANICKED events, +and/or polling for guest-panicked RunState, to learn when the pvpanic +device has fired a panic event. + +ISA Interface +------------- + +pvpanic exposes a single I/O port, by default 0x505. On read, the bits +recognized by the device are set. Software should ignore bits it doesn't +recognize. On write, the bits not recognized by the device are ignored. +Software should set only bits both itself and the device recognize.
+Currently, only bit 0 is recognized, setting it indicates a guest panic +has happened. + +ACPI Interface +-------------- + +pvpanic device is defined with ACPI ID "QEMU0001". Custom methods: + +RDPT: To determine whether guest panic notification is supported. +Arguments: None +Return: Returns a byte, bit 0 set to indicate guest panic + notification is supported. Other bits are reserved and + should be ignored. + +WRPT: To send a guest panic event +Arguments: Arg0 is a byte, with bit 0 set to indicate guest panic has + happened. Other bits are reserved and should be cleared. +Return: None + +The ACPI device will automatically refer to the right port in case it +is modified. diff --git a/src/docs/specs/qcow2.txt b/src/docs/specs/qcow2.txt new file mode 100644 index 0000000..f236d8c --- /dev/null +++ b/src/docs/specs/qcow2.txt @@ -0,0 +1,362 @@ +== General == + +A qcow2 image file is organized in units of constant size, which are called +(host) clusters. A cluster is the unit in which all allocations are done, +both for actual guest data and for image metadata. + +Likewise, the virtual disk as seen by the guest is divided into (guest) +clusters of the same size. + +All numbers in qcow2 are stored in Big Endian byte order. + + +== Header == + +The first cluster of a qcow2 image contains the file header: + + Byte 0 - 3: magic + QCOW magic string ("QFI\xfb") + + 4 - 7: version + Version number (valid values are 2 and 3) + + 8 - 15: backing_file_offset + Offset into the image file at which the backing file name + is stored (NB: The string is not null terminated). 0 if the + image doesn't have a backing file. + + 16 - 19: backing_file_size + Length of the backing file name in bytes. Must not be + longer than 1023 bytes. Undefined if the image doesn't have + a backing file. + + 20 - 23: cluster_bits + Number of bits that are used for addressing an offset + within a cluster (1 << cluster_bits is the cluster size). + Must not be less than 9 (i.e. 512 byte clusters). 
+ + Note: qemu as of today has an implementation limit of 2 MB + as the maximum cluster size and won't be able to open images + with larger cluster sizes. + + 24 - 31: size + Virtual disk size in bytes + + 32 - 35: crypt_method + 0 for no encryption + 1 for AES encryption + + 36 - 39: l1_size + Number of entries in the active L1 table + + 40 - 47: l1_table_offset + Offset into the image file at which the active L1 table + starts. Must be aligned to a cluster boundary. + + 48 - 55: refcount_table_offset + Offset into the image file at which the refcount table + starts. Must be aligned to a cluster boundary. + + 56 - 59: refcount_table_clusters + Number of clusters that the refcount table occupies + + 60 - 63: nb_snapshots + Number of snapshots contained in the image + + 64 - 71: snapshots_offset + Offset into the image file at which the snapshot table + starts. Must be aligned to a cluster boundary. + +If the version is 3 or higher, the header has the following additional fields. +For version 2, the values are assumed to be zero, unless specified otherwise +in the description of a field. + + 72 - 79: incompatible_features + Bitmask of incompatible features. An implementation must + fail to open an image if an unknown bit is set. + + Bit 0: Dirty bit. If this bit is set then refcounts + may be inconsistent, make sure to scan L1/L2 + tables to repair refcounts before accessing the + image. + + Bit 1: Corrupt bit. If this bit is set then any data + structure may be corrupt and the image must not + be written to (unless for regaining + consistency). + + Bits 2-63: Reserved (set to 0) + + 80 - 87: compatible_features + Bitmask of compatible features. An implementation can + safely ignore any unknown bits that are set. + + Bit 0: Lazy refcounts bit. If this bit is set then + lazy refcount updates can be used. This means + marking the image file dirty and postponing + refcount metadata updates. 
+ + Bits 1-63: Reserved (set to 0) + + 88 - 95: autoclear_features + Bitmask of auto-clear features. An implementation may only + write to an image with unknown auto-clear features if it + clears the respective bits from this field first. + + Bits 0-63: Reserved (set to 0) + + 96 - 99: refcount_order + Describes the width of a reference count block entry (width + in bits: refcount_bits = 1 << refcount_order). For version 2 + images, the order is always assumed to be 4 + (i.e. refcount_bits = 16). + This value may not exceed 6 (i.e. refcount_bits = 64). + + 100 - 103: header_length + Length of the header structure in bytes. For version 2 + images, the length is always assumed to be 72 bytes. + +Directly after the image header, optional sections called header extensions can +be stored. Each extension has a structure like the following: + + Byte 0 - 3: Header extension type: + 0x00000000 - End of the header extension area + 0xE2792ACA - Backing file format name + 0x6803f857 - Feature name table + other - Unknown header extension, can be safely + ignored + + 4 - 7: Length of the header extension data + + 8 - n: Header extension data + + n - m: Padding to round up the header extension size to the next + multiple of 8. + +Unless stated otherwise, each header extension type shall appear at most once +in the same image. + +If the image has a backing file then the backing file name should be stored in +the remaining space between the end of the header extension area and the end of +the first cluster. It is not allowed to store other data here, so that an +implementation can safely modify the header and add extensions without harming +data of compatible features that it doesn't support. Compatible features that +need space for additional data can use a header extension. + + +== Feature name table == + +The feature name table is an optional header extension that contains the name +for features used by the image. 
It can be used by applications that don't know +the respective feature (e.g. because the feature was introduced only later) to +display a useful error message. + +The number of entries in the feature name table is determined by the length of +the header extension data. Each entry looks like this: + + Byte 0: Type of feature (select feature bitmap) + 0: Incompatible feature + 1: Compatible feature + 2: Autoclear feature + + 1: Bit number within the selected feature bitmap (valid + values: 0-63) + + 2 - 47: Feature name (padded with zeros, but not necessarily null + terminated if it has full length) + + +== Host cluster management == + +qcow2 manages the allocation of host clusters by maintaining a reference count +for each host cluster. A refcount of 0 means that the cluster is free, 1 means +that it is used, and >= 2 means that it is used and any write access must +perform a COW (copy on write) operation. + +The refcounts are managed in a two-level table. The first level is called +refcount table and has a variable size (which is stored in the header). The +refcount table can cover multiple clusters, however it needs to be contiguous +in the image file. + +It contains pointers to the second level structures which are called refcount +blocks and are exactly one cluster in size. + +Given an offset into the image file, the refcount of its cluster can be obtained +as follows: + + refcount_block_entries = (cluster_size * 8 / refcount_bits) + + refcount_block_index = (offset / cluster_size) % refcount_block_entries + refcount_table_index = (offset / cluster_size) / refcount_block_entries + + refcount_block = load_cluster(refcount_table[refcount_table_index]); + return refcount_block[refcount_block_index]; + +Refcount table entry: + + Bit 0 - 8: Reserved (set to 0) + + 9 - 63: Bits 9-63 of the offset into the image file at which the + refcount block starts. Must be aligned to a cluster + boundary.
+ + If this is 0, the corresponding refcount block has not yet + been allocated. All refcounts managed by this refcount block + are 0. + +Refcount block entry (x = refcount_bits - 1): + + Bit 0 - x: Reference count of the cluster. If refcount_bits implies a + sub-byte width, note that bit 0 means the least significant + bit in this context. + + +== Cluster mapping == + +Just as for refcounts, qcow2 uses a two-level structure for the mapping of +guest clusters to host clusters. They are called L1 and L2 tables. + +The L1 table has a variable size (stored in the header) and may use multiple +clusters, however it must be contiguous in the image file. L2 tables are +exactly one cluster in size. + +Given an offset into the virtual disk, the offset into the image file can be +obtained as follows: + + l2_entries = (cluster_size / sizeof(uint64_t)) + + l2_index = (offset / cluster_size) % l2_entries + l1_index = (offset / cluster_size) / l2_entries + + l2_table = load_cluster(l1_table[l1_index]); + cluster_offset = l2_table[l2_index]; + + return cluster_offset + (offset % cluster_size) + +L1 table entry: + + Bit 0 - 8: Reserved (set to 0) + + 9 - 55: Bits 9-55 of the offset into the image file at which the L2 + table starts. Must be aligned to a cluster boundary. If the + offset is 0, the L2 table and all clusters described by this + L2 table are unallocated. + + 56 - 62: Reserved (set to 0) + + 63: 0 for an L2 table that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in the active L1 table. + +L2 table entry: + + Bit 0 - 61: Cluster descriptor + + 62: 0 for standard clusters + 1 for compressed clusters + + 63: 0 for a cluster that is unused or requires COW, 1 if its + refcount is exactly one. This information is only accurate + in L2 tables that are reachable from the active L1 + table. + +Standard Cluster Descriptor: + + Bit 0: If set to 1, the cluster reads as all zeros.
The host + cluster offset can be used to describe a preallocation, + but it won't be used for reading data from this cluster, + nor is data read from the backing file if the cluster is + unallocated. + + With version 2, this is always 0. + + 1 - 8: Reserved (set to 0) + + 9 - 55: Bits 9-55 of host cluster offset. Must be aligned to a + cluster boundary. If the offset is 0, the cluster is + unallocated. + + 56 - 61: Reserved (set to 0) + + +Compressed Clusters Descriptor (x = 62 - (cluster_bits - 8)): + + Bit 0 - x: Host cluster offset. This is usually _not_ aligned to a + cluster boundary! + + x+1 - 61: Compressed size of the images in sectors of 512 bytes + +If a cluster is unallocated, read requests shall read the data from the backing +file (except if bit 0 in the Standard Cluster Descriptor is set). If there is +no backing file or the backing file is smaller than the image, they shall read +zeros for all parts that are not covered by the backing file. + + +== Snapshots == + +qcow2 supports internal snapshots. Their basic principle of operation is to +switch the active L1 table, so that a different set of host clusters are +exposed to the guest. + +When creating a snapshot, the L1 table should be copied and the refcount of all +L2 tables and clusters reachable from this L1 table must be increased, so that +a write causes a COW and isn't visible in other snapshots. + +When loading a snapshot, bit 63 of all entries in the new active L1 table and +all L2 tables referenced by it must be reconstructed from the refcount table +as it doesn't need to be accurate in inactive L1 tables. + +A directory of all snapshots is stored in the snapshot table, a contiguous area +in the image file, whose starting offset and length are given by the header +fields snapshots_offset and nb_snapshots. The entries of the snapshot table +have variable length, depending on the length of ID, name and extra data. 
+ +Snapshot table entry: + + Byte 0 - 7: Offset into the image file at which the L1 table for the + snapshot starts. Must be aligned to a cluster boundary. + + 8 - 11: Number of entries in the L1 table of the snapshots + + 12 - 13: Length of the unique ID string describing the snapshot + + 14 - 15: Length of the name of the snapshot + + 16 - 19: Time at which the snapshot was taken in seconds since the + Epoch + + 20 - 23: Subsecond part of the time at which the snapshot was taken + in nanoseconds + + 24 - 31: Time that the guest was running until the snapshot was + taken in nanoseconds + + 32 - 35: Size of the VM state in bytes. 0 if no VM state is saved. + If there is VM state, it starts at the first cluster + described by first L1 table entry that doesn't describe a + regular guest cluster (i.e. VM state is stored like guest + disk content, except that it is stored at offsets that are + larger than the virtual disk presented to the guest) + + 36 - 39: Size of extra data in the table entry (used for future + extensions of the format) + + variable: Extra data for future extensions. Unknown fields must be + ignored. Currently defined are (offset relative to snapshot + table entry): + + Byte 40 - 47: Size of the VM state in bytes. 0 if no VM + state is saved. If this field is present, + the 32-bit value in bytes 32-35 is ignored. + + Byte 48 - 55: Virtual disk size of the snapshot in bytes + + Version 3 images must include extra data at least up to + byte 55. + + variable: Unique ID string for the snapshot (not null terminated) + + variable: Name of the snapshot (not null terminated) + + variable: Padding to round up the snapshot table entry size to the + next multiple of 8. 
diff --git a/src/docs/specs/qed_spec.txt b/src/docs/specs/qed_spec.txt new file mode 100644 index 0000000..7982e05 --- /dev/null +++ b/src/docs/specs/qed_spec.txt @@ -0,0 +1,138 @@ +=Specification= + +The file format looks like this: + + +----------+----------+----------+-----+ + | cluster0 | cluster1 | cluster2 | ... | + +----------+----------+----------+-----+ + +The first cluster begins with the '''header'''. The header contains information about where regular clusters start; this allows the header to be extensible and store extra information about the image file. A regular cluster may be a '''data cluster''', an '''L2''', or an '''L1 table'''. L1 and L2 tables are composed of one or more contiguous clusters. + +Normally the file size will be a multiple of the cluster size. If the file size is not a multiple, extra information after the last cluster may not be preserved if data is written. Legitimate extra information should use space between the header and the first regular cluster. + +All fields are little-endian. + +==Header== + Header { + uint32_t magic; /* QED\0 */ + + uint32_t cluster_size; /* in bytes */ + uint32_t table_size; /* for L1 and L2 tables, in clusters */ + uint32_t header_size; /* in clusters */ + + uint64_t features; /* format feature bits */ + uint64_t compat_features; /* compat feature bits */ + uint64_t autoclear_features; /* self-resetting feature bits */ + + uint64_t l1_table_offset; /* in bytes */ + uint64_t image_size; /* total logical image size, in bytes */ + + /* if (features & QED_F_BACKING_FILE) */ + uint32_t backing_filename_offset; /* in bytes from start of header */ + uint32_t backing_filename_size; /* in bytes */ + } + +Field descriptions: +* ''cluster_size'' must be a power of 2 in range [2^12, 2^26]. +* ''table_size'' must be a power of 2 in range [1, 16]. +* ''header_size'' is the number of clusters used by the header and any additional information stored before regular clusters. 
+* ''features'', ''compat_features'', and ''autoclear_features'' are file format extension bitmaps. They work as follows: +** An image with unknown ''features'' bits enabled must not be opened. File format changes that are not backwards-compatible must use ''features'' bits. +** An image with unknown ''compat_features'' bits enabled can be opened safely. The unknown features are simply ignored and represent backwards-compatible changes to the file format. +** An image with unknown ''autoclear_features'' bits enabled can be opened safely after clearing the unknown bits. This allows for backwards-compatible changes to the file format which degrade gracefully and can be re-enabled again by a new program later. +* ''l1_table_offset'' is the offset of the first byte of the L1 table in the image file and must be a multiple of ''cluster_size''. +* ''image_size'' is the block device size seen by the guest and must be a multiple of 512 bytes. +* ''backing_filename_offset'' and ''backing_filename_size'' describe a string in (byte offset, byte size) form. It is not NUL-terminated and has no alignment constraints. The string must be stored within the first ''header_size'' clusters. The backing filename may be an absolute path or relative to the image file. + +Feature bits: +* QED_F_BACKING_FILE = 0x01. The image uses a backing file. +* QED_F_NEED_CHECK = 0x02. The image needs a consistency check before use. +* QED_F_BACKING_FORMAT_NO_PROBE = 0x04. The backing file is a raw disk image and no file format autodetection should be attempted. This should be used to ensure that raw backing files are never detected as an image format if they happen to contain magic constants. + +There are currently no defined ''compat_features'' or ''autoclear_features'' bits. + +Fields predicated on a feature bit are only used when that feature is set. The fields always take up header space, regardless of whether or not the feature bit is set.
+ +==Tables== + +Tables provide the translation from logical offsets in the block device to cluster offsets in the file. + + #define TABLE_NOFFSETS (table_size * cluster_size / sizeof(uint64_t)) + + Table { + uint64_t offsets[TABLE_NOFFSETS]; + } + +The tables are organized as follows: + + +----------+ + | L1 table | + +----------+ + ,------' | '------. + +----------+ | +----------+ + | L2 table | ... | L2 table | + +----------+ +----------+ + ,------' | '------. + +----------+ | +----------+ + | Data | ... | Data | + +----------+ +----------+ + +A table is made up of one or more contiguous clusters. The table_size header field determines table size for an image file. For example, cluster_size=64 KB and table_size=4 results in 256 KB tables. + +The logical image size must be less than or equal to the maximum possible size of clusters rooted by the L1 table: + header.image_size <= TABLE_NOFFSETS * TABLE_NOFFSETS * header.cluster_size + +L1, L2, and data cluster offsets must be aligned to header.cluster_size. The following offsets have special meanings: + +===L2 table offsets=== +* 0 - unallocated. The L2 table is not yet allocated. + +===Data cluster offsets=== +* 0 - unallocated. The data cluster is not yet allocated. +* 1 - zero. The data cluster contents are all zeroes and no cluster is allocated. + +Future format extensions may wish to store per-offset information. The least significant 12 bits of an offset are reserved for this purpose and must be set to zero. Image files with cluster_size > 2^12 will have more unused bits which should also be zeroed. + +===Unallocated L2 tables and data clusters=== +Reads to an unallocated area of the image file access the backing file. If there is no backing file, then zeroes are produced. The backing file may be smaller than the image file and reads of unallocated areas beyond the end of the backing file produce zeroes. 
+ +Writes to an unallocated area cause a new data cluster to be allocated, and a new L2 table if that is also unallocated. The new data cluster is populated with data from the backing file (or zeroes if no backing file) and the data being written. + +===Zero data clusters=== +Zero data clusters are a space-efficient way of storing zeroed regions of the image. + +Reads to a zero data cluster produce zeroes. Note that the difference between an unallocated and a zero data cluster is that zero data clusters stop the reading of contents from the backing file. + +Writes to a zero data cluster cause a new data cluster to be allocated. The new data cluster is populated with zeroes and the data being written. + +===Logical offset translation=== +Logical offsets are translated into cluster offsets as follows: + + table_bits table_bits cluster_bits + <--------> <--------> <---------------> + +----------+----------+-----------------+ + | L1 index | L2 index | byte offset | + +----------+----------+-----------------+ + + Structure of a logical offset + + offset_mask = ~(cluster_size - 1) # mask for the image file byte offset + + def logical_to_cluster_offset(l1_index, l2_index, byte_offset): + l2_offset = l1_table[l1_index] + l2_table = load_table(l2_offset) + cluster_offset = l2_table[l2_index] & offset_mask + return cluster_offset + byte_offset + +==Consistency checking== + +This section is informational and included to provide background on the use of the QED_F_NEED_CHECK ''features'' bit. + +The QED_F_NEED_CHECK bit is used to mark an image as dirty before starting an operation that could leave the image in an inconsistent state if interrupted by a crash or power failure. A dirty image must be checked on open because its metadata may not be consistent. + +Consistency check includes the following invariants: +# Each cluster is referenced once and only once. It is an inconsistency to have a cluster referenced more than once by L1 or L2 tables.
A cluster has been leaked if it has no references. +# Offsets must be within the image file size and must be ''cluster_size'' aligned. +# Table offsets must be at least ''table_size'' * ''cluster_size'' bytes from the end of the image file so that there is space for the entire table. + +The consistency check process starts from ''l1_table_offset'' and scans all L2 tables. After the check completes with no other errors besides leaks, the QED_F_NEED_CHECK bit can be cleared and the image can be accessed. diff --git a/src/docs/specs/rocker.txt b/src/docs/specs/rocker.txt new file mode 100644 index 0000000..d2a8262 --- /dev/null +++ b/src/docs/specs/rocker.txt @@ -0,0 +1,1014 @@ +Rocker Network Switch Register Programming Guide +Copyright (c) Scott Feldman <sfeldma@gmail.com> +Copyright (c) Neil Horman <nhorman@tuxdriver.com> +Version 0.11, 12/29/2014 + +LICENSE +======= + +This program is free software; you can redistribute it and/or modify +it under the terms of the GNU General Public License as published by +the Free Software Foundation; either version 2 of the License, or +(at your option) any later version. + +This program is distributed in the hope that it will be useful, +but WITHOUT ANY WARRANTY; without even the implied warranty of +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +GNU General Public License for more details. + +SECTION 1: Introduction +======================= + +Overview +-------- + +This document describes the hardware/software interface for the Rocker switch +device. The intended audience is authors of OS drivers and device emulation +software. + +Notations and Conventions +------------------------- + +o In register descriptions, [n:m] indicates a range from bit n to bit m, +inclusive. +o Use of leading 0x indicates a hexadecimal number. +o Use of leading 0b indicates a binary number. +o The use of RSVD or Reserved indicates that a bit or field is reserved for +future use. +o Field width is in bytes, unless otherwise noted.
+o Registers are (R) read-only, (R/W) read/write, (W) write-only, or (COR) clear +on read +o TLV values in network-byte-order are designated with (N). + + +SECTION 2: PCI Configuration Registers +====================================== + +PCI Configuration Space +----------------------- + +Each switch instance registers as a PCI device with PCI configuration space: + + offset width description value + --------------------------------------------- + 0x0 2 Vendor ID 0x1b36 + 0x2 2 Device ID 0x0006 + 0x4 4 Command/Status + 0x8 1 Revision ID 0x01 + 0x9 3 Class code 0x2800 + 0xC 1 Cache line size + 0xD 1 Latency timer + 0xE 1 Header type + 0xF 1 Built-in self test + 0x10 4 Base address low + 0x14 4 Base address high + 0x18-28 Reserved + 0x2C 2 Subsystem vendor ID * + 0x2E 2 Subsystem ID * + 0x30-38 Reserved + 0x3C 1 Interrupt line + 0x3D 1 Interrupt pin 0x00 + 0x3E 1 Min grant 0x00 + 0x3F 1 Max latency 0x00 + 0x40 1 TRDY timeout + 0x41 1 Retry count + 0x42 2 Reserved + + +* Assigned by sub-system implementation + +SECTION 3: Memory-Mapped Register Space +======================================= + +There are two memory-mapped BARs. BAR0 maps device register space and is +0x2000 in size. BAR1 maps MSI-X vector and PBA tables and is also 0x2000 in +size, allowing for 256 MSI-X vectors. + +All registers are 4 or 8 bytes long. It is assumed host software will access 4 +byte registers with one 4-byte access, and 8 byte registers with either two +4-byte accesses or a single 8-byte access. In the case of two 4-byte accesses, +access must be lower and then upper 4-bytes, in that order. + +BAR0 device register space is organized as follows: + + offset description + ------------------------------------------------------ + 0x0000-0x000f Bogus registers to catch misbehaving + drivers. Writes do nothing. Reads + back as 0xDEADBABE. + 0x0010-0x00ff Test registers + 0x0300-0x03ff General purpose registers + 0x1000-0x1fff Descriptor control + +Holes in register space are reserved.
Writes to reserved registers do nothing. +Reads to reserved registers read back as 0. + +No fancy stuff like write-combining is enabled on any of the registers. + +BAR1 MSI-X register space is organized as follows: + + offset description + ------------------------------------------------------ + 0x0000-0x0fff MSI-X vector table (256 vectors total) + 0x1000-0x1fff MSI-X PBA table + + +SECTION 4: Interrupts, DMA, and Endianness +========================================== + +PCI Interrupts +-------------- + +The device supports only MSI-X interrupts. BAR1 memory-mapped region contains +the MSI-X vector and PBA tables, with support for up to 256 MSI-X vectors. + +The vector assignment is: + + vector description + ----------------------------------------------------- + 0 Command descriptor ring completion + 1 Event descriptor ring completion + 2 Test operation completion + 3 RSVD + 4-255 Tx and Rx descriptor ring completion + Tx vector is even + Rx vector is odd + +A MSI-X vector table entry is 16 bytes: + + field offset width description + ------------------------------------------------------------- + lower_addr 0x0 4 [31:2] message address[31:2] + [1:0] Rsvd (4 byte alignment + required) + upper_addr 0x4 4 [31:19] Rsvd + [14:0] message address[46:32] + data 0x8 4 message data[31:0] + control 0xc 4 [31:1] Rsvd + [0] mask (0 = enable, + 1 = masked) + +Software should install the Interrupt Service Routine (ISR) before any ports +are enabled or any commands are issued on the command ring. + +DMA Operations +-------------- + +DMA operations are used for packet DMA to/from the CPU, command and event +processing. Command processing includes statistical counters and table dumps, +table insertion/deletion, and more. Event processing provides an async +notification method for device-originating events. Each DMA operation has a +set of control registers to manage a descriptor ring. 
The descriptor rings are +allocated from contiguous host DMA-able memory and registers specify the rings +base address, size and current head and tail indices. Software always writes +the head, and hardware always writes the tail. + +The higher-order bit of DMA_DESC_COMP_ERR is used to mark hardware completion +of a descriptor. Software will clear this bit when posting a descriptor to the +ring, and hardware will set this bit when the descriptor is complete. + +Descriptor ring sizes must be a power of 2 and range from 2 to 64K entries. +Descriptor rings' base address must be 8-byte aligned. Descriptors must be +packed within ring. Each descriptor in each ring must also be aligned on an 8 +byte boundary. Each descriptor ring will have these registers: + + DMA_DESC_xxx_BASE_ADDR, offset 0x1000 + (x * 32), 64-bit, (R/W) + DMA_DESC_xxx_SIZE, offset 0x1008 + (x * 32), 32-bit, (R/W) + DMA_DESC_xxx_HEAD, offset 0x100c + (x * 32), 32-bit, (R/W) + DMA_DESC_xxx_TAIL, offset 0x1010 + (x * 32), 32-bit, (R) + DMA_DESC_xxx_CTRL, offset 0x1014 + (x * 32), 32-bit, (W) + DMA_DESC_xxx_CREDITS, offset 0x1018 + (x * 32), 32-bit, (R/W) + DMA_DESC_xxx_RSVD1, offset 0x101c + (x * 32), 32-bit, (R/W) + +Where x is descriptor ring index: + + index ring + -------------------- + 0 CMD + 1 EVENT + 2 TX (port 0) + 3 RX (port 0) + 4 TX (port 1) + 5 RX (port 1) + . + . + . + 124 TX (port 61) + 125 RX (port 61) + 126 Resv + 127 Resv + +Writing BASE_ADDR or SIZE will reset HEAD and TAIL to zero. HEAD cannot be +written past TAIL. To do so would wrap the ring. An empty ring is when HEAD +== TAIL. A full ring is when HEAD is one position behind TAIL. Both HEAD and +TAIL increment and modulo wrap at the ring size. 
+ +CTRL register bits: + + bit name description + ------------------------------------------------------------------------ + [0] CTRL_RESET Reset the descriptor ring + [1:31] Reserved + +All descriptor types share some common fields: + + field width description + ------------------------------------------------------------------- + DMA_DESC_BUF_ADDR 8 Phys addr of desc payload, 8-byte + aligned + DMA_DESC_COOKIE 8 Desc cookie for completion matching, + upper-most bit is reserved + DMA_DESC_BUF_SIZE 2 Desc payload size in bytes + DMA_DESC_TLV_SIZE 2 Desc payload total size in bytes + used for TLVs. Must be <= + DMA_DESC_BUF_SIZE. + DMA_DESC_COMP_ERR 2 Completion status of associated + desc payload. High order bit is + clear on new descs, toggled by + hw for completed items. + +To support forward- and backward-compatibility, descriptor and completion +payloads are specified in TLV format. Fields are packed with Type=field name, +Length=field length, and Value=field value. Software will ignore unknown fields +filled in by the switch. Likewise, the switch will ignore unknown fields +filled in by software. + +Descriptor payload buffer is 8-byte aligned and TLVs are 8-byte aligned. The +value within a TLV is also 8-byte aligned. The (packed, 8 byte) TLV header is: + + field width description + ----------------------------- + type 4 TLV type + len 2 TLV value length + pad 2 Reserved + +The alignment requirements for descriptors and TLVs are to avoid unaligned +access exceptions in software. Note that the payload for each TLV is also +8 byte aligned. + +Figure 1 shows an example descriptor buffer with two TLVs. 
+ + <------- 8 bytes -------> + + 8-byte +––––+ +–––––––––––+–––––+–––––+ +–+ + align | type | len | pad | TLV#1 hdr | + +–––––––––––+–––––+–––––+ (len=22) | + | | | + | value | TLV#1 value | + | | (padded to 8-byte | + | +–––––+ alignment) | + | |/////| | + 8-byte +––––+ +–––––––––––+–––––––––––+ | + align | type | len | pad | TLV#2 hdr DESC_BUF_SIZE + +–––––+–––––+–––––+–––––+ (len=2) | + |value|/////////////////| TLV#2 value | + +–––––+/////////////////| | + |///////////////////////| | + |///////////////////////| | + |///////////////////////| | + |////////unused/////////| | + |////////space//////////| | + |///////////////////////| | + |///////////////////////| | + |///////////////////////| | + +–––––––––––––––––––––––+ +–+ + + fig. 1 + +TLVs can be nested within the NEST TLV type. + +Interrupt credits +^^^^^^^^^^^^^^^^^ + +MSI-X vectors used for descriptor ring completions use a credit mechanism for +efficient device, PCIe bus, OS and driver operations. Each descriptor ring has +a credit count which represents the number of outstanding descriptors to be +processed by the driver. As the device marks descriptors complete, the credit +count is incremented. As the driver processes those outstanding descriptors, +it returns credits back to the device. This way, the device knows the driver's +progress and can make decisions about when to fire the next interrupt or not. +When the credit count is zero, and the first descriptors are posted for the +driver, a single interrupt is fired. Once the interrupt is fired, the +interrupt is disabled (auto-masked*). In response to the interrupt, the driver +will process descriptors and PIO write a returned credit value for that +descriptor ring. If the driver returns all credits (the driver caught up with +the device and there is no outstanding work), then the interrupt is unmasked, +but not fired.
If only partial credits are returned, the interrupt remains +masked but the device generates an interrupt, signaling the driver that more +outstanding work is available. + +(* this masking is unrelated to the MSI-X interrupt mask register) + +Endianness +---------- + +Device registers are hard-coded to little-endian (LE). The driver should +convert to/from host endianness to LE for device register accesses. + +Descriptors are LE. Descriptor buffer TLVs will have LE type and length +fields, but the value field can either be LE or network-byte-order, depending +on context. TLV values containing network packet data will be in network-byte +order. A TLV value containing a field or mask used to compare against network +packet data is network-byte order. For example, flow match fields (and masks) +are network-byte-order since they're matched directly, byte-by-byte, against +network packet data. All non-network-packet TLV multi-byte values will be LE. + +TLV values in network-byte-order are designated with (N). + + +SECTION 5: Test Registers +========================= + +Rocker has several test registers to support troubleshooting register access, +interrupt generation, and DMA operations: + + TEST_REG, offset 0x0010, 32-bit (R/W) + TEST_REG64, offset 0x0018, 64-bit (R/W) + TEST_IRQ, offset 0x0020, 32-bit (R/W) + TEST_DMA_ADDR, offset 0x0028, 64-bit (R/W) + TEST_DMA_SIZE, offset 0x0030, 32-bit (R/W) + TEST_DMA_CTRL, offset 0x0034, 32-bit (R/W) + +Reads to TEST_REG and TEST_REG64 will read a value equal to twice the last +value written to the register. The 32-bit and 64-bit versions are for testing +32-bit and 64-bit host accesses. + +A vector can be written to TEST_IRQ and the device will generate an interrupt +for that vector. + +To test basic DMA operations, allocate a DMA-able host buffer and put the +buffer address into TEST_DMA_ADDR and size into TEST_DMA_SIZE. Then, write to +TEST_DMA_CTRL to manipulate the buffer contents.
TEST_DMA_CTRL operations are: + + operation value description + ----------------------------------------------------------- + TEST_DMA_CTRL_CLEAR 1 clear buffer + TEST_DMA_CTRL_FILL 2 fill buffer bytes with 0x96 + TEST_DMA_CTRL_INVERT 4 invert bytes in buffer + +Various buffer address and sizes should be tested to verify no address boundary +issue exists. In particular, buffers that start on odd-8-byte boundary and/or +span multiple PAGE sizes should be tested. + + +SECTION 6: Ports +================ + +Physical and Logical Ports +------------------------------------ + +The switch supports up to 62 physical (front-panel) ports. Register +PORT_PHYS_COUNT returns the actual number of physical ports available: + + PORT_PHYS_COUNT, offset 0x0304, 32-bit, (R) + +In addition to front-panel ports, the switch supports logical ports for +tunnels. + +Front-panel ports and logical tunnel ports are mapped into a single 32-bit port +space. A special CPU port is assigned port 0. The front-panel ports are +mapped to ports 1-62. A special loopback port is assigned port 63. Logical +tunnel ports are assigned ports 0x00010000-0x0001ffff. +To summarize the port assignments: + + port mapping + ------------------------------------------------------- + 0 CPU port (for packets to/from host CPU) + 1-62 front-panel physical ports + 63 loopback port + 64-0x0000ffff RSVD + 0x00010000-0x0001ffff logical tunnel ports + 0x00020000-0xffffffff RSVD + +Physical Port Mode +------------------ + +Switch front-panel ports operate in a mode. Currently, the only mode is +OF-DPA. OF-DPA[1] mode is based on OpenFlow Data Plane Abstraction (OF-DPA) +Abstract Switch Specification, Version 1.0, from Broadcom Corporation. To +set/get the mode for front-panel ports, see port settings, below. + +Port Settings +------------- + +Link status for all front-panel ports is available via PORT_PHYS_LINK_STATUS: + + PORT_PHYS_LINK_STATUS, offset 0x0310, 64-bit, (R) + + Value is port bitmap. Bits 0 and 63 always read 0.
Bits 1-62 + read 1 for link UP and 0 for link DOWN for respective front-panel ports. + +Other properties for front-panel ports are available via DMA CMD descriptors: + + Get PORT_SETTINGS descriptor: + + field width description + ---------------------------------------------- + PORT_SETTINGS 2 CMD_GET + PPORT 4 Physical port # + + Get PORT_SETTINGS completion: + + field width description + ---------------------------------------------- + PPORT 4 Physical port # + SPEED 4 Current port interface speed, in Mbps + DUPLEX 1 1 = Full, 0 = Half + AUTONEG 1 1 = enabled, 0 = disabled + MACADDR 6 Port MAC address + MODE 1 0 = OF-DPA + LEARNING 1 MAC address learning on port + 1 = enabled + 0 = disabled + PHYS_NAME <var> Physical port name (string) + + Set PORT_SETTINGS descriptor: + + field width description + ---------------------------------------------- + PORT_SETTINGS 2 CMD_SET + PPORT 4 Physical port # + SPEED 4 Port interface speed, in Mbps + DUPLEX 1 1 = Full, 0 = Half + AUTONEG 1 1 = enabled, 0 = disabled + MACADDR 6 Port MAC address + MODE 1 0 = OF-DPA + +Port Enable +----------- + +Front-panel ports are initially disabled, which means port ingress and egress +packets will be dropped. To enable or disable a port, use PORT_PHYS_ENABLE: + + PORT_PHYS_ENABLE: offset 0x0318, 64-bit, (R/W) + + Value is bitmap of first 64 ports. Bits 0 and 63 are ignored + and always read as 0. Write 1 to enable port; write 0 to disable it. + Default is 0. + + +SECTION 7: Switch Control +========================= + +This section covers switch-wide register settings. + +Control +------- + +This register is used for low level control of the switch. 
+ + CONTROL: offset 0x0300, 32-bit, (W) + + bit name description + ------------------------------------------------------------------------ + [0] CONTROL_RESET If set, device will perform reset + [1:31] Reserved + +Switch ID +--------- + +The switch has a SWITCH_ID to be used by software to uniquely identify the +switch: + + SWITCH_ID: offset 0x0320, 64-bit, (R) + + Value is opaque to switch software and no special encoding is implied. + + +SECTION 8: Events +================= + +Non-I/O asynchronous events from the device are notified to the host using the +event ring. The TLV structure for events is: + + field width description + --------------------------------------------------- + TYPE 4 Event type, one of: + 1: LINK_CHANGED + 2: MAC_VLAN_SEEN + INFO <nest> Event info (details below) + +Link Changed Event +------------------ + +When link status changes on a physical port, this event is generated. + + field width description + --------------------------------------------------- + INFO <nest> + PPORT 4 Physical port + LINKUP 1 Link status: + 0: down + 1: up + +MAC VLAN Seen Event +------------------- + +When a packet ingresses on a port and the source MAC/VLAN isn't known to the +device, the device will generate this event. In response to the event, the +driver should install to the device the MAC/VLAN on the port into the bridge +table. Once installed, the MAC/VLAN is known on the port and this event will +no longer be generated. + + field width description + --------------------------------------------------- + INFO <nest> + PPORT 4 Physical port + MAC 6 MAC address + VLAN 2 VLAN ID + + +SECTION 9: CPU Packet Processing +================================ + +Ingress packets directed to the host CPU for further processing are delivered +in the DMA RX ring. Likewise, host CPU originating packets destined to egress +on switch ports are scheduled by software using the DMA TX ring. 
+ +Tx Packet Processing +-------------------- + +Software schedules packets for egress on switch ports using the DMA TX ring. A +TX descriptor buffer describes the packet location and size in host DMA-able +memory, the destination port, and any hardware-offload functions (such as L3 +payload checksum offload). Software then bumps the descriptor head to signal +hardware of new Tx work. In response, hardware will DMA read Tx descriptors up +to head, DMA read descriptor buffer and packet data, perform offloading +functions, and finally frame packet on wire (network). Once packet processing +is complete, hardware will writeback status to descriptor(s) to signal to +software that Tx is complete and software resources (e.g. skb) backing packet +can be released. + +Figure 2 shows an example 3-fragment packet queued with one Tx descriptor. A +TLV is used for each packet fragment. + + pkt frag 1 + +–––––––+ +–+ + +–––+ | | + desc buf | | | | + +––––––––+ | | | | + Tx ring +–––+ +–––––+ | | | + +–––––––––+ | | TLVs | +–––––––+ | + | +–––+ +––––––––+ pkt frag 2 | + | desc 0 | | +–––––+ +–––––––+ | + +–––––––––+ | TLVs | +–––+ | | + head+–+ | +––––––––+ | | | + | desc 1 | | +–––––+ +–––––––+ |pkt + +–––––––––+ | TLVs | | | + | | +––––––––+ | pkt frag 3 | + | | | +–––––––+ | + +–––––––––+ +–––+ | | + | | | | | + | | | | | + +–––––––––+ | | | + | | | | | + | | | | | + +–––––––––+ | | | + | | +–––––––+ +–+ + | | + +–––––––––+ + + fig 2. + +The TLVs for Tx descriptor buffer are: + + field width description + --------------------------------------------------------------------- + PPORT 4 Destination physical port # + TX_OFFLOAD 1 Hardware offload modes: + 0: no offload + 1: insert IP csum (ipv4 only) + 2: insert TCP/UDP csum + 3: L3 csum calc and insert + into csum offset (TX_L3_CSUM_OFF) + 16-bit 1's complement csum value. + IPv4 pseudo-header and IP + already calculated by OS + and inserted. 
+ 4: TSO (TCP Segmentation Offload) + TX_L3_CSUM_OFF 2 For L3 csum offload mode, the offset, + from the beginning of the packet, + of the csum field in the L3 header + TX_TSO_MSS 2 For TSO offload mode, the + Maximum Segment Size in bytes + TX_TSO_HDR_LEN 2 For TSO offload mode, the + length of ethernet, IP, and + TCP/UDP headers, including IP + and TCP options. + TX_FRAGS <array> Packet fragments + TX_FRAG <nest> Packet fragment + TX_FRAG_ADDR 8 DMA address of packet fragment + TX_FRAG_LEN 2 Packet fragment length + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR reason + -------------------------------------------------------------------- + 0 OK + -ROCKER_ENXIO address or data read err on desc buf or packet + fragment + -ROCKER_EINVAL bad pport or TSO or csum offloading error + -ROCKER_ENOMEM no memory for internal staging tx fragment + +Rx Packet Processing +-------------------- + +Packets ingressing on switch ports that are not forwarded by the switch but +rather directed to the host CPU for further processing are delivered in the DMA +RX ring. Rx descriptor buffers are allocated by software and placed on the +ring. Hardware will fill Rx descriptor buffers with packet data, write the +completion, and signal to software that a new packet is ready. Since Rx packet +size is not known a-priori, the Rx descriptor buffer must be allocated for +worst-case packet size. A single Rx descriptor will contain the entire Rx +packet data in one RX_FRAG. Other Rx TLVs describe any hardware offloads +performed on the packet, such as checksum validation.
+ +The TLVs for Rx descriptor buffer are: + + field width description + --------------------------------------------------- + PPORT 4 Source physical port # + RX_FLAGS 2 Packet parsing flags: + (1 << 0): IPv4 packet + (1 << 1): IPv6 packet + (1 << 2): csum calculated + (1 << 3): IPv4 csum good + (1 << 4): IP fragment + (1 << 5): TCP packet + (1 << 6): UDP packet + (1 << 7): TCP/UDP csum good + (1 << 8): Offload forward + RX_CSUM 2 IP calculated checksum: + IPv4: IP payload csum + IPv6: header and payload csum + (Only valid if RX_FLAGS:csum calc is set) + RX_FRAG_ADDR 8 DMA address of packet fragment + RX_FRAG_MAX_LEN 2 Packet maximum fragment length + RX_FRAG_LEN 2 Actual packet fragment length after receive + +Offload forward RX_FLAG indicates the device has already forwarded the packet +so the host CPU should not also forward the packet. + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR reason + -------------------------------------------------------------------- + 0 OK + -ROCKER_ENXIO address or data read err on desc buf + -ROCKER_ENOMEM no memory for internal staging desc buf + -ROCKER_EMSGSIZE Rx descriptor buffer wasn't big enough to contain + packet data TLV and other TLVs. + + +SECTION 10: OF-DPA Mode +====================== + +OF-DPA mode allows the switch to offload flow packet processing functions to +hardware. An OpenFlow controller would communicate with an OpenFlow agent +installed on the switch. The OpenFlow agent would (directly or indirectly) +communicate with the Rocker switch driver, which in turn would program switch +hardware with flow functionality, as defined in OF-DPA.
The block diagram is: + + +–––––––––––––––----–––+ + | OF | + | Remote Controller | + +––––––––+––----–––––––+ + | + | + +––––––––+–––––––––+ + | OF | + | Local Agent | + +––––––––––––––––––+ + | | + | Rocker Driver | + +––––––––––––––––––+ + <this spec> + +––––––––––––––––––+ + | | + | Rocker Switch | + +––––––––––––––––––+ + +To participate in flow functions, ports must be configured for OF-DPA mode +during switch initialization. + +OF-DPA Flow Table Interface +--------------------------- + +There are commands to add, modify, delete, and get stats of flow table entries. +The commands are issued using the DMA CMD descriptor ring. The following +commands are defined: + + CMD_ADD: add an entry to flow table + CMD_MOD: modify an entry in flow table + CMD_DEL: delete an entry from flow table + CMD_GET_STATS: get stats for flow entry + +TLVs for add and modify commands are: + + field width description + ---------------------------------------------------- + OF_DPA_CMD 2 CMD_[ADD|MOD] + OF_DPA_TBL 2 Flow table ID + 0: ingress port + 10: vlan + 20: termination mac + 30: unicast routing + 40: multicast routing + 50: bridging + 60: ACL policy + OF_DPA_PRIORITY 4 Flow priority + OF_DPA_HARDTIME 4 Hard timeout for flow + OF_DPA_IDLETIME 4 Idle timeout for flow + OF_DPA_COOKIE 8 Cookie + +Additional TLVs based on flow table ID: + +Table ID 0: ingress port + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + +Table ID 10: vlan + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_VLAN_ID_MASK 2 (N) vlan ID mask + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_NEW_VLAN_ID 2 (N) new vlan ID + +Table ID 20: termination mac + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4
ingress physical port number + OF_DPA_IN_PPORT_MASK 4 ingress physical port number mask + OF_DPA_ETHERTYPE 2 (N) must be either 0x0800 or 0x86dd + OF_DPA_DST_MAC 6 (N) destination MAC + OF_DPA_DST_MAC_MASK 6 (N) destination MAC mask + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_VLAN_ID_MASK 2 (N) vlan ID mask + OF_DPA_GOTO_TBL 2 only acceptable values are + unicast or multicast routing + table IDs + OF_DPA_OUT_PPORT 2 if specified, must be + controller, set zero otherwise + +Table ID 30: unicast routing + + field width description + ---------------------------------------------------- + OF_DPA_ETHERTYPE 2 (N) must be either 0x0800 or 0x86dd + OF_DPA_DST_IP 4 (N) destination IPv4 address. + Must be unicast address + OF_DPA_DST_IP_MASK 4 (N) IP mask. Must be prefix mask + OF_DPA_DST_IPV6 16 (N) destination IPv6 address. + Must be unicast address + OF_DPA_DST_IPV6_MASK 16 (N) IPv6 mask. Must be prefix mask + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_GROUP_ID 4 data for GROUP action must + be an L3 Unicast group entry + +Table ID 40: multicast routing + + field width description + ---------------------------------------------------- + OF_DPA_ETHERTYPE 2 (N) must be either 0x0800 or 0x86dd + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_SRC_IP 4 (N) source IPv4. Optional, + can contain IPv4 address, + must be completely masked + if not used + OF_DPA_SRC_IP_MASK 4 (N) IP Mask + OF_DPA_DST_IP 4 (N) destination IPv4 address. + Must be multicast address + OF_DPA_SRC_IPV6 16 (N) source IPv6 Address. Optional. + Can contain IPv6 address, + must be completely masked + if not used + OF_DPA_SRC_IPV6_MASK 16 (N) IPv6 mask. + OF_DPA_DST_IPV6 16 (N) destination IPv6 Address. 
Must + be multicast address + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_GROUP_ID 4 data for GROUP action must + be an L3 multicast group entry + +Table ID 50: bridging + + field width description + ---------------------------------------------------- + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_TUNNEL_ID 4 tunnel ID + OF_DPA_DST_MAC 6 (N) destination MAC + OF_DPA_DST_MAC_MASK 6 (N) destination MAC mask + OF_DPA_GOTO_TBL 2 goto table ID; zero to drop + OF_DPA_GROUP_ID 4 data for GROUP action must + be a L2 Interface, L2 + Multicast, L2 Flood, + or L2 Overlay group entry + as appropriate + OF_DPA_TUNNEL_LPORT 4 unicast Tenant Bridging + flows specify a tunnel + logical port ID + OF_DPA_OUT_PPORT 2 data for OUTPUT action, + restricted to CONTROLLER, + set to 0 otherwise + +Table ID 60: acl policy + + field width description + ---------------------------------------------------- + OF_DPA_IN_PPORT 4 ingress physical port number + OF_DPA_IN_PPORT_MASK 4 ingress physical port number mask + OF_DPA_ETHERTYPE 2 (N) ethertype + OF_DPA_VLAN_ID 2 (N) vlan ID + OF_DPA_VLAN_ID_MASK 2 (N) vlan ID mask + OF_DPA_VLAN_PCP 2 (N) vlan Priority Code Point + OF_DPA_VLAN_PCP_MASK 2 (N) vlan Priority Code Point mask + OF_DPA_SRC_MAC 6 (N) source MAC + OF_DPA_SRC_MAC_MASK 6 (N) source MAC mask + OF_DPA_DST_MAC 6 (N) destination MAC + OF_DPA_DST_MAC_MASK 6 (N) destination MAC mask + OF_DPA_TUNNEL_ID 4 tunnel ID + OF_DPA_SRC_IP 4 (N) source IPv4. Optional, + can contain IPv4 address, + must be completely masked + if not used + OF_DPA_SRC_IP_MASK 4 (N) IP Mask + OF_DPA_DST_IP 4 (N) destination IPv4 address. + Must be multicast address + OF_DPA_DST_IP_MASK 4 (N) IP Mask + OF_DPA_SRC_IPV6 16 (N) source IPv6 Address. Optional. + Can contain IPv6 address, + must be completely masked + if not used + OF_DPA_SRC_IPV6_MASK 16 (N) IPv6 mask + OF_DPA_DST_IPV6 16 (N) destination IPv6 Address. Must + be multicast address.
+ OF_DPA_DST_IPV6_MASK 16 (N) IPv6 mask + OF_DPA_SRC_ARP_IP 4 (N) source IPv4 address in the ARP + payload. Only used if ethertype + == 0x0806. + OF_DPA_SRC_ARP_IP_MASK 4 (N) IP Mask + OF_DPA_IP_PROTO 1 IP protocol + OF_DPA_IP_PROTO_MASK 1 IP protocol mask + OF_DPA_IP_DSCP 1 DSCP + OF_DPA_IP_DSCP_MASK 1 DSCP mask + OF_DPA_IP_ECN 1 ECN + OF_DPA_IP_ECN_MASK 1 ECN mask + OF_DPA_L4_SRC_PORT 2 (N) L4 source port, only for + TCP, UDP, or SCTP + OF_DPA_L4_SRC_PORT_MASK 2 (N) L4 source port mask + OF_DPA_L4_DST_PORT 2 (N) L4 destination port, only for + TCP, UDP, or SCTP + OF_DPA_L4_DST_PORT_MASK 2 (N) L4 destination port mask + OF_DPA_ICMP_TYPE 1 ICMP type, only if IP + protocol is 1 + OF_DPA_ICMP_TYPE_MASK 1 ICMP type mask + OF_DPA_ICMP_CODE 1 ICMP code + OF_DPA_ICMP_CODE_MASK 1 ICMP code mask + OF_DPA_IPV6_LABEL 4 (N) IPv6 flow label + OF_DPA_IPV6_LABEL_MASK 4 (N) IPv6 flow label mask + OF_DPA_GROUP_ID 4 data for GROUP action + OF_DPA_QUEUE_ID_ACTION 1 write the queue ID + OF_DPA_NEW_QUEUE_ID 1 queue ID + OF_DPA_VLAN_PCP_ACTION 1 write the VLAN priority + OF_DPA_NEW_VLAN_PCP 1 VLAN priority + OF_DPA_IP_DSCP_ACTION 1 write the DSCP + OF_DPA_NEW_IP_DSCP 1 new DSCP + OF_DPA_TUNNEL_LPORT 4 restrict to valid tunnel + logical port, set to 0 + otherwise. 
+ OF_DPA_OUT_PPORT 2 data for OUTPUT action, + restricted to CONTROLLER, + set to 0 otherwise + OF_DPA_CLEAR_ACTIONS 4 if 1 packets matching flow are + dropped (all other instructions + ignored) + +TLVs for flow delete and get stats command are: + + field width description + --------------------------------------------------- + OF_DPA_CMD 2 CMD_[DEL|GET_STATS] + OF_DPA_COOKIE 8 Cookie + +On completion of get stats command, the descriptor buffer is written back with +the following TLVs: + + field width description + --------------------------------------------------- + OF_DPA_STAT_DURATION 4 Flow duration + OF_DPA_STAT_RX_PKTS 8 Received packets + OF_DPA_STAT_TX_PKTS 8 Transmit packets + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR command reason + -------------------------------------------------------------------- + 0 all OK + -ROCKER_EFAULT all head or tail index outside + of ring + -ROCKER_ENXIO all address or data read err on + desc buf + -ROCKER_EMSGSIZE GET_STATS cmd descriptor buffer wasn't + big enough to contain write-back + TLVs + -ROCKER_EINVAL all invalid parameters passed in + -ROCKER_EEXIST ADD entry already exists + -ROCKER_ENOSPC ADD no space left in flow table + -ROCKER_ENOENT MOD|DEL|GET_STATS cookie invalid + +Group Table Interface +--------------------- + +There are commands to add, modify, delete, and get stats of group table +entries. The commands are issued using the DMA CMD descriptor ring. 
The +following commands are defined: + + CMD_ADD: add an entry to group table + CMD_MOD: modify an entry in group table + CMD_DEL: delete an entry from group table + CMD_GET_STATS: get stats for group entry + +TLVs for add and modify commands are: + + field width description + ----------------------------------------------------------- + FLOW_GROUP_CMD 2 CMD_[ADD|MOD] + FLOW_GROUP_ID 2 Flow group ID + FLOW_GROUP_TYPE 1 Group type: + 0: L2 interface + 1: L2 rewrite + 2: L3 unicast + 3: L2 multicast + 4: L2 flood + 5: L3 interface + 6: L3 multicast + 7: L3 ECMP + 8: L2 overlay + FLOW_VLAN_ID 2 Vlan ID (types 0, 3, 4, 6) + FLOW_L2_PORT 2 Port (types 0) + FLOW_INDEX 4 Index (all types but 0) + FLOW_OVERLAY_TYPE 1 Overlay sub-type (type 8): + 0: Flood unicast tunnel + 1: Flood multicast tunnel + 2: Multicast unicast tunnel + 3: Multicast multicast tunnel + FLOW_GROUP_ACTION nest + FLOW_GROUP_ID 2 next group ID in chain (all + types except 0) + FLOW_OUT_PORT 4 egress port (types 0, 8) + FLOW_POP_VLAN_TAG 1 strip outer VLAN tag (type 1 + only) + FLOW_VLAN_ID 2 (types 1, 5) + FLOW_SRC_MAC 6 (types 1, 2, 5) + FLOW_DST_MAC 6 (types 1, 2) + +TLVs for flow delete and get stats command are: + + field width description + ----------------------------------------------------------- + FLOW_GROUP_CMD 2 CMD_[DEL|GET_STATS] + FLOW_GROUP_ID 2 Flow group ID + +On completion of get stats command, the descriptor buffer is written back with +the following TLVs: + + field width description + --------------------------------------------------- + FLOW_GROUP_ID 2 Flow group ID + FLOW_STAT_DURATION 4 Flow duration + FLOW_STAT_REF_COUNT 4 Flow reference count + FLOW_STAT_BUCKET_COUNT 4 Flow bucket count + +Possible status return codes in descriptor on completion are: + + DESC_COMP_ERR command reason + -------------------------------------------------------------------- + 0 all OK + -ROCKER_EFAULT all head or tail index outside + of ring + -ROCKER_ENXIO all address or data read err on + desc buf 
+ -ROCKER_ENOSPC GET_STATS cmd descriptor buffer wasn't + big enough to contain write-back + TLVs + -ROCKER_EINVAL ADD|MOD invalid parameters passed in + -ROCKER_EEXIST ADD entry already exists + -ROCKER_ENOSPC ADD no space left in group table + -ROCKER_ENOENT MOD|DEL|GET_STATS group ID invalid + -ROCKER_EBUSY DEL group reference count non-zero + -ROCKER_ENODEV ADD next group ID doesn't exist + + + +References +========== + +[1] OpenFlow Data Plane Abstraction (OF-DPA) Abstract Switch Specification, +Version 1.0, from Broadcom Corporation, February 21, 2014. diff --git a/src/docs/specs/standard-vga.txt b/src/docs/specs/standard-vga.txt new file mode 100644 index 0000000..19d2a74 --- /dev/null +++ b/src/docs/specs/standard-vga.txt @@ -0,0 +1,81 @@ + +QEMU Standard VGA +================= + +Exists in two variants, for isa and pci. + +command line switches: + -vga std [ picks isa for -M isapc, otherwise pci ] + -device VGA [ pci variant ] + -device isa-vga [ isa variant ] + -device secondary-vga [ legacy-free pci variant ] + + +PCI spec +-------- + +Applies to the pci variant only for obvious reasons. + +PCI ID: 1234:1111 + +PCI Region 0: + Framebuffer memory, 16 MB in size (by default). + Size is tunable via vga_mem_mb property. + +PCI Region 1: + Reserved (so we have the option to make the framebuffer bar 64bit). + +PCI Region 2: + MMIO bar, 4096 bytes in size (qemu 1.3+) + +PCI ROM Region: + Holds the vgabios (qemu 0.14+). + + +The legacy-free variant has no ROM and has PCI_CLASS_DISPLAY_OTHER +instead of PCI_CLASS_DISPLAY_VGA. + + +IO ports used +------------- + +Doesn't apply to the legacy-free pci variant, use the MMIO bar instead. + +03c0 - 03df : standard vga ports +01ce : bochs vbe interface index port +01cf : bochs vbe interface data port (x86 only) +01d0 : bochs vbe interface data port + + +Memory regions used +------------------- + +0xe0000000 : Framebuffer memory, isa variant only. 
+ +The pci variant used to mirror the framebuffer bar here, qemu 0.14+ +stops doing that (except when in -M pc-$old compat mode). + + +MMIO area spec +-------------- + +Likewise applies to the pci variant only for obvious reasons. + +0000 - 03ff : reserved, for possible virtio extension. +0400 - 041f : vga ioports (0x3c0 -> 0x3df), remapped 1:1. + word access is supported, bytes are written + in little endian order (aka index port first), + so indexed registers can be updated with a + single mmio write (and thus only one vmexit). +0500 - 0515 : bochs dispi interface registers, mapped flat + without index/data ports. Use (index << 1) + as offset for (16bit) register access. + +0600 - 0607 : qemu extended registers. qemu 2.2+ only. + The pci revision is 2 (or greater) when + these registers are present. The registers + are 32bit. + 0600 : qemu extended register region size, in bytes. + 0604 : framebuffer endianness register. + - 0xbebebebe indicates big endian. + - 0x1e1e1e1e indicates little endian. diff --git a/src/docs/specs/vhost-user.txt b/src/docs/specs/vhost-user.txt new file mode 100644 index 0000000..0312d40 --- /dev/null +++ b/src/docs/specs/vhost-user.txt @@ -0,0 +1,466 @@ +Vhost-user Protocol +=================== + +Copyright (c) 2014 Virtual Open Systems Sarl. + +This work is licensed under the terms of the GNU GPL, version 2 or later. +See the COPYING file in the top-level directory. +=================== + +This protocol is aiming to complement the ioctl interface used to control the +vhost implementation in the Linux kernel. It implements the control plane needed +to establish virtqueue sharing with a user space process on the same host. It +uses communication over a Unix domain socket to share file descriptors in the +ancillary data of the message. + +The protocol defines 2 sides of the communication, master and slave. Master is +the application that shares its virtqueues, in our case QEMU. Slave is the +consumer of the virtqueues. 
+ +In the current implementation QEMU is the Master, and the Slave is intended to +be a software Ethernet switch running in user space, such as Snabbswitch. + +Master and slave can be either a client (i.e. connecting) or server (listening) +in the socket communication. + +Message Specification +--------------------- + +Note that all numbers are in the machine native byte order. A vhost-user message +consists of 3 header fields and a payload: + +------------------------------------ +| request | flags | size | payload | +------------------------------------ + + * Request: 32-bit type of the request + * Flags: 32-bit bit field: + - Lower 2 bits are the version (currently 0x01) + - Bit 2 is the reply flag - needs to be sent on each reply from the slave + * Size - 32-bit size of the payload + + +Depending on the request type, payload can be: + + * A single 64-bit integer + ------- + | u64 | + ------- + + u64: a 64-bit unsigned integer + + * A vring state description + --------------- + | index | num | + --------------- + + Index: a 32-bit index + Num: a 32-bit number + + * A vring address description + -------------------------------------------------------------- + | index | flags | size | descriptor | used | available | log | + -------------------------------------------------------------- + + Index: a 32-bit vring index + Flags: a 32-bit vring flags + Descriptor: a 64-bit user address of the vring descriptor table + Used: a 64-bit user address of the vring used ring + Available: a 64-bit user address of the vring available ring + Log: a 64-bit guest address for logging + + * Memory regions description + --------------------------------------------------- + | num regions | padding | region0 | ... 
| region7 | + --------------------------------------------------- + + Num regions: a 32-bit number of regions + Padding: 32-bit + + A region is: + ----------------------------------------------------- + | guest address | size | user address | mmap offset | + ----------------------------------------------------- + + Guest address: a 64-bit guest address of the region + Size: a 64-bit size + User address: a 64-bit user address + mmap offset: 64-bit offset where region starts in the mapped memory + +* Log description + --------------------------- + | log size | log offset | + --------------------------- + log size: size of area used for logging + log offset: offset from start of supplied file descriptor + where logging starts (i.e. where guest address 0 would be logged) + +In QEMU the vhost-user message is implemented with the following struct: + +typedef struct VhostUserMsg { + VhostUserRequest request; + uint32_t flags; + uint32_t size; + union { + uint64_t u64; + struct vhost_vring_state state; + struct vhost_vring_addr addr; + VhostUserMemory memory; + VhostUserLog log; + }; +} QEMU_PACKED VhostUserMsg; + +Communication +------------- + +The protocol for vhost-user is based on the existing implementation of vhost +for the Linux Kernel. Most messages that can be sent via the Unix domain socket +implementing vhost-user have an equivalent ioctl to the kernel implementation. + +The communication consists of master sending message requests and slave sending +message replies. Most of the requests don't require replies. 
Here is a list of +the ones that do: + + * VHOST_GET_FEATURES + * VHOST_GET_PROTOCOL_FEATURES + * VHOST_GET_VRING_BASE + * VHOST_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD) + +There are several messages that the master sends with file descriptors passed +in the ancillary data: + + * VHOST_SET_MEM_TABLE + * VHOST_SET_LOG_BASE (if VHOST_USER_PROTOCOL_F_LOG_SHMFD) + * VHOST_SET_LOG_FD + * VHOST_SET_VRING_KICK + * VHOST_SET_VRING_CALL + * VHOST_SET_VRING_ERR + +If Master is unable to send the full message or receives a wrong reply it will +close the connection. An optional reconnection mechanism can be implemented. + +Any protocol extensions are gated by protocol feature bits, +which allows full backwards compatibility on both master +and slave. +As older slaves don't support negotiating protocol features, +a feature bit was dedicated for this purpose: +#define VHOST_USER_F_PROTOCOL_FEATURES 30 + +Starting and stopping rings +---------------------- +Client must only process each ring when it is started. + +Client must only pass data between the ring and the +backend, when the ring is enabled. + +If ring is started but disabled, client must process the +ring without talking to the backend. + +For example, for a networking device, in the disabled state +client must not supply any new RX packets, but must process +and discard any TX packets. + +If VHOST_USER_F_PROTOCOL_FEATURES has not been negotiated, the ring is initialized +in an enabled state. + +If VHOST_USER_F_PROTOCOL_FEATURES has been negotiated, the ring is initialized +in a disabled state. Client must not pass data to/from the backend until ring is enabled by +VHOST_USER_SET_VRING_ENABLE with parameter 1, or after it has been disabled by +VHOST_USER_SET_VRING_ENABLE with parameter 0. + +Each ring is initialized in a stopped state, client must not process it until +ring is started, or after it has been stopped. 
+ +Client must start ring upon receiving a kick (that is, detecting that file +descriptor is readable) on the descriptor specified by +VHOST_USER_SET_VRING_KICK, and stop ring upon receiving +VHOST_USER_GET_VRING_BASE. + +While processing the rings (whether they are enabled or not), client must +support changing some configuration aspects on the fly. + +Multiple queue support +---------------------- + +Multiple queue is treated as a protocol extension, hence the slave has to +implement protocol features first. The multiple queues feature is supported +only when the protocol feature VHOST_USER_PROTOCOL_F_MQ (bit 0) is set. + +The max number of queues the slave supports can be queried with message +VHOST_USER_GET_PROTOCOL_FEATURES. Master should stop when the number of +requested queues is bigger than that. + +As all queues share one connection, the master uses a unique index for each +queue in the sent message to identify a specified queue. One queue pair +is enabled initially. More queues are enabled dynamically, by sending +message VHOST_USER_SET_VRING_ENABLE. + +Migration +--------- + +During live migration, the master may need to track the modifications +the slave makes to the memory mapped regions. The client should mark +the dirty pages in a log. Once it complies to this logging, it may +declare the VHOST_F_LOG_ALL vhost feature. + +To start/stop logging of data/used ring writes, server may send messages +VHOST_USER_SET_FEATURES with VHOST_F_LOG_ALL and VHOST_USER_SET_VRING_ADDR with +VHOST_VRING_F_LOG in ring's flags set to 1/0, respectively. + +All the modifications to memory pointed by vring "descriptor" should +be marked. Modifications to "used" vring should be marked if +VHOST_VRING_F_LOG is part of ring's flags. + +Dirty pages are of size: +#define VHOST_LOG_PAGE 0x1000 + +The log memory fd is provided in the ancillary data of +VHOST_USER_SET_LOG_BASE message when the slave has +VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol feature. 
+ +The size of the log is supplied as part of VhostUserMsg +which should be large enough to cover all known guest +addresses. Log starts at the supplied offset in the +supplied file descriptor. +The log covers from address 0 to the maximum of guest +regions. In pseudo-code, to mark page at "addr" as dirty: + +page = addr / VHOST_LOG_PAGE +log[page / 8] |= 1 << page % 8 + +Where addr is the guest physical address. + +Use atomic operations, as the log may be concurrently manipulated. + +Note that when logging modifications to the used ring (when VHOST_VRING_F_LOG +is set for this ring), log_guest_addr should be used to calculate the log +offset: the write to first byte of the used ring is logged at this offset from +log start. Also note that this value might be outside the legal guest physical +address range (i.e. does not have to be covered by the VhostUserMemory table), +but the bit offset of the last byte of the ring must fall within +the size supplied by VhostUserLog. + +VHOST_USER_SET_LOG_FD is an optional message with an eventfd in +ancillary data, it may be used to inform the master that the log has +been modified. + +Once the source has finished migration, rings will be stopped by +the source. No further update must be done before rings are +restarted. + +Protocol features +----------------- + +#define VHOST_USER_PROTOCOL_F_MQ 0 +#define VHOST_USER_PROTOCOL_F_LOG_SHMFD 1 +#define VHOST_USER_PROTOCOL_F_RARP 2 + +Message types +------------- + + * VHOST_USER_GET_FEATURES + + Id: 1 + Equivalent ioctl: VHOST_GET_FEATURES + Master payload: N/A + Slave payload: u64 + + Get from the underlying vhost implementation the features bitmask. + Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for + VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES. + + * VHOST_USER_SET_FEATURES + + Id: 2 + Ioctl: VHOST_SET_FEATURES + Master payload: u64 + + Enable features in the underlying vhost implementation using a bitmask. 
+ Feature bit VHOST_USER_F_PROTOCOL_FEATURES signals slave support for + VHOST_USER_GET_PROTOCOL_FEATURES and VHOST_USER_SET_PROTOCOL_FEATURES. + + * VHOST_USER_GET_PROTOCOL_FEATURES + + Id: 15 + Equivalent ioctl: VHOST_GET_FEATURES + Master payload: N/A + Slave payload: u64 + + Get the protocol feature bitmask from the underlying vhost implementation. + Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in + VHOST_USER_GET_FEATURES. + Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support + this message even before VHOST_USER_SET_FEATURES was called. + + * VHOST_USER_SET_PROTOCOL_FEATURES + + Id: 16 + Ioctl: VHOST_SET_FEATURES + Master payload: u64 + + Enable protocol features in the underlying vhost implementation. + Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in + VHOST_USER_GET_FEATURES. + Note: slave that reported VHOST_USER_F_PROTOCOL_FEATURES must support + this message even before VHOST_USER_SET_FEATURES was called. + + * VHOST_USER_SET_OWNER + + Id: 3 + Equivalent ioctl: VHOST_SET_OWNER + Master payload: N/A + + Issued when a new connection is established. It sets the current Master + as an owner of the session. This can be used on the Slave as a + "session start" flag. + + * VHOST_USER_RESET_OWNER + + Id: 4 + Master payload: N/A + + This is no longer used. Used to be sent to request disabling + all rings, but some clients interpreted it to also discard + connection state (this interpretation would lead to bugs). + It is recommended that clients either ignore this message, + or use it to disable all rings. + + * VHOST_USER_SET_MEM_TABLE + + Id: 5 + Equivalent ioctl: VHOST_SET_MEM_TABLE + Master payload: memory regions description + + Sets the memory map regions on the slave so it can translate the vring + addresses. In the ancillary data there is an array of file descriptors + for each memory mapped region. The size and ordering of the fds matches + the number and ordering of memory regions. 
+ + * VHOST_USER_SET_LOG_BASE + + Id: 6 + Equivalent ioctl: VHOST_SET_LOG_BASE + Master payload: u64 + Slave payload: N/A + + Sets logging shared memory space. + When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol + feature, the log memory fd is provided in the ancillary data of + VHOST_USER_SET_LOG_BASE message, the size and offset of shared + memory area provided in the message. + + + * VHOST_USER_SET_LOG_FD + + Id: 7 + Equivalent ioctl: VHOST_SET_LOG_FD + Master payload: N/A + + Sets the logging file descriptor, which is passed as ancillary data. + + * VHOST_USER_SET_VRING_NUM + + Id: 8 + Equivalent ioctl: VHOST_SET_VRING_NUM + Master payload: vring state description + + Sets the number of vrings for this owner. + + * VHOST_USER_SET_VRING_ADDR + + Id: 9 + Equivalent ioctl: VHOST_SET_VRING_ADDR + Master payload: vring address description + Slave payload: N/A + + Sets the addresses of the different aspects of the vring. + + * VHOST_USER_SET_VRING_BASE + + Id: 10 + Equivalent ioctl: VHOST_SET_VRING_BASE + Master payload: vring state description + + Sets the base offset in the available vring. + + * VHOST_USER_GET_VRING_BASE + + Id: 11 + Equivalent ioctl: VHOST_GET_VRING_BASE + Master payload: vring state description + Slave payload: vring state description + + Get the available vring base offset. + + * VHOST_USER_SET_VRING_KICK + + Id: 12 + Equivalent ioctl: VHOST_SET_VRING_KICK + Master payload: u64 + + Set the event file descriptor for adding buffers to the vring. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling should be used + instead of waiting for a kick. + + * VHOST_USER_SET_VRING_CALL + + Id: 13 + Equivalent ioctl: VHOST_SET_VRING_CALL + Master payload: u64 + + Set the event file descriptor to signal when buffers are used. It + is passed in the ancillary data. 
+ Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. This signals that polling will be used + instead of waiting for the call. + + * VHOST_USER_SET_VRING_ERR + + Id: 14 + Equivalent ioctl: VHOST_SET_VRING_ERR + Master payload: u64 + + Set the event file descriptor to signal when an error occurs. It + is passed in the ancillary data. + Bits (0-7) of the payload contain the vring index. Bit 8 is the + invalid FD flag. This flag is set when there is no file descriptor + in the ancillary data. + + * VHOST_USER_GET_QUEUE_NUM + + Id: 17 + Equivalent ioctl: N/A + Master payload: N/A + Slave payload: u64 + + Query how many queues the backend supports. This request should be + sent only when VHOST_USER_PROTOCOL_F_MQ is set in queried protocol + features by VHOST_USER_GET_PROTOCOL_FEATURES. + + * VHOST_USER_SET_VRING_ENABLE + + Id: 18 + Equivalent ioctl: N/A + Master payload: vring state description + + Signal slave to enable or disable corresponding vring. + This request should be sent only when VHOST_USER_F_PROTOCOL_FEATURES + has been negotiated. + + * VHOST_USER_SEND_RARP + + Id: 19 + Equivalent ioctl: N/A + Master payload: u64 + + Ask vhost user backend to broadcast a fake RARP to notify the migration + is terminated for guest that does not support GUEST_ANNOUNCE. + Only legal if feature bit VHOST_USER_F_PROTOCOL_FEATURES is present in + VHOST_USER_GET_FEATURES and protocol feature bit VHOST_USER_PROTOCOL_F_RARP + is present in VHOST_USER_GET_PROTOCOL_FEATURES. + The first 6 bytes of the payload contain the mac address of the guest to + allow the vhost user backend to construct and broadcast the fake RARP. 
diff --git a/src/docs/specs/vmw_pvscsi-spec.txt b/src/docs/specs/vmw_pvscsi-spec.txt new file mode 100644 index 0000000..49affb2 --- /dev/null +++ b/src/docs/specs/vmw_pvscsi-spec.txt @@ -0,0 +1,92 @@ +General Description +=================== + +This document describes VMWare PVSCSI device interface specification. +Created by Dmitry Fleytman (dmitry@daynix.com), Daynix Computing LTD. +Based on source code of PVSCSI Linux driver from kernel 3.0.4 + +PVSCSI Device Interface Overview +================================ + +The interface is based on memory area shared between hypervisor and VM. +Memory area is obtained by driver as device IO memory resource of +PVSCSI_MEM_SPACE_SIZE length. +The shared memory consists of registers area and rings area. +The registers area is used to raise hypervisor interrupts and issue device +commands. The rings area is used to transfer data descriptors and SCSI +commands from VM to hypervisor and to transfer messages produced by +hypervisor to VM. Data itself is transferred via virtual scatter-gather DMA. + +PVSCSI Device Registers +======================= + +The length of the registers area is 1 page (PVSCSI_MEM_SPACE_COMMAND_NUM_PAGES). +The structure of the registers area is described by the PVSCSIRegOffset enum. +There are registers to issue device command (with optional short data), +issue device interrupt, control interrupts masking. + +PVSCSI Device Rings +=================== + +There are three rings in shared memory: + + 1. Request ring (struct PVSCSIRingReqDesc *req_ring) + - ring for OS to device requests + 2. Completion ring (struct PVSCSIRingCmpDesc *cmp_ring) + - ring for device request completions + 3. Message ring (struct PVSCSIRingMsgDesc *msg_ring) + - ring for messages from device. + This ring is optional and the guest might not configure it. +There is a control area (struct PVSCSIRingsState *rings_state) used to control +rings operation. 
+ +PVSCSI Device to Host Interrupts +================================ +There are following interrupt types supported by PVSCSI device: + 1. Completion interrupts (completion ring notifications): + PVSCSI_INTR_CMPL_0 + PVSCSI_INTR_CMPL_1 + 2. Message interrupts (message ring notifications): + PVSCSI_INTR_MSG_0 + PVSCSI_INTR_MSG_1 + +Interrupts are controlled via PVSCSI_REG_OFFSET_INTR_MASK register +Bit set means interrupt enabled, bit cleared - disabled + +Interrupt modes supported are legacy, MSI and MSI-X +In case of legacy interrupts, register PVSCSI_REG_OFFSET_INTR_STATUS +is used to check which interrupt has arrived. Interrupts are +acknowledged when the corresponding bit is written to the interrupt +status register. + +PVSCSI Device Operation Sequences +================================= + +1. Startup sequence: + a. Issue PVSCSI_CMD_ADAPTER_RESET command; + aa. Windows driver reads interrupt status register here; + b. Issue PVSCSI_CMD_SETUP_MSG_RING command with no additional data, + check status and disable device messages if error returned; + (Omitted if device messages disabled by driver configuration) + c. Issue PVSCSI_CMD_SETUP_RINGS command, provide rings configuration + as struct PVSCSICmdDescSetupRings; + d. Issue PVSCSI_CMD_SETUP_MSG_RING command again, provide + rings configuration as struct PVSCSICmdDescSetupMsgRing; + e. Unmask completion and message (if device messages enabled) interrupts. + +2. Shutdown sequences + a. Mask interrupts; + b. Flush request ring using PVSCSI_REG_OFFSET_KICK_NON_RW_IO; + c. Issue PVSCSI_CMD_ADAPTER_RESET command. + +3. Send request + a. Fill next free request ring descriptor; + b. Issue PVSCSI_REG_OFFSET_KICK_RW_IO for R/W operations; + or PVSCSI_REG_OFFSET_KICK_NON_RW_IO for other operations. + +4. Abort command + a. Issue PVSCSI_CMD_ABORT_CMD command; + +5. Request completion processing + a. Upon completion interrupt arrival process completion + and message (if enabled) rings. 
diff --git a/src/docs/spice-port-fqdn.txt b/src/docs/spice-port-fqdn.txt new file mode 100644 index 0000000..5077895 --- /dev/null +++ b/src/docs/spice-port-fqdn.txt @@ -0,0 +1,19 @@ +A Spice port channel is an arbitrary communication between the Spice +server host side and the client side. + +Thanks to the associated reverse fully qualified domain name (fqdn), +a Spice client can handle the various ports appropriately. + +The following fqdn names are reserved by the QEMU project: + +org.qemu.monitor.hmp.0 + QEMU human monitor + +org.qemu.monitor.qmp.0: + QEMU control monitor + +org.qemu.console.serial.0 + QEMU virtual serial port + +org.qemu.console.debug.0 + QEMU debug console diff --git a/src/docs/tracing.txt b/src/docs/tracing.txt new file mode 100644 index 0000000..3853a6a --- /dev/null +++ b/src/docs/tracing.txt @@ -0,0 +1,349 @@ += Tracing = + +== Introduction == + +This document describes the tracing infrastructure in QEMU and how to use it +for debugging, profiling, and observing execution. + +== Quickstart == + +1. Build with the 'simple' trace backend: + + ./configure --enable-trace-backends=simple + make + +2. Create a file with the events you want to trace: + + echo bdrv_aio_readv > /tmp/events + echo bdrv_aio_writev >> /tmp/events + +3. Run the virtual machine to produce a trace file: + + qemu -trace events=/tmp/events ... # your normal QEMU invocation + +4. Pretty-print the binary trace file: + + ./scripts/simpletrace.py trace-events trace-* # Override * with QEMU <pid> + +== Trace events == + +There is a set of static trace events declared in the "trace-events" source +file. Each trace event declaration names the event, its arguments, and the +format string which can be used for pretty-printing: + + qemu_vmalloc(size_t size, void *ptr) "size %zu ptr %p" + qemu_vfree(void *ptr) "ptr %p" + +The "trace-events" file is processed by the "tracetool" script during build to +generate code for the trace events. 
Trace events are invoked directly from +source code like this: + + #include "trace.h" /* needed for trace event prototype */ + + void *qemu_vmalloc(size_t size) + { + void *ptr; + size_t align = QEMU_VMALLOC_ALIGN; + + if (size < align) { + align = getpagesize(); + } + ptr = qemu_memalign(align, size); + trace_qemu_vmalloc(size, ptr); + return ptr; + } + +=== Declaring trace events === + +The "tracetool" script produces the trace.h header file which is included by +every source file that uses trace events. Since many source files include +trace.h, it uses a minimum of types and other header files included to keep the +namespace clean and compile times and dependencies down. + +Trace events should use types as follows: + + * Use stdint.h types for fixed-size types. Most offsets and guest memory + addresses are best represented with uint32_t or uint64_t. Use fixed-size + types over primitive types whose size may change depending on the host + (32-bit versus 64-bit) so trace events don't truncate values or break + the build. + + * Use void * for pointers to structs or for arrays. The trace.h header + cannot include all user-defined struct declarations and it is therefore + necessary to use void * for pointers to structs. + + * For everything else, use primitive scalar types (char, int, long) with the + appropriate signedness. + +Format strings should reflect the types defined in the trace event. Take +special care to use PRId64 and PRIu64 for int64_t and uint64_t types, +respectively. This ensures portability between 32- and 64-bit platforms. + +=== Hints for adding new trace events === + +1. Trace state changes in the code. Interesting points in the code usually + involve a state change like starting, stopping, allocating, freeing. State + changes are good trace events because they can be used to understand the + execution of the system. + +2. Trace guest operations. 
Guest I/O accesses like reading device registers + are good trace events because they can be used to understand guest + interactions. + +3. Use correlator fields so the context of an individual line of trace output + can be understood. For example, trace the pointer returned by malloc and + used as an argument to free. This way mallocs and frees can be matched up. + Trace events with no context are not very useful. + +4. Name trace events after their function. If there are multiple trace events + in one function, append a unique distinguisher at the end of the name. + +== Generic interface and monitor commands == + +You can programmatically query and control the state of trace events through a +backend-agnostic interface provided by the header "trace/control.h". + +Note that some of the backends do not provide an implementation for some parts +of this interface, in which case QEMU will just print a warning (please refer to +header "trace/control.h" to see which routines are backend-dependent). + +The state of events can also be queried and modified through monitor commands: + +* info trace-events + View available trace events and their state. State 1 means enabled, state 0 + means disabled. + +* trace-event NAME on|off + Enable/disable a given trace event or a group of events (using wildcards). + +The "-trace events=<file>" command line argument can be used to enable the +events listed in <file> from the very beginning of the program. This file must +contain one event name per line. + +If a line in the "-trace events=<file>" file begins with a '-', the trace event +will be disabled instead of enabled. This is useful when a wildcard was used +to enable an entire family of events but one noisy event needs to be disabled. + +Wildcard matching is supported in both the monitor command "trace-event" and the +events list file. That means you can enable/disable the events having a common +prefix in a batch. 
For example, virtio-blk trace events could be enabled using +the following monitor command: + + trace-event virtio_blk_* on + +== Trace backends == + +The "tracetool" script automates tedious trace event code generation and also +keeps the trace event declarations independent of the trace backend. The trace +events are not tightly coupled to a specific trace backend, such as LTTng or +SystemTap. Support for trace backends can be added by extending the "tracetool" +script. + +The trace backends are chosen at configure time: + + ./configure --enable-trace-backends=simple + +For a list of supported trace backends, try ./configure --help or see below. +If multiple backends are enabled, the trace is sent to them all. + +The following subsections describe the supported trace backends. + +=== Nop === + +The "nop" backend generates empty trace event functions so that the compiler +can optimize out trace events completely. This is the default and imposes no +performance penalty. + +Note that regardless of the selected trace backend, events with the "disable" +property will be generated with the "nop" backend. + +=== Stderr === + +The "stderr" backend sends trace events directly to standard error. This +effectively turns trace events into debug printfs. + +This is the simplest backend and can be used together with existing code that +uses DPRINTF(). + +=== Simpletrace === + +The "simple" backend supports common use cases and comes as part of the QEMU +source tree. It may not be as powerful as platform-specific or third-party +trace backends but it is portable. This is the recommended trace backend +unless you have specific needs for more advanced backends. + +The "simple" backend currently does not capture string arguments, it simply +records the char* pointer value instead of the string that is pointed to. + +=== Ftrace === + +The "ftrace" backend writes trace data to ftrace marker. 
This effectively +sends trace events to the ftrace ring buffer, and you can compare qemu trace +data and kernel (especially kvm.ko when using KVM) trace data. + +If you use KVM, enable kvm events in ftrace: + + # echo 1 > /sys/kernel/debug/tracing/events/kvm/enable + +After running qemu as the root user, you can get the trace: + + # cat /sys/kernel/debug/tracing/trace + +Restriction: "ftrace" backend is restricted to Linux only. + +==== Monitor commands ==== + +* trace-file on|off|flush|set <path> + Enable/disable/flush the trace file or set the trace file name. + +==== Analyzing trace files ==== + +The "simple" backend produces binary trace files that can be formatted with the +simpletrace.py script. The script takes the "trace-events" file and the binary +trace: + + ./scripts/simpletrace.py trace-events trace-12345 + +You must ensure that the same "trace-events" file was used to build QEMU, +otherwise trace event declarations may have changed and output will not be +consistent. + +=== LTTng Userspace Tracer === + +The "ust" backend uses the LTTng Userspace Tracer library. There are no +monitor commands built into QEMU, instead UST utilities should be used to list, +enable/disable, and dump traces. + +Package lttng-tools is required for userspace tracing. You must ensure that the +current user belongs to the "tracing" group, or manually launch the +lttng-sessiond daemon for the current user prior to running any instance of +QEMU. + +While running an instrumented QEMU, LTTng should be able to list all available +events: + + lttng list -u + +Create tracing session: + + lttng create mysession + +Enable events: + + lttng enable-event qemu:g_malloc -u + +Where the events can either be a comma-separated list of events, or "-a" to +enable all tracepoint events.
Start and stop tracing as needed: + + lttng start + lttng stop + +View the trace: + + lttng view + +Destroy tracing session: + + lttng destroy + +Babeltrace can be used at any later time to view the trace: + + babeltrace $HOME/lttng-traces/mysession-<date>-<time> + +=== SystemTap === + +The "dtrace" backend uses DTrace sdt probes but has only been tested with +SystemTap. When SystemTap support is detected a .stp file with wrapper probes +is generated to make use in scripts more convenient. This step can also be +performed manually after a build in order to change the binary name in the .stp +probes: + + scripts/tracetool.py --backends=dtrace --format=stap \ + --binary path/to/qemu-binary \ + --target-type system \ + --target-name x86_64 \ + <trace-events >qemu.stp + +== Trace event properties == + +Each event in the "trace-events" file can be prefixed with a space-separated +list of zero or more of the following event properties. + +=== "disable" === + +If a specific trace event is going to be invoked a huge number of times, this +might have a noticeable performance impact even when the event is +programmatically disabled. + +In this case you should declare such event with the "disable" property. This +will effectively disable the event at compile time (by using the "nop" backend), +thus having no performance impact at all on regular builds (i.e., unless you +edit the "trace-events" file). + +In addition, there might be cases where relatively complex computations must be +performed to generate values that are only used as arguments for a trace +function. 
In these cases you can use the macro 'TRACE_${EVENT_NAME}_ENABLED' to +guard such computations and avoid its compilation when the event is disabled: + + #include "trace.h" /* needed for trace event prototype */ + + void *qemu_vmalloc(size_t size) + { + void *ptr; + size_t align = QEMU_VMALLOC_ALIGN; + + if (size < align) { + align = getpagesize(); + } + ptr = qemu_memalign(align, size); + if (TRACE_QEMU_VMALLOC_ENABLED) { /* preprocessor macro */ + void *complex; + /* some complex computations to produce the 'complex' value */ + trace_qemu_vmalloc(size, ptr, complex); + } + return ptr; + } + +You can check both if the event has been disabled and is dynamically enabled at +the same time using the 'trace_event_get_state' routine (see header +"trace/control.h" for more information). + +=== "tcg" === + +Guest code generated by TCG can be traced by defining an event with the "tcg" +event property. Internally, this property generates two events: +"<eventname>_trans" to trace the event at translation time, and +"<eventname>_exec" to trace the event at execution time. + +Instead of using these two events, you should instead use the function +"trace_<eventname>_tcg" during translation (TCG code generation). This function +will automatically call "trace_<eventname>_trans", and will generate the +necessary TCG code to call "trace_<eventname>_exec" during guest code execution. + +Events with the "tcg" property can be declared in the "trace-events" file with a +mix of native and TCG types, and "trace_<eventname>_tcg" will gracefully forward +them to the "<eventname>_trans" and "<eventname>_exec" events. Since TCG values +are not known at translation time, these are ignored by the "<eventname>_trans" +event. Because of this, the entry in the "trace-events" file needs two printing +formats (separated by a comma): + + tcg foo(uint8_t a1, TCGv_i32 a2) "a1=%d", "a1=%d a2=%d" + +For example: + + #include "trace-tcg.h" + + void some_disassembly_func (...) 
+ { + uint8_t a1 = ...; + TCGv_i32 a2 = ...; + trace_foo_tcg(a1, a2); + } + +This will immediately call: + + void trace_foo_trans(uint8_t a1); + +and will generate the TCG code to call: + + void trace_foo(uint8_t a1, uint32_t a2); diff --git a/src/docs/usb-storage.txt b/src/docs/usb-storage.txt new file mode 100644 index 0000000..c5a3866 --- /dev/null +++ b/src/docs/usb-storage.txt @@ -0,0 +1,47 @@ + +qemu usb storage emulation +-------------------------- + +QEMU has three devices for usb storage emulation. + +Number one emulates the classic bulk-only transport protocol which is +used by 99% of the usb sticks on the market today and is called +"usb-storage". Usage (hooking up to xhci, other host controllers work +too): + + qemu ${other_vm_args} \ + -drive if=none,id=stick,file=/path/to/file.img \ + -device nec-usb-xhci,id=xhci \ + -device usb-storage,bus=xhci.0,drive=stick + + +Number two is the newer usb attached scsi transport. This one doesn't +automagically create a scsi disk, so you have to explicitly attach one +manually. Multiple logical units are supported. Here is an example +with three logical units: + + qemu ${other_vm_args} \ + -drive if=none,id=uas-disk1,file=/path/to/file1.img \ + -drive if=none,id=uas-disk2,file=/path/to/file2.img \ + -drive if=none,id=uas-cdrom,media=cdrom,file=/path/to/image.iso \ + -device nec-usb-xhci,id=xhci \ + -device usb-uas,id=uas,bus=xhci.0 \ + -device scsi-hd,bus=uas.0,scsi-id=0,lun=0,drive=uas-disk1 \ + -device scsi-hd,bus=uas.0,scsi-id=0,lun=1,drive=uas-disk2 \ + -device scsi-cd,bus=uas.0,scsi-id=0,lun=5,drive=uas-cdrom + + +Number three emulates the classic bulk-only transport protocol too. +It's called "usb-bot". It shares most code with "usb-storage", and +the guest will not be able to see the difference. The qemu command +line interface is similar to usb-uas though, i.e. no automatic scsi +disk creation. It also features support for up to 16 LUNs. The LUN +numbers must be continuous, i.e.
for three devices you must use 0+1+2. +The 0+1+5 numbering from the "usb-uas" example isn't going to work +with "usb-bot". + +enjoy, + Gerd + +-- +Gerd Hoffmann <kraxel@redhat.com> diff --git a/src/docs/usb2.txt b/src/docs/usb2.txt new file mode 100644 index 0000000..c7a445a --- /dev/null +++ b/src/docs/usb2.txt @@ -0,0 +1,161 @@ + +USB 2.0 Quick Start +=================== + +The QEMU EHCI Adapter can be used with and without companion +controllers. See below for the companion controller mode. + +When not running in companion controller mode there are two completely +separate USB busses: One USB 1.1 bus driven by the UHCI controller and +one USB 2.0 bus driven by the EHCI controller. Devices must be +attached to the correct controller manually. + +The '-usb' switch will make qemu create the UHCI controller as part of +the PIIX3 chipset. The USB 1.1 bus will carry the name "usb-bus.0". + +You can use the standard -device switch to add an EHCI controller to +your virtual machine. It is strongly recommended to specify an ID for +the controller so the USB 2.0 bus gets an individual name, for example +'-device usb-ehci,id=ehci'. This will give you a USB 2.0 bus named +"ehci.0". + +I strongly recommend also using -device to attach usb devices because +you can specify the bus they should be attached to this way. Here is +a complete example: + + qemu -M pc ${otheroptions} \ + -drive if=none,id=usbstick,file=/path/to/image \ + -usb \ + -device usb-ehci,id=ehci \ + -device usb-tablet,bus=usb-bus.0 \ + -device usb-storage,bus=ehci.0,drive=usbstick + +This attaches a usb tablet to the UHCI adapter and a usb mass storage +device to the EHCI adapter. + + +Companion controller support +---------------------------- + +Companion controller support has been added recently. The operational +model described above with two completely separate busses still works +fine.
Additionally the UHCI and OHCI controllers got the ability to +attach to a usb bus created by EHCI as companion controllers. This is +done by specifying the masterbus and firstport properties. masterbus +specifies the bus name the controller should attach to. firstport +specifies the first port the controller should attach to, which is +needed as usually one ehci controller with six ports has three uhci +companion controllers with two ports each. + +There is a config file in docs which will do all this for you, just +try ... + + qemu -readconfig docs/ich9-ehci-uhci.cfg + +... then use "bus=ehci.0" to assign your usb devices to that bus. + + +xhci controller support +----------------------- + +There is also xhci host controller support available. It got a lot +less testing than ehci and there are a bunch of known limitations, so +ehci may work better for you. On the other hand the xhci hardware +design is much more virtualization-friendly, thus xhci emulation uses +less resources (especially cpu). If you want to give xhci a try +use this to add the host controller ... + + qemu -device nec-usb-xhci,id=xhci + +... then use "bus=xhci.0" when assigning usb devices. + + +More USB tips & tricks +====================== + +Recently the usb pass through driver (also known as usb-host) and the +qemu usb subsystem gained a few capabilities which are available only +via qdev properties, i.e. when using '-device'. + + +physical port addressing +------------------------ + +First you can (for all usb devices) specify the physical port where +the device will show up in the guest. This can be done using the +"port" property. UHCI has two root ports (1,2). EHCI has four root +ports (1-4), the emulated (1.1) USB hub has eight ports.
+ +Plugging a tablet into UHCI port 1 works like this: + + -device usb-tablet,bus=usb-bus.0,port=1 + +Plugging a hub into UHCI port 2 works like this: + + -device usb-hub,bus=usb-bus.0,port=2 + +Plugging a virtual usb stick into port 4 of the hub just plugged works +this way: + + -device usb-storage,bus=usb-bus.0,port=2.4,drive=... + +You can do basically the same in the monitor using the device_add +command. If you want to unplug devices too you should specify some +unique id which you can use to refer to the device ... + + (qemu) device_add usb-tablet,bus=usb-bus.0,port=1,id=my-tablet + (qemu) device_del my-tablet + +... when unplugging it with device_del. + + +USB pass through hints +---------------------- + +The usb-host driver has a bunch of properties to specify the device +which should be passed to the guest: + + hostbus=<nr> -- Specifies the bus number the device must be attached + to. + + hostaddr=<nr> -- Specifies the device address the device got + assigned by the guest os. + + hostport=<str> -- Specifies the physical port the device is attached + to. + + vendorid=<hexnr> -- Specifies the vendor ID of the device. + productid=<hexnr> -- Specifies the product ID of the device. + +In theory you can combine all these properties as you like. In +practice only a few combinations are useful: + + (1) vendorid+productid -- match for a specific device, pass it to + the guest when it shows up somewhere in the host. + + (2) hostbus+hostport -- match for a specific physical port in the + host, any device which is plugged in there gets passed to the + guest. + + (3) hostbus+hostaddr -- most useful for ad-hoc pass through as the + hostaddr isn't stable, the next time you plug in the device it + gets a new one ... + +Note that USB 1.1 devices are handled by UHCI/OHCI and USB 2.0 by +EHCI. That means a device plugged into the very same physical port +may show up on different busses depending on the speed. 
The port I'm +using for testing is bus 1 + port 1 for 2.0 devices and bus 3 + port 1 +for 1.1 devices. Passing through any device plugged into that port +and also assign them to the correct bus can be done this way: + + qemu -M pc ${otheroptions} \ + -usb \ + -device usb-ehci,id=ehci \ + -device usb-host,bus=usb-bus.0,hostbus=3,hostport=1 \ + -device usb-host,bus=ehci.0,hostbus=1,hostport=1 + +enjoy, + Gerd + +-- +Gerd Hoffmann <kraxel@redhat.com> diff --git a/src/docs/virtio-balloon-stats.txt b/src/docs/virtio-balloon-stats.txt new file mode 100644 index 0000000..edff5f2 --- /dev/null +++ b/src/docs/virtio-balloon-stats.txt @@ -0,0 +1,105 @@ +virtio balloon memory statistics +================================ + +The virtio balloon driver supports guest memory statistics reporting. These +statistics are available to QEMU users as QOM (QEMU Object Model) device +properties via a polling mechanism. + +Before querying the available stats, clients first have to enable polling. +This is done by writing a time interval value (in seconds) to the +guest-stats-polling-interval property. This value can be: + + > 0 enables polling in the specified interval. If polling is already + enabled, the polling time interval is changed to the new value + + 0 disables polling. Previous polled statistics are still valid and + can be queried. + +Once polling is enabled, the virtio-balloon device in QEMU will start +polling the guest's balloon driver for new stats in the specified time +interval. + +To retrieve those stats, clients have to query the guest-stats property, +which will return a dictionary containing: + + o A key named 'stats', containing all available stats. If the guest + doesn't support a particular stat, or if it couldn't be retrieved, + its value will be -1. 
Currently, the following stats are supported: + + - stat-swap-in + - stat-swap-out + - stat-major-faults + - stat-minor-faults + - stat-free-memory + - stat-total-memory + + o A key named last-update, which contains the last stats update + timestamp in seconds. Since this timestamp is generated by the host, + a buggy guest can't influence its value. The value is 0 if the guest + has not updated the stats (yet). + +It's also important to note the following: + + - Previously polled statistics remain available even if the polling is + later disabled + + - As noted above, if a guest doesn't support a particular stat its value + will always be -1. However, it's also possible that a guest temporarily + couldn't update one or even all stats. If this happens, just wait for + the next update + + - Polling can be enabled even if the guest doesn't have stats support + or the balloon driver wasn't loaded in the guest. If this is the case + and stats are queried, last-update will be 0. + + - The polling timer is only re-armed when the guest responds to the + statistics request. This means that if a (buggy) guest doesn't ever + respond to the request the timer will never be re-armed, which has + the same effect as disabling polling + +Here are a few examples. QEMU is started with '-balloon virtio', which +generates '/machine/peripheral-anon/device[1]' as the QOM path for the +balloon device. 
+ +Enable polling with 2 seconds interval: + +{ "execute": "qom-set", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats-polling-interval", "value": 2 } } + +{ "return": {} } + +Change polling to 10 seconds: + +{ "execute": "qom-set", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats-polling-interval", "value": 10 } } + +{ "return": {} } + +Get stats: + +{ "execute": "qom-get", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats" } } +{ + "return": { + "stats": { + "stat-swap-out": 0, + "stat-free-memory": 844943360, + "stat-minor-faults": 219028, + "stat-major-faults": 235, + "stat-total-memory": 1044406272, + "stat-swap-in": 0 + }, + "last-update": 1358529861 + } +} + +Disable polling: + +{ "execute": "qom-set", + "arguments": { "path": "/machine/peripheral-anon/device[1]", + "property": "guest-stats-polling-interval", "value": 0 } } + +{ "return": {} } diff --git a/src/docs/virtio-migration.txt b/src/docs/virtio-migration.txt new file mode 100644 index 0000000..cf66458 --- /dev/null +++ b/src/docs/virtio-migration.txt @@ -0,0 +1,106 @@ +Virtio devices and migration +============================ + +Copyright 2015 IBM Corp. + +This work is licensed under the terms of the GNU GPL, version 2 or later. See +the COPYING file in the top-level directory. + +Saving and restoring the state of virtio devices is a bit of a twisty maze, +for several reasons: +- state is distributed between several parts: + - virtio core, for common fields like features, number of queues, ... + - virtio transport (pci, ccw, ...), for the different proxy devices and + transport specific state (msix vectors, indicators, ...) + - virtio device (net, blk, ...), for the different device types and their + state (mac address, request queue, ...)
+- most fields are saved via the stream interface; subsequently, subsections + have been added to make cross-version migration possible + +This file attempts to document the current procedure and point out some +caveats. + + +Save state procedure +==================== + +virtio core virtio transport virtio device +----------- ---------------- ------------- + + save() function registered + via register_savevm() +virtio_save() <---------- + ------> save_config() + - save proxy device + - save transport-specific + device fields +- save common device + fields +- save common virtqueue + fields + ------> save_queue() + - save transport-specific + virtqueue fields + ------> save_device() + - save device-specific + fields +- save subsections + - device endianness, + if changed from + default endianness + - 64 bit features, if + any high feature bit + is set + - virtio-1 virtqueue + fields, if VERSION_1 + is set + + +Load state procedure +==================== + +virtio core virtio transport virtio device +----------- ---------------- ------------- + + load() function registered + via register_savevm() +virtio_load() <---------- + ------> load_config() + - load proxy device + - load transport-specific + device fields +- load common device + fields +- load common virtqueue + fields + ------> load_queue() + - load transport-specific + virtqueue fields +- notify guest + ------> load_device() + - load device-specific + fields +- load subsections + - device endianness + - 64 bit features + - virtio-1 virtqueue + fields +- sanitize endianness +- sanitize features +- virtqueue index sanity + check + - feature-dependent setup + + +Implications of this setup +========================== + +Devices need to be careful in their state processing during load: The +load_device() procedure is invoked by the core before subsections have +been loaded. 
Any code that depends on information transmitted in subsections +therefore has to be invoked in the device's load() function _after_ +virtio_load() returned (like e.g. code depending on features). + +Any extension of the state being migrated should be done in subsections +added to the core for compatibility reasons. If transport or device specific +state is added, core needs to invoke a callback from the new subsection. diff --git a/src/docs/vnc-ledstate-Pseudo-encoding.txt b/src/docs/vnc-ledstate-Pseudo-encoding.txt new file mode 100644 index 0000000..0f124f6 --- /dev/null +++ b/src/docs/vnc-ledstate-Pseudo-encoding.txt @@ -0,0 +1,50 @@ +VNC LED state Pseudo-encoding +============================= + +Introduction +------------ + +This document describes the Pseudo-encoding of LED state for RFB which +is the protocol used in VNC as reference link below: + +http://tigervnc.svn.sourceforge.net/viewvc/tigervnc/rfbproto/rfbproto.rst?content-type=text/plain + +When accessing a guest by console through VNC, there might be mismatch +between the lock keys notification LED on the computer running the VNC +client session and the current status of the lock keys on the guest +machine. + +To solve this problem it attempts to add LED state Pseudo-encoding +extension to VNC protocol to deal with setting LED state. + +Pseudo-encoding +--------------- + +This Pseudo-encoding requested by client declares to server that it supports +LED state extensions to the protocol. 
+ +The Pseudo-encoding number for LED state is defined as: + +======= =============================================================== +Number Name +======= =============================================================== +-261 'LED state Pseudo-encoding' +======= =============================================================== + +LED state Pseudo-encoding +-------------------------- + +The LED state Pseudo-encoding describes the encoding of LED state which +consists of 3 bits, from left to right each bit represents the Caps, Num, +and Scroll lock key respectively. '1' indicates that the LED should be +on and '0' that it should be off. + +Some example encodings are as follows: + +======= =============================================================== +Code Description +======= =============================================================== +100 CapsLock is on, NumLock and ScrollLock are off +010 NumLock is on, CapsLock and ScrollLock are off +111 CapsLock, NumLock and ScrollLock are on +======= =============================================================== diff --git a/src/docs/win32-qemu-event.promela b/src/docs/win32-qemu-event.promela new file mode 100644 index 0000000..c446a71 --- /dev/null +++ b/src/docs/win32-qemu-event.promela @@ -0,0 +1,98 @@ +/* + * This model describes the implementation of QemuEvent in + * util/qemu-thread-win32.c. + * + * Author: Paolo Bonzini <pbonzini@redhat.com> + * + * This file is in the public domain. If you really want a license, + * the WTFPL will do.
+ * + * To verify it: + * spin -a docs/win32-qemu-event.promela + * gcc -O2 pan.c -DSAFETY + * ./a.out + */ + +bool event; +int value; + +/* Primitives for a Win32 event */ +#define RAW_RESET event = false +#define RAW_SET event = true +#define RAW_WAIT do :: event -> break; od + +#if 0 +/* Basic sanity checking: test the Win32 event primitives */ +#define RESET RAW_RESET +#define SET RAW_SET +#define WAIT RAW_WAIT +#else +/* Full model: layer a userspace-only fast path on top of the RAW_* + * primitives. SET/RESET/WAIT have exactly the same semantics as + * RAW_SET/RAW_RESET/RAW_WAIT, but try to avoid invoking them. + */ +#define EV_SET 0 +#define EV_FREE 1 +#define EV_BUSY -1 + +int state = EV_FREE; + +int xchg_result; +#define SET if :: state != EV_SET -> \ + atomic { /* xchg_result=xchg(state, EV_SET) */ \ + xchg_result = state; \ + state = EV_SET; \ + } \ + if :: xchg_result == EV_BUSY -> RAW_SET; \ + :: else -> skip; \ + fi; \ + :: else -> skip; \ + fi + +#define RESET if :: state == EV_SET -> atomic { /* state |= EV_FREE: EV_SET (0) -> EV_FREE (1) */ state = state | EV_FREE; } \ + :: else -> skip; \ + fi + +int tmp1, tmp2; +#define WAIT tmp1 = state; \ + if :: tmp1 != EV_SET -> \ + if :: tmp1 == EV_FREE -> \ + RAW_RESET; \ + atomic { /* tmp2=cas(state, EV_FREE, EV_BUSY) */ \ + tmp2 = state; \ + if :: tmp2 == EV_FREE -> state = EV_BUSY; \ + :: else -> skip; \ + fi; \ + } \ + if :: tmp2 == EV_SET -> tmp1 = EV_SET; \ + :: else -> tmp1 = EV_BUSY; \ + fi; \ + :: else -> skip; \ + fi; \ + assert(tmp1 != EV_FREE); \ + if :: tmp1 == EV_BUSY -> RAW_WAIT; \ + :: else -> skip; \ + fi; \ + :: else -> skip; \ + fi +#endif + +active proctype waiter() +{ + if + :: !value -> + RESET; + if + :: !value -> WAIT; + :: else -> skip; + fi; + :: else -> skip; + fi; + assert(value); +} + +active proctype notifier() +{ + value = true; + SET; +} diff --git a/src/docs/writing-qmp-commands.txt b/src/docs/writing-qmp-commands.txt new file mode 100644 index 0000000..59aa77a --- /dev/null +++ b/src/docs/writing-qmp-commands.txt @@ -0,0 +1,649 @@ +=
How to write QMP commands using the QAPI framework = + +This document is a step-by-step guide on how to write new QMP commands using +the QAPI framework. It also shows how to implement new style HMP commands. + +This document doesn't discuss QMP protocol level details, nor does it dive +into the QAPI framework implementation. + +For an in-depth introduction to the QAPI framework, please refer to +docs/qapi-code-gen.txt. For documentation about the QMP protocol, please +check the files in QMP/. + +== Overview == + +Generally speaking, the following steps should be taken in order to write a +new QMP command. + +1. Write the command's and type(s) specification in the QAPI schema file + (qapi-schema.json in the root source directory) + +2. Write the QMP command itself, which is a regular C function. Preferably, + the command should be exported by some QEMU subsystem. But it can also be + added to the qmp.c file + +3. At this point the command can be tested under the QMP protocol + +4. Write the HMP command equivalent. This is not required and should only be + done if it does make sense to have the functionality in HMP. The HMP command + is implemented in terms of the QMP command + +The following sections will demonstrate each of the steps above. We will start +very simple and get more complex as we progress. + +=== Testing === + +For all the examples in the next sections, the test setup is the same and is +shown here. + +First, QEMU should be started as: + +# /path/to/your/source/qemu [...] \ + -chardev socket,id=qmp,port=4444,host=localhost,server \ + -mon chardev=qmp,mode=control,pretty=on + +Then, in a different terminal: + +$ telnet localhost 4444 +Trying 127.0.0.1... +Connected to localhost. +Escape character is '^]'. +{ + "QMP": { + "version": { + "qemu": { + "micro": 50, + "minor": 15, + "major": 0 + }, + "package": "" + }, + "capabilities": [ + ] + } +} + +The above output is the QMP server saying you're connected. 
The server is +actually in capabilities negotiation mode. To enter command mode, type: + +{ "execute": "qmp_capabilities" } + +Then the server should respond: + +{ + "return": { + } +} + +Which is QMP's way of saying "the latest command executed OK and didn't return +any data". Now you're ready to enter the QMP example commands as explained in +the following sections. + +== Writing a command that doesn't return data == + +That's the simplest QMP command that can be written. Usually, this kind of +command carries some meaningful action in QEMU but here it will just print +"Hello, world" to the standard output. + +Our command will be called "hello-world". It takes no arguments, nor does it +return any data. + +The first step is to add the following line to the bottom of the +qapi-schema.json file: + +{ 'command': 'hello-world' } + +The "command" keyword defines a new QMP command. It's a JSON object. All +schema entries are JSON objects. The line above will instruct the QAPI to +generate any prototypes and the necessary code to marshal and unmarshal +protocol data. + +The next step is to write the "hello-world" implementation. As explained +earlier, it's preferable for commands to live in QEMU subsystems. But +"hello-world" doesn't pertain to any, so we put its implementation in qmp.c: + +void qmp_hello_world(Error **errp) +{ + printf("Hello, world!\n"); +} + +There are a few things to be noticed: + +1. QMP command implementation functions must be prefixed with "qmp_" +2. qmp_hello_world() returns void, this is in accordance with the fact that the + command doesn't return any data +3. It takes an "Error **" argument. This is required. Later we will see how to + return errors and take additional arguments. The Error argument should not + be touched if the command doesn't return errors +4. We won't add the function's prototype. That's automatically done by the QAPI +5.
Printing to the terminal is discouraged for QMP commands, we do it here + because it's the easiest way to demonstrate a QMP command + +Now a little hack is needed. As we're still using the old QMP server we need +to add the new command to its internal dispatch table. This step won't be +required in the near future. Open the qmp-commands.hx file and add the +following at the bottom: + + { + .name = "hello-world", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_hello_world, + }, + +You're done. Now build qemu, run it as suggested in the "Testing" section, +and then type the following QMP command: + +{ "execute": "hello-world" } + +Then check the terminal running qemu and look for the "Hello, world" string. If +you don't see it then something went wrong. + +=== Arguments === + +Let's add an argument called "message" to our "hello-world" command. The new +argument will contain the string to be printed to stdout. It's an optional +argument, if it's not present we print our default "Hello, world" string. + +The first change we have to make is to modify the command specification in the +schema file to the following: + +{ 'command': 'hello-world', 'data': { '*message': 'str' } } + +Notice the new 'data' member in the schema. It's a JSON object in which each +element is an argument to the command in question. Also notice the asterisk, +it's used to mark the argument optional (that means that you shouldn't use it +for mandatory arguments). Finally, 'str' is the argument's type, which +stands for "string". The QAPI also supports integers, booleans, enumerations +and user defined types. + +Now, let's update our C implementation in qmp.c: + +void qmp_hello_world(bool has_message, const char *message, Error **errp) +{ + if (has_message) { + printf("%s\n", message); + } else { + printf("Hello, world\n"); + } +} + +There are two important details to be noticed: + +1.
All optional arguments are accompanied by a 'has_' boolean, which is true + if the optional argument is present and false otherwise +2. The C implementation signature must follow the schema's argument ordering, + which is defined by the "data" member + +The last step is to update the qmp-commands.hx file: + + { + .name = "hello-world", + .args_type = "message:s?", + .mhandler.cmd_new = qmp_marshal_hello_world, + }, + +Notice that the "args_type" member got our "message" argument. The character +"s" stands for "string" and "?" means it's optional. This too must be ordered +according to the C implementation and schema file. You can look for more +examples in the qmp-commands.hx file if you need to define more arguments. + +Again, this step won't be required in the future. + +Time to test our new version of the "hello-world" command. Build qemu, run it as +described in the "Testing" section and then send two commands: + +{ "execute": "hello-world" } +{ + "return": { + } +} + +{ "execute": "hello-world", "arguments": { "message": "We love qemu" } } +{ + "return": { + } +} + +You should see "Hello, world" and "We love qemu" in the terminal running qemu; +if you don't see these strings, then something went wrong. + +=== Errors === + +QMP commands should use the error interface exported by the error.h header +file. Basically, most errors are set by calling the error_setg() function. + +Let's say we don't allow the "message" string to contain the word "love". If +it does contain it, we want the "hello-world" command to return an error: + +void qmp_hello_world(bool has_message, const char *message, Error **errp) +{ + if (has_message) { + if (strstr(message, "love")) { + error_setg(errp, "the word 'love' is not allowed"); + return; + } + printf("%s\n", message); + } else { + printf("Hello, world\n"); + } +} + +The first argument to the error_setg() function is the Error pointer +to pointer, which is passed to all QMP functions. 
The next argument is a human-readable +description of the error; this is a free-form printf-like string. + +Let's test the example above. Build qemu, run it as described in the "Testing" +section, and then issue the following command: + +{ "execute": "hello-world", "arguments": { "message": "all you need is love" } } + +The QMP server's response should be: + +{ + "error": { + "class": "GenericError", + "desc": "the word 'love' is not allowed" + } +} + +As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR +(done by default when using error_setg()). There are two exceptions to +this rule: + + 1. A non-generic ErrorClass value exists* for the failure you want to report + (e.g. DeviceNotFound) + + 2. Management applications have to take special action on the failure you + want to report, hence you have to add a new ErrorClass value so that they + can check for it + +If the failure you want to report falls into one of the two cases above, +use error_set() with a second argument of an ErrorClass value. + + * All existing ErrorClass values are defined in the qapi-schema.json file + +=== Command Documentation === + +There's only one step missing to make "hello-world"'s implementation complete, +and that's its documentation in the schema file. + +This is very important. No QMP command will be accepted in QEMU without proper +documentation. + +There are many examples of such documentation in the schema file already, but +here goes "hello-world"'s new entry for the qapi-schema.json file: + +## +# @hello-world +# +# Print a client provided string to the standard output stream. +# +# @message: #optional string to be printed +# +# Returns: Nothing on success. +# +# Notes: if @message is not provided, the "Hello, world" string will +# be printed instead +# +# Since: <next qemu stable release, eg. 1.0> +## +{ 'command': 'hello-world', 'data': { '*message': 'str' } } + +Please note that the "Returns" clause is optional if a command doesn't return +any data nor any errors. 
+ +=== Implementing the HMP command === + +Now that the QMP command is in place, we can also make it available in the human +monitor (HMP). + +With the introduction of the QAPI, HMP commands make QMP calls. Most of the +time HMP commands are simple wrappers. All HMP command implementations live in +the hmp.c file. + +Here's the implementation of the "hello-world" HMP command: + +void hmp_hello_world(Monitor *mon, const QDict *qdict) +{ + const char *message = qdict_get_try_str(qdict, "message"); + Error *err = NULL; + + qmp_hello_world(!!message, message, &err); + if (err) { + monitor_printf(mon, "%s\n", error_get_pretty(err)); + error_free(err); + return; + } +} + +Also, you have to add the function's prototype to the hmp.h file. + +There are three important points to be noticed: + +1. The "mon" and "qdict" arguments are mandatory for all HMP functions. The + former is the monitor object. The latter is how the monitor passes + arguments entered by the user to the command implementation +2. hmp_hello_world() performs error checking. In this example we just print + the error description to the user, but we could do more, like taking + different actions depending on the error qmp_hello_world() returns +3. The "err" variable must be initialized to NULL before performing the + QMP call + +There's one last step to actually make the command available to monitor users: +we should add it to the hmp-commands.hx file: + + { + .name = "hello-world", + .args_type = "message:s?", + .params = "hello-world [message]", + .help = "Print message to the standard output", + .mhandler.cmd = hmp_hello_world, + }, + +STEXI +@item hello_world @var{message} +@findex hello_world +Print message to the standard output +ETEXI + +To test this you have to open a user monitor and issue the "hello-world" +command. It might be instructive to check the command's documentation with +HMP's "help" command. + +Please check the "-monitor" command-line option to learn how to open a user +monitor. 
+ +== Writing a command that returns data == + +A QMP command is capable of returning any data the QAPI supports like integers, +strings, booleans, enumerations and user defined types. + +In this section we will focus on user defined types. Please, check the QAPI +documentation for information about the other types. + +=== User Defined Types === + +FIXME This example needs to be redone after commit 6d32717 + +For this example we will write the query-alarm-clock command, which returns +information about QEMU's timer alarm. For more information about it, please +check the "-clock" command-line option. + +We want to return two pieces of information. The first one is the alarm clock's +name. The second one is when the next alarm will fire. The former information is +returned as a string, the latter is an integer in nanoseconds (which is not +very useful in practice, as the timer has probably already fired when the +information reaches the client). + +The best way to return that data is to create a new QAPI type, as shown below: + +## +# @QemuAlarmClock +# +# QEMU alarm clock information. +# +# @clock-name: The alarm clock method's name. +# +# @next-deadline: #optional The time (in nanoseconds) the next alarm will fire. +# +# Since: 1.0 +## +{ 'type': 'QemuAlarmClock', + 'data': { 'clock-name': 'str', '*next-deadline': 'int' } } + +The "type" keyword defines a new QAPI type. Its "data" member contains the +type's members. In this example our members are the "clock-name" and the +"next-deadline" one, which is optional. + +Now let's define the query-alarm-clock command: + +## +# @query-alarm-clock +# +# Return information about QEMU's alarm clock. +# +# Returns a @QemuAlarmClock instance describing the alarm clock method +# being currently used by QEMU (this is usually set by the '-clock' +# command-line option). +# +# Since: 1.0 +## +{ 'command': 'query-alarm-clock', 'returns': 'QemuAlarmClock' } + +Notice the "returns" keyword. 
As its name suggests, it's used to define the +data returned by a command. + +It's time to implement the qmp_query_alarm_clock() function; you can put it +in the qemu-timer.c file: + +QemuAlarmClock *qmp_query_alarm_clock(Error **errp) +{ + QemuAlarmClock *clock; + int64_t deadline; + + clock = g_malloc0(sizeof(*clock)); + + deadline = qemu_next_alarm_deadline(); + if (deadline > 0) { + clock->has_next_deadline = true; + clock->next_deadline = deadline; + } + clock->clock_name = g_strdup(alarm_timer->name); + + return clock; +} + +There are a number of things to be noticed: + +1. The QemuAlarmClock type is automatically generated by the QAPI framework, + its members correspond to the type's specification in the schema file +2. As specified in the schema file, the function returns a QemuAlarmClock + instance and takes no arguments (besides the "errp" one, which is mandatory + for all QMP functions) +3. The "clock" variable (which will point to our QAPI type instance) is + allocated by the regular g_malloc0() function. Note that we chose to + initialize the memory to zero. This is recommended for all QAPI types, as + it helps avoid bad surprises (especially with booleans) +4. Remember that "next_deadline" is optional? All optional members have a + 'has_TYPE_NAME' member that should be properly set by the implementation, + as shown above +5. Even static strings, such as "alarm_timer->name", should be dynamically + allocated by the implementation. This is so because the QAPI also generates + a function to free its types and it cannot distinguish between dynamically + and statically allocated strings +6. You have to include the "qmp-commands.h" header file in qemu-timer.c, + otherwise qemu won't build + +The last step is to add the corresponding entry in the qmp-commands.hx file: + + { + .name = "query-alarm-clock", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_query_alarm_clock, + }, + +Time to test the new command. 
Build qemu, run it as described in the "Testing" +section and try this: + +{ "execute": "query-alarm-clock" } +{ + "return": { + "next-deadline": 2368219, + "clock-name": "dynticks" + } +} + +==== The HMP command ==== + +Here's the HMP counterpart of the query-alarm-clock command: + +void hmp_info_alarm_clock(Monitor *mon) +{ + QemuAlarmClock *clock; + Error *err = NULL; + + clock = qmp_query_alarm_clock(&err); + if (err) { + monitor_printf(mon, "Could not query alarm clock information\n"); + error_free(err); + return; + } + + monitor_printf(mon, "Alarm clock method in use: '%s'\n", clock->clock_name); + if (clock->has_next_deadline) { + monitor_printf(mon, "Next alarm will fire in %" PRId64 " nanoseconds\n", + clock->next_deadline); + } + + qapi_free_QemuAlarmClock(clock); +} + +It's important to notice that hmp_info_alarm_clock() calls +qapi_free_QemuAlarmClock() to free the data returned by qmp_query_alarm_clock(). +For user defined types, the QAPI will generate a qapi_free_QAPI_TYPE_NAME() +function, and that's what you have to use to free the types you define; use +qapi_free_QAPI_TYPE_NAMEList() for list types (explained in the next section). +If the QMP call returns a string, then you should use g_free() to free it. + +Also note that hmp_info_alarm_clock() performs error handling. That's not +strictly required if you're sure the QMP function doesn't return errors, but +it's good practice to always check for errors. + +Another important detail is that HMP's "info" commands don't go into the +hmp-commands.hx file. Instead, they go into the info_cmds[] table, which is defined +in the monitor.c file. The entry for the "info alarmclock" command follows: + + { + .name = "alarmclock", + .args_type = "", + .params = "", + .help = "show information about the alarm clock", + .mhandler.info = hmp_info_alarm_clock, + }, + +To test this, run qemu and type "info alarmclock" in the user monitor. 
+ +=== Returning Lists === + +For this example, we're going to return all available methods for the timer +alarm, which is pretty much what the command-line option "-clock ?" does, +except that we're also going to report which method is in use. + +The first step is to define a new type: + +## +# @TimerAlarmMethod +# +# Timer alarm method information. +# +# @method-name: The method's name. +# +# @current: true if this alarm method is currently in use, false otherwise +# +# Since: 1.0 +## +{ 'type': 'TimerAlarmMethod', + 'data': { 'method-name': 'str', 'current': 'bool' } } + +The command will be called "query-alarm-methods"; here is its schema +specification: + +## +# @query-alarm-methods +# +# Returns information about available alarm methods. +# +# Returns: a list of @TimerAlarmMethod for each method +# +# Since: 1.0 +## +{ 'command': 'query-alarm-methods', 'returns': ['TimerAlarmMethod'] } + +Notice the syntax for returning lists "'returns': ['TimerAlarmMethod']"; this +should be read as "returns a list of TimerAlarmMethod instances". + +The C implementation follows: + +TimerAlarmMethodList *qmp_query_alarm_methods(Error **errp) +{ + TimerAlarmMethodList *method_list = NULL; + const struct qemu_alarm_timer *p; + bool current = true; + + for (p = alarm_timers; p->name; p++) { + TimerAlarmMethodList *info = g_malloc0(sizeof(*info)); + info->value = g_malloc0(sizeof(*info->value)); + info->value->method_name = g_strdup(p->name); + info->value->current = current; + + current = false; + + info->next = method_list; + method_list = info; + } + + return method_list; +} + +The most important difference from the previous examples is the +TimerAlarmMethodList type, which is automatically generated by the QAPI from +the TimerAlarmMethod type. + +Each list node is represented by a TimerAlarmMethodList instance. We have to +allocate it, and that's done inside the for loop: the "info" pointer points to +an allocated node. 
We also have to allocate the node's contents, which are +stored in its "value" member. In our example, the "value" member is a pointer +to a TimerAlarmMethod instance. + +Notice that the "current" variable is used as "true" only in the first +iteration of the loop. That's because the alarm timer method in use is the +first element of the alarm_timers array. Also notice that QAPI lists are handled +by hand and we return the head of the list. + +To test this you have to add the corresponding qmp-commands.hx entry: + + { + .name = "query-alarm-methods", + .args_type = "", + .mhandler.cmd_new = qmp_marshal_query_alarm_methods, + }, + +Now build qemu, run it as explained in the "Testing" section and try our new +command: + +{ "execute": "query-alarm-methods" } +{ + "return": [ + { + "current": false, + "method-name": "unix" + }, + { + "current": true, + "method-name": "dynticks" + } + ] +} + +The HMP counterpart is a bit more complex than previous examples because it +has to traverse the list; it's shown below for reference: + +void hmp_info_alarm_methods(Monitor *mon) +{ + TimerAlarmMethodList *method_list, *method; + Error *err = NULL; + + method_list = qmp_query_alarm_methods(&err); + if (err) { + monitor_printf(mon, "Could not query alarm methods\n"); + error_free(err); + return; + } + + for (method = method_list; method; method = method->next) { + monitor_printf(mon, "%c %s\n", method->value->current ? '*' : ' ', + method->value->method_name); + } + + qapi_free_TimerAlarmMethodList(method_list); +} diff --git a/src/docs/xbzrle.txt b/src/docs/xbzrle.txt new file mode 100644 index 0000000..52c8511 --- /dev/null +++ b/src/docs/xbzrle.txt @@ -0,0 +1,136 @@ +XBZRLE (Xor Based Zero Run Length Encoding) +=========================================== + +Using XBZRLE (Xor Based Zero Run Length Encoding) allows for the reduction +of VM downtime and the total live-migration time of virtual machines. 
+It is particularly useful for virtual machines running memory-write-intensive +workloads that are typical of large enterprise applications such as SAP ERP +Systems, and generally speaking for any application that uses a sparse memory +update pattern. + +Instead of sending the changed guest memory page this solution will send a +compressed version of the updates, thus reducing the amount of data sent during +live migration. +In order to be able to calculate the update, the previous memory pages need to +be stored on the source. Those pages are stored in a dedicated cache +(hash table) and are accessed by their address. +The larger the cache size the better the chances are that the page has already +been stored in the cache. +A small cache size will result in a high cache miss rate. +Cache size can be changed before and during migration. + +Format +======= + +The compression format performs a XOR between the previous and current content +of the page, where zero represents an unchanged value. +The page data delta is represented by zero and non-zero runs. +A zero run is represented by its length (in bytes). +A non-zero run is represented by its length (in bytes) and the new data. +The run length is encoded using ULEB128 (http://en.wikipedia.org/wiki/LEB128) + +There can be more than one valid encoding; the sender may send a longer encoding +for the benefit of reducing computation cost. + +page = zrun nzrun + | zrun nzrun page + +zrun = length + +nzrun = length byte... + +length = uleb128 encoded integer + +On the sender side XBZRLE is used as a compact delta encoding of page updates, +retrieving the old page content from the cache (default size of 64 MB). The +receiving side uses the existing page's content and XBZRLE to decode the new +page's content. + +This work was originally based on research results published at +VEE 2011: Evaluation of Delta Compression Techniques for Efficient Live +Migration of Large Virtual Machines by Benoit, Svard, Tordsson and Elmroth. 
+Additionally the delta encoder XBRLE was improved further using the XBZRLE +instead. + +XBZRLE has a sustained bandwidth of 2-2.5 GB/s for typical workloads making it +ideal for in-line, real-time encoding such as is needed for live-migration. + +Example +old buffer: +1001 zeros +05 06 07 08 09 0a 0b 0c 0d 0e 0f 10 11 12 13 68 00 00 6b 00 6d +3074 zeros + +new buffer: +1001 zeros +01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 68 00 00 67 00 69 +3074 zeros + +encoded buffer: + +encoded length 24 +e9 07 0f 01 02 03 04 05 06 07 08 09 0a 0b 0c 0d 0e 0f 03 01 67 01 01 69 + +Cache update strategy +===================== +Keeping the hot pages in the cache is effective for decreased cache +misses. XBZRLE uses a counter as the age of each page. The counter will +increase after each ram dirty bitmap sync. When a cache conflict is +detected, XBZRLE will only evict pages in the cache that are older than +a threshold. + +Usage +====================== +1. Verify the destination QEMU version is able to decode the new format. + {qemu} info migrate_capabilities + {qemu} xbzrle: off , ... + +2. Activate xbzrle on both source and destination: + {qemu} migrate_set_capability xbzrle on + +3. Set the XBZRLE cache size - the cache size is in MBytes and should be a +power of 2. The cache default value is 64MBytes. (on source only) + {qemu} migrate_set_cache_size 256m + +4. Start outgoing migration + {qemu} migrate -d tcp:destination.host:4444 + {qemu} info migrate + capabilities: xbzrle: on + Migration status: active + transferred ram: A kbytes + remaining ram: B kbytes + total ram: C kbytes + total time: D milliseconds + duplicate: E pages + normal: F pages + normal bytes: G kbytes + cache size: H bytes + xbzrle transferred: I kbytes + xbzrle pages: J pages + xbzrle cache miss: K + xbzrle overflow : L + +xbzrle cache-miss: the number of cache misses to date - high cache-miss rate +indicates that the cache size is set too low. 
+xbzrle overflow: the number of encoding overflows, where the delta +could not be compressed. This can happen if the changes in the pages are too +large or there are many short changes; for example, changing every second byte +(half a page). + +Testing: Testing indicated that live migration with XBZRLE was completed in 110 +seconds, whereas without it the migration was not able to complete. + +A simple synthetic memory r/w load generator: +.. include <stdlib.h> +.. include <stdio.h> +.. int main() +.. { +.. char *buf = (char *) calloc(4096, 4096); +.. while (1) { +.. int i; +.. for (i = 0; i < 4096 * 4; i++) { +.. buf[i * 4096 / 4]++; +.. } +.. printf("."); +.. } +.. } diff --git a/src/docs/xen-save-devices-state.txt b/src/docs/xen-save-devices-state.txt new file mode 100644 index 0000000..92e08db --- /dev/null +++ b/src/docs/xen-save-devices-state.txt @@ -0,0 +1,34 @@ += Save Devices = + +QEMU has code to load/save the state of the guest that it is running. +These are two complementary operations. Saving the state does just +that: it saves the state of each device that the guest is running. + +These operations are normally used with migration (see migration.txt), +however it is also possible to save the state of all devices to a file, +without saving the RAM or the block devices of the VM. + +This operation is called "xen-save-devices-state" (see +QMP/qmp-commands.txt). + + +The binary format used in the file is the following: + + +------------------------------------------- + +32 bit big endian: QEMU_VM_FILE_MAGIC +32 bit big endian: QEMU_VM_FILE_VERSION + +for_each_device +{ + 8 bit: QEMU_VM_SECTION_FULL + 32 bit big endian: section_id + 8 bit: idstr (ID string) length + string: idstr (ID string) + 32 bit big endian: instance_id + 32 bit big endian: version_id + buffer: device specific data +} + +8 bit: QEMU_VM_EOF |