297 files changed, 15329 insertions, 2679 deletions
diff --git a/.gitignore b/.gitignore
index 75bceb9..88a80ff 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,8 @@
 /*-darwin-user
 /*-linux-user
 /*-bsd-user
+/ivshmem-client
+/ivshmem-server
 /libdis*
 /libuser
 /linux-headers/asm
diff --git a/MAINTAINERS b/MAINTAINERS
index fc8abe8..9e1fa72 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -237,9 +237,14 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Maintained
 F: target-s390x/kvm.c
+F: target-s390x/ioinst.[ch]
+F: target-s390x/machine.c
 F: hw/intc/s390_flic.c
 F: hw/intc/s390_flic_kvm.c
 F: include/hw/s390x/s390_flic.h
+F: gdb-xml/s390*.xml
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next
 
 X86
 M: Paolo Bonzini <pbonzini@redhat.com>
@@ -637,15 +642,13 @@ M: Christian Borntraeger <borntraeger@de.ibm.com>
 M: Alexander Graf <agraf@suse.de>
 S: Supported
 F: hw/char/sclp*.[hc]
-F: hw/s390x/s390-virtio-ccw.c
-F: hw/s390x/css.[hc]
-F: hw/s390x/sclp*.[hc]
-F: hw/s390x/ipl*.[hc]
-F: hw/s390x/*pci*.[hc]
-F: hw/s390x/s390-skeys*.c
+F: hw/s390x/
+X: hw/s390x/s390-virtio-bus.[ch]
 F: include/hw/s390x/
 F: pc-bios/s390-ccw/
-T: git git://github.com/cohuck/qemu virtio-ccw-upstr
+F: hw/watchdog/wdt_diag288.c
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next
 
 UniCore32 Machines
 -------------
@@ -872,7 +875,8 @@ M: Cornelia Huck <cornelia.huck@de.ibm.com>
 M: Christian Borntraeger <borntraeger@de.ibm.com>
 S: Supported
 F: hw/s390x/virtio-ccw.[hc]
-T: git git://github.com/cohuck/qemu virtio-ccw-upstr
+T: git git://github.com/cohuck/qemu.git s390-next
+T: git git://github.com/borntraeger/qemu.git s390-next
 
 virtio-input
 M: Gerd Hoffmann <kraxel@redhat.com>
diff --git a/Makefile.objs b/Makefile.objs
index fe02ee2..77be052 100644
--- a/Makefile.objs
+++ b/Makefile.objs
@@ -54,6 +54,8 @@ common-obj-y += audio/
 common-obj-y += hw/
 common-obj-y += accel.o
 
+common-obj-y += replay/
+
 common-obj-y += ui/
 common-obj-y += bt-host.o bt-vhci.o
 bt-host.o-cflags := $(BLUEZ_CFLAGS)
diff --git a/VERSION b/VERSION
index 9cf89c6..38f621b 100644
--- a/VERSION
+++ b/VERSION
@@ -1 +1 @@
-2.4.50
+2.4.90
diff --git a/aio-posix.c b/aio-posix.c
index 0467f23..482b316 100644
--- a/aio-posix.c
+++ b/aio-posix.c
@@ -17,6 +17,9 @@
 #include "block/block.h"
 #include "qemu/queue.h"
 #include "qemu/sockets.h"
+#ifdef CONFIG_EPOLL
+#include <sys/epoll.h>
+#endif
 
 struct AioHandler
 {
@@ -29,6 +32,162 @@ struct AioHandler
     QLIST_ENTRY(AioHandler) node;
 };
 
+#ifdef CONFIG_EPOLL
+
+/* The fd number threashold to switch to epoll */
+#define EPOLL_ENABLE_THRESHOLD 64
+
+static void aio_epoll_disable(AioContext *ctx)
+{
+    ctx->epoll_available = false;
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    ctx->epoll_enabled = false;
+    close(ctx->epollfd);
+}
+
+static inline int epoll_events_from_pfd(int pfd_events)
+{
+    return (pfd_events & G_IO_IN ? EPOLLIN : 0) |
+           (pfd_events & G_IO_OUT ? EPOLLOUT : 0) |
+           (pfd_events & G_IO_HUP ? EPOLLHUP : 0) |
+           (pfd_events & G_IO_ERR ? EPOLLERR : 0);
+}
+
+static bool aio_epoll_try_enable(AioContext *ctx)
+{
+    AioHandler *node;
+    struct epoll_event event;
+
+    QLIST_FOREACH(node, &ctx->aio_handlers, node) {
+        int r;
+        if (node->deleted || !node->pfd.events) {
+            continue;
+        }
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        event.data.ptr = node;
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+        if (r) {
+            return false;
+        }
+    }
+    ctx->epoll_enabled = true;
+    return true;
+}
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+    struct epoll_event event;
+    int r;
+
+    if (!ctx->epoll_enabled) {
+        return;
+    }
+    if (!node->pfd.events) {
+        r = epoll_ctl(ctx->epollfd, EPOLL_CTL_DEL, node->pfd.fd, &event);
+        if (r) {
+            aio_epoll_disable(ctx);
+        }
+    } else {
+        event.data.ptr = node;
+        event.events = epoll_events_from_pfd(node->pfd.events);
+        if (is_new) {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_ADD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        } else {
+            r = epoll_ctl(ctx->epollfd, EPOLL_CTL_MOD, node->pfd.fd, &event);
+            if (r) {
+                aio_epoll_disable(ctx);
+            }
+        }
+    }
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    AioHandler *node;
+    int i, ret = 0;
+    struct epoll_event events[128];
+
+    assert(npfd == 1);
+    assert(pfds[0].fd == ctx->epollfd);
+    if (timeout > 0) {
+        ret = qemu_poll_ns(pfds, npfd, timeout);
+    }
+    if (timeout <= 0 || ret > 0) {
+        ret = epoll_wait(ctx->epollfd, events,
+                         sizeof(events) / sizeof(events[0]),
+                         timeout);
+        if (ret <= 0) {
+            goto out;
+        }
+        for (i = 0; i < ret; i++) {
+            int ev = events[i].events;
+            node = events[i].data.ptr;
+            node->pfd.revents = (ev & EPOLLIN ? G_IO_IN : 0) |
+                (ev & EPOLLOUT ? G_IO_OUT : 0) |
+                (ev & EPOLLHUP ? G_IO_HUP : 0) |
+                (ev & EPOLLERR ? G_IO_ERR : 0);
+        }
+    }
+out:
+    return ret;
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    /* Fall back to ppoll when external clients are disabled. */
+    return !aio_external_disabled(ctx) && ctx->epoll_enabled;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                                 unsigned npfd, int64_t timeout)
+{
+    if (!ctx->epoll_available) {
+        return false;
+    }
+    if (aio_epoll_enabled(ctx)) {
+        return true;
+    }
+    if (npfd >= EPOLL_ENABLE_THRESHOLD) {
+        if (aio_epoll_try_enable(ctx)) {
+            return true;
+        } else {
+            aio_epoll_disable(ctx);
+        }
+    }
+    return false;
+}
+
+#else
+
+static void aio_epoll_update(AioContext *ctx, AioHandler *node, bool is_new)
+{
+}
+
+static int aio_epoll(AioContext *ctx, GPollFD *pfds,
+                     unsigned npfd, int64_t timeout)
+{
+    assert(false);
+}
+
+static bool aio_epoll_enabled(AioContext *ctx)
+{
+    return false;
+}
+
+static bool aio_epoll_check_poll(AioContext *ctx, GPollFD *pfds,
+                          unsigned npfd, int64_t timeout)
+{
+    return false;
+}
+
+#endif
+
 static AioHandler *find_aio_handler(AioContext *ctx, int fd)
 {
     AioHandler *node;
@@ -50,6 +209,8 @@ void aio_set_fd_handler(AioContext *ctx,
                         void *opaque)
 {
     AioHandler *node;
+    bool is_new = false;
+    bool deleted = false;
 
     node = find_aio_handler(ctx, fd);
 
@@ -68,7 +229,7 @@ void aio_set_fd_handler(AioContext *ctx,
                  * releasing the walking_handlers lock.
                  */
                 QLIST_REMOVE(node, node);
-                g_free(node);
+                deleted = true;
             }
         }
     } else {
@@ -79,6 +240,7 @@ void aio_set_fd_handler(AioContext *ctx,
             QLIST_INSERT_HEAD(&ctx->aio_handlers, node, node);
 
             g_source_add_poll(&ctx->source, &node->pfd);
+            is_new = true;
         }
         /* Update handler with latest information */
         node->io_read = io_read;
@@ -90,7 +252,11 @@ void aio_set_fd_handler(AioContext *ctx,
         node->pfd.events |= (io_write ? G_IO_OUT | G_IO_ERR : 0);
     }
 
+    aio_epoll_update(ctx, node, is_new);
     aio_notify(ctx);
+    if (deleted) {
+        g_free(node);
+    }
 }
 
 void aio_set_event_notifier(AioContext *ctx,
@@ -262,6 +428,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
     /* fill pollfds */
     QLIST_FOREACH(node, &ctx->aio_handlers, node) {
         if (!node->deleted && node->pfd.events
+            && !aio_epoll_enabled(ctx)
             && aio_node_check(ctx, node->is_external)) {
             add_pollfd(node);
         }
@@ -273,7 +440,17 @@ bool aio_poll(AioContext *ctx, bool blocking)
     if (timeout) {
         aio_context_release(ctx);
     }
-    ret = qemu_poll_ns((GPollFD *)pollfds, npfd, timeout);
+    if (aio_epoll_check_poll(ctx, pollfds, npfd, timeout)) {
+        AioHandler epoll_handler;
+
+        epoll_handler.pfd.fd = ctx->epollfd;
+        epoll_handler.pfd.events = G_IO_IN | G_IO_OUT | G_IO_HUP | G_IO_ERR;
+        npfd = 0;
+        add_pollfd(&epoll_handler);
+        ret = aio_epoll(ctx, pollfds, npfd, timeout);
+    } else  {
+        ret = qemu_poll_ns(pollfds, npfd, timeout);
+    }
     if (blocking) {
         atomic_sub(&ctx->notify_me, 2);
     }
@@ -302,3 +479,16 @@ bool aio_poll(AioContext *ctx, bool blocking)
 
     return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+#ifdef CONFIG_EPOLL
+    assert(!ctx->epollfd);
+    ctx->epollfd = epoll_create1(EPOLL_CLOEXEC);
+    if (ctx->epollfd == -1) {
+        ctx->epoll_available = false;
+    } else {
+        ctx->epoll_available = true;
+    }
+#endif
+}
diff --git a/aio-win32.c b/aio-win32.c
index 43c4c79..cdc4456 100644
--- a/aio-win32.c
+++ b/aio-win32.c
@@ -369,3 +369,7 @@ bool aio_poll(AioContext *ctx, bool blocking)
     aio_context_release(ctx);
     return progress;
 }
+
+void aio_context_setup(AioContext *ctx, Error **errp)
+{
+}
diff --git a/async.c b/async.c
index bdc64a3..e106072 100644
--- a/async.c
+++ b/async.c
@@ -59,6 +59,11 @@ QEMUBH *aio_bh_new(AioContext *ctx, QEMUBHFunc *cb, void *opaque)
     return bh;
 }
 
+void aio_bh_call(QEMUBH *bh)
+{
+    bh->cb(bh->opaque);
+}
+
 /* Multiple occurrences of aio_bh_poll cannot be called concurrently */
 int aio_bh_poll(AioContext *ctx)
 {
@@ -84,7 +89,7 @@ int aio_bh_poll(AioContext *ctx)
                 ret = 1;
             }
             bh->idle = 0;
-            bh->cb(bh->opaque);
+            aio_bh_call(bh);
         }
     }
 
@@ -320,12 +325,18 @@ AioContext *aio_context_new(Error **errp)
 {
     int ret;
     AioContext *ctx;
+    Error *local_err = NULL;
+
     ctx = (AioContext *) g_source_new(&aio_source_funcs, sizeof(AioContext));
+    aio_context_setup(ctx, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        goto fail;
+    }
     ret = event_notifier_init(&ctx->notifier, false);
     if (ret < 0) {
-        g_source_destroy(&ctx->source);
         error_setg_errno(errp, -ret, "Failed to initialize event notifier");
-        return NULL;
+        goto fail;
     }
     g_source_set_can_recurse(&ctx->source, true);
     aio_set_event_notifier(ctx, &ctx->notifier,
@@ -340,6 +351,9 @@ AioContext *aio_context_new(Error **errp)
     ctx->notify_dummy_bh = aio_bh_new(ctx, notify_dummy_bh, NULL);
 
     return ctx;
+fail:
+    g_source_destroy(&ctx->source);
+    return NULL;
 }
 
 void aio_context_ref(AioContext *ctx)
diff --git a/balloon.c b/balloon.c
index 5d69e8a..0f45d1b 100644
--- a/balloon.c
+++ b/balloon.c
@@ -36,6 +36,17 @@
 static QEMUBalloonEvent *balloon_event_fn;
 static QEMUBalloonStatus *balloon_stat_fn;
 static void *balloon_opaque;
+static bool balloon_inhibited;
+
+bool qemu_balloon_is_inhibited(void)
+{
+    return balloon_inhibited;
+}
+
+void qemu_balloon_inhibit(bool state)
+{
+    balloon_inhibited = state;
+}
 
 static bool have_balloon(Error **errp)
 {
diff --git a/block.c b/block.c
index e9f40dc..3a7324b 100644
--- a/block.c
+++ b/block.c
@@ -73,8 +73,7 @@ struct BdrvDirtyBitmap {
 
 #define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */
 
-static QTAILQ_HEAD(, BlockDriverState) bdrv_states =
-    QTAILQ_HEAD_INITIALIZER(bdrv_states);
+struct BdrvStates bdrv_states = QTAILQ_HEAD_INITIALIZER(bdrv_states);
 
 static QTAILQ_HEAD(, BlockDriverState) graph_bdrv_states =
     QTAILQ_HEAD_INITIALIZER(graph_bdrv_states);
@@ -1394,6 +1393,7 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
     BlockDriverState *bs;
     BlockDriver *drv = NULL;
     const char *drvname;
+    const char *backing;
     Error *local_err = NULL;
     int snapshot_flags = 0;
 
@@ -1461,6 +1461,12 @@ static int bdrv_open_inherit(BlockDriverState **pbs, const char *filename,
 
     assert(drvname || !(flags & BDRV_O_PROTOCOL));
 
+    backing = qdict_get_try_str(options, "backing");
+    if (backing && *backing == '\0') {
+        flags |= BDRV_O_NO_BACKING;
+        qdict_del(options, "backing");
+    }
+
     bs->open_flags = flags;
     bs->options = options;
     options = qdict_clone_shallow(options);
@@ -1795,8 +1801,7 @@ int bdrv_reopen_prepare(BDRVReopenState *reopen_state, BlockReopenQueue *queue,
 
     ret = bdrv_flush(reopen_state->bs);
     if (ret) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR, "Error (%s) flushing drive",
-                  strerror(-ret));
+        error_setg_errno(errp, -ret, "Error flushing drive");
         goto error;
     }
 
@@ -1901,7 +1906,7 @@ void bdrv_close(BlockDriverState *bs)
     }
 
     /* Disable I/O limits and drain all pending throttled requests */
-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
         bdrv_io_limits_disable(bs);
     }
 
@@ -2683,12 +2688,12 @@ BlockDriverState *bdrv_lookup_bs(const char *device,
         blk = blk_by_name(device);
 
         if (blk) {
-            if (!blk_bs(blk)) {
+            bs = blk_bs(blk);
+            if (!bs) {
                 error_setg(errp, "Device '%s' has no medium", device);
-                return NULL;
             }
 
-            return blk_bs(blk);
+            return bs;
         }
     }
 
@@ -3399,10 +3404,25 @@ void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
     hbitmap_reset(bitmap->bitmap, cur_sector, nr_sectors);
 }
 
-void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap)
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out)
+{
+    assert(bdrv_dirty_bitmap_enabled(bitmap));
+    if (!out) {
+        hbitmap_reset_all(bitmap->bitmap);
+    } else {
+        HBitmap *backup = bitmap->bitmap;
+        bitmap->bitmap = hbitmap_alloc(bitmap->size,
+                                       hbitmap_granularity(backup));
+        *out = backup;
+    }
+}
+
+void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in)
 {
+    HBitmap *tmp = bitmap->bitmap;
     assert(bdrv_dirty_bitmap_enabled(bitmap));
-    hbitmap_reset_all(bitmap->bitmap);
+    bitmap->bitmap = in;
+    hbitmap_free(tmp);
 }
 
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector,
@@ -3706,7 +3726,7 @@ void bdrv_detach_aio_context(BlockDriverState *bs)
         baf->detach_aio_context(baf->opaque);
     }
 
-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
         throttle_timers_detach_aio_context(&bs->throttle_timers);
     }
     if (bs->drv->bdrv_detach_aio_context) {
@@ -3742,7 +3762,7 @@ void bdrv_attach_aio_context(BlockDriverState *bs,
     if (bs->drv->bdrv_attach_aio_context) {
         bs->drv->bdrv_attach_aio_context(bs, new_context);
     }
-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
         throttle_timers_attach_aio_context(&bs->throttle_timers, new_context);
     }
 
diff --git a/block/accounting.c b/block/accounting.c
index a423560..185025e 100644
--- a/block/accounting.c
+++ b/block/accounting.c
@@ -2,6 +2,7 @@
  * QEMU System Emulator block accounting
  *
  * Copyright (c) 2011 Christoph Hellwig
+ * Copyright (c) 2015 Igalia, S.L.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -25,6 +26,54 @@
 #include "block/accounting.h"
 #include "block/block_int.h"
 #include "qemu/timer.h"
+#include "sysemu/qtest.h"
+
+static QEMUClockType clock_type = QEMU_CLOCK_REALTIME;
+static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000;
+
+void block_acct_init(BlockAcctStats *stats, bool account_invalid,
+                     bool account_failed)
+{
+    stats->account_invalid = account_invalid;
+    stats->account_failed = account_failed;
+
+    if (qtest_enabled()) {
+        clock_type = QEMU_CLOCK_VIRTUAL;
+    }
+}
+
+void block_acct_cleanup(BlockAcctStats *stats)
+{
+    BlockAcctTimedStats *s, *next;
+    QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) {
+        g_free(s);
+    }
+}
+
+void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length)
+{
+    BlockAcctTimedStats *s;
+    unsigned i;
+
+    s = g_new0(BlockAcctTimedStats, 1);
+    s->interval_length = interval_length;
+    QSLIST_INSERT_HEAD(&stats->intervals, s, entries);
+
+    for (i = 0; i < BLOCK_MAX_IOTYPE; i++) {
+        timed_average_init(&s->latency[i], clock_type,
+                           (uint64_t) interval_length * NANOSECONDS_PER_SECOND);
+    }
+}
+
+BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats,
+                                              BlockAcctTimedStats *s)
+{
+    if (s == NULL) {
+        return QSLIST_FIRST(&stats->intervals);
+    } else {
+        return QSLIST_NEXT(s, entries);
+    }
+}
 
 void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
                       int64_t bytes, enum BlockAcctType type)
@@ -32,20 +81,71 @@ void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
     assert(type < BLOCK_MAX_IOTYPE);
 
     cookie->bytes = bytes;
-    cookie->start_time_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME);
+    cookie->start_time_ns = qemu_clock_get_ns(clock_type);
     cookie->type = type;
 }
 
 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie)
 {
+    BlockAcctTimedStats *s;
+    int64_t time_ns = qemu_clock_get_ns(clock_type);
+    int64_t latency_ns = time_ns - cookie->start_time_ns;
+
+    if (qtest_enabled()) {
+        latency_ns = qtest_latency_ns;
+    }
+
     assert(cookie->type < BLOCK_MAX_IOTYPE);
 
     stats->nr_bytes[cookie->type] += cookie->bytes;
     stats->nr_ops[cookie->type]++;
-    stats->total_time_ns[cookie->type] +=
-        qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - cookie->start_time_ns;
+    stats->total_time_ns[cookie->type] += latency_ns;
+    stats->last_access_time_ns = time_ns;
+
+    QSLIST_FOREACH(s, &stats->intervals, entries) {
+        timed_average_account(&s->latency[cookie->type], latency_ns);
+    }
+}
+
+void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie)
+{
+    assert(cookie->type < BLOCK_MAX_IOTYPE);
+
+    stats->failed_ops[cookie->type]++;
+
+    if (stats->account_failed) {
+        BlockAcctTimedStats *s;
+        int64_t time_ns = qemu_clock_get_ns(clock_type);
+        int64_t latency_ns = time_ns - cookie->start_time_ns;
+
+        if (qtest_enabled()) {
+            latency_ns = qtest_latency_ns;
+        }
+
+        stats->total_time_ns[cookie->type] += latency_ns;
+        stats->last_access_time_ns = time_ns;
+
+        QSLIST_FOREACH(s, &stats->intervals, entries) {
+            timed_average_account(&s->latency[cookie->type], latency_ns);
+        }
+    }
 }
 
+void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type)
+{
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    /* block_acct_done() and block_acct_failed() update
+     * total_time_ns[], but this one does not. The reason is that
+     * invalid requests are accounted during their submission,
+     * therefore there's no actual I/O involved. */
+
+    stats->invalid_ops[type]++;
+
+    if (stats->account_invalid) {
+        stats->last_access_time_ns = qemu_clock_get_ns(clock_type);
+    }
+}
 
 void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
                       int num_requests)
@@ -53,3 +153,20 @@ void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
     assert(type < BLOCK_MAX_IOTYPE);
     stats->merged[type] += num_requests;
 }
+
+int64_t block_acct_idle_time_ns(BlockAcctStats *stats)
+{
+    return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns;
+}
+
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+                              enum BlockAcctType type)
+{
+    uint64_t sum, elapsed;
+
+    assert(type < BLOCK_MAX_IOTYPE);
+
+    sum = timed_average_sum(&stats->latency[type], &elapsed);
+
+    return (double) sum / elapsed;
+}
diff --git a/block/backup.c b/block/backup.c
index ec01db8..3b39119 100644
--- a/block/backup.c
+++ b/block/backup.c
@@ -221,11 +221,45 @@ static void backup_iostatus_reset(BlockJob *job)
     }
 }
 
+static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret)
+{
+    BdrvDirtyBitmap *bm;
+    BlockDriverState *bs = job->common.bs;
+
+    if (ret < 0 || block_job_is_cancelled(&job->common)) {
+        /* Merge the successor back into the parent, delete nothing. */
+        bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
+        assert(bm);
+    } else {
+        /* Everything is fine, delete this bitmap and install the backup. */
+        bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
+        assert(bm);
+    }
+}
+
+static void backup_commit(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, 0);
+    }
+}
+
+static void backup_abort(BlockJob *job)
+{
+    BackupBlockJob *s = container_of(job, BackupBlockJob, common);
+    if (s->sync_bitmap) {
+        backup_cleanup_sync_bitmap(s, -1);
+    }
+}
+
 static const BlockJobDriver backup_job_driver = {
     .instance_size  = sizeof(BackupBlockJob),
     .job_type       = BLOCK_JOB_TYPE_BACKUP,
     .set_speed      = backup_set_speed,
     .iostatus_reset = backup_iostatus_reset,
+    .commit         = backup_commit,
+    .abort          = backup_abort,
 };
 
 static BlockErrorAction backup_error_action(BackupBlockJob *job,
@@ -441,19 +475,6 @@ static void coroutine_fn backup_run(void *opaque)
     /* wait until pending backup_do_cow() calls have completed */
     qemu_co_rwlock_wrlock(&job->flush_rwlock);
     qemu_co_rwlock_unlock(&job->flush_rwlock);
-
-    if (job->sync_bitmap) {
-        BdrvDirtyBitmap *bm;
-        if (ret < 0 || block_job_is_cancelled(&job->common)) {
-            /* Merge the successor back into the parent, delete nothing. */
-            bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL);
-            assert(bm);
-        } else {
-            /* Everything is fine, delete this bitmap and install the backup. */
-            bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL);
-            assert(bm);
-        }
-    }
     hbitmap_free(job->bitmap);
 
     if (target->blk) {
@@ -472,7 +493,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockCompletionFunc *cb, void *opaque,
-                  Error **errp)
+                  BlockJobTxn *txn, Error **errp)
 {
     int64_t len;
 
@@ -554,6 +575,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                        sync_bitmap : NULL;
     job->common.len = len;
     job->common.co = qemu_coroutine_create(backup_run);
+    block_job_txn_add_job(txn, &job->common);
     qemu_coroutine_enter(job->common.co, job);
     return;
 
diff --git a/block/block-backend.c b/block/block-backend.c
index 19fdaae..9889e81 100644
--- a/block/block-backend.c
+++ b/block/block-backend.c
@@ -176,6 +176,7 @@ static void blk_delete(BlockBackend *blk)
     }
     g_free(blk->name);
     drive_info_del(blk->legacy_dinfo);
+    block_acct_cleanup(&blk->stats);
     g_free(blk);
 }
 
@@ -189,6 +190,11 @@ static void drive_info_del(DriveInfo *dinfo)
     g_free(dinfo);
 }
 
+int blk_get_refcnt(BlockBackend *blk)
+{
+    return blk ? blk->refcnt : 0;
+}
+
 /*
  * Increment @blk's reference count.
  * @blk must not be null.
@@ -334,6 +340,18 @@ void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk)
 }
 
 /*
+ * Disassociates the currently associated BlockDriverState from @blk.
+ */
+void blk_remove_bs(BlockBackend *blk)
+{
+    blk_update_root_state(blk);
+
+    blk->bs->blk = NULL;
+    bdrv_unref(blk->bs);
+    blk->bs = NULL;
+}
+
+/*
  * Associates a new BlockDriverState with @blk.
  */
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs)
@@ -417,18 +435,15 @@ void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops,
 void blk_dev_change_media_cb(BlockBackend *blk, bool load)
 {
     if (blk->dev_ops && blk->dev_ops->change_media_cb) {
-        bool tray_was_closed = !blk_dev_is_tray_open(blk);
+        bool tray_was_open, tray_is_open;
 
+        tray_was_open = blk_dev_is_tray_open(blk);
         blk->dev_ops->change_media_cb(blk->dev_opaque, load);
-        if (tray_was_closed) {
-            /* tray open */
-            qapi_event_send_device_tray_moved(blk_name(blk),
-                                              true, &error_abort);
-        }
-        if (load) {
-            /* tray close */
-            qapi_event_send_device_tray_moved(blk_name(blk),
-                                              false, &error_abort);
+        tray_is_open = blk_dev_is_tray_open(blk);
+
+        if (tray_was_open != tray_is_open) {
+            qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open,
+                                              &error_abort);
         }
     }
 }
@@ -1227,6 +1242,33 @@ void blk_update_root_state(BlockBackend *blk)
     }
 }
 
+/*
+ * Applies the information in the root state to the given BlockDriverState. This
+ * does not include the flags which have to be specified for bdrv_open(), use
+ * blk_get_open_flags_from_root_state() to inquire them.
+ */
+void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs)
+{
+    bs->detect_zeroes = blk->root_state.detect_zeroes;
+    if (blk->root_state.throttle_group) {
+        bdrv_io_limits_enable(bs, blk->root_state.throttle_group);
+    }
+}
+
+/*
+ * Returns the flags to be used for bdrv_open() of a BlockDriverState which is
+ * supposed to inherit the root state.
+ */
+int blk_get_open_flags_from_root_state(BlockBackend *blk)
+{
+    int bs_flags;
+
+    bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR;
+    bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR;
+
+    return bs_flags;
+}
+
 BlockBackendRootState *blk_get_root_state(BlockBackend *blk)
 {
     return &blk->root_state;
diff --git a/block/commit.c b/block/commit.c
index fdebe87..a5d02aa 100644
--- a/block/commit.c
+++ b/block/commit.c
@@ -236,14 +236,14 @@ void commit_start(BlockDriverState *bs, BlockDriverState *base,
     orig_overlay_flags = bdrv_get_flags(overlay_bs);
 
     /* convert base & overlay_bs to r/w, if necessary */
-    if (!(orig_base_flags & BDRV_O_RDWR)) {
-        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
-                                         orig_base_flags | BDRV_O_RDWR);
-    }
     if (!(orig_overlay_flags & BDRV_O_RDWR)) {
         reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL,
                                          orig_overlay_flags | BDRV_O_RDWR);
     }
+    if (!(orig_base_flags & BDRV_O_RDWR)) {
+        reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL,
+                                         orig_base_flags | BDRV_O_RDWR);
+    }
     if (reopen_queue) {
         bdrv_reopen_multiple(reopen_queue, &local_err);
         if (local_err != NULL) {
diff --git a/block/gluster.c b/block/gluster.c
index 1eb3a8c..0857c14 100644
--- a/block/gluster.c
+++ b/block/gluster.c
@@ -429,28 +429,23 @@ static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, BdrvRequestFlags flags)
 {
     int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
     BDRVGlusterState *s = bs->opaque;
     off_t size = nb_sectors * BDRV_SECTOR_SIZE;
     off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
-    acb->size = size;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = size;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);
 
-    ret = glfs_zerofill_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
     if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
     }
 
     qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }
 
 static inline bool gluster_supports_zerofill(void)
@@ -541,35 +536,30 @@ static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write)
 {
     int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
     BDRVGlusterState *s = bs->opaque;
     size_t size = nb_sectors * BDRV_SECTOR_SIZE;
     off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
-    acb->size = size;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = size;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);
 
     if (write) {
         ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            &gluster_finish_aiocb, acb);
+            gluster_finish_aiocb, &acb);
     } else {
         ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0,
-            &gluster_finish_aiocb, acb);
+            gluster_finish_aiocb, &acb);
     }
 
     if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
     }
 
     qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }
 
 static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset)
@@ -600,26 +590,21 @@ static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs,
 static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs)
 {
     int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
     BDRVGlusterState *s = bs->opaque;
 
-    acb->size = 0;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = 0;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);
 
-    ret = glfs_fsync_async(s->fd, &gluster_finish_aiocb, acb);
+    ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb);
     if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
     }
 
     qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }
 
 #ifdef CONFIG_GLUSTERFS_DISCARD
@@ -627,28 +612,23 @@ static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs,
         int64_t sector_num, int nb_sectors)
 {
     int ret;
-    GlusterAIOCB *acb = g_slice_new(GlusterAIOCB);
+    GlusterAIOCB acb;
     BDRVGlusterState *s = bs->opaque;
     size_t size = nb_sectors * BDRV_SECTOR_SIZE;
     off_t offset = sector_num * BDRV_SECTOR_SIZE;
 
-    acb->size = 0;
-    acb->ret = 0;
-    acb->coroutine = qemu_coroutine_self();
-    acb->aio_context = bdrv_get_aio_context(bs);
+    acb.size = 0;
+    acb.ret = 0;
+    acb.coroutine = qemu_coroutine_self();
+    acb.aio_context = bdrv_get_aio_context(bs);
 
-    ret = glfs_discard_async(s->fd, offset, size, &gluster_finish_aiocb, acb);
+    ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb);
     if (ret < 0) {
-        ret = -errno;
-        goto out;
+        return -errno;
     }
 
     qemu_coroutine_yield();
-    ret = acb->ret;
-
-out:
-    g_slice_free(GlusterAIOCB, acb);
-    return ret;
+    return acb.ret;
 }
 #endif
 
diff --git a/block/io.c b/block/io.c
index 8dcad3b..adc1eab 100644
--- a/block/io.c
+++ b/block/io.c
@@ -237,8 +237,21 @@ bool bdrv_requests_pending(BlockDriverState *bs)
     return false;
 }
 
+static void bdrv_drain_recurse(BlockDriverState *bs)
+{
+    BdrvChild *child;
+
+    if (bs->drv && bs->drv->bdrv_drain) {
+        bs->drv->bdrv_drain(bs);
+    }
+    QLIST_FOREACH(child, &bs->children, next) {
+        bdrv_drain_recurse(child->bs);
+    }
+}
+
 /*
- * Wait for pending requests to complete on a single BlockDriverState subtree
+ * Wait for pending requests to complete on a single BlockDriverState subtree,
+ * and suspend block driver's internal I/O until next request arrives.
  *
  * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState
  * AioContext.
@@ -251,6 +264,7 @@ void bdrv_drain(BlockDriverState *bs)
 {
     bool busy = true;
 
+    bdrv_drain_recurse(bs);
     while (busy) {
         /* Keep iterating */
          bdrv_flush_io_queue(bs);
@@ -348,13 +362,14 @@ static void tracked_request_end(BdrvTrackedRequest *req)
 static void tracked_request_begin(BdrvTrackedRequest *req,
                                   BlockDriverState *bs,
                                   int64_t offset,
-                                  unsigned int bytes, bool is_write)
+                                  unsigned int bytes,
+                                  enum BdrvTrackedRequestType type)
 {
     *req = (BdrvTrackedRequest){
         .bs = bs,
         .offset         = offset,
         .bytes          = bytes,
-        .is_write       = is_write,
+        .type           = type,
         .co             = qemu_coroutine_self(),
         .serialising    = false,
         .overlap_offset = offset,
@@ -971,7 +986,7 @@ static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs,
         bytes = ROUND_UP(bytes, align);
     }
 
-    tracked_request_begin(&req, bs, offset, bytes, false);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ);
     ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align,
                               use_local_qiov ? &local_qiov : qiov,
                               flags);
@@ -1292,7 +1307,7 @@ static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs,
      * Pad qiov with the read parts and be sure to have a tracked request not
      * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle.
      */
-    tracked_request_begin(&req, bs, offset, bytes, true);
+    tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE);
 
     if (!qiov) {
         ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req);
@@ -2317,18 +2332,20 @@ static void coroutine_fn bdrv_flush_co_entry(void *opaque)
 int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
 {
     int ret;
+    BdrvTrackedRequest req;
 
     if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) ||
         bdrv_is_sg(bs)) {
         return 0;
     }
 
+    tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH);
     /* Write back cached data to the OS even with cache=unsafe */
     BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS);
     if (bs->drv->bdrv_co_flush_to_os) {
         ret = bs->drv->bdrv_co_flush_to_os(bs);
         if (ret < 0) {
-            return ret;
+            goto out;
         }
     }
 
@@ -2368,14 +2385,17 @@ int coroutine_fn bdrv_co_flush(BlockDriverState *bs)
         ret = 0;
     }
     if (ret < 0) {
-        return ret;
+        goto out;
     }
 
     /* Now flush the underlying protocol.  It will also have BDRV_O_NO_FLUSH
      * in the case of cache=unsafe, so there are no useless flushes.
      */
 flush_parent:
-    return bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+    ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }
 
 int bdrv_flush(BlockDriverState *bs)
@@ -2418,6 +2438,7 @@ static void coroutine_fn bdrv_discard_co_entry(void *opaque)
 int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
                                  int nb_sectors)
 {
+    BdrvTrackedRequest req;
     int max_discard, ret;
 
     if (!bs->drv) {
@@ -2440,6 +2461,8 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
         return 0;
     }
 
+    tracked_request_begin(&req, bs, sector_num, nb_sectors,
+                          BDRV_TRACKED_DISCARD);
     bdrv_set_dirty(bs, sector_num, nb_sectors);
 
     max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS);
@@ -2473,20 +2496,24 @@ int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num,
             acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors,
                                             bdrv_co_io_em_complete, &co);
             if (acb == NULL) {
-                return -EIO;
+                ret = -EIO;
+                goto out;
             } else {
                 qemu_coroutine_yield();
                 ret = co.ret;
             }
         }
         if (ret && ret != -ENOTSUP) {
-            return ret;
+            goto out;
         }
 
         sector_num += num;
         nb_sectors -= num;
     }
-    return 0;
+    ret = 0;
+out:
+    tracked_request_end(&req);
+    return ret;
 }
 
 int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
@@ -2515,26 +2542,109 @@ int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors)
     return rwco.ret;
 }
 
-/* needed for generic scsi interface */
+typedef struct {
+    CoroutineIOCompletion *co;
+    QEMUBH *bh;
+} BdrvIoctlCompletionData;
 
-int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+static void bdrv_ioctl_bh_cb(void *opaque)
+{
+    BdrvIoctlCompletionData *data = opaque;
+
+    bdrv_co_io_em_complete(data->co, -ENOTSUP);
+    qemu_bh_delete(data->bh);
+}
+
+static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf)
 {
     BlockDriver *drv = bs->drv;
+    BdrvTrackedRequest tracked_req;
+    CoroutineIOCompletion co = {
+        .coroutine = qemu_coroutine_self(),
+    };
+    BlockAIOCB *acb;
 
-    if (drv && drv->bdrv_ioctl)
-        return drv->bdrv_ioctl(bs, req, buf);
-    return -ENOTSUP;
+    tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL);
+    if (!drv || !drv->bdrv_aio_ioctl) {
+        co.ret = -ENOTSUP;
+        goto out;
+    }
+
+    acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co);
+    if (!acb) {
+        BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1);
+        data->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                                bdrv_ioctl_bh_cb, data);
+        data->co = &co;
+        qemu_bh_schedule(data->bh);
+    }
+    qemu_coroutine_yield();
+out:
+    tracked_request_end(&tracked_req);
+    return co.ret;
+}
+
+typedef struct {
+    BlockDriverState *bs;
+    int req;
+    void *buf;
+    int ret;
+} BdrvIoctlCoData;
+
+static void coroutine_fn bdrv_co_ioctl_entry(void *opaque)
+{
+    BdrvIoctlCoData *data = opaque;
+    data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf);
+}
+
+/* needed for generic scsi interface */
+int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
+{
+    BdrvIoctlCoData data = {
+        .bs = bs,
+        .req = req,
+        .buf = buf,
+        .ret = -EINPROGRESS,
+    };
+
+    if (qemu_in_coroutine()) {
+        /* Fast-path if already in coroutine context */
+        bdrv_co_ioctl_entry(&data);
+    } else {
+        Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry);
+        qemu_coroutine_enter(co, &data);
+    }
+    while (data.ret == -EINPROGRESS) {
+        aio_poll(bdrv_get_aio_context(bs), true);
+    }
+    return data.ret;
+}
+
+static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque)
+{
+    BlockAIOCBCoroutine *acb = opaque;
+    acb->req.error = bdrv_co_do_ioctl(acb->common.bs,
+                                      acb->req.req, acb->req.buf);
+    bdrv_co_complete(acb);
 }
 
 BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs,
         unsigned long int req, void *buf,
         BlockCompletionFunc *cb, void *opaque)
 {
-    BlockDriver *drv = bs->drv;
+    BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info,
+                                            bs, cb, opaque);
+    Coroutine *co;
 
-    if (drv && drv->bdrv_aio_ioctl)
-        return drv->bdrv_aio_ioctl(bs, req, buf, cb, opaque);
-    return NULL;
+    acb->need_bh = true;
+    acb->req.error = -EINPROGRESS;
+    acb->req.req = req;
+    acb->req.buf = buf;
+    co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry);
+    qemu_coroutine_enter(co, acb);
+
+    bdrv_co_maybe_schedule_bh(acb);
+    return &acb->common;
 }
 
 void *qemu_blockalign(BlockDriverState *bs, size_t size)
diff --git a/block/iscsi.c b/block/iscsi.c
index 9a628b7..bd1f1bf 100644
--- a/block/iscsi.c
+++ b/block/iscsi.c
@@ -84,6 +84,7 @@ typedef struct IscsiTask {
     IscsiLun *iscsilun;
     QEMUTimer retry_timer;
     bool force_next_flush;
+    int err_code;
 } IscsiTask;
 
 typedef struct IscsiAIOCB {
@@ -96,6 +97,7 @@ typedef struct IscsiAIOCB {
     int status;
     int64_t sector_num;
     int nb_sectors;
+    int ret;
 #ifdef __linux__
     sg_io_hdr_t *ioh;
 #endif
@@ -169,19 +171,70 @@ static inline unsigned exp_random(double mean)
     return -mean * log((double)rand() / RAND_MAX);
 }
 
-/* SCSI_STATUS_TASK_SET_FULL and SCSI_STATUS_TIMEOUT were introduced
- * in libiscsi 1.10.0 as part of an enum. The LIBISCSI_API_VERSION
- * macro was introduced in 1.11.0. So use the API_VERSION macro as
- * a hint that the macros are defined and define them ourselves
- * otherwise to keep the required libiscsi version at 1.9.0 */
-#if !defined(LIBISCSI_API_VERSION)
-#define QEMU_SCSI_STATUS_TASK_SET_FULL  0x28
-#define QEMU_SCSI_STATUS_TIMEOUT        0x0f000002
-#else
-#define QEMU_SCSI_STATUS_TASK_SET_FULL  SCSI_STATUS_TASK_SET_FULL
-#define QEMU_SCSI_STATUS_TIMEOUT        SCSI_STATUS_TIMEOUT
+/* SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST was introduced in
+ * libiscsi 1.10.0, together with other constants we need.  Use it as
+ * a hint that we have to define them ourselves if needed, to keep the
+ * minimum required libiscsi version at 1.9.0.  We use an ASCQ macro for
+ * the test because SCSI_STATUS_* is an enum.
+ *
+ * To guard against future changes where SCSI_SENSE_ASCQ_* also becomes
+ * an enum, check against the LIBISCSI_API_VERSION macro, which was
+ * introduced in 1.11.0.  If it is present, there is no need to define
+ * anything.
+ */
+#if !defined(SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST) && \
+    !defined(LIBISCSI_API_VERSION)
+#define SCSI_STATUS_TASK_SET_FULL                          0x28
+#define SCSI_STATUS_TIMEOUT                                0x0f000002
+#define SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST    0x2600
+#define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR        0x1a00
 #endif
 
+static int iscsi_translate_sense(struct scsi_sense *sense)
+{
+    int ret;
+
+    switch (sense->key) {
+    case SCSI_SENSE_NOT_READY:
+        return -EBUSY;
+    case SCSI_SENSE_DATA_PROTECTION:
+        return -EACCES;
+    case SCSI_SENSE_COMMAND_ABORTED:
+        return -ECANCELED;
+    case SCSI_SENSE_ILLEGAL_REQUEST:
+        /* Parse ASCQ */
+        break;
+    default:
+        return -EIO;
+    }
+    switch (sense->ascq) {
+    case SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR:
+    case SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE:
+    case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB:
+    case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST:
+        ret = -EINVAL;
+        break;
+    case SCSI_SENSE_ASCQ_LBA_OUT_OF_RANGE:
+        ret = -ENOSPC;
+        break;
+    case SCSI_SENSE_ASCQ_LOGICAL_UNIT_NOT_SUPPORTED:
+        ret = -ENOTSUP;
+        break;
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT:
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_CLOSED:
+    case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_OPEN:
+        ret = -ENOMEDIUM;
+        break;
+    case SCSI_SENSE_ASCQ_WRITE_PROTECTED:
+        ret = -EACCES;
+        break;
+    default:
+        ret = -EIO;
+        break;
+    }
+    return ret;
+}
+
 static void
 iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                         void *command_data, void *opaque)
@@ -203,11 +256,11 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                 goto out;
             }
             if (status == SCSI_STATUS_BUSY ||
-                status == QEMU_SCSI_STATUS_TIMEOUT ||
-                status == QEMU_SCSI_STATUS_TASK_SET_FULL) {
+                status == SCSI_STATUS_TIMEOUT ||
+                status == SCSI_STATUS_TASK_SET_FULL) {
                 unsigned retry_time =
                     exp_random(iscsi_retry_times[iTask->retries - 1]);
-                if (status == QEMU_SCSI_STATUS_TIMEOUT) {
+                if (status == SCSI_STATUS_TIMEOUT) {
                     /* make sure the request is rescheduled AFTER the
                      * reconnect is initiated */
                     retry_time = EVENT_INTERVAL * 2;
@@ -226,6 +279,7 @@ iscsi_co_generic_cb(struct iscsi_context *iscsi, int status,
                 return;
             }
         }
+        iTask->err_code = iscsi_translate_sense(&task->sense);
         error_report("iSCSI Failure: %s", iscsi_get_error(iscsi));
     } else {
         iTask->iscsilun->force_next_flush |= iTask->force_next_flush;
@@ -455,7 +509,7 @@ retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
     }
 
     iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors);
@@ -644,7 +698,7 @@ retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
     }
 
     return 0;
@@ -683,7 +737,7 @@ retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
     }
 
     return 0;
@@ -703,7 +757,7 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
     if (status < 0) {
         error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s",
                      iscsi_get_error(iscsi));
-        acb->status = -EIO;
+        acb->status = iscsi_translate_sense(&acb->task->sense);
     }
 
     acb->ioh->driver_status = 0;
@@ -726,6 +780,38 @@ iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status,
     iscsi_schedule_bh(acb);
 }
 
+static void iscsi_ioctl_bh_completion(void *opaque)
+{
+    IscsiAIOCB *acb = opaque;
+
+    qemu_bh_delete(acb->bh);
+    acb->common.cb(acb->common.opaque, acb->ret);
+    qemu_aio_unref(acb);
+}
+
+static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf)
+{
+    BlockDriverState *bs = acb->common.bs;
+    IscsiLun *iscsilun = bs->opaque;
+    int ret = 0;
+
+    switch (req) {
+    case SG_GET_VERSION_NUM:
+        *(int *)buf = 30000;
+        break;
+    case SG_GET_SCSI_ID:
+        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
+        break;
+    default:
+        ret = -EINVAL;
+    }
+    assert(!acb->bh);
+    acb->bh = aio_bh_new(bdrv_get_aio_context(bs),
+                         iscsi_ioctl_bh_completion, acb);
+    acb->ret = ret;
+    qemu_bh_schedule(acb->bh);
+}
+
 static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
         unsigned long int req, void *buf,
         BlockCompletionFunc *cb, void *opaque)
@@ -735,8 +821,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
     struct iscsi_data data;
     IscsiAIOCB *acb;
 
-    assert(req == SG_IO);
-
     acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque);
 
     acb->iscsilun = iscsilun;
@@ -745,6 +829,11 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
     acb->buf         = NULL;
     acb->ioh         = buf;
 
+    if (req != SG_IO) {
+        iscsi_ioctl_handle_emulated(acb, req, buf);
+        return &acb->common;
+    }
+
     acb->task = malloc(sizeof(struct scsi_task));
     if (acb->task == NULL) {
         error_report("iSCSI: Failed to allocate task for scsi command. %s",
@@ -809,38 +898,6 @@ static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs,
     return &acb->common;
 }
 
-static void ioctl_cb(void *opaque, int status)
-{
-    int *p_status = opaque;
-    *p_status = status;
-}
-
-static int iscsi_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    IscsiLun *iscsilun = bs->opaque;
-    int status;
-
-    switch (req) {
-    case SG_GET_VERSION_NUM:
-        *(int *)buf = 30000;
-        break;
-    case SG_GET_SCSI_ID:
-        ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type;
-        break;
-    case SG_IO:
-        status = -EINPROGRESS;
-        iscsi_aio_ioctl(bs, req, buf, ioctl_cb, &status);
-
-        while (status == -EINPROGRESS) {
-            aio_poll(iscsilun->aio_context, true);
-        }
-
-        return 0;
-    default:
-        return -1;
-    }
-    return 0;
-}
 #endif
 
 static int64_t
@@ -905,7 +962,7 @@ retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
     }
 
     iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors);
@@ -999,7 +1056,7 @@ retry:
     }
 
     if (iTask.status != SCSI_STATUS_GOOD) {
-        return -EIO;
+        return iTask.err_code;
     }
 
     if (flags & BDRV_REQ_MAY_UNMAP) {
@@ -1771,7 +1828,6 @@ static BlockDriver bdrv_iscsi = {
     .bdrv_co_flush_to_disk = iscsi_co_flush,
 
 #ifdef __linux__
-    .bdrv_ioctl       = iscsi_ioctl,
     .bdrv_aio_ioctl   = iscsi_aio_ioctl,
 #endif
 
diff --git a/block/mirror.c b/block/mirror.c
index b1252a1..52c9abf 100644
--- a/block/mirror.c
+++ b/block/mirror.c
@@ -384,6 +384,7 @@ static void mirror_exit(BlockJob *job, void *opaque)
         aio_context_release(replace_aio_context);
     }
     g_free(s->replaces);
+    bdrv_op_unblock_all(s->target, s->common.blocker);
     bdrv_unref(s->target);
     block_job_completed(&s->common, data->ret);
     g_free(data);
@@ -741,9 +742,12 @@ static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target,
     s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp);
     if (!s->dirty_bitmap) {
         g_free(s->replaces);
-        block_job_release(bs);
+        block_job_unref(&s->common);
         return;
     }
+
+    bdrv_op_block_all(s->target, s->common.blocker);
+
     bdrv_set_enable_write_cache(s->target, true);
     if (s->target->blk) {
         blk_set_on_error(s->target->blk, on_target_error, on_target_error);
diff --git a/block/qapi.c b/block/qapi.c
index ec0f513..d20262d 100644
--- a/block/qapi.c
+++ b/block/qapi.c
@@ -64,7 +64,7 @@ BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp)
     info->backing_file_depth = bdrv_get_backing_file_depth(bs);
     info->detect_zeroes = bs->detect_zeroes;
 
-    if (bs->io_limits_enabled) {
+    if (bs->throttle_state) {
         ThrottleConfig cfg;
 
         throttle_group_get_config(bs, &cfg);
@@ -346,17 +346,68 @@ static BlockStats *bdrv_query_stats(const BlockDriverState *bs,
     s->stats = g_malloc0(sizeof(*s->stats));
     if (bs->blk) {
         BlockAcctStats *stats = blk_get_stats(bs->blk);
+        BlockAcctTimedStats *ts = NULL;
 
         s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ];
         s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE];
         s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ];
         s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE];
+
+        s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ];
+        s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE];
+        s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH];
+
+        s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ];
+        s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE];
+        s->stats->invalid_flush_operations =
+            stats->invalid_ops[BLOCK_ACCT_FLUSH];
+
         s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ];
         s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE];
         s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH];
         s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE];
         s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ];
         s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH];
+
+        s->stats->has_idle_time_ns = stats->last_access_time_ns > 0;
+        if (s->stats->has_idle_time_ns) {
+            s->stats->idle_time_ns = block_acct_idle_time_ns(stats);
+        }
+
+        s->stats->account_invalid = stats->account_invalid;
+        s->stats->account_failed = stats->account_failed;
+
+        while ((ts = block_acct_interval_next(stats, ts))) {
+            BlockDeviceTimedStatsList *timed_stats =
+                g_malloc0(sizeof(*timed_stats));
+            BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats));
+            timed_stats->next = s->stats->timed_stats;
+            timed_stats->value = dev_stats;
+            s->stats->timed_stats = timed_stats;
+
+            TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ];
+            TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE];
+            TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH];
+
+            dev_stats->interval_length = ts->interval_length;
+
+            dev_stats->min_rd_latency_ns = timed_average_min(rd);
+            dev_stats->max_rd_latency_ns = timed_average_max(rd);
+            dev_stats->avg_rd_latency_ns = timed_average_avg(rd);
+
+            dev_stats->min_wr_latency_ns = timed_average_min(wr);
+            dev_stats->max_wr_latency_ns = timed_average_max(wr);
+            dev_stats->avg_wr_latency_ns = timed_average_avg(wr);
+
+            dev_stats->min_flush_latency_ns = timed_average_min(fl);
+            dev_stats->max_flush_latency_ns = timed_average_max(fl);
+            dev_stats->avg_flush_latency_ns = timed_average_avg(fl);
+
+            dev_stats->avg_rd_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_READ);
+            dev_stats->avg_wr_queue_depth =
+                block_acct_queue_depth(ts, BLOCK_ACCT_WRITE);
+        }
     }
 
     s->stats->wr_highest_offset = bs->wr_highest_offset;
diff --git a/block/qcow2-cluster.c b/block/qcow2-cluster.c
index 67be0ce..24a60e2 100644
--- a/block/qcow2-cluster.c
+++ b/block/qcow2-cluster.c
@@ -312,7 +312,7 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
     if (!offset)
         return 0;
 
-    assert(qcow2_get_cluster_type(first_entry) != QCOW2_CLUSTER_COMPRESSED);
+    assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL);
 
     for (i = 0; i < nb_clusters; i++) {
         uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask;
@@ -324,14 +324,16 @@ static int count_contiguous_clusters(int nb_clusters, int cluster_size,
 	return i;
 }
 
-static int count_contiguous_free_clusters(int nb_clusters, uint64_t *l2_table)
+static int count_contiguous_clusters_by_type(int nb_clusters,
+                                             uint64_t *l2_table,
+                                             int wanted_type)
 {
     int i;
 
     for (i = 0; i < nb_clusters; i++) {
         int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i]));
 
-        if (type != QCOW2_CLUSTER_UNALLOCATED) {
+        if (type != wanted_type) {
             break;
         }
     }
@@ -554,13 +556,14 @@ int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset,
             ret = -EIO;
             goto fail;
         }
-        c = count_contiguous_clusters(nb_clusters, s->cluster_size,
-                &l2_table[l2_index], QCOW_OFLAG_ZERO);
+        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
+                                              QCOW2_CLUSTER_ZERO);
         *cluster_offset = 0;
         break;
     case QCOW2_CLUSTER_UNALLOCATED:
         /* how many empty clusters ? */
-        c = count_contiguous_free_clusters(nb_clusters, &l2_table[l2_index]);
+        c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index],
+                                              QCOW2_CLUSTER_UNALLOCATED);
         *cluster_offset = 0;
         break;
     case QCOW2_CLUSTER_NORMAL:
diff --git a/block/qcow2-refcount.c b/block/qcow2-refcount.c
index 4b81c8d..6e0e5bd 100644
--- a/block/qcow2-refcount.c
+++ b/block/qcow2-refcount.c
@@ -560,13 +560,16 @@ static int alloc_refcount_block(BlockDriverState *bs,
     }
 
     /* Hook up the new refcount table in the qcow2 header */
-    uint8_t data[12];
-    cpu_to_be64w((uint64_t*)data, table_offset);
-    cpu_to_be32w((uint32_t*)(data + 8), table_clusters);
+    struct QEMU_PACKED {
+        uint64_t d64;
+        uint32_t d32;
+    } data;
+    cpu_to_be64w(&data.d64, table_offset);
+    cpu_to_be32w(&data.d32, table_clusters);
     BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE);
     ret = bdrv_pwrite_sync(bs->file->bs,
                            offsetof(QCowHeader, refcount_table_offset),
-                           data, sizeof(data));
+                           &data, sizeof(data));
     if (ret < 0) {
         goto fail_table;
     }
diff --git a/block/qed.c b/block/qed.c
index 5ea05d4..9b88895 100644
--- a/block/qed.c
+++ b/block/qed.c
@@ -375,6 +375,18 @@ static void bdrv_qed_attach_aio_context(BlockDriverState *bs,
     }
 }
 
+static void bdrv_qed_drain(BlockDriverState *bs)
+{
+    BDRVQEDState *s = bs->opaque;
+
+    /* Cancel timer and start doing I/O that were meant to happen as if it
+     * fired, that way we get bdrv_drain() taking care of the ongoing requests
+     * correctly. */
+    qed_cancel_need_check_timer(s);
+    qed_plug_allocating_write_reqs(s);
+    bdrv_aio_flush(s->bs, qed_clear_need_check, s);
+}
+
 static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags,
                          Error **errp)
 {
@@ -1676,6 +1688,7 @@ static BlockDriver bdrv_qed = {
     .bdrv_check               = bdrv_qed_check,
     .bdrv_detach_aio_context  = bdrv_qed_detach_aio_context,
     .bdrv_attach_aio_context  = bdrv_qed_attach_aio_context,
+    .bdrv_drain               = bdrv_qed_drain,
 };
 
 static void bdrv_qed_init(void)
diff --git a/block/raw-posix.c b/block/raw-posix.c
index 918c756..aec9ec6 100644
--- a/block/raw-posix.c
+++ b/block/raw-posix.c
@@ -2175,12 +2175,6 @@ static int hdev_open(BlockDriverState *bs, QDict *options, int flags,
 }
 
 #if defined(__linux__)
-static int hdev_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    BDRVRawState *s = bs->opaque;
-
-    return ioctl(s->fd, req, buf);
-}
 
 static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs,
         unsigned long int req, void *buf,
@@ -2338,7 +2332,6 @@ static BlockDriver bdrv_host_device = {
 
     /* generic scsi device */
 #ifdef __linux__
-    .bdrv_ioctl         = hdev_ioctl,
     .bdrv_aio_ioctl     = hdev_aio_ioctl,
 #endif
 };
@@ -2471,7 +2464,6 @@ static BlockDriver bdrv_host_cdrom = {
     .bdrv_lock_medium   = cdrom_lock_medium,
 
     /* generic scsi device */
-    .bdrv_ioctl         = hdev_ioctl,
     .bdrv_aio_ioctl     = hdev_aio_ioctl,
 };
 #endif /* __linux__ */
diff --git a/block/raw_bsd.c b/block/raw_bsd.c
index 0aded31..915d6fd 100644
--- a/block/raw_bsd.c
+++ b/block/raw_bsd.c
@@ -169,11 +169,6 @@ static void raw_lock_medium(BlockDriverState *bs, bool locked)
     bdrv_lock_medium(bs->file->bs, locked);
 }
 
-static int raw_ioctl(BlockDriverState *bs, unsigned long int req, void *buf)
-{
-    return bdrv_ioctl(bs->file->bs, req, buf);
-}
-
 static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs,
                                  unsigned long int req, void *buf,
                                  BlockCompletionFunc *cb,
@@ -262,7 +257,6 @@ BlockDriver bdrv_raw = {
     .bdrv_media_changed   = &raw_media_changed,
     .bdrv_eject           = &raw_eject,
     .bdrv_lock_medium     = &raw_lock_medium,
-    .bdrv_ioctl           = &raw_ioctl,
     .bdrv_aio_ioctl       = &raw_aio_ioctl,
     .create_opts          = &raw_create_opts,
     .bdrv_has_zero_init   = &raw_has_zero_init
diff --git a/block/throttle-groups.c b/block/throttle-groups.c
index 3419af7..13b5baa 100644
--- a/block/throttle-groups.c
+++ b/block/throttle-groups.c
@@ -437,6 +437,9 @@ void throttle_group_register_bs(BlockDriverState *bs, const char *groupname)
  * list, destroying the timers and setting the throttle_state pointer
  * to NULL.
  *
+ * The BlockDriverState must not have pending throttled requests, so
+ * the caller has to drain them first.
+ *
  * The group will be destroyed if it's empty after this operation.
  *
  * @bs: the BlockDriverState to remove
@@ -446,6 +449,10 @@ void throttle_group_unregister_bs(BlockDriverState *bs)
     ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts);
     int i;
 
+    assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0);
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[0]));
+    assert(qemu_co_queue_empty(&bs->throttled_reqs[1]));
+
     qemu_mutex_lock(&tg->lock);
     for (i = 0; i < 2; i++) {
         if (tg->tokens[i] == bs) {
diff --git a/blockdev.c b/blockdev.c
index 8b8bfa9..917ae06 100644
--- a/blockdev.c
+++ b/blockdev.c
@@ -283,32 +283,6 @@ typedef struct {
     BlockDriverState *bs;
 } BDRVPutRefBH;
 
-static void bdrv_put_ref_bh(void *opaque)
-{
-    BDRVPutRefBH *s = opaque;
-
-    bdrv_unref(s->bs);
-    qemu_bh_delete(s->bh);
-    g_free(s);
-}
-
-/*
- * Release a BDS reference in a BH
- *
- * It is not safe to use bdrv_unref() from a callback function when the callers
- * still need the BlockDriverState.  In such cases we schedule a BH to release
- * the reference.
- */
-static void bdrv_put_ref_bh_schedule(BlockDriverState *bs)
-{
-    BDRVPutRefBH *s;
-
-    s = g_new(BDRVPutRefBH, 1);
-    s->bh = qemu_bh_new(bdrv_put_ref_bh, s);
-    s->bs = bs;
-    qemu_bh_schedule(s->bh);
-}
-
 static int parse_block_error_action(const char *buf, bool is_read, Error **errp)
 {
     if (!strcmp(buf, "ignore")) {
@@ -326,6 +300,45 @@ static int parse_block_error_action(const char *buf, bool is_read, Error **errp)
     }
 }
 
+static bool parse_stats_intervals(BlockAcctStats *stats, QList *intervals,
+                                  Error **errp)
+{
+    const QListEntry *entry;
+    for (entry = qlist_first(intervals); entry; entry = qlist_next(entry)) {
+        switch (qobject_type(entry->value)) {
+
+        case QTYPE_QSTRING: {
+            unsigned long long length;
+            const char *str = qstring_get_str(qobject_to_qstring(entry->value));
+            if (parse_uint_full(str, &length, 10) == 0 &&
+                length > 0 && length <= UINT_MAX) {
+                block_acct_add_interval(stats, (unsigned) length);
+            } else {
+                error_setg(errp, "Invalid interval length: %s", str);
+                return false;
+            }
+            break;
+        }
+
+        case QTYPE_QINT: {
+            int64_t length = qint_get_int(qobject_to_qint(entry->value));
+            if (length > 0 && length <= UINT_MAX) {
+                block_acct_add_interval(stats, (unsigned) length);
+            } else {
+                error_setg(errp, "Invalid interval length: %" PRId64, length);
+                return false;
+            }
+            break;
+        }
+
+        default:
+            error_setg(errp, "The specification of stats-intervals is invalid");
+            return false;
+        }
+    }
+    return true;
+}
+
 static bool check_throttle_config(ThrottleConfig *cfg, Error **errp)
 {
     if (throttle_conflicting(cfg)) {
@@ -467,12 +480,15 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
     const char *buf;
     int bdrv_flags = 0;
     int on_read_error, on_write_error;
+    bool account_invalid, account_failed;
     BlockBackend *blk;
     BlockDriverState *bs;
     ThrottleConfig cfg;
     int snapshot = 0;
     Error *error = NULL;
     QemuOpts *opts;
+    QDict *interval_dict = NULL;
+    QList *interval_list = NULL;
     const char *id;
     bool has_driver_specific_opts;
     BlockdevDetectZeroesOptions detect_zeroes =
@@ -503,6 +519,18 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
     /* extract parameters */
     snapshot = qemu_opt_get_bool(opts, "snapshot", 0);
 
+    account_invalid = qemu_opt_get_bool(opts, "stats-account-invalid", true);
+    account_failed = qemu_opt_get_bool(opts, "stats-account-failed", true);
+
+    qdict_extract_subqdict(bs_opts, &interval_dict, "stats-intervals.");
+    qdict_array_split(interval_dict, &interval_list);
+
+    if (qdict_size(interval_dict) != 0) {
+        error_setg(errp, "Invalid option stats-intervals.%s",
+                   qdict_first(interval_dict)->key);
+        goto early_err;
+    }
+
     extract_common_blockdev_options(opts, &bdrv_flags, &throttling_group, &cfg,
                                     &detect_zeroes, &error);
     if (error) {
@@ -599,16 +627,28 @@ static BlockBackend *blockdev_init(const char *file, QDict *bs_opts,
         if (bdrv_key_required(bs)) {
             autostart = 0;
         }
+
+        block_acct_init(blk_get_stats(blk), account_invalid, account_failed);
+
+        if (!parse_stats_intervals(blk_get_stats(blk), interval_list, errp)) {
+            blk_unref(blk);
+            blk = NULL;
+            goto err_no_bs_opts;
+        }
     }
 
     blk_set_on_error(blk, on_read_error, on_write_error);
 
 err_no_bs_opts:
     qemu_opts_del(opts);
+    QDECREF(interval_dict);
+    QDECREF(interval_list);
     return blk;
 
 early_err:
     qemu_opts_del(opts);
+    QDECREF(interval_dict);
+    QDECREF(interval_list);
 err_no_opts:
     QDECREF(bs_opts);
     return NULL;
@@ -1120,6 +1160,9 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
     if (!strcmp(device, "all")) {
         ret = bdrv_commit_all();
     } else {
+        BlockDriverState *bs;
+        AioContext *aio_context;
+
         blk = blk_by_name(device);
         if (!blk) {
             monitor_printf(mon, "Device '%s' not found\n", device);
@@ -1129,7 +1172,14 @@ void hmp_commit(Monitor *mon, const QDict *qdict)
             monitor_printf(mon, "Device '%s' has no medium\n", device);
             return;
         }
-        ret = bdrv_commit(blk_bs(blk));
+
+        bs = blk_bs(blk);
+        aio_context = bdrv_get_aio_context(bs);
+        aio_context_acquire(aio_context);
+
+        ret = bdrv_commit(bs);
+
+        aio_context_release(aio_context);
     }
     if (ret < 0) {
         monitor_printf(mon, "'commit' error for '%s': %s\n", device,
@@ -1147,7 +1197,7 @@ static void blockdev_do_action(TransactionActionKind type, void *data,
     action.u.data = data;
     list.value = &action;
     list.next = NULL;
-    qmp_transaction(&list, errp);
+    qmp_transaction(&list, false, NULL, errp);
 }
 
 void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
@@ -1158,7 +1208,7 @@ void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
                                 bool has_format, const char *format,
                                 bool has_mode, NewImageMode mode, Error **errp)
 {
-    BlockdevSnapshot snapshot = {
+    BlockdevSnapshotSync snapshot = {
         .has_device = has_device,
         .device = (char *) device,
         .has_node_name = has_node_name,
@@ -1175,6 +1225,18 @@ void qmp_blockdev_snapshot_sync(bool has_device, const char *device,
                        &snapshot, errp);
 }
 
+void qmp_blockdev_snapshot(const char *node, const char *overlay,
+                           Error **errp)
+{
+    BlockdevSnapshot snapshot_data = {
+        .node = (char *) node,
+        .overlay = (char *) overlay
+    };
+
+    blockdev_do_action(TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT,
+                       &snapshot_data, errp);
+}
+
 void qmp_blockdev_snapshot_internal_sync(const char *device,
                                          const char *name,
                                          Error **errp)
@@ -1337,44 +1399,75 @@ static BdrvDirtyBitmap *block_dirty_bitmap_lookup(const char *node,
 
 /* New and old BlockDriverState structs for atomic group operations */
 
-typedef struct BlkTransactionState BlkTransactionState;
+typedef struct BlkActionState BlkActionState;
 
-/* Only prepare() may fail. In a single transaction, only one of commit() or
-   abort() will be called, clean() will always be called if it present. */
-typedef struct BdrvActionOps {
-    /* Size of state struct, in bytes. */
+/**
+ * BlkActionOps:
+ * Table of operations that define an Action.
+ *
+ * @instance_size: Size of state struct, in bytes.
+ * @prepare: Prepare the work, must NOT be NULL.
+ * @commit: Commit the changes, can be NULL.
+ * @abort: Abort the changes on fail, can be NULL.
+ * @clean: Clean up resources after all transaction actions have called
+ *         commit() or abort(). Can be NULL.
+ *
+ * Only prepare() may fail. In a single transaction, only one of commit() or
+ * abort() will be called. clean() will always be called if it is present.
+ */
+typedef struct BlkActionOps {
     size_t instance_size;
-    /* Prepare the work, must NOT be NULL. */
-    void (*prepare)(BlkTransactionState *common, Error **errp);
-    /* Commit the changes, can be NULL. */
-    void (*commit)(BlkTransactionState *common);
-    /* Abort the changes on fail, can be NULL. */
-    void (*abort)(BlkTransactionState *common);
-    /* Clean up resource in the end, can be NULL. */
-    void (*clean)(BlkTransactionState *common);
-} BdrvActionOps;
+    void (*prepare)(BlkActionState *common, Error **errp);
+    void (*commit)(BlkActionState *common);
+    void (*abort)(BlkActionState *common);
+    void (*clean)(BlkActionState *common);
+} BlkActionOps;
 
-/*
- * This structure must be arranged as first member in child type, assuming
- * that compiler will also arrange it to the same address with parent instance.
- * Later it will be used in free().
+/**
+ * BlkActionState:
+ * Describes one Action's state within a Transaction.
+ *
+ * @action: QAPI-defined enum identifying which Action to perform.
+ * @ops: Table of ActionOps this Action can perform.
+ * @block_job_txn: Transaction which this action belongs to.
+ * @entry: List membership for all Actions in this Transaction.
+ *
+ * This structure must be arranged as first member in a subclassed type,
+ * assuming that the compiler will also arrange it to the same offsets as the
+ * base class.
  */
-struct BlkTransactionState {
+struct BlkActionState {
     TransactionAction *action;
-    const BdrvActionOps *ops;
-    QSIMPLEQ_ENTRY(BlkTransactionState) entry;
+    const BlkActionOps *ops;
+    BlockJobTxn *block_job_txn;
+    TransactionProperties *txn_props;
+    QSIMPLEQ_ENTRY(BlkActionState) entry;
 };
 
 /* internal snapshot private data */
 typedef struct InternalSnapshotState {
-    BlkTransactionState common;
+    BlkActionState common;
     BlockDriverState *bs;
     AioContext *aio_context;
     QEMUSnapshotInfo sn;
     bool created;
 } InternalSnapshotState;
 
-static void internal_snapshot_prepare(BlkTransactionState *common,
+
+static int action_check_completion_mode(BlkActionState *s, Error **errp)
+{
+    if (s->txn_props->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) {
+        error_setg(errp,
+                   "Action '%s' does not support Transaction property "
+                   "completion-mode = %s",
+                   TransactionActionKind_lookup[s->action->type],
+                   ActionCompletionMode_lookup[s->txn_props->completion_mode]);
+        return -1;
+    }
+    return 0;
+}
+
+static void internal_snapshot_prepare(BlkActionState *common,
                                       Error **errp)
 {
     Error *local_err = NULL;
@@ -1399,6 +1492,10 @@ static void internal_snapshot_prepare(BlkTransactionState *common,
     name = internal->name;
 
     /* 2. check for validation */
+    if (action_check_completion_mode(common, errp) < 0) {
+        return;
+    }
+
     blk = blk_by_name(device);
     if (!blk) {
         error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
@@ -1473,7 +1570,7 @@ static void internal_snapshot_prepare(BlkTransactionState *common,
     state->created = true;
 }
 
-static void internal_snapshot_abort(BlkTransactionState *common)
+static void internal_snapshot_abort(BlkActionState *common)
 {
     InternalSnapshotState *state =
                              DO_UPCAST(InternalSnapshotState, common, common);
@@ -1496,7 +1593,7 @@ static void internal_snapshot_abort(BlkTransactionState *common)
     }
 }
 
-static void internal_snapshot_clean(BlkTransactionState *common)
+static void internal_snapshot_clean(BlkActionState *common)
 {
     InternalSnapshotState *state = DO_UPCAST(InternalSnapshotState,
                                              common, common);
@@ -1511,66 +1608,61 @@ static void internal_snapshot_clean(BlkTransactionState *common)
 
 /* external snapshot private data */
 typedef struct ExternalSnapshotState {
-    BlkTransactionState common;
+    BlkActionState common;
     BlockDriverState *old_bs;
     BlockDriverState *new_bs;
     AioContext *aio_context;
 } ExternalSnapshotState;
 
-static void external_snapshot_prepare(BlkTransactionState *common,
+static void external_snapshot_prepare(BlkActionState *common,
                                       Error **errp)
 {
-    int flags, ret;
-    QDict *options;
+    int flags = 0, ret;
+    QDict *options = NULL;
     Error *local_err = NULL;
-    bool has_device = false;
+    /* Device and node name of the image to generate the snapshot from */
     const char *device;
-    bool has_node_name = false;
     const char *node_name;
-    bool has_snapshot_node_name = false;
-    const char *snapshot_node_name;
+    /* Reference to the new image (for 'blockdev-snapshot') */
+    const char *snapshot_ref;
+    /* File name of the new image (for 'blockdev-snapshot-sync') */
     const char *new_image_file;
-    const char *format = "qcow2";
-    enum NewImageMode mode = NEW_IMAGE_MODE_ABSOLUTE_PATHS;
     ExternalSnapshotState *state =
                              DO_UPCAST(ExternalSnapshotState, common, common);
     TransactionAction *action = common->action;
 
-    /* get parameters */
-    g_assert(action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC);
-
-    has_device = action->u.blockdev_snapshot_sync->has_device;
-    device = action->u.blockdev_snapshot_sync->device;
-    has_node_name = action->u.blockdev_snapshot_sync->has_node_name;
-    node_name = action->u.blockdev_snapshot_sync->node_name;
-    has_snapshot_node_name =
-        action->u.blockdev_snapshot_sync->has_snapshot_node_name;
-    snapshot_node_name = action->u.blockdev_snapshot_sync->snapshot_node_name;
-
-    new_image_file = action->u.blockdev_snapshot_sync->snapshot_file;
-    if (action->u.blockdev_snapshot_sync->has_format) {
-        format = action->u.blockdev_snapshot_sync->format;
-    }
-    if (action->u.blockdev_snapshot_sync->has_mode) {
-        mode = action->u.blockdev_snapshot_sync->mode;
+    /* 'blockdev-snapshot' and 'blockdev-snapshot-sync' have similar
+     * purpose but a different set of parameters */
+    switch (action->type) {
+    case TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT:
+        {
+            BlockdevSnapshot *s = action->u.blockdev_snapshot;
+            device = s->node;
+            node_name = s->node;
+            new_image_file = NULL;
+            snapshot_ref = s->overlay;
+        }
+        break;
+    case TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC:
+        {
+            BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync;
+            device = s->has_device ? s->device : NULL;
+            node_name = s->has_node_name ? s->node_name : NULL;
+            new_image_file = s->snapshot_file;
+            snapshot_ref = NULL;
+        }
+        break;
+    default:
+        g_assert_not_reached();
     }
 
     /* start processing */
-    state->old_bs = bdrv_lookup_bs(has_device ? device : NULL,
-                                   has_node_name ? node_name : NULL,
-                                   &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
-        return;
-    }
-
-    if (has_node_name && !has_snapshot_node_name) {
-        error_setg(errp, "New snapshot node name missing");
+    if (action_check_completion_mode(common, errp) < 0) {
         return;
     }
 
-    if (has_snapshot_node_name && bdrv_find_node(snapshot_node_name)) {
-        error_setg(errp, "New snapshot node name already existing");
+    state->old_bs = bdrv_lookup_bs(device, node_name, errp);
+    if (!state->old_bs) {
         return;
     }
 
@@ -1601,39 +1693,79 @@ static void external_snapshot_prepare(BlkTransactionState *common,
         return;
     }
 
-    flags = state->old_bs->open_flags;
+    if (action->type == TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC) {
+        BlockdevSnapshotSync *s = action->u.blockdev_snapshot_sync;
+        const char *format = s->has_format ? s->format : "qcow2";
+        enum NewImageMode mode;
+        const char *snapshot_node_name =
+            s->has_snapshot_node_name ? s->snapshot_node_name : NULL;
 
-    /* create new image w/backing file */
-    if (mode != NEW_IMAGE_MODE_EXISTING) {
-        bdrv_img_create(new_image_file, format,
-                        state->old_bs->filename,
-                        state->old_bs->drv->format_name,
-                        NULL, -1, flags, &local_err, false);
-        if (local_err) {
-            error_propagate(errp, local_err);
+        if (node_name && !snapshot_node_name) {
+            error_setg(errp, "New snapshot node name missing");
             return;
         }
-    }
 
-    options = qdict_new();
-    if (has_snapshot_node_name) {
-        qdict_put(options, "node-name",
-                  qstring_from_str(snapshot_node_name));
+        if (snapshot_node_name &&
+            bdrv_lookup_bs(snapshot_node_name, snapshot_node_name, NULL)) {
+            error_setg(errp, "New snapshot node name already in use");
+            return;
+        }
+
+        flags = state->old_bs->open_flags;
+
+        /* create new image w/backing file */
+        mode = s->has_mode ? s->mode : NEW_IMAGE_MODE_ABSOLUTE_PATHS;
+        if (mode != NEW_IMAGE_MODE_EXISTING) {
+            bdrv_img_create(new_image_file, format,
+                            state->old_bs->filename,
+                            state->old_bs->drv->format_name,
+                            NULL, -1, flags, &local_err, false);
+            if (local_err) {
+                error_propagate(errp, local_err);
+                return;
+            }
+        }
+
+        options = qdict_new();
+        if (s->has_snapshot_node_name) {
+            qdict_put(options, "node-name",
+                      qstring_from_str(snapshot_node_name));
+        }
+        qdict_put(options, "driver", qstring_from_str(format));
+
+        flags |= BDRV_O_NO_BACKING;
     }
-    qdict_put(options, "driver", qstring_from_str(format));
 
-    /* TODO Inherit bs->options or only take explicit options with an
-     * extended QMP command? */
     assert(state->new_bs == NULL);
-    ret = bdrv_open(&state->new_bs, new_image_file, NULL, options,
-                    flags | BDRV_O_NO_BACKING, &local_err);
+    ret = bdrv_open(&state->new_bs, new_image_file, snapshot_ref, options,
+                    flags, errp);
     /* We will manually add the backing_hd field to the bs later */
     if (ret != 0) {
-        error_propagate(errp, local_err);
+        return;
+    }
+
+    if (state->new_bs->blk != NULL) {
+        error_setg(errp, "The snapshot is already in use by %s",
+                   blk_name(state->new_bs->blk));
+        return;
+    }
+
+    if (bdrv_op_is_blocked(state->new_bs, BLOCK_OP_TYPE_EXTERNAL_SNAPSHOT,
+                           errp)) {
+        return;
+    }
+
+    if (state->new_bs->backing != NULL) {
+        error_setg(errp, "The snapshot already has a backing image");
+        return;
+    }
+
+    if (!state->new_bs->drv->supports_backing) {
+        error_setg(errp, "The snapshot does not support backing images");
     }
 }
 
-static void external_snapshot_commit(BlkTransactionState *common)
+static void external_snapshot_commit(BlkActionState *common)
 {
     ExternalSnapshotState *state =
                              DO_UPCAST(ExternalSnapshotState, common, common);
@@ -1649,7 +1781,7 @@ static void external_snapshot_commit(BlkTransactionState *common)
                 NULL);
 }
 
-static void external_snapshot_abort(BlkTransactionState *common)
+static void external_snapshot_abort(BlkActionState *common)
 {
     ExternalSnapshotState *state =
                              DO_UPCAST(ExternalSnapshotState, common, common);
@@ -1658,7 +1790,7 @@ static void external_snapshot_abort(BlkTransactionState *common)
     }
 }
 
-static void external_snapshot_clean(BlkTransactionState *common)
+static void external_snapshot_clean(BlkActionState *common)
 {
     ExternalSnapshotState *state =
                              DO_UPCAST(ExternalSnapshotState, common, common);
@@ -1669,13 +1801,25 @@ static void external_snapshot_clean(BlkTransactionState *common)
 }
 
 typedef struct DriveBackupState {
-    BlkTransactionState common;
+    BlkActionState common;
     BlockDriverState *bs;
     AioContext *aio_context;
     BlockJob *job;
 } DriveBackupState;
 
-static void drive_backup_prepare(BlkTransactionState *common, Error **errp)
+static void do_drive_backup(const char *device, const char *target,
+                            bool has_format, const char *format,
+                            enum MirrorSyncMode sync,
+                            bool has_mode, enum NewImageMode mode,
+                            bool has_speed, int64_t speed,
+                            bool has_bitmap, const char *bitmap,
+                            bool has_on_source_error,
+                            BlockdevOnError on_source_error,
+                            bool has_on_target_error,
+                            BlockdevOnError on_target_error,
+                            BlockJobTxn *txn, Error **errp);
+
+static void drive_backup_prepare(BlkActionState *common, Error **errp)
 {
     DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
     BlockBackend *blk;
@@ -1703,15 +1847,15 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp)
     bdrv_drained_begin(blk_bs(blk));
     state->bs = blk_bs(blk);
 
-    qmp_drive_backup(backup->device, backup->target,
-                     backup->has_format, backup->format,
-                     backup->sync,
-                     backup->has_mode, backup->mode,
-                     backup->has_speed, backup->speed,
-                     backup->has_bitmap, backup->bitmap,
-                     backup->has_on_source_error, backup->on_source_error,
-                     backup->has_on_target_error, backup->on_target_error,
-                     &local_err);
+    do_drive_backup(backup->device, backup->target,
+                    backup->has_format, backup->format,
+                    backup->sync,
+                    backup->has_mode, backup->mode,
+                    backup->has_speed, backup->speed,
+                    backup->has_bitmap, backup->bitmap,
+                    backup->has_on_source_error, backup->on_source_error,
+                    backup->has_on_target_error, backup->on_target_error,
+                    common->block_job_txn, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
@@ -1720,7 +1864,7 @@ static void drive_backup_prepare(BlkTransactionState *common, Error **errp)
     state->job = state->bs->job;
 }
 
-static void drive_backup_abort(BlkTransactionState *common)
+static void drive_backup_abort(BlkActionState *common)
 {
     DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
     BlockDriverState *bs = state->bs;
@@ -1731,7 +1875,7 @@ static void drive_backup_abort(BlkTransactionState *common)
     }
 }
 
-static void drive_backup_clean(BlkTransactionState *common)
+static void drive_backup_clean(BlkActionState *common)
 {
     DriveBackupState *state = DO_UPCAST(DriveBackupState, common, common);
 
@@ -1742,13 +1886,22 @@ static void drive_backup_clean(BlkTransactionState *common)
 }
 
 typedef struct BlockdevBackupState {
-    BlkTransactionState common;
+    BlkActionState common;
     BlockDriverState *bs;
     BlockJob *job;
     AioContext *aio_context;
 } BlockdevBackupState;
 
-static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp)
+static void do_blockdev_backup(const char *device, const char *target,
+                               enum MirrorSyncMode sync,
+                               bool has_speed, int64_t speed,
+                               bool has_on_source_error,
+                               BlockdevOnError on_source_error,
+                               bool has_on_target_error,
+                               BlockdevOnError on_target_error,
+                               BlockJobTxn *txn, Error **errp);
+
+static void blockdev_backup_prepare(BlkActionState *common, Error **errp)
 {
     BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
     BlockdevBackup *backup;
@@ -1786,12 +1939,12 @@ static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp)
     state->bs = blk_bs(blk);
     bdrv_drained_begin(state->bs);
 
-    qmp_blockdev_backup(backup->device, backup->target,
-                        backup->sync,
-                        backup->has_speed, backup->speed,
-                        backup->has_on_source_error, backup->on_source_error,
-                        backup->has_on_target_error, backup->on_target_error,
-                        &local_err);
+    do_blockdev_backup(backup->device, backup->target,
+                       backup->sync,
+                       backup->has_speed, backup->speed,
+                       backup->has_on_source_error, backup->on_source_error,
+                       backup->has_on_target_error, backup->on_target_error,
+                       common->block_job_txn, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
         return;
@@ -1800,7 +1953,7 @@ static void blockdev_backup_prepare(BlkTransactionState *common, Error **errp)
     state->job = state->bs->job;
 }
 
-static void blockdev_backup_abort(BlkTransactionState *common)
+static void blockdev_backup_abort(BlkActionState *common)
 {
     BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
     BlockDriverState *bs = state->bs;
@@ -1811,7 +1964,7 @@ static void blockdev_backup_abort(BlkTransactionState *common)
     }
 }
 
-static void blockdev_backup_clean(BlkTransactionState *common)
+static void blockdev_backup_clean(BlkActionState *common)
 {
     BlockdevBackupState *state = DO_UPCAST(BlockdevBackupState, common, common);
 
@@ -1821,17 +1974,131 @@ static void blockdev_backup_clean(BlkTransactionState *common)
     }
 }
 
-static void abort_prepare(BlkTransactionState *common, Error **errp)
+typedef struct BlockDirtyBitmapState {
+    BlkActionState common;
+    BdrvDirtyBitmap *bitmap;
+    BlockDriverState *bs;
+    AioContext *aio_context;
+    HBitmap *backup;
+    bool prepared;
+} BlockDirtyBitmapState;
+
+static void block_dirty_bitmap_add_prepare(BlkActionState *common,
+                                           Error **errp)
+{
+    Error *local_err = NULL;
+    BlockDirtyBitmapAdd *action;
+    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
+                                             common, common);
+
+    if (action_check_completion_mode(common, errp) < 0) {
+        return;
+    }
+
+    action = common->action->u.block_dirty_bitmap_add;
+    /* AIO context taken and released within qmp_block_dirty_bitmap_add */
+    qmp_block_dirty_bitmap_add(action->node, action->name,
+                               action->has_granularity, action->granularity,
+                               &local_err);
+
+    if (!local_err) {
+        state->prepared = true;
+    } else {
+        error_propagate(errp, local_err);
+    }
+}
+
+static void block_dirty_bitmap_add_abort(BlkActionState *common)
+{
+    BlockDirtyBitmapAdd *action;
+    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
+                                             common, common);
+
+    action = common->action->u.block_dirty_bitmap_add;
+    /* Should not be able to fail: IF the bitmap was added via .prepare(),
+     * then the node reference and bitmap name must have been valid.
+     */
+    if (state->prepared) {
+        qmp_block_dirty_bitmap_remove(action->node, action->name, &error_abort);
+    }
+}
+
+static void block_dirty_bitmap_clear_prepare(BlkActionState *common,
+                                             Error **errp)
+{
+    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
+                                             common, common);
+    BlockDirtyBitmap *action;
+
+    if (action_check_completion_mode(common, errp) < 0) {
+        return;
+    }
+
+    action = common->action->u.block_dirty_bitmap_clear;
+    state->bitmap = block_dirty_bitmap_lookup(action->node,
+                                              action->name,
+                                              &state->bs,
+                                              &state->aio_context,
+                                              errp);
+    if (!state->bitmap) {
+        return;
+    }
+
+    if (bdrv_dirty_bitmap_frozen(state->bitmap)) {
+        error_setg(errp, "Cannot modify a frozen bitmap");
+        return;
+    } else if (!bdrv_dirty_bitmap_enabled(state->bitmap)) {
+        error_setg(errp, "Cannot clear a disabled bitmap");
+        return;
+    }
+
+    bdrv_clear_dirty_bitmap(state->bitmap, &state->backup);
+    /* AioContext is released in .clean() */
+}
+
+static void block_dirty_bitmap_clear_abort(BlkActionState *common)
+{
+    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
+                                             common, common);
+
+    bdrv_undo_clear_dirty_bitmap(state->bitmap, state->backup);
+}
+
+static void block_dirty_bitmap_clear_commit(BlkActionState *common)
+{
+    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
+                                             common, common);
+
+    hbitmap_free(state->backup);
+}
+
+static void block_dirty_bitmap_clear_clean(BlkActionState *common)
+{
+    BlockDirtyBitmapState *state = DO_UPCAST(BlockDirtyBitmapState,
+                                             common, common);
+
+    if (state->aio_context) {
+        aio_context_release(state->aio_context);
+    }
+}
+
+static void abort_prepare(BlkActionState *common, Error **errp)
 {
     error_setg(errp, "Transaction aborted using Abort action");
 }
 
-static void abort_commit(BlkTransactionState *common)
+static void abort_commit(BlkActionState *common)
 {
     g_assert_not_reached(); /* this action never succeeds */
 }
 
-static const BdrvActionOps actions[] = {
+static const BlkActionOps actions[] = {
+    [TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT] = {
+        .instance_size = sizeof(ExternalSnapshotState),
+        .prepare  = external_snapshot_prepare,
+        .commit   = external_snapshot_commit,
+        .abort = external_snapshot_abort,
+    },
     [TRANSACTION_ACTION_KIND_BLOCKDEV_SNAPSHOT_SYNC] = {
         .instance_size = sizeof(ExternalSnapshotState),
         .prepare  = external_snapshot_prepare,
@@ -1852,7 +2119,7 @@ static const BdrvActionOps actions[] = {
         .clean = blockdev_backup_clean,
     },
     [TRANSACTION_ACTION_KIND_ABORT] = {
-        .instance_size = sizeof(BlkTransactionState),
+        .instance_size = sizeof(BlkActionState),
         .prepare = abort_prepare,
         .commit = abort_commit,
     },
@@ -1862,28 +2129,71 @@ static const BdrvActionOps actions[] = {
         .abort = internal_snapshot_abort,
         .clean = internal_snapshot_clean,
     },
+    [TRANSACTION_ACTION_KIND_BLOCK_DIRTY_BITMAP_ADD] = {
+        .instance_size = sizeof(BlockDirtyBitmapState),
+        .prepare = block_dirty_bitmap_add_prepare,
+        .abort = block_dirty_bitmap_add_abort,
+    },
+    [TRANSACTION_ACTION_KIND_BLOCK_DIRTY_BITMAP_CLEAR] = {
+        .instance_size = sizeof(BlockDirtyBitmapState),
+        .prepare = block_dirty_bitmap_clear_prepare,
+        .commit = block_dirty_bitmap_clear_commit,
+        .abort = block_dirty_bitmap_clear_abort,
+        .clean = block_dirty_bitmap_clear_clean,
+    }
 };
 
+/**
+ * Allocate a TransactionProperties structure if necessary, and fill
+ * that structure with desired defaults if they are unset.
+ */
+static TransactionProperties *get_transaction_properties(
+    TransactionProperties *props)
+{
+    if (!props) {
+        props = g_new0(TransactionProperties, 1);
+    }
+
+    if (!props->has_completion_mode) {
+        props->has_completion_mode = true;
+        props->completion_mode = ACTION_COMPLETION_MODE_INDIVIDUAL;
+    }
+
+    return props;
+}
+
 /*
  * 'Atomic' group operations.  The operations are performed as a set, and if
  * any fail then we roll back all operations in the group.
  */
-void qmp_transaction(TransactionActionList *dev_list, Error **errp)
+void qmp_transaction(TransactionActionList *dev_list,
+                     bool has_props,
+                     struct TransactionProperties *props,
+                     Error **errp)
 {
     TransactionActionList *dev_entry = dev_list;
-    BlkTransactionState *state, *next;
+    BlockJobTxn *block_job_txn = NULL;
+    BlkActionState *state, *next;
     Error *local_err = NULL;
 
-    QSIMPLEQ_HEAD(snap_bdrv_states, BlkTransactionState) snap_bdrv_states;
+    QSIMPLEQ_HEAD(snap_bdrv_states, BlkActionState) snap_bdrv_states;
     QSIMPLEQ_INIT(&snap_bdrv_states);
 
+    /* Does this transaction get canceled as a group on failure?
+     * If not, we don't really need to make a BlockJobTxn.
+     */
+    props = get_transaction_properties(props);
+    if (props->completion_mode != ACTION_COMPLETION_MODE_INDIVIDUAL) {
+        block_job_txn = block_job_txn_new();
+    }
+
     /* drain all i/o before any operations */
     bdrv_drain_all();
 
     /* We don't do anything in this loop that commits us to the operations */
     while (NULL != dev_entry) {
         TransactionAction *dev_info = NULL;
-        const BdrvActionOps *ops;
+        const BlkActionOps *ops;
 
         dev_info = dev_entry->value;
         dev_entry = dev_entry->next;
@@ -1896,6 +2206,8 @@ void qmp_transaction(TransactionActionList *dev_list, Error **errp)
         state = g_malloc0(ops->instance_size);
         state->ops = ops;
         state->action = dev_info;
+        state->block_job_txn = block_job_txn;
+        state->txn_props = props;
         QSIMPLEQ_INSERT_TAIL(&snap_bdrv_states, state, entry);
 
         state->ops->prepare(state, &local_err);
@@ -1928,47 +2240,86 @@ exit:
         }
         g_free(state);
     }
+    if (!has_props) {
+        qapi_free_TransactionProperties(props);
+    }
+    block_job_txn_unref(block_job_txn);
 }
 
+void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
+{
+    Error *local_err = NULL;
 
-static void eject_device(BlockBackend *blk, int force, Error **errp)
+    qmp_blockdev_open_tray(device, has_force, force, &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
+        return;
+    }
+
+    qmp_blockdev_remove_medium(device, errp);
+}
+
+void qmp_block_passwd(bool has_device, const char *device,
+                      bool has_node_name, const char *node_name,
+                      const char *password, Error **errp)
 {
-    BlockDriverState *bs = blk_bs(blk);
+    Error *local_err = NULL;
+    BlockDriverState *bs;
     AioContext *aio_context;
 
-    if (!bs) {
-        /* No medium inserted, so there is nothing to do */
+    bs = bdrv_lookup_bs(has_device ? device : NULL,
+                        has_node_name ? node_name : NULL,
+                        &local_err);
+    if (local_err) {
+        error_propagate(errp, local_err);
         return;
     }
 
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_EJECT, errp)) {
-        goto out;
+    bdrv_add_key(bs, password, errp);
+
+    aio_context_release(aio_context);
+}
+
+void qmp_blockdev_open_tray(const char *device, bool has_force, bool force,
+                            Error **errp)
+{
+    BlockBackend *blk;
+    bool locked;
+
+    if (!has_force) {
+        force = false;
     }
+
+    blk = blk_by_name(device);
+    if (!blk) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "Device '%s' not found", device);
+        return;
+    }
+
     if (!blk_dev_has_removable_media(blk)) {
-        error_setg(errp, "Device '%s' is not removable",
-                   bdrv_get_device_name(bs));
-        goto out;
+        error_setg(errp, "Device '%s' is not removable", device);
+        return;
     }
 
-    if (blk_dev_is_medium_locked(blk) && !blk_dev_is_tray_open(blk)) {
-        blk_dev_eject_request(blk, force);
-        if (!force) {
-            error_setg(errp, "Device '%s' is locked",
-                       bdrv_get_device_name(bs));
-            goto out;
-        }
+    if (blk_dev_is_tray_open(blk)) {
+        return;
     }
 
-    bdrv_close(bs);
+    locked = blk_dev_is_medium_locked(blk);
+    if (locked) {
+        blk_dev_eject_request(blk, force);
+    }
 
-out:
-    aio_context_release(aio_context);
+    if (!locked || force) {
+        blk_dev_change_media_cb(blk, false);
+    }
 }
 
-void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
+void qmp_blockdev_close_tray(const char *device, Error **errp)
 {
     BlockBackend *blk;
 
@@ -1979,103 +2330,214 @@ void qmp_eject(const char *device, bool has_force, bool force, Error **errp)
         return;
     }
 
-    eject_device(blk, force, errp);
+    if (!blk_dev_has_removable_media(blk)) {
+        error_setg(errp, "Device '%s' is not removable", device);
+        return;
+    }
+
+    if (!blk_dev_is_tray_open(blk)) {
+        return;
+    }
+
+    blk_dev_change_media_cb(blk, true);
 }
 
-void qmp_block_passwd(bool has_device, const char *device,
-                      bool has_node_name, const char *node_name,
-                      const char *password, Error **errp)
+void qmp_blockdev_remove_medium(const char *device, Error **errp)
 {
-    Error *local_err = NULL;
+    BlockBackend *blk;
     BlockDriverState *bs;
     AioContext *aio_context;
+    bool has_device;
 
-    bs = bdrv_lookup_bs(has_device ? device : NULL,
-                        has_node_name ? node_name : NULL,
-                        &local_err);
-    if (local_err) {
-        error_propagate(errp, local_err);
+    blk = blk_by_name(device);
+    if (!blk) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "Device '%s' not found", device);
+        return;
+    }
+
+    /* For BBs without a device, we can exchange the BDS tree at will */
+    has_device = blk_get_attached_dev(blk);
+
+    if (has_device && !blk_dev_has_removable_media(blk)) {
+        error_setg(errp, "Device '%s' is not removable", device);
+        return;
+    }
+
+    if (has_device && !blk_dev_is_tray_open(blk)) {
+        error_setg(errp, "Tray of device '%s' is not open", device);
+        return;
+    }
+
+    bs = blk_bs(blk);
+    if (!bs) {
         return;
     }
 
     aio_context = bdrv_get_aio_context(bs);
     aio_context_acquire(aio_context);
 
-    bdrv_add_key(bs, password, errp);
+    if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_EJECT, errp)) {
+        goto out;
+    }
+
+    /* This follows the convention established by bdrv_make_anon() */
+    if (bs->device_list.tqe_prev) {
+        QTAILQ_REMOVE(&bdrv_states, bs, device_list);
+        bs->device_list.tqe_prev = NULL;
+    }
+
+    blk_remove_bs(blk);
 
+out:
     aio_context_release(aio_context);
 }
 
-/* Assumes AioContext is held */
-static void qmp_bdrv_open_encrypted(BlockDriverState **pbs,
-                                    const char *filename,
-                                    int bdrv_flags, const char *format,
-                                    const char *password, Error **errp)
+static void qmp_blockdev_insert_anon_medium(const char *device,
+                                            BlockDriverState *bs, Error **errp)
 {
-    Error *local_err = NULL;
-    QDict *options = NULL;
-    int ret;
+    BlockBackend *blk;
+    bool has_device;
 
-    if (format) {
-        options = qdict_new();
-        qdict_put(options, "driver", qstring_from_str(format));
+    blk = blk_by_name(device);
+    if (!blk) {
+        error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
+                  "Device '%s' not found", device);
+        return;
     }
 
-    ret = bdrv_open(pbs, filename, NULL, options, bdrv_flags, &local_err);
-    if (ret < 0) {
-        error_propagate(errp, local_err);
+    /* For BBs without a device, we can exchange the BDS tree at will */
+    has_device = blk_get_attached_dev(blk);
+
+    if (has_device && !blk_dev_has_removable_media(blk)) {
+        error_setg(errp, "Device '%s' is not removable", device);
         return;
     }
 
-    bdrv_add_key(*pbs, password, errp);
+    if (has_device && !blk_dev_is_tray_open(blk)) {
+        error_setg(errp, "Tray of device '%s' is not open", device);
+        return;
+    }
+
+    if (blk_bs(blk)) {
+        error_setg(errp, "There already is a medium in device '%s'", device);
+        return;
+    }
+
+    blk_insert_bs(blk, bs);
+
+    QTAILQ_INSERT_TAIL(&bdrv_states, bs, device_list);
 }
 
-void qmp_change_blockdev(const char *device, const char *filename,
-                         const char *format, Error **errp)
+void qmp_blockdev_insert_medium(const char *device, const char *node_name,
+                                Error **errp)
 {
-    BlockBackend *blk;
     BlockDriverState *bs;
-    AioContext *aio_context;
-    int bdrv_flags;
-    bool new_bs;
+
+    bs = bdrv_find_node(node_name);
+    if (!bs) {
+        error_setg(errp, "Node '%s' not found", node_name);
+        return;
+    }
+
+    if (bs->blk) {
+        error_setg(errp, "Node '%s' is already in use by '%s'", node_name,
+                   blk_name(bs->blk));
+        return;
+    }
+
+    qmp_blockdev_insert_anon_medium(device, bs, errp);
+}
+
+void qmp_blockdev_change_medium(const char *device, const char *filename,
+                                bool has_format, const char *format,
+                                bool has_read_only,
+                                BlockdevChangeReadOnlyMode read_only,
+                                Error **errp)
+{
+    BlockBackend *blk;
+    BlockDriverState *medium_bs = NULL;
+    int bdrv_flags, ret;
+    QDict *options = NULL;
     Error *err = NULL;
 
     blk = blk_by_name(device);
     if (!blk) {
         error_set(errp, ERROR_CLASS_DEVICE_NOT_FOUND,
                   "Device '%s' not found", device);
-        return;
+        goto fail;
     }
-    bs = blk_bs(blk);
-    new_bs = !bs;
 
-    aio_context = blk_get_aio_context(blk);
-    aio_context_acquire(aio_context);
+    if (blk_bs(blk)) {
+        blk_update_root_state(blk);
+    }
+
+    bdrv_flags = blk_get_open_flags_from_root_state(blk);
+
+    if (!has_read_only) {
+        read_only = BLOCKDEV_CHANGE_READ_ONLY_MODE_RETAIN;
+    }
+
+    switch (read_only) {
+    case BLOCKDEV_CHANGE_READ_ONLY_MODE_RETAIN:
+        break;
+
+    case BLOCKDEV_CHANGE_READ_ONLY_MODE_READ_ONLY:
+        bdrv_flags &= ~BDRV_O_RDWR;
+        break;
+
+    case BLOCKDEV_CHANGE_READ_ONLY_MODE_READ_WRITE:
+        bdrv_flags |= BDRV_O_RDWR;
+        break;
+
+    default:
+        abort();
+    }
 
-    eject_device(blk, 0, &err);
+    if (has_format) {
+        options = qdict_new();
+        qdict_put(options, "driver", qstring_from_str(format));
+    }
+
+    assert(!medium_bs);
+    ret = bdrv_open(&medium_bs, filename, NULL, options, bdrv_flags, errp);
+    if (ret < 0) {
+        goto fail;
+    }
+
+    blk_apply_root_state(blk, medium_bs);
+
+    bdrv_add_key(medium_bs, NULL, &err);
     if (err) {
         error_propagate(errp, err);
-        goto out;
+        goto fail;
     }
 
-    bdrv_flags = blk_is_read_only(blk) ? 0 : BDRV_O_RDWR;
-    bdrv_flags |= blk_get_root_state(blk)->open_flags & ~BDRV_O_RDWR;
+    qmp_blockdev_open_tray(device, false, false, &err);
+    if (err) {
+        error_propagate(errp, err);
+        goto fail;
+    }
 
-    qmp_bdrv_open_encrypted(&bs, filename, bdrv_flags, format, NULL, &err);
+    qmp_blockdev_remove_medium(device, &err);
     if (err) {
         error_propagate(errp, err);
-        goto out;
+        goto fail;
     }
 
-    if (new_bs) {
-        blk_insert_bs(blk, bs);
-        /* Has been sent automatically by bdrv_open() if blk_bs(blk) was not
-         * NULL */
-        blk_dev_change_media_cb(blk, true);
+    qmp_blockdev_insert_anon_medium(device, medium_bs, &err);
+    if (err) {
+        error_propagate(errp, err);
+        goto fail;
     }
 
-out:
-    aio_context_release(aio_context);
+    qmp_blockdev_close_tray(device, errp);
+
+fail:
+    /* If the medium has been inserted, the device has its own reference, so
+     * ours must be relinquished; and if it has not been inserted successfully,
+     * the reference must be relinquished anyway */
+    bdrv_unref(medium_bs);
 }
 
 /* throttling disk I/O limits */
@@ -2161,14 +2623,14 @@ void qmp_block_set_io_throttle(const char *device, int64_t bps, int64_t bps_rd,
     if (throttle_enabled(&cfg)) {
         /* Enable I/O limits if they're not enabled yet, otherwise
          * just update the throttling group. */
-        if (!bs->io_limits_enabled) {
+        if (!bs->throttle_state) {
             bdrv_io_limits_enable(bs, has_group ? group : device);
         } else if (has_group) {
             bdrv_io_limits_update_group(bs, group);
         }
         /* Set the new throttling configuration */
         bdrv_set_io_limits(bs, &cfg);
-    } else if (bs->io_limits_enabled) {
+    } else if (bs->throttle_state) {
         /* If all throttling settings are set to 0, disable I/O limits */
         bdrv_io_limits_disable(bs);
     }
@@ -2267,7 +2729,7 @@ void qmp_block_dirty_bitmap_clear(const char *node, const char *name,
         goto out;
     }
 
-    bdrv_clear_dirty_bitmap(bitmap);
+    bdrv_clear_dirty_bitmap(bitmap, NULL);
 
  out:
     aio_context_release(aio_context);
@@ -2410,8 +2872,6 @@ static void block_job_cb(void *opaque, int ret)
     } else {
         block_job_event_completed(bs->job, msg);
     }
-
-    bdrv_put_ref_bh_schedule(bs);
 }
 
 void qmp_block_stream(const char *device,
@@ -2592,15 +3052,17 @@ out:
     aio_context_release(aio_context);
 }
 
-void qmp_drive_backup(const char *device, const char *target,
-                      bool has_format, const char *format,
-                      enum MirrorSyncMode sync,
-                      bool has_mode, enum NewImageMode mode,
-                      bool has_speed, int64_t speed,
-                      bool has_bitmap, const char *bitmap,
-                      bool has_on_source_error, BlockdevOnError on_source_error,
-                      bool has_on_target_error, BlockdevOnError on_target_error,
-                      Error **errp)
+static void do_drive_backup(const char *device, const char *target,
+                            bool has_format, const char *format,
+                            enum MirrorSyncMode sync,
+                            bool has_mode, enum NewImageMode mode,
+                            bool has_speed, int64_t speed,
+                            bool has_bitmap, const char *bitmap,
+                            bool has_on_source_error,
+                            BlockdevOnError on_source_error,
+                            bool has_on_target_error,
+                            BlockdevOnError on_target_error,
+                            BlockJobTxn *txn, Error **errp)
 {
     BlockBackend *blk;
     BlockDriverState *bs;
@@ -2715,7 +3177,7 @@ void qmp_drive_backup(const char *device, const char *target,
 
     backup_start(bs, target_bs, speed, sync, bmap,
                  on_source_error, on_target_error,
-                 block_job_cb, bs, &local_err);
+                 block_job_cb, bs, txn, &local_err);
     if (local_err != NULL) {
         bdrv_unref(target_bs);
         error_propagate(errp, local_err);
@@ -2726,19 +3188,37 @@ out:
     aio_context_release(aio_context);
 }
 
+void qmp_drive_backup(const char *device, const char *target,
+                      bool has_format, const char *format,
+                      enum MirrorSyncMode sync,
+                      bool has_mode, enum NewImageMode mode,
+                      bool has_speed, int64_t speed,
+                      bool has_bitmap, const char *bitmap,
+                      bool has_on_source_error, BlockdevOnError on_source_error,
+                      bool has_on_target_error, BlockdevOnError on_target_error,
+                      Error **errp)
+{
+    return do_drive_backup(device, target, has_format, format, sync,
+                           has_mode, mode, has_speed, speed,
+                           has_bitmap, bitmap,
+                           has_on_source_error, on_source_error,
+                           has_on_target_error, on_target_error,
+                           NULL, errp);
+}
+
 BlockDeviceInfoList *qmp_query_named_block_nodes(Error **errp)
 {
     return bdrv_named_nodes_list(errp);
 }
 
-void qmp_blockdev_backup(const char *device, const char *target,
+void do_blockdev_backup(const char *device, const char *target,
                          enum MirrorSyncMode sync,
                          bool has_speed, int64_t speed,
                          bool has_on_source_error,
                          BlockdevOnError on_source_error,
                          bool has_on_target_error,
                          BlockdevOnError on_target_error,
-                         Error **errp)
+                         BlockJobTxn *txn, Error **errp)
 {
     BlockBackend *blk, *target_blk;
     BlockDriverState *bs;
@@ -2786,7 +3266,7 @@ void qmp_blockdev_backup(const char *device, const char *target,
     bdrv_ref(target_bs);
     bdrv_set_aio_context(target_bs, aio_context);
     backup_start(bs, target_bs, speed, sync, NULL, on_source_error,
-                 on_target_error, block_job_cb, bs, &local_err);
+                 on_target_error, block_job_cb, bs, txn, &local_err);
     if (local_err != NULL) {
         bdrv_unref(target_bs);
         error_propagate(errp, local_err);
@@ -2795,6 +3275,21 @@ out:
     aio_context_release(aio_context);
 }
 
+void qmp_blockdev_backup(const char *device, const char *target,
+                         enum MirrorSyncMode sync,
+                         bool has_speed, int64_t speed,
+                         bool has_on_source_error,
+                         BlockdevOnError on_source_error,
+                         bool has_on_target_error,
+                         BlockdevOnError on_target_error,
+                         Error **errp)
+{
+    do_blockdev_backup(device, target, sync, has_speed, speed,
+                       has_on_source_error, on_source_error,
+                       has_on_target_error, on_target_error,
+                       NULL, errp);
+}
+
 void qmp_drive_mirror(const char *device, const char *target,
                       bool has_format, const char *format,
                       bool has_node_name, const char *node_name,
@@ -3274,6 +3769,72 @@ fail:
     qmp_output_visitor_cleanup(ov);
 }
 
+void qmp_x_blockdev_del(bool has_id, const char *id,
+                        bool has_node_name, const char *node_name, Error **errp)
+{
+    AioContext *aio_context;
+    BlockBackend *blk;
+    BlockDriverState *bs;
+
+    if (has_id && has_node_name) {
+        error_setg(errp, "Only one of id and node-name must be specified");
+        return;
+    } else if (!has_id && !has_node_name) {
+        error_setg(errp, "No block device specified");
+        return;
+    }
+
+    if (has_id) {
+        blk = blk_by_name(id);
+        if (!blk) {
+            error_setg(errp, "Cannot find block backend %s", id);
+            return;
+        }
+        if (blk_get_refcnt(blk) > 1) {
+            error_setg(errp, "Block backend %s is in use", id);
+            return;
+        }
+        bs = blk_bs(blk);
+        aio_context = blk_get_aio_context(blk);
+    } else {
+        bs = bdrv_find_node(node_name);
+        if (!bs) {
+            error_setg(errp, "Cannot find node %s", node_name);
+            return;
+        }
+        blk = bs->blk;
+        if (blk) {
+            error_setg(errp, "Node %s is in use by %s",
+                       node_name, blk_name(blk));
+            return;
+        }
+        aio_context = bdrv_get_aio_context(bs);
+    }
+
+    aio_context_acquire(aio_context);
+
+    if (bs) {
+        if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_DRIVE_DEL, errp)) {
+            goto out;
+        }
+
+        if (bs->refcnt > 1 || !QLIST_EMPTY(&bs->parents)) {
+            error_setg(errp, "Block device %s is in use",
+                       bdrv_get_device_or_node_name(bs));
+            goto out;
+        }
+    }
+
+    if (blk) {
+        blk_unref(blk);
+    } else {
+        bdrv_unref(bs);
+    }
+
+out:
+    aio_context_release(aio_context);
+}
+
 BlockJobInfoList *qmp_query_block_jobs(Error **errp)
 {
     BlockJobInfoList *head = NULL, **p_next = &head;
@@ -3405,6 +3966,16 @@ QemuOptsList qemu_common_drive_opts = {
             .name = "detect-zeroes",
             .type = QEMU_OPT_STRING,
             .help = "try to optimize zero writes (off, on, unmap)",
+        },{
+            .name = "stats-account-invalid",
+            .type = QEMU_OPT_BOOL,
+            .help = "whether to account for invalid I/O operations "
+                    "in the statistics",
+        },{
+            .name = "stats-account-failed",
+            .type = QEMU_OPT_BOOL,
+            .help = "whether to account for failed I/O operations "
+                    "in the statistics",
         },
         { /* end of list */ }
     },
diff --git a/blockjob.c b/blockjob.c
index c02fe59..80adb9d 100644
--- a/blockjob.c
+++ b/blockjob.c
@@ -37,6 +37,19 @@
 #include "qemu/timer.h"
 #include "qapi-event.h"
 
+/* Transactional group of block jobs */
+struct BlockJobTxn {
+
+    /* Is this txn being cancelled? */
+    bool aborting;
+
+    /* List of jobs */
+    QLIST_HEAD(, BlockJob) jobs;
+
+    /* Reference count */
+    int refcnt;
+};
+
 void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
                        int64_t speed, BlockCompletionFunc *cb,
                        void *opaque, Error **errp)
@@ -60,6 +73,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
     job->cb            = cb;
     job->opaque        = opaque;
     job->busy          = true;
+    job->refcnt        = 1;
     bs->job = job;
 
     /* Only set speed when necessary to avoid NotSupported error */
@@ -68,7 +82,7 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
 
         block_job_set_speed(job, speed, &local_err);
         if (local_err) {
-            block_job_release(bs);
+            block_job_unref(job);
             error_propagate(errp, local_err);
             return NULL;
         }
@@ -76,15 +90,101 @@ void *block_job_create(const BlockJobDriver *driver, BlockDriverState *bs,
     return job;
 }
 
-void block_job_release(BlockDriverState *bs)
+void block_job_ref(BlockJob *job)
 {
-    BlockJob *job = bs->job;
+    ++job->refcnt;
+}
 
-    bs->job = NULL;
-    bdrv_op_unblock_all(bs, job->blocker);
-    error_free(job->blocker);
-    g_free(job->id);
-    g_free(job);
+void block_job_unref(BlockJob *job)
+{
+    if (--job->refcnt == 0) {
+        job->bs->job = NULL;
+        bdrv_op_unblock_all(job->bs, job->blocker);
+        bdrv_unref(job->bs);
+        error_free(job->blocker);
+        g_free(job->id);
+        g_free(job);
+    }
+}
+
+static void block_job_completed_single(BlockJob *job)
+{
+    if (!job->ret) {
+        if (job->driver->commit) {
+            job->driver->commit(job);
+        }
+    } else {
+        if (job->driver->abort) {
+            job->driver->abort(job);
+        }
+    }
+    job->cb(job->opaque, job->ret);
+    if (job->txn) {
+        block_job_txn_unref(job->txn);
+    }
+    block_job_unref(job);
+}
+
+static void block_job_completed_txn_abort(BlockJob *job)
+{
+    AioContext *ctx;
+    BlockJobTxn *txn = job->txn;
+    BlockJob *other_job, *next;
+
+    if (txn->aborting) {
+        /*
+         * We are cancelled by another job, which will handle everything.
+         */
+        return;
+    }
+    txn->aborting = true;
+    /* We are the first failed job. Cancel other jobs. */
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        aio_context_acquire(ctx);
+    }
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        if (other_job == job || other_job->completed) {
+            /* Other jobs are "effectively" cancelled by us, set the status for
+             * them; this job, however, may or may not be cancelled, depending
+             * on the caller, so leave it. */
+            if (other_job != job) {
+                other_job->cancelled = true;
+            }
+            continue;
+        }
+        block_job_cancel_sync(other_job);
+        assert(other_job->completed);
+    }
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        block_job_completed_single(other_job);
+        aio_context_release(ctx);
+    }
+}
+
+static void block_job_completed_txn_success(BlockJob *job)
+{
+    AioContext *ctx;
+    BlockJobTxn *txn = job->txn;
+    BlockJob *other_job, *next;
+    /*
+     * Successful completion, see if there are other running jobs in this
+     * txn.
+     */
+    QLIST_FOREACH(other_job, &txn->jobs, txn_list) {
+        if (!other_job->completed) {
+            return;
+        }
+    }
+    /* We are the last completed job, commit the transaction. */
+    QLIST_FOREACH_SAFE(other_job, &txn->jobs, txn_list, next) {
+        ctx = bdrv_get_aio_context(other_job->bs);
+        aio_context_acquire(ctx);
+        assert(other_job->ret == 0);
+        block_job_completed_single(other_job);
+        aio_context_release(ctx);
+    }
 }
 
 void block_job_completed(BlockJob *job, int ret)
@@ -92,8 +192,16 @@ void block_job_completed(BlockJob *job, int ret)
     BlockDriverState *bs = job->bs;
 
     assert(bs->job == job);
-    job->cb(job->opaque, ret);
-    block_job_release(bs);
+    assert(!job->completed);
+    job->completed = true;
+    job->ret = ret;
+    if (!job->txn) {
+        block_job_completed_single(job);
+    } else if (ret < 0 || block_job_is_cancelled(job)) {
+        block_job_completed_txn_abort(job);
+    } else {
+        block_job_completed_txn_success(job);
+    }
 }
 
 void block_job_set_speed(BlockJob *job, int64_t speed, Error **errp)
@@ -178,43 +286,29 @@ struct BlockFinishData {
     int ret;
 };
 
-static void block_job_finish_cb(void *opaque, int ret)
-{
-    struct BlockFinishData *data = opaque;
-
-    data->cancelled = block_job_is_cancelled(data->job);
-    data->ret = ret;
-    data->cb(data->opaque, ret);
-}
-
 static int block_job_finish_sync(BlockJob *job,
                                  void (*finish)(BlockJob *, Error **errp),
                                  Error **errp)
 {
-    struct BlockFinishData data;
     BlockDriverState *bs = job->bs;
     Error *local_err = NULL;
+    int ret;
 
     assert(bs->job == job);
 
-    /* Set up our own callback to store the result and chain to
-     * the original callback.
-     */
-    data.job = job;
-    data.cb = job->cb;
-    data.opaque = job->opaque;
-    data.ret = -EINPROGRESS;
-    job->cb = block_job_finish_cb;
-    job->opaque = &data;
+    block_job_ref(job);
     finish(job, &local_err);
     if (local_err) {
         error_propagate(errp, local_err);
+        block_job_unref(job);
         return -EBUSY;
     }
-    while (data.ret == -EINPROGRESS) {
+    while (!job->completed) {
         aio_poll(bdrv_get_aio_context(bs), true);
     }
-    return (data.cancelled && data.ret == 0) ? -ECANCELED : data.ret;
+    ret = (job->cancelled && job->ret == 0) ? -ECANCELED : job->ret;
+    block_job_unref(job);
+    return ret;
 }
 
 /* A wrapper around block_job_cancel() taking an Error ** parameter so it may be
@@ -406,3 +500,36 @@ void block_job_defer_to_main_loop(BlockJob *job,
 
     qemu_bh_schedule(data->bh);
 }
+
+BlockJobTxn *block_job_txn_new(void)
+{
+    BlockJobTxn *txn = g_new0(BlockJobTxn, 1);
+    QLIST_INIT(&txn->jobs);
+    txn->refcnt = 1;
+    return txn;
+}
+
+static void block_job_txn_ref(BlockJobTxn *txn)
+{
+    txn->refcnt++;
+}
+
+void block_job_txn_unref(BlockJobTxn *txn)
+{
+    if (txn && --txn->refcnt == 0) {
+        g_free(txn);
+    }
+}
+
+void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job)
+{
+    if (!txn) {
+        return;
+    }
+
+    assert(!job->txn);
+    job->txn = txn;
+
+    QLIST_INSERT_HEAD(&txn->jobs, job, txn_list);
+    block_job_txn_ref(txn);
+}
diff --git a/configure b/configure
index b4a3488..0a4c78a 100755
--- a/configure
+++ b/configure
@@ -8,6 +8,9 @@
 CLICOLOR_FORCE= GREP_OPTIONS=
 unset CLICOLOR_FORCE GREP_OPTIONS
 
+# Don't allow CCACHE, if present, to use cached results of compile tests!
+export CCACHE_RECACHE=yes
+
 # Temporary directory used for files created while
 # configure runs. Since it is in the build directory
 # we can safely blow away any previous version of it
@@ -261,6 +264,7 @@ rdma=""
 gprof="no"
 debug_tcg="no"
 debug="no"
+fortify_source=""
 strip_opt="yes"
 tcg_interpreter="no"
 bigendian="no"
@@ -787,6 +791,9 @@ for opt do
   --enable-modules)
       modules="yes"
   ;;
+  --disable-modules)
+      modules="no"
+  ;;
   --cpu=*)
   ;;
   --target-list=*) target_list="$optarg"
@@ -876,6 +883,7 @@ for opt do
       debug_tcg="yes"
       debug="yes"
       strip_opt="no"
+      fortify_source="no"
   ;;
   --enable-sparse) sparse="yes"
   ;;
@@ -1340,7 +1348,6 @@ disabled with --disable-FEATURE, default is enabled if available:
   vte             vte support for the gtk UI
   curses          curses UI
   vnc             VNC UI support
-  vnc-tls         TLS encryption for VNC server
   vnc-sasl        SASL encryption for VNC server
   vnc-jpeg        JPEG lossy compression for VNC server
   vnc-png         PNG compression for VNC server
@@ -1881,16 +1888,34 @@ fi
 # libseccomp check
 
 if test "$seccomp" != "no" ; then
-    if test "$cpu" = "i386" || test "$cpu" = "x86_64" &&
-        $pkg_config --atleast-version=2.1.1 libseccomp; then
+    case "$cpu" in
+    i386|x86_64)
+        libseccomp_minver="2.1.0"
+        ;;
+    arm|aarch64)
+        libseccomp_minver="2.2.3"
+        ;;
+    *)
+        libseccomp_minver=""
+        ;;
+    esac
+
+    if test "$libseccomp_minver" != "" &&
+       $pkg_config --atleast-version=$libseccomp_minver libseccomp ; then
         libs_softmmu="$libs_softmmu `$pkg_config --libs libseccomp`"
         QEMU_CFLAGS="$QEMU_CFLAGS `$pkg_config --cflags libseccomp`"
-	seccomp="yes"
+        seccomp="yes"
     else
-	if test "$seccomp" = "yes"; then
-            feature_not_found "libseccomp" "Install libseccomp devel >= 2.1.1"
-	fi
-	seccomp="no"
+        if test "$seccomp" = "yes" ; then
+            if test "$libseccomp_minver" != "" ; then
+                feature_not_found "libseccomp" \
+                    "Install libseccomp devel >= $libseccomp_minver"
+            else
+                feature_not_found "libseccomp" \
+                    "libseccomp is not supported for host cpu $cpu"
+            fi
+        fi
+        seccomp="no"
     fi
 fi
 ##########################################
@@ -4398,6 +4423,7 @@ fi
 # check if ccache is interfering with
 # semantic analysis of macros
 
+unset CCACHE_CPP2
 ccache_cpp2=no
 cat > $TMPC << EOF
 static const int Z = 1;
@@ -4421,6 +4447,20 @@ if ! compile_object "-Werror"; then
     ccache_cpp2=yes
 fi
 
+#################################################
+# clang does not support glibc + FORTIFY_SOURCE.
+
+if test "$fortify_source" != "no"; then
+  if echo | $cc -dM -E - | grep __clang__ > /dev/null 2>&1 ; then
+    fortify_source="no";
+  elif test -n "$cxx" &&
+       echo | $cxx -dM -E - | grep __clang__ >/dev/null 2>&1 ; then
+    fortify_source="no";
+  else
+    fortify_source="yes"
+  fi
+fi
+
 ##########################################
 # End of CC checks
 # After here, no more $cc or $ld runs
@@ -4428,8 +4468,10 @@ fi
 if test "$gcov" = "yes" ; then
   CFLAGS="-fprofile-arcs -ftest-coverage -g $CFLAGS"
   LDFLAGS="-fprofile-arcs -ftest-coverage $LDFLAGS"
-elif test "$debug" = "no" ; then
+elif test "$fortify_source" = "yes" ; then
   CFLAGS="-O2 -U_FORTIFY_SOURCE -D_FORTIFY_SOURCE=2 $CFLAGS"
+elif test "$debug" = "no"; then
+  CFLAGS="-O2 $CFLAGS"
 fi
 
 ##########################################
diff --git a/contrib/ivshmem-server/ivshmem-server.c b/contrib/ivshmem-server/ivshmem-server.c
index 5e5239c..d9e26b0 100644
--- a/contrib/ivshmem-server/ivshmem-server.c
+++ b/contrib/ivshmem-server/ivshmem-server.c
@@ -168,7 +168,9 @@ ivshmem_server_handle_new_conn(IvshmemServer *server)
     }
     if (i == G_MAXUINT16) {
         IVSHMEM_SERVER_DEBUG(server, "cannot allocate new client id\n");
-        goto fail;
+        close(newfd);
+        g_free(peer);
+        return -1;
     }
     peer->id = server->cur_id++;
 
diff --git a/cpu-exec.c b/cpu-exec.c
index 7eef083..c88d0ff 100644
--- a/cpu-exec.c
+++ b/cpu-exec.c
@@ -30,6 +30,7 @@
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
 #include "hw/i386/apic.h"
 #endif
+#include "sysemu/replay.h"
 
 /* -icount align implementation. */
 
@@ -184,7 +185,7 @@ static inline tcg_target_ulong cpu_tb_exec(CPUState *cpu, uint8_t *tb_ptr)
 /* Execute the code without caching the generated code. An interpreter
    could be used if available. */
 static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
-                             TranslationBlock *orig_tb)
+                             TranslationBlock *orig_tb, bool ignore_icount)
 {
     TranslationBlock *tb;
 
@@ -194,7 +195,8 @@ static void cpu_exec_nocache(CPUState *cpu, int max_cycles,
         max_cycles = CF_COUNT_MASK;
 
     tb = tb_gen_code(cpu, orig_tb->pc, orig_tb->cs_base, orig_tb->flags,
-                     max_cycles | CF_NOCACHE);
+                     max_cycles | CF_NOCACHE
+                         | (ignore_icount ? CF_IGNORE_ICOUNT : 0));
     tb->orig_tb = tcg_ctx.tb_ctx.tb_invalidated_flag ? NULL : orig_tb;
     cpu->current_tb = tb;
     /* execute the generated code */
@@ -345,21 +347,25 @@ int cpu_exec(CPUState *cpu)
     uintptr_t next_tb;
     SyncClocks sc;
 
+    /* replay_interrupt may need current_cpu */
+    current_cpu = cpu;
+
     if (cpu->halted) {
 #if defined(TARGET_I386) && !defined(CONFIG_USER_ONLY)
-        if (cpu->interrupt_request & CPU_INTERRUPT_POLL) {
+        if ((cpu->interrupt_request & CPU_INTERRUPT_POLL)
+            && replay_interrupt()) {
             apic_poll_irq(x86_cpu->apic_state);
             cpu_reset_interrupt(cpu, CPU_INTERRUPT_POLL);
         }
 #endif
         if (!cpu_has_work(cpu)) {
+            current_cpu = NULL;
             return EXCP_HALTED;
         }
 
         cpu->halted = 0;
     }
 
-    current_cpu = cpu;
     atomic_mb_set(&tcg_current_cpu, cpu);
     rcu_read_lock();
 
@@ -401,10 +407,22 @@ int cpu_exec(CPUState *cpu)
                     cpu->exception_index = -1;
                     break;
 #else
-                    cc->do_interrupt(cpu);
-                    cpu->exception_index = -1;
+                    if (replay_exception()) {
+                        cc->do_interrupt(cpu);
+                        cpu->exception_index = -1;
+                    } else if (!replay_has_interrupt()) {
+                        /* give a chance to iothread in replay mode */
+                        ret = EXCP_INTERRUPT;
+                        break;
+                    }
 #endif
                 }
+            } else if (replay_has_exception()
+                       && cpu->icount_decr.u16.low + cpu->icount_extra == 0) {
+                /* try to cause an exception pending in the log */
+                cpu_exec_nocache(cpu, 1, tb_find_fast(cpu), true);
+                ret = -1;
+                break;
             }
 
             next_tb = 0; /* force lookup of first TB */
@@ -420,30 +438,40 @@ int cpu_exec(CPUState *cpu)
                         cpu->exception_index = EXCP_DEBUG;
                         cpu_loop_exit(cpu);
                     }
-                    if (interrupt_request & CPU_INTERRUPT_HALT) {
+                    if (replay_mode == REPLAY_MODE_PLAY
+                        && !replay_has_interrupt()) {
+                        /* Do nothing */
+                    } else if (interrupt_request & CPU_INTERRUPT_HALT) {
+                        replay_interrupt();
                         cpu->interrupt_request &= ~CPU_INTERRUPT_HALT;
                         cpu->halted = 1;
                         cpu->exception_index = EXCP_HLT;
                         cpu_loop_exit(cpu);
                     }
 #if defined(TARGET_I386)
-                    if (interrupt_request & CPU_INTERRUPT_INIT) {
+                    else if (interrupt_request & CPU_INTERRUPT_INIT) {
+                        replay_interrupt();
                         cpu_svm_check_intercept_param(env, SVM_EXIT_INIT, 0);
                         do_cpu_init(x86_cpu);
                         cpu->exception_index = EXCP_HALTED;
                         cpu_loop_exit(cpu);
                     }
 #else
-                    if (interrupt_request & CPU_INTERRUPT_RESET) {
+                    else if (interrupt_request & CPU_INTERRUPT_RESET) {
+                        replay_interrupt();
                         cpu_reset(cpu);
+                        cpu_loop_exit(cpu);
                     }
 #endif
                     /* The target hook has 3 exit conditions:
                        False when the interrupt isn't processed,
                        True when it is, and we should restart on a new TB,
                        and via longjmp via cpu_loop_exit.  */
-                    if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
-                        next_tb = 0;
+                    else {
+                        replay_interrupt();
+                        if (cc->cpu_exec_interrupt(cpu, interrupt_request)) {
+                            next_tb = 0;
+                        }
                     }
                     /* Don't use the cached interrupt_request value,
                        do_interrupt may have updated the EXITTB flag. */
@@ -454,7 +482,8 @@ int cpu_exec(CPUState *cpu)
                         next_tb = 0;
                     }
                 }
-                if (unlikely(cpu->exit_request)) {
+                if (unlikely(cpu->exit_request
+                             || replay_has_interrupt())) {
                     cpu->exit_request = 0;
                     cpu->exception_index = EXCP_INTERRUPT;
                     cpu_loop_exit(cpu);
@@ -519,7 +548,7 @@ int cpu_exec(CPUState *cpu)
                             if (insns_left > 0) {
                                 /* Execute remaining instructions.  */
                                 tb = (TranslationBlock *)(next_tb & ~TB_EXIT_MASK);
-                                cpu_exec_nocache(cpu, insns_left, tb);
+                                cpu_exec_nocache(cpu, insns_left, tb, false);
                                 align_clocks(&sc, cpu);
                             }
                             cpu->exception_index = EXCP_INTERRUPT;
@@ -539,15 +568,27 @@ int cpu_exec(CPUState *cpu)
                    only be set by a memory fault) */
             } /* for(;;) */
         } else {
-            /* Reload env after longjmp - the compiler may have smashed all
-             * local variables as longjmp is marked 'noreturn'. */
+#if defined(__clang__) || !QEMU_GNUC_PREREQ(4, 6)
+            /* Some compilers wrongly smash all local variables after
+             * siglongjmp. There were bug reports for gcc 4.5.0 and clang.
+             * Reload essential local variables here for those compilers.
+             * Newer versions of gcc would complain about this code (-Wclobbered). */
             cpu = current_cpu;
             cc = CPU_GET_CLASS(cpu);
-            cpu->can_do_io = 1;
 #ifdef TARGET_I386
             x86_cpu = X86_CPU(cpu);
             env = &x86_cpu->env;
 #endif
+#else /* buggy compiler */
+            /* Assert that the compiler does not smash local variables. */
+            g_assert(cpu == current_cpu);
+            g_assert(cc == CPU_GET_CLASS(cpu));
+#ifdef TARGET_I386
+            g_assert(x86_cpu == X86_CPU(cpu));
+            g_assert(env == &x86_cpu->env);
+#endif
+#endif /* buggy compiler */
+            cpu->can_do_io = 1;
             tb_lock_reset();
         }
     } /* for(;;) */
diff --git a/cpus.c b/cpus.c
index d2e9e4f..877bd70 100644
--- a/cpus.c
+++ b/cpus.c
@@ -42,6 +42,7 @@
 #include "qemu/seqlock.h"
 #include "qapi-event.h"
 #include "hw/nmi.h"
+#include "sysemu/replay.h"
 
 #ifndef _WIN32
 #include "qemu/compatfd.h"
@@ -334,7 +335,7 @@ static int64_t qemu_icount_round(int64_t count)
     return (count + (1 << icount_time_shift) - 1) >> icount_time_shift;
 }
 
-static void icount_warp_rt(void *opaque)
+static void icount_warp_rt(void)
 {
     /* The icount_warp_timer is rescheduled soon after vm_clock_warp_start
      * changes from -1 to another value, so the race here is okay.
@@ -345,7 +346,8 @@ static void icount_warp_rt(void *opaque)
 
     seqlock_write_lock(&timers_state.vm_clock_seqlock);
     if (runstate_is_running()) {
-        int64_t clock = cpu_get_clock_locked();
+        int64_t clock = REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT,
+                                     cpu_get_clock_locked());
         int64_t warp_delta;
 
         warp_delta = clock - vm_clock_warp_start;
@@ -368,6 +370,11 @@ static void icount_warp_rt(void *opaque)
     }
 }
 
+static void icount_dummy_timer(void *opaque)
+{
+    (void)opaque;
+}
+
 void qtest_clock_warp(int64_t dest)
 {
     int64_t clock = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
@@ -403,6 +410,18 @@ void qemu_clock_warp(QEMUClockType type)
         return;
     }
 
+    /* Nothing to do if the VM is stopped: QEMU_CLOCK_VIRTUAL timers
+     * do not fire, so computing the deadline does not make sense.
+     */
+    if (!runstate_is_running()) {
+        return;
+    }
+
+    /* warp clock deterministically in record/replay mode */
+    if (!replay_checkpoint(CHECKPOINT_CLOCK_WARP)) {
+        return;
+    }
+
     if (icount_sleep) {
         /*
          * If the CPUs have been sleeping, advance QEMU_CLOCK_VIRTUAL timer now.
@@ -412,7 +431,7 @@ void qemu_clock_warp(QEMUClockType type)
          * the CPU starts running, in case the CPU is woken by an event other
          * than the earliest QEMU_CLOCK_VIRTUAL timer.
          */
-        icount_warp_rt(NULL);
+        icount_warp_rt();
         timer_del(icount_warp_timer);
     }
     if (!all_cpu_threads_idle()) {
@@ -605,7 +624,7 @@ void configure_icount(QemuOpts *opts, Error **errp)
     icount_sleep = qemu_opt_get_bool(opts, "sleep", true);
     if (icount_sleep) {
         icount_warp_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL_RT,
-                                         icount_warp_rt, NULL);
+                                         icount_dummy_timer, NULL);
     }
 
     icount_align_option = qemu_opt_get_bool(opts, "align", false);
@@ -694,15 +713,6 @@ void cpu_synchronize_all_post_init(void)
     }
 }
 
-void cpu_clean_all_dirty(void)
-{
-    CPUState *cpu;
-
-    CPU_FOREACH(cpu) {
-        cpu_clean_state(cpu);
-    }
-}
-
 static int do_vm_stop(RunState state)
 {
     int ret = 0;
@@ -1411,6 +1421,28 @@ int vm_stop_force_state(RunState state)
     }
 }
 
+static int64_t tcg_get_icount_limit(void)
+{
+    int64_t deadline;
+
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
+
+        /* Maintain prior (possibly buggy) behaviour where if no deadline
+         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
+         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
+         * nanoseconds.
+         */
+        if ((deadline < 0) || (deadline > INT32_MAX)) {
+            deadline = INT32_MAX;
+        }
+
+        return qemu_icount_round(deadline);
+    } else {
+        return replay_get_instructions();
+    }
+}
+
 static int tcg_cpu_exec(CPUState *cpu)
 {
     int ret;
@@ -1423,24 +1455,12 @@ static int tcg_cpu_exec(CPUState *cpu)
 #endif
     if (use_icount) {
         int64_t count;
-        int64_t deadline;
         int decr;
         timers_state.qemu_icount -= (cpu->icount_decr.u16.low
                                     + cpu->icount_extra);
         cpu->icount_decr.u16.low = 0;
         cpu->icount_extra = 0;
-        deadline = qemu_clock_deadline_ns_all(QEMU_CLOCK_VIRTUAL);
-
-        /* Maintain prior (possibly buggy) behaviour where if no deadline
-         * was set (as there is no QEMU_CLOCK_VIRTUAL timer) or it is more than
-         * INT32_MAX nanoseconds ahead, we still use INT32_MAX
-         * nanoseconds.
-         */
-        if ((deadline < 0) || (deadline > INT32_MAX)) {
-            deadline = INT32_MAX;
-        }
-
-        count = qemu_icount_round(deadline);
+        count = tcg_get_icount_limit();
         timers_state.qemu_icount += count;
         decr = (count > 0xffff) ? 0xffff : count;
         count -= decr;
@@ -1458,6 +1478,7 @@ static int tcg_cpu_exec(CPUState *cpu)
                         + cpu->icount_extra);
         cpu->icount_decr.u32 = 0;
         cpu->icount_extra = 0;
+        replay_account_executed_instructions();
     }
     return ret;
 }
diff --git a/disas/arm.c b/disas/arm.c
index 6165246..7a7354b 100644
--- a/disas/arm.c
+++ b/disas/arm.c
@@ -1779,7 +1779,7 @@ print_insn_coprocessor (bfd_vma pc, struct disassemble_info *info, long given,
 
 			/* Is ``imm'' a negative number?  */
 			if (imm & 0x40)
-			  imm |= (-1 << 7);
+			  imm |= (~0u << 7);
 
 			func (stream, "%d", imm);
 		      }
diff --git a/docs/bitmaps.md b/docs/bitmaps.md
index fa87f07..a2e8d51 100644
--- a/docs/bitmaps.md
+++ b/docs/bitmaps.md
@@ -19,12 +19,20 @@ which is included at the end of this document.
 * A dirty bitmap's name is unique to the node, but bitmaps attached to different
   nodes can share the same name.
 
+* Dirty bitmaps created for internal use by QEMU may be anonymous and have no
+  name, but any user-created bitmaps may not be. There can be any number of
+  anonymous bitmaps per node.
+
+* The name of a user-created bitmap must not be empty ("").
+
 ## Bitmap Modes
 
 * A Bitmap can be "frozen," which means that it is currently in-use by a backup
   operation and cannot be deleted, renamed, written to, reset,
   etc.
 
+* The normal operating mode for a bitmap is "active."
+
 ## Basic QMP Usage
 
 ### Supported Commands ###
@@ -97,11 +105,7 @@ which is included at the end of this document.
 }
 ```
 
-## Transactions (Not yet implemented)
-
-* Transactional commands are forthcoming in a future version,
-  and are not yet available for use. This section serves as
-  documentation of intent for their design and usage.
+## Transactions
 
 ### Justification
 
@@ -323,6 +327,155 @@ full backup as a backing image.
       "event": "BLOCK_JOB_COMPLETED" }
     ```
 
+### Partial Transactional Failures
+
+* Sometimes, a transaction will succeed in launching and return success,
+  but then later the backup jobs themselves may fail. It is possible that
+  a management application may have to deal with a partial backup failure
+  after a successful transaction.
+
+* If multiple backup jobs are specified in a single transaction, when one of
+  them fails, it will not interact with the other backup jobs in any way.
+
+* The job(s) that succeeded will clear the dirty bitmap associated with the
+  operation, but the job(s) that failed will not. It is not "safe" to delete
+  any incremental backups that were created successfully in this scenario,
+  even though others failed.
+
+#### Example
+
+* QMP example highlighting two backup jobs:
+
+    ```json
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          { "type": "drive-backup",
+            "data": { "device": "drive0", "bitmap": "bitmap0",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+          { "type": "drive-backup",
+            "data": { "device": "drive1", "bitmap": "bitmap1",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
+        ]
+      }
+    }
+    ```
+
+* QMP example response, highlighting one success and one failure:
+    * Acknowledgement that the Transaction was accepted and jobs were launched:
+        ```json
+        { "return": {} }
+        ```
+
+    * Later, QEMU sends notice that the first job was completed:
+        ```json
+        { "timestamp": { "seconds": 1447192343, "microseconds": 615698 },
+          "data": { "device": "drive0", "type": "backup",
+                     "speed": 0, "len": 67108864, "offset": 67108864 },
+          "event": "BLOCK_JOB_COMPLETED"
+        }
+        ```
+
+    * Later yet, QEMU sends notice that the second job has failed:
+        ```json
+        { "timestamp": { "seconds": 1447192399, "microseconds": 683015 },
+          "data": { "device": "drive1", "action": "report",
+                    "operation": "read" },
+          "event": "BLOCK_JOB_ERROR" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447192399, "microseconds": 685853 },
+          "data": { "speed": 0, "offset": 0, "len": 67108864,
+                    "error": "Input/output error",
+                    "device": "drive1", "type": "backup" },
+          "event": "BLOCK_JOB_COMPLETED" }
+
+* In the above example, "d0-incr-1.qcow2" is valid and must be kept,
+  but "d1-incr-1.qcow2" is invalid and should be deleted. If a VM-wide
+  incremental backup of all drives at a point-in-time is to be made,
+  new backups for both drives will need to be made, taking into account
+  that a new incremental backup for drive0 needs to be based on top of
+  "d0-incr-1.qcow2."
+
+### Grouped Completion Mode
+
+* While jobs launched by transactions normally complete or fail on their own,
+  it is possible to instruct them to complete or fail together as a group.
+
+* QMP transactions take an optional properties structure that can affect
+  the semantics of the transaction.
+
+* The "completion-mode" transaction property can be either "individual"
+  which is the default, legacy behavior described above, or "grouped,"
+  a new behavior detailed below.
+
+* Delayed Completion: In grouped completion mode, no jobs will report
+  success until all jobs are ready to report success.
+
+* Grouped failure: If any job fails in grouped completion mode, all remaining
+  jobs will be cancelled. Any incremental backups will restore their dirty
+  bitmap objects as if no backup command was ever issued.
+
+    * Regardless of if QEMU reports a particular incremental backup job as
+      CANCELLED or as an ERROR, the in-memory bitmap will be restored.
+
+#### Example
+
+* Here's the same example scenario from above with the new property:
+
+    ```json
+    { "execute": "transaction",
+      "arguments": {
+        "actions": [
+          { "type": "drive-backup",
+            "data": { "device": "drive0", "bitmap": "bitmap0",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d0-incr-1.qcow2" } },
+          { "type": "drive-backup",
+            "data": { "device": "drive1", "bitmap": "bitmap1",
+                      "format": "qcow2", "mode": "existing",
+                      "sync": "incremental", "target": "d1-incr-1.qcow2" } },
+        ],
+        "properties": {
+          "completion-mode": "grouped"
+        }
+      }
+    }
+    ```
+
+* QMP example response, highlighting a failure for drive2:
+    * Acknowledgement that the Transaction was accepted and jobs were launched:
+        ```json
+        { "return": {} }
+        ```
+
+    * Later, QEMU sends notice that the second job has errored out,
+      but that the first job was also cancelled:
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 632377 },
+          "data": { "device": "drive1", "action": "report",
+                    "operation": "read" },
+          "event": "BLOCK_JOB_ERROR" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 640074 },
+          "data": { "speed": 0, "offset": 0, "len": 67108864,
+                    "error": "Input/output error",
+                    "device": "drive1", "type": "backup" },
+          "event": "BLOCK_JOB_COMPLETED" }
+        ```
+
+        ```json
+        { "timestamp": { "seconds": 1447193702, "microseconds": 640163 },
+          "data": { "device": "drive0", "type": "backup", "speed": 0,
+                    "len": 67108864, "offset": 16777216 },
+          "event": "BLOCK_JOB_CANCELLED" }
+        ```
+
 <!--
 The FreeBSD Documentation License
 
diff --git a/docs/migration.txt b/docs/migration.txt
index f6df4be..fda8d61 100644
--- a/docs/migration.txt
+++ b/docs/migration.txt
@@ -291,3 +291,194 @@ save/send this state when we are in the middle of a pio operation
 (that is what ide_drive_pio_state_needed() checks).  If DRQ_STAT is
 not enabled, the values on that fields are garbage and don't need to
 be sent.
+
+= Return path =
+
+In most migration scenarios there is only a single data path that runs
+from the source VM to the destination, typically along a single fd (although
+possibly with another fd or similar for some fast way of throwing pages across).
+
+However, some uses need two way communication; in particular the Postcopy
+destination needs to be able to request pages on demand from the source.
+
+For these scenarios there is a 'return path' from the destination to the source;
+qemu_file_get_return_path(QEMUFile* fwdpath) gives the QEMUFile* for the return
+path.
+
+  Source side
+     Forward path - written by migration thread
+     Return path  - opened by main thread, read by return-path thread
+
+  Destination side
+     Forward path - read by main thread
+     Return path  - opened by main thread, written by main thread AND postcopy
+                    thread (protected by rp_mutex)
+
+= Postcopy =
+'Postcopy' migration is a way to deal with migrations that refuse to converge
+(or take too long to converge) its plus side is that there is an upper bound on
+the amount of migration traffic and time it takes, the down side is that during
+the postcopy phase, a failure of *either* side or the network connection causes
+the guest to be lost.
+
+In postcopy the destination CPUs are started before all the memory has been
+transferred, and accesses to pages that are yet to be transferred cause
+a fault that's translated by QEMU into a request to the source QEMU.
+
+Postcopy can be combined with precopy (i.e. normal migration) so that if precopy
+doesn't finish in a given time the switch is made to postcopy.
+
+=== Enabling postcopy ===
+
+To enable postcopy, issue this command on the monitor prior to the
+start of migration:
+
+migrate_set_capability x-postcopy-ram on
+
+The normal commands are then used to start a migration, which is still
+started in precopy mode.  Issuing:
+
+migrate_start_postcopy
+
+will now cause the transition from precopy to postcopy.
+It can be issued immediately after migration is started or any
+time later on.  Issuing it after the end of a migration is harmless.
+
+Note: During the postcopy phase, the bandwidth limits set using
+migrate_set_speed is ignored (to avoid delaying requested pages that
+the destination is waiting for).
+
+=== Postcopy device transfer ===
+
+Loading of device data may cause the device emulation to access guest RAM
+that may trigger faults that have to be resolved by the source, as such
+the migration stream has to be able to respond with page data *during* the
+device load, and hence the device data has to be read from the stream completely
+before the device load begins to free the stream up.  This is achieved by
+'packaging' the device data into a blob that's read in one go.
+
+Source behaviour
+
+Until postcopy is entered the migration stream is identical to normal
+precopy, except for the addition of a 'postcopy advise' command at
+the beginning, to tell the destination that postcopy might happen.
+When postcopy starts the source sends the page discard data and then
+forms the 'package' containing:
+
+   Command: 'postcopy listen'
+   The device state
+      A series of sections, identical to the precopy streams device state stream
+      containing everything except postcopiable devices (i.e. RAM)
+   Command: 'postcopy run'
+
+The 'package' is sent as the data part of a Command: 'CMD_PACKAGED', and the
+contents are formatted in the same way as the main migration stream.
+
+During postcopy the source scans the list of dirty pages and sends them
+to the destination without being requested (in much the same way as precopy),
+however when a page request is received from the destination, the dirty page
+scanning restarts from the requested location.  This causes requested pages
+to be sent quickly, and also causes pages directly after the requested page
+to be sent quickly in the hope that those pages are likely to be used
+by the destination soon.
+
+Destination behaviour
+
+Initially the destination looks the same as precopy, with a single thread
+reading the migration stream; the 'postcopy advise' and 'discard' commands
+are processed to change the way RAM is managed, but don't affect the stream
+processing.
+
+------------------------------------------------------------------------------
+                        1      2   3     4 5                      6   7
+main -----DISCARD-CMD_PACKAGED ( LISTEN  DEVICE     DEVICE DEVICE RUN )
+thread                             |       |
+                                   |     (page request)
+                                   |        \___
+                                   v            \
+listen thread:                     --- page -- page -- page -- page -- page --
+
+                                   a   b        c
+------------------------------------------------------------------------------
+
+On receipt of CMD_PACKAGED (1)
+   All the data associated with the package - the ( ... ) section in the
+diagram - is read into memory (into a QEMUSizedBuffer), and the main thread
+recurses into qemu_loadvm_state_main to process the contents of the package (2)
+which contains commands (3,6) and devices (4...)
+
+On receipt of 'postcopy listen' - 3 -(i.e. the 1st command in the package)
+a new thread (a) is started that takes over servicing the migration stream,
+while the main thread carries on loading the package.   It loads normal
+background page data (b) but if during a device load a fault happens (5) the
+returned page (c) is loaded by the listen thread allowing the main threads
+device load to carry on.
+
+The last thing in the CMD_PACKAGED is a 'RUN' command (6) letting the destination
+CPUs start running.
+At the end of the CMD_PACKAGED (7) the main thread returns to normal running behaviour
+and is no longer used by migration, while the listen thread carries
+on servicing page data until the end of migration.
+
+=== Postcopy states ===
+
+Postcopy moves through a series of states (see postcopy_state) from
+ADVISE->DISCARD->LISTEN->RUNNING->END
+
+  Advise:  Set at the start of migration if postcopy is enabled, even
+           if it hasn't had the start command; here the destination
+           checks that its OS has the support needed for postcopy, and performs
+           setup to ensure the RAM mappings are suitable for later postcopy.
+           The destination will fail early in migration at this point if the
+           required OS support is not present.
+           (Triggered by reception of POSTCOPY_ADVISE command)
+
+  Discard: Entered on receipt of the first 'discard' command; prior to
+           the first Discard being performed, hugepages are switched off
+           (using madvise) to ensure that no new huge pages are created
+           during the postcopy phase, and to cause any huge pages that
+           have discards on them to be broken.
+
+  Listen:  The first command in the package, POSTCOPY_LISTEN, switches
+           the destination state to Listen, and starts a new thread
+           (the 'listen thread') which takes over the job of receiving
+           pages off the migration stream, while the main thread carries
+           on processing the blob.  With this thread able to process page
+           reception, the destination now 'sensitises' the RAM to detect
+           any access to missing pages (on Linux using the 'userfault'
+           system).
+
+  Running: POSTCOPY_RUN causes the destination to synchronise all
+           state and start the CPUs and IO devices running.  The main
+           thread now finishes processing the migration package and
+           now carries on as it would for normal precopy migration
+           (although it can't do the cleanup it would do as it
+           finishes a normal migration).
+
+  End:     The listen thread can now quit, and perform the cleanup of migration
+           state, the migration is now complete.
+
+=== Source side page maps ===
+
+The source side keeps two bitmaps during postcopy; 'the migration bitmap'
+and 'unsent map'.  The 'migration bitmap' is basically the same as in
+the precopy case, and holds a bit to indicate that page is 'dirty' -
+i.e. needs sending.  During the precopy phase this is updated as the CPU
+dirties pages, however during postcopy the CPUs are stopped and nothing
+should dirty anything any more.
+
+The 'unsent map' is used for the transition to postcopy. It is a bitmap that
+has a bit cleared whenever a page is sent to the destination, however during
+the transition to postcopy mode it is combined with the migration bitmap
+to form a set of pages that:
+   a) Have been sent but then redirtied (which must be discarded)
+   b) Have not yet been sent - which also must be discarded to cause any
+      transparent huge pages built during precopy to be broken.
+
+Note that the contents of the unsentmap are sacrificed during the calculation
+of the discard set and thus aren't valid once in postcopy.  The dirtymap
+is still valid and is used to ensure that no page is sent more than once.  Any
+request for a page that has already been sent is ignored.  Duplicate requests
+such as this can happen as a page is sent at about the same time the
+destination accesses it.
+
diff --git a/docs/qapi-code-gen.txt b/docs/qapi-code-gen.txt
index 4d8c2fc..ceb9a78 100644
--- a/docs/qapi-code-gen.txt
+++ b/docs/qapi-code-gen.txt
@@ -514,8 +514,23 @@ exactly the server (QEMU) supports.
 For this purpose, QMP provides introspection via command
 query-qmp-schema.  QGA currently doesn't support introspection.
 
+While Client JSON Protocol wire compatibility should be maintained
+between qemu versions, we cannot make the same guarantees for
+introspection stability.  For example, one version of qemu may provide
+a non-variant optional member of a struct, and a later version rework
+the member to instead be non-optional and associated with a variant.
+Likewise, one version of qemu may list a member with open-ended type
+'str', and a later version could convert it to a finite set of strings
+via an enum type; or a member may be converted from a specific type to
+an alternate that represents a choice between the original type and
+something else.
+
 query-qmp-schema returns a JSON array of SchemaInfo objects.  These
 objects together describe the wire ABI, as defined in the QAPI schema.
+There is no specified order to the SchemaInfo objects returned; a
+client must search for a particular name throughout the entire array
+to learn more about that name, but is at least guaranteed that there
+will be no collisions between type, command, and event names.
 
 However, the SchemaInfo can't reflect all the rules and restrictions
 that apply to QMP.  It's interface introspection (figuring out what's
@@ -596,7 +611,9 @@ any.  Each element is a JSON object with members "name" (the member's
 name), "type" (the name of its type), and optionally "default".  The
 member is optional if "default" is present.  Currently, "default" can
 only have value null.  Other values are reserved for future
-extensions.
+extensions.  The "members" array is in no particular order; clients
+must search the entire object when learning whether a particular
+member is supported.
 
 Example: the SchemaInfo for MyType from section Struct types
 
@@ -610,7 +627,9 @@ Example: the SchemaInfo for MyType from section Struct types
 "variants" is a JSON array describing the object's variant members.
 Each element is a JSON object with members "case" (the value of type
 tag this element applies to) and "type" (the name of an object type
-that provides the variant members for this type tag value).
+that provides the variant members for this type tag value).  The
+"variants" array is in no particular order, and is not guaranteed to
+list cases in the same order as the corresponding "tag" enum type.
 
 Example: the SchemaInfo for flat union BlockdevOptions from section
 Union types
@@ -651,7 +670,8 @@ Union types
 The SchemaInfo for an alternate type has meta-type "alternate", and
 variant member "members".  "members" is a JSON array.  Each element is
 a JSON object with member "type", which names a type.  Values of the
-alternate type conform to exactly one of its member types.
+alternate type conform to exactly one of its member types.  There is
+no guarantee on the order in which "members" will be listed.
 
 Example: the SchemaInfo for BlockRef from section Alternate types
 
@@ -662,15 +682,20 @@ Example: the SchemaInfo for BlockRef from section Alternate types
 
 The SchemaInfo for an array type has meta-type "array", and variant
 member "element-type", which names the array's element type.  Array
-types are implicitly defined.
+types are implicitly defined.  For convenience, the array's name may
+resemble the element type; however, clients should examine member
+"element-type" instead of making assumptions based on parsing member
+"name".
 
 Example: the SchemaInfo for ['str']
 
-    { "name": "strList", "meta-type": "array",
+    { "name": "[str]", "meta-type": "array",
       "element-type": "str" }
 
 The SchemaInfo for an enumeration type has meta-type "enum" and
-variant member "values".
+variant member "values".  The values are listed in no particular
+order; clients must search the entire enum when learning whether a
+particular value is supported.
 
 Example: the SchemaInfo for MyEnum from section Enumeration types
 
diff --git a/docs/replay.txt b/docs/replay.txt
new file mode 100644
index 0000000..149727e
--- /dev/null
+++ b/docs/replay.txt
@@ -0,0 +1,168 @@
+Copyright (c) 2010-2015 Institute for System Programming
+                        of the Russian Academy of Sciences.
+
+This work is licensed under the terms of the GNU GPL, version 2 or later.
+See the COPYING file in the top-level directory.
+
+Record/replay
+-------------
+
+Record/replay functions are used for the reverse execution and deterministic
+replay of qemu execution. This implementation of deterministic replay can
+be used for deterministic debugging of guest code through a gdb remote
+interface.
+
+Execution recording writes a non-deterministic events log, which can be later
+used for replaying the execution anywhere and for unlimited number of times.
+It also supports checkpointing for faster rewinding during reverse debugging.
+Execution replaying reads the log and replays all non-deterministic events
+including external input, hardware clocks, and interrupts.
+
+Deterministic replay has the following features:
+ * Deterministically replays whole system execution and all contents of
+   the memory, state of the hardware devices, clocks, and screen of the VM.
+ * Writes execution log into the file for later replaying for multiple times
+   on different machines.
+ * Supports i386, x86_64, and ARM hardware platforms.
+ * Performs deterministic replay of all operations with keyboard and mouse
+   input devices.
+
+Usage of the record/replay:
+ * First, record the execution, by adding the following arguments to the command line:
+   '-icount shift=7,rr=record,rrfile=replay.bin -net none'.
+   Block devices' images are not actually changed in the recording mode,
+   because all of the changes are written to the temporary overlay file.
+ * Then you can replay it by using another command
+   line option: '-icount shift=7,rr=replay,rrfile=replay.bin -net none'
+ * '-net none' option should also be specified if network replay patches
+   are not applied.
+
+Papers with description of deterministic replay implementation:
+http://www.computer.org/csdl/proceedings/csmr/2012/4666/00/4666a553-abs.html
+http://dl.acm.org/citation.cfm?id=2786805.2803179
+
+Modifications of qemu include:
+ * wrappers for clock and time functions to save their return values in the log
+ * saving different asynchronous events (e.g. system shutdown) into the log
+ * synchronization of the bottom halves execution
+ * synchronization of the threads from thread pool
+ * recording/replaying user input (mouse and keyboard)
+ * adding internal checkpoints for cpu and io synchronization
+
+Non-deterministic events
+------------------------
+
+Our record/replay system is based on saving and replaying non-deterministic
+events (e.g. keyboard input) and simulating deterministic ones (e.g. reading
+from HDD or memory of the VM). Saving only non-deterministic events makes
+log file smaller, simulation faster, and allows using reverse debugging even
+for realtime applications.
+
+The following non-deterministic data from peripheral devices is saved into
+the log: mouse and keyboard input, network packets, audio controller input,
+USB packets, serial port input, and hardware clocks (they are non-deterministic
+too, because their values are taken from the host machine). Inputs from
+simulated hardware, memory of VM, software interrupts, and execution of
+instructions are not saved into the log, because they are deterministic and
+can be replayed by simulating the behavior of virtual machine starting from
+initial state.
+
+We had to solve three tasks to implement deterministic replay: recording
+non-deterministic events, replaying non-deterministic events, and checking
+that there is no divergence between record and replay modes.
+
+We changed several parts of QEMU to make event log recording and replaying.
+Devices' models that have non-deterministic input from external devices were
+changed to write every external event into the execution log immediately.
+E.g. network packets are written into the log when they arrive into the virtual
+network adapter.
+
+All non-deterministic events are coming from these devices. But to
+replay them we need to know at which moments they occur. We specify
+these moments by counting the number of instructions executed between
+every pair of consecutive events.
+
+Instruction counting
+--------------------
+
+QEMU should work in icount mode to use record/replay feature. icount was
+designed to allow deterministic execution in absence of external inputs
+of the virtual machine. We also use icount to control the occurrence of the
+non-deterministic events. The number of instructions elapsed from the last event
+is written to the log while recording the execution. In replay mode we
+can predict when to inject that event using the instruction counter.
+
+Timers
+------
+
+Timers are used to execute callbacks from different subsystems of QEMU
+at the specified moments of time. There are several kinds of timers:
+ * Real time clock. Based on host time and used only for callbacks that
+   do not change the virtual machine state. For this reason real time
+   clock and timers does not affect deterministic replay at all.
+ * Virtual clock. These timers run only during the emulation. In icount
+   mode virtual clock value is calculated using executed instructions counter.
+   That is why it is completely deterministic and does not have to be recorded.
+ * Host clock. This clock is used by device models that simulate real time
+   sources (e.g. real time clock chip). Host clock is the one of the sources
+   of non-determinism. Host clock read operations should be logged to
+   make the execution deterministic.
+ * Real time clock for icount. This clock is similar to real time clock but
+   it is used only for increasing virtual clock while virtual machine is
+   sleeping. Due to its nature it is also non-deterministic as the host clock
+   and has to be logged too.
+
+Checkpoints
+-----------
+
+Replaying of the execution of virtual machine is bound by sources of
+non-determinism. These are inputs from clock and peripheral devices,
+and QEMU thread scheduling. Thread scheduling affect on processing events
+from timers, asynchronous input-output, and bottom halves.
+
+Invocations of timers are coupled with clock reads and changing the state
+of the virtual machine. Reads produce non-deterministic data taken from
+host clock. And VM state changes should preserve their order. Their relative
+order in replay mode must replicate the order of callbacks in record mode.
+To preserve this order we use checkpoints. When a specific clock is processed
+in record mode we save to the log special "checkpoint" event.
+Checkpoints here do not refer to virtual machine snapshots. They are just
+record/replay events used for synchronization.
+
+QEMU in replay mode will try to invoke timers processing in random moment
+of time. That's why we do not process a group of timers until the checkpoint
+event will be read from the log. Such an event allows synchronizing CPU
+execution and timer events.
+
+Another checkpoints application in record/replay is instruction counting
+while the virtual machine is idle. This function (qemu_clock_warp) is called
+from the wait loop. It changes virtual machine state and must be deterministic
+then. That is why we added checkpoint to this function to prevent its
+operation in replay mode when it does not correspond to record mode.
+
+Bottom halves
+-------------
+
+Disk I/O events are completely deterministic in our model, because
+in both record and replay modes we start virtual machine from the same
+disk state. But callbacks that virtual disk controller uses for reading and
+writing the disk may occur at different moments of time in record and replay
+modes.
+
+Reading and writing requests are created by CPU thread of QEMU. Later these
+requests proceed to block layer which creates "bottom halves". Bottom
+halves consist of callback and its parameters. They are processed when
+main loop locks the global mutex. These locks are not synchronized with
+replaying process because main loop also processes the events that do not
+affect the virtual machine state (like user interaction with monitor).
+
+That is why we had to implement saving and replaying bottom halves callbacks
+synchronously to the CPU execution. When the callback is about to execute
+it is added to the queue in the replay module. This queue is written to the
+log when its callbacks are executed. In replay mode callbacks are not processed
+until the corresponding event is read from the events log file.
+
+Sometimes the block layer uses asynchronous callbacks for its internal purposes
+(like reading or writing VM snapshots or disk image cluster tables). In this
+case bottom halves are not marked as "replayable" and do not saved
+into the log.
diff --git a/docs/specs/vhost-user.txt b/docs/specs/vhost-user.txt
index e0d71e2..26dde2e 100644
--- a/docs/specs/vhost-user.txt
+++ b/docs/specs/vhost-user.txt
@@ -98,6 +98,7 @@ typedef struct VhostUserMsg {
         struct vhost_vring_state state;
         struct vhost_vring_addr addr;
         VhostUserMemory memory;
+        VhostUserLog log;
     };
 } QEMU_PACKED VhostUserMsg;
 
@@ -255,10 +256,10 @@ Message types
       as an owner of the session. This can be used on the Slave as a
       "session start" flag.
 
- * VHOST_USER_RESET_DEVICE
+ * VHOST_USER_RESET_OWNER
 
       Id: 4
-      Equivalent ioctl: VHOST_RESET_DEVICE
+      Equivalent ioctl: VHOST_RESET_OWNER
       Master payload: N/A
 
       Issued when a new connection is about to be closed. The Master will no
@@ -282,7 +283,12 @@ Message types
       Master payload: u64
       Slave payload: N/A
 
-      Sets the logging base address.
+      Sets logging shared memory space.
+      When slave has VHOST_USER_PROTOCOL_F_LOG_SHMFD protocol
+      feature, the log memory fd is provided in the ancillary data of
+      VHOST_USER_SET_LOG_BASE message, the size and offset of shared
+      memory area provided in the message.
+
 
  * VHOST_USER_SET_LOG_FD
 
diff --git a/docs/writing-qmp-commands.txt b/docs/writing-qmp-commands.txt
index 8647cac..59aa77a 100644
--- a/docs/writing-qmp-commands.txt
+++ b/docs/writing-qmp-commands.txt
@@ -210,7 +210,7 @@ if you don't see these strings, then something went wrong.
 === Errors ===
 
 QMP commands should use the error interface exported by the error.h header
-file. Basically, errors are set by calling the error_set() function.
+file. Basically, most errors are set by calling the error_setg() function.
 
 Let's say we don't accept the string "message" to contain the word "love". If
 it does contain it, we want the "hello-world" command to return an error:
@@ -219,8 +219,7 @@ void qmp_hello_world(bool has_message, const char *message, Error **errp)
 {
     if (has_message) {
         if (strstr(message, "love")) {
-            error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                      "the word 'love' is not allowed");
+            error_setg(errp, "the word 'love' is not allowed");
             return;
         }
         printf("%s\n", message);
@@ -229,10 +228,8 @@ void qmp_hello_world(bool has_message, const char *message, Error **errp)
     }
 }
 
-The first argument to the error_set() function is the Error pointer to pointer,
-which is passed to all QMP functions. The second argument is a ErrorClass
-value, which should be ERROR_CLASS_GENERIC_ERROR most of the time (more
-details about error classes are given below). The third argument is a human
+The first argument to the error_setg() function is the Error pointer
+to pointer, which is passed to all QMP functions. The next argument is a human
 description of the error, this is a free-form printf-like string.
 
 Let's test the example above. Build qemu, run it as defined in the "Testing"
@@ -249,8 +246,9 @@ The QMP server's response should be:
     }
 }
 
-As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR. There
-are two exceptions to this rule:
+As a general rule, all QMP errors should use ERROR_CLASS_GENERIC_ERROR
+(done by default when using error_setg()). There are two exceptions to
+this rule:
 
  1. A non-generic ErrorClass value exists* for the failure you want to report
     (eg. DeviceNotFound)
@@ -259,8 +257,8 @@ are two exceptions to this rule:
     want to report, hence you have to add a new ErrorClass value so that they
     can check for it
 
-If the failure you want to report doesn't fall in one of the two cases above,
-just report ERROR_CLASS_GENERIC_ERROR.
+If the failure you want to report falls into one of the two cases above,
+use error_set() with a second argument of an ErrorClass value.
 
  * All existing ErrorClass values are defined in the qapi-schema.json file
 
diff --git a/exec.c b/exec.c
index 8af2570..b09f18b 100644
--- a/exec.c
+++ b/exec.c
@@ -50,6 +50,7 @@
 #include "qemu/rcu_queue.h"
 #include "qemu/main-loop.h"
 #include "translate-all.h"
+#include "sysemu/replay.h"
 
 #include "exec/memory-internal.h"
 #include "exec/ram_addr.h"
@@ -882,6 +883,7 @@ void cpu_abort(CPUState *cpu, const char *fmt, ...)
     }
     va_end(ap2);
     va_end(ap);
+    replay_finish();
 #if defined(CONFIG_USER_ONLY)
     {
         struct sigaction act;
@@ -901,7 +903,7 @@ static RAMBlock *qemu_get_ram_block(ram_addr_t addr)
 
     block = atomic_rcu_read(&ram_list.mru_block);
     if (block && addr - block->offset < block->max_length) {
-        goto found;
+        return block;
     }
     QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
         if (addr - block->offset < block->max_length) {
@@ -1205,6 +1207,7 @@ static void *file_ram_alloc(RAMBlock *block,
                             const char *path,
                             Error **errp)
 {
+    struct stat st;
     char *filename;
     char *sanitized_name;
     char *c;
@@ -1233,26 +1236,33 @@ static void *file_ram_alloc(RAMBlock *block,
         goto error;
     }
 
-    /* Make name safe to use with mkstemp by replacing '/' with '_'. */
-    sanitized_name = g_strdup(memory_region_name(block->mr));
-    for (c = sanitized_name; *c != '\0'; c++) {
-        if (*c == '/')
-            *c = '_';
-    }
+    if (!stat(path, &st) && S_ISDIR(st.st_mode)) {
+        /* Make name safe to use with mkstemp by replacing '/' with '_'. */
+        sanitized_name = g_strdup(memory_region_name(block->mr));
+        for (c = sanitized_name; *c != '\0'; c++) {
+            if (*c == '/') {
+                *c = '_';
+            }
+        }
 
-    filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
-                               sanitized_name);
-    g_free(sanitized_name);
+        filename = g_strdup_printf("%s/qemu_back_mem.%s.XXXXXX", path,
+                                   sanitized_name);
+        g_free(sanitized_name);
+
+        fd = mkstemp(filename);
+        if (fd >= 0) {
+            unlink(filename);
+        }
+        g_free(filename);
+    } else {
+        fd = open(path, O_RDWR | O_CREAT, 0644);
+    }
 
-    fd = mkstemp(filename);
     if (fd < 0) {
         error_setg_errno(errp, errno,
                          "unable to create backing store for hugepages");
-        g_free(filename);
         goto error;
     }
-    unlink(filename);
-    g_free(filename);
 
     memory = ROUND_UP(memory, hpagesize);
 
@@ -1282,10 +1292,6 @@ static void *file_ram_alloc(RAMBlock *block,
     return area;
 
 error:
-    if (mem_prealloc) {
-        error_report("%s", error_get_pretty(*errp));
-        exit(1);
-    }
     return NULL;
 }
 #endif
@@ -1371,6 +1377,11 @@ static RAMBlock *find_ram_block(ram_addr_t addr)
     return NULL;
 }
 
+const char *qemu_ram_get_idstr(RAMBlock *rb)
+{
+    return rb->idstr;
+}
+
 /* Called with iothread lock held.  */
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev)
 {
@@ -1441,7 +1452,7 @@ int qemu_ram_resize(ram_addr_t base, ram_addr_t newsize, Error **errp)
 
     assert(block);
 
-    newsize = TARGET_PAGE_ALIGN(newsize);
+    newsize = HOST_PAGE_ALIGN(newsize);
 
     if (block->used_length == newsize) {
         return 0;
@@ -1585,7 +1596,7 @@ ram_addr_t qemu_ram_alloc_from_file(ram_addr_t size, MemoryRegion *mr,
         return -1;
     }
 
-    size = TARGET_PAGE_ALIGN(size);
+    size = HOST_PAGE_ALIGN(size);
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
     new_block->used_length = size;
@@ -1621,8 +1632,8 @@ ram_addr_t qemu_ram_alloc_internal(ram_addr_t size, ram_addr_t max_size,
     ram_addr_t addr;
     Error *local_err = NULL;
 
-    size = TARGET_PAGE_ALIGN(size);
-    max_size = TARGET_PAGE_ALIGN(max_size);
+    size = HOST_PAGE_ALIGN(size);
+    max_size = HOST_PAGE_ALIGN(max_size);
     new_block = g_malloc0(sizeof(*new_block));
     new_block->mr = mr;
     new_block->resized = resized;
@@ -1871,8 +1882,16 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
     }
 }
 
-/* Some of the softmmu routines need to translate from a host pointer
- * (typically a TLB entry) back to a ram offset.
+/*
+ * Translates a host ptr back to a RAMBlock, a ram_addr and an offset
+ * in that RAMBlock.
+ *
+ * ptr: Host pointer to look up
+ * round_offset: If true round the result offset down to a page boundary
+ * *ram_addr: set to result ram_addr
+ * *offset: set to result offset within the RAMBlock
+ *
+ * Returns: RAMBlock (or NULL if not found)
  *
  * By the time this function returns, the returned pointer is not protected
  * by RCU anymore.  If the caller is not within an RCU critical section and
@@ -1880,18 +1899,22 @@ static void *qemu_ram_ptr_length(ram_addr_t addr, hwaddr *size)
  * pointer, such as a reference to the region that includes the incoming
  * ram_addr_t.
  */
-MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+                                   ram_addr_t *ram_addr,
+                                   ram_addr_t *offset)
 {
     RAMBlock *block;
     uint8_t *host = ptr;
-    MemoryRegion *mr;
 
     if (xen_enabled()) {
         rcu_read_lock();
         *ram_addr = xen_ram_addr_from_mapcache(ptr);
-        mr = qemu_get_ram_block(*ram_addr)->mr;
+        block = qemu_get_ram_block(*ram_addr);
+        if (block) {
+            *offset = (host - block->host);
+        }
         rcu_read_unlock();
-        return mr;
+        return block;
     }
 
     rcu_read_lock();
@@ -1914,10 +1937,49 @@ MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
     return NULL;
 
 found:
-    *ram_addr = block->offset + (host - block->host);
-    mr = block->mr;
+    *offset = (host - block->host);
+    if (round_offset) {
+        *offset &= TARGET_PAGE_MASK;
+    }
+    *ram_addr = block->offset + *offset;
     rcu_read_unlock();
-    return mr;
+    return block;
+}
+
+/*
+ * Finds the named RAMBlock
+ *
+ * name: The name of RAMBlock to find
+ *
+ * Returns: RAMBlock (or NULL if not found)
+ */
+RAMBlock *qemu_ram_block_by_name(const char *name)
+{
+    RAMBlock *block;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        if (!strcmp(name, block->idstr)) {
+            return block;
+        }
+    }
+
+    return NULL;
+}
+
+/* Some of the softmmu routines need to translate from a host pointer
+   (typically a TLB entry) back to a ram offset.  */
+MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr)
+{
+    RAMBlock *block;
+    ram_addr_t offset; /* Not used */
+
+    block = qemu_ram_block_from_host(ptr, false, ram_addr, &offset);
+
+    if (!block) {
+        return NULL;
+    }
+
+    return block->mr;
 }
 
 static void notdirty_mem_write(void *opaque, hwaddr ram_addr,
@@ -2698,8 +2760,8 @@ void cpu_register_map_client(QEMUBH *bh)
 void cpu_exec_init_all(void)
 {
     qemu_mutex_init(&ram_list.mutex);
-    memory_map_init();
     io_mem_init();
+    memory_map_init();
     qemu_mutex_init(&map_client_list_lock);
 }
 
@@ -3496,6 +3558,16 @@ int cpu_memory_rw_debug(CPUState *cpu, target_ulong addr,
     }
     return 0;
 }
+
+/*
+ * Allows code that needs to deal with migration bitmaps etc to still be built
+ * target independent.
+ */
+size_t qemu_target_page_bits(void)
+{
+    return TARGET_PAGE_BITS;
+}
+
 #endif
 
 /*
diff --git a/gdbstub.c b/gdbstub.c
index d2c95b5..9c29aa0 100644
--- a/gdbstub.c
+++ b/gdbstub.c
@@ -956,6 +956,13 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
         if (*p == ',')
             p++;
         len = strtoull(p, NULL, 16);
+
+        /* memtohex() doubles the required space */
+        if (len > MAX_PACKET_LENGTH / 2) {
+            put_packet (s, "E22");
+            break;
+        }
+
         if (target_memory_rw_debug(s->g_cpu, addr, mem_buf, len, false) != 0) {
             put_packet (s, "E14");
         } else {
@@ -970,6 +977,12 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
         len = strtoull(p, (char **)&p, 16);
         if (*p == ':')
             p++;
+
+        /* hextomem() reads 2*len bytes */
+        if (len > strlen(p) / 2) {
+            put_packet (s, "E22");
+            break;
+        }
         hextomem(mem_buf, p, len);
         if (target_memory_rw_debug(s->g_cpu, addr, mem_buf, len,
                                    true) != 0) {
@@ -1107,7 +1120,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
             cpu = find_cpu(thread);
             if (cpu != NULL) {
                 cpu_synchronize_state(cpu);
-                len = snprintf((char *)mem_buf, sizeof(mem_buf),
+                /* memtohex() doubles the required space */
+                len = snprintf((char *)mem_buf, sizeof(buf) / 2,
                                "CPU#%d [%s]", cpu->cpu_index,
                                cpu->halted ? "halted " : "running");
                 memtohex(buf, mem_buf, len);
@@ -1136,8 +1150,8 @@ static int gdb_handle_packet(GDBState *s, const char *line_buf)
                 put_packet(s, "E01");
                 break;
             }
-            hextomem(mem_buf, p + 5, len);
             len = len / 2;
+            hextomem(mem_buf, p + 5, len);
             mem_buf[len++] = 0;
             qemu_chr_be_write(s->mon_chr, mem_buf, len);
             put_packet(s, "OK");
diff --git a/hmp-commands.hx b/hmp-commands.hx
index 3a4ae39..bb52e4d 100644
--- a/hmp-commands.hx
+++ b/hmp-commands.hx
@@ -194,8 +194,8 @@ ETEXI
 
     {
         .name       = "change",
-        .args_type  = "device:B,target:F,arg:s?",
-        .params     = "device filename [format]",
+        .args_type  = "device:B,target:F,arg:s?,read-only-mode:s?",
+        .params     = "device filename [format [read-only-mode]]",
         .help       = "change a removable medium, optional format",
         .mhandler.cmd = hmp_change,
     },
@@ -206,7 +206,7 @@ STEXI
 Change the configuration of a device.
 
 @table @option
-@item change @var{diskdevice} @var{filename} [@var{format}]
+@item change @var{diskdevice} @var{filename} [@var{format} [@var{read-only-mode}]]
 Change the medium for a removable disk device to point to @var{filename}. eg
 
 @example
@@ -215,6 +215,20 @@ Change the medium for a removable disk device to point to @var{filename}. eg
 
 @var{format} is optional.
 
+@var{read-only-mode} may be used to change the read-only status of the device.
+It accepts the following values:
+
+@table @var
+@item retain
+Retains the current status; this is the default.
+
+@item read-only
+Makes the device read-only.
+
+@item read-write
+Makes the device writable.
+@end table
+
 @item change vnc @var{display},@var{options}
 Change the configuration of the VNC server. The valid syntax for @var{display}
 and @var{options} are described at @ref{sec_invocation}. eg
@@ -1008,6 +1022,23 @@ Set the parameter @var{parameter} for migration.
 ETEXI
 
     {
+        .name       = "migrate_start_postcopy",
+        .args_type  = "",
+        .params     = "",
+        .help       = "Followup to a migration command to switch the migration"
+                      " to postcopy mode. The x-postcopy-ram capability must "
+                      "be set before the original migration command.",
+        .mhandler.cmd = hmp_migrate_start_postcopy,
+    },
+
+STEXI
+@item migrate_start_postcopy
+@findex migrate_start_postcopy
+Switch in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
+ETEXI
+
+    {
         .name       = "client_migrate_info",
         .args_type  = "protocol:s,hostname:s,port:i?,tls-port:i?,cert-subject:s?",
         .params     = "protocol hostname port tls-port cert-subject",
diff --git a/hmp.c b/hmp.c
index a15d00c..2140605 100644
--- a/hmp.c
+++ b/hmp.c
@@ -27,6 +27,7 @@
 #include "qapi/opts-visitor.h"
 #include "qapi/qmp/qerror.h"
 #include "qapi/string-output-visitor.h"
+#include "qapi/util.h"
 #include "qapi-visit.h"
 #include "ui/console.h"
 #include "block/qapi.h"
@@ -521,6 +522,7 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        " flush_total_time_ns=%" PRId64
                        " rd_merged=%" PRId64
                        " wr_merged=%" PRId64
+                       " idle_time_ns=%" PRId64
                        "\n",
                        stats->value->stats->rd_bytes,
                        stats->value->stats->wr_bytes,
@@ -531,7 +533,8 @@ void hmp_info_blockstats(Monitor *mon, const QDict *qdict)
                        stats->value->stats->rd_total_time_ns,
                        stats->value->stats->flush_total_time_ns,
                        stats->value->stats->rd_merged,
-                       stats->value->stats->wr_merged);
+                       stats->value->stats->wr_merged,
+                       stats->value->stats->idle_time_ns);
     }
 
     qapi_free_BlockStatsList(stats_list);
@@ -1293,6 +1296,13 @@ void hmp_client_migrate_info(Monitor *mon, const QDict *qdict)
     hmp_handle_error(mon, &err);
 }
 
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict)
+{
+    Error *err = NULL;
+    qmp_migrate_start_postcopy(&err);
+    hmp_handle_error(mon, &err);
+}
+
 void hmp_set_password(Monitor *mon, const QDict *qdict)
 {
     const char *protocol  = qdict_get_str(qdict, "protocol");
@@ -1336,24 +1346,46 @@ void hmp_change(Monitor *mon, const QDict *qdict)
     const char *device = qdict_get_str(qdict, "device");
     const char *target = qdict_get_str(qdict, "target");
     const char *arg = qdict_get_try_str(qdict, "arg");
+    const char *read_only = qdict_get_try_str(qdict, "read-only-mode");
+    BlockdevChangeReadOnlyMode read_only_mode = 0;
     Error *err = NULL;
 
-    if (strcmp(device, "vnc") == 0 &&
-            (strcmp(target, "passwd") == 0 ||
-             strcmp(target, "password") == 0)) {
-        if (!arg) {
-            monitor_read_password(mon, hmp_change_read_arg, NULL);
+    if (strcmp(device, "vnc") == 0) {
+        if (read_only) {
+            monitor_printf(mon,
+                           "Parameter 'read-only-mode' is invalid for VNC\n");
             return;
         }
-    }
+        if (strcmp(target, "passwd") == 0 ||
+            strcmp(target, "password") == 0) {
+            if (!arg) {
+                monitor_read_password(mon, hmp_change_read_arg, NULL);
+                return;
+            }
+        }
+        qmp_change("vnc", target, !!arg, arg, &err);
+    } else {
+        if (read_only) {
+            read_only_mode =
+                qapi_enum_parse(BlockdevChangeReadOnlyMode_lookup,
+                                read_only, BLOCKDEV_CHANGE_READ_ONLY_MODE_MAX,
+                                BLOCKDEV_CHANGE_READ_ONLY_MODE_RETAIN, &err);
+            if (err) {
+                hmp_handle_error(mon, &err);
+                return;
+            }
+        }
 
-    qmp_change(device, target, !!arg, arg, &err);
-    if (err &&
-        error_get_class(err) == ERROR_CLASS_DEVICE_ENCRYPTED) {
-        error_free(err);
-        monitor_read_block_device_key(mon, device, NULL, NULL);
-        return;
+        qmp_blockdev_change_medium(device, target, !!arg, arg,
+                                   !!read_only, read_only_mode, &err);
+        if (err &&
+            error_get_class(err) == ERROR_CLASS_DEVICE_ENCRYPTED) {
+            error_free(err);
+            monitor_read_block_device_key(mon, device, NULL, NULL);
+            return;
+        }
     }
+
     hmp_handle_error(mon, &err);
 }
 
diff --git a/hmp.h b/hmp.h
index 81656c3..a8c5b5a 100644
--- a/hmp.h
+++ b/hmp.h
@@ -69,6 +69,7 @@ void hmp_migrate_set_capability(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_parameter(Monitor *mon, const QDict *qdict);
 void hmp_migrate_set_cache_size(Monitor *mon, const QDict *qdict);
 void hmp_client_migrate_info(Monitor *mon, const QDict *qdict);
+void hmp_migrate_start_postcopy(Monitor *mon, const QDict *qdict);
 void hmp_set_password(Monitor *mon, const QDict *qdict);
 void hmp_expire_password(Monitor *mon, const QDict *qdict);
 void hmp_eject(Monitor *mon, const QDict *qdict);
diff --git a/hw/9pfs/virtio-9p-device.c b/hw/9pfs/virtio-9p-device.c
index 93a407c..e3abcfa 100644
--- a/hw/9pfs/virtio-9p-device.c
+++ b/hw/9pfs/virtio-9p-device.c
@@ -43,6 +43,16 @@ static void virtio_9p_get_config(VirtIODevice *vdev, uint8_t *config)
     g_free(cfg);
 }
 
+static void virtio_9p_save(QEMUFile *f, void *opaque)
+{
+    virtio_save(VIRTIO_DEVICE(opaque), f);
+}
+
+static int virtio_9p_load(QEMUFile *f, void *opaque, int version_id)
+{
+    return virtio_load(VIRTIO_DEVICE(opaque), f, version_id);
+}
+
 static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
 {
     VirtIODevice *vdev = VIRTIO_DEVICE(dev);
@@ -130,6 +140,7 @@ static void virtio_9p_device_realize(DeviceState *dev, Error **errp)
     }
     v9fs_path_free(&path);
 
+    register_savevm(dev, "virtio-9p", -1, 1, virtio_9p_save, virtio_9p_load, s);
     return;
 out:
     g_free(s->ctx.fs_root);
diff --git a/hw/acpi/aml-build.c b/hw/acpi/aml-build.c
index 0d4b324..a00a0ab 100644
--- a/hw/acpi/aml-build.c
+++ b/hw/acpi/aml-build.c
@@ -1163,9 +1163,7 @@ void *acpi_data_push(GArray *table_data, unsigned size)
 
 unsigned acpi_data_len(GArray *table)
 {
-#if GLIB_CHECK_VERSION(2, 22, 0)
     assert(g_array_get_element_size(table) == 1);
-#endif
     return table->len;
 }
 
diff --git a/hw/arm/allwinner-a10.c b/hw/arm/allwinner-a10.c
index 43dc0a1..b0ca81c 100644
--- a/hw/arm/allwinner-a10.c
+++ b/hw/arm/allwinner-a10.c
@@ -39,6 +39,9 @@ static void aw_a10_init(Object *obj)
         qemu_check_nic_model(&nd_table[0], TYPE_AW_EMAC);
         qdev_set_nic_properties(DEVICE(&s->emac), &nd_table[0]);
     }
+
+    object_initialize(&s->sata, sizeof(s->sata), TYPE_ALLWINNER_AHCI);
+    qdev_set_parent_bus(DEVICE(&s->sata), sysbus_get_default());
 }
 
 static void aw_a10_realize(DeviceState *dev, Error **errp)
@@ -93,6 +96,14 @@ static void aw_a10_realize(DeviceState *dev, Error **errp)
     sysbus_mmio_map(sysbusdev, 0, AW_A10_EMAC_BASE);
     sysbus_connect_irq(sysbusdev, 0, s->irq[55]);
 
+    object_property_set_bool(OBJECT(&s->sata), true, "realized", &err);
+    if (err) {
+        error_propagate(errp, err);
+        return;
+    }
+    sysbus_mmio_map(SYS_BUS_DEVICE(&s->sata), 0, AW_A10_SATA_BASE);
+    sysbus_connect_irq(SYS_BUS_DEVICE(&s->sata), 0, s->irq[56]);
+
     /* FIXME use a qdev chardev prop instead of serial_hds[] */
     serial_mm_init(get_system_memory(), AW_A10_UART0_REG_BASE, 2, s->irq[1],
                    115200, serial_hds[0], DEVICE_NATIVE_ENDIAN);
diff --git a/hw/arm/boot.c b/hw/arm/boot.c
index b0879a5..75f69bf 100644
--- a/hw/arm/boot.c
+++ b/hw/arm/boot.c
@@ -11,6 +11,7 @@
 #include "hw/hw.h"
 #include "hw/arm/arm.h"
 #include "hw/arm/linux-boot-if.h"
+#include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
 #include "hw/boards.h"
 #include "hw/loader.h"
@@ -495,7 +496,8 @@ static void do_cpu_reset(void *opaque)
                 }
 
                 /* Set to non-secure if not a secure boot */
-                if (!info->secure_boot) {
+                if (!info->secure_boot &&
+                    (cs != first_cpu || !info->secure_board_setup)) {
                     /* Linux expects non-secure state */
                     env->cp15.scr_el3 |= SCR_NS;
                 }
@@ -598,6 +600,12 @@ static void arm_load_kernel_notify(Notifier *notifier, void *data)
     struct arm_boot_info *info =
         container_of(n, struct arm_boot_info, load_kernel_notifier);
 
+    /* The board code is not supposed to set secure_board_setup unless
+     * running its code in secure mode is actually possible, and KVM
+     * doesn't support secure.
+     */
+    assert(!(info->secure_board_setup && kvm_enabled()));
+
     /* Load the kernel.  */
     if (!info->kernel_filename || info->firmware_loaded) {
 
diff --git a/hw/arm/highbank.c b/hw/arm/highbank.c
index be04b27..85ae69e 100644
--- a/hw/arm/highbank.c
+++ b/hw/arm/highbank.c
@@ -22,6 +22,7 @@
 #include "hw/devices.h"
 #include "hw/loader.h"
 #include "net/net.h"
+#include "sysemu/kvm.h"
 #include "sysemu/sysemu.h"
 #include "hw/boards.h"
 #include "sysemu/block-backend.h"
@@ -32,10 +33,52 @@
 #define SMP_BOOT_REG            0x40
 #define MPCORE_PERIPHBASE       0xfff10000
 
+#define MVBAR_ADDR              0x200
+
 #define NIRQ_GIC                160
 
 /* Board init.  */
 
+/* MVBAR_ADDR is limited by precision of movw */
+
+QEMU_BUILD_BUG_ON(MVBAR_ADDR >= (1 << 16));
+
+#define ARMV7_IMM16(x) (extract32((x),  0, 12) | \
+                        extract32((x), 12,  4) << 16)
+
+static void hb_write_board_setup(ARMCPU *cpu,
+                                 const struct arm_boot_info *info)
+{
+    int n;
+    uint32_t board_setup_blob[] = {
+        /* MVBAR_ADDR */
+        /* Default unimplemented and unused vectors to spin. Makes it
+         * easier to debug (as opposed to the CPU running away).
+         */
+        0xeafffffe, /* notused1: b notused */
+        0xeafffffe, /* notused2: b notused */
+        0xe1b0f00e, /* smc: movs pc, lr - exception return */
+        0xeafffffe, /* prefetch_abort: b prefetch_abort */
+        0xeafffffe, /* data_abort: b data_abort */
+        0xeafffffe, /* notused3: b notused3 */
+        0xeafffffe, /* irq: b irq */
+        0xeafffffe, /* fiq: b fiq */
+#define BOARD_SETUP_ADDR (MVBAR_ADDR + 8 * sizeof(uint32_t))
+        0xe3000000 + ARMV7_IMM16(MVBAR_ADDR), /* movw r0, MVBAR_ADDR */
+        0xee0c0f30, /* mcr p15, 0, r0, c12, c0, 1 - set MVBAR */
+        0xee110f11, /* mrc p15, 0, r0, c1 , c1, 0 - get SCR */
+        0xe3810001, /* orr r0, #1 - set NS */
+        0xee010f11, /* mcr p15, 0, r0, c1 , c1, 0 - set SCR */
+        0xe1600070, /* smc - go to monitor mode to flush NS change */
+        0xe12fff1e, /* bx lr - return to caller */
+    };
+    for (n = 0; n < ARRAY_SIZE(board_setup_blob); n++) {
+        board_setup_blob[n] = tswap32(board_setup_blob[n]);
+    }
+    rom_add_blob_fixed("board-setup", board_setup_blob,
+                       sizeof(board_setup_blob), MVBAR_ADDR);
+}
+
 static void hb_write_secondary(ARMCPU *cpu, const struct arm_boot_info *info)
 {
     int n;
@@ -223,15 +266,13 @@ static void calxeda_init(MachineState *machine, enum cxmachines machine_id)
     MemoryRegion *sysmem;
     char *sysboot_filename;
 
-    if (!cpu_model) {
-        switch (machine_id) {
-        case CALXEDA_HIGHBANK:
-            cpu_model = "cortex-a9";
-            break;
-        case CALXEDA_MIDWAY:
-            cpu_model = "cortex-a15";
-            break;
-        }
+    switch (machine_id) {
+    case CALXEDA_HIGHBANK:
+        cpu_model = "cortex-a9";
+        break;
+    case CALXEDA_MIDWAY:
+        cpu_model = "cortex-a15";
+        break;
     }
 
     for (n = 0; n < smp_cpus; n++) {
@@ -240,24 +281,16 @@ static void calxeda_init(MachineState *machine, enum cxmachines machine_id)
         ARMCPU *cpu;
         Error *err = NULL;
 
-        if (!oc) {
-            error_report("Unable to find CPU definition");
-            exit(1);
-        }
-
         cpuobj = object_new(object_class_get_name(oc));
         cpu = ARM_CPU(cpuobj);
 
-        /* By default A9 and A15 CPUs have EL3 enabled.  This board does not
-         * currently support EL3 so the CPU EL3 property is disabled before
-         * realization.
-         */
-        if (object_property_find(cpuobj, "has_el3", NULL)) {
-            object_property_set_bool(cpuobj, false, "has_el3", &err);
-            if (err) {
-                error_report_err(err);
-                exit(1);
-            }
+        object_property_set_int(cpuobj, QEMU_PSCI_CONDUIT_SMC,
+                                "psci-conduit", &error_abort);
+
+        if (n) {
+            /* Secondary CPUs start in PSCI powered-down state */
+            object_property_set_bool(cpuobj, true,
+                                     "start-powered-off", &error_abort);
         }
 
         if (object_property_find(cpuobj, "reset-cbar", NULL)) {
@@ -378,6 +411,16 @@ static void calxeda_init(MachineState *machine, enum cxmachines machine_id)
     highbank_binfo.loader_start = 0;
     highbank_binfo.write_secondary_boot = hb_write_secondary;
     highbank_binfo.secondary_cpu_reset_hook = hb_reset_secondary;
+    if (!kvm_enabled()) {
+        highbank_binfo.board_setup_addr = BOARD_SETUP_ADDR;
+        highbank_binfo.write_board_setup = hb_write_board_setup;
+        highbank_binfo.secure_board_setup = true;
+    } else {
+        error_report("WARNING: cannot load built-in Monitor support "
+                     "if KVM is enabled. Some guests (such as Linux) "
+                     "may not boot.");
+    }
+
     arm_load_kernel(ARM_CPU(first_cpu), &highbank_binfo);
 }
 
diff --git a/hw/arm/nseries.c b/hw/arm/nseries.c
index 6a6b3e6..2a8835e 100644
--- a/hw/arm/nseries.c
+++ b/hw/arm/nseries.c
@@ -1275,7 +1275,7 @@ static int n8x0_atag_setup(void *p, int model)
     strcpy((void *) w, "hw-build");		/* char component[12] */
     w += 6;
     strcpy((void *) w, "QEMU ");
-    pstrcat((void *) w, 12, qemu_get_version()); /* char version[12] */
+    pstrcat((void *) w, 12, qemu_hw_version()); /* char version[12] */
     w += 6;
 
     tag = (model == 810) ? "1.1.10-qemu" : "1.1.6-qemu";
diff --git a/hw/arm/virt.c b/hw/arm/virt.c
index 77d9267..9c6792c 100644
--- a/hw/arm/virt.c
+++ b/hw/arm/virt.c
@@ -941,8 +941,8 @@ static void machvirt_init(MachineState *machine)
     if (!gic_version) {
         gic_version = kvm_arm_vgic_probe();
         if (!gic_version) {
-            error_report("Unable to determine GIC version supported by host\n"
-                         "Probably KVM acceleration is not supported\n");
+            error_report("Unable to determine GIC version supported by host");
+            error_printf("KVM acceleration is probably not supported\n");
             exit(1);
         }
     }
@@ -990,7 +990,7 @@ static void machvirt_init(MachineState *machine)
         char *cpuopts = g_strdup(cpustr[1]);
 
         if (!oc) {
-            fprintf(stderr, "Unable to find CPU definition\n");
+            error_report("Unable to find CPU definition");
             exit(1);
         }
         cpuobj = object_new(object_class_get_name(oc));
@@ -1126,8 +1126,8 @@ static void virt_set_gic_version(Object *obj, const char *value, Error **errp)
     } else if (!strcmp(value, "host")) {
         vms->gic_version = 0; /* Will probe later */
     } else {
-        error_report("Invalid gic-version option value\n"
-                     "Allowed values are: 3, 2, host\n");
+        error_report("Invalid gic-version option value");
+        error_printf("Allowed gic-version values are: 3, 2, host\n");
         exit(1);
     }
 }
diff --git a/hw/arm/xilinx_zynq.c b/hw/arm/xilinx_zynq.c
index 82a9db8..1c1a445 100644
--- a/hw/arm/xilinx_zynq.c
+++ b/hw/arm/xilinx_zynq.c
@@ -24,6 +24,7 @@
 #include "hw/block/flash.h"
 #include "sysemu/block-backend.h"
 #include "hw/loader.h"
+#include "hw/misc/zynq-xadc.h"
 #include "hw/ssi.h"
 #include "qemu/error-report.h"
 
@@ -264,6 +265,11 @@ static void zynq_init(MachineState *machine)
     sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, 0xE0101000);
     sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, pic[79-IRQ_OFFSET]);
 
+    dev = qdev_create(NULL, TYPE_ZYNQ_XADC);
+    qdev_init_nofail(dev);
+    sysbus_mmio_map(SYS_BUS_DEVICE(dev), 0, 0xF8007100);
+    sysbus_connect_irq(SYS_BUS_DEVICE(dev), 0, pic[39-IRQ_OFFSET]);
+
     dev = qdev_create(NULL, "pl330");
     qdev_prop_set_uint8(dev, "num_chnls",  8);
     qdev_prop_set_uint8(dev, "num_periph_req",  4);
diff --git a/hw/block/nvme.c b/hw/block/nvme.c
index 5da41b2..169e4fa 100644
--- a/hw/block/nvme.c
+++ b/hw/block/nvme.c
@@ -201,10 +201,11 @@ static void nvme_rw_cb(void *opaque, int ret)
     NvmeCtrl *n = sq->ctrl;
     NvmeCQueue *cq = n->cq[sq->cqid];
 
-    block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
     if (!ret) {
+        block_acct_done(blk_get_stats(n->conf.blk), &req->acct);
         req->status = NVME_SUCCESS;
     } else {
+        block_acct_failed(blk_get_stats(n->conf.blk), &req->acct);
         req->status = NVME_INTERNAL_DEV_ERROR;
     }
     if (req->has_sg) {
@@ -238,18 +239,22 @@ static uint16_t nvme_rw(NvmeCtrl *n, NvmeNamespace *ns, NvmeCmd *cmd,
     uint64_t data_size = (uint64_t)nlb << data_shift;
     uint64_t aio_slba  = slba << (data_shift - BDRV_SECTOR_BITS);
     int is_write = rw->opcode == NVME_CMD_WRITE ? 1 : 0;
+    enum BlockAcctType acct = is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ;
 
     if ((slba + nlb) > ns->id_ns.nsze) {
+        block_acct_invalid(blk_get_stats(n->conf.blk), acct);
         return NVME_LBA_RANGE | NVME_DNR;
     }
+
     if (nvme_map_prp(&req->qsg, prp1, prp2, data_size, n)) {
+        block_acct_invalid(blk_get_stats(n->conf.blk), acct);
         return NVME_INVALID_FIELD | NVME_DNR;
     }
+
     assert((nlb << data_shift) == req->qsg.size);
 
     req->has_sg = true;
-    dma_acct_start(n->conf.blk, &req->acct, &req->qsg,
-                   is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
+    dma_acct_start(n->conf.blk, &req->acct, &req->qsg, acct);
     req->aiocb = is_write ?
         dma_blk_write(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req) :
         dma_blk_read(n->conf.blk, &req->qsg, aio_slba, nvme_rw_cb, req);
diff --git a/hw/block/virtio-blk.c b/hw/block/virtio-blk.c
index 093e475..848f3fe 100644
--- a/hw/block/virtio-blk.c
+++ b/hw/block/virtio-blk.c
@@ -76,7 +76,7 @@ static int virtio_blk_handle_rw_error(VirtIOBlockReq *req, int error,
         s->rq = req;
     } else if (action == BLOCK_ERROR_ACTION_REPORT) {
         virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
-        block_acct_done(blk_get_stats(s->blk), &req->acct);
+        block_acct_failed(blk_get_stats(s->blk), &req->acct);
         virtio_blk_free_request(req);
     }
 
@@ -112,6 +112,10 @@ static void virtio_blk_rw_complete(void *opaque, int ret)
              * happen on the other side of the migration).
              */
             if (virtio_blk_handle_rw_error(req, -ret, is_read)) {
+                /* Break the link in case the next request is added to the
+                 * restart queue and is going to be parsed from the ring again.
+                 */
+                req->mr_next = NULL;
                 continue;
             }
         }
@@ -536,6 +540,8 @@ void virtio_blk_handle_request(VirtIOBlockReq *req, MultiReqBuffer *mrb)
         if (!virtio_blk_sect_range_ok(req->dev, req->sector_num,
                                       req->qiov.size)) {
             virtio_blk_req_complete(req, VIRTIO_BLK_S_IOERR);
+            block_acct_invalid(blk_get_stats(req->dev->blk),
+                               is_write ? BLOCK_ACCT_WRITE : BLOCK_ACCT_READ);
             virtio_blk_free_request(req);
             return;
         }
diff --git a/hw/block/xen_disk.c b/hw/block/xen_disk.c
index 1bbc111..02eda6e 100644
--- a/hw/block/xen_disk.c
+++ b/hw/block/xen_disk.c
@@ -537,7 +537,11 @@ static void qemu_aio_complete(void *opaque, int ret)
             break;
         }
     case BLKIF_OP_READ:
-        block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+        if (ioreq->status == BLKIF_RSP_OKAY) {
+            block_acct_done(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+        } else {
+            block_acct_failed(blk_get_stats(ioreq->blkdev->blk), &ioreq->acct);
+        }
         break;
     case BLKIF_OP_DISCARD:
     default:
@@ -576,7 +580,9 @@ static int ioreq_runio_qemu_aio(struct ioreq *ioreq)
         }
 
         block_acct_start(blk_get_stats(blkdev->blk), &ioreq->acct,
-                         ioreq->v.size, BLOCK_ACCT_WRITE);
+                         ioreq->v.size,
+                         ioreq->req.operation == BLKIF_OP_WRITE ?
+                         BLOCK_ACCT_WRITE : BLOCK_ACCT_FLUSH);
         ioreq->aio_inflight++;
         blk_aio_writev(blkdev->blk, ioreq->start / BLOCK_SIZE,
                        &ioreq->v, ioreq->v.size / BLOCK_SIZE,
@@ -720,6 +726,23 @@ static void blk_handle_requests(struct XenBlkDev *blkdev)
 
         /* parse them */
         if (ioreq_parse(ioreq) != 0) {
+
+            switch (ioreq->req.operation) {
+            case BLKIF_OP_READ:
+                block_acct_invalid(blk_get_stats(blkdev->blk),
+                                   BLOCK_ACCT_READ);
+                break;
+            case BLKIF_OP_WRITE:
+                block_acct_invalid(blk_get_stats(blkdev->blk),
+                                   BLOCK_ACCT_WRITE);
+                break;
+            case BLKIF_OP_FLUSH_DISKCACHE:
+                block_acct_invalid(blk_get_stats(blkdev->blk),
+                                   BLOCK_ACCT_FLUSH);
+            default:
+                break;
+            };
+
             if (blk_send_response_one(ioreq)) {
                 xen_be_send_notify(&blkdev->xendev);
             }
diff --git a/hw/bt/hci.c b/hw/bt/hci.c
index 6a88d49..2151d01 100644
--- a/hw/bt/hci.c
+++ b/hw/bt/hci.c
@@ -23,6 +23,8 @@
 #include "hw/usb.h"
 #include "sysemu/bt.h"
 #include "hw/bt.h"
+#include "qapi/qmp/qerror.h"
+#include "sysemu/replay.h"
 
 struct bt_hci_s {
     uint8_t *(*evt_packet)(void *opaque);
@@ -72,6 +74,8 @@ struct bt_hci_s {
 
     struct HCIInfo info;
     struct bt_device_s device;
+
+    Error *replay_blocker;
 };
 
 #define DEFAULT_RSSI_DBM	20
@@ -2189,6 +2193,9 @@ struct HCIInfo *bt_new_hci(struct bt_scatternet_s *net)
 
     s->device.handle_destroy = bt_hci_destroy;
 
+    error_setg(&s->replay_blocker, QERR_REPLAY_NOT_SUPPORTED, "-bt hci");
+    replay_add_blocker(s->replay_blocker);
+
     return &s->info;
 }
 
diff --git a/hw/bt/sdp.c b/hw/bt/sdp.c
index c903747..b9bcdcc 100644
--- a/hw/bt/sdp.c
+++ b/hw/bt/sdp.c
@@ -150,12 +150,14 @@ static ssize_t sdp_svc_search(struct bt_l2cap_sdp_state_s *sdp,
         if (seqlen < 3 || len < seqlen)
             return -SDP_INVALID_SYNTAX;
         len -= seqlen;
-
         while (seqlen)
             if (sdp_svc_match(sdp, &req, &seqlen))
                 return -SDP_INVALID_SYNTAX;
-    } else if (sdp_svc_match(sdp, &req, &seqlen))
-        return -SDP_INVALID_SYNTAX;
+    } else {
+        if (sdp_svc_match(sdp, &req, &len)) {
+            return -SDP_INVALID_SYNTAX;
+        }
+    }
 
     if (len < 3)
         return -SDP_INVALID_SYNTAX;
@@ -278,8 +280,11 @@ static ssize_t sdp_attr_get(struct bt_l2cap_sdp_state_s *sdp,
         while (seqlen)
             if (sdp_attr_match(record, &req, &seqlen))
                 return -SDP_INVALID_SYNTAX;
-    } else if (sdp_attr_match(record, &req, &seqlen))
-        return -SDP_INVALID_SYNTAX;
+    } else {
+        if (sdp_attr_match(record, &req, &len)) {
+            return -SDP_INVALID_SYNTAX;
+        }
+    }
 
     if (len < 1)
         return -SDP_INVALID_SYNTAX;
@@ -393,8 +398,11 @@ static ssize_t sdp_svc_search_attr_get(struct bt_l2cap_sdp_state_s *sdp,
         while (seqlen)
             if (sdp_svc_match(sdp, &req, &seqlen))
                 return -SDP_INVALID_SYNTAX;
-    } else if (sdp_svc_match(sdp, &req, &seqlen))
-        return -SDP_INVALID_SYNTAX;
+    } else {
+        if (sdp_svc_match(sdp, &req, &len)) {
+            return -SDP_INVALID_SYNTAX;
+        }
+    }
 
     if (len < 3)
         return -SDP_INVALID_SYNTAX;
@@ -413,8 +421,11 @@ static ssize_t sdp_svc_search_attr_get(struct bt_l2cap_sdp_state_s *sdp,
         while (seqlen)
             if (sdp_svc_attr_match(sdp, &req, &seqlen))
                 return -SDP_INVALID_SYNTAX;
-    } else if (sdp_svc_attr_match(sdp, &req, &seqlen))
-        return -SDP_INVALID_SYNTAX;
+    } else {
+        if (sdp_svc_attr_match(sdp, &req, &len)) {
+            return -SDP_INVALID_SYNTAX;
+        }
+    }
 
     if (len < 1)
         return -SDP_INVALID_SYNTAX;
diff --git a/hw/core/ptimer.c b/hw/core/ptimer.c
index 8437bd6..edf077c 100644
--- a/hw/core/ptimer.c
+++ b/hw/core/ptimer.c
@@ -9,6 +9,7 @@
 #include "qemu/timer.h"
 #include "hw/ptimer.h"
 #include "qemu/host-utils.h"
+#include "sysemu/replay.h"
 
 struct ptimer_state
 {
@@ -27,7 +28,7 @@ struct ptimer_state
 static void ptimer_trigger(ptimer_state *s)
 {
     if (s->bh) {
-        qemu_bh_schedule(s->bh);
+        replay_bh_schedule_event(s->bh);
     }
 }
 
diff --git a/hw/core/qdev.c b/hw/core/qdev.c
index 4ab04aa..b3ad467 100644
--- a/hw/core/qdev.c
+++ b/hw/core/qdev.c
@@ -325,6 +325,11 @@ void qdev_reset_all(DeviceState *dev)
     qdev_walk_children(dev, NULL, NULL, qdev_reset_one, qbus_reset_one, NULL);
 }
 
+void qdev_reset_all_fn(void *opaque)
+{
+    qdev_reset_all(DEVICE(opaque));
+}
+
 void qbus_reset_all(BusState *bus)
 {
     qbus_walk_children(bus, NULL, NULL, qdev_reset_one, qbus_reset_one, NULL);
diff --git a/hw/display/qxl.c b/hw/display/qxl.c
index 9c961da..8a3040c 100644
--- a/hw/display/qxl.c
+++ b/hw/display/qxl.c
@@ -2156,7 +2156,7 @@ static int qxl_post_load(void *opaque, int version)
         qxl_create_guest_primary(d, 1, QXL_SYNC);
 
         /* replay surface-create and cursor-set commands */
-        cmds = g_malloc0(sizeof(QXLCommandExt) * (d->ssd.num_surfaces + 1));
+        cmds = g_new0(QXLCommandExt, d->ssd.num_surfaces + 1);
         for (in = 0, out = 0; in < d->ssd.num_surfaces; in++) {
             if (d->guest_surfaces.cmds[in] == 0) {
                 continue;
diff --git a/hw/display/tcx.c b/hw/display/tcx.c
index bf119bc..d720ea6 100644
--- a/hw/display/tcx.c
+++ b/hw/display/tcx.c
@@ -944,57 +944,55 @@ static void tcx_initfn(Object *obj)
     SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
     TCXState *s = TCX(obj);
 
-    memory_region_init_ram(&s->rom, OBJECT(s), "tcx.prom", FCODE_MAX_ROM_SIZE,
+    memory_region_init_ram(&s->rom, obj, "tcx.prom", FCODE_MAX_ROM_SIZE,
                            &error_fatal);
     memory_region_set_readonly(&s->rom, true);
     sysbus_init_mmio(sbd, &s->rom);
 
     /* 2/STIP : Stippler */
-    memory_region_init_io(&s->stip, OBJECT(s), &tcx_stip_ops, s, "tcx.stip",
+    memory_region_init_io(&s->stip, obj, &tcx_stip_ops, s, "tcx.stip",
                           TCX_STIP_NREGS);
     sysbus_init_mmio(sbd, &s->stip);
 
     /* 3/BLIT : Blitter */
-    memory_region_init_io(&s->blit, OBJECT(s), &tcx_blit_ops, s, "tcx.blit",
+    memory_region_init_io(&s->blit, obj, &tcx_blit_ops, s, "tcx.blit",
                           TCX_BLIT_NREGS);
     sysbus_init_mmio(sbd, &s->blit);
 
     /* 5/RSTIP : Raw Stippler */
-    memory_region_init_io(&s->rstip, OBJECT(s), &tcx_rstip_ops, s, "tcx.rstip",
+    memory_region_init_io(&s->rstip, obj, &tcx_rstip_ops, s, "tcx.rstip",
                           TCX_RSTIP_NREGS);
     sysbus_init_mmio(sbd, &s->rstip);
 
     /* 6/RBLIT : Raw Blitter */
-    memory_region_init_io(&s->rblit, OBJECT(s), &tcx_rblit_ops, s, "tcx.rblit",
+    memory_region_init_io(&s->rblit, obj, &tcx_rblit_ops, s, "tcx.rblit",
                           TCX_RBLIT_NREGS);
     sysbus_init_mmio(sbd, &s->rblit);
 
     /* 7/TEC : ??? */
-    memory_region_init_io(&s->tec, OBJECT(s), &tcx_dummy_ops, s,
-                          "tcx.tec", TCX_TEC_NREGS);
+    memory_region_init_io(&s->tec, obj, &tcx_dummy_ops, s, "tcx.tec",
+                          TCX_TEC_NREGS);
     sysbus_init_mmio(sbd, &s->tec);
 
     /* 8/CMAP : DAC */
-    memory_region_init_io(&s->dac, OBJECT(s), &tcx_dac_ops, s,
-                          "tcx.dac", TCX_DAC_NREGS);
+    memory_region_init_io(&s->dac, obj, &tcx_dac_ops, s, "tcx.dac",
+                          TCX_DAC_NREGS);
     sysbus_init_mmio(sbd, &s->dac);
 
     /* 9/THC : Cursor */
-    memory_region_init_io(&s->thc, OBJECT(s), &tcx_thc_ops, s, "tcx.thc",
+    memory_region_init_io(&s->thc, obj, &tcx_thc_ops, s, "tcx.thc",
                           TCX_THC_NREGS);
     sysbus_init_mmio(sbd, &s->thc);
 
     /* 11/DHC : ??? */
-    memory_region_init_io(&s->dhc, OBJECT(s), &tcx_dummy_ops, s, "tcx.dhc",
+    memory_region_init_io(&s->dhc, obj, &tcx_dummy_ops, s, "tcx.dhc",
                           TCX_DHC_NREGS);
     sysbus_init_mmio(sbd, &s->dhc);
 
     /* 12/ALT : ??? */
-    memory_region_init_io(&s->alt, OBJECT(s), &tcx_dummy_ops, s, "tcx.alt",
+    memory_region_init_io(&s->alt, obj, &tcx_dummy_ops, s, "tcx.alt",
                           TCX_ALT_NREGS);
     sysbus_init_mmio(sbd, &s->alt);
-
-    return;
 }
 
 static void tcx_realizefn(DeviceState *dev, Error **errp)
diff --git a/hw/dma/pxa2xx_dma.c b/hw/dma/pxa2xx_dma.c
index d4501fb..54cdb25 100644
--- a/hw/dma/pxa2xx_dma.c
+++ b/hw/dma/pxa2xx_dma.c
@@ -459,9 +459,8 @@ static int pxa2xx_dma_init(SysBusDevice *sbd)
         return -1;
     }
 
-    s->chan = g_malloc0(sizeof(PXA2xxDMAChannel) * s->channels);
+    s->chan = g_new0(PXA2xxDMAChannel, s->channels);
 
-    memset(s->chan, 0, sizeof(PXA2xxDMAChannel) * s->channels);
     for (i = 0; i < s->channels; i ++)
         s->chan[i].state = DCSR_STOPINTR;
 
diff --git a/hw/i386/kvm/clock.c b/hw/i386/kvm/clock.c
index efdf165..0593a3f 100644
--- a/hw/i386/kvm/clock.c
+++ b/hw/i386/kvm/clock.c
@@ -17,7 +17,7 @@
 #include "qemu/host-utils.h"
 #include "sysemu/sysemu.h"
 #include "sysemu/kvm.h"
-#include "sysemu/cpus.h"
+#include "kvm_i386.h"
 #include "hw/sysbus.h"
 #include "hw/kvm/clock.h"
 
@@ -125,21 +125,7 @@ static void kvmclock_vm_state_change(void *opaque, int running,
             return;
         }
 
-        cpu_synchronize_all_states();
-        /* In theory, the cpu_synchronize_all_states() call above wouldn't
-         * affect the rest of the code, as the VCPU state inside CPUState
-         * is supposed to always match the VCPU state on the kernel side.
-         *
-         * In practice, calling cpu_synchronize_state() too soon will load the
-         * kernel-side APIC state into X86CPU.apic_state too early, APIC state
-         * won't be reloaded later because CPUState.vcpu_dirty==true, and
-         * outdated APIC state may be migrated to another host.
-         *
-         * The real fix would be to make sure outdated APIC state is read
-         * from the kernel again when necessary. While this is not fixed, we
-         * need the cpu_clean_all_dirty() call below.
-         */
-        cpu_clean_all_dirty();
+        kvm_synchronize_all_tsc();
 
         ret = kvm_vm_ioctl(kvm_state, KVM_GET_CLOCK, &data);
         if (ret < 0) {
diff --git a/hw/i386/pc.c b/hw/i386/pc.c
index 0cb8afd..5e20e07 100644
--- a/hw/i386/pc.c
+++ b/hw/i386/pc.c
@@ -1795,9 +1795,9 @@ static void pc_machine_set_max_ram_below_4g(Object *obj, Visitor *v,
         return;
     }
     if (value > (1ULL << 32)) {
-        error_set(&error, ERROR_CLASS_GENERIC_ERROR,
-                  "Machine option 'max-ram-below-4g=%"PRIu64
-                  "' expects size less than or equal to 4G", value);
+        error_setg(&error,
+                   "Machine option 'max-ram-below-4g=%"PRIu64
+                   "' expects size less than or equal to 4G", value);
         error_propagate(errp, error);
         return;
     }
diff --git a/hw/i386/pc_piix.c b/hw/i386/pc_piix.c
index 393dcc4..07d0baa 100644
--- a/hw/i386/pc_piix.c
+++ b/hw/i386/pc_piix.c
@@ -472,6 +472,7 @@ static void pc_i440fx_machine_options(MachineClass *m)
 static void pc_i440fx_2_5_machine_options(MachineClass *m)
 {
     pc_i440fx_machine_options(m);
+    m->hw_version = QEMU_VERSION;
     m->alias = "pc";
     m->is_default = 1;
 }
@@ -484,6 +485,7 @@ static void pc_i440fx_2_4_machine_options(MachineClass *m)
 {
     PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
     pc_i440fx_2_5_machine_options(m);
+    m->hw_version = "2.4.0";
     m->alias = NULL;
     m->is_default = 0;
     pcmc->broken_reserved_end = true;
@@ -497,6 +499,7 @@ DEFINE_I440FX_MACHINE(v2_4, "pc-i440fx-2.4", NULL,
 static void pc_i440fx_2_3_machine_options(MachineClass *m)
 {
     pc_i440fx_2_4_machine_options(m);
+    m->hw_version = "2.3.0";
     m->alias = NULL;
     m->is_default = 0;
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_3);
@@ -509,6 +512,7 @@ DEFINE_I440FX_MACHINE(v2_3, "pc-i440fx-2.3", pc_compat_2_3,
 static void pc_i440fx_2_2_machine_options(MachineClass *m)
 {
     pc_i440fx_2_3_machine_options(m);
+    m->hw_version = "2.2.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_2);
 }
 
@@ -519,6 +523,7 @@ DEFINE_I440FX_MACHINE(v2_2, "pc-i440fx-2.2", pc_compat_2_2,
 static void pc_i440fx_2_1_machine_options(MachineClass *m)
 {
     pc_i440fx_2_2_machine_options(m);
+    m->hw_version = "2.1.0";
     m->default_display = NULL;
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_1);
 }
@@ -531,6 +536,7 @@ DEFINE_I440FX_MACHINE(v2_1, "pc-i440fx-2.1", pc_compat_2_1,
 static void pc_i440fx_2_0_machine_options(MachineClass *m)
 {
     pc_i440fx_2_1_machine_options(m);
+    m->hw_version = "2.0.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_0);
 }
 
@@ -541,6 +547,7 @@ DEFINE_I440FX_MACHINE(v2_0, "pc-i440fx-2.0", pc_compat_2_0,
 static void pc_i440fx_1_7_machine_options(MachineClass *m)
 {
     pc_i440fx_2_0_machine_options(m);
+    m->hw_version = "1.7.0";
     m->default_machine_opts = NULL;
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_7);
 }
@@ -552,6 +559,7 @@ DEFINE_I440FX_MACHINE(v1_7, "pc-i440fx-1.7", pc_compat_1_7,
 static void pc_i440fx_1_6_machine_options(MachineClass *m)
 {
     pc_i440fx_1_7_machine_options(m);
+    m->hw_version = "1.6.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_6);
 }
 
@@ -562,6 +570,7 @@ DEFINE_I440FX_MACHINE(v1_6, "pc-i440fx-1.6", pc_compat_1_6,
 static void pc_i440fx_1_5_machine_options(MachineClass *m)
 {
     pc_i440fx_1_6_machine_options(m);
+    m->hw_version = "1.5.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_5);
 }
 
@@ -572,6 +581,7 @@ DEFINE_I440FX_MACHINE(v1_5, "pc-i440fx-1.5", pc_compat_1_5,
 static void pc_i440fx_1_4_machine_options(MachineClass *m)
 {
     pc_i440fx_1_5_machine_options(m);
+    m->hw_version = "1.4.0";
     m->hot_add_cpu = NULL;
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_4);
 }
@@ -604,6 +614,7 @@ DEFINE_I440FX_MACHINE(v1_4, "pc-i440fx-1.4", pc_compat_1_4,
 static void pc_i440fx_1_3_machine_options(MachineClass *m)
 {
     pc_i440fx_1_4_machine_options(m);
+    m->hw_version = "1.3.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_3);
 }
 
@@ -642,6 +653,7 @@ DEFINE_I440FX_MACHINE(v1_3, "pc-1.3", pc_compat_1_3,
 static void pc_i440fx_1_2_machine_options(MachineClass *m)
 {
     pc_i440fx_1_3_machine_options(m);
+    m->hw_version = "1.2.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_2);
 }
 
@@ -684,6 +696,7 @@ DEFINE_I440FX_MACHINE(v1_2, "pc-1.2", pc_compat_1_2,
 static void pc_i440fx_1_1_machine_options(MachineClass *m)
 {
     pc_i440fx_1_2_machine_options(m);
+    m->hw_version = "1.1.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_1);
 }
 
diff --git a/hw/i386/pc_q35.c b/hw/i386/pc_q35.c
index 2f8f396..0fdae09 100644
--- a/hw/i386/pc_q35.c
+++ b/hw/i386/pc_q35.c
@@ -373,6 +373,7 @@ static void pc_q35_machine_options(MachineClass *m)
 static void pc_q35_2_5_machine_options(MachineClass *m)
 {
     pc_q35_machine_options(m);
+    m->hw_version = QEMU_VERSION;
     m->alias = "q35";
 }
 
@@ -383,6 +384,7 @@ static void pc_q35_2_4_machine_options(MachineClass *m)
 {
     PCMachineClass *pcmc = PC_MACHINE_CLASS(m);
     pc_q35_2_5_machine_options(m);
+    m->hw_version = "2.4.0";
     m->alias = NULL;
     pcmc->broken_reserved_end = true;
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_4);
@@ -395,6 +397,7 @@ DEFINE_Q35_MACHINE(v2_4, "pc-q35-2.4", NULL,
 static void pc_q35_2_3_machine_options(MachineClass *m)
 {
     pc_q35_2_4_machine_options(m);
+    m->hw_version = "2.3.0";
     m->no_floppy = 0;
     m->no_tco = 1;
     m->alias = NULL;
@@ -408,6 +411,7 @@ DEFINE_Q35_MACHINE(v2_3, "pc-q35-2.3", pc_compat_2_3,
 static void pc_q35_2_2_machine_options(MachineClass *m)
 {
     pc_q35_2_3_machine_options(m);
+    m->hw_version = "2.2.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_2);
 }
 
@@ -418,6 +422,7 @@ DEFINE_Q35_MACHINE(v2_2, "pc-q35-2.2", pc_compat_2_2,
 static void pc_q35_2_1_machine_options(MachineClass *m)
 {
     pc_q35_2_2_machine_options(m);
+    m->hw_version = "2.1.0";
     m->default_display = NULL;
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_1);
 }
@@ -429,6 +434,7 @@ DEFINE_Q35_MACHINE(v2_1, "pc-q35-2.1", pc_compat_2_1,
 static void pc_q35_2_0_machine_options(MachineClass *m)
 {
     pc_q35_2_1_machine_options(m);
+    m->hw_version = "2.0.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_2_0);
 }
 
@@ -439,6 +445,7 @@ DEFINE_Q35_MACHINE(v2_0, "pc-q35-2.0", pc_compat_2_0,
 static void pc_q35_1_7_machine_options(MachineClass *m)
 {
     pc_q35_2_0_machine_options(m);
+    m->hw_version = "1.7.0";
     m->default_machine_opts = NULL;
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_7);
 }
@@ -450,6 +457,7 @@ DEFINE_Q35_MACHINE(v1_7, "pc-q35-1.7", pc_compat_1_7,
 static void pc_q35_1_6_machine_options(MachineClass *m)
 {
     pc_q35_machine_options(m);
+    m->hw_version = "1.6.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_6);
 }
 
@@ -460,6 +468,7 @@ DEFINE_Q35_MACHINE(v1_6, "pc-q35-1.6", pc_compat_1_6,
 static void pc_q35_1_5_machine_options(MachineClass *m)
 {
     pc_q35_1_6_machine_options(m);
+    m->hw_version = "1.5.0";
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_5);
 }
 
@@ -470,6 +479,7 @@ DEFINE_Q35_MACHINE(v1_5, "pc-q35-1.5", pc_compat_1_5,
 static void pc_q35_1_4_machine_options(MachineClass *m)
 {
     pc_q35_1_5_machine_options(m);
+    m->hw_version = "1.4.0";
     m->hot_add_cpu = NULL;
     SET_MACHINE_COMPAT(m, PC_COMPAT_1_4);
 }
diff --git a/hw/i386/pci-assign-load-rom.c b/hw/i386/pci-assign-load-rom.c
index 34a3a7e..e40b586 100644
--- a/hw/i386/pci-assign-load-rom.c
+++ b/hw/i386/pci-assign-load-rom.c
@@ -45,14 +45,10 @@ void *pci_assign_dev_load_option_rom(PCIDevice *dev, struct Object *owner,
         return NULL;
     }
 
-    if (access(rom_file, F_OK)) {
-        error_report("pci-assign: Insufficient privileges for %s", rom_file);
-        return NULL;
-    }
-
     /* Write "1" to the ROM file to enable it */
     fp = fopen(rom_file, "r+");
     if (fp == NULL) {
+        error_report("pci-assign: Cannot open %s: %s", rom_file, strerror(errno));
         return NULL;
     }
     val = 1;
diff --git a/hw/ide/ahci.c b/hw/ide/ahci.c
index 21f76ed..dd1912e 100644
--- a/hw/ide/ahci.c
+++ b/hw/ide/ahci.c
@@ -378,17 +378,23 @@ static uint64_t ahci_mem_read(void *opaque, hwaddr addr, unsigned size)
     int ofst = addr - aligned;
     uint64_t lo = ahci_mem_read_32(opaque, aligned);
     uint64_t hi;
+    uint64_t val;
 
     /* if < 8 byte read does not cross 4 byte boundary */
     if (ofst + size <= 4) {
-        return lo >> (ofst * 8);
+        val = lo >> (ofst * 8);
+    } else {
+        g_assert_cmpint(size, >, 1);
+
+        /* If the 64bit read is unaligned, we will produce undefined
+         * results. AHCI does not support unaligned 64bit reads. */
+        hi = ahci_mem_read_32(opaque, aligned + 4);
+        val = (hi << 32 | lo) >> (ofst * 8);
     }
-    g_assert_cmpint(size, >, 1);
 
-    /* If the 64bit read is unaligned, we will produce undefined
-     * results. AHCI does not support unaligned 64bit reads. */
-    hi = ahci_mem_read_32(opaque, aligned + 4);
-    return (hi << 32 | lo) >> (ofst * 8);
+    DPRINTF(-1, "addr=0x%" HWADDR_PRIx " val=0x%" PRIx64 ", size=%d\n",
+            addr, val, size);
+    return val;
 }
 
 
@@ -397,6 +403,9 @@ static void ahci_mem_write(void *opaque, hwaddr addr,
 {
     AHCIState *s = opaque;
 
+    DPRINTF(-1, "addr=0x%" HWADDR_PRIx " val=0x%" PRIx64 ", size=%d\n",
+            addr, val, size);
+
     /* Only aligned reads are allowed on AHCI */
     if (addr & 3) {
         fprintf(stderr, "ahci: Mis-aligned write to addr 0x"
@@ -804,8 +813,21 @@ static int prdt_tbl_entry_size(const AHCI_SG *tbl)
     return (le32_to_cpu(tbl->flags_size) & AHCI_PRDT_SIZE_MASK) + 1;
 }
 
+/**
+ * Fetch entries in a guest-provided PRDT and convert it into a QEMU SGlist.
+ * @ad: The AHCIDevice for whom we are building the SGList.
+ * @sglist: The SGList target to add PRD entries to.
+ * @cmd: The AHCI Command Header that describes where the PRDT is.
+ * @limit: The remaining size of the S/ATA transaction, in bytes.
+ * @offset: The number of bytes already transferred, in bytes.
+ *
+ * The AHCI PRDT can describe up to 256GiB. S/ATA only support transactions of
+ * up to 32MiB as of ATA8-ACS3 rev 1b, assuming a 512 byte sector size. We stop
+ * building the sglist from the PRDT as soon as we hit @limit bytes,
+ * which is <= INT32_MAX/2GiB.
+ */
 static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist,
-                                AHCICmdHdr *cmd, int64_t limit, int32_t offset)
+                                AHCICmdHdr *cmd, int64_t limit, uint64_t offset)
 {
     uint16_t opts = le16_to_cpu(cmd->opts);
     uint16_t prdtl = le16_to_cpu(cmd->prdtl);
@@ -823,14 +845,6 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist,
     IDEBus *bus = &ad->port;
     BusState *qbus = BUS(bus);
 
-    /*
-     * Note: AHCI PRDT can describe up to 256GiB. SATA/ATA only support
-     * transactions of up to 32MiB as of ATA8-ACS3 rev 1b, assuming a
-     * 512 byte sector size. We limit the PRDT in this implementation to
-     * a reasonably large 2GiB, which can accommodate the maximum transfer
-     * request for sector sizes up to 32K.
-     */
-
     if (!prdtl) {
         DPRINTF(ad->port_no, "no sg list given by guest: 0x%08x\n", opts);
         return -1;
@@ -880,13 +894,6 @@ static int ahci_populate_sglist(AHCIDevice *ad, QEMUSGList *sglist,
             qemu_sglist_add(sglist, le64_to_cpu(tbl[i].addr),
                             MIN(prdt_tbl_entry_size(&tbl[i]),
                                 limit - sglist->size));
-            if (sglist->size > INT32_MAX) {
-                error_report("AHCI Physical Region Descriptor Table describes "
-                             "more than 2 GiB.");
-                qemu_sglist_destroy(sglist);
-                r = -1;
-                goto out;
-            }
         }
     }
 
@@ -1427,24 +1434,26 @@ static const IDEDMAOps ahci_dma_ops = {
     .cmd_done = ahci_cmd_done,
 };
 
-void ahci_init(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports)
+void ahci_init(AHCIState *s, DeviceState *qdev)
 {
-    qemu_irq *irqs;
-    int i;
-
-    s->as = as;
-    s->ports = ports;
-    s->dev = g_new0(AHCIDevice, ports);
     s->container = qdev;
-    ahci_reg_init(s);
     /* XXX BAR size should be 1k, but that breaks, so bump it to 4k for now */
     memory_region_init_io(&s->mem, OBJECT(qdev), &ahci_mem_ops, s,
                           "ahci", AHCI_MEM_BAR_SIZE);
     memory_region_init_io(&s->idp, OBJECT(qdev), &ahci_idp_ops, s,
                           "ahci-idp", 32);
+}
 
-    irqs = qemu_allocate_irqs(ahci_irq_set, s, s->ports);
+void ahci_realize(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports)
+{
+    qemu_irq *irqs;
+    int i;
 
+    s->as = as;
+    s->ports = ports;
+    s->dev = g_new0(AHCIDevice, ports);
+    ahci_reg_init(s);
+    irqs = qemu_allocate_irqs(ahci_irq_set, s, s->ports);
     for (i = 0; i < s->ports; i++) {
         AHCIDevice *ad = &s->dev[i];
 
@@ -1639,17 +1648,24 @@ static void sysbus_ahci_reset(DeviceState *dev)
     ahci_reset(&s->ahci);
 }
 
-static void sysbus_ahci_realize(DeviceState *dev, Error **errp)
+static void sysbus_ahci_init(Object *obj)
 {
-    SysBusDevice *sbd = SYS_BUS_DEVICE(dev);
-    SysbusAHCIState *s = SYSBUS_AHCI(dev);
+    SysbusAHCIState *s = SYSBUS_AHCI(obj);
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
 
-    ahci_init(&s->ahci, dev, &address_space_memory, s->num_ports);
+    ahci_init(&s->ahci, DEVICE(obj));
 
     sysbus_init_mmio(sbd, &s->ahci.mem);
     sysbus_init_irq(sbd, &s->ahci.irq);
 }
 
+static void sysbus_ahci_realize(DeviceState *dev, Error **errp)
+{
+    SysbusAHCIState *s = SYSBUS_AHCI(dev);
+
+    ahci_realize(&s->ahci, dev, &address_space_memory, s->num_ports);
+}
+
 static Property sysbus_ahci_properties[] = {
     DEFINE_PROP_UINT32("num-ports", SysbusAHCIState, num_ports, 1),
     DEFINE_PROP_END_OF_LIST(),
@@ -1670,12 +1686,108 @@ static const TypeInfo sysbus_ahci_info = {
     .name          = TYPE_SYSBUS_AHCI,
     .parent        = TYPE_SYS_BUS_DEVICE,
     .instance_size = sizeof(SysbusAHCIState),
+    .instance_init = sysbus_ahci_init,
     .class_init    = sysbus_ahci_class_init,
 };
 
+#define ALLWINNER_AHCI_BISTAFR    ((0xa0 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_BISTCR     ((0xa4 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_BISTFCTR   ((0xa8 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_BISTSR     ((0xac - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_BISTDECR   ((0xb0 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_DIAGNR0    ((0xb4 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_DIAGNR1    ((0xb8 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_OOBR       ((0xbc - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_PHYCS0R    ((0xc0 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_PHYCS1R    ((0xc4 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_PHYCS2R    ((0xc8 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_TIMER1MS   ((0xe0 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_GPARAM1R   ((0xe8 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_GPARAM2R   ((0xec - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_PPARAMR    ((0xf0 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_TESTR      ((0xf4 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_VERSIONR   ((0xf8 - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_IDR        ((0xfc - ALLWINNER_AHCI_MMIO_OFF) / 4)
+#define ALLWINNER_AHCI_RWCR       ((0xfc - ALLWINNER_AHCI_MMIO_OFF) / 4)
+
+static uint64_t allwinner_ahci_mem_read(void *opaque, hwaddr addr,
+                                        unsigned size)
+{
+    AllwinnerAHCIState *a = opaque;
+    uint64_t val = a->regs[addr/4];
+
+    switch (addr / 4) {
+    case ALLWINNER_AHCI_PHYCS0R:
+        val |= 0x2 << 28;
+        break;
+    case ALLWINNER_AHCI_PHYCS2R:
+        val &= ~(0x1 << 24);
+        break;
+    }
+    DPRINTF(-1, "addr=0x%" HWADDR_PRIx " val=0x%" PRIx64 ", size=%d\n",
+            addr, val, size);
+    return  val;
+}
+
+static void allwinner_ahci_mem_write(void *opaque, hwaddr addr,
+                                     uint64_t val, unsigned size)
+{
+    AllwinnerAHCIState *a = opaque;
+
+    DPRINTF(-1, "addr=0x%" HWADDR_PRIx " val=0x%" PRIx64 ", size=%d\n",
+            addr, val, size);
+    a->regs[addr/4] = val;
+}
+
+static const MemoryRegionOps allwinner_ahci_mem_ops = {
+    .read = allwinner_ahci_mem_read,
+    .write = allwinner_ahci_mem_write,
+    .valid.min_access_size = 4,
+    .valid.max_access_size = 4,
+    .endianness = DEVICE_LITTLE_ENDIAN,
+};
+
+static void allwinner_ahci_init(Object *obj)
+{
+    SysbusAHCIState *s = SYSBUS_AHCI(obj);
+    AllwinnerAHCIState *a = ALLWINNER_AHCI(obj);
+
+    memory_region_init_io(&a->mmio, OBJECT(obj), &allwinner_ahci_mem_ops, a,
+                          "allwinner-ahci", ALLWINNER_AHCI_MMIO_SIZE);
+    memory_region_add_subregion(&s->ahci.mem, ALLWINNER_AHCI_MMIO_OFF,
+                                &a->mmio);
+}
+
+static const VMStateDescription vmstate_allwinner_ahci = {
+    .name = "allwinner-ahci",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32_ARRAY(regs, AllwinnerAHCIState,
+                             ALLWINNER_AHCI_MMIO_SIZE/4),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void allwinner_ahci_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->vmsd = &vmstate_allwinner_ahci;
+}
+
+static const TypeInfo allwinner_ahci_info = {
+    .name          = TYPE_ALLWINNER_AHCI,
+    .parent        = TYPE_SYSBUS_AHCI,
+    .instance_size = sizeof(AllwinnerAHCIState),
+    .instance_init = allwinner_ahci_init,
+    .class_init    = allwinner_ahci_class_init,
+};
+
 static void sysbus_ahci_register_types(void)
 {
     type_register_static(&sysbus_ahci_info);
+    type_register_static(&allwinner_ahci_info);
 }
 
 type_init(sysbus_ahci_register_types)
diff --git a/hw/ide/ahci.h b/hw/ide/ahci.h
index c9b3805..bc777ed 100644
--- a/hw/ide/ahci.h
+++ b/hw/ide/ahci.h
@@ -366,7 +366,8 @@ typedef struct SDBFIS {
     uint32_t payload;
 } QEMU_PACKED SDBFIS;
 
-void ahci_init(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports);
+void ahci_realize(AHCIState *s, DeviceState *qdev, AddressSpace *as, int ports);
+void ahci_init(AHCIState *s, DeviceState *qdev);
 void ahci_uninit(AHCIState *s);
 
 void ahci_reset(AHCIState *s);
@@ -385,4 +386,20 @@ typedef struct SysbusAHCIState {
     uint32_t num_ports;
 } SysbusAHCIState;
 
+#define TYPE_ALLWINNER_AHCI "allwinner-ahci"
+#define ALLWINNER_AHCI(obj) OBJECT_CHECK(AllwinnerAHCIState, (obj), \
+                       TYPE_ALLWINNER_AHCI)
+
+#define ALLWINNER_AHCI_MMIO_OFF  0x80
+#define ALLWINNER_AHCI_MMIO_SIZE 0x80
+
+struct AllwinnerAHCIState {
+    /*< private >*/
+    SysbusAHCIState parent_obj;
+    /*< public >*/
+
+    MemoryRegion mmio;
+    uint32_t regs[ALLWINNER_AHCI_MMIO_SIZE/4];
+};
+
 #endif /* HW_IDE_AHCI_H */
diff --git a/hw/ide/atapi.c b/hw/ide/atapi.c
index 747f466..ed8bb2c 100644
--- a/hw/ide/atapi.c
+++ b/hw/ide/atapi.c
@@ -108,27 +108,30 @@ static void cd_data_to_raw(uint8_t *buf, int lba)
 static int cd_read_sector(IDEState *s, int lba, uint8_t *buf, int sector_size)
 {
     int ret;
+    block_acct_start(blk_get_stats(s->blk), &s->acct,
+                     4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
 
     switch(sector_size) {
     case 2048:
-        block_acct_start(blk_get_stats(s->blk), &s->acct,
-                         4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
         ret = blk_read(s->blk, (int64_t)lba << 2, buf, 4);
-        block_acct_done(blk_get_stats(s->blk), &s->acct);
         break;
     case 2352:
-        block_acct_start(blk_get_stats(s->blk), &s->acct,
-                         4 * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
         ret = blk_read(s->blk, (int64_t)lba << 2, buf + 16, 4);
-        block_acct_done(blk_get_stats(s->blk), &s->acct);
-        if (ret < 0)
-            return ret;
-        cd_data_to_raw(buf, lba);
+        if (ret >= 0) {
+            cd_data_to_raw(buf, lba);
+        }
         break;
     default:
-        ret = -EIO;
-        break;
+        block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_READ);
+        return -EIO;
+    }
+
+    if (ret < 0) {
+        block_acct_failed(blk_get_stats(s->blk), &s->acct);
+    } else {
+        block_acct_done(blk_get_stats(s->blk), &s->acct);
     }
+
     return ret;
 }
 
@@ -167,6 +170,17 @@ void ide_atapi_io_error(IDEState *s, int ret)
     }
 }
 
+static uint16_t atapi_byte_count_limit(IDEState *s)
+{
+    uint16_t bcl;
+
+    bcl = s->lcyl | (s->hcyl << 8);
+    if (bcl == 0xffff) {
+        return 0xfffe;
+    }
+    return bcl;
+}
+
 /* The whole ATAPI transfer logic is handled in this function */
 void ide_atapi_cmd_reply_end(IDEState *s)
 {
@@ -209,12 +223,10 @@ void ide_atapi_cmd_reply_end(IDEState *s)
         } else {
             /* a new transfer is needed */
             s->nsector = (s->nsector & ~7) | ATAPI_INT_REASON_IO;
-            byte_count_limit = s->lcyl | (s->hcyl << 8);
+            byte_count_limit = atapi_byte_count_limit(s);
 #ifdef DEBUG_IDE_ATAPI
             printf("byte_count_limit=%d\n", byte_count_limit);
 #endif
-            if (byte_count_limit == 0xffff)
-                byte_count_limit--;
             size = s->packet_transfer_size;
             if (size > byte_count_limit) {
                 /* byte count limit must be even if this case */
@@ -357,7 +369,11 @@ static void ide_atapi_cmd_read_dma_cb(void *opaque, int ret)
     return;
 
 eot:
-    block_acct_done(blk_get_stats(s->blk), &s->acct);
+    if (ret < 0) {
+        block_acct_failed(blk_get_stats(s->blk), &s->acct);
+    } else {
+        block_acct_done(blk_get_stats(s->blk), &s->acct);
+    }
     ide_set_inactive(s, false);
 }
 
@@ -1179,7 +1195,7 @@ enum {
     NONDATA = 0x04,
 };
 
-static const struct {
+static const struct AtapiCmd {
     void (*handler)(IDEState *s, uint8_t *buf);
     int flags;
 } atapi_cmd_table[0x100] = {
@@ -1206,9 +1222,9 @@ static const struct {
 
 void ide_atapi_cmd(IDEState *s)
 {
-    uint8_t *buf;
+    uint8_t *buf = s->io_buffer;
+    const struct AtapiCmd *cmd = &atapi_cmd_table[s->io_buffer[0]];
 
-    buf = s->io_buffer;
 #ifdef DEBUG_IDE_ATAPI
     {
         int i;
@@ -1219,14 +1235,14 @@ void ide_atapi_cmd(IDEState *s)
         printf("\n");
     }
 #endif
+
     /*
      * If there's a UNIT_ATTENTION condition pending, only command flagged with
      * ALLOW_UA are allowed to complete. with other commands getting a CHECK
      * condition response unless a higher priority status, defined by the drive
      * here, is pending.
      */
-    if (s->sense_key == UNIT_ATTENTION &&
-        !(atapi_cmd_table[s->io_buffer[0]].flags & ALLOW_UA)) {
+    if (s->sense_key == UNIT_ATTENTION && !(cmd->flags & ALLOW_UA)) {
         ide_atapi_cmd_check_status(s);
         return;
     }
@@ -1237,7 +1253,7 @@ void ide_atapi_cmd(IDEState *s)
      * GET_EVENT_STATUS_NOTIFICATION to detect such tray open/close
      * states rely on this behavior.
      */
-    if (!(atapi_cmd_table[s->io_buffer[0]].flags & ALLOW_UA) &&
+    if (!(cmd->flags & ALLOW_UA) &&
         !s->tray_open && blk_is_inserted(s->blk) && s->cdrom_changed) {
 
         if (s->cdrom_changed == 1) {
@@ -1252,7 +1268,7 @@ void ide_atapi_cmd(IDEState *s)
     }
 
     /* Report a Not Ready condition if appropriate for the command */
-    if ((atapi_cmd_table[s->io_buffer[0]].flags & CHECK_READY) &&
+    if ((cmd->flags & CHECK_READY) &&
         (!media_present(s) || !blk_is_inserted(s->blk)))
     {
         ide_atapi_cmd_error(s, NOT_READY, ASC_MEDIUM_NOT_PRESENT);
@@ -1263,10 +1279,9 @@ void ide_atapi_cmd(IDEState *s)
      * If this is a data-transferring PIO command and BCL is 0,
      * we abort at the /ATA/ level, not the ATAPI level.
      * See ATA8 ACS3 section 7.17.6.49 and 7.21.5 */
-    if (!(atapi_cmd_table[s->io_buffer[0]].flags & NONDATA)) {
+    if (cmd->handler && !(cmd->flags & NONDATA)) {
         /* TODO: Check IDENTIFY data word 125 for default BCL (currently 0) */
-        uint16_t byte_count_limit = s->lcyl | (s->hcyl << 8);
-        if (!(byte_count_limit || s->atapi_dma)) {
+        if (!(atapi_byte_count_limit(s) || s->atapi_dma)) {
             /* TODO: Move abort back into core.c and make static inline again */
             ide_abort_command(s);
             return;
@@ -1274,8 +1289,8 @@ void ide_atapi_cmd(IDEState *s)
     }
 
     /* Execute the command */
-    if (atapi_cmd_table[s->io_buffer[0]].handler) {
-        atapi_cmd_table[s->io_buffer[0]].handler(s, buf);
+    if (cmd->handler) {
+        cmd->handler(s, buf);
         return;
     }
 
diff --git a/hw/ide/core.c b/hw/ide/core.c
index 317406d..2725dd3 100644
--- a/hw/ide/core.c
+++ b/hw/ide/core.c
@@ -574,7 +574,6 @@ static void ide_sector_read_cb(void *opaque, int ret)
     if (ret == -ECANCELED) {
         return;
     }
-    block_acct_done(blk_get_stats(s->blk), &s->acct);
     if (ret != 0) {
         if (ide_handle_rw_error(s, -ret, IDE_RETRY_PIO |
                                 IDE_RETRY_READ)) {
@@ -582,6 +581,8 @@ static void ide_sector_read_cb(void *opaque, int ret)
         }
     }
 
+    block_acct_done(blk_get_stats(s->blk), &s->acct);
+
     n = s->nsector;
     if (n > s->req_nb_sectors) {
         n = s->req_nb_sectors;
@@ -621,6 +622,7 @@ static void ide_sector_read(IDEState *s)
 
     if (!ide_sect_range_ok(s, sector_num, n)) {
         ide_rw_error(s);
+        block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_READ);
         return;
     }
 
@@ -672,6 +674,7 @@ static int ide_handle_rw_error(IDEState *s, int error, int op)
         assert(s->bus->retry_unit == s->unit);
         s->bus->error_status = op;
     } else if (action == BLOCK_ERROR_ACTION_REPORT) {
+        block_acct_failed(blk_get_stats(s->blk), &s->acct);
         if (op & IDE_RETRY_DMA) {
             ide_dma_error(s);
         } else {
@@ -750,6 +753,7 @@ static void ide_dma_cb(void *opaque, int ret)
     if ((s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) &&
         !ide_sect_range_ok(s, sector_num, n)) {
         ide_dma_error(s);
+        block_acct_invalid(blk_get_stats(s->blk), s->acct.type);
         return;
     }
 
@@ -826,7 +830,6 @@ static void ide_sector_write_cb(void *opaque, int ret)
     if (ret == -ECANCELED) {
         return;
     }
-    block_acct_done(blk_get_stats(s->blk), &s->acct);
 
     s->pio_aiocb = NULL;
     s->status &= ~BUSY_STAT;
@@ -837,6 +840,8 @@ static void ide_sector_write_cb(void *opaque, int ret)
         }
     }
 
+    block_acct_done(blk_get_stats(s->blk), &s->acct);
+
     n = s->nsector;
     if (n > s->req_nb_sectors) {
         n = s->req_nb_sectors;
@@ -887,6 +892,7 @@ static void ide_sector_write(IDEState *s)
 
     if (!ide_sect_range_ok(s, sector_num, n)) {
         ide_rw_error(s);
+        block_acct_invalid(blk_get_stats(s->blk), BLOCK_ACCT_WRITE);
         return;
     }
 
@@ -895,7 +901,7 @@ static void ide_sector_write(IDEState *s)
     qemu_iovec_init_external(&s->qiov, &s->iov, 1);
 
     block_acct_start(blk_get_stats(s->blk), &s->acct,
-                     n * BDRV_SECTOR_SIZE, BLOCK_ACCT_READ);
+                     n * BDRV_SECTOR_SIZE, BLOCK_ACCT_WRITE);
     s->pio_aiocb = blk_aio_writev(s->blk, sector_num, &s->qiov, n,
                                   ide_sector_write_cb, s);
 }
@@ -2312,7 +2318,7 @@ int ide_init_drive(IDEState *s, BlockBackend *blk, IDEDriveKind kind,
     if (version) {
         pstrcpy(s->version, sizeof(s->version), version);
     } else {
-        pstrcpy(s->version, sizeof(s->version), qemu_get_version());
+        pstrcpy(s->version, sizeof(s->version), qemu_hw_version());
     }
 
     ide_reset(s);
diff --git a/hw/ide/ich.c b/hw/ide/ich.c
index 350c7f1..16925fa 100644
--- a/hw/ide/ich.c
+++ b/hw/ide/ich.c
@@ -97,6 +97,13 @@ static void pci_ich9_reset(DeviceState *dev)
     ahci_reset(&d->ahci);
 }
 
+static void pci_ich9_ahci_init(Object *obj)
+{
+    struct AHCIPCIState *d = ICH_AHCI(obj);
+
+    ahci_init(&d->ahci, DEVICE(obj));
+}
+
 static void pci_ich9_ahci_realize(PCIDevice *dev, Error **errp)
 {
     struct AHCIPCIState *d;
@@ -104,7 +111,7 @@ static void pci_ich9_ahci_realize(PCIDevice *dev, Error **errp)
     uint8_t *sata_cap;
     d = ICH_AHCI(dev);
 
-    ahci_init(&d->ahci, DEVICE(dev), pci_get_address_space(dev), 6);
+    ahci_realize(&d->ahci, DEVICE(dev), pci_get_address_space(dev), 6);
 
     pci_config_set_prog_interface(dev->config, AHCI_PROGMODE_MAJOR_REV_1);
 
@@ -171,6 +178,7 @@ static const TypeInfo ich_ahci_info = {
     .name          = TYPE_ICH9_AHCI,
     .parent        = TYPE_PCI_DEVICE,
     .instance_size = sizeof(AHCIPCIState),
+    .instance_init = pci_ich9_ahci_init,
     .class_init    = ich_ahci_class_init,
 };
 
diff --git a/hw/ide/internal.h b/hw/ide/internal.h
index 05e93ff..e4629b0 100644
--- a/hw/ide/internal.h
+++ b/hw/ide/internal.h
@@ -397,7 +397,7 @@ struct IDEState {
     struct iovec iov;
     QEMUIOVector qiov;
     /* ATA DMA state */
-    int32_t io_buffer_offset;
+    uint64_t io_buffer_offset;
     int32_t io_buffer_size;
     QEMUSGList sg;
     /* PIO transfer handling */
diff --git a/hw/ide/macio.c b/hw/ide/macio.c
index 893c9b9..3ee962f 100644
--- a/hw/ide/macio.c
+++ b/hw/ide/macio.c
@@ -286,7 +286,11 @@ static void pmac_ide_atapi_transfer_cb(void *opaque, int ret)
     return;
 
 done:
-    block_acct_done(blk_get_stats(s->blk), &s->acct);
+    if (ret < 0) {
+        block_acct_failed(blk_get_stats(s->blk), &s->acct);
+    } else {
+        block_acct_done(blk_get_stats(s->blk), &s->acct);
+    }
     io->dma_end(opaque);
 
     return;
@@ -348,7 +352,11 @@ static void pmac_ide_transfer_cb(void *opaque, int ret)
 
 done:
     if (s->dma_cmd == IDE_DMA_READ || s->dma_cmd == IDE_DMA_WRITE) {
-        block_acct_done(blk_get_stats(s->blk), &s->acct);
+        if (ret < 0) {
+            block_acct_failed(blk_get_stats(s->blk), &s->acct);
+        } else {
+            block_acct_done(blk_get_stats(s->blk), &s->acct);
+        }
     }
     io->dma_end(opaque);
 }
diff --git a/hw/ide/pci.c b/hw/ide/pci.c
index d31ff88..9c54b37 100644
--- a/hw/ide/pci.c
+++ b/hw/ide/pci.c
@@ -103,13 +103,6 @@ static int32_t bmdma_prepare_buf(IDEDMA *dma, int32_t limit)
                 qemu_sglist_add(&s->sg, bm->cur_prd_addr, sg_len);
             }
 
-            /* Note: We limit the max transfer to be 2GiB.
-             * This should accommodate the largest ATA transaction
-             * for LBA48 (65,536 sectors) and 32K sector sizes. */
-            if (s->sg.size > INT32_MAX) {
-                error_report("IDE: sglist describes more than 2GiB.");
-                break;
-            }
             bm->cur_prd_addr += l;
             bm->cur_prd_len -= l;
             s->io_buffer_size += l;
diff --git a/hw/input/tsc210x.c b/hw/input/tsc210x.c
index fae3385..1073bbf 100644
--- a/hw/input/tsc210x.c
+++ b/hw/input/tsc210x.c
@@ -1086,9 +1086,7 @@ uWireSlave *tsc2102_init(qemu_irq pint)
 {
     TSC210xState *s;
 
-    s = (TSC210xState *)
-            g_malloc0(sizeof(TSC210xState));
-    memset(s, 0, sizeof(TSC210xState));
+    s = g_new0(TSC210xState, 1);
     s->x = 160;
     s->y = 160;
     s->pressure = 0;
@@ -1135,9 +1133,7 @@ uWireSlave *tsc2301_init(qemu_irq penirq, qemu_irq kbirq, qemu_irq dav)
 {
     TSC210xState *s;
 
-    s = (TSC210xState *)
-            g_malloc0(sizeof(TSC210xState));
-    memset(s, 0, sizeof(TSC210xState));
+    s = g_new0(TSC210xState, 1);
     s->x = 400;
     s->y = 240;
     s->pressure = 0;
diff --git a/hw/intc/arm_gic.c b/hw/intc/arm_gic.c
index 8bad132..d71aeb8 100644
--- a/hw/intc/arm_gic.c
+++ b/hw/intc/arm_gic.c
@@ -35,8 +35,6 @@ static const uint8_t gic_id[] = {
     0x90, 0x13, 0x04, 0x00, 0x0d, 0xf0, 0x05, 0xb1
 };
 
-#define NUM_CPU(s) ((s)->num_cpu)
-
 static inline int gic_get_current_cpu(GICState *s)
 {
     if (s->num_cpu > 1) {
@@ -64,7 +62,7 @@ void gic_update(GICState *s)
     int cpu;
     int cm;
 
-    for (cpu = 0; cpu < NUM_CPU(s); cpu++) {
+    for (cpu = 0; cpu < s->num_cpu; cpu++) {
         cm = 1 << cpu;
         s->current_pending[cpu] = 1023;
         if (!(s->ctlr & (GICD_CTLR_EN_GRP0 | GICD_CTLR_EN_GRP1))
@@ -567,7 +565,7 @@ static uint32_t gic_dist_readb(void *opaque, hwaddr offset, MemTxAttrs attrs)
         if (offset == 4)
             /* Interrupt Controller Type Register */
             return ((s->num_irq / 32) - 1)
-                    | ((NUM_CPU(s) - 1) << 5)
+                    | ((s->num_cpu - 1) << 5)
                     | (s->security_extn << 10);
         if (offset < 0x08)
             return 0;
@@ -1284,7 +1282,7 @@ static void arm_gic_realize(DeviceState *dev, Error **errp)
      * GIC v2 defines a larger memory region (0x1000) so this will need
      * to be extended when we implement A15.
      */
-    for (i = 0; i < NUM_CPU(s); i++) {
+    for (i = 0; i < s->num_cpu; i++) {
         s->backref[i] = s;
         memory_region_init_io(&s->cpuiomem[i+1], OBJECT(s), &gic_cpu_ops,
                               &s->backref[i], "gic_cpu", 0x100);
diff --git a/hw/misc/Makefile.objs b/hw/misc/Makefile.objs
index 4aa76ff..aeb6b7d 100644
--- a/hw/misc/Makefile.objs
+++ b/hw/misc/Makefile.objs
@@ -36,6 +36,7 @@ obj-$(CONFIG_OMAP) += omap_sdrc.o
 obj-$(CONFIG_OMAP) += omap_tap.o
 obj-$(CONFIG_SLAVIO) += slavio_misc.o
 obj-$(CONFIG_ZYNQ) += zynq_slcr.o
+obj-$(CONFIG_ZYNQ) += zynq-xadc.o
 obj-$(CONFIG_STM32F2XX_SYSCFG) += stm32f2xx_syscfg.o
 
 obj-$(CONFIG_PVPANIC) += pvpanic.o
diff --git a/hw/misc/macio/cuda.c b/hw/misc/macio/cuda.c
index 0fd75b3..9db4c64 100644
--- a/hw/misc/macio/cuda.c
+++ b/hw/misc/macio/cuda.c
@@ -57,6 +57,8 @@
 #define IER_SET		0x80		/* set bits in IER */
 #define IER_CLR		0		/* clear bits in IER */
 #define SR_INT		0x04		/* Shift register full/empty */
+#define SR_DATA_INT	0x08
+#define SR_CLOCK_INT	0x10
 #define T1_INT          0x40            /* Timer 1 interrupt */
 #define T2_INT          0x20            /* Timer 2 interrupt */
 
@@ -108,6 +110,24 @@
 /* CUDA returns time_t's offset from Jan 1, 1904, not 1970 */
 #define RTC_OFFSET                      2082844800
 
+/* CUDA registers */
+#define CUDA_REG_B       0x00
+#define CUDA_REG_A       0x01
+#define CUDA_REG_DIRB    0x02
+#define CUDA_REG_DIRA    0x03
+#define CUDA_REG_T1CL    0x04
+#define CUDA_REG_T1CH    0x05
+#define CUDA_REG_T1LL    0x06
+#define CUDA_REG_T1LH    0x07
+#define CUDA_REG_T2CL    0x08
+#define CUDA_REG_T2CH    0x09
+#define CUDA_REG_SR      0x0a
+#define CUDA_REG_ACR     0x0b
+#define CUDA_REG_PCR     0x0c
+#define CUDA_REG_IFR     0x0d
+#define CUDA_REG_IER     0x0e
+#define CUDA_REG_ANH     0x0f
+
 static void cuda_update(CUDAState *s);
 static void cuda_receive_packet_from_host(CUDAState *s,
                                           const uint8_t *data, int len);
@@ -116,47 +136,48 @@ static void cuda_timer_update(CUDAState *s, CUDATimer *ti,
 
 static void cuda_update_irq(CUDAState *s)
 {
-    if (s->ifr & s->ier & (SR_INT | T1_INT)) {
+    if (s->ifr & s->ier & (SR_INT | T1_INT | T2_INT)) {
         qemu_irq_raise(s->irq);
     } else {
         qemu_irq_lower(s->irq);
     }
 }
 
-static uint64_t get_tb(uint64_t freq)
+static uint64_t get_tb(uint64_t time, uint64_t freq)
 {
-    return muldiv64(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
-                    freq, get_ticks_per_sec());
+    return muldiv64(time, freq, get_ticks_per_sec());
 }
 
-static unsigned int get_counter(CUDATimer *s)
+static unsigned int get_counter(CUDATimer *ti)
 {
     int64_t d;
     unsigned int counter;
     uint64_t tb_diff;
+    uint64_t current_time = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL);
 
     /* Reverse of the tb calculation algorithm that Mac OS X uses on bootup. */
-    tb_diff = get_tb(s->frequency) - s->load_time;
-    d = (tb_diff * 0xBF401675E5DULL) / (s->frequency << 24);
+    tb_diff = get_tb(current_time, ti->frequency) - ti->load_time;
+    d = (tb_diff * 0xBF401675E5DULL) / (ti->frequency << 24);
 
-    if (s->index == 0) {
+    if (ti->index == 0) {
         /* the timer goes down from latch to -1 (period of latch + 2) */
-        if (d <= (s->counter_value + 1)) {
-            counter = (s->counter_value - d) & 0xffff;
+        if (d <= (ti->counter_value + 1)) {
+            counter = (ti->counter_value - d) & 0xffff;
         } else {
-            counter = (d - (s->counter_value + 1)) % (s->latch + 2);
-            counter = (s->latch - counter) & 0xffff;
+            counter = (d - (ti->counter_value + 1)) % (ti->latch + 2);
+            counter = (ti->latch - counter) & 0xffff;
         }
     } else {
-        counter = (s->counter_value - d) & 0xffff;
+        counter = (ti->counter_value - d) & 0xffff;
     }
     return counter;
 }
 
 static void set_counter(CUDAState *s, CUDATimer *ti, unsigned int val)
 {
-    CUDA_DPRINTF("T%d.counter=%d\n", 1 + (ti->timer == NULL), val);
-    ti->load_time = get_tb(s->frequency);
+    CUDA_DPRINTF("T%d.counter=%d\n", 1 + ti->index, val);
+    ti->load_time = get_tb(qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL),
+                           s->frequency);
     ti->counter_value = val;
     cuda_timer_update(s, ti, ti->load_time);
 }
@@ -199,7 +220,7 @@ static void cuda_timer_update(CUDAState *s, CUDATimer *ti,
 {
     if (!ti->timer)
         return;
-    if ((s->acr & T1MODE) != T1MODE_CONT) {
+    if (ti->index == 0 && (s->acr & T1MODE) != T1MODE_CONT) {
         timer_del(ti->timer);
     } else {
         ti->next_irq_time = get_next_irq_time(ti, current_time);
@@ -217,6 +238,41 @@ static void cuda_timer1(void *opaque)
     cuda_update_irq(s);
 }
 
+static void cuda_timer2(void *opaque)
+{
+    CUDAState *s = opaque;
+    CUDATimer *ti = &s->timers[1];
+
+    cuda_timer_update(s, ti, ti->next_irq_time);
+    s->ifr |= T2_INT;
+    cuda_update_irq(s);
+}
+
+static void cuda_set_sr_int(void *opaque)
+{
+    CUDAState *s = opaque;
+
+    CUDA_DPRINTF("CUDA: %s:%d\n", __func__, __LINE__);
+    s->ifr |= SR_INT;
+    cuda_update_irq(s);
+}
+
+static void cuda_delay_set_sr_int(CUDAState *s)
+{
+    int64_t expire;
+
+    if (s->dirb == 0xff) {
+        /* Not in Mac OS, fire the IRQ directly */
+        cuda_set_sr_int(s);
+        return;
+    }
+
+    CUDA_DPRINTF("CUDA: %s:%d\n", __func__, __LINE__);
+
+    expire = qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + 300 * SCALE_US;
+    timer_mod(s->sr_delay_timer, expire);
+}
+
 static uint32_t cuda_readb(void *opaque, hwaddr addr)
 {
     CUDAState *s = opaque;
@@ -224,66 +280,68 @@ static uint32_t cuda_readb(void *opaque, hwaddr addr)
 
     addr = (addr >> 9) & 0xf;
     switch(addr) {
-    case 0:
+    case CUDA_REG_B:
         val = s->b;
         break;
-    case 1:
+    case CUDA_REG_A:
         val = s->a;
         break;
-    case 2:
+    case CUDA_REG_DIRB:
         val = s->dirb;
         break;
-    case 3:
+    case CUDA_REG_DIRA:
         val = s->dira;
         break;
-    case 4:
+    case CUDA_REG_T1CL:
         val = get_counter(&s->timers[0]) & 0xff;
         s->ifr &= ~T1_INT;
         cuda_update_irq(s);
         break;
-    case 5:
+    case CUDA_REG_T1CH:
         val = get_counter(&s->timers[0]) >> 8;
         cuda_update_irq(s);
         break;
-    case 6:
+    case CUDA_REG_T1LL:
         val = s->timers[0].latch & 0xff;
         break;
-    case 7:
+    case CUDA_REG_T1LH:
         /* XXX: check this */
         val = (s->timers[0].latch >> 8) & 0xff;
         break;
-    case 8:
+    case CUDA_REG_T2CL:
         val = get_counter(&s->timers[1]) & 0xff;
         s->ifr &= ~T2_INT;
+        cuda_update_irq(s);
         break;
-    case 9:
+    case CUDA_REG_T2CH:
         val = get_counter(&s->timers[1]) >> 8;
         break;
-    case 10:
+    case CUDA_REG_SR:
         val = s->sr;
-        s->ifr &= ~SR_INT;
+        s->ifr &= ~(SR_INT | SR_CLOCK_INT | SR_DATA_INT);
         cuda_update_irq(s);
         break;
-    case 11:
+    case CUDA_REG_ACR:
         val = s->acr;
         break;
-    case 12:
+    case CUDA_REG_PCR:
         val = s->pcr;
         break;
-    case 13:
+    case CUDA_REG_IFR:
         val = s->ifr;
-        if (s->ifr & s->ier)
+        if (s->ifr & s->ier) {
             val |= 0x80;
+        }
         break;
-    case 14:
+    case CUDA_REG_IER:
         val = s->ier | 0x80;
         break;
     default:
-    case 15:
+    case CUDA_REG_ANH:
         val = s->anh;
         break;
     }
-    if (addr != 13 || val != 0) {
+    if (addr != CUDA_REG_IFR || val != 0) {
         CUDA_DPRINTF("read: reg=0x%x val=%02x\n", (int)addr, val);
     }
 
@@ -298,61 +356,65 @@ static void cuda_writeb(void *opaque, hwaddr addr, uint32_t val)
     CUDA_DPRINTF("write: reg=0x%x val=%02x\n", (int)addr, val);
 
     switch(addr) {
-    case 0:
+    case CUDA_REG_B:
         s->b = val;
         cuda_update(s);
         break;
-    case 1:
+    case CUDA_REG_A:
         s->a = val;
         break;
-    case 2:
+    case CUDA_REG_DIRB:
         s->dirb = val;
         break;
-    case 3:
+    case CUDA_REG_DIRA:
         s->dira = val;
         break;
-    case 4:
+    case CUDA_REG_T1CL:
         s->timers[0].latch = (s->timers[0].latch & 0xff00) | val;
         cuda_timer_update(s, &s->timers[0], qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
         break;
-    case 5:
+    case CUDA_REG_T1CH:
         s->timers[0].latch = (s->timers[0].latch & 0xff) | (val << 8);
         s->ifr &= ~T1_INT;
         set_counter(s, &s->timers[0], s->timers[0].latch);
         break;
-    case 6:
+    case CUDA_REG_T1LL:
         s->timers[0].latch = (s->timers[0].latch & 0xff00) | val;
         cuda_timer_update(s, &s->timers[0], qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
         break;
-    case 7:
+    case CUDA_REG_T1LH:
         s->timers[0].latch = (s->timers[0].latch & 0xff) | (val << 8);
         s->ifr &= ~T1_INT;
         cuda_timer_update(s, &s->timers[0], qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
         break;
-    case 8:
-        s->timers[1].latch = val;
-        set_counter(s, &s->timers[1], val);
+    case CUDA_REG_T2CL:
+        s->timers[1].latch = (s->timers[1].latch & 0xff00) | val;
         break;
-    case 9:
-        set_counter(s, &s->timers[1], (val << 8) | s->timers[1].latch);
+    case CUDA_REG_T2CH:
+        /* To ensure T2 generates an interrupt on zero crossing with the
+           common timer code, write the value directly from the latch to
+           the counter */
+        s->timers[1].latch = (s->timers[1].latch & 0xff) | (val << 8);
+        s->ifr &= ~T2_INT;
+        set_counter(s, &s->timers[1], s->timers[1].latch);
         break;
-    case 10:
+    case CUDA_REG_SR:
         s->sr = val;
         break;
-    case 11:
+    case CUDA_REG_ACR:
         s->acr = val;
         cuda_timer_update(s, &s->timers[0], qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL));
         cuda_update(s);
         break;
-    case 12:
+    case CUDA_REG_PCR:
         s->pcr = val;
         break;
-    case 13:
+    case CUDA_REG_IFR:
         /* reset bits */
         s->ifr &= ~val;
         cuda_update_irq(s);
         break;
-    case 14:
+    case CUDA_REG_IER:
         if (val & IER_SET) {
             /* set bits */
             s->ier |= val & 0x7f;
@@ -363,7 +425,7 @@ static void cuda_writeb(void *opaque, hwaddr addr, uint32_t val)
         cuda_update_irq(s);
         break;
     default:
-    case 15:
+    case CUDA_REG_ANH:
         s->anh = val;
         break;
     }
@@ -384,8 +446,7 @@ static void cuda_update(CUDAState *s)
                 if (s->data_out_index < sizeof(s->data_out)) {
                     CUDA_DPRINTF("send: %02x\n", s->sr);
                     s->data_out[s->data_out_index++] = s->sr;
-                    s->ifr |= SR_INT;
-                    cuda_update_irq(s);
+                    cuda_delay_set_sr_int(s);
                 }
             }
         } else {
@@ -398,8 +459,7 @@ static void cuda_update(CUDAState *s)
                     if (s->data_in_index >= s->data_in_size) {
                         s->b = (s->b | TREQ);
                     }
-                    s->ifr |= SR_INT;
-                    cuda_update_irq(s);
+                    cuda_delay_set_sr_int(s);
                 }
             }
         }
@@ -411,15 +471,13 @@ static void cuda_update(CUDAState *s)
                 s->b = (s->b | TREQ);
             else
                 s->b = (s->b & ~TREQ);
-            s->ifr |= SR_INT;
-            cuda_update_irq(s);
+            cuda_delay_set_sr_int(s);
         } else {
             if (!(s->last_b & TIP)) {
                 /* handle end of host to cuda transfer */
                 packet_received = (s->data_out_index > 0);
                 /* always an IRQ at the end of transfer */
-                s->ifr |= SR_INT;
-                cuda_update_irq(s);
+                cuda_delay_set_sr_int(s);
             }
             /* signal if there is data to read */
             if (s->data_in_index < s->data_in_size) {
@@ -456,8 +514,7 @@ static void cuda_send_packet_to_host(CUDAState *s,
     s->data_in_size = len;
     s->data_in_index = 0;
     cuda_update(s);
-    s->ifr |= SR_INT;
-    cuda_update_irq(s);
+    cuda_delay_set_sr_int(s);
 }
 
 static void cuda_adb_poll(void *opaque)
@@ -480,7 +537,7 @@ static void cuda_adb_poll(void *opaque)
 static void cuda_receive_packet(CUDAState *s,
                                 const uint8_t *data, int len)
 {
-    uint8_t obuf[16];
+    uint8_t obuf[16] = { CUDA_PACKET, 0, data[0] };
     int autopoll;
     uint32_t ti;
 
@@ -497,23 +554,18 @@ static void cuda_receive_packet(CUDAState *s,
                 timer_del(s->adb_poll_timer);
             }
         }
-        obuf[0] = CUDA_PACKET;
-        obuf[1] = data[1];
-        cuda_send_packet_to_host(s, obuf, 2);
+        cuda_send_packet_to_host(s, obuf, 3);
+        break;
+    case CUDA_GET_6805_ADDR:
+        cuda_send_packet_to_host(s, obuf, 3);
         break;
     case CUDA_SET_TIME:
         ti = (((uint32_t)data[1]) << 24) + (((uint32_t)data[2]) << 16) + (((uint32_t)data[3]) << 8) + data[4];
         s->tick_offset = ti - (qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / get_ticks_per_sec());
-        obuf[0] = CUDA_PACKET;
-        obuf[1] = 0;
-        obuf[2] = 0;
         cuda_send_packet_to_host(s, obuf, 3);
         break;
     case CUDA_GET_TIME:
         ti = s->tick_offset + (qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) / get_ticks_per_sec());
-        obuf[0] = CUDA_PACKET;
-        obuf[1] = 0;
-        obuf[2] = 0;
         obuf[3] = ti >> 24;
         obuf[4] = ti >> 16;
         obuf[5] = ti >> 8;
@@ -524,22 +576,34 @@ static void cuda_receive_packet(CUDAState *s,
     case CUDA_SET_DEVICE_LIST:
     case CUDA_SET_AUTO_RATE:
     case CUDA_SET_POWER_MESSAGES:
-        obuf[0] = CUDA_PACKET;
-        obuf[1] = 0;
-        cuda_send_packet_to_host(s, obuf, 2);
+        cuda_send_packet_to_host(s, obuf, 3);
         break;
     case CUDA_POWERDOWN:
-        obuf[0] = CUDA_PACKET;
-        obuf[1] = 0;
-        cuda_send_packet_to_host(s, obuf, 2);
+        cuda_send_packet_to_host(s, obuf, 3);
         qemu_system_shutdown_request();
         break;
     case CUDA_RESET_SYSTEM:
-        obuf[0] = CUDA_PACKET;
-        obuf[1] = 0;
-        cuda_send_packet_to_host(s, obuf, 2);
+        cuda_send_packet_to_host(s, obuf, 3);
         qemu_system_reset_request();
         break;
+    case CUDA_COMBINED_FORMAT_IIC:
+        obuf[0] = ERROR_PACKET;
+        obuf[1] = 0x5;
+        obuf[2] = CUDA_PACKET;
+        obuf[3] = data[0];
+        cuda_send_packet_to_host(s, obuf, 4);
+        break;
+    case CUDA_GET_SET_IIC:
+        if (len == 4) {
+            cuda_send_packet_to_host(s, obuf, 3);
+        } else {
+            obuf[0] = ERROR_PACKET;
+            obuf[1] = 0x2;
+            obuf[2] = CUDA_PACKET;
+            obuf[3] = data[0];
+            cuda_send_packet_to_host(s, obuf, 4);
+        }
+        break;
     default:
         break;
     }
@@ -560,19 +624,21 @@ static void cuda_receive_packet_from_host(CUDAState *s,
     switch(data[0]) {
     case ADB_PACKET:
         {
-            uint8_t obuf[ADB_MAX_OUT_LEN + 2];
+            uint8_t obuf[ADB_MAX_OUT_LEN + 3];
             int olen;
             olen = adb_request(&s->adb_bus, obuf + 2, data + 1, len - 1);
             if (olen > 0) {
                 obuf[0] = ADB_PACKET;
                 obuf[1] = 0x00;
+                cuda_send_packet_to_host(s, obuf, olen + 2);
             } else {
                 /* error */
                 obuf[0] = ADB_PACKET;
                 obuf[1] = -olen;
+                obuf[2] = data[1];
                 olen = 0;
+                cuda_send_packet_to_host(s, obuf, olen + 3);
             }
-            cuda_send_packet_to_host(s, obuf, olen + 2);
         }
         break;
     case CUDA_PACKET:
@@ -671,7 +737,7 @@ static void cuda_reset(DeviceState *dev)
 
     s->b = 0;
     s->a = 0;
-    s->dirb = 0;
+    s->dirb = 0xff;
     s->dira = 0;
     s->sr = 0;
     s->acr = 0;
@@ -688,8 +754,9 @@ static void cuda_reset(DeviceState *dev)
     s->timers[0].latch = 0xffff;
     set_counter(s, &s->timers[0], 0xffff);
 
-    s->timers[1].latch = 0;
-    set_counter(s, &s->timers[1], 0xffff);
+    s->timers[1].latch = 0xffff;
+
+    s->sr_delay_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, cuda_set_sr_int, s);
 }
 
 static void cuda_realizefn(DeviceState *dev, Error **errp)
@@ -699,7 +766,8 @@ static void cuda_realizefn(DeviceState *dev, Error **errp)
 
     s->timers[0].timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, cuda_timer1, s);
     s->timers[0].frequency = s->frequency;
-    s->timers[1].frequency = s->frequency;
+    s->timers[1].timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, cuda_timer2, s);
+    s->timers[1].frequency = (SCALE_US * 6000) / 4700;
 
     qemu_get_timedate(&tm, 0);
     s->tick_offset = (uint32_t)mktimegm(&tm) + RTC_OFFSET;
diff --git a/hw/misc/zynq-xadc.c b/hw/misc/zynq-xadc.c
new file mode 100644
index 0000000..1a32595
--- /dev/null
+++ b/hw/misc/zynq-xadc.c
@@ -0,0 +1,302 @@
+/*
+ * ADC registers for Xilinx Zynq Platform
+ *
+ * Copyright (c) 2015 Guenter Roeck
+ * Based on hw/misc/zynq_slcr.c, written by Michal Simek
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "hw/hw.h"
+#include "hw/misc/zynq-xadc.h"
+#include "qemu/timer.h"
+#include "sysemu/sysemu.h"
+
+enum {
+    CFG                = 0x000 / 4,
+    INT_STS,
+    INT_MASK,
+    MSTS,
+    CMDFIFO,
+    RDFIFO,
+    MCTL,
+};
+
+#define CFG_ENABLE              BIT(31)
+#define CFG_CFIFOTH_SHIFT       20
+#define CFG_CFIFOTH_LENGTH      4
+#define CFG_DFIFOTH_SHIFT       16
+#define CFG_DFIFOTH_LENGTH      4
+#define CFG_WEDGE               BIT(13)
+#define CFG_REDGE               BIT(12)
+#define CFG_TCKRATE_SHIFT       8
+#define CFG_TCKRATE_LENGTH      2
+
+#define CFG_TCKRATE_DIV(x)      (0x1 << (x - 1))
+
+#define CFG_IGAP_SHIFT          0
+#define CFG_IGAP_LENGTH         5
+
+#define INT_CFIFO_LTH           BIT(9)
+#define INT_DFIFO_GTH           BIT(8)
+#define INT_OT                  BIT(7)
+#define INT_ALM_SHIFT           0
+#define INT_ALM_LENGTH          7
+#define INT_ALM_MASK            (((1 << INT_ALM_LENGTH) - 1) << INT_ALM_SHIFT)
+
+#define INT_ALL (INT_CFIFO_LTH | INT_DFIFO_GTH | INT_OT | INT_ALM_MASK)
+
+#define MSTS_CFIFO_LVL_SHIFT    16
+#define MSTS_CFIFO_LVL_LENGTH   4
+#define MSTS_DFIFO_LVL_SHIFT    12
+#define MSTS_DFIFO_LVL_LENGTH   4
+#define MSTS_CFIFOF             BIT(11)
+#define MSTS_CFIFOE             BIT(10)
+#define MSTS_DFIFOF             BIT(9)
+#define MSTS_DFIFOE             BIT(8)
+#define MSTS_OT                 BIT(7)
+#define MSTS_ALM_SHIFT          0
+#define MSTS_ALM_LENGTH         7
+
+#define MCTL_RESET              BIT(4)
+
+#define CMD_NOP                 0x00
+#define CMD_READ                0x01
+#define CMD_WRITE               0x02
+
+static void zynq_xadc_update_ints(ZynqXADCState *s)
+{
+
+    /* We are fast, commands are actioned instantly so the CFIFO is always
+     * empty (and below threshold).
+     */
+    s->regs[INT_STS] |= INT_CFIFO_LTH;
+
+    if (s->xadc_dfifo_entries >
+        extract32(s->regs[CFG], CFG_DFIFOTH_SHIFT, CFG_DFIFOTH_LENGTH)) {
+        s->regs[INT_STS] |= INT_DFIFO_GTH;
+    }
+
+    qemu_set_irq(s->qemu_irq, !!(s->regs[INT_STS] & ~s->regs[INT_MASK]));
+}
+
+static void zynq_xadc_reset(DeviceState *d)
+{
+    ZynqXADCState *s = ZYNQ_XADC(d);
+
+    s->regs[CFG] = 0x14 << CFG_IGAP_SHIFT |
+                   CFG_TCKRATE_DIV(4) << CFG_TCKRATE_SHIFT | CFG_REDGE;
+    s->regs[INT_STS] = INT_CFIFO_LTH;
+    s->regs[INT_MASK] = 0xffffffff;
+    s->regs[CMDFIFO] = 0;
+    s->regs[RDFIFO] = 0;
+    s->regs[MCTL] = MCTL_RESET;
+
+    memset(s->xadc_regs, 0, sizeof(s->xadc_regs));
+    memset(s->xadc_dfifo, 0, sizeof(s->xadc_dfifo));
+    s->xadc_dfifo_entries = 0;
+
+    zynq_xadc_update_ints(s);
+}
+
+static uint16_t xadc_pop_dfifo(ZynqXADCState *s)
+{
+    uint16_t rv = s->xadc_dfifo[0];
+    int i;
+
+    if (s->xadc_dfifo_entries > 0) {
+        s->xadc_dfifo_entries--;
+    }
+    for (i = 0; i < s->xadc_dfifo_entries; i++) {
+        s->xadc_dfifo[i] = s->xadc_dfifo[i + 1];
+    }
+    s->xadc_dfifo[s->xadc_dfifo_entries] = 0;
+    zynq_xadc_update_ints(s);
+    return rv;
+}
+
+static void xadc_push_dfifo(ZynqXADCState *s, uint16_t regval)
+{
+    if (s->xadc_dfifo_entries < ZYNQ_XADC_FIFO_DEPTH) {
+        s->xadc_dfifo[s->xadc_dfifo_entries++] = s->xadc_read_reg_previous;
+    }
+    s->xadc_read_reg_previous = regval;
+    zynq_xadc_update_ints(s);
+}
+
+static bool zynq_xadc_check_offset(hwaddr offset, bool rnw)
+{
+    switch (offset) {
+    case CFG:
+    case INT_MASK:
+    case INT_STS:
+    case MCTL:
+        return true;
+    case RDFIFO:
+    case MSTS:
+        return rnw;     /* read only */
+    case CMDFIFO:
+        return !rnw;    /* write only */
+    default:
+        return false;
+    }
+}
+
+static uint64_t zynq_xadc_read(void *opaque, hwaddr offset, unsigned size)
+{
+    ZynqXADCState *s = opaque;
+    int reg = offset / 4;
+    uint32_t rv = 0;
+
+    if (!zynq_xadc_check_offset(reg, true)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "zynq_xadc: Invalid read access to "
+                      "addr %" HWADDR_PRIx "\n", offset);
+        return 0;
+    }
+
+    switch (reg) {
+    case CFG:
+    case INT_MASK:
+    case INT_STS:
+    case MCTL:
+        rv = s->regs[reg];
+        break;
+    case MSTS:
+        rv = MSTS_CFIFOE;
+        rv |= s->xadc_dfifo_entries << MSTS_DFIFO_LVL_SHIFT;
+        if (!s->xadc_dfifo_entries) {
+            rv |= MSTS_DFIFOE;
+        } else if (s->xadc_dfifo_entries == ZYNQ_XADC_FIFO_DEPTH) {
+            rv |= MSTS_DFIFOF;
+        }
+        break;
+    case RDFIFO:
+        rv = xadc_pop_dfifo(s);
+        break;
+    }
+    return rv;
+}
+
+static void zynq_xadc_write(void *opaque, hwaddr offset, uint64_t val,
+                            unsigned size)
+{
+    ZynqXADCState *s = (ZynqXADCState *)opaque;
+    int reg = offset / 4;
+    int xadc_reg;
+    int xadc_cmd;
+    int xadc_data;
+
+    if (!zynq_xadc_check_offset(reg, false)) {
+        qemu_log_mask(LOG_GUEST_ERROR, "zynq_xadc: Invalid write access "
+                      "to addr %" HWADDR_PRIx "\n", offset);
+        return;
+    }
+
+    switch (reg) {
+    case CFG:
+        s->regs[CFG] = val;
+        break;
+    case INT_STS:
+        s->regs[INT_STS] &= ~val;
+        break;
+    case INT_MASK:
+        s->regs[INT_MASK] = val & INT_ALL;
+        break;
+    case CMDFIFO:
+        xadc_cmd  = extract32(val, 26,  4);
+        xadc_reg  = extract32(val, 16, 10);
+        xadc_data = extract32(val,  0, 16);
+
+        if (s->regs[MCTL] & MCTL_RESET) {
+            qemu_log_mask(LOG_GUEST_ERROR, "zynq_xadc: Sending command "
+                          "while comm channel held in reset: %" PRIx32 "\n",
+                          (uint32_t) val);
+            break;
+        }
+
+        if (xadc_reg > ZYNQ_XADC_NUM_ADC_REGS && xadc_cmd != CMD_NOP) {
+            qemu_log_mask(LOG_GUEST_ERROR, "read/write op to invalid xadc "
+                          "reg 0x%x\n", xadc_reg);
+            break;
+        }
+
+        switch (xadc_cmd) {
+        case CMD_READ:
+            xadc_push_dfifo(s, s->xadc_regs[xadc_reg]);
+            break;
+        case CMD_WRITE:
+            s->xadc_regs[xadc_reg] = xadc_data;
+            /* fallthrough */
+        case CMD_NOP:
+            xadc_push_dfifo(s, 0);
+            break;
+        }
+        break;
+    case MCTL:
+        s->regs[MCTL] = val & 0x00fffeff;
+        break;
+    }
+    zynq_xadc_update_ints(s);
+}
+
+static const MemoryRegionOps xadc_ops = {
+    .read = zynq_xadc_read,
+    .write = zynq_xadc_write,
+    .endianness = DEVICE_NATIVE_ENDIAN,
+};
+
+static void zynq_xadc_init(Object *obj)
+{
+    SysBusDevice *sbd = SYS_BUS_DEVICE(obj);
+    ZynqXADCState *s = ZYNQ_XADC(obj);
+
+    memory_region_init_io(&s->iomem, obj, &xadc_ops, s, "zynq-xadc",
+                          ZYNQ_XADC_MMIO_SIZE);
+    sysbus_init_mmio(sbd, &s->iomem);
+    sysbus_init_irq(sbd, &s->qemu_irq);
+}
+
+static const VMStateDescription vmstate_zynq_xadc = {
+    .name = "zynq-xadc",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32_ARRAY(regs, ZynqXADCState, ZYNQ_XADC_NUM_IO_REGS),
+        VMSTATE_UINT16_ARRAY(xadc_regs, ZynqXADCState,
+                             ZYNQ_XADC_NUM_ADC_REGS),
+        VMSTATE_UINT16_ARRAY(xadc_dfifo, ZynqXADCState,
+                             ZYNQ_XADC_FIFO_DEPTH),
+        VMSTATE_UINT16(xadc_read_reg_previous, ZynqXADCState),
+        VMSTATE_UINT16(xadc_dfifo_entries, ZynqXADCState),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static void zynq_xadc_class_init(ObjectClass *klass, void *data)
+{
+    DeviceClass *dc = DEVICE_CLASS(klass);
+
+    dc->vmsd = &vmstate_zynq_xadc;
+    dc->reset = zynq_xadc_reset;
+}
+
+static const TypeInfo zynq_xadc_info = {
+    .class_init = zynq_xadc_class_init,
+    .name  = TYPE_ZYNQ_XADC,
+    .parent = TYPE_SYS_BUS_DEVICE,
+    .instance_size  = sizeof(ZynqXADCState),
+    .instance_init = zynq_xadc_init,
+};
+
+static void zynq_xadc_register_types(void)
+{
+    type_register_static(&zynq_xadc_info);
+}
+
+type_init(zynq_xadc_register_types)
diff --git a/hw/net/e1000.c b/hw/net/e1000.c
index 910de3a..c877e06 100644
--- a/hw/net/e1000.c
+++ b/hw/net/e1000.c
@@ -37,24 +37,26 @@
 
 #include "e1000_regs.h"
 
+static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
+
 #define E1000_DEBUG
 
 #ifdef E1000_DEBUG
 enum {
-    DEBUG_GENERAL,	DEBUG_IO,	DEBUG_MMIO,	DEBUG_INTERRUPT,
-    DEBUG_RX,		DEBUG_TX,	DEBUG_MDIC,	DEBUG_EEPROM,
-    DEBUG_UNKNOWN,	DEBUG_TXSUM,	DEBUG_TXERR,	DEBUG_RXERR,
+    DEBUG_GENERAL,      DEBUG_IO,       DEBUG_MMIO,     DEBUG_INTERRUPT,
+    DEBUG_RX,           DEBUG_TX,       DEBUG_MDIC,     DEBUG_EEPROM,
+    DEBUG_UNKNOWN,      DEBUG_TXSUM,    DEBUG_TXERR,    DEBUG_RXERR,
     DEBUG_RXFILTER,     DEBUG_PHY,      DEBUG_NOTYET,
 };
-#define DBGBIT(x)	(1<<DEBUG_##x)
+#define DBGBIT(x)    (1<<DEBUG_##x)
 static int debugflags = DBGBIT(TXERR) | DBGBIT(GENERAL);
 
-#define	DBGOUT(what, fmt, ...) do { \
+#define DBGOUT(what, fmt, ...) do { \
     if (debugflags & DBGBIT(what)) \
         fprintf(stderr, "e1000: " fmt, ## __VA_ARGS__); \
     } while (0)
 #else
-#define	DBGOUT(what, fmt, ...) do {} while (0)
+#define DBGOUT(what, fmt, ...) do {} while (0)
 #endif
 
 #define IOPORT_SIZE       0x40
@@ -118,7 +120,7 @@ typedef struct E1000State_st {
     } tx;
 
     struct {
-        uint32_t val_in;	// shifted in from guest driver
+        uint32_t val_in;    /* shifted in from guest driver */
         uint16_t bitnum_in;
         uint16_t bitnum_out;
         uint16_t reading;
@@ -135,11 +137,15 @@ typedef struct E1000State_st {
 /* Compatibility flags for migration to/from qemu 1.3.0 and older */
 #define E1000_FLAG_AUTONEG_BIT 0
 #define E1000_FLAG_MIT_BIT 1
+#define E1000_FLAG_MAC_BIT 2
 #define E1000_FLAG_AUTONEG (1 << E1000_FLAG_AUTONEG_BIT)
 #define E1000_FLAG_MIT (1 << E1000_FLAG_MIT_BIT)
+#define E1000_FLAG_MAC (1 << E1000_FLAG_MAC_BIT)
     uint32_t compat_flags;
 } E1000State;
 
+#define chkflag(x)     (s->compat_flags & E1000_FLAG_##x)
+
 typedef struct E1000BaseClass {
     PCIDeviceClass parent_class;
     uint16_t phy_id2;
@@ -155,20 +161,36 @@ typedef struct E1000BaseClass {
 #define E1000_DEVICE_GET_CLASS(obj) \
     OBJECT_GET_CLASS(E1000BaseClass, (obj), TYPE_E1000_BASE)
 
-#define	defreg(x)	x = (E1000_##x>>2)
+#define defreg(x)    x = (E1000_##x>>2)
 enum {
-    defreg(CTRL),	defreg(EECD),	defreg(EERD),	defreg(GPRC),
-    defreg(GPTC),	defreg(ICR),	defreg(ICS),	defreg(IMC),
-    defreg(IMS),	defreg(LEDCTL),	defreg(MANC),	defreg(MDIC),
-    defreg(MPC),	defreg(PBA),	defreg(RCTL),	defreg(RDBAH),
-    defreg(RDBAL),	defreg(RDH),	defreg(RDLEN),	defreg(RDT),
-    defreg(STATUS),	defreg(SWSM),	defreg(TCTL),	defreg(TDBAH),
-    defreg(TDBAL),	defreg(TDH),	defreg(TDLEN),	defreg(TDT),
-    defreg(TORH),	defreg(TORL),	defreg(TOTH),	defreg(TOTL),
-    defreg(TPR),	defreg(TPT),	defreg(TXDCTL),	defreg(WUFC),
-    defreg(RA),		defreg(MTA),	defreg(CRCERRS),defreg(VFTA),
-    defreg(VET),        defreg(RDTR),   defreg(RADV),   defreg(TADV),
-    defreg(ITR),
+    defreg(CTRL),    defreg(EECD),    defreg(EERD),    defreg(GPRC),
+    defreg(GPTC),    defreg(ICR),     defreg(ICS),     defreg(IMC),
+    defreg(IMS),     defreg(LEDCTL),  defreg(MANC),    defreg(MDIC),
+    defreg(MPC),     defreg(PBA),     defreg(RCTL),    defreg(RDBAH),
+    defreg(RDBAL),   defreg(RDH),     defreg(RDLEN),   defreg(RDT),
+    defreg(STATUS),  defreg(SWSM),    defreg(TCTL),    defreg(TDBAH),
+    defreg(TDBAL),   defreg(TDH),     defreg(TDLEN),   defreg(TDT),
+    defreg(TORH),    defreg(TORL),    defreg(TOTH),    defreg(TOTL),
+    defreg(TPR),     defreg(TPT),     defreg(TXDCTL),  defreg(WUFC),
+    defreg(RA),      defreg(MTA),     defreg(CRCERRS), defreg(VFTA),
+    defreg(VET),     defreg(RDTR),    defreg(RADV),    defreg(TADV),
+    defreg(ITR),     defreg(FCRUC),   defreg(TDFH),    defreg(TDFT),
+    defreg(TDFHS),   defreg(TDFTS),   defreg(TDFPC),   defreg(RDFH),
+    defreg(RDFT),    defreg(RDFHS),   defreg(RDFTS),   defreg(RDFPC),
+    defreg(IPAV),    defreg(WUC),     defreg(WUS),     defreg(AIT),
+    defreg(IP6AT),   defreg(IP4AT),   defreg(FFLT),    defreg(FFMT),
+    defreg(FFVT),    defreg(WUPM),    defreg(PBM),     defreg(SCC),
+    defreg(ECOL),    defreg(MCC),     defreg(LATECOL), defreg(COLC),
+    defreg(DC),      defreg(TNCRS),   defreg(SEC),     defreg(CEXTERR),
+    defreg(RLEC),    defreg(XONRXC),  defreg(XONTXC),  defreg(XOFFRXC),
+    defreg(XOFFTXC), defreg(RFC),     defreg(RJC),     defreg(RNBC),
+    defreg(TSCTFC),  defreg(MGTPRC),  defreg(MGTPDC),  defreg(MGTPTC),
+    defreg(RUC),     defreg(ROC),     defreg(GORCL),   defreg(GORCH),
+    defreg(GOTCL),   defreg(GOTCH),   defreg(BPRC),    defreg(MPRC),
+    defreg(TSCTC),   defreg(PRC64),   defreg(PRC127),  defreg(PRC255),
+    defreg(PRC511),  defreg(PRC1023), defreg(PRC1522), defreg(PTC64),
+    defreg(PTC127),  defreg(PTC255),  defreg(PTC511),  defreg(PTC1023),
+    defreg(PTC1522), defreg(MPTC),    defreg(BPTC)
 };
 
 static void
@@ -193,8 +215,7 @@ e1000_link_up(E1000State *s)
 static bool
 have_autoneg(E1000State *s)
 {
-    return (s->compat_flags & E1000_FLAG_AUTONEG) &&
-           (s->phy_reg[PHY_CTRL] & MII_CR_AUTO_NEG_EN);
+    return chkflag(AUTONEG) && (s->phy_reg[PHY_CTRL] & MII_CR_AUTO_NEG_EN);
 }
 
 static void
@@ -226,18 +247,18 @@ enum { NPHYWRITEOPS = ARRAY_SIZE(phyreg_writeops) };
 
 enum { PHY_R = 1, PHY_W = 2, PHY_RW = PHY_R | PHY_W };
 static const char phy_regcap[0x20] = {
-    [PHY_STATUS] = PHY_R,	[M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
-    [PHY_ID1] = PHY_R,		[M88E1000_PHY_SPEC_CTRL] = PHY_RW,
-    [PHY_CTRL] = PHY_RW,	[PHY_1000T_CTRL] = PHY_RW,
-    [PHY_LP_ABILITY] = PHY_R,	[PHY_1000T_STATUS] = PHY_R,
-    [PHY_AUTONEG_ADV] = PHY_RW,	[M88E1000_RX_ERR_CNTR] = PHY_R,
-    [PHY_ID2] = PHY_R,		[M88E1000_PHY_SPEC_STATUS] = PHY_R,
+    [PHY_STATUS]      = PHY_R,     [M88E1000_EXT_PHY_SPEC_CTRL] = PHY_RW,
+    [PHY_ID1]         = PHY_R,     [M88E1000_PHY_SPEC_CTRL]     = PHY_RW,
+    [PHY_CTRL]        = PHY_RW,    [PHY_1000T_CTRL]             = PHY_RW,
+    [PHY_LP_ABILITY]  = PHY_R,     [PHY_1000T_STATUS]           = PHY_R,
+    [PHY_AUTONEG_ADV] = PHY_RW,    [M88E1000_RX_ERR_CNTR]       = PHY_R,
+    [PHY_ID2]         = PHY_R,     [M88E1000_PHY_SPEC_STATUS]   = PHY_R,
     [PHY_AUTONEG_EXP] = PHY_R,
 };
 
 /* PHY_ID2 documented in 8254x_GBe_SDM.pdf, pp. 250 */
 static const uint16_t phy_reg_init[] = {
-    [PHY_CTRL] =   MII_CR_SPEED_SELECT_MSB |
+    [PHY_CTRL]   = MII_CR_SPEED_SELECT_MSB |
                    MII_CR_FULL_DUPLEX |
                    MII_CR_AUTO_NEG_EN,
 
@@ -264,15 +285,15 @@ static const uint16_t phy_reg_init[] = {
 };
 
 static const uint32_t mac_reg_init[] = {
-    [PBA] =     0x00100030,
-    [LEDCTL] =  0x602,
-    [CTRL] =    E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 |
+    [PBA]     = 0x00100030,
+    [LEDCTL]  = 0x602,
+    [CTRL]    = E1000_CTRL_SWDPIN2 | E1000_CTRL_SWDPIN0 |
                 E1000_CTRL_SPD_1000 | E1000_CTRL_SLU,
-    [STATUS] =  0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE |
+    [STATUS]  = 0x80000000 | E1000_STATUS_GIO_MASTER_ENABLE |
                 E1000_STATUS_ASDV | E1000_STATUS_MTXCKOK |
                 E1000_STATUS_SPEED_1000 | E1000_STATUS_FD |
                 E1000_STATUS_LU,
-    [MANC] =    E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN |
+    [MANC]    = E1000_MANC_EN_MNG2HOST | E1000_MANC_RCV_TCO_EN |
                 E1000_MANC_ARP_EN | E1000_MANC_0298_EN |
                 E1000_MANC_RMCP_EN,
 };
@@ -319,7 +340,7 @@ set_interrupt_cause(E1000State *s, int index, uint32_t val)
         if (s->mit_timer_on) {
             return;
         }
-        if (s->compat_flags & E1000_FLAG_MIT) {
+        if (chkflag(MIT)) {
             /* Compute the next mitigation delay according to pending
              * interrupts and the current values of RADV (provided
              * RDTR!=0), TADV and ITR.
@@ -510,17 +531,19 @@ set_eecd(E1000State *s, int index, uint32_t val)
 
     s->eecd_state.old_eecd = val & (E1000_EECD_SK | E1000_EECD_CS |
             E1000_EECD_DI|E1000_EECD_FWE_MASK|E1000_EECD_REQ);
-    if (!(E1000_EECD_CS & val))			// CS inactive; nothing to do
-	return;
-    if (E1000_EECD_CS & (val ^ oldval)) {	// CS rise edge; reset state
-	s->eecd_state.val_in = 0;
-	s->eecd_state.bitnum_in = 0;
-	s->eecd_state.bitnum_out = 0;
-	s->eecd_state.reading = 0;
+    if (!(E1000_EECD_CS & val)) {            /* CS inactive; nothing to do */
+        return;
     }
-    if (!(E1000_EECD_SK & (val ^ oldval)))	// no clock edge
+    if (E1000_EECD_CS & (val ^ oldval)) {    /* CS rise edge; reset state */
+        s->eecd_state.val_in = 0;
+        s->eecd_state.bitnum_in = 0;
+        s->eecd_state.bitnum_out = 0;
+        s->eecd_state.reading = 0;
+    }
+    if (!(E1000_EECD_SK & (val ^ oldval))) {    /* no clock edge */
         return;
-    if (!(E1000_EECD_SK & val)) {		// falling edge
+    }
+    if (!(E1000_EECD_SK & val)) {               /* falling edge */
         s->eecd_state.bitnum_out++;
         return;
     }
@@ -565,6 +588,56 @@ putsum(uint8_t *data, uint32_t n, uint32_t sloc, uint32_t css, uint32_t cse)
     }
 }
 
+static inline void
+inc_reg_if_not_full(E1000State *s, int index)
+{
+    if (s->mac_reg[index] != 0xffffffff) {
+        s->mac_reg[index]++;
+    }
+}
+
+static inline void
+inc_tx_bcast_or_mcast_count(E1000State *s, const unsigned char *arr)
+{
+    if (!memcmp(arr, bcast, sizeof bcast)) {
+        inc_reg_if_not_full(s, BPTC);
+    } else if (arr[0] & 1) {
+        inc_reg_if_not_full(s, MPTC);
+    }
+}
+
+static void
+grow_8reg_if_not_full(E1000State *s, int index, int size)
+{
+    uint64_t sum = s->mac_reg[index] | (uint64_t)s->mac_reg[index+1] << 32;
+
+    if (sum + size < sum) {
+        sum = ~0ULL;
+    } else {
+        sum += size;
+    }
+    s->mac_reg[index] = sum;
+    s->mac_reg[index+1] = sum >> 32;
+}
+
+static void
+increase_size_stats(E1000State *s, const int *size_regs, int size)
+{
+    if (size > 1023) {
+        inc_reg_if_not_full(s, size_regs[5]);
+    } else if (size > 511) {
+        inc_reg_if_not_full(s, size_regs[4]);
+    } else if (size > 255) {
+        inc_reg_if_not_full(s, size_regs[3]);
+    } else if (size > 127) {
+        inc_reg_if_not_full(s, size_regs[2]);
+    } else if (size > 64) {
+        inc_reg_if_not_full(s, size_regs[1]);
+    } else if (size == 64) {
+        inc_reg_if_not_full(s, size_regs[0]);
+    }
+}
+
 static inline int
 vlan_enabled(E1000State *s)
 {
@@ -602,40 +675,49 @@ fcs_len(E1000State *s)
 static void
 e1000_send_packet(E1000State *s, const uint8_t *buf, int size)
 {
+    static const int PTCregs[6] = { PTC64, PTC127, PTC255, PTC511,
+                                    PTC1023, PTC1522 };
+
     NetClientState *nc = qemu_get_queue(s->nic);
     if (s->phy_reg[PHY_CTRL] & MII_CR_LOOPBACK) {
         nc->info->receive(nc, buf, size);
     } else {
         qemu_send_packet(nc, buf, size);
     }
+    inc_tx_bcast_or_mcast_count(s, buf);
+    increase_size_stats(s, PTCregs, size);
 }
 
 static void
 xmit_seg(E1000State *s)
 {
     uint16_t len, *sp;
-    unsigned int frames = s->tx.tso_frames, css, sofar, n;
+    unsigned int frames = s->tx.tso_frames, css, sofar;
     struct e1000_tx *tp = &s->tx;
 
     if (tp->tse && tp->cptse) {
         css = tp->ipcss;
         DBGOUT(TXSUM, "frames %d size %d ipcss %d\n",
                frames, tp->size, css);
-        if (tp->ip) {		// IPv4
+        if (tp->ip) {    /* IPv4 */
             stw_be_p(tp->data+css+2, tp->size - css);
             stw_be_p(tp->data+css+4,
-                          be16_to_cpup((uint16_t *)(tp->data+css+4))+frames);
-        } else			// IPv6
+                     be16_to_cpup((uint16_t *)(tp->data+css+4))+frames);
+        } else {         /* IPv6 */
             stw_be_p(tp->data+css+4, tp->size - css);
+        }
         css = tp->tucss;
         len = tp->size - css;
         DBGOUT(TXSUM, "tcp %d tucss %d len %d\n", tp->tcp, css, len);
         if (tp->tcp) {
             sofar = frames * tp->mss;
             stl_be_p(tp->data+css+4, ldl_be_p(tp->data+css+4)+sofar); /* seq */
-            if (tp->paylen - sofar > tp->mss)
-                tp->data[css + 13] &= ~9;		// PSH, FIN
-        } else	// UDP
+            if (tp->paylen - sofar > tp->mss) {
+                tp->data[css + 13] &= ~9;    /* PSH, FIN */
+            } else if (frames) {
+                inc_reg_if_not_full(s, TSCTC);
+            }
+        } else    /* UDP */
             stw_be_p(tp->data+css+4, len);
         if (tp->sum_needed & E1000_TXD_POPTS_TXSM) {
             unsigned int phsum;
@@ -657,13 +739,15 @@ xmit_seg(E1000State *s)
         memmove(tp->data, tp->data + 4, 8);
         memcpy(tp->data + 8, tp->vlan_header, 4);
         e1000_send_packet(s, tp->vlan, tp->size + 4);
-    } else
+    } else {
         e1000_send_packet(s, tp->data, tp->size);
-    s->mac_reg[TPT]++;
-    s->mac_reg[GPTC]++;
-    n = s->mac_reg[TOTL];
-    if ((s->mac_reg[TOTL] += s->tx.size) < n)
-        s->mac_reg[TOTH]++;
+    }
+
+    inc_reg_if_not_full(s, TPT);
+    grow_8reg_if_not_full(s, TOTL, s->tx.size);
+    s->mac_reg[GPTC] = s->mac_reg[TPT];
+    s->mac_reg[GOTCL] = s->mac_reg[TOTL];
+    s->mac_reg[GOTCH] = s->mac_reg[TOTH];
 }
 
 static void
@@ -679,7 +763,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
     struct e1000_tx *tp = &s->tx;
 
     s->mit_ide |= (txd_lower & E1000_TXD_CMD_IDE);
-    if (dtype == E1000_TXD_CMD_DEXT) {	// context descriptor
+    if (dtype == E1000_TXD_CMD_DEXT) {    /* context descriptor */
         op = le32_to_cpu(xp->cmd_and_length);
         tp->ipcss = xp->lower_setup.ip_fields.ipcss;
         tp->ipcso = xp->lower_setup.ip_fields.ipcso;
@@ -694,7 +778,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
         tp->tcp = (op & E1000_TXD_CMD_TCP) ? 1 : 0;
         tp->tse = (op & E1000_TXD_CMD_TSE) ? 1 : 0;
         tp->tso_frames = 0;
-        if (tp->tucso == 0) {	// this is probably wrong
+        if (tp->tucso == 0) {    /* this is probably wrong */
             DBGOUT(TXSUM, "TCP/UDP: cso 0!\n");
             tp->tucso = tp->tucss + (tp->tcp ? 16 : 6);
         }
@@ -718,7 +802,7 @@ process_tx_desc(E1000State *s, struct e1000_tx_desc *dp)
         stw_be_p(tp->vlan_header + 2,
                       le16_to_cpu(dp->upper.fields.special));
     }
-        
+
     addr = le64_to_cpu(dp->buffer_addr);
     if (tp->tse && tp->cptse) {
         msh = tp->hdr_len + tp->mss;
@@ -831,9 +915,9 @@ start_xmit(E1000State *s)
 static int
 receive_filter(E1000State *s, const uint8_t *buf, int size)
 {
-    static const uint8_t bcast[] = {0xff, 0xff, 0xff, 0xff, 0xff, 0xff};
     static const int mta_shift[] = {4, 3, 2, 0};
     uint32_t f, rctl = s->mac_reg[RCTL], ra[2], *rp;
+    int isbcast = !memcmp(buf, bcast, sizeof bcast), ismcast = (buf[0] & 1);
 
     if (is_vlan_packet(s, buf) && vlan_rx_filter_enabled(s)) {
         uint16_t vid = be16_to_cpup((uint16_t *)(buf + 14));
@@ -843,14 +927,19 @@ receive_filter(E1000State *s, const uint8_t *buf, int size)
             return 0;
     }
 
-    if (rctl & E1000_RCTL_UPE)			// promiscuous
+    if (!isbcast && !ismcast && (rctl & E1000_RCTL_UPE)) { /* promiscuous ucast */
         return 1;
+    }
 
-    if ((buf[0] & 1) && (rctl & E1000_RCTL_MPE))	// promiscuous mcast
+    if (ismcast && (rctl & E1000_RCTL_MPE)) {          /* promiscuous mcast */
+        inc_reg_if_not_full(s, MPRC);
         return 1;
+    }
 
-    if ((rctl & E1000_RCTL_BAM) && !memcmp(buf, bcast, sizeof bcast))
+    if (isbcast && (rctl & E1000_RCTL_BAM)) {          /* broadcast enabled */
+        inc_reg_if_not_full(s, BPRC);
         return 1;
+    }
 
     for (rp = s->mac_reg + RA; rp < s->mac_reg + RA + 32; rp += 2) {
         if (!(rp[1] & E1000_RAH_AV))
@@ -870,8 +959,10 @@ receive_filter(E1000State *s, const uint8_t *buf, int size)
 
     f = mta_shift[(rctl >> E1000_RCTL_MO_SHIFT) & 3];
     f = (((buf[5] << 8) | buf[4]) >> f) & 0xfff;
-    if (s->mac_reg[MTA + (f >> 5)] & (1 << (f & 0x1f)))
+    if (s->mac_reg[MTA + (f >> 5)] & (1 << (f & 0x1f))) {
+        inc_reg_if_not_full(s, MPRC);
         return 1;
+    }
     DBGOUT(RXFILTER,
            "dropping, inexact filter mismatch: %02x:%02x:%02x:%02x:%02x:%02x MO %d MTA[%d] %x\n",
            buf[0], buf[1], buf[2], buf[3], buf[4], buf[5],
@@ -960,6 +1051,8 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
     size_t desc_offset;
     size_t desc_size;
     size_t total_size;
+    static const int PRCregs[6] = { PRC64, PRC127, PRC255, PRC511,
+                                    PRC1023, PRC1522 };
 
     if (!(s->mac_reg[STATUS] & E1000_STATUS_LU)) {
         return -1;
@@ -973,6 +1066,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
     if (size < sizeof(min_buf)) {
         iov_to_buf(iov, iovcnt, 0, min_buf, size);
         memset(&min_buf[size], 0, sizeof(min_buf) - size);
+        inc_reg_if_not_full(s, RUC);
         min_iov.iov_base = filter_buf = min_buf;
         min_iov.iov_len = size = sizeof(min_buf);
         iovcnt = 1;
@@ -988,6 +1082,7 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
         (size > MAXIMUM_ETHERNET_VLAN_SIZE
         && !(s->mac_reg[RCTL] & E1000_RCTL_LPE)))
         && !(s->mac_reg[RCTL] & E1000_RCTL_SBP)) {
+        inc_reg_if_not_full(s, ROC);
         return size;
     }
 
@@ -1073,16 +1168,17 @@ e1000_receive_iov(NetClientState *nc, const struct iovec *iov, int iovcnt)
         }
     } while (desc_offset < total_size);
 
-    s->mac_reg[GPRC]++;
-    s->mac_reg[TPR]++;
+    increase_size_stats(s, PRCregs, total_size);
+    inc_reg_if_not_full(s, TPR);
+    s->mac_reg[GPRC] = s->mac_reg[TPR];
     /* TOR - Total Octets Received:
      * This register includes bytes received in a packet from the <Destination
      * Address> field through the <CRC> field, inclusively.
+     * Always include FCS length (4) in size.
      */
-    n = s->mac_reg[TORL] + size + /* Always include FCS length. */ 4;
-    if (n < s->mac_reg[TORL])
-        s->mac_reg[TORH]++;
-    s->mac_reg[TORL] = n;
+    grow_8reg_if_not_full(s, TORL, size+4);
+    s->mac_reg[GORCL] = s->mac_reg[TORL];
+    s->mac_reg[GORCH] = s->mac_reg[TORH];
 
     n = E1000_ICS_RXT0;
     if ((rdt = s->mac_reg[RDT]) < s->mac_reg[RDH])
@@ -1114,6 +1210,30 @@ mac_readreg(E1000State *s, int index)
 }
 
 static uint32_t
+mac_low4_read(E1000State *s, int index)
+{
+    return s->mac_reg[index] & 0xf;
+}
+
+static uint32_t
+mac_low11_read(E1000State *s, int index)
+{
+    return s->mac_reg[index] & 0x7ff;
+}
+
+static uint32_t
+mac_low13_read(E1000State *s, int index)
+{
+    return s->mac_reg[index] & 0x1fff;
+}
+
+static uint32_t
+mac_low16_read(E1000State *s, int index)
+{
+    return s->mac_reg[index] & 0xffff;
+}
+
+static uint32_t
 mac_icr_read(E1000State *s, int index)
 {
     uint32_t ret = s->mac_reg[ICR];
@@ -1206,46 +1326,144 @@ set_ims(E1000State *s, int index, uint32_t val)
     set_ics(s, 0, 0);
 }
 
-#define getreg(x)	[x] = mac_readreg
+#define getreg(x)    [x] = mac_readreg
 static uint32_t (*macreg_readops[])(E1000State *, int) = {
-    getreg(PBA),	getreg(RCTL),	getreg(TDH),	getreg(TXDCTL),
-    getreg(WUFC),	getreg(TDT),	getreg(CTRL),	getreg(LEDCTL),
-    getreg(MANC),	getreg(MDIC),	getreg(SWSM),	getreg(STATUS),
-    getreg(TORL),	getreg(TOTL),	getreg(IMS),	getreg(TCTL),
-    getreg(RDH),	getreg(RDT),	getreg(VET),	getreg(ICS),
-    getreg(TDBAL),	getreg(TDBAH),	getreg(RDBAH),	getreg(RDBAL),
-    getreg(TDLEN),      getreg(RDLEN),  getreg(RDTR),   getreg(RADV),
-    getreg(TADV),       getreg(ITR),
-
-    [TOTH] = mac_read_clr8,	[TORH] = mac_read_clr8,	[GPRC] = mac_read_clr4,
-    [GPTC] = mac_read_clr4,	[TPR] = mac_read_clr4,	[TPT] = mac_read_clr4,
-    [ICR] = mac_icr_read,	[EECD] = get_eecd,	[EERD] = flash_eerd_read,
-    [CRCERRS ... MPC] = &mac_readreg,
-    [RA ... RA+31] = &mac_readreg,
-    [MTA ... MTA+127] = &mac_readreg,
+    getreg(PBA),      getreg(RCTL),     getreg(TDH),      getreg(TXDCTL),
+    getreg(WUFC),     getreg(TDT),      getreg(CTRL),     getreg(LEDCTL),
+    getreg(MANC),     getreg(MDIC),     getreg(SWSM),     getreg(STATUS),
+    getreg(TORL),     getreg(TOTL),     getreg(IMS),      getreg(TCTL),
+    getreg(RDH),      getreg(RDT),      getreg(VET),      getreg(ICS),
+    getreg(TDBAL),    getreg(TDBAH),    getreg(RDBAH),    getreg(RDBAL),
+    getreg(TDLEN),    getreg(RDLEN),    getreg(RDTR),     getreg(RADV),
+    getreg(TADV),     getreg(ITR),      getreg(FCRUC),    getreg(IPAV),
+    getreg(WUC),      getreg(WUS),      getreg(SCC),      getreg(ECOL),
+    getreg(MCC),      getreg(LATECOL),  getreg(COLC),     getreg(DC),
+    getreg(TNCRS),    getreg(SEC),      getreg(CEXTERR),  getreg(RLEC),
+    getreg(XONRXC),   getreg(XONTXC),   getreg(XOFFRXC),  getreg(XOFFTXC),
+    getreg(RFC),      getreg(RJC),      getreg(RNBC),     getreg(TSCTFC),
+    getreg(MGTPRC),   getreg(MGTPDC),   getreg(MGTPTC),   getreg(GORCL),
+    getreg(GOTCL),
+
+    [TOTH]    = mac_read_clr8,      [TORH]    = mac_read_clr8,
+    [GOTCH]   = mac_read_clr8,      [GORCH]   = mac_read_clr8,
+    [PRC64]   = mac_read_clr4,      [PRC127]  = mac_read_clr4,
+    [PRC255]  = mac_read_clr4,      [PRC511]  = mac_read_clr4,
+    [PRC1023] = mac_read_clr4,      [PRC1522] = mac_read_clr4,
+    [PTC64]   = mac_read_clr4,      [PTC127]  = mac_read_clr4,
+    [PTC255]  = mac_read_clr4,      [PTC511]  = mac_read_clr4,
+    [PTC1023] = mac_read_clr4,      [PTC1522] = mac_read_clr4,
+    [GPRC]    = mac_read_clr4,      [GPTC]    = mac_read_clr4,
+    [TPT]     = mac_read_clr4,      [TPR]     = mac_read_clr4,
+    [RUC]     = mac_read_clr4,      [ROC]     = mac_read_clr4,
+    [BPRC]    = mac_read_clr4,      [MPRC]    = mac_read_clr4,
+    [TSCTC]   = mac_read_clr4,      [BPTC]    = mac_read_clr4,
+    [MPTC]    = mac_read_clr4,
+    [ICR]     = mac_icr_read,       [EECD]    = get_eecd,
+    [EERD]    = flash_eerd_read,
+    [RDFH]    = mac_low13_read,     [RDFT]    = mac_low13_read,
+    [RDFHS]   = mac_low13_read,     [RDFTS]   = mac_low13_read,
+    [RDFPC]   = mac_low13_read,
+    [TDFH]    = mac_low11_read,     [TDFT]    = mac_low11_read,
+    [TDFHS]   = mac_low13_read,     [TDFTS]   = mac_low13_read,
+    [TDFPC]   = mac_low13_read,
+    [AIT]     = mac_low16_read,
+
+    [CRCERRS ... MPC]   = &mac_readreg,
+    [IP6AT ... IP6AT+3] = &mac_readreg,    [IP4AT ... IP4AT+6] = &mac_readreg,
+    [FFLT ... FFLT+6]   = &mac_low11_read,
+    [RA ... RA+31]      = &mac_readreg,
+    [WUPM ... WUPM+31]  = &mac_readreg,
+    [MTA ... MTA+127]   = &mac_readreg,
     [VFTA ... VFTA+127] = &mac_readreg,
+    [FFMT ... FFMT+254] = &mac_low4_read,
+    [FFVT ... FFVT+254] = &mac_readreg,
+    [PBM ... PBM+16383] = &mac_readreg,
 };
 enum { NREADOPS = ARRAY_SIZE(macreg_readops) };
 
-#define putreg(x)	[x] = mac_writereg
+#define putreg(x)    [x] = mac_writereg
 static void (*macreg_writeops[])(E1000State *, int, uint32_t) = {
-    putreg(PBA),	putreg(EERD),	putreg(SWSM),	putreg(WUFC),
-    putreg(TDBAL),	putreg(TDBAH),	putreg(TXDCTL),	putreg(RDBAH),
-    putreg(RDBAL),	putreg(LEDCTL), putreg(VET),
-    [TDLEN] = set_dlen,	[RDLEN] = set_dlen,	[TCTL] = set_tctl,
-    [TDT] = set_tctl,	[MDIC] = set_mdic,	[ICS] = set_ics,
-    [TDH] = set_16bit,	[RDH] = set_16bit,	[RDT] = set_rdt,
-    [IMC] = set_imc,	[IMS] = set_ims,	[ICR] = set_icr,
-    [EECD] = set_eecd,	[RCTL] = set_rx_control, [CTRL] = set_ctrl,
-    [RDTR] = set_16bit, [RADV] = set_16bit,     [TADV] = set_16bit,
-    [ITR] = set_16bit,
-    [RA ... RA+31] = &mac_writereg,
-    [MTA ... MTA+127] = &mac_writereg,
+    putreg(PBA),      putreg(EERD),     putreg(SWSM),     putreg(WUFC),
+    putreg(TDBAL),    putreg(TDBAH),    putreg(TXDCTL),   putreg(RDBAH),
+    putreg(RDBAL),    putreg(LEDCTL),   putreg(VET),      putreg(FCRUC),
+    putreg(TDFH),     putreg(TDFT),     putreg(TDFHS),    putreg(TDFTS),
+    putreg(TDFPC),    putreg(RDFH),     putreg(RDFT),     putreg(RDFHS),
+    putreg(RDFTS),    putreg(RDFPC),    putreg(IPAV),     putreg(WUC),
+    putreg(WUS),      putreg(AIT),
+
+    [TDLEN]  = set_dlen,   [RDLEN]  = set_dlen,       [TCTL] = set_tctl,
+    [TDT]    = set_tctl,   [MDIC]   = set_mdic,       [ICS]  = set_ics,
+    [TDH]    = set_16bit,  [RDH]    = set_16bit,      [RDT]  = set_rdt,
+    [IMC]    = set_imc,    [IMS]    = set_ims,        [ICR]  = set_icr,
+    [EECD]   = set_eecd,   [RCTL]   = set_rx_control, [CTRL] = set_ctrl,
+    [RDTR]   = set_16bit,  [RADV]   = set_16bit,      [TADV] = set_16bit,
+    [ITR]    = set_16bit,
+
+    [IP6AT ... IP6AT+3] = &mac_writereg, [IP4AT ... IP4AT+6] = &mac_writereg,
+    [FFLT ... FFLT+6]   = &mac_writereg,
+    [RA ... RA+31]      = &mac_writereg,
+    [WUPM ... WUPM+31]  = &mac_writereg,
+    [MTA ... MTA+127]   = &mac_writereg,
     [VFTA ... VFTA+127] = &mac_writereg,
+    [FFMT ... FFMT+254] = &mac_writereg, [FFVT ... FFVT+254] = &mac_writereg,
+    [PBM ... PBM+16383] = &mac_writereg,
 };
 
 enum { NWRITEOPS = ARRAY_SIZE(macreg_writeops) };
 
+enum { MAC_ACCESS_PARTIAL = 1, MAC_ACCESS_FLAG_NEEDED = 2 };
+
+#define markflag(x)    ((E1000_FLAG_##x << 2) | MAC_ACCESS_FLAG_NEEDED)
+/* In the array below the meaning of the bits is: [f|f|f|f|f|f|n|p]
+ * f - flag bits (up to 6 possible flags)
+ * n - flag needed
+ * p - partially implenented */
+static const uint8_t mac_reg_access[0x8000] = {
+    [RDTR]    = markflag(MIT),    [TADV]    = markflag(MIT),
+    [RADV]    = markflag(MIT),    [ITR]     = markflag(MIT),
+
+    [IPAV]    = markflag(MAC),    [WUC]     = markflag(MAC),
+    [IP6AT]   = markflag(MAC),    [IP4AT]   = markflag(MAC),
+    [FFVT]    = markflag(MAC),    [WUPM]    = markflag(MAC),
+    [ECOL]    = markflag(MAC),    [MCC]     = markflag(MAC),
+    [DC]      = markflag(MAC),    [TNCRS]   = markflag(MAC),
+    [RLEC]    = markflag(MAC),    [XONRXC]  = markflag(MAC),
+    [XOFFTXC] = markflag(MAC),    [RFC]     = markflag(MAC),
+    [TSCTFC]  = markflag(MAC),    [MGTPRC]  = markflag(MAC),
+    [WUS]     = markflag(MAC),    [AIT]     = markflag(MAC),
+    [FFLT]    = markflag(MAC),    [FFMT]    = markflag(MAC),
+    [SCC]     = markflag(MAC),    [FCRUC]   = markflag(MAC),
+    [LATECOL] = markflag(MAC),    [COLC]    = markflag(MAC),
+    [SEC]     = markflag(MAC),    [CEXTERR] = markflag(MAC),
+    [XONTXC]  = markflag(MAC),    [XOFFRXC] = markflag(MAC),
+    [RJC]     = markflag(MAC),    [RNBC]    = markflag(MAC),
+    [MGTPDC]  = markflag(MAC),    [MGTPTC]  = markflag(MAC),
+    [RUC]     = markflag(MAC),    [ROC]     = markflag(MAC),
+    [GORCL]   = markflag(MAC),    [GORCH]   = markflag(MAC),
+    [GOTCL]   = markflag(MAC),    [GOTCH]   = markflag(MAC),
+    [BPRC]    = markflag(MAC),    [MPRC]    = markflag(MAC),
+    [TSCTC]   = markflag(MAC),    [PRC64]   = markflag(MAC),
+    [PRC127]  = markflag(MAC),    [PRC255]  = markflag(MAC),
+    [PRC511]  = markflag(MAC),    [PRC1023] = markflag(MAC),
+    [PRC1522] = markflag(MAC),    [PTC64]   = markflag(MAC),
+    [PTC127]  = markflag(MAC),    [PTC255]  = markflag(MAC),
+    [PTC511]  = markflag(MAC),    [PTC1023] = markflag(MAC),
+    [PTC1522] = markflag(MAC),    [MPTC]    = markflag(MAC),
+    [BPTC]    = markflag(MAC),
+
+    [TDFH]  = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [TDFT]  = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [TDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [TDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [TDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [RDFH]  = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [RDFT]  = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [RDFHS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [RDFTS] = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [RDFPC] = markflag(MAC) | MAC_ACCESS_PARTIAL,
+    [PBM]   = markflag(MAC) | MAC_ACCESS_PARTIAL,
+};
+
 static void
 e1000_mmio_write(void *opaque, hwaddr addr, uint64_t val,
                  unsigned size)
@@ -1254,9 +1472,20 @@ e1000_mmio_write(void *opaque, hwaddr addr, uint64_t val,
     unsigned int index = (addr & 0x1ffff) >> 2;
 
     if (index < NWRITEOPS && macreg_writeops[index]) {
-        macreg_writeops[index](s, index, val);
+        if (!(mac_reg_access[index] & MAC_ACCESS_FLAG_NEEDED)
+            || (s->compat_flags & (mac_reg_access[index] >> 2))) {
+            if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
+                DBGOUT(GENERAL, "Writing to register at offset: 0x%08x. "
+                       "It is not fully implemented.\n", index<<2);
+            }
+            macreg_writeops[index](s, index, val);
+        } else {    /* "flag needed" bit is set, but the flag is not active */
+            DBGOUT(MMIO, "MMIO write attempt to disabled reg. addr=0x%08x\n",
+                   index<<2);
+        }
     } else if (index < NREADOPS && macreg_readops[index]) {
-        DBGOUT(MMIO, "e1000_mmio_writel RO %x: 0x%04"PRIx64"\n", index<<2, val);
+        DBGOUT(MMIO, "e1000_mmio_writel RO %x: 0x%04"PRIx64"\n",
+               index<<2, val);
     } else {
         DBGOUT(UNKNOWN, "MMIO unknown write addr=0x%08x,val=0x%08"PRIx64"\n",
                index<<2, val);
@@ -1269,11 +1498,21 @@ e1000_mmio_read(void *opaque, hwaddr addr, unsigned size)
     E1000State *s = opaque;
     unsigned int index = (addr & 0x1ffff) >> 2;
 
-    if (index < NREADOPS && macreg_readops[index])
-    {
-        return macreg_readops[index](s, index);
+    if (index < NREADOPS && macreg_readops[index]) {
+        if (!(mac_reg_access[index] & MAC_ACCESS_FLAG_NEEDED)
+            || (s->compat_flags & (mac_reg_access[index] >> 2))) {
+            if (mac_reg_access[index] & MAC_ACCESS_PARTIAL) {
+                DBGOUT(GENERAL, "Reading register at offset: 0x%08x. "
+                       "It is not fully implemented.\n", index<<2);
+            }
+            return macreg_readops[index](s, index);
+        } else {    /* "flag needed" bit is set, but the flag is not active */
+            DBGOUT(MMIO, "MMIO read attempt of disabled reg. addr=0x%08x\n",
+                   index<<2);
+        }
+    } else {
+        DBGOUT(UNKNOWN, "MMIO unknown read addr=0x%08x\n", index<<2);
     }
-    DBGOUT(UNKNOWN, "MMIO unknown read addr=0x%08x\n", index<<2);
     return 0;
 }
 
@@ -1340,7 +1579,7 @@ static int e1000_post_load(void *opaque, int version_id)
     E1000State *s = opaque;
     NetClientState *nc = qemu_get_queue(s->nic);
 
-    if (!(s->compat_flags & E1000_FLAG_MIT)) {
+    if (!chkflag(MIT)) {
         s->mac_reg[ITR] = s->mac_reg[RDTR] = s->mac_reg[RADV] =
             s->mac_reg[TADV] = 0;
         s->mit_irq_level = false;
@@ -1367,7 +1606,14 @@ static bool e1000_mit_state_needed(void *opaque)
 {
     E1000State *s = opaque;
 
-    return s->compat_flags & E1000_FLAG_MIT;
+    return chkflag(MIT);
+}
+
+static bool e1000_full_mac_needed(void *opaque)
+{
+    E1000State *s = opaque;
+
+    return chkflag(MAC);
 }
 
 static const VMStateDescription vmstate_e1000_mit_state = {
@@ -1385,6 +1631,17 @@ static const VMStateDescription vmstate_e1000_mit_state = {
     }
 };
 
+static const VMStateDescription vmstate_e1000_full_mac_state = {
+    .name = "e1000/full_mac_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = e1000_full_mac_needed,
+    .fields = (VMStateField[]) {
+        VMSTATE_UINT32_ARRAY(mac_reg, E1000State, 0x8000),
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_e1000 = {
     .name = "e1000",
     .version_id = 2,
@@ -1464,6 +1721,7 @@ static const VMStateDescription vmstate_e1000 = {
     },
     .subsections = (const VMStateDescription*[]) {
         &vmstate_e1000_mit_state,
+        &vmstate_e1000_full_mac_state,
         NULL
     }
 };
@@ -1596,6 +1854,8 @@ static Property e1000_properties[] = {
                     compat_flags, E1000_FLAG_AUTONEG_BIT, true),
     DEFINE_PROP_BIT("mitigation", E1000State,
                     compat_flags, E1000_FLAG_MIT_BIT, true),
+    DEFINE_PROP_BIT("extra_mac_registers", E1000State,
+                    compat_flags, E1000_FLAG_MAC_BIT, true),
     DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/hw/net/e1000_regs.h b/hw/net/e1000_regs.h
index 60b96aa..1c40244 100644
--- a/hw/net/e1000_regs.h
+++ b/hw/net/e1000_regs.h
@@ -158,7 +158,8 @@
 #define E1000_PHY_CTRL     0x00F10  /* PHY Control Register in CSR */
 #define FEXTNVM_SW_CONFIG  0x0001
 #define E1000_PBA      0x01000  /* Packet Buffer Allocation - RW */
-#define E1000_PBS      0x01008  /* Packet Buffer Size */
+#define E1000_PBM      0x10000  /* Packet Buffer Memory - RW */
+#define E1000_PBS      0x01008  /* Packet Buffer Size - RW */
 #define E1000_EEMNGCTL 0x01010  /* MNG EEprom Control */
 #define E1000_FLASH_UPDATES 1000
 #define E1000_EEARBC   0x01024  /* EEPROM Auto Read Bus Control */
@@ -191,6 +192,11 @@
 #define E1000_RAID     0x02C08  /* Receive Ack Interrupt Delay - RW */
 #define E1000_TXDMAC   0x03000  /* TX DMA Control - RW */
 #define E1000_KABGTXD  0x03004  /* AFE Band Gap Transmit Ref Data */
+#define E1000_RDFH     0x02410  /* Receive Data FIFO Head Register - RW */
+#define E1000_RDFT     0x02418  /* Receive Data FIFO Tail Register - RW */
+#define E1000_RDFHS    0x02420  /* Receive Data FIFO Head Saved Register - RW */
+#define E1000_RDFTS    0x02428  /* Receive Data FIFO Tail Saved Register - RW */
+#define E1000_RDFPC    0x02430  /* Receive Data FIFO Packet Count - RW */
 #define E1000_TDFH     0x03410  /* TX Data FIFO Head - RW */
 #define E1000_TDFT     0x03418  /* TX Data FIFO Tail - RW */
 #define E1000_TDFHS    0x03420  /* TX Data FIFO Head Saved - RW */
diff --git a/hw/net/rocker/rocker.c b/hw/net/rocker/rocker.c
index bb6fdc3..c57f1a6 100644
--- a/hw/net/rocker/rocker.c
+++ b/hw/net/rocker/rocker.c
@@ -101,8 +101,7 @@ RockerSwitch *qmp_query_rocker(const char *name, Error **errp)
 
     r = rocker_find(name);
     if (!r) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "rocker %s not found", name);
+        error_setg(errp, "rocker %s not found", name);
         return NULL;
     }
 
@@ -122,8 +121,7 @@ RockerPortList *qmp_query_rocker_ports(const char *name, Error **errp)
 
     r = rocker_find(name);
     if (!r) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "rocker %s not found", name);
+        error_setg(errp, "rocker %s not found", name);
         return NULL;
     }
 
diff --git a/hw/net/rocker/rocker_of_dpa.c b/hw/net/rocker/rocker_of_dpa.c
index 1ad2791..3cf1d61 100644
--- a/hw/net/rocker/rocker_of_dpa.c
+++ b/hw/net/rocker/rocker_of_dpa.c
@@ -2462,15 +2462,13 @@ RockerOfDpaFlowList *qmp_query_rocker_of_dpa_flows(const char *name,
 
     r = rocker_find(name);
     if (!r) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "rocker %s not found", name);
+        error_setg(errp, "rocker %s not found", name);
         return NULL;
     }
 
     w = rocker_get_world(r, ROCKER_WORLD_TYPE_OF_DPA);
     if (!w) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "rocker %s doesn't have OF-DPA world", name);
+        error_setg(errp, "rocker %s doesn't have OF-DPA world", name);
         return NULL;
     }
 
@@ -2597,15 +2595,13 @@ RockerOfDpaGroupList *qmp_query_rocker_of_dpa_groups(const char *name,
 
     r = rocker_find(name);
     if (!r) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "rocker %s not found", name);
+        error_setg(errp, "rocker %s not found", name);
         return NULL;
     }
 
     w = rocker_get_world(r, ROCKER_WORLD_TYPE_OF_DPA);
     if (!w) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "rocker %s doesn't have OF-DPA world", name);
+        error_setg(errp, "rocker %s doesn't have OF-DPA world", name);
         return NULL;
     }
 
diff --git a/hw/pci/pcie.c b/hw/pci/pcie.c
index 32c65c2..0eab29d 100644
--- a/hw/pci/pcie.c
+++ b/hw/pci/pcie.c
@@ -426,13 +426,13 @@ void pcie_cap_slot_write_config(PCIDevice *dev,
      */
     if ((sltsta & PCI_EXP_SLTSTA_PDS) && (val & PCI_EXP_SLTCTL_PCC) &&
         ((val & PCI_EXP_SLTCTL_PIC_OFF) == PCI_EXP_SLTCTL_PIC_OFF)) {
-            PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev));
-            pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
-                                pcie_unplug_device, NULL);
+        PCIBus *sec_bus = pci_bridge_get_sec_bus(PCI_BRIDGE(dev));
+        pci_for_each_device(sec_bus, pci_bus_num(sec_bus),
+                            pcie_unplug_device, NULL);
 
-            pci_word_test_and_clear_mask(exp_cap + PCI_EXP_SLTSTA,
-                                         PCI_EXP_SLTSTA_PDS);
-            pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA,
+        pci_word_test_and_clear_mask(exp_cap + PCI_EXP_SLTSTA,
+                                     PCI_EXP_SLTSTA_PDS);
+        pci_word_test_and_set_mask(exp_cap + PCI_EXP_SLTSTA,
                                        PCI_EXP_SLTSTA_PDC);
     }
 
diff --git a/hw/ppc/mac.h b/hw/ppc/mac.h
index 8bdba30..e375ed2 100644
--- a/hw/ppc/mac.h
+++ b/hw/ppc/mac.h
@@ -103,6 +103,9 @@ typedef struct CUDAState {
     uint8_t last_b;
     uint8_t last_acr;
 
+    /* MacOS 9 is racy and requires a delay upon setting the SR_INT bit */
+    QEMUTimer *sr_delay_timer;
+
     int data_in_size;
     int data_in_index;
     int data_out_index;
diff --git a/hw/ppc/mac_newworld.c b/hw/ppc/mac_newworld.c
index 66d016c..1b9a573 100644
--- a/hw/ppc/mac_newworld.c
+++ b/hw/ppc/mac_newworld.c
@@ -371,12 +371,13 @@ static void ppc_core99_init(MachineState *machine)
         /* 970 gets a U3 bus */
         pci_bus = pci_pmac_u3_init(pic, get_system_memory(), get_system_io());
         machine_arch = ARCH_MAC99_U3;
-        machine->usb |= defaults_enabled() && !machine->usb_disabled;
     } else {
         pci_bus = pci_pmac_init(pic, get_system_memory(), get_system_io());
         machine_arch = ARCH_MAC99;
     }
 
+    machine->usb |= defaults_enabled() && !machine->usb_disabled;
+
     /* Timebase Frequency */
     if (kvm_enabled()) {
         tbfreq = kvmppc_get_tbfreq();
diff --git a/hw/ppc/spapr.c b/hw/ppc/spapr.c
index 0ed8527..030ee35 100644
--- a/hw/ppc/spapr.c
+++ b/hw/ppc/spapr.c
@@ -1021,9 +1021,19 @@ static void spapr_alloc_htab(sPAPRMachineState *spapr)
      * RAM */
 
     shift = kvmppc_reset_htab(spapr->htab_shift);
-
-    if (shift > 0) {
-        /* Kernel handles htab, we don't need to allocate one */
+    if (shift < 0) {
+        /*
+         * For HV KVM, host kernel will return -ENOMEM when requested
+         * HTAB size can't be allocated.
+         */
+        error_setg(&error_abort, "Failed to allocate HTAB of requested size, try with smaller maxmem");
+    } else if (shift > 0) {
+        /*
+         * Kernel handles htab, we don't need to allocate one
+         *
+         * Older kernels can fall back to lower HTAB shift values,
+         * but we don't allow booting of such guests.
+         */
         if (shift != spapr->htab_shift) {
             error_setg(&error_abort, "Failed to allocate HTAB of requested size, try with smaller maxmem");
         }
@@ -1055,7 +1065,9 @@ static void spapr_reset_htab(sPAPRMachineState *spapr)
     int index;
 
     shift = kvmppc_reset_htab(spapr->htab_shift);
-    if (shift > 0) {
+    if (shift < 0) {
+        error_setg(&error_abort, "Failed to reset HTAB");
+    } else if (shift > 0) {
         if (shift != spapr->htab_shift) {
             error_setg(&error_abort, "Requested HTAB allocation failed during reset");
         }
@@ -1588,7 +1600,7 @@ static int htab_load(QEMUFile *f, void *opaque, int version_id)
 static SaveVMHandlers savevm_htab_handlers = {
     .save_live_setup = htab_save_setup,
     .save_live_iterate = htab_save_iterate,
-    .save_live_complete = htab_save_complete,
+    .save_live_complete_precopy = htab_save_complete,
     .load_state = htab_load,
 };
 
diff --git a/hw/s390x/css.c b/hw/s390x/css.c
index c033612..19851ce 100644
--- a/hw/s390x/css.c
+++ b/hw/s390x/css.c
@@ -892,8 +892,14 @@ int css_do_tsch_get_irb(SubchDev *sch, IRB *target_irb, int *irb_len)
         /* If a unit check is pending, copy sense data. */
         if ((s->dstat & SCSW_DSTAT_UNIT_CHECK) &&
             (p->chars & PMCW_CHARS_MASK_CSENSE)) {
+            int i;
+
             irb.scsw.flags |= SCSW_FLAGS_MASK_ESWF | SCSW_FLAGS_MASK_ECTL;
+            /* Attention: sense_data is already BE! */
             memcpy(irb.ecw, sch->sense_data, sizeof(sch->sense_data));
+            for (i = 0; i < ARRAY_SIZE(irb.ecw); i++) {
+                irb.ecw[i] = be32_to_cpu(irb.ecw[i]);
+            }
             irb.esw[1] = 0x01000000 | (sizeof(sch->sense_data) << 8);
         }
     }
diff --git a/hw/s390x/ipl.c b/hw/s390x/ipl.c
index 5f7f349..b91fcc6 100644
--- a/hw/s390x/ipl.c
+++ b/hw/s390x/ipl.c
@@ -15,7 +15,6 @@
 #include "cpu.h"
 #include "elf.h"
 #include "hw/loader.h"
-#include "hw/sysbus.h"
 #include "hw/s390x/virtio-ccw.h"
 #include "hw/s390x/css.h"
 #include "ipl.h"
@@ -29,44 +28,6 @@
 #define ZIPL_IMAGE_START                0x009000UL
 #define IPL_PSW_MASK                    (PSW_MASK_32 | PSW_MASK_64)
 
-#define TYPE_S390_IPL "s390-ipl"
-#define S390_IPL(obj) \
-    OBJECT_CHECK(S390IPLState, (obj), TYPE_S390_IPL)
-#if 0
-#define S390_IPL_CLASS(klass) \
-    OBJECT_CLASS_CHECK(S390IPLState, (klass), TYPE_S390_IPL)
-#define S390_IPL_GET_CLASS(obj) \
-    OBJECT_GET_CLASS(S390IPLState, (obj), TYPE_S390_IPL)
-#endif
-
-typedef struct S390IPLClass {
-    /*< private >*/
-    SysBusDeviceClass parent_class;
-    /*< public >*/
-
-    void (*parent_reset) (SysBusDevice *dev);
-} S390IPLClass;
-
-typedef struct S390IPLState {
-    /*< private >*/
-    SysBusDevice parent_obj;
-    uint64_t start_addr;
-    uint64_t bios_start_addr;
-    bool enforce_bios;
-    IplParameterBlock iplb;
-    bool iplb_valid;
-    bool reipl_requested;
-
-    /*< public >*/
-    char *kernel;
-    char *initrd;
-    char *cmdline;
-    char *firmware;
-    uint8_t cssid;
-    uint8_t ssid;
-    uint16_t devno;
-} S390IPLState;
-
 static const VMStateDescription vmstate_iplb = {
     .name = "ipl/iplb",
     .version_id = 0,
@@ -110,11 +71,12 @@ static uint64_t bios_translate_addr(void *opaque, uint64_t srcaddr)
     return srcaddr + dstaddr;
 }
 
-static int s390_ipl_init(SysBusDevice *dev)
+static void s390_ipl_realize(DeviceState *dev, Error **errp)
 {
     S390IPLState *ipl = S390_IPL(dev);
     uint64_t pentry = KERN_IMAGE_START;
     int kernel_size;
+    Error *l_err = NULL;
 
     int bios_size;
     char *bios_filename;
@@ -132,7 +94,8 @@ static int s390_ipl_init(SysBusDevice *dev)
 
         bios_filename = qemu_find_file(QEMU_FILE_TYPE_BIOS, bios_name);
         if (bios_filename == NULL) {
-            hw_error("could not find stage1 bootloader\n");
+            error_setg(&l_err, "could not find stage1 bootloader\n");
+            goto error;
         }
 
         bios_size = load_elf(bios_filename, bios_translate_addr, &fwbase,
@@ -150,7 +113,8 @@ static int s390_ipl_init(SysBusDevice *dev)
         g_free(bios_filename);
 
         if (bios_size == -1) {
-            hw_error("could not load bootloader '%s'\n", bios_name);
+            error_setg(&l_err, "could not load bootloader '%s'\n", bios_name);
+            goto error;
         }
 
         /* default boot target is the bios */
@@ -164,8 +128,8 @@ static int s390_ipl_init(SysBusDevice *dev)
             kernel_size = load_image_targphys(ipl->kernel, 0, ram_size);
         }
         if (kernel_size < 0) {
-            fprintf(stderr, "could not load kernel '%s'\n", ipl->kernel);
-            return -1;
+            error_setg(&l_err, "could not load kernel '%s'\n", ipl->kernel);
+            goto error;
         }
         /*
          * Is it a Linux kernel (starting at 0x10000)? If yes, we fill in the
@@ -192,9 +156,8 @@ static int s390_ipl_init(SysBusDevice *dev)
             initrd_size = load_image_targphys(ipl->initrd, initrd_offset,
                                               ram_size - initrd_offset);
             if (initrd_size == -1) {
-                fprintf(stderr, "qemu: could not load initrd '%s'\n",
-                        ipl->initrd);
-                exit(1);
+                error_setg(&l_err, "could not load initrd '%s'\n", ipl->initrd);
+                goto error;
             }
 
             /*
@@ -205,7 +168,9 @@ static int s390_ipl_init(SysBusDevice *dev)
             stq_p(rom_ptr(INITRD_PARM_SIZE), initrd_size);
         }
     }
-    return 0;
+    qemu_register_reset(qdev_reset_all_fn, dev);
+error:
+    error_propagate(errp, l_err);
 }
 
 static Property s390_ipl_properties[] = {
@@ -308,9 +273,8 @@ static void s390_ipl_reset(DeviceState *dev)
 static void s390_ipl_class_init(ObjectClass *klass, void *data)
 {
     DeviceClass *dc = DEVICE_CLASS(klass);
-    SysBusDeviceClass *k = SYS_BUS_DEVICE_CLASS(klass);
 
-    k->init = s390_ipl_init;
+    dc->realize = s390_ipl_realize;
     dc->props = s390_ipl_properties;
     dc->reset = s390_ipl_reset;
     dc->vmsd = &vmstate_ipl;
@@ -319,8 +283,8 @@ static void s390_ipl_class_init(ObjectClass *klass, void *data)
 
 static const TypeInfo s390_ipl_info = {
     .class_init = s390_ipl_class_init,
-    .parent = TYPE_SYS_BUS_DEVICE,
-    .name  = "s390-ipl",
+    .parent = TYPE_DEVICE,
+    .name  = TYPE_S390_IPL,
     .instance_size  = sizeof(S390IPLState),
 };
 
diff --git a/hw/s390x/ipl.h b/hw/s390x/ipl.h
index 7f2b403..6b48ed7 100644
--- a/hw/s390x/ipl.h
+++ b/hw/s390x/ipl.h
@@ -12,6 +12,7 @@
 #ifndef HW_S390_IPL_H
 #define HW_S390_IPL_H
 
+#include "hw/qdev.h"
 #include "cpu.h"
 
 typedef struct IplParameterBlock {
@@ -25,4 +26,28 @@ void s390_ipl_prepare_cpu(S390CPU *cpu);
 IplParameterBlock *s390_ipl_get_iplb(void);
 void s390_reipl_request(void);
 
+#define TYPE_S390_IPL "s390-ipl"
+#define S390_IPL(obj) OBJECT_CHECK(S390IPLState, (obj), TYPE_S390_IPL)
+
+struct S390IPLState {
+    /*< private >*/
+    DeviceState parent_obj;
+    uint64_t start_addr;
+    uint64_t bios_start_addr;
+    bool enforce_bios;
+    IplParameterBlock iplb;
+    bool iplb_valid;
+    bool reipl_requested;
+
+    /*< public >*/
+    char *kernel;
+    char *initrd;
+    char *cmdline;
+    char *firmware;
+    uint8_t cssid;
+    uint8_t ssid;
+    uint16_t devno;
+};
+typedef struct S390IPLState S390IPLState;
+
 #endif
diff --git a/hw/s390x/s390-pci-bus.c b/hw/s390x/s390-pci-bus.c
index 560b66a..d5712ae 100644
--- a/hw/s390x/s390-pci-bus.c
+++ b/hw/s390x/s390-pci-bus.c
@@ -309,8 +309,7 @@ static IOMMUTLBEntry s390_translate_iommu(MemoryRegion *iommu, hwaddr addr,
     uint64_t pte;
     uint32_t flags;
     S390PCIBusDevice *pbdev = container_of(iommu, S390PCIBusDevice, mr);
-    S390pciState *s = S390_PCI_HOST_BRIDGE(pci_device_root_bus(pbdev->pdev)
-                                           ->qbus.parent);
+    S390pciState *s;
     IOMMUTLBEntry ret = {
         .target_as = &address_space_memory,
         .iova = 0,
@@ -319,8 +318,13 @@ static IOMMUTLBEntry s390_translate_iommu(MemoryRegion *iommu, hwaddr addr,
         .perm = IOMMU_NONE,
     };
 
+    if (!pbdev->configured || !pbdev->pdev) {
+        return ret;
+    }
+
     DPRINTF("iommu trans addr 0x%" PRIx64 "\n", addr);
 
+    s = S390_PCI_HOST_BRIDGE(pci_device_root_bus(pbdev->pdev)->qbus.parent);
     /* s390 does not have an APIC mapped to main storage so we use
      * a separate AddressSpace only for msix notifications
      */
diff --git a/hw/s390x/s390-virtio-ccw.c b/hw/s390x/s390-virtio-ccw.c
index 84221f4..5a52ff2 100644
--- a/hw/s390x/s390-virtio-ccw.c
+++ b/hw/s390x/s390-virtio-ccw.c
@@ -104,8 +104,7 @@ void s390_memory_init(ram_addr_t mem_size)
     MemoryRegion *ram = g_new(MemoryRegion, 1);
 
     /* allocate RAM for core */
-    memory_region_init_ram(ram, NULL, "s390.ram", mem_size, &error_fatal);
-    vmstate_register_ram_global(ram);
+    memory_region_allocate_system_memory(ram, NULL, "s390.ram", mem_size);
     memory_region_add_subregion(sysmem, 0, ram);
 
     /* Initialize storage key device */
diff --git a/hw/s390x/s390-virtio.c b/hw/s390x/s390-virtio.c
index cbde977..51dc0a8 100644
--- a/hw/s390x/s390-virtio.c
+++ b/hw/s390x/s390-virtio.c
@@ -31,7 +31,6 @@
 #include "hw/boards.h"
 #include "hw/loader.h"
 #include "hw/virtio/virtio.h"
-#include "hw/sysbus.h"
 #include "sysemu/kvm.h"
 #include "exec/address-spaces.h"
 
@@ -151,9 +150,9 @@ void s390_init_ipl_dev(const char *kernel_filename,
                        const char *firmware,
                        bool enforce_bios)
 {
-    DeviceState *dev;
+    Object *new = object_new(TYPE_S390_IPL);
+    DeviceState *dev = DEVICE(new);
 
-    dev  = qdev_create(NULL, "s390-ipl");
     if (kernel_filename) {
         qdev_prop_set_string(dev, "kernel", kernel_filename);
     }
@@ -163,8 +162,9 @@ void s390_init_ipl_dev(const char *kernel_filename,
     qdev_prop_set_string(dev, "cmdline", kernel_cmdline);
     qdev_prop_set_string(dev, "firmware", firmware);
     qdev_prop_set_bit(dev, "enforce_bios", enforce_bios);
-    object_property_add_child(qdev_get_machine(), "s390-ipl",
-                              OBJECT(dev), NULL);
+    object_property_add_child(qdev_get_machine(), TYPE_S390_IPL,
+                              new, NULL);
+    object_unref(new);
     qdev_init_nofail(dev);
 }
 
@@ -268,6 +268,10 @@ static void s390_init(MachineState *machine)
     hwaddr virtio_region_len;
     hwaddr virtio_region_start;
 
+    error_printf("WARNING\n"
+                 "The s390-virtio machine (non-ccw) is deprecated.\n"
+                 "It will be removed in 2.6. Please use s390-ccw-virtio\n");
+
     if (machine->ram_slots) {
         error_report("Memory hotplug not supported by the selected machine.");
         exit(EXIT_FAILURE);
@@ -334,7 +338,7 @@ static void s390_machine_class_init(ObjectClass *oc, void *data)
     NMIClass *nc = NMI_CLASS(oc);
 
     mc->alias = "s390";
-    mc->desc = "VirtIO based S390 machine";
+    mc->desc = "VirtIO based S390 machine (deprecated)";
     mc->init = s390_init;
     mc->reset = s390_machine_reset;
     mc->block_default_type = IF_VIRTIO;
diff --git a/hw/scsi/megasas.c b/hw/scsi/megasas.c
index dcd724e..d7dc667 100644
--- a/hw/scsi/megasas.c
+++ b/hw/scsi/megasas.c
@@ -757,7 +757,7 @@ static int megasas_ctrl_get_info(MegasasState *s, MegasasCmd *cmd)
 
     memcpy(info.product_name, base_class->product_name, 24);
     snprintf(info.serial_number, 32, "%s", s->hba_serial);
-    snprintf(info.package_version, 0x60, "%s-QEMU", QEMU_VERSION);
+    snprintf(info.package_version, 0x60, "%s-QEMU", qemu_hw_version());
     memcpy(info.image_component[0].name, "APP", 3);
     snprintf(info.image_component[0].version, 10, "%s-QEMU",
              base_class->product_version);
diff --git a/hw/scsi/scsi-bus.c b/hw/scsi/scsi-bus.c
index d373c1b..fd1171e 100644
--- a/hw/scsi/scsi-bus.c
+++ b/hw/scsi/scsi-bus.c
@@ -453,7 +453,7 @@ static bool scsi_target_emulate_inquiry(SCSITargetReq *r)
         r->buf[7] = 0x10 | (r->req.bus->info->tcq ? 0x02 : 0); /* Sync, TCQ.  */
         memcpy(&r->buf[8], "QEMU    ", 8);
         memcpy(&r->buf[16], "QEMU TARGET     ", 16);
-        pstrcpy((char *) &r->buf[32], 4, qemu_get_version());
+        pstrcpy((char *) &r->buf[32], 4, qemu_hw_version());
     }
     return true;
 }
diff --git a/hw/scsi/scsi-disk.c b/hw/scsi/scsi-disk.c
index bada9a7..4797d83 100644
--- a/hw/scsi/scsi-disk.c
+++ b/hw/scsi/scsi-disk.c
@@ -90,7 +90,7 @@ struct SCSIDiskState
     bool tray_locked;
 };
 
-static int scsi_handle_rw_error(SCSIDiskReq *r, int error);
+static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed);
 
 static void scsi_free_request(SCSIRequest *req)
 {
@@ -169,18 +169,18 @@ static void scsi_aio_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
-    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, true)) {
             goto done;
         }
     }
 
+    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     scsi_req_complete(&r->req, GOOD);
 
 done:
@@ -247,7 +247,7 @@ static void scsi_dma_complete_noio(SCSIDiskReq *r, int ret)
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, false)) {
             goto done;
         }
     }
@@ -273,7 +273,11 @@ static void scsi_dma_complete(void *opaque, int ret)
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
-    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    if (ret < 0) {
+        block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    } else {
+        block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    }
     scsi_dma_complete_noio(r, ret);
 }
 
@@ -285,18 +289,18 @@ static void scsi_read_complete(void * opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
-    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, true)) {
             goto done;
         }
     }
 
+    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     DPRINTF("Data ready tag=0x%x len=%zd\n", r->req.tag, r->qiov.size);
 
     n = r->qiov.size / 512;
@@ -322,7 +326,7 @@ static void scsi_do_read(SCSIDiskReq *r, int ret)
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, false)) {
             goto done;
         }
     }
@@ -355,7 +359,11 @@ static void scsi_do_read_cb(void *opaque, int ret)
     assert (r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
-    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    if (ret < 0) {
+        block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    } else {
+        block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    }
     scsi_do_read(opaque, ret);
 }
 
@@ -407,7 +415,7 @@ static void scsi_read_data(SCSIRequest *req)
  * scsi_handle_rw_error always manages its reference counts, independent
  * of the return value.
  */
-static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
+static int scsi_handle_rw_error(SCSIDiskReq *r, int error, bool acct_failed)
 {
     bool is_read = (r->req.cmd.mode == SCSI_XFER_FROM_DEV);
     SCSIDiskState *s = DO_UPCAST(SCSIDiskState, qdev, r->req.dev);
@@ -415,6 +423,9 @@ static int scsi_handle_rw_error(SCSIDiskReq *r, int error)
                                                    is_read, error);
 
     if (action == BLOCK_ERROR_ACTION_REPORT) {
+        if (acct_failed) {
+            block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+        }
         switch (error) {
         case ENOMEDIUM:
             scsi_check_condition(r, SENSE_CODE(NO_MEDIUM));
@@ -452,7 +463,7 @@ static void scsi_write_complete_noio(SCSIDiskReq *r, int ret)
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, false)) {
             goto done;
         }
     }
@@ -481,7 +492,11 @@ static void scsi_write_complete(void * opaque, int ret)
     assert (r->req.aiocb != NULL);
     r->req.aiocb = NULL;
 
-    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    if (ret < 0) {
+        block_acct_failed(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    } else {
+        block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+    }
     scsi_write_complete_noio(r, ret);
 }
 
@@ -1592,7 +1607,7 @@ static void scsi_unmap_complete_noio(UnmapCBData *data, int ret)
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, false)) {
             goto done;
         }
     }
@@ -1696,18 +1711,19 @@ static void scsi_write_same_complete(void *opaque, int ret)
 
     assert(r->req.aiocb != NULL);
     r->req.aiocb = NULL;
-    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
     if (r->req.io_canceled) {
         scsi_req_cancel_complete(&r->req);
         goto done;
     }
 
     if (ret < 0) {
-        if (scsi_handle_rw_error(r, -ret)) {
+        if (scsi_handle_rw_error(r, -ret, true)) {
             goto done;
         }
     }
 
+    block_acct_done(blk_get_stats(s->qdev.conf.blk), &r->acct);
+
     data->nb_sectors -= data->iov.iov_len / 512;
     data->sector += data->iov.iov_len / 512;
     data->iov.iov_len = MIN(data->nb_sectors * 512, data->iov.iov_len);
@@ -2315,7 +2331,7 @@ static void scsi_realize(SCSIDevice *dev, Error **errp)
     }
 
     if (!s->version) {
-        s->version = g_strdup(qemu_get_version());
+        s->version = g_strdup(qemu_hw_version());
     }
     if (!s->vendor) {
         s->vendor = g_strdup("QEMU");
diff --git a/hw/timer/hpet.c b/hw/timer/hpet.c
index 3037bef..7f0391c 100644
--- a/hw/timer/hpet.c
+++ b/hw/timer/hpet.c
@@ -116,12 +116,12 @@ static uint32_t timer_enabled(HPETTimer *t)
 
 static uint32_t hpet_time_after(uint64_t a, uint64_t b)
 {
-    return ((int32_t)(b) - (int32_t)(a) < 0);
+    return ((int32_t)(b - a) < 0);
 }
 
 static uint32_t hpet_time_after64(uint64_t a, uint64_t b)
 {
-    return ((int64_t)(b) - (int64_t)(a) < 0);
+    return ((int64_t)(b - a) < 0);
 }
 
 static uint64_t ticks_to_ns(uint64_t value)
diff --git a/hw/tpm/tpm_tis.c b/hw/tpm/tpm_tis.c
index 0806b5f..ff073d5 100644
--- a/hw/tpm/tpm_tis.c
+++ b/hw/tpm/tpm_tis.c
@@ -141,7 +141,7 @@
 
 #define TPM_TIS_IFACE_ID_SUPPORTED_FLAGS1_3 \
     (TPM_TIS_IFACE_ID_INTERFACE_TIS1_3 | \
-     (~0 << 4)/* all of it is don't care */)
+     (~0u << 4)/* all of it is don't care */)
 
 /* if backend was a TPM 2.0: */
 #define TPM_TIS_IFACE_ID_SUPPORTED_FLAGS2_0 \
diff --git a/hw/usb/ccid-card-emulated.c b/hw/usb/ccid-card-emulated.c
index 72329ed..869a63c 100644
--- a/hw/usb/ccid-card-emulated.c
+++ b/hw/usb/ccid-card-emulated.c
@@ -166,7 +166,7 @@ static void emulated_push_event(EmulatedState *card, EmulEvent *event)
 
 static void emulated_push_type(EmulatedState *card, uint32_t type)
 {
-    EmulEvent *event = (EmulEvent *)g_malloc(sizeof(EmulEvent));
+    EmulEvent *event = g_new(EmulEvent, 1);
 
     assert(event);
     event->p.gen.type = type;
@@ -175,7 +175,7 @@ static void emulated_push_type(EmulatedState *card, uint32_t type)
 
 static void emulated_push_error(EmulatedState *card, uint64_t code)
 {
-    EmulEvent *event = (EmulEvent *)g_malloc(sizeof(EmulEvent));
+    EmulEvent *event = g_new(EmulEvent, 1);
 
     assert(event);
     event->p.error.type = EMUL_ERROR;
diff --git a/hw/usb/dev-mtp.c b/hw/usb/dev-mtp.c
index 809b1cb..a276267 100644
--- a/hw/usb/dev-mtp.c
+++ b/hw/usb/dev-mtp.c
@@ -359,8 +359,7 @@ static void usb_mtp_object_readdir(MTPState *s, MTPObject *o)
     }
     while ((entry = readdir(dir)) != NULL) {
         if ((o->nchildren % 32) == 0) {
-            o->children = g_realloc(o->children,
-                                    (o->nchildren + 32) * sizeof(MTPObject *));
+            o->children = g_renew(MTPObject *, o->children, o->nchildren + 32);
         }
         o->children[o->nchildren] =
             usb_mtp_object_alloc(s, s->next_handle++, o, entry->d_name);
diff --git a/hw/usb/hcd-xhci.c b/hw/usb/hcd-xhci.c
index 1c57e20..268ab36 100644
--- a/hw/usb/hcd-xhci.c
+++ b/hw/usb/hcd-xhci.c
@@ -2188,7 +2188,7 @@ static void xhci_kick_ep(XHCIState *xhci, unsigned int slotid,
             xfer->trbs = NULL;
         }
         if (!xfer->trbs) {
-            xfer->trbs = g_malloc(sizeof(XHCITRB) * length);
+            xfer->trbs = g_new(XHCITRB, length);
             xfer->trb_alloced = length;
         }
         xfer->trb_count = length;
diff --git a/hw/usb/redirect.c b/hw/usb/redirect.c
index 38086cd..30ff742 100644
--- a/hw/usb/redirect.c
+++ b/hw/usb/redirect.c
@@ -322,7 +322,7 @@ static void packet_id_queue_add(struct PacketIdQueue *q, uint64_t id)
 
     DPRINTF("adding packet id %"PRIu64" to %s queue\n", id, q->name);
 
-    e = g_malloc0(sizeof(struct PacketIdQueueEntry));
+    e = g_new0(struct PacketIdQueueEntry, 1);
     e->id = id;
     QTAILQ_INSERT_TAIL(&q->head, e, next);
     q->size++;
@@ -468,7 +468,7 @@ static void bufp_alloc(USBRedirDevice *dev, uint8_t *data, uint16_t len,
         dev->endpoint[EP2I(ep)].bufpq_dropping_packets = 0;
     }
 
-    bufp = g_malloc(sizeof(struct buf_packet));
+    bufp = g_new(struct buf_packet, 1);
     bufp->data   = data;
     bufp->len    = len;
     bufp->offset = 0;
@@ -2234,7 +2234,7 @@ static int usbredir_get_bufpq(QEMUFile *f, void *priv, size_t unused)
 
     endp->bufpq_size = qemu_get_be32(f);
     for (i = 0; i < endp->bufpq_size; i++) {
-        bufp = g_malloc(sizeof(struct buf_packet));
+        bufp = g_new(struct buf_packet, 1);
         bufp->len = qemu_get_be32(f);
         bufp->status = qemu_get_be32(f);
         bufp->offset = 0;
diff --git a/hw/vfio/pci-quirks.c b/hw/vfio/pci-quirks.c
index c675d1b..30c68a1 100644
--- a/hw/vfio/pci-quirks.c
+++ b/hw/vfio/pci-quirks.c
@@ -284,7 +284,7 @@ static void vfio_vga_probe_ati_3c3_quirk(VFIOPCIDevice *vdev)
     }
 
     quirk = g_malloc0(sizeof(*quirk));
-    quirk->mem = g_malloc0(sizeof(MemoryRegion));
+    quirk->mem = g_new0(MemoryRegion, 1);
     quirk->nr_mem = 1;
 
     memory_region_init_io(quirk->mem, OBJECT(vdev), &vfio_ati_3c3_quirk, vdev,
@@ -319,7 +319,7 @@ static void vfio_probe_ati_bar4_quirk(VFIOPCIDevice *vdev, int nr)
     }
 
     quirk = g_malloc0(sizeof(*quirk));
-    quirk->mem = g_malloc0(sizeof(MemoryRegion) * 2);
+    quirk->mem = g_new0(MemoryRegion, 2);
     quirk->nr_mem = 2;
     window = quirk->data = g_malloc0(sizeof(*window) +
                                      sizeof(VFIOConfigWindowMatch));
@@ -368,7 +368,7 @@ static void vfio_probe_ati_bar2_quirk(VFIOPCIDevice *vdev, int nr)
 
     quirk = g_malloc0(sizeof(*quirk));
     mirror = quirk->data = g_malloc0(sizeof(*mirror));
-    mirror->mem = quirk->mem = g_malloc0(sizeof(MemoryRegion));
+    mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
     quirk->nr_mem = 1;
     mirror->vdev = vdev;
     mirror->offset = 0x4000;
@@ -544,7 +544,7 @@ static void vfio_vga_probe_nvidia_3d0_quirk(VFIOPCIDevice *vdev)
 
     quirk = g_malloc0(sizeof(*quirk));
     quirk->data = data = g_malloc0(sizeof(*data));
-    quirk->mem = g_malloc0(sizeof(MemoryRegion) * 2);
+    quirk->mem = g_new0(MemoryRegion, 2);
     quirk->nr_mem = 2;
     data->vdev = vdev;
 
@@ -661,7 +661,7 @@ static void vfio_probe_nvidia_bar5_quirk(VFIOPCIDevice *vdev, int nr)
     }
 
     quirk = g_malloc0(sizeof(*quirk));
-    quirk->mem = g_malloc0(sizeof(MemoryRegion) * 4);
+    quirk->mem = g_new0(MemoryRegion, 4);
     quirk->nr_mem = 4;
     bar5 = quirk->data = g_malloc0(sizeof(*bar5) +
                                    (sizeof(VFIOConfigWindowMatch) * 2));
@@ -756,7 +756,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
 
     quirk = g_malloc0(sizeof(*quirk));
     mirror = quirk->data = g_malloc0(sizeof(*mirror));
-    mirror->mem = quirk->mem = g_malloc0(sizeof(MemoryRegion));
+    mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
     quirk->nr_mem = 1;
     mirror->vdev = vdev;
     mirror->offset = 0x88000;
@@ -775,7 +775,7 @@ static void vfio_probe_nvidia_bar0_quirk(VFIOPCIDevice *vdev, int nr)
     if (vdev->has_vga) {
         quirk = g_malloc0(sizeof(*quirk));
         mirror = quirk->data = g_malloc0(sizeof(*mirror));
-        mirror->mem = quirk->mem = g_malloc0(sizeof(MemoryRegion));
+        mirror->mem = quirk->mem = g_new0(MemoryRegion, 1);
         quirk->nr_mem = 1;
         mirror->vdev = vdev;
         mirror->offset = 0x1800;
@@ -938,7 +938,7 @@ static void vfio_probe_rtl8168_bar2_quirk(VFIOPCIDevice *vdev, int nr)
     }
 
     quirk = g_malloc0(sizeof(*quirk));
-    quirk->mem = g_malloc0(sizeof(MemoryRegion) * 2);
+    quirk->mem = g_new0(MemoryRegion, 2);
     quirk->nr_mem = 2;
     quirk->data = rtl = g_malloc0(sizeof(*rtl));
     rtl->vdev = vdev;
diff --git a/hw/vfio/pci.c b/hw/vfio/pci.c
index 8fadbcf..1fb868c 100644
--- a/hw/vfio/pci.c
+++ b/hw/vfio/pci.c
@@ -28,6 +28,7 @@
 #include "config.h"
 #include "hw/pci/msi.h"
 #include "hw/pci/msix.h"
+#include "hw/pci/pci_bridge.h"
 #include "qemu/error-report.h"
 #include "qemu/range.h"
 #include "sysemu/kvm.h"
@@ -586,7 +587,7 @@ static void vfio_msix_enable(VFIOPCIDevice *vdev)
 {
     vfio_disable_interrupts(vdev);
 
-    vdev->msi_vectors = g_malloc0(vdev->msix->entries * sizeof(VFIOMSIVector));
+    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->msix->entries);
 
     vdev->interrupt = VFIO_INT_MSIX;
 
@@ -622,7 +623,7 @@ static void vfio_msi_enable(VFIOPCIDevice *vdev)
 
     vdev->nr_vectors = msi_nr_vectors_allocated(&vdev->pdev);
 retry:
-    vdev->msi_vectors = g_malloc0(vdev->nr_vectors * sizeof(VFIOMSIVector));
+    vdev->msi_vectors = g_new0(VFIOMSIVector, vdev->nr_vectors);
 
     for (i = 0; i < vdev->nr_vectors; i++) {
         VFIOMSIVector *vector = &vdev->msi_vectors[i];
@@ -1524,13 +1525,38 @@ static int vfio_setup_pcie_cap(VFIOPCIDevice *vdev, int pos, uint8_t size)
     }
 
     if (!pci_bus_is_express(vdev->pdev.bus)) {
+        PCIBus *bus = vdev->pdev.bus;
+        PCIDevice *bridge;
+
         /*
-         * Use express capability as-is on PCI bus.  It doesn't make much
-         * sense to even expose, but some drivers (ex. tg3) depend on it
-         * and guests don't seem to be particular about it.  We'll need
-         * to revist this or force express devices to express buses if we
-         * ever expose an IOMMU to the guest.
+         * Traditionally PCI device assignment exposes the PCIe capability
+         * as-is on non-express buses.  The reason being that some drivers
+         * simply assume that it's there, for example tg3.  However when
+         * we're running on a native PCIe machine type, like Q35, we need
+         * to hide the PCIe capability.  The reason for this is twofold;
+         * first Windows guests get a Code 10 error when the PCIe capability
+         * is exposed in this configuration.  Therefore express devices won't
+         * work at all unless they're attached to express buses in the VM.
+         * Second, a native PCIe machine introduces the possibility of fine
+         * granularity IOMMUs supporting both translation and isolation.
+         * Guest code to discover the IOMMU visibility of a device, such as
+         * IOMMU grouping code on Linux, is very aware of device types and
+         * valid transitions between bus types.  An express device on a non-
+         * express bus is not a valid combination on bare metal systems.
+         *
+         * Drivers that require a PCIe capability to make the device
+         * functional are simply going to need to have their devices placed
+         * on a PCIe bus in the VM.
          */
+        while (!pci_bus_is_root(bus)) {
+            bridge = pci_bridge_get_device(bus);
+            bus = bridge->bus;
+        }
+
+        if (pci_bus_is_express(bus)) {
+            return 0;
+        }
+
     } else if (pci_bus_is_root(vdev->pdev.bus)) {
         /*
          * On a Root Complex bus Endpoints become Root Complex Integrated
diff --git a/hw/vfio/platform.c b/hw/vfio/platform.c
index 5c1156c..289b498 100644
--- a/hw/vfio/platform.c
+++ b/hw/vfio/platform.c
@@ -478,7 +478,7 @@ static int vfio_populate_device(VFIODevice *vbasedev)
         struct vfio_region_info reg_info = { .argsz = sizeof(reg_info) };
         VFIORegion *ptr;
 
-        vdev->regions[i] = g_malloc0(sizeof(VFIORegion));
+        vdev->regions[i] = g_new0(VFIORegion, 1);
         ptr = vdev->regions[i];
         reg_info.index = i;
         ret = ioctl(vbasedev->fd, VFIO_DEVICE_GET_REGION_INFO, &reg_info);
diff --git a/hw/virtio/dataplane/vring.c b/hw/virtio/dataplane/vring.c
index 68f1994..23f667e 100644
--- a/hw/virtio/dataplane/vring.c
+++ b/hw/virtio/dataplane/vring.c
@@ -25,15 +25,30 @@
 
 /* vring_map can be coupled with vring_unmap or (if you still have the
  * value returned in *mr) memory_region_unref.
+ * Returns NULL on failure.
+ * Callers that can handle a partial mapping must supply mapped_len pointer to
+ * get the actual length mapped.
+ * Passing mapped_len == NULL requires either a full mapping or a failure.
  */
-static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
+static void *vring_map(MemoryRegion **mr, hwaddr phys,
+                       hwaddr len, hwaddr *mapped_len,
                        bool is_write)
 {
     MemoryRegionSection section = memory_region_find(get_system_memory(), phys, len);
+    uint64_t size;
 
-    if (!section.mr || int128_get64(section.size) < len) {
+    if (!section.mr) {
         goto out;
     }
+
+    size = int128_get64(section.size);
+    assert(size);
+
+    /* Passing mapped_len == NULL requires either a full mapping or a failure. */
+    if (!mapped_len && size < len) {
+        goto out;
+    }
+
     if (is_write && section.readonly) {
         goto out;
     }
@@ -46,6 +61,10 @@ static void *vring_map(MemoryRegion **mr, hwaddr phys, hwaddr len,
         goto out;
     }
 
+    if (mapped_len) {
+        *mapped_len = MIN(size, len);
+    }
+
     *mr = section.mr;
     return memory_region_get_ram_ptr(section.mr) + section.offset_within_region;
 
@@ -78,7 +97,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     addr = virtio_queue_get_desc_addr(vdev, n);
     size = virtio_queue_get_desc_size(vdev, n);
     /* Map the descriptor area as read only */
-    ptr = vring_map(&vring->mr_desc, addr, size, false);
+    ptr = vring_map(&vring->mr_desc, addr, size, NULL, false);
     if (!ptr) {
         error_report("Failed to map 0x%" HWADDR_PRIx " byte for vring desc "
                      "at 0x%" HWADDR_PRIx,
@@ -92,7 +111,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     /* Add the size of the used_event_idx */
     size += sizeof(uint16_t);
     /* Map the driver area as read only */
-    ptr = vring_map(&vring->mr_avail, addr, size, false);
+    ptr = vring_map(&vring->mr_avail, addr, size, NULL, false);
     if (!ptr) {
         error_report("Failed to map 0x%" HWADDR_PRIx " byte for vring avail "
                      "at 0x%" HWADDR_PRIx,
@@ -106,7 +125,7 @@ bool vring_setup(Vring *vring, VirtIODevice *vdev, int n)
     /* Add the size of the avail_event_idx */
     size += sizeof(uint16_t);
     /* Map the device area as read-write */
-    ptr = vring_map(&vring->mr_used, addr, size, true);
+    ptr = vring_map(&vring->mr_used, addr, size, NULL, true);
     if (!ptr) {
         error_report("Failed to map 0x%" HWADDR_PRIx " byte for vring used "
                      "at 0x%" HWADDR_PRIx,
@@ -206,6 +225,7 @@ static int get_desc(Vring *vring, VirtQueueElement *elem,
     struct iovec *iov;
     hwaddr *addr;
     MemoryRegion *mr;
+    hwaddr len;
 
     if (desc->flags & VRING_DESC_F_WRITE) {
         num = &elem->in_num;
@@ -224,26 +244,30 @@ static int get_desc(Vring *vring, VirtQueueElement *elem,
         }
     }
 
-    /* Stop for now if there are not enough iovecs available. */
-    if (*num >= VIRTQUEUE_MAX_SIZE) {
-        error_report("Invalid SG num: %u", *num);
-        return -EFAULT;
-    }
+    while (desc->len) {
+        /* Stop for now if there are not enough iovecs available. */
+        if (*num >= VIRTQUEUE_MAX_SIZE) {
+            error_report("Invalid SG num: %u", *num);
+            return -EFAULT;
+        }
 
-    /* TODO handle non-contiguous memory across region boundaries */
-    iov->iov_base = vring_map(&mr, desc->addr, desc->len,
-                              desc->flags & VRING_DESC_F_WRITE);
-    if (!iov->iov_base) {
-        error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
-                     (uint64_t)desc->addr, desc->len);
-        return -EFAULT;
+        iov->iov_base = vring_map(&mr, desc->addr, desc->len, &len,
+                                  desc->flags & VRING_DESC_F_WRITE);
+        if (!iov->iov_base) {
+            error_report("Failed to map descriptor addr %#" PRIx64 " len %u",
+                         (uint64_t)desc->addr, desc->len);
+            return -EFAULT;
+        }
+
+        /* The MemoryRegion is looked up again and unref'ed later, leave the
+         * ref in place.  */
+        (iov++)->iov_len = len;
+        *addr++ = desc->addr;
+        desc->len -= len;
+        desc->addr += len;
+        *num += 1;
     }
 
-    /* The MemoryRegion is looked up again and unref'ed later, leave the
-     * ref in place.  */
-    iov->iov_len = desc->len;
-    *addr = desc->addr;
-    *num += 1;
     return 0;
 }
 
@@ -257,6 +281,21 @@ static void copy_in_vring_desc(VirtIODevice *vdev,
     host->next = virtio_lduw_p(vdev, &guest->next);
 }
 
+static bool read_vring_desc(VirtIODevice *vdev,
+                            hwaddr guest,
+                            struct vring_desc *host)
+{
+    if (address_space_read(&address_space_memory, guest, MEMTXATTRS_UNSPECIFIED,
+                           (uint8_t *)host, sizeof *host)) {
+        return false;
+    }
+    host->addr = virtio_tswap64(vdev, host->addr);
+    host->len = virtio_tswap32(vdev, host->len);
+    host->flags = virtio_tswap16(vdev, host->flags);
+    host->next = virtio_tswap16(vdev, host->next);
+    return true;
+}
+
 /* This is stolen from linux/drivers/vhost/vhost.c. */
 static int get_indirect(VirtIODevice *vdev, Vring *vring,
                         VirtQueueElement *elem, struct vring_desc *indirect)
@@ -284,23 +323,16 @@ static int get_indirect(VirtIODevice *vdev, Vring *vring,
     }
 
     do {
-        struct vring_desc *desc_ptr;
-        MemoryRegion *mr;
-
         /* Translate indirect descriptor */
-        desc_ptr = vring_map(&mr,
-                             indirect->addr + found * sizeof(desc),
-                             sizeof(desc), false);
-        if (!desc_ptr) {
-            error_report("Failed to map indirect descriptor "
+        if (!read_vring_desc(vdev, indirect->addr + found * sizeof(desc),
+                             &desc)) {
+            error_report("Failed to read indirect descriptor "
                          "addr %#" PRIx64 " len %zu",
                          (uint64_t)indirect->addr + found * sizeof(desc),
                          sizeof(desc));
             vring->broken = true;
             return -EFAULT;
         }
-        copy_in_vring_desc(vdev, desc_ptr, &desc);
-        memory_region_unref(mr);
 
         /* Ensure descriptor has been loaded before accessing fields */
         barrier(); /* read_barrier_depends(); */
diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 1d5f684..b734a60 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -156,7 +156,7 @@ static int vhost_kernel_set_owner(struct vhost_dev *dev)
 
 static int vhost_kernel_reset_device(struct vhost_dev *dev)
 {
-    return vhost_kernel_call(dev, VHOST_RESET_DEVICE, NULL);
+    return vhost_kernel_call(dev, VHOST_RESET_OWNER, NULL);
 }
 
 static int vhost_kernel_get_vq_index(struct vhost_dev *dev, int idx)
diff --git a/hw/virtio/vhost-user.c b/hw/virtio/vhost-user.c
index 83c84f1..c443602 100644
--- a/hw/virtio/vhost-user.c
+++ b/hw/virtio/vhost-user.c
@@ -43,7 +43,7 @@ typedef enum VhostUserRequest {
     VHOST_USER_GET_FEATURES = 1,
     VHOST_USER_SET_FEATURES = 2,
     VHOST_USER_SET_OWNER = 3,
-    VHOST_USER_RESET_DEVICE = 4,
+    VHOST_USER_RESET_OWNER = 4,
     VHOST_USER_SET_MEM_TABLE = 5,
     VHOST_USER_SET_LOG_BASE = 6,
     VHOST_USER_SET_LOG_FD = 7,
@@ -75,6 +75,11 @@ typedef struct VhostUserMemory {
     VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
 } VhostUserMemory;
 
+typedef struct VhostUserLog {
+    uint64_t mmap_size;
+    uint64_t mmap_offset;
+} VhostUserLog;
+
 typedef struct VhostUserMsg {
     VhostUserRequest request;
 
@@ -89,6 +94,7 @@ typedef struct VhostUserMsg {
         struct vhost_vring_state state;
         struct vhost_vring_addr addr;
         VhostUserMemory memory;
+        VhostUserLog log;
     } payload;
 } QEMU_PACKED VhostUserMsg;
 
@@ -157,7 +163,7 @@ static bool vhost_user_one_time_request(VhostUserRequest request)
 {
     switch (request) {
     case VHOST_USER_SET_OWNER:
-    case VHOST_USER_RESET_DEVICE:
+    case VHOST_USER_RESET_OWNER:
     case VHOST_USER_SET_MEM_TABLE:
     case VHOST_USER_GET_QUEUE_NUM:
         return true;
@@ -200,8 +206,9 @@ static int vhost_user_set_log_base(struct vhost_dev *dev, uint64_t base,
     VhostUserMsg msg = {
         .request = VHOST_USER_SET_LOG_BASE,
         .flags = VHOST_USER_VERSION,
-        .payload.u64 = base,
-        .size = sizeof(msg.payload.u64),
+        .payload.log.mmap_size = log->size,
+        .payload.log.mmap_offset = 0,
+        .size = sizeof(msg.payload.log),
     };
 
     if (shmfd && log->fd != -1) {
@@ -486,7 +493,7 @@ static int vhost_user_set_owner(struct vhost_dev *dev)
 static int vhost_user_reset_device(struct vhost_dev *dev)
 {
     VhostUserMsg msg = {
-        .request = VHOST_USER_RESET_DEVICE,
+        .request = VHOST_USER_RESET_OWNER,
         .flags = VHOST_USER_VERSION,
     };
 
diff --git a/hw/virtio/vhost.c b/hw/virtio/vhost.c
index de29968..1794f0d 100644
--- a/hw/virtio/vhost.c
+++ b/hw/virtio/vhost.c
@@ -1226,6 +1226,11 @@ int vhost_dev_start(struct vhost_dev *hdev, VirtIODevice *vdev)
         }
     }
 
+    if (hdev->vhost_ops->vhost_set_vring_enable) {
+        /* only enable first vq pair by default */
+        hdev->vhost_ops->vhost_set_vring_enable(hdev, hdev->vq_index == 0);
+    }
+
     return 0;
 fail_log:
     vhost_log_put(hdev, false);
@@ -1256,6 +1261,10 @@ void vhost_dev_stop(struct vhost_dev *hdev, VirtIODevice *vdev)
                              hdev->vq_index + i);
     }
 
+    if (hdev->vhost_ops->vhost_set_vring_enable) {
+        hdev->vhost_ops->vhost_set_vring_enable(hdev, 0);
+    }
+
     vhost_log_put(hdev, true);
     hdev->started = false;
     hdev->log = NULL;
diff --git a/hw/virtio/virtio-balloon.c b/hw/virtio/virtio-balloon.c
index c419b17..9671635 100644
--- a/hw/virtio/virtio-balloon.c
+++ b/hw/virtio/virtio-balloon.c
@@ -37,9 +37,11 @@
 static void balloon_page(void *addr, int deflate)
 {
 #if defined(__linux__)
-    if (!kvm_enabled() || kvm_has_sync_mmu())
+    if (!qemu_balloon_is_inhibited() && (!kvm_enabled() ||
+                                         kvm_has_sync_mmu())) {
         qemu_madvise(addr, TARGET_PAGE_SIZE,
                 deflate ? QEMU_MADV_WILLNEED : QEMU_MADV_DONTNEED);
+    }
 #endif
 }
 
diff --git a/hw/virtio/virtio-pci.c b/hw/virtio/virtio-pci.c
index f55dd2b..dd48562 100644
--- a/hw/virtio/virtio-pci.c
+++ b/hw/virtio/virtio-pci.c
@@ -86,6 +86,129 @@ static void virtio_pci_save_config(DeviceState *d, QEMUFile *f)
         qemu_put_be16(f, vdev->config_vector);
 }
 
+static void virtio_pci_load_modern_queue_state(VirtIOPCIQueue *vq,
+                                               QEMUFile *f)
+{
+    vq->num = qemu_get_be16(f);
+    vq->enabled = qemu_get_be16(f);
+    vq->desc[0] = qemu_get_be32(f);
+    vq->desc[1] = qemu_get_be32(f);
+    vq->avail[0] = qemu_get_be32(f);
+    vq->avail[1] = qemu_get_be32(f);
+    vq->used[0] = qemu_get_be32(f);
+    vq->used[1] = qemu_get_be32(f);
+}
+
+static bool virtio_pci_has_extra_state(DeviceState *d)
+{
+    VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+    return proxy->flags & VIRTIO_PCI_FLAG_MIGRATE_EXTRA;
+}
+
+static int get_virtio_pci_modern_state(QEMUFile *f, void *pv, size_t size)
+{
+    VirtIOPCIProxy *proxy = pv;
+    int i;
+
+    proxy->dfselect = qemu_get_be32(f);
+    proxy->gfselect = qemu_get_be32(f);
+    proxy->guest_features[0] = qemu_get_be32(f);
+    proxy->guest_features[1] = qemu_get_be32(f);
+    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        virtio_pci_load_modern_queue_state(&proxy->vqs[i], f);
+    }
+
+    return 0;
+}
+
+static void virtio_pci_save_modern_queue_state(VirtIOPCIQueue *vq,
+                                               QEMUFile *f)
+{
+    qemu_put_be16(f, vq->num);
+    qemu_put_be16(f, vq->enabled);
+    qemu_put_be32(f, vq->desc[0]);
+    qemu_put_be32(f, vq->desc[1]);
+    qemu_put_be32(f, vq->avail[0]);
+    qemu_put_be32(f, vq->avail[1]);
+    qemu_put_be32(f, vq->used[0]);
+    qemu_put_be32(f, vq->used[1]);
+}
+
+static void put_virtio_pci_modern_state(QEMUFile *f, void *pv, size_t size)
+{
+    VirtIOPCIProxy *proxy = pv;
+    int i;
+
+    qemu_put_be32(f, proxy->dfselect);
+    qemu_put_be32(f, proxy->gfselect);
+    qemu_put_be32(f, proxy->guest_features[0]);
+    qemu_put_be32(f, proxy->guest_features[1]);
+    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        virtio_pci_save_modern_queue_state(&proxy->vqs[i], f);
+    }
+}
+
+static const VMStateInfo vmstate_info_virtio_pci_modern_state = {
+    .name = "virtqueue_state",
+    .get = get_virtio_pci_modern_state,
+    .put = put_virtio_pci_modern_state,
+};
+
+static bool virtio_pci_modern_state_needed(void *opaque)
+{
+    VirtIOPCIProxy *proxy = opaque;
+
+    return !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+}
+
+static const VMStateDescription vmstate_virtio_pci_modern_state = {
+    .name = "virtio_pci/modern_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_pci_modern_state_needed,
+    .fields = (VMStateField[]) {
+        {
+            .name         = "modern_state",
+            .version_id   = 0,
+            .field_exists = NULL,
+            .size         = 0,
+            .info         = &vmstate_info_virtio_pci_modern_state,
+            .flags        = VMS_SINGLE,
+            .offset       = 0,
+        },
+        VMSTATE_END_OF_LIST()
+    }
+};
+
+static const VMStateDescription vmstate_virtio_pci = {
+    .name = "virtio_pci",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .minimum_version_id_old = 1,
+    .fields = (VMStateField[]) {
+        VMSTATE_END_OF_LIST()
+    },
+    .subsections = (const VMStateDescription*[]) {
+        &vmstate_virtio_pci_modern_state,
+        NULL
+    }
+};
+
+static void virtio_pci_save_extra_state(DeviceState *d, QEMUFile *f)
+{
+    VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+    vmstate_save_state(f, &vmstate_virtio_pci, proxy, NULL);
+}
+
+static int virtio_pci_load_extra_state(DeviceState *d, QEMUFile *f)
+{
+    VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
+
+    return vmstate_load_state(f, &vmstate_virtio_pci, proxy, 1);
+}
+
 static void virtio_pci_save_queue(DeviceState *d, int n, QEMUFile *f)
 {
     VirtIOPCIProxy *proxy = to_virtio_pci_proxy(d);
@@ -133,6 +256,7 @@ static int virtio_pci_load_queue(DeviceState *d, int n, QEMUFile *f)
     if (vector != VIRTIO_NO_VECTOR) {
         return msix_vector_use(&proxy->pci_dev, vector);
     }
+
     return 0;
 }
 
@@ -146,7 +270,10 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
     EventNotifier *notifier = virtio_queue_get_host_notifier(vq);
     bool legacy = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_LEGACY);
     bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+    bool fast_mmio = kvm_ioeventfd_any_length_enabled();
+    bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
     MemoryRegion *modern_mr = &proxy->notify.mr;
+    MemoryRegion *modern_notify_mr = &proxy->notify_pio.mr;
     MemoryRegion *legacy_mr = &proxy->bar;
     hwaddr modern_addr = QEMU_VIRTIO_PCI_QUEUE_MEM_MULT *
                          virtio_get_queue_index(vq);
@@ -162,8 +289,17 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
         }
         virtio_queue_set_host_notifier_fd_handler(vq, true, set_handler);
         if (modern) {
-            memory_region_add_eventfd(modern_mr, modern_addr, 2,
-                                      true, n, notifier);
+            if (fast_mmio) {
+                memory_region_add_eventfd(modern_mr, modern_addr, 0,
+                                          false, n, notifier);
+            } else {
+                memory_region_add_eventfd(modern_mr, modern_addr, 2,
+                                          false, n, notifier);
+            }
+            if (modern_pio) {
+                memory_region_add_eventfd(modern_notify_mr, 0, 2,
+                                              true, n, notifier);
+            }
         }
         if (legacy) {
             memory_region_add_eventfd(legacy_mr, legacy_addr, 2,
@@ -171,8 +307,17 @@ static int virtio_pci_set_host_notifier_internal(VirtIOPCIProxy *proxy,
         }
     } else {
         if (modern) {
-            memory_region_del_eventfd(modern_mr, modern_addr, 2,
-                                      true, n, notifier);
+            if (fast_mmio) {
+                memory_region_del_eventfd(modern_mr, modern_addr, 0,
+                                          false, n, notifier);
+            } else {
+                memory_region_del_eventfd(modern_mr, modern_addr, 2,
+                                          false, n, notifier);
+            }
+            if (modern_pio) {
+                memory_region_del_eventfd(modern_notify_mr, 0, 2,
+                                          true, n, notifier);
+            }
         }
         if (legacy) {
             memory_region_del_eventfd(legacy_mr, legacy_addr, 2,
@@ -1239,6 +1384,7 @@ static void virtio_pci_common_write(void *opaque, hwaddr addr,
                        proxy->vqs[vdev->queue_sel].avail[0],
                        ((uint64_t)proxy->vqs[vdev->queue_sel].used[1]) << 32 |
                        proxy->vqs[vdev->queue_sel].used[0]);
+        proxy->vqs[vdev->queue_sel].enabled = 1;
         break;
     case VIRTIO_PCI_COMMON_Q_DESCLO:
         proxy->vqs[vdev->queue_sel].desc[0] = val;
@@ -1281,6 +1427,17 @@ static void virtio_pci_notify_write(void *opaque, hwaddr addr,
     }
 }
 
+static void virtio_pci_notify_write_pio(void *opaque, hwaddr addr,
+                                        uint64_t val, unsigned size)
+{
+    VirtIODevice *vdev = opaque;
+    unsigned queue = val;
+
+    if (queue < VIRTIO_QUEUE_MAX) {
+        virtio_queue_notify(vdev, queue);
+    }
+}
+
 static uint64_t virtio_pci_isr_read(void *opaque, hwaddr addr,
                                     unsigned size)
 {
@@ -1374,6 +1531,16 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy)
         },
         .endianness = DEVICE_LITTLE_ENDIAN,
     };
+    static const MemoryRegionOps notify_pio_ops = {
+        .read = virtio_pci_notify_read,
+        .write = virtio_pci_notify_write_pio,
+        .impl = {
+            .min_access_size = 1,
+            .max_access_size = 4,
+        },
+        .endianness = DEVICE_LITTLE_ENDIAN,
+    };
+
 
     memory_region_init_io(&proxy->common.mr, OBJECT(proxy),
                           &common_ops,
@@ -1398,30 +1565,60 @@ static void virtio_pci_modern_regions_init(VirtIOPCIProxy *proxy)
                           virtio_bus_get_device(&proxy->bus),
                           "virtio-pci-notify",
                           proxy->notify.size);
+
+    memory_region_init_io(&proxy->notify_pio.mr, OBJECT(proxy),
+                          &notify_pio_ops,
+                          virtio_bus_get_device(&proxy->bus),
+                          "virtio-pci-notify-pio",
+                          proxy->notify.size);
 }
 
 static void virtio_pci_modern_region_map(VirtIOPCIProxy *proxy,
                                          VirtIOPCIRegion *region,
-                                         struct virtio_pci_cap *cap)
+                                         struct virtio_pci_cap *cap,
+                                         MemoryRegion *mr,
+                                         uint8_t bar)
 {
-    memory_region_add_subregion(&proxy->modern_bar,
-                                region->offset,
-                                &region->mr);
+    memory_region_add_subregion(mr, region->offset, &region->mr);
 
     cap->cfg_type = region->type;
-    cap->bar = proxy->modern_mem_bar;
+    cap->bar = bar;
     cap->offset = cpu_to_le32(region->offset);
     cap->length = cpu_to_le32(region->size);
     virtio_pci_add_mem_cap(proxy, cap);
+
+}
+
+static void virtio_pci_modern_mem_region_map(VirtIOPCIProxy *proxy,
+                                             VirtIOPCIRegion *region,
+                                             struct virtio_pci_cap *cap)
+{
+    virtio_pci_modern_region_map(proxy, region, cap,
+                                 &proxy->modern_bar, proxy->modern_mem_bar);
 }
 
-static void virtio_pci_modern_region_unmap(VirtIOPCIProxy *proxy,
-                                           VirtIOPCIRegion *region)
+static void virtio_pci_modern_io_region_map(VirtIOPCIProxy *proxy,
+                                            VirtIOPCIRegion *region,
+                                            struct virtio_pci_cap *cap)
+{
+    virtio_pci_modern_region_map(proxy, region, cap,
+                                 &proxy->io_bar, proxy->modern_io_bar);
+}
+
+static void virtio_pci_modern_mem_region_unmap(VirtIOPCIProxy *proxy,
+                                               VirtIOPCIRegion *region)
 {
     memory_region_del_subregion(&proxy->modern_bar,
                                 &region->mr);
 }
 
+static void virtio_pci_modern_io_region_unmap(VirtIOPCIProxy *proxy,
+                                              VirtIOPCIRegion *region)
+{
+    memory_region_del_subregion(&proxy->io_bar,
+                                &region->mr);
+}
+
 /* This is called by virtio-bus just after the device is plugged. */
 static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
 {
@@ -1429,6 +1626,7 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
     VirtioBusState *bus = &proxy->bus;
     bool legacy = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_LEGACY);
     bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+    bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
     uint8_t *config;
     uint32_t size;
     VirtIODevice *vdev = virtio_bus_get_device(&proxy->bus);
@@ -1467,16 +1665,31 @@ static void virtio_pci_device_plugged(DeviceState *d, Error **errp)
             .cap.cap_len = sizeof cfg,
             .cap.cfg_type = VIRTIO_PCI_CAP_PCI_CFG,
         };
-        struct virtio_pci_cfg_cap *cfg_mask;
+        struct virtio_pci_notify_cap notify_pio = {
+            .cap.cap_len = sizeof notify,
+            .notify_off_multiplier = cpu_to_le32(0x0),
+        };
 
-        /* TODO: add io access for speed */
+        struct virtio_pci_cfg_cap *cfg_mask;
 
         virtio_add_feature(&vdev->host_features, VIRTIO_F_VERSION_1);
         virtio_pci_modern_regions_init(proxy);
-        virtio_pci_modern_region_map(proxy, &proxy->common, &cap);
-        virtio_pci_modern_region_map(proxy, &proxy->isr, &cap);
-        virtio_pci_modern_region_map(proxy, &proxy->device, &cap);
-        virtio_pci_modern_region_map(proxy, &proxy->notify, &notify.cap);
+
+        virtio_pci_modern_mem_region_map(proxy, &proxy->common, &cap);
+        virtio_pci_modern_mem_region_map(proxy, &proxy->isr, &cap);
+        virtio_pci_modern_mem_region_map(proxy, &proxy->device, &cap);
+        virtio_pci_modern_mem_region_map(proxy, &proxy->notify, &notify.cap);
+
+        if (modern_pio) {
+            memory_region_init(&proxy->io_bar, OBJECT(proxy),
+                               "virtio-pci-io", 0x4);
+
+            pci_register_bar(&proxy->pci_dev, proxy->modern_io_bar,
+                             PCI_BASE_ADDRESS_SPACE_IO, &proxy->io_bar);
+
+            virtio_pci_modern_io_region_map(proxy, &proxy->notify_pio,
+                                            &notify_pio.cap);
+        }
 
         pci_register_bar(&proxy->pci_dev, proxy->modern_mem_bar,
                          PCI_BASE_ADDRESS_SPACE_MEMORY |
@@ -1532,14 +1745,18 @@ static void virtio_pci_device_unplugged(DeviceState *d)
 {
     VirtIOPCIProxy *proxy = VIRTIO_PCI(d);
     bool modern = !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN);
+    bool modern_pio = proxy->flags & VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY;
 
     virtio_pci_stop_ioeventfd(proxy);
 
     if (modern) {
-        virtio_pci_modern_region_unmap(proxy, &proxy->common);
-        virtio_pci_modern_region_unmap(proxy, &proxy->isr);
-        virtio_pci_modern_region_unmap(proxy, &proxy->device);
-        virtio_pci_modern_region_unmap(proxy, &proxy->notify);
+        virtio_pci_modern_mem_region_unmap(proxy, &proxy->common);
+        virtio_pci_modern_mem_region_unmap(proxy, &proxy->isr);
+        virtio_pci_modern_mem_region_unmap(proxy, &proxy->device);
+        virtio_pci_modern_mem_region_unmap(proxy, &proxy->notify);
+        if (modern_pio) {
+            virtio_pci_modern_io_region_unmap(proxy, &proxy->notify_pio);
+        }
     }
 }
 
@@ -1559,6 +1776,7 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
      */
     proxy->legacy_io_bar  = 0;
     proxy->msix_bar       = 1;
+    proxy->modern_io_bar  = 2;
     proxy->modern_mem_bar = 4;
 
     proxy->common.offset = 0x0;
@@ -1578,6 +1796,10 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
         QEMU_VIRTIO_PCI_QUEUE_MEM_MULT * VIRTIO_QUEUE_MAX;
     proxy->notify.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
 
+    proxy->notify_pio.offset = 0x0;
+    proxy->notify_pio.size = 0x4;
+    proxy->notify_pio.type = VIRTIO_PCI_CAP_NOTIFY_CFG;
+
     /* subclasses can enforce modern, so do this unconditionally */
     memory_region_init(&proxy->modern_bar, OBJECT(proxy), "virtio-pci",
                        2 * QEMU_VIRTIO_PCI_QUEUE_MEM_MULT *
@@ -1592,6 +1814,26 @@ static void virtio_pci_realize(PCIDevice *pci_dev, Error **errp)
 
     address_space_init(&proxy->modern_as, &proxy->modern_cfg, "virtio-pci-cfg-as");
 
+    if (!(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_PCIE)
+        && !(proxy->flags & VIRTIO_PCI_FLAG_DISABLE_MODERN)
+        && pci_bus_is_express(pci_dev->bus)
+        && !pci_bus_is_root(pci_dev->bus)) {
+        int pos;
+
+        pci_dev->cap_present |= QEMU_PCI_CAP_EXPRESS;
+        pos = pcie_endpoint_cap_init(pci_dev, 0);
+        assert(pos > 0);
+
+        pos = pci_add_capability(pci_dev, PCI_CAP_ID_PM, 0, PCI_PM_SIZEOF);
+        assert(pos > 0);
+
+        /*
+         * Indicates that this function complies with revision 1.2 of the
+         * PCI Power Management Interface Specification.
+         */
+        pci_set_word(pci_dev->config + pos + PCI_PM_PMC, 0x3);
+    }
+
     virtio_pci_bus_new(&proxy->bus, sizeof(proxy->bus), proxy);
     if (k->realize) {
         k->realize(proxy, errp);
@@ -1610,9 +1852,15 @@ static void virtio_pci_reset(DeviceState *qdev)
 {
     VirtIOPCIProxy *proxy = VIRTIO_PCI(qdev);
     VirtioBusState *bus = VIRTIO_BUS(&proxy->bus);
+    int i;
+
     virtio_pci_stop_ioeventfd(proxy);
     virtio_bus_reset(bus);
     msix_unuse_all_vectors(&proxy->pci_dev);
+
+    for (i = 0; i < VIRTIO_QUEUE_MAX; i++) {
+        proxy->vqs[i].enabled = 0;
+    }
 }
 
 static Property virtio_pci_properties[] = {
@@ -1622,6 +1870,12 @@ static Property virtio_pci_properties[] = {
                     VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT, false),
     DEFINE_PROP_BIT("disable-modern", VirtIOPCIProxy, flags,
                     VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT, true),
+    DEFINE_PROP_BIT("migrate-extra", VirtIOPCIProxy, flags,
+                    VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT, true),
+    DEFINE_PROP_BIT("modern-pio-notify", VirtIOPCIProxy, flags,
+                    VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT, false),
+    DEFINE_PROP_BIT("x-disable-pcie", VirtIOPCIProxy, flags,
+                    VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT, false),
     DEFINE_PROP_END_OF_LIST(),
 };
 
@@ -2212,6 +2466,9 @@ static void virtio_pci_bus_class_init(ObjectClass *klass, void *data)
     k->load_config = virtio_pci_load_config;
     k->save_queue = virtio_pci_save_queue;
     k->load_queue = virtio_pci_load_queue;
+    k->save_extra_state = virtio_pci_save_extra_state;
+    k->load_extra_state = virtio_pci_load_extra_state;
+    k->has_extra_state = virtio_pci_has_extra_state;
     k->query_guest_notifiers = virtio_pci_query_guest_notifiers;
     k->set_host_notifier = virtio_pci_set_host_notifier;
     k->set_guest_notifiers = virtio_pci_set_guest_notifiers;
diff --git a/hw/virtio/virtio-pci.h b/hw/virtio/virtio-pci.h
index 801c23a..ffb74bb 100644
--- a/hw/virtio/virtio-pci.h
+++ b/hw/virtio/virtio-pci.h
@@ -72,8 +72,19 @@ typedef struct VirtioBusClass VirtioPCIBusClass;
 /* virtio version flags */
 #define VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT 2
 #define VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT 3
+#define VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT 4
 #define VIRTIO_PCI_FLAG_DISABLE_LEGACY (1 << VIRTIO_PCI_FLAG_DISABLE_LEGACY_BIT)
 #define VIRTIO_PCI_FLAG_DISABLE_MODERN (1 << VIRTIO_PCI_FLAG_DISABLE_MODERN_BIT)
+#define VIRTIO_PCI_FLAG_DISABLE_PCIE (1 << VIRTIO_PCI_FLAG_DISABLE_PCIE_BIT)
+
+/* migrate extra state */
+#define VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT 4
+#define VIRTIO_PCI_FLAG_MIGRATE_EXTRA (1 << VIRTIO_PCI_FLAG_MIGRATE_EXTRA_BIT)
+
+/* have pio notification for modern device ? */
+#define VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT 5
+#define VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY \
+    (1 << VIRTIO_PCI_FLAG_MODERN_PIO_NOTIFY_BIT)
 
 typedef struct {
     MSIMessage msg;
@@ -104,6 +115,14 @@ typedef struct VirtIOPCIRegion {
     uint32_t type;
 } VirtIOPCIRegion;
 
+typedef struct VirtIOPCIQueue {
+  uint16_t num;
+  bool enabled;
+  uint32_t desc[2];
+  uint32_t avail[2];
+  uint32_t used[2];
+} VirtIOPCIQueue;
+
 struct VirtIOPCIProxy {
     PCIDevice pci_dev;
     MemoryRegion bar;
@@ -111,11 +130,14 @@ struct VirtIOPCIProxy {
     VirtIOPCIRegion isr;
     VirtIOPCIRegion device;
     VirtIOPCIRegion notify;
+    VirtIOPCIRegion notify_pio;
     MemoryRegion modern_bar;
+    MemoryRegion io_bar;
     MemoryRegion modern_cfg;
     AddressSpace modern_as;
     uint32_t legacy_io_bar;
     uint32_t msix_bar;
+    uint32_t modern_io_bar;
     uint32_t modern_mem_bar;
     int config_cap;
     uint32_t flags;
@@ -124,13 +146,7 @@ struct VirtIOPCIProxy {
     uint32_t dfselect;
     uint32_t gfselect;
     uint32_t guest_features[2];
-    struct {
-        uint16_t num;
-        bool enabled;
-        uint32_t desc[2];
-        uint32_t avail[2];
-        uint32_t used[2];
-    } vqs[VIRTIO_QUEUE_MAX];
+    VirtIOPCIQueue vqs[VIRTIO_QUEUE_MAX];
 
     bool ioeventfd_disabled;
     bool ioeventfd_started;
diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
index 939f802..1edef59 100644
--- a/hw/virtio/virtio.c
+++ b/hw/virtio/virtio.c
@@ -1116,6 +1116,16 @@ static bool virtio_ringsize_needed(void *opaque)
     return false;
 }
 
+static bool virtio_extra_state_needed(void *opaque)
+{
+    VirtIODevice *vdev = opaque;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    return k->has_extra_state &&
+        k->has_extra_state(qbus->parent);
+}
+
 static void put_virtqueue_state(QEMUFile *f, void *pv, size_t size)
 {
     VirtIODevice *vdev = pv;
@@ -1210,6 +1220,53 @@ static const VMStateDescription vmstate_virtio_ringsize = {
     }
 };
 
+static int get_extra_state(QEMUFile *f, void *pv, size_t size)
+{
+    VirtIODevice *vdev = pv;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    if (!k->load_extra_state) {
+        return -1;
+    } else {
+        return k->load_extra_state(qbus->parent, f);
+    }
+}
+
+static void put_extra_state(QEMUFile *f, void *pv, size_t size)
+{
+    VirtIODevice *vdev = pv;
+    BusState *qbus = qdev_get_parent_bus(DEVICE(vdev));
+    VirtioBusClass *k = VIRTIO_BUS_GET_CLASS(qbus);
+
+    k->save_extra_state(qbus->parent, f);
+}
+
+static const VMStateInfo vmstate_info_extra_state = {
+    .name = "virtqueue_extra_state",
+    .get = get_extra_state,
+    .put = put_extra_state,
+};
+
+static const VMStateDescription vmstate_virtio_extra_state = {
+    .name = "virtio/extra_state",
+    .version_id = 1,
+    .minimum_version_id = 1,
+    .needed = &virtio_extra_state_needed,
+    .fields = (VMStateField[]) {
+        {
+            .name         = "extra_state",
+            .version_id   = 0,
+            .field_exists = NULL,
+            .size         = 0,
+            .info         = &vmstate_info_extra_state,
+            .flags        = VMS_SINGLE,
+            .offset       = 0,
+        },
+        VMSTATE_END_OF_LIST()
+    }
+};
+
 static const VMStateDescription vmstate_virtio_device_endian = {
     .name = "virtio/device_endian",
     .version_id = 1,
@@ -1245,6 +1302,7 @@ static const VMStateDescription vmstate_virtio = {
         &vmstate_virtio_64bit_features,
         &vmstate_virtio_virtqueues,
         &vmstate_virtio_ringsize,
+        &vmstate_virtio_extra_state,
         NULL
     }
 };
diff --git a/hw/xen/xen_pt_config_init.c b/hw/xen/xen_pt_config_init.c
index 0efee11..3d8686d 100644
--- a/hw/xen/xen_pt_config_init.c
+++ b/hw/xen/xen_pt_config_init.c
@@ -1937,7 +1937,7 @@ static int xen_pt_config_reg_init(XenPCIPassthroughState *s,
                 break;
         case 4: rc = xen_host_pci_get_long(&s->real_device, offset, &val);
                 break;
-        default: assert(1);
+        default: abort();
         }
         if (rc) {
             /* Serious issues when we cannot read the host values! */
@@ -1982,7 +1982,7 @@ static int xen_pt_config_reg_init(XenPCIPassthroughState *s,
                 break;
         case 4: pci_set_long(s->dev.config + offset, val);
                 break;
-        default: assert(1);
+        default: abort();
         }
         /* set register value pointer to the data. */
         reg_entry->ptr.byte = s->dev.config + offset;
diff --git a/include/block/accounting.h b/include/block/accounting.h
index 66637cd..0f46cb4 100644
--- a/include/block/accounting.h
+++ b/include/block/accounting.h
@@ -2,6 +2,7 @@
  * QEMU System Emulator block accounting
  *
  * Copyright (c) 2011 Christoph Hellwig
+ * Copyright (c) 2015 Igalia, S.L.
  *
  * Permission is hereby granted, free of charge, to any person obtaining a copy
  * of this software and associated documentation files (the "Software"), to deal
@@ -25,8 +26,12 @@
 #define BLOCK_ACCOUNTING_H
 
 #include <stdint.h>
+#include <stdbool.h>
 
 #include "qemu/typedefs.h"
+#include "qemu/timed-average.h"
+
+typedef struct BlockAcctTimedStats BlockAcctTimedStats;
 
 enum BlockAcctType {
     BLOCK_ACCT_READ,
@@ -35,11 +40,23 @@ enum BlockAcctType {
     BLOCK_MAX_IOTYPE,
 };
 
+struct BlockAcctTimedStats {
+    TimedAverage latency[BLOCK_MAX_IOTYPE];
+    unsigned interval_length; /* in seconds */
+    QSLIST_ENTRY(BlockAcctTimedStats) entries;
+};
+
 typedef struct BlockAcctStats {
     uint64_t nr_bytes[BLOCK_MAX_IOTYPE];
     uint64_t nr_ops[BLOCK_MAX_IOTYPE];
+    uint64_t invalid_ops[BLOCK_MAX_IOTYPE];
+    uint64_t failed_ops[BLOCK_MAX_IOTYPE];
     uint64_t total_time_ns[BLOCK_MAX_IOTYPE];
     uint64_t merged[BLOCK_MAX_IOTYPE];
+    int64_t last_access_time_ns;
+    QSLIST_HEAD(, BlockAcctTimedStats) intervals;
+    bool account_invalid;
+    bool account_failed;
 } BlockAcctStats;
 
 typedef struct BlockAcctCookie {
@@ -48,10 +65,21 @@ typedef struct BlockAcctCookie {
     enum BlockAcctType type;
 } BlockAcctCookie;
 
+void block_acct_init(BlockAcctStats *stats, bool account_invalid,
+                     bool account_failed);
+void block_acct_cleanup(BlockAcctStats *stats);
+void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length);
+BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats,
+                                              BlockAcctTimedStats *s);
 void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie,
                       int64_t bytes, enum BlockAcctType type);
 void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie);
+void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie);
+void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type);
 void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type,
                            int num_requests);
+int64_t block_acct_idle_time_ns(BlockAcctStats *stats);
+double block_acct_queue_depth(BlockAcctTimedStats *stats,
+                              enum BlockAcctType type);
 
 #endif
diff --git a/include/block/aio.h b/include/block/aio.h
index bcc7d43..e086e3b 100644
--- a/include/block/aio.h
+++ b/include/block/aio.h
@@ -124,6 +124,11 @@ struct AioContext {
     QEMUTimerListGroup tlg;
 
     int external_disable_cnt;
+
+    /* epoll(7) state used when built with CONFIG_EPOLL */
+    int epollfd;
+    bool epoll_enabled;
+    bool epoll_available;
 };
 
 /**
@@ -209,6 +214,11 @@ void aio_notify(AioContext *ctx);
 void aio_notify_accept(AioContext *ctx);
 
 /**
+ * aio_bh_call: Executes callback function of the specified BH.
+ */
+void aio_bh_call(QEMUBH *bh);
+
+/**
  * aio_bh_poll: Poll bottom halves for an AioContext.
  *
  * These are internal functions used by the QEMU main loop.
@@ -401,6 +411,17 @@ static inline void aio_enable_external(AioContext *ctx)
 }
 
 /**
+ * aio_external_disabled:
+ * @ctx: the aio context
+ *
+ * Return true if the external clients are disabled.
+ */
+static inline bool aio_external_disabled(AioContext *ctx)
+{
+    return atomic_read(&ctx->external_disable_cnt);
+}
+
+/**
  * aio_node_check:
  * @ctx: the aio context
  * @is_external: Whether or not the checked node is an external event source.
@@ -413,4 +434,12 @@ static inline bool aio_node_check(AioContext *ctx, bool is_external)
     return !is_external || !atomic_read(&ctx->external_disable_cnt);
 }
 
+/**
+ * aio_context_setup:
+ * @ctx: the aio context
+ *
+ * Initialize the aio context.
+ */
+void aio_context_setup(AioContext *ctx, Error **errp);
+
 #endif
diff --git a/include/block/block.h b/include/block/block.h
index 610db92..73edb1a 100644
--- a/include/block/block.h
+++ b/include/block/block.h
@@ -14,6 +14,7 @@ typedef struct BlockDriver BlockDriver;
 typedef struct BlockJob BlockJob;
 typedef struct BdrvChild BdrvChild;
 typedef struct BdrvChildRole BdrvChildRole;
+typedef struct BlockJobTxn BlockJobTxn;
 
 typedef struct BlockDriverInfo {
     /* in bytes, 0 if irrelevant */
@@ -335,10 +336,18 @@ void bdrv_aio_cancel_async(BlockAIOCB *acb);
 
 typedef struct BlockRequest {
     /* Fields to be filled by multiwrite caller */
-    int64_t sector;
-    int nb_sectors;
-    int flags;
-    QEMUIOVector *qiov;
+    union {
+        struct {
+            int64_t sector;
+            int nb_sectors;
+            int flags;
+            QEMUIOVector *qiov;
+        };
+        struct {
+            int req;
+            void *buf;
+        };
+    };
     BlockCompletionFunc *cb;
     void *opaque;
 
@@ -493,7 +502,6 @@ void bdrv_set_dirty_bitmap(BdrvDirtyBitmap *bitmap,
                            int64_t cur_sector, int nr_sectors);
 void bdrv_reset_dirty_bitmap(BdrvDirtyBitmap *bitmap,
                              int64_t cur_sector, int nr_sectors);
-void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap);
 void bdrv_dirty_iter_init(BdrvDirtyBitmap *bitmap, struct HBitmapIter *hbi);
 void bdrv_set_dirty_iter(struct HBitmapIter *hbi, int64_t offset);
 int64_t bdrv_get_dirty_count(BdrvDirtyBitmap *bitmap);
diff --git a/include/block/block_int.h b/include/block/block_int.h
index 3ceeb5a..4012e36 100644
--- a/include/block/block_int.h
+++ b/include/block/block_int.h
@@ -60,11 +60,19 @@
 
 #define BLOCK_PROBE_BUF_SIZE        512
 
+enum BdrvTrackedRequestType {
+    BDRV_TRACKED_READ,
+    BDRV_TRACKED_WRITE,
+    BDRV_TRACKED_FLUSH,
+    BDRV_TRACKED_IOCTL,
+    BDRV_TRACKED_DISCARD,
+};
+
 typedef struct BdrvTrackedRequest {
     BlockDriverState *bs;
     int64_t offset;
     unsigned int bytes;
-    bool is_write;
+    enum BdrvTrackedRequestType type;
 
     bool serialising;
     int64_t overlap_offset;
@@ -219,7 +227,6 @@ struct BlockDriver {
     void (*bdrv_lock_medium)(BlockDriverState *bs, bool locked);
 
     /* to control generic scsi devices */
-    int (*bdrv_ioctl)(BlockDriverState *bs, unsigned long int req, void *buf);
     BlockAIOCB *(*bdrv_aio_ioctl)(BlockDriverState *bs,
         unsigned long int req, void *buf,
         BlockCompletionFunc *cb, void *opaque);
@@ -288,6 +295,12 @@ struct BlockDriver {
      */
     int (*bdrv_probe_geometry)(BlockDriverState *bs, HDGeometry *geo);
 
+    /**
+     * Drain and stop any internal sources of requests in the driver, and
+     * remain so until next I/O callback (e.g. bdrv_co_writev) is called.
+     */
+    void (*bdrv_drain)(BlockDriverState *bs);
+
     QLIST_ENTRY(BlockDriver) list;
 };
 
@@ -390,7 +403,10 @@ struct BlockDriverState {
     /* number of in-flight serialising requests */
     unsigned int serialising_in_flight;
 
-    /* I/O throttling */
+    /* I/O throttling.
+     * throttle_state tells us if this BDS has I/O limits configured.
+     * io_limits_enabled tells us if they are currently being
+     * enforced, but it can be temporarily set to false */
     CoQueue      throttled_reqs[2];
     bool         io_limits_enabled;
     /* The following fields are protected by the ThrottleGroup lock.
@@ -473,6 +489,8 @@ extern BlockDriver bdrv_file;
 extern BlockDriver bdrv_raw;
 extern BlockDriver bdrv_qcow2;
 
+extern QTAILQ_HEAD(BdrvStates, BlockDriverState) bdrv_states;
+
 /**
  * bdrv_setup_io_funcs:
  *
@@ -651,6 +669,7 @@ void mirror_start(BlockDriverState *bs, BlockDriverState *target,
  * @on_target_error: The action to take upon error writing to the target.
  * @cb: Completion function for the job.
  * @opaque: Opaque pointer value passed to @cb.
+ * @txn: Transaction that this job is part of (may be NULL).
  *
  * Start a backup operation on @bs.  Clusters in @bs are written to @target
  * until the job is cancelled or manually completed.
@@ -661,7 +680,7 @@ void backup_start(BlockDriverState *bs, BlockDriverState *target,
                   BlockdevOnError on_source_error,
                   BlockdevOnError on_target_error,
                   BlockCompletionFunc *cb, void *opaque,
-                  Error **errp);
+                  BlockJobTxn *txn, Error **errp);
 
 void blk_set_bs(BlockBackend *blk, BlockDriverState *bs);
 
@@ -675,4 +694,7 @@ void blk_dev_resize_cb(BlockBackend *blk);
 void bdrv_set_dirty(BlockDriverState *bs, int64_t cur_sector, int nr_sectors);
 bool bdrv_requests_pending(BlockDriverState *bs);
 
+void bdrv_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap **out);
+void bdrv_undo_clear_dirty_bitmap(BdrvDirtyBitmap *bitmap, HBitmap *in);
+
 #endif /* BLOCK_INT_H */
diff --git a/include/block/blockjob.h b/include/block/blockjob.h
index 289b13f..d84ccd8 100644
--- a/include/block/blockjob.h
+++ b/include/block/blockjob.h
@@ -50,6 +50,26 @@ typedef struct BlockJobDriver {
      * manually.
      */
     void (*complete)(BlockJob *job, Error **errp);
+
+    /**
+     * If the callback is not NULL, it will be invoked when all the jobs
+     * belonging to the same transaction complete; or upon this job's
+     * completion if it is not in a transaction. Skipped if NULL.
+     *
+     * All jobs will complete with a call to either .commit() or .abort() but
+     * never both.
+     */
+    void (*commit)(BlockJob *job);
+
+    /**
+     * If the callback is not NULL, it will be invoked when any job in the
+     * same transaction fails; or upon this job's failure (due to error or
+     * cancellation) if it is not in a transaction. Skipped if NULL.
+     *
+     * All jobs will complete with a call to either .commit() or .abort() but
+     * never both.
+     */
+    void (*abort)(BlockJob *job);
 } BlockJobDriver;
 
 /**
@@ -130,6 +150,21 @@ struct BlockJob {
 
     /** The opaque value that is passed to the completion function.  */
     void *opaque;
+
+    /** Reference count of the block job */
+    int refcnt;
+
+    /* True if this job has reported completion by calling block_job_completed.
+     */
+    bool completed;
+
+    /* ret code passed to block_job_completed.
+     */
+    int ret;
+
+    /** Non-NULL if this job is part of a transaction */
+    BlockJobTxn *txn;
+    QLIST_ENTRY(BlockJob) txn_list;
 };
 
 /**
@@ -174,12 +209,21 @@ void block_job_sleep_ns(BlockJob *job, QEMUClockType type, int64_t ns);
 void block_job_yield(BlockJob *job);
 
 /**
- * block_job_release:
+ * block_job_ref:
  * @bs: The block device.
  *
- * Release job resources when an error occurred or job completed.
+ * Grab a reference to the block job. Should be paired with block_job_unref.
  */
-void block_job_release(BlockDriverState *bs);
+void block_job_ref(BlockJob *job);
+
+/**
+ * block_job_unref:
+ * @bs: The block device.
+ *
+ * Release reference to the block job and release resources if it is the last
+ * reference.
+ */
+void block_job_unref(BlockJob *job);
 
 /**
  * block_job_completed:
@@ -364,4 +408,39 @@ void block_job_defer_to_main_loop(BlockJob *job,
                                   BlockJobDeferToMainLoopFn *fn,
                                   void *opaque);
 
+/**
+ * block_job_txn_new:
+ *
+ * Allocate and return a new block job transaction.  Jobs can be added to the
+ * transaction using block_job_txn_add_job().
+ *
+ * The transaction is automatically freed when the last job completes or is
+ * cancelled.
+ *
+ * All jobs in the transaction either complete successfully or fail/cancel as a
+ * group.  Jobs wait for each other before completing.  Cancelling one job
+ * cancels all jobs in the transaction.
+ */
+BlockJobTxn *block_job_txn_new(void);
+
+/**
+ * block_job_txn_unref:
+ *
+ * Release a reference that was previously acquired with block_job_txn_add_job
+ * or block_job_txn_new. If it's the last reference to the object, it will be
+ * freed.
+ */
+void block_job_txn_unref(BlockJobTxn *txn);
+
+/**
+ * block_job_txn_add_job:
+ * @txn: The transaction (may be NULL)
+ * @job: Job to add to the transaction
+ *
+ * Add @job to the transaction.  The @job must not already be in a transaction.
+ * The caller must call either block_job_txn_unref() or block_job_completed()
+ * to release the reference that is automatically grabbed here.
+ */
+void block_job_txn_add_job(BlockJobTxn *txn, BlockJob *job);
+
 #endif
diff --git a/include/exec/cpu-common.h b/include/exec/cpu-common.h
index 9fb1d54..85aa403 100644
--- a/include/exec/cpu-common.h
+++ b/include/exec/cpu-common.h
@@ -64,8 +64,12 @@ typedef uint32_t CPUReadMemoryFunc(void *opaque, hwaddr addr);
 void qemu_ram_remap(ram_addr_t addr, ram_addr_t length);
 /* This should not be used by devices.  */
 MemoryRegion *qemu_ram_addr_from_host(void *ptr, ram_addr_t *ram_addr);
+RAMBlock *qemu_ram_block_by_name(const char *name);
+RAMBlock *qemu_ram_block_from_host(void *ptr, bool round_offset,
+                                   ram_addr_t *ram_addr, ram_addr_t *offset);
 void qemu_ram_set_idstr(ram_addr_t addr, const char *name, DeviceState *dev);
 void qemu_ram_unset_idstr(ram_addr_t addr);
+const char *qemu_ram_get_idstr(RAMBlock *rb);
 
 void cpu_physical_memory_rw(hwaddr addr, uint8_t *buf,
                             int len, int is_write);
diff --git a/include/exec/exec-all.h b/include/exec/exec-all.h
index 9b93b9b..d900b0d 100644
--- a/include/exec/exec-all.h
+++ b/include/exec/exec-all.h
@@ -72,7 +72,6 @@ void restore_state_to_opc(CPUArchState *env, struct TranslationBlock *tb,
 
 void cpu_gen_init(void);
 bool cpu_restore_state(CPUState *cpu, uintptr_t searched_pc);
-void page_size_init(void);
 
 void QEMU_NORETURN cpu_resume_from_signal(CPUState *cpu, void *puc);
 void QEMU_NORETURN cpu_io_recompile(CPUState *cpu, uintptr_t retaddr);
@@ -190,6 +189,7 @@ struct TranslationBlock {
 #define CF_LAST_IO     0x8000 /* Last insn may be an IO access.  */
 #define CF_NOCACHE     0x10000 /* To be freed after execution */
 #define CF_USE_ICOUNT  0x20000
+#define CF_IGNORE_ICOUNT 0x40000 /* Do not generate icount code */
 
     void *tc_ptr;    /* pointer to the translated code */
     uint8_t *tc_search;  /* pointer to search data */
diff --git a/include/exec/ram_addr.h b/include/exec/ram_addr.h
index 3360ac5..7115154 100644
--- a/include/exec/ram_addr.h
+++ b/include/exec/ram_addr.h
@@ -22,8 +22,6 @@
 #ifndef CONFIG_USER_ONLY
 #include "hw/xen/xen.h"
 
-typedef struct RAMBlock RAMBlock;
-
 struct RAMBlock {
     struct rcu_head rcu;
     struct MemoryRegion *mr;
diff --git a/include/hw/arm/allwinner-a10.h b/include/hw/arm/allwinner-a10.h
index 01a189b..6b32a99 100644
--- a/include/hw/arm/allwinner-a10.h
+++ b/include/hw/arm/allwinner-a10.h
@@ -7,6 +7,8 @@
 #include "hw/timer/allwinner-a10-pit.h"
 #include "hw/intc/allwinner-a10-pic.h"
 #include "hw/net/allwinner_emac.h"
+#include "hw/ide/pci.h"
+#include "hw/ide/ahci.h"
 
 #include "sysemu/sysemu.h"
 #include "exec/address-spaces.h"
@@ -16,6 +18,7 @@
 #define AW_A10_PIT_REG_BASE     0x01c20c00
 #define AW_A10_UART0_REG_BASE   0x01c28000
 #define AW_A10_EMAC_BASE        0x01c0b000
+#define AW_A10_SATA_BASE        0x01c18000
 
 #define AW_A10_SDRAM_BASE       0x40000000
 
@@ -32,6 +35,7 @@ typedef struct AwA10State {
     AwA10PITState timer;
     AwA10PICState intc;
     AwEmacState emac;
+    AllwinnerAHCIState sata;
 } AwA10State;
 
 #define ALLWINNER_H_
diff --git a/include/hw/arm/arm.h b/include/hw/arm/arm.h
index 67ba7db..c26b0e3 100644
--- a/include/hw/arm/arm.h
+++ b/include/hw/arm/arm.h
@@ -97,6 +97,12 @@ struct arm_boot_info {
     hwaddr board_setup_addr;
     void (*write_board_setup)(ARMCPU *cpu,
                               const struct arm_boot_info *info);
+
+    /* If set, the board specific loader/setup blob will be run from secure
+     * mode, regardless of secure_boot. The blob becomes responsible for
+     * changing to non-secure state if implementing a non-secure boot
+     */
+    bool secure_board_setup;
 };
 
 /**
diff --git a/include/hw/compat.h b/include/hw/compat.h
index 93e71af..d0b1c4f 100644
--- a/include/hw/compat.h
+++ b/include/hw/compat.h
@@ -6,6 +6,18 @@
             .driver   = "virtio-blk-device",\
             .property = "scsi",\
             .value    = "true",\
+        },{\
+            .driver   = "e1000",\
+            .property = "extra_mac_registers",\
+            .value    = "off",\
+        },{\
+            .driver   = "virtio-pci",\
+            .property = "x-disable-pcie",\
+            .value    = "on",\
+        },{\
+            .driver   = "virtio-pci",\
+            .property = "migrate-extra",\
+            .value    = "off",\
         },
 
 #define HW_COMPAT_2_3 \
diff --git a/include/hw/i386/pc.h b/include/hw/i386/pc.h
index 606dbc2..4bbc0ff 100644
--- a/include/hw/i386/pc.h
+++ b/include/hw/i386/pc.h
@@ -322,6 +322,31 @@ bool e820_get_entry(int, uint32_t, uint64_t *, uint64_t *);
             .driver   = "host" "-" TYPE_X86_CPU,\
             .property = "host-cache-info",\
             .value    = "on",\
+        },\
+        {\
+            .driver   = TYPE_X86_CPU,\
+            .property = "check",\
+            .value    = "off",\
+        },\
+        {\
+            .driver   = "qemu64" "-" TYPE_X86_CPU,\
+            .property = "sse4a",\
+            .value    = "on",\
+        },\
+        {\
+            .driver   = "qemu64" "-" TYPE_X86_CPU,\
+            .property = "abm",\
+            .value    = "on",\
+        },\
+        {\
+            .driver   = "qemu64" "-" TYPE_X86_CPU,\
+            .property = "popcnt",\
+            .value    = "on",\
+        },\
+        {\
+            .driver   = "qemu32" "-" TYPE_X86_CPU,\
+            .property = "popcnt",\
+            .value    = "on",\
         },
 
 #define PC_COMPAT_2_3 \
diff --git a/include/hw/misc/zynq-xadc.h b/include/hw/misc/zynq-xadc.h
new file mode 100644
index 0000000..f1a410a
--- /dev/null
+++ b/include/hw/misc/zynq-xadc.h
@@ -0,0 +1,46 @@
+/*
+ * Device model for Zynq ADC controller
+ *
+ * Copyright (c) 2015 Guenter Roeck <linux@roeck-us.net>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * You should have received a copy of the GNU General Public License along
+ * with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef ZYNQ_XADC_H
+#define ZYNQ_XADC_H
+
+#include "hw/sysbus.h"
+
+#define ZYNQ_XADC_MMIO_SIZE     0x0020
+#define ZYNQ_XADC_NUM_IO_REGS   (ZYNQ_XADC_MMIO_SIZE / 4)
+#define ZYNQ_XADC_NUM_ADC_REGS  128
+#define ZYNQ_XADC_FIFO_DEPTH    15
+
+#define TYPE_ZYNQ_XADC          "xlnx,zynq-xadc"
+#define ZYNQ_XADC(obj) \
+    OBJECT_CHECK(ZynqXADCState, (obj), TYPE_ZYNQ_XADC)
+
+typedef struct ZynqXADCState {
+    /*< private >*/
+    SysBusDevice parent_obj;
+
+    /*< public >*/
+    MemoryRegion iomem;
+
+    uint32_t regs[ZYNQ_XADC_NUM_IO_REGS];
+    uint16_t xadc_regs[ZYNQ_XADC_NUM_ADC_REGS];
+    uint16_t xadc_read_reg_previous;
+    uint16_t xadc_dfifo[ZYNQ_XADC_FIFO_DEPTH];
+    uint16_t xadc_dfifo_entries;
+
+    struct IRQState *qemu_irq;
+
+} ZynqXADCState;
+
+#endif /* ZYNQ_XADC_H */
diff --git a/include/hw/qdev-core.h b/include/hw/qdev-core.h
index 8057aed..e6dbde4 100644
--- a/include/hw/qdev-core.h
+++ b/include/hw/qdev-core.h
@@ -337,6 +337,7 @@ int qdev_walk_children(DeviceState *dev,
                        void *opaque);
 
 void qdev_reset_all(DeviceState *dev);
+void qdev_reset_all_fn(void *opaque);
 
 /**
  * @qbus_reset_all:
diff --git a/include/hw/virtio/virtio-bus.h b/include/hw/virtio/virtio-bus.h
index 8811415..6c3d4cb 100644
--- a/include/hw/virtio/virtio-bus.h
+++ b/include/hw/virtio/virtio-bus.h
@@ -44,9 +44,12 @@ typedef struct VirtioBusClass {
     void (*notify)(DeviceState *d, uint16_t vector);
     void (*save_config)(DeviceState *d, QEMUFile *f);
     void (*save_queue)(DeviceState *d, int n, QEMUFile *f);
+    void (*save_extra_state)(DeviceState *d, QEMUFile *f);
     int (*load_config)(DeviceState *d, QEMUFile *f);
     int (*load_queue)(DeviceState *d, int n, QEMUFile *f);
     int (*load_done)(DeviceState *d, QEMUFile *f);
+    int (*load_extra_state)(DeviceState *d, QEMUFile *f);
+    bool (*has_extra_state)(DeviceState *d);
     bool (*query_guest_notifiers)(DeviceState *d);
     int (*set_guest_notifiers)(DeviceState *d, int nvqs, bool assign);
     int (*set_host_notifier)(DeviceState *d, int n, bool assigned);
diff --git a/include/migration/migration.h b/include/migration/migration.h
index 8334621..fd018b7 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -35,6 +35,7 @@
 #define QEMU_VM_SUBSECTION           0x05
 #define QEMU_VM_VMDESCRIPTION        0x06
 #define QEMU_VM_CONFIGURATION        0x07
+#define QEMU_VM_COMMAND              0x08
 #define QEMU_VM_SECTION_FOOTER       0x7e
 
 struct MigrationParams {
@@ -42,13 +43,67 @@ struct MigrationParams {
     bool shared;
 };
 
-typedef struct MigrationState MigrationState;
+/* Messages sent on the return path from destination to source */
+enum mig_rp_message_type {
+    MIG_RP_MSG_INVALID = 0,  /* Must be 0 */
+    MIG_RP_MSG_SHUT,         /* sibling will not send any more RP messages */
+    MIG_RP_MSG_PONG,         /* Response to a PING; data (seq: be32 ) */
+
+    MIG_RP_MSG_REQ_PAGES_ID, /* data (start: be64, len: be32, id: string) */
+    MIG_RP_MSG_REQ_PAGES,    /* data (start: be64, len: be32) */
+
+    MIG_RP_MSG_MAX
+};
 
 typedef QLIST_HEAD(, LoadStateEntry) LoadStateEntry_Head;
 
+/* The current postcopy state is read/set by postcopy_state_get/set
+ * which update it atomically.
+ * The state is updated as postcopy messages are received, and
+ * in general only one thread should be writing to the state at any one
+ * time, initially the main thread and then the listen thread;
+ * Corner cases are where either thread finishes early and/or errors.
+ * The state is checked as messages are received to ensure that
+ * the source is sending us messages in the correct order.
+ * The state is also used by the RAM reception code to know if it
+ * has to place pages atomically, and the cleanup code at the end of
+ * the main thread to know if it has to delay cleanup until the end
+ * of postcopy.
+ */
+typedef enum {
+    POSTCOPY_INCOMING_NONE = 0,  /* Initial state - no postcopy */
+    POSTCOPY_INCOMING_ADVISE,
+    POSTCOPY_INCOMING_DISCARD,
+    POSTCOPY_INCOMING_LISTENING,
+    POSTCOPY_INCOMING_RUNNING,
+    POSTCOPY_INCOMING_END
+} PostcopyState;
+
 /* State for the incoming migration */
 struct MigrationIncomingState {
-    QEMUFile *file;
+    QEMUFile *from_src_file;
+
+    /*
+     * Free at the start of the main state load, set as the main thread finishes
+     * loading state.
+     */
+    QemuEvent main_thread_load_event;
+
+    bool           have_fault_thread;
+    QemuThread     fault_thread;
+    QemuSemaphore  fault_thread_sem;
+
+    bool           have_listen_thread;
+    QemuThread     listen_thread;
+    QemuSemaphore  listen_thread_sem;
+
+    /* For the kernel to send us notifications */
+    int       userfault_fd;
+    /* To tell the fault_thread to quit */
+    int       userfault_quit_fd;
+    QEMUFile *to_src_file;
+    QemuMutex rp_mutex;    /* We send replies from multiple threads */
+    void     *postcopy_tmp_page;
 
     /* See savevm.c */
     LoadStateEntry_Head loadvm_handlers;
@@ -58,6 +113,18 @@ MigrationIncomingState *migration_incoming_get_current(void);
 MigrationIncomingState *migration_incoming_state_new(QEMUFile *f);
 void migration_incoming_state_destroy(void);
 
+/*
+ * An outstanding page request, on the source, having been received
+ * and queued
+ */
+struct MigrationSrcPageRequest {
+    RAMBlock *rb;
+    hwaddr    offset;
+    hwaddr    len;
+
+    QSIMPLEQ_ENTRY(MigrationSrcPageRequest) next_req;
+};
+
 struct MigrationState
 {
     int64_t bandwidth_limit;
@@ -70,6 +137,14 @@ struct MigrationState
 
     int state;
     MigrationParams params;
+
+    /* State related to return path */
+    struct {
+        QEMUFile     *from_dst_file;
+        QemuThread    rp_thread;
+        bool          error;
+    } rp_state;
+
     double mbps;
     int64_t total_time;
     int64_t downtime;
@@ -80,6 +155,18 @@ struct MigrationState
     int64_t xbzrle_cache_size;
     int64_t setup_time;
     int64_t dirty_sync_count;
+
+    /* Flag set once the migration has been asked to enter postcopy */
+    bool start_postcopy;
+
+    /* Flag set once the migration thread is running (and needs joining) */
+    bool migration_thread_running;
+
+    /* Queue of outstanding page requests from the destination */
+    QemuMutex src_page_req_mutex;
+    QSIMPLEQ_HEAD(src_page_requests, MigrationSrcPageRequest) src_page_requests;
+    /* The RAMBlock used in the last src_page_request */
+    RAMBlock *last_req_rb;
 };
 
 void process_incoming_migration(QEMUFile *f);
@@ -116,9 +203,12 @@ int migrate_fd_close(MigrationState *s);
 
 void add_migration_state_change_notifier(Notifier *notify);
 void remove_migration_state_change_notifier(Notifier *notify);
+MigrationState *migrate_init(const MigrationParams *params);
 bool migration_in_setup(MigrationState *);
 bool migration_has_finished(MigrationState *);
 bool migration_has_failed(MigrationState *);
+/* True if outgoing migration has entered postcopy phase */
+bool migration_in_postcopy(MigrationState *);
 MigrationState *migrate_get_current(void);
 
 void migrate_compress_threads_create(void);
@@ -145,6 +235,13 @@ uint64_t xbzrle_mig_pages_cache_miss(void);
 double xbzrle_mig_cache_miss_rate(void);
 
 void ram_handle_compressed(void *host, uint8_t ch, uint64_t size);
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected);
+/* For outgoing discard bitmap */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms);
+/* For incoming postcopy discard */
+int ram_discard_range(MigrationIncomingState *mis, const char *block_name,
+                      uint64_t start, size_t length);
+int ram_postcopy_incoming_init(MigrationIncomingState *mis);
 
 /**
  * @migrate_add_blocker - prevent migration from proceeding
@@ -160,6 +257,7 @@ void migrate_add_blocker(Error *reason);
  */
 void migrate_del_blocker(Error *reason);
 
+bool migrate_postcopy_ram(void);
 bool migrate_zero_blocks(void);
 
 bool migrate_auto_converge(void);
@@ -179,6 +277,17 @@ int migrate_compress_threads(void);
 int migrate_decompress_threads(void);
 bool migrate_use_events(void);
 
+/* Sending on the return path - generic and then for each message type */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+                             enum mig_rp_message_type message_type,
+                             uint16_t len, void *data);
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+                          uint32_t value);
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+                          uint32_t value);
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char* rbname,
+                              ram_addr_t start, size_t len);
+
 void ram_control_before_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_after_iterate(QEMUFile *f, uint64_t flags);
 void ram_control_load_hook(QEMUFile *f, uint64_t flags, void *data);
@@ -204,4 +313,12 @@ void global_state_set_optional(void);
 void savevm_skip_configuration(void);
 int global_state_store(void);
 void global_state_store_running(void);
+
+void flush_page_queue(MigrationState *ms);
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+                         ram_addr_t start, ram_addr_t len);
+
+PostcopyState postcopy_state_get(void);
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(PostcopyState new_state);
 #endif
diff --git a/include/migration/postcopy-ram.h b/include/migration/postcopy-ram.h
new file mode 100644
index 0000000..b6a7491
--- /dev/null
+++ b/include/migration/postcopy-ram.h
@@ -0,0 +1,99 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Dave Gilbert  <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+#ifndef QEMU_POSTCOPY_RAM_H
+#define QEMU_POSTCOPY_RAM_H
+
+/* Return true if the host supports everything we need to do postcopy-ram */
+bool postcopy_ram_supported_by_host(void);
+
+/*
+ * Make all of RAM sensitive to accesses to areas that haven't yet been written
+ * and wire up anything necessary to deal with it.
+ */
+int postcopy_ram_enable_notify(MigrationIncomingState *mis);
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from ram.c's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages);
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis);
+
+/*
+ * Discard the contents of 'length' bytes from 'start'
+ * We can assume that if we've been called postcopy_ram_hosttest returned true
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               size_t length);
+
+/*
+ * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
+ * however leaving it until after precopy means that most of the precopy
+ * data is still THPd
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis);
+
+/*
+ * Called at the start of each RAMBlock by the bitmap code.
+ * 'offset' is the bitmap offset of the named RAMBlock in the migration
+ * bitmap.
+ * Returns a new PDS
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 unsigned long offset,
+                                                 const char *name);
+
+/*
+ * Called by the bitmap code for each chunk to discard.
+ * May send a discard message, may just leave it queued to
+ * be sent later.
+ * @start,@length: a range of pages in the migration bitmap in the
+ *  RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+                                 unsigned long start, unsigned long length);
+
+/*
+ * Called at the end of each RAMBlock by the bitmap code.
+ * Sends any outstanding discard messages, frees the PDS.
+ */
+void postcopy_discard_send_finish(MigrationState *ms,
+                                  PostcopyDiscardState *pds);
+
+/*
+ * Place a page (from) at (host) efficiently
+ *    There are restrictions on how 'from' must be mapped, in general best
+ *    to use other postcopy_ routines to allocate.
+ * returns 0 on success
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from);
+
+/*
+ * Place a zero page at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host);
+
+/*
+ * Allocate a page of memory that can be mapped at a later point in time
+ * using postcopy_place_page
+ * Returns: Pointer to allocated page
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis);
+
+#endif
diff --git a/include/migration/qemu-file.h b/include/migration/qemu-file.h
index 29a338d..b5d08d2 100644
--- a/include/migration/qemu-file.h
+++ b/include/migration/qemu-file.h
@@ -89,6 +89,11 @@ typedef size_t (QEMURamSaveFunc)(QEMUFile *f, void *opaque,
                                uint64_t *bytes_sent);
 
 /*
+ * Return a QEMUFile for comms in the opposite direction
+ */
+typedef QEMUFile *(QEMURetPathFunc)(void *opaque);
+
+/*
  * Stop any read or write (depending on flags) on the underlying
  * transport on the QEMUFile.
  * Existing blocking reads/writes must be woken
@@ -106,6 +111,7 @@ typedef struct QEMUFileOps {
     QEMURamHookFunc *after_ram_iterate;
     QEMURamHookFunc *hook_ram_load;
     QEMURamSaveFunc *save_page;
+    QEMURetPathFunc *get_return_path;
     QEMUFileShutdownFunc *shut_down;
 } QEMUFileOps;
 
@@ -163,9 +169,11 @@ void qemu_put_be32(QEMUFile *f, unsigned int v);
 void qemu_put_be64(QEMUFile *f, uint64_t v);
 size_t qemu_peek_buffer(QEMUFile *f, uint8_t **buf, size_t size, size_t offset);
 size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size);
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size);
 ssize_t qemu_put_compression_data(QEMUFile *f, const uint8_t *p, size_t size,
                                   int level);
 int qemu_put_qemu_file(QEMUFile *f_des, QEMUFile *f_src);
+
 /*
  * Note that you can only peek continuous bytes from where the current pointer
  * is; you aren't guaranteed to be able to peak to +n bytes unless you've
@@ -194,7 +202,9 @@ int64_t qemu_file_get_rate_limit(QEMUFile *f);
 int qemu_file_get_error(QEMUFile *f);
 void qemu_file_set_error(QEMUFile *f, int ret);
 int qemu_file_shutdown(QEMUFile *f);
+QEMUFile *qemu_file_get_return_path(QEMUFile *f);
 void qemu_fflush(QEMUFile *f);
+void qemu_file_set_blocking(QEMUFile *f, bool block);
 
 static inline void qemu_put_be64s(QEMUFile *f, const uint64_t *pv)
 {
diff --git a/include/migration/vmstate.h b/include/migration/vmstate.h
index 9a65522..7267e38 100644
--- a/include/migration/vmstate.h
+++ b/include/migration/vmstate.h
@@ -39,8 +39,9 @@ typedef struct SaveVMHandlers {
     void (*set_params)(const MigrationParams *params, void * opaque);
     SaveStateHandler *save_state;
 
-    void (*cancel)(void *opaque);
-    int (*save_live_complete)(QEMUFile *f, void *opaque);
+    void (*cleanup)(void *opaque);
+    int (*save_live_complete_postcopy)(QEMUFile *f, void *opaque);
+    int (*save_live_complete_precopy)(QEMUFile *f, void *opaque);
 
     /* This runs both outside and inside the iothread lock.  */
     bool (*is_active)(void *opaque);
@@ -54,8 +55,9 @@ typedef struct SaveVMHandlers {
 
     /* This runs outside the iothread lock!  */
     int (*save_live_setup)(QEMUFile *f, void *opaque);
-    uint64_t (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size);
-
+    void (*save_live_pending)(QEMUFile *f, void *opaque, uint64_t max_size,
+                              uint64_t *non_postcopiable_pending,
+                              uint64_t *postcopiable_pending);
     LoadStateHandler *load_state;
 } SaveVMHandlers;
 
diff --git a/include/monitor/hmp-target.h b/include/monitor/hmp-target.h
index 213566c..bc2c9c0 100644
--- a/include/monitor/hmp-target.h
+++ b/include/monitor/hmp-target.h
@@ -35,6 +35,7 @@ struct MonitorDef {
 };
 
 const MonitorDef *target_monitor_defs(void);
+int target_get_monitor_def(CPUState *cs, const char *name, uint64_t *pval);
 
 CPUArchState *mon_get_cpu_env(void);
 CPUState *mon_get_cpu(void);
diff --git a/include/qapi/error.h b/include/qapi/error.h
index c69dddb..4d42cdc 100644
--- a/include/qapi/error.h
+++ b/include/qapi/error.h
@@ -30,6 +30,10 @@
  * Handle an error without reporting it (just for completeness):
  *     error_free(err);
  *
+ * Assert that an expected error occurred, but clean it up without
+ * reporting it (primarily useful in testsuites):
+ *     error_free_or_abort(&err);
+ *
  * Pass an existing error to the caller:
  *     error_propagate(errp, err);
  * where Error **errp is a parameter, by convention the last one.
@@ -190,6 +194,11 @@ Error *error_copy(const Error *err);
 void error_free(Error *err);
 
 /*
+ * Convenience function to assert that *@errp is set, then silently free it.
+ */
+void error_free_or_abort(Error **errp);
+
+/*
  * Convenience function to error_report() and free @err.
  */
 void error_report_err(Error *);
diff --git a/include/qapi/qmp/qerror.h b/include/qapi/qmp/qerror.h
index 842b27a..f601499 100644
--- a/include/qapi/qmp/qerror.h
+++ b/include/qapi/qmp/qerror.h
@@ -106,4 +106,7 @@
 #define QERR_UNSUPPORTED \
     "this feature or command is not currently supported"
 
+#define QERR_REPLAY_NOT_SUPPORTED \
+    "Record/replay feature is not supported for '%s'"
+
 #endif /* QERROR_H */
diff --git a/include/qapi/qmp/qobject.h b/include/qapi/qmp/qobject.h
index c856f55..4b96ed5 100644
--- a/include/qapi/qmp/qobject.h
+++ b/include/qapi/qmp/qobject.h
@@ -90,6 +90,7 @@ static inline void qobject_incref(QObject *obj)
  */
 static inline void qobject_decref(QObject *obj)
 {
+    assert(!obj || obj->refcnt);
     if (obj && --obj->refcnt == 0) {
         assert(obj->type != NULL);
         assert(obj->type->destroy != NULL);
diff --git a/include/qemu-common.h b/include/qemu-common.h
index 2f74540..405364f 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -499,5 +499,6 @@ size_t buffer_find_nonzero_offset(const void *buf, size_t len);
 int parse_debug_env(const char *name, int max, int initial);
 
 const char *qemu_ether_ntoa(const MACAddr *mac);
+void page_size_init(void);
 
 #endif
diff --git a/include/qemu/log.h b/include/qemu/log.h
index 7de4500..362cbc4 100644
--- a/include/qemu/log.h
+++ b/include/qemu/log.h
@@ -35,7 +35,6 @@ static inline bool qemu_log_enabled(void)
 #define CPU_LOG_INT        (1 << 4)
 #define CPU_LOG_EXEC       (1 << 5)
 #define CPU_LOG_PCALL      (1 << 6)
-#define CPU_LOG_IOPORT     (1 << 7)
 #define CPU_LOG_TB_CPU     (1 << 8)
 #define CPU_LOG_RESET      (1 << 9)
 #define LOG_UNIMP          (1 << 10)
diff --git a/include/qemu/osdep.h b/include/qemu/osdep.h
index b568424..861d84b 100644
--- a/include/qemu/osdep.h
+++ b/include/qemu/osdep.h
@@ -139,6 +139,8 @@ void qemu_anon_ram_free(void *ptr, size_t size);
 
 #if defined(CONFIG_MADVISE)
 
+#include <sys/mman.h>
+
 #define QEMU_MADV_WILLNEED  MADV_WILLNEED
 #define QEMU_MADV_DONTNEED  MADV_DONTNEED
 #ifdef MADV_DONTFORK
@@ -171,6 +173,11 @@ void qemu_anon_ram_free(void *ptr, size_t size);
 #else
 #define QEMU_MADV_HUGEPAGE QEMU_MADV_INVALID
 #endif
+#ifdef MADV_NOHUGEPAGE
+#define QEMU_MADV_NOHUGEPAGE MADV_NOHUGEPAGE
+#else
+#define QEMU_MADV_NOHUGEPAGE QEMU_MADV_INVALID
+#endif
 
 #elif defined(CONFIG_POSIX_MADVISE)
 
@@ -182,6 +189,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
 #define QEMU_MADV_DODUMP QEMU_MADV_INVALID
 #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID
 #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
+#define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
 
 #else /* no-op */
 
@@ -193,6 +201,7 @@ void qemu_anon_ram_free(void *ptr, size_t size);
 #define QEMU_MADV_DODUMP QEMU_MADV_INVALID
 #define QEMU_MADV_DONTDUMP QEMU_MADV_INVALID
 #define QEMU_MADV_HUGEPAGE  QEMU_MADV_INVALID
+#define QEMU_MADV_NOHUGEPAGE  QEMU_MADV_INVALID
 
 #endif
 
@@ -247,8 +256,8 @@ static inline void qemu_timersub(const struct timeval *val1,
 
 void qemu_set_cloexec(int fd);
 
-void qemu_set_version(const char *);
-const char *qemu_get_version(void);
+void qemu_set_hw_version(const char *);
+const char *qemu_hw_version(void);
 
 void fips_set_state(bool requested);
 bool fips_get_state(void);
diff --git a/include/qemu/timed-average.h b/include/qemu/timed-average.h
new file mode 100644
index 0000000..364bf88
--- /dev/null
+++ b/include/qemu/timed-average.h
@@ -0,0 +1,64 @@
+/*
+ * QEMU timed average computation
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   Benoît Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free software: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation, either version 2 of the License, or
+ * (at your option) version 3 or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef TIMED_AVERAGE_H
+#define TIMED_AVERAGE_H
+
+#include <stdint.h>
+
+#include "qemu/timer.h"
+
+typedef struct TimedAverageWindow TimedAverageWindow;
+typedef struct TimedAverage TimedAverage;
+
+/* All fields of both structures are private */
+
+struct TimedAverageWindow {
+    uint64_t      min;             /* minimum value accounted in the window */
+    uint64_t      max;             /* maximum value accounted in the window */
+    uint64_t      sum;             /* sum of all values */
+    uint64_t      count;           /* number of values */
+    int64_t       expiration;      /* the end of the current window in ns */
+};
+
+struct TimedAverage {
+    uint64_t           period;     /* period in nanoseconds */
+    TimedAverageWindow windows[2]; /* two overlapping windows of with
+                                    * an offset of period / 2 between them */
+    unsigned           current;    /* the current window index: it's also the
+                                    * oldest window index */
+    QEMUClockType      clock_type; /* the clock used */
+};
+
+void timed_average_init(TimedAverage *ta, QEMUClockType clock_type,
+                        uint64_t period);
+
+void timed_average_account(TimedAverage *ta, uint64_t value);
+
+uint64_t timed_average_min(TimedAverage *ta);
+uint64_t timed_average_avg(TimedAverage *ta);
+uint64_t timed_average_max(TimedAverage *ta);
+uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed);
+
+#endif
diff --git a/include/qemu/typedefs.h b/include/qemu/typedefs.h
index d961362..6b1093d 100644
--- a/include/qemu/typedefs.h
+++ b/include/qemu/typedefs.h
@@ -9,6 +9,7 @@ struct Monitor;
 typedef struct AdapterInfo AdapterInfo;
 typedef struct AddressSpace AddressSpace;
 typedef struct AioContext AioContext;
+typedef struct AllwinnerAHCIState AllwinnerAHCIState;
 typedef struct AudioState AudioState;
 typedef struct BlockBackend BlockBackend;
 typedef struct BlockBackendRootState BlockBackendRootState;
@@ -43,6 +44,7 @@ typedef struct MemoryRegion MemoryRegion;
 typedef struct MemoryRegionSection MemoryRegionSection;
 typedef struct MigrationIncomingState MigrationIncomingState;
 typedef struct MigrationParams MigrationParams;
+typedef struct MigrationState MigrationState;
 typedef struct Monitor Monitor;
 typedef struct MouseTransformInfo MouseTransformInfo;
 typedef struct MSIMessage MSIMessage;
@@ -65,6 +67,7 @@ typedef struct PCMachineState PCMachineState;
 typedef struct PCMachineClass PCMachineClass;
 typedef struct PCMCIACardState PCMCIACardState;
 typedef struct PixelFormat PixelFormat;
+typedef struct PostcopyDiscardState PostcopyDiscardState;
 typedef struct PropertyInfo PropertyInfo;
 typedef struct Property Property;
 typedef struct QEMUBH QEMUBH;
@@ -78,6 +81,7 @@ typedef struct QEMUSizedBuffer QEMUSizedBuffer;
 typedef struct QEMUTimerListGroup QEMUTimerListGroup;
 typedef struct QEMUTimer QEMUTimer;
 typedef struct Range Range;
+typedef struct RAMBlock RAMBlock;
 typedef struct SerialState SerialState;
 typedef struct SHPCDevice SHPCDevice;
 typedef struct SMBusDevice SMBusDevice;
diff --git a/include/qom/object.h b/include/qom/object.h
index be7280c..0bb89d4 100644
--- a/include/qom/object.h
+++ b/include/qom/object.h
@@ -510,16 +510,16 @@ struct TypeInfo
 
 /**
  * OBJECT_CLASS_CHECK:
- * @class: The C type to use for the return value.
- * @obj: A derivative of @type to cast.
- * @name: the QOM typename of @class.
+ * @class_type: The C type to use for the return value.
+ * @class: A derivative class of @class_type to cast.
+ * @name: the QOM typename of @class_type.
  *
  * A type safe version of @object_class_dynamic_cast_assert.  This macro is
  * typically wrapped by each type to perform type safe casts of a class to a
  * specific class type.
  */
-#define OBJECT_CLASS_CHECK(class, obj, name) \
-    ((class *)object_class_dynamic_cast_assert(OBJECT_CLASS(obj), (name), \
+#define OBJECT_CLASS_CHECK(class_type, class, name) \
+    ((class_type *)object_class_dynamic_cast_assert(OBJECT_CLASS(class), (name), \
                                                __FILE__, __LINE__, __func__))
 
 /**
diff --git a/include/sysemu/balloon.h b/include/sysemu/balloon.h
index 17fe300..3f976b4 100644
--- a/include/sysemu/balloon.h
+++ b/include/sysemu/balloon.h
@@ -22,5 +22,7 @@ typedef void (QEMUBalloonStatus)(void *opaque, BalloonInfo *info);
 int qemu_add_balloon_handler(QEMUBalloonEvent *event_func,
 			     QEMUBalloonStatus *stat_func, void *opaque);
 void qemu_remove_balloon_handler(void *opaque);
+bool qemu_balloon_is_inhibited(void);
+void qemu_balloon_inhibit(bool state);
 
 #endif
diff --git a/include/sysemu/block-backend.h b/include/sysemu/block-backend.h
index 9306a52..f4a68e2 100644
--- a/include/sysemu/block-backend.h
+++ b/include/sysemu/block-backend.h
@@ -65,6 +65,7 @@ BlockBackend *blk_new_with_bs(const char *name, Error **errp);
 BlockBackend *blk_new_open(const char *name, const char *filename,
                            const char *reference, QDict *options, int flags,
                            Error **errp);
+int blk_get_refcnt(BlockBackend *blk);
 void blk_ref(BlockBackend *blk);
 void blk_unref(BlockBackend *blk);
 const char *blk_name(BlockBackend *blk);
@@ -72,6 +73,7 @@ BlockBackend *blk_by_name(const char *name);
 BlockBackend *blk_next(BlockBackend *blk);
 
 BlockDriverState *blk_bs(BlockBackend *blk);
+void blk_remove_bs(BlockBackend *blk);
 void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs);
 
 void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk);
@@ -166,6 +168,8 @@ void blk_io_unplug(BlockBackend *blk);
 BlockAcctStats *blk_get_stats(BlockBackend *blk);
 BlockBackendRootState *blk_get_root_state(BlockBackend *blk);
 void blk_update_root_state(BlockBackend *blk);
+void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs);
+int blk_get_open_flags_from_root_state(BlockBackend *blk);
 
 void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk,
                   BlockCompletionFunc *cb, void *opaque);
diff --git a/include/sysemu/blockdev.h b/include/sysemu/blockdev.h
index a00be94..b06a060 100644
--- a/include/sysemu/blockdev.h
+++ b/include/sysemu/blockdev.h
@@ -63,8 +63,6 @@ DriveInfo *drive_new(QemuOpts *arg, BlockInterfaceType block_default_type);
 
 /* device-hotplug */
 
-void qmp_change_blockdev(const char *device, const char *filename,
-                         const char *format, Error **errp);
 void hmp_commit(Monitor *mon, const QDict *qdict);
 void hmp_drive_del(Monitor *mon, const QDict *qdict);
 #endif
diff --git a/include/sysemu/cpus.h b/include/sysemu/cpus.h
index 30ddd12..3d1e5ba 100644
--- a/include/sysemu/cpus.h
+++ b/include/sysemu/cpus.h
@@ -11,7 +11,6 @@ void cpu_stop_current(void);
 void cpu_synchronize_all_states(void);
 void cpu_synchronize_all_post_reset(void);
 void cpu_synchronize_all_post_init(void);
-void cpu_clean_all_dirty(void);
 
 void qtest_clock_warp(int64_t dest);
 
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 461ef65..b31f325 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -53,6 +53,7 @@ extern bool kvm_gsi_routing_allowed;
 extern bool kvm_gsi_direct_mapping;
 extern bool kvm_readonly_mem_allowed;
 extern bool kvm_direct_msi_allowed;
+extern bool kvm_ioeventfd_any_length_allowed;
 
 #if defined CONFIG_KVM || !defined NEED_CPU_H
 #define kvm_enabled()           (kvm_allowed)
@@ -153,6 +154,12 @@ extern bool kvm_direct_msi_allowed;
  */
 #define kvm_direct_msi_enabled() (kvm_direct_msi_allowed)
 
+/**
+ * kvm_ioeventfd_any_length_enabled:
+ * Returns: true if KVM allows any length io eventfd.
+ */
+#define kvm_ioeventfd_any_length_enabled() (kvm_ioeventfd_any_length_allowed)
+
 #else
 #define kvm_enabled()           (0)
 #define kvm_irqchip_in_kernel() (false)
@@ -166,6 +173,7 @@ extern bool kvm_direct_msi_allowed;
 #define kvm_gsi_direct_mapping() (false)
 #define kvm_readonly_mem_enabled() (false)
 #define kvm_direct_msi_enabled() (false)
+#define kvm_ioeventfd_any_length_enabled() (false)
 #endif
 
 struct kvm_run;
@@ -417,7 +425,6 @@ int kvm_physical_memory_addr_from_host(KVMState *s, void *ram_addr,
 void kvm_cpu_synchronize_state(CPUState *cpu);
 void kvm_cpu_synchronize_post_reset(CPUState *cpu);
 void kvm_cpu_synchronize_post_init(CPUState *cpu);
-void kvm_cpu_clean_state(CPUState *cpu);
 
 /* generic hooks - to be moved/refactored once there are more users */
 
@@ -442,13 +449,6 @@ static inline void cpu_synchronize_post_init(CPUState *cpu)
     }
 }
 
-static inline void cpu_clean_state(CPUState *cpu)
-{
-    if (kvm_enabled()) {
-        kvm_cpu_clean_state(cpu);
-    }
-}
-
 int kvm_irqchip_add_msi_route(KVMState *s, MSIMessage msg, PCIDevice *dev);
 int kvm_irqchip_update_msi_route(KVMState *s, int virq, MSIMessage msg,
                                  PCIDevice *dev);
diff --git a/include/sysemu/replay.h b/include/sysemu/replay.h
new file mode 100644
index 0000000..abb4688
--- /dev/null
+++ b/include/sysemu/replay.h
@@ -0,0 +1,120 @@
+#ifndef REPLAY_H
+#define REPLAY_H
+
+/*
+ * replay.h
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdbool.h>
+#include <stdint.h>
+#include "qapi-types.h"
+#include "qapi/error.h"
+#include "qemu/typedefs.h"
+
+/* replay clock kinds */
+enum ReplayClockKind {
+    /* host_clock */
+    REPLAY_CLOCK_HOST,
+    /* virtual_rt_clock */
+    REPLAY_CLOCK_VIRTUAL_RT,
+    REPLAY_CLOCK_COUNT
+};
+typedef enum ReplayClockKind ReplayClockKind;
+
+/* IDs of the checkpoints */
+enum ReplayCheckpoint {
+    CHECKPOINT_CLOCK_WARP,
+    CHECKPOINT_RESET_REQUESTED,
+    CHECKPOINT_SUSPEND_REQUESTED,
+    CHECKPOINT_CLOCK_VIRTUAL,
+    CHECKPOINT_CLOCK_HOST,
+    CHECKPOINT_CLOCK_VIRTUAL_RT,
+    CHECKPOINT_INIT,
+    CHECKPOINT_RESET,
+    CHECKPOINT_COUNT
+};
+typedef enum ReplayCheckpoint ReplayCheckpoint;
+
+extern ReplayMode replay_mode;
+
+/* Replay process control functions */
+
+/*! Enables recording or saving event log with specified parameters */
+void replay_configure(struct QemuOpts *opts);
+/*! Initializes timers used for snapshotting and enables events recording */
+void replay_start(void);
+/*! Closes replay log file and frees other resources. */
+void replay_finish(void);
+/*! Adds replay blocker with the specified error description */
+void replay_add_blocker(Error *reason);
+
+/* Processing the instructions */
+
+/*! Returns number of executed instructions. */
+uint64_t replay_get_current_step(void);
+/*! Returns number of instructions to execute in replay mode. */
+int replay_get_instructions(void);
+/*! Updates instructions counter in replay mode. */
+void replay_account_executed_instructions(void);
+
+/* Interrupts and exceptions */
+
+/*! Called by exception handler to write or read
+    exception processing events. */
+bool replay_exception(void);
+/*! Used to determine that exception is pending.
+    Does not proceed to the next event in the log. */
+bool replay_has_exception(void);
+/*! Called by interrupt handlers to write or read
+    interrupt processing events.
+    \return true if interrupt should be processed */
+bool replay_interrupt(void);
+/*! Tries to read interrupt event from the file.
+    Returns true, when interrupt request is pending */
+bool replay_has_interrupt(void);
+
+/* Processing clocks and other time sources */
+
+/*! Save the specified clock */
+int64_t replay_save_clock(ReplayClockKind kind, int64_t clock);
+/*! Read the specified clock from the log or return cached data */
+int64_t replay_read_clock(ReplayClockKind kind);
+/*! Saves or reads the clock depending on the current replay mode. */
+#define REPLAY_CLOCK(clock, value)                                      \
+    (replay_mode == REPLAY_MODE_PLAY ? replay_read_clock((clock))       \
+        : replay_mode == REPLAY_MODE_RECORD                             \
+            ? replay_save_clock((clock), (value))                       \
+        : (value))
+
+/* Events */
+
+/*! Called when qemu shutdown is requested. */
+void replay_shutdown_request(void);
+/*! Should be called at check points in the execution.
+    These check points are skipped, if they were not met.
+    Saves checkpoint in the SAVE mode and validates in the PLAY mode.
+    Returns 0 in PLAY mode if checkpoint was not found.
+    Returns 1 in all other cases. */
+bool replay_checkpoint(ReplayCheckpoint checkpoint);
+
+/* Asynchronous events queue */
+
+/*! Disables storing events in the queue */
+void replay_disable_events(void);
+/*! Returns true when saving events is enabled */
+bool replay_events_enabled(void);
+/*! Adds bottom half event to the queue */
+void replay_bh_schedule_event(QEMUBH *bh);
+/*! Adds input event to the queue */
+void replay_input_event(QemuConsole *src, InputEvent *evt);
+/*! Adds input sync event to the queue */
+void replay_input_sync_event(void);
+
+#endif
diff --git a/include/sysemu/sysemu.h b/include/sysemu/sysemu.h
index c439975..3bb8897 100644
--- a/include/sysemu/sysemu.h
+++ b/include/sysemu/sysemu.h
@@ -70,6 +70,7 @@ void qemu_system_killed(int signal, pid_t pid);
 void qemu_devices_reset(void);
 void qemu_system_reset(bool report);
 void qemu_system_guest_panicked(void);
+size_t qemu_target_page_bits(void);
 
 void qemu_add_exit_notifier(Notifier *notify);
 void qemu_remove_exit_notifier(Notifier *notify);
@@ -83,14 +84,52 @@ void hmp_info_snapshots(Monitor *mon, const QDict *qdict);
 
 void qemu_announce_self(void);
 
+/* Subcommands for QEMU_VM_COMMAND */
+enum qemu_vm_cmd {
+    MIG_CMD_INVALID = 0,   /* Must be 0 */
+    MIG_CMD_OPEN_RETURN_PATH,  /* Tell the dest to open the Return path */
+    MIG_CMD_PING,              /* Request a PONG on the RP */
+
+    MIG_CMD_POSTCOPY_ADVISE,       /* Prior to any page transfers, just
+                                      warn we might want to do PC */
+    MIG_CMD_POSTCOPY_LISTEN,       /* Start listening for incoming
+                                      pages as it's running. */
+    MIG_CMD_POSTCOPY_RUN,          /* Start execution */
+
+    MIG_CMD_POSTCOPY_RAM_DISCARD,  /* A list of pages to discard that
+                                      were previously sent during
+                                      precopy but are dirty. */
+    MIG_CMD_PACKAGED,          /* Send a wrapped stream within this stream */
+    MIG_CMD_MAX
+};
+
+#define MAX_VM_CMD_PACKAGED_SIZE (1ul << 24)
+
 bool qemu_savevm_state_blocked(Error **errp);
 void qemu_savevm_state_begin(QEMUFile *f,
                              const MigrationParams *params);
 void qemu_savevm_state_header(QEMUFile *f);
-int qemu_savevm_state_iterate(QEMUFile *f);
-void qemu_savevm_state_complete(QEMUFile *f);
-void qemu_savevm_state_cancel(void);
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size);
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy);
+void qemu_savevm_state_cleanup(void);
+void qemu_savevm_state_complete_postcopy(QEMUFile *f);
+void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only);
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+                               uint64_t *res_non_postcopiable,
+                               uint64_t *res_postcopiable);
+void qemu_savevm_command_send(QEMUFile *f, enum qemu_vm_cmd command,
+                              uint16_t len, uint8_t *data);
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value);
+void qemu_savevm_send_open_return_path(QEMUFile *f);
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb);
+void qemu_savevm_send_postcopy_advise(QEMUFile *f);
+void qemu_savevm_send_postcopy_listen(QEMUFile *f);
+void qemu_savevm_send_postcopy_run(QEMUFile *f);
+
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len,
+                                           uint64_t *start_list,
+                                           uint64_t *length_list);
+
 int qemu_loadvm_state(QEMUFile *f);
 
 typedef enum DisplayType
@@ -133,6 +172,7 @@ extern int boot_menu;
 extern bool boot_strict;
 extern uint8_t *boot_splash_filedata;
 extern size_t boot_splash_filedata_size;
+extern bool enable_mlock;
 extern uint8_t qemu_extra_params_fw[2];
 extern QEMUClockType rtc_clock;
 extern const char *mem_path;
diff --git a/include/ui/input.h b/include/ui/input.h
index 5d5ac00..d06a12d 100644
--- a/include/ui/input.h
+++ b/include/ui/input.h
@@ -33,7 +33,9 @@ void qemu_input_handler_bind(QemuInputHandlerState *s,
                              const char *device_id, int head,
                              Error **errp);
 void qemu_input_event_send(QemuConsole *src, InputEvent *evt);
+void qemu_input_event_send_impl(QemuConsole *src, InputEvent *evt);
 void qemu_input_event_sync(void);
+void qemu_input_event_sync_impl(void);
 
 InputEvent *qemu_input_event_new_key(KeyValue *key, bool down);
 void qemu_input_event_send_key(QemuConsole *src, KeyValue *key, bool down);
diff --git a/ioport.c b/ioport.c
index e39093e..193ef76 100644
--- a/ioport.c
+++ b/ioport.c
@@ -30,14 +30,6 @@
 #include "exec/memory.h"
 #include "exec/address-spaces.h"
 
-//#define DEBUG_IOPORT
-
-#ifdef DEBUG_IOPORT
-#  define LOG_IOPORT(...) qemu_log_mask(CPU_LOG_IOPORT, ## __VA_ARGS__)
-#else
-#  define LOG_IOPORT(...) do { } while (0)
-#endif
-
 typedef struct MemoryRegionPortioList {
     MemoryRegion mr;
     void *portio_opaque;
@@ -62,8 +54,7 @@ const MemoryRegionOps unassigned_io_ops = {
 
 void cpu_outb(pio_addr_t addr, uint8_t val)
 {
-    LOG_IOPORT("outb: %04"FMT_pioaddr" %02"PRIx8"\n", addr, val);
-    trace_cpu_out(addr, val);
+    trace_cpu_out(addr, 'b', val);
     address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
                         &val, 1);
 }
@@ -72,8 +63,7 @@ void cpu_outw(pio_addr_t addr, uint16_t val)
 {
     uint8_t buf[2];
 
-    LOG_IOPORT("outw: %04"FMT_pioaddr" %04"PRIx16"\n", addr, val);
-    trace_cpu_out(addr, val);
+    trace_cpu_out(addr, 'w', val);
     stw_p(buf, val);
     address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
                         buf, 2);
@@ -83,8 +73,7 @@ void cpu_outl(pio_addr_t addr, uint32_t val)
 {
     uint8_t buf[4];
 
-    LOG_IOPORT("outl: %04"FMT_pioaddr" %08"PRIx32"\n", addr, val);
-    trace_cpu_out(addr, val);
+    trace_cpu_out(addr, 'l', val);
     stl_p(buf, val);
     address_space_write(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
                         buf, 4);
@@ -96,8 +85,7 @@ uint8_t cpu_inb(pio_addr_t addr)
 
     address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED,
                        &val, 1);
-    trace_cpu_in(addr, val);
-    LOG_IOPORT("inb : %04"FMT_pioaddr" %02"PRIx8"\n", addr, val);
+    trace_cpu_in(addr, 'b', val);
     return val;
 }
 
@@ -108,8 +96,7 @@ uint16_t cpu_inw(pio_addr_t addr)
 
     address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, buf, 2);
     val = lduw_p(buf);
-    trace_cpu_in(addr, val);
-    LOG_IOPORT("inw : %04"FMT_pioaddr" %04"PRIx16"\n", addr, val);
+    trace_cpu_in(addr, 'w', val);
     return val;
 }
 
@@ -120,8 +107,7 @@ uint32_t cpu_inl(pio_addr_t addr)
 
     address_space_read(&address_space_io, addr, MEMTXATTRS_UNSPECIFIED, buf, 4);
     val = ldl_p(buf);
-    trace_cpu_in(addr, val);
-    LOG_IOPORT("inl : %04"FMT_pioaddr" %08"PRIx32"\n", addr, val);
+    trace_cpu_in(addr, 'l', val);
     return val;
 }
 
diff --git a/kvm-all.c b/kvm-all.c
index c442838..c648b81 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -109,6 +109,7 @@ bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
 bool kvm_vm_attributes_allowed;
 bool kvm_direct_msi_allowed;
+bool kvm_ioeventfd_any_length_allowed;
 
 static const KVMCapabilityInfo kvm_required_capabilites[] = {
     KVM_CAP_INFO(USER_MEMORY),
@@ -1461,7 +1462,6 @@ static int kvm_init(MachineState *ms)
      * page size for the system though.
      */
     assert(TARGET_PAGE_SIZE <= getpagesize());
-    page_size_init();
 
     s->sigmask_len = 8;
 
@@ -1612,6 +1612,9 @@ static int kvm_init(MachineState *ms)
     kvm_vm_attributes_allowed =
         (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) > 0);
 
+    kvm_ioeventfd_any_length_allowed =
+        (kvm_check_extension(s, KVM_CAP_IOEVENTFD_ANY_LENGTH) > 0);
+
     ret = kvm_arch_init(ms, s);
     if (ret < 0) {
         goto err;
@@ -1766,11 +1769,6 @@ void kvm_cpu_synchronize_post_init(CPUState *cpu)
     run_on_cpu(cpu, do_kvm_cpu_synchronize_post_init, cpu);
 }
 
-void kvm_cpu_clean_state(CPUState *cpu)
-{
-    cpu->kvm_vcpu_dirty = false;
-}
-
 int kvm_cpu_exec(CPUState *cpu)
 {
     struct kvm_run *run = cpu->kvm_run;
diff --git a/kvm-stub.c b/kvm-stub.c
index a5051f7..dc97a5e 100644
--- a/kvm-stub.c
+++ b/kvm-stub.c
@@ -30,6 +30,7 @@ bool kvm_gsi_routing_allowed;
 bool kvm_gsi_direct_mapping;
 bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
+bool kvm_ioeventfd_any_length_allowed;
 
 int kvm_init_vcpu(CPUState *cpu)
 {
diff --git a/linux-headers/linux/userfaultfd.h b/linux-headers/linux/userfaultfd.h
new file mode 100644
index 0000000..9057d7a
--- /dev/null
+++ b/linux-headers/linux/userfaultfd.h
@@ -0,0 +1,167 @@
+/*
+ *  include/linux/userfaultfd.h
+ *
+ *  Copyright (C) 2007  Davide Libenzi <davidel@xmailserver.org>
+ *  Copyright (C) 2015  Red Hat, Inc.
+ *
+ */
+
+#ifndef _LINUX_USERFAULTFD_H
+#define _LINUX_USERFAULTFD_H
+
+#include <linux/types.h>
+
+#define UFFD_API ((__u64)0xAA)
+/*
+ * After implementing the respective features it will become:
+ * #define UFFD_API_FEATURES (UFFD_FEATURE_PAGEFAULT_FLAG_WP | \
+ *			      UFFD_FEATURE_EVENT_FORK)
+ */
+#define UFFD_API_FEATURES (0)
+#define UFFD_API_IOCTLS				\
+	((__u64)1 << _UFFDIO_REGISTER |		\
+	 (__u64)1 << _UFFDIO_UNREGISTER |	\
+	 (__u64)1 << _UFFDIO_API)
+#define UFFD_API_RANGE_IOCTLS			\
+	((__u64)1 << _UFFDIO_WAKE |		\
+	 (__u64)1 << _UFFDIO_COPY |		\
+	 (__u64)1 << _UFFDIO_ZEROPAGE)
+
+/*
+ * Valid ioctl command number range with this API is from 0x00 to
+ * 0x3F.  UFFDIO_API is the fixed number, everything else can be
+ * changed by implementing a different UFFD_API. If sticking to the
+ * same UFFD_API more ioctl can be added and userland will be aware of
+ * which ioctl the running kernel implements through the ioctl command
+ * bitmask written by the UFFDIO_API.
+ */
+#define _UFFDIO_REGISTER		(0x00)
+#define _UFFDIO_UNREGISTER		(0x01)
+#define _UFFDIO_WAKE			(0x02)
+#define _UFFDIO_COPY			(0x03)
+#define _UFFDIO_ZEROPAGE		(0x04)
+#define _UFFDIO_API			(0x3F)
+
+/* userfaultfd ioctl ids */
+#define UFFDIO 0xAA
+#define UFFDIO_API		_IOWR(UFFDIO, _UFFDIO_API,	\
+				      struct uffdio_api)
+#define UFFDIO_REGISTER		_IOWR(UFFDIO, _UFFDIO_REGISTER, \
+				      struct uffdio_register)
+#define UFFDIO_UNREGISTER	_IOR(UFFDIO, _UFFDIO_UNREGISTER,	\
+				     struct uffdio_range)
+#define UFFDIO_WAKE		_IOR(UFFDIO, _UFFDIO_WAKE,	\
+				     struct uffdio_range)
+#define UFFDIO_COPY		_IOWR(UFFDIO, _UFFDIO_COPY,	\
+				      struct uffdio_copy)
+#define UFFDIO_ZEROPAGE		_IOWR(UFFDIO, _UFFDIO_ZEROPAGE,	\
+				      struct uffdio_zeropage)
+
+/* read() structure */
+struct uffd_msg {
+	__u8	event;
+
+	__u8	reserved1;
+	__u16	reserved2;
+	__u32	reserved3;
+
+	union {
+		struct {
+			__u64	flags;
+			__u64	address;
+		} pagefault;
+
+		struct {
+			/* unused reserved fields */
+			__u64	reserved1;
+			__u64	reserved2;
+			__u64	reserved3;
+		} reserved;
+	} arg;
+} __packed;
+
+/*
+ * Start at 0x12 and not at 0 to be more strict against bugs.
+ */
+#define UFFD_EVENT_PAGEFAULT	0x12
+#if 0 /* not available yet */
+#define UFFD_EVENT_FORK		0x13
+#endif
+
+/* flags for UFFD_EVENT_PAGEFAULT */
+#define UFFD_PAGEFAULT_FLAG_WRITE	(1<<0)	/* If this was a write fault */
+#define UFFD_PAGEFAULT_FLAG_WP		(1<<1)	/* If reason is VM_UFFD_WP */
+
+struct uffdio_api {
+	/* userland asks for an API number and the features to enable */
+	__u64 api;
+	/*
+	 * Kernel answers below with the all available features for
+	 * the API, this notifies userland of which events and/or
+	 * which flags for each event are enabled in the current
+	 * kernel.
+	 *
+	 * Note: UFFD_EVENT_PAGEFAULT and UFFD_PAGEFAULT_FLAG_WRITE
+	 * are to be considered implicitly always enabled in all kernels as
+	 * long as the uffdio_api.api requested matches UFFD_API.
+	 */
+#if 0 /* not available yet */
+#define UFFD_FEATURE_PAGEFAULT_FLAG_WP		(1<<0)
+#define UFFD_FEATURE_EVENT_FORK			(1<<1)
+#endif
+	__u64 features;
+
+	__u64 ioctls;
+};
+
+struct uffdio_range {
+	__u64 start;
+	__u64 len;
+};
+
+struct uffdio_register {
+	struct uffdio_range range;
+#define UFFDIO_REGISTER_MODE_MISSING	((__u64)1<<0)
+#define UFFDIO_REGISTER_MODE_WP		((__u64)1<<1)
+	__u64 mode;
+
+	/*
+	 * kernel answers which ioctl commands are available for the
+	 * range, keep at the end as the last 8 bytes aren't read.
+	 */
+	__u64 ioctls;
+};
+
+struct uffdio_copy {
+	__u64 dst;
+	__u64 src;
+	__u64 len;
+	/*
+	 * There will be a wrprotection flag later that allows to map
+	 * pages wrprotected on the fly. And such a flag will be
+	 * available if the wrprotection ioctl are implemented for the
+	 * range according to the uffdio_register.ioctls.
+	 */
+#define UFFDIO_COPY_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * "copy" is written by the ioctl and must be at the end: the
+	 * copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 copy;
+};
+
+struct uffdio_zeropage {
+	struct uffdio_range range;
+#define UFFDIO_ZEROPAGE_MODE_DONTWAKE		((__u64)1<<0)
+	__u64 mode;
+
+	/*
+	 * "zeropage" is written by the ioctl and must be at the end:
+	 * the copy_from_user will not read the last 8 bytes.
+	 */
+	__s64 zeropage;
+};
+
+#endif /* _LINUX_USERFAULTFD_H */
diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h
index 14a0160..ead86db 100644
--- a/linux-headers/linux/vhost.h
+++ b/linux-headers/linux/vhost.h
@@ -78,7 +78,7 @@ struct vhost_memory {
 #define VHOST_SET_OWNER _IO(VHOST_VIRTIO, 0x01)
 /* Give up ownership, and reset the device to default values.
  * Allows subsequent call to VHOST_OWNER_SET to succeed. */
-#define VHOST_RESET_DEVICE _IO(VHOST_VIRTIO, 0x02)
+#define VHOST_RESET_OWNER _IO(VHOST_VIRTIO, 0x02)
 
 /* Set up/modify memory layout */
 #define VHOST_SET_MEM_TABLE	_IOW(VHOST_VIRTIO, 0x03, struct vhost_memory)
diff --git a/linux-user/syscall.c b/linux-user/syscall.c
index 8bfb24f..6c64ba6 100644
--- a/linux-user/syscall.c
+++ b/linux-user/syscall.c
@@ -5325,8 +5325,7 @@ static abi_long do_open_by_handle_at(abi_long mount_fd, abi_long handle,
         return -TARGET_EFAULT;
     }
 
-    fh = g_malloc0(total_size);
-    memcpy(fh, target_fh, total_size);
+    fh = g_memdup(target_fh, total_size);
     fh->handle_bytes = size;
     fh->handle_type = tswap32(target_fh->handle_type);
 
diff --git a/main-loop.c b/main-loop.c
index db600a3..df28670 100644
--- a/main-loop.c
+++ b/main-loop.c
@@ -506,6 +506,9 @@ int main_loop_wait(int nonblocking)
     slirp_pollfds_poll(gpollfds, (ret < 0));
 #endif
 
+    /* CPU thread can infinitely wait for event after
+       missing the warp */
+    qemu_clock_warp(QEMU_CLOCK_VIRTUAL);
     qemu_clock_run_all_timers();
 
     return ret;
diff --git a/memory.c b/memory.c
index 2eb1597..e193658 100644
--- a/memory.c
+++ b/memory.c
@@ -1688,7 +1688,9 @@ void memory_region_add_eventfd(MemoryRegion *mr,
     };
     unsigned i;
 
-    adjust_endianness(mr, &mrfd.data, size);
+    if (size) {
+        adjust_endianness(mr, &mrfd.data, size);
+    }
     memory_region_transaction_begin();
     for (i = 0; i < mr->ioeventfd_nb; ++i) {
         if (memory_region_ioeventfd_before(mrfd, mr->ioeventfds[i])) {
@@ -1721,7 +1723,9 @@ void memory_region_del_eventfd(MemoryRegion *mr,
     };
     unsigned i;
 
-    adjust_endianness(mr, &mrfd.data, size);
+    if (size) {
+        adjust_endianness(mr, &mrfd.data, size);
+    }
     memory_region_transaction_begin();
     for (i = 0; i < mr->ioeventfd_nb; ++i) {
         if (memory_region_ioeventfd_equal(mrfd, mr->ioeventfds[i])) {
@@ -2036,6 +2040,9 @@ static void listener_add_address_space(MemoryListener *listener,
         return;
     }
 
+    if (listener->begin) {
+        listener->begin(listener);
+    }
     if (global_dirty_log) {
         if (listener->log_global_start) {
             listener->log_global_start(listener);
@@ -2052,10 +2059,16 @@ static void listener_add_address_space(MemoryListener *listener,
             .offset_within_address_space = int128_get64(fr->addr.start),
             .readonly = fr->readonly,
         };
+        if (fr->dirty_log_mask && listener->log_start) {
+            listener->log_start(listener, &section, 0, fr->dirty_log_mask);
+        }
         if (listener->region_add) {
             listener->region_add(listener, &section);
         }
     }
+    if (listener->commit) {
+        listener->commit(listener);
+    }
     flatview_unref(view);
 }
 
diff --git a/migration/Makefile.objs b/migration/Makefile.objs
index d929e96..0cac6d7 100644
--- a/migration/Makefile.objs
+++ b/migration/Makefile.objs
@@ -1,7 +1,7 @@
 common-obj-y += migration.o tcp.o
 common-obj-y += vmstate.o
 common-obj-y += qemu-file.o qemu-file-buf.o qemu-file-unix.o qemu-file-stdio.o
-common-obj-y += xbzrle.o
+common-obj-y += xbzrle.o postcopy-ram.o
 
 common-obj-$(CONFIG_RDMA) += rdma.o
 common-obj-$(CONFIG_POSIX) += exec.o unix.o fd.o
diff --git a/migration/block.c b/migration/block.c
index f7bb1e0..310e2b3 100644
--- a/migration/block.c
+++ b/migration/block.c
@@ -591,7 +591,7 @@ static int64_t get_remaining_dirty(void)
 
 /* Called with iothread lock taken.  */
 
-static void blk_mig_cleanup(void)
+static void block_migration_cleanup(void *opaque)
 {
     BlkMigDevState *bmds;
     BlkMigBlock *blk;
@@ -618,11 +618,6 @@ static void blk_mig_cleanup(void)
     blk_mig_unlock();
 }
 
-static void block_migration_cancel(void *opaque)
-{
-    blk_mig_cleanup();
-}
-
 static int block_save_setup(QEMUFile *f, void *opaque)
 {
     int ret;
@@ -750,11 +745,12 @@ static int block_save_complete(QEMUFile *f, void *opaque)
 
     qemu_put_be64(f, BLK_MIG_FLAG_EOS);
 
-    blk_mig_cleanup();
     return 0;
 }
 
-static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+                               uint64_t *non_postcopiable_pending,
+                               uint64_t *postcopiable_pending)
 {
     /* Estimate pending number of bytes to send */
     uint64_t pending;
@@ -773,7 +769,8 @@ static uint64_t block_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
     qemu_mutex_unlock_iothread();
 
     DPRINTF("Enter save live pending  %" PRIu64 "\n", pending);
-    return pending;
+    /* We don't do postcopy */
+    *non_postcopiable_pending += pending;
 }
 
 static int block_load(QEMUFile *f, void *opaque, int version_id)
@@ -882,10 +879,10 @@ static SaveVMHandlers savevm_block_handlers = {
     .set_params = block_set_params,
     .save_live_setup = block_save_setup,
     .save_live_iterate = block_save_iterate,
-    .save_live_complete = block_save_complete,
+    .save_live_complete_precopy = block_save_complete,
     .save_live_pending = block_save_pending,
     .load_state = block_load,
-    .cancel = block_migration_cancel,
+    .cleanup = block_migration_cleanup,
     .is_active = block_is_active,
 };
 
diff --git a/migration/migration.c b/migration/migration.c
index b092f38..7e4e27b 100644
--- a/migration/migration.c
+++ b/migration/migration.c
@@ -21,15 +21,18 @@
 #include "sysemu/sysemu.h"
 #include "block/block.h"
 #include "qapi/qmp/qerror.h"
+#include "qapi/util.h"
 #include "qemu/sockets.h"
 #include "qemu/rcu.h"
 #include "migration/block.h"
+#include "migration/postcopy-ram.h"
 #include "qemu/thread.h"
 #include "qmp-commands.h"
 #include "trace.h"
-#include "qapi/util.h"
 #include "qapi-event.h"
 #include "qom/cpu.h"
+#include "exec/memory.h"
+#include "exec/address-spaces.h"
 
 #define MAX_THROTTLE  (32 << 20)      /* Migration transfer speed throttling */
 
@@ -57,6 +60,13 @@ static NotifierList migration_state_notifiers =
 
 static bool deferred_incoming;
 
+/*
+ * Current state of incoming postcopy; note this is not part of
+ * MigrationIncomingState since it's state is used during cleanup
+ * at the end as MIS is being freed.
+ */
+static PostcopyState incoming_postcopy_state;
+
 /* When we add fault tolerance, we could have several
    migrations at once.  For now we don't need to add
    dynamic creation of migration */
@@ -64,6 +74,7 @@ static bool deferred_incoming;
 /* For outgoing */
 MigrationState *migrate_get_current(void)
 {
+    static bool once;
     static MigrationState current_migration = {
         .state = MIGRATION_STATUS_NONE,
         .bandwidth_limit = MAX_THROTTLE,
@@ -81,6 +92,10 @@ MigrationState *migrate_get_current(void)
                 DEFAULT_MIGRATE_X_CPU_THROTTLE_INCREMENT,
     };
 
+    if (!once) {
+        qemu_mutex_init(&current_migration.src_page_req_mutex);
+        once = true;
+    }
     return &current_migration;
 }
 
@@ -95,14 +110,17 @@ MigrationIncomingState *migration_incoming_get_current(void)
 MigrationIncomingState *migration_incoming_state_new(QEMUFile* f)
 {
     mis_current = g_new0(MigrationIncomingState, 1);
-    mis_current->file = f;
+    mis_current->from_src_file = f;
     QLIST_INIT(&mis_current->loadvm_handlers);
+    qemu_mutex_init(&mis_current->rp_mutex);
+    qemu_event_init(&mis_current->main_thread_load_event, false);
 
     return mis_current;
 }
 
 void migration_incoming_state_destroy(void)
 {
+    qemu_event_destroy(&mis_current->main_thread_load_event);
     loadvm_free_handlers(mis_current);
     g_free(mis_current);
     mis_current = NULL;
@@ -248,6 +266,35 @@ static void deferred_incoming_migration(Error **errp)
     deferred_incoming = true;
 }
 
+/* Request a range of pages from the source VM at the given
+ * start address.
+ *   rbname: Name of the RAMBlock to request the page in, if NULL it's the same
+ *           as the last request (a name must have been given previously)
+ *   Start: Address offset within the RB
+ *   Len: Length in bytes required - must be a multiple of pagesize
+ */
+void migrate_send_rp_req_pages(MigrationIncomingState *mis, const char *rbname,
+                               ram_addr_t start, size_t len)
+{
+    uint8_t bufc[12 + 1 + 255]; /* start (8), len (4), rbname upto 256 */
+    size_t msglen = 12; /* start + len */
+
+    *(uint64_t *)bufc = cpu_to_be64((uint64_t)start);
+    *(uint32_t *)(bufc + 8) = cpu_to_be32((uint32_t)len);
+
+    if (rbname) {
+        int rbname_len = strlen(rbname);
+        assert(rbname_len < 256);
+
+        bufc[msglen++] = rbname_len;
+        memcpy(bufc + msglen, rbname, rbname_len);
+        msglen += rbname_len;
+        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES_ID, msglen, bufc);
+    } else {
+        migrate_send_rp_message(mis, MIG_RP_MSG_REQ_PAGES, msglen, bufc);
+    }
+}
+
 void qemu_start_incoming_migration(const char *uri, Error **errp)
 {
     const char *p;
@@ -278,12 +325,37 @@ static void process_incoming_migration_co(void *opaque)
 {
     QEMUFile *f = opaque;
     Error *local_err = NULL;
+    MigrationIncomingState *mis;
+    PostcopyState ps;
     int ret;
 
-    migration_incoming_state_new(f);
+    mis = migration_incoming_state_new(f);
+    postcopy_state_set(POSTCOPY_INCOMING_NONE);
     migrate_generate_event(MIGRATION_STATUS_ACTIVE);
+
     ret = qemu_loadvm_state(f);
 
+    ps = postcopy_state_get();
+    trace_process_incoming_migration_co_end(ret, ps);
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        if (ps == POSTCOPY_INCOMING_ADVISE) {
+            /*
+             * Where a migration had postcopy enabled (and thus went to advise)
+             * but managed to complete within the precopy period, we can use
+             * the normal exit.
+             */
+            postcopy_ram_incoming_cleanup(mis);
+        } else if (ret >= 0) {
+            /*
+             * Postcopy was started, cleanup should happen at the end of the
+             * postcopy thread.
+             */
+            trace_process_incoming_migration_co_postcopy_end_main();
+            return;
+        }
+        /* Else if something went wrong then just fall out of the normal exit */
+    }
+
     qemu_fclose(f);
     free_xbzrle_decoded_buf();
     migration_incoming_state_destroy();
@@ -344,6 +416,50 @@ void process_incoming_migration(QEMUFile *f)
     qemu_coroutine_enter(co, f);
 }
 
+/*
+ * Send a message on the return channel back to the source
+ * of the migration.
+ */
+void migrate_send_rp_message(MigrationIncomingState *mis,
+                             enum mig_rp_message_type message_type,
+                             uint16_t len, void *data)
+{
+    trace_migrate_send_rp_message((int)message_type, len);
+    qemu_mutex_lock(&mis->rp_mutex);
+    qemu_put_be16(mis->to_src_file, (unsigned int)message_type);
+    qemu_put_be16(mis->to_src_file, len);
+    qemu_put_buffer(mis->to_src_file, data, len);
+    qemu_fflush(mis->to_src_file);
+    qemu_mutex_unlock(&mis->rp_mutex);
+}
+
+/*
+ * Send a 'SHUT' message on the return channel with the given value
+ * to indicate that we've finished with the RP.  Non-0 value indicates
+ * error.
+ */
+void migrate_send_rp_shut(MigrationIncomingState *mis,
+                          uint32_t value)
+{
+    uint32_t buf;
+
+    buf = cpu_to_be32(value);
+    migrate_send_rp_message(mis, MIG_RP_MSG_SHUT, sizeof(buf), &buf);
+}
+
+/*
+ * Send a 'PONG' message on the return channel with the given value
+ * (normally in response to a 'PING')
+ */
+void migrate_send_rp_pong(MigrationIncomingState *mis,
+                          uint32_t value)
+{
+    uint32_t buf;
+
+    buf = cpu_to_be32(value);
+    migrate_send_rp_message(mis, MIG_RP_MSG_PONG, sizeof(buf), &buf);
+}
+
 /* amount of nanoseconds we are willing to wait for migration to be down.
  * the choice of nanoseconds is because it is the maximum resolution that
  * get_clock() can achieve. It is an internal measure. All user-visible
@@ -399,6 +515,24 @@ MigrationParameters *qmp_query_migrate_parameters(Error **errp)
     return params;
 }
 
+/*
+ * Return true if we're already in the middle of a migration
+ * (i.e. any of the active or setup states)
+ */
+static bool migration_is_setup_or_active(int state)
+{
+    switch (state) {
+    case MIGRATION_STATUS_ACTIVE:
+    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+    case MIGRATION_STATUS_SETUP:
+        return true;
+
+    default:
+        return false;
+
+    }
+}
+
 static void get_xbzrle_cache_stats(MigrationInfo *info)
 {
     if (migrate_use_xbzrle()) {
@@ -465,6 +599,39 @@ MigrationInfo *qmp_query_migrate(Error **errp)
 
         get_xbzrle_cache_stats(info);
         break;
+    case MIGRATION_STATUS_POSTCOPY_ACTIVE:
+        /* Mostly the same as active; TODO add some postcopy stats */
+        info->has_status = true;
+        info->has_total_time = true;
+        info->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME)
+            - s->total_time;
+        info->has_expected_downtime = true;
+        info->expected_downtime = s->expected_downtime;
+        info->has_setup_time = true;
+        info->setup_time = s->setup_time;
+
+        info->has_ram = true;
+        info->ram = g_malloc0(sizeof(*info->ram));
+        info->ram->transferred = ram_bytes_transferred();
+        info->ram->remaining = ram_bytes_remaining();
+        info->ram->total = ram_bytes_total();
+        info->ram->duplicate = dup_mig_pages_transferred();
+        info->ram->skipped = skipped_mig_pages_transferred();
+        info->ram->normal = norm_mig_pages_transferred();
+        info->ram->normal_bytes = norm_mig_bytes_transferred();
+        info->ram->dirty_pages_rate = s->dirty_pages_rate;
+        info->ram->mbps = s->mbps;
+
+        if (blk_mig_active()) {
+            info->has_disk = true;
+            info->disk = g_malloc0(sizeof(*info->disk));
+            info->disk->transferred = blk_mig_bytes_transferred();
+            info->disk->remaining = blk_mig_bytes_remaining();
+            info->disk->total = blk_mig_bytes_total();
+        }
+
+        get_xbzrle_cache_stats(info);
+        break;
     case MIGRATION_STATUS_COMPLETED:
         get_xbzrle_cache_stats(info);
 
@@ -506,8 +673,7 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     MigrationState *s = migrate_get_current();
     MigrationCapabilityStatusList *cap;
 
-    if (s->state == MIGRATION_STATUS_ACTIVE ||
-        s->state == MIGRATION_STATUS_SETUP) {
+    if (migration_is_setup_or_active(s->state)) {
         error_setg(errp, QERR_MIGRATION_ACTIVE);
         return;
     }
@@ -515,6 +681,20 @@ void qmp_migrate_set_capabilities(MigrationCapabilityStatusList *params,
     for (cap = params; cap; cap = cap->next) {
         s->enabled_capabilities[cap->value->capability] = cap->value->state;
     }
+
+    if (migrate_postcopy_ram()) {
+        if (migrate_use_compression()) {
+            /* The decompression threads asynchronously write into RAM
+             * rather than use the atomic copies needed to avoid
+             * userfaulting.  It should be possible to fix the decompression
+             * threads for compatibility in future.
+             */
+            error_report("Postcopy is not currently compatible with "
+                         "compression");
+            s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM] =
+                false;
+        }
+    }
 }
 
 void qmp_migrate_set_parameters(bool has_compress_level,
@@ -583,6 +763,28 @@ void qmp_migrate_set_parameters(bool has_compress_level,
     }
 }
 
+void qmp_migrate_start_postcopy(Error **errp)
+{
+    MigrationState *s = migrate_get_current();
+
+    if (!migrate_postcopy_ram()) {
+        error_setg(errp, "Enable postcopy with migrate_set_capability before"
+                         " the start of migration");
+        return;
+    }
+
+    if (s->state == MIGRATION_STATUS_NONE) {
+        error_setg(errp, "Postcopy must be started after migration has been"
+                         " started");
+        return;
+    }
+    /*
+     * we don't error if migration has finished since that would be racy
+     * with issuing this command.
+     */
+    atomic_set(&s->start_postcopy, true);
+}
+
 /* shared migration helpers */
 
 static void migrate_set_state(MigrationState *s, int old_state, int new_state)
@@ -600,10 +802,15 @@ static void migrate_fd_cleanup(void *opaque)
     qemu_bh_delete(s->cleanup_bh);
     s->cleanup_bh = NULL;
 
+    flush_page_queue(s);
+
     if (s->file) {
         trace_migrate_fd_cleanup();
         qemu_mutex_unlock_iothread();
-        qemu_thread_join(&s->thread);
+        if (s->migration_thread_running) {
+            qemu_thread_join(&s->thread);
+            s->migration_thread_running = false;
+        }
         qemu_mutex_lock_iothread();
 
         migrate_compress_threads_join();
@@ -611,14 +818,12 @@ static void migrate_fd_cleanup(void *opaque)
         s->file = NULL;
     }
 
-    assert(s->state != MIGRATION_STATUS_ACTIVE);
+    assert((s->state != MIGRATION_STATUS_ACTIVE) &&
+           (s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE));
 
-    if (s->state != MIGRATION_STATUS_COMPLETED) {
-        qemu_savevm_state_cancel();
-        if (s->state == MIGRATION_STATUS_CANCELLING) {
-            migrate_set_state(s, MIGRATION_STATUS_CANCELLING,
-                              MIGRATION_STATUS_CANCELLED);
-        }
+    if (s->state == MIGRATION_STATUS_CANCELLING) {
+        migrate_set_state(s, MIGRATION_STATUS_CANCELLING,
+                          MIGRATION_STATUS_CANCELLED);
     }
 
     notifier_list_notify(&migration_state_notifiers, s);
@@ -638,10 +843,14 @@ static void migrate_fd_cancel(MigrationState *s)
     QEMUFile *f = migrate_get_current()->file;
     trace_migrate_fd_cancel();
 
+    if (s->rp_state.from_dst_file) {
+        /* shutdown the rp socket, so causing the rp thread to shutdown */
+        qemu_file_shutdown(s->rp_state.from_dst_file);
+    }
+
     do {
         old_state = s->state;
-        if (old_state != MIGRATION_STATUS_SETUP &&
-            old_state != MIGRATION_STATUS_ACTIVE) {
+        if (!migration_is_setup_or_active(old_state)) {
             break;
         }
         migrate_set_state(s, old_state, MIGRATION_STATUS_CANCELLING);
@@ -685,43 +894,43 @@ bool migration_has_failed(MigrationState *s)
             s->state == MIGRATION_STATUS_FAILED);
 }
 
-static MigrationState *migrate_init(const MigrationParams *params)
+bool migration_in_postcopy(MigrationState *s)
 {
-    MigrationState *s = migrate_get_current();
-    int64_t bandwidth_limit = s->bandwidth_limit;
-    bool enabled_capabilities[MIGRATION_CAPABILITY_MAX];
-    int64_t xbzrle_cache_size = s->xbzrle_cache_size;
-    int compress_level = s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL];
-    int compress_thread_count =
-            s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS];
-    int decompress_thread_count =
-            s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS];
-    int x_cpu_throttle_initial =
-            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL];
-    int x_cpu_throttle_increment =
-            s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT];
+    return (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE);
+}
 
-    memcpy(enabled_capabilities, s->enabled_capabilities,
-           sizeof(enabled_capabilities));
+MigrationState *migrate_init(const MigrationParams *params)
+{
+    MigrationState *s = migrate_get_current();
 
-    memset(s, 0, sizeof(*s));
+    /*
+     * Reinitialise all migration state, except
+     * parameters/capabilities that the user set, and
+     * locks.
+     */
+    s->bytes_xfer = 0;
+    s->xfer_limit = 0;
+    s->cleanup_bh = 0;
+    s->file = NULL;
+    s->state = MIGRATION_STATUS_NONE;
     s->params = *params;
-    memcpy(s->enabled_capabilities, enabled_capabilities,
-           sizeof(enabled_capabilities));
-    s->xbzrle_cache_size = xbzrle_cache_size;
-
-    s->parameters[MIGRATION_PARAMETER_COMPRESS_LEVEL] = compress_level;
-    s->parameters[MIGRATION_PARAMETER_COMPRESS_THREADS] =
-               compress_thread_count;
-    s->parameters[MIGRATION_PARAMETER_DECOMPRESS_THREADS] =
-               decompress_thread_count;
-    s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INITIAL] =
-                x_cpu_throttle_initial;
-    s->parameters[MIGRATION_PARAMETER_X_CPU_THROTTLE_INCREMENT] =
-                x_cpu_throttle_increment;
-    s->bandwidth_limit = bandwidth_limit;
+    s->rp_state.from_dst_file = NULL;
+    s->rp_state.error = false;
+    s->mbps = 0.0;
+    s->downtime = 0;
+    s->expected_downtime = 0;
+    s->dirty_pages_rate = 0;
+    s->dirty_bytes_rate = 0;
+    s->setup_time = 0;
+    s->dirty_sync_count = 0;
+    s->start_postcopy = false;
+    s->migration_thread_running = false;
+    s->last_req_rb = NULL;
+
     migrate_set_state(s, MIGRATION_STATUS_NONE, MIGRATION_STATUS_SETUP);
 
+    QSIMPLEQ_INIT(&s->src_page_requests);
+
     s->total_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     return s;
 }
@@ -773,8 +982,7 @@ void qmp_migrate(const char *uri, bool has_blk, bool blk,
     params.blk = has_blk && blk;
     params.shared = has_inc && inc;
 
-    if (s->state == MIGRATION_STATUS_ACTIVE ||
-        s->state == MIGRATION_STATUS_SETUP ||
+    if (migration_is_setup_or_active(s->state) ||
         s->state == MIGRATION_STATUS_CANCELLING) {
         error_setg(errp, QERR_MIGRATION_ACTIVE);
         return;
@@ -893,6 +1101,15 @@ void qmp_migrate_set_downtime(double value, Error **errp)
     max_downtime = (uint64_t)value;
 }
 
+bool migrate_postcopy_ram(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_X_POSTCOPY_RAM];
+}
+
 bool migrate_auto_converge(void)
 {
     MigrationState *s;
@@ -974,36 +1191,382 @@ int64_t migrate_xbzrle_cache_size(void)
     return s->xbzrle_cache_size;
 }
 
+/* migration thread support */
+/*
+ * Something bad happened to the RP stream, mark an error
+ * The caller shall print or trace something to indicate why
+ */
+static void mark_source_rp_bad(MigrationState *s)
+{
+    s->rp_state.error = true;
+}
+
+static struct rp_cmd_args {
+    ssize_t     len; /* -1 = variable */
+    const char *name;
+} rp_cmd_args[] = {
+    [MIG_RP_MSG_INVALID]        = { .len = -1, .name = "INVALID" },
+    [MIG_RP_MSG_SHUT]           = { .len =  4, .name = "SHUT" },
+    [MIG_RP_MSG_PONG]           = { .len =  4, .name = "PONG" },
+    [MIG_RP_MSG_REQ_PAGES]      = { .len = 12, .name = "REQ_PAGES" },
+    [MIG_RP_MSG_REQ_PAGES_ID]   = { .len = -1, .name = "REQ_PAGES_ID" },
+    [MIG_RP_MSG_MAX]            = { .len = -1, .name = "MAX" },
+};
+
+/*
+ * Process a request for pages received on the return path,
+ * We're allowed to send more than requested (e.g. to round to our page size)
+ * and we don't need to send pages that have already been sent.
+ */
+static void migrate_handle_rp_req_pages(MigrationState *ms, const char* rbname,
+                                       ram_addr_t start, size_t len)
+{
+    long our_host_ps = getpagesize();
+
+    trace_migrate_handle_rp_req_pages(rbname, start, len);
+
+    /*
+     * Since we currently insist on matching page sizes, just sanity check
+     * we're being asked for whole host pages.
+     */
+    if (start & (our_host_ps-1) ||
+       (len & (our_host_ps-1))) {
+        error_report("%s: Misaligned page request, start: " RAM_ADDR_FMT
+                     " len: %zd", __func__, start, len);
+        mark_source_rp_bad(ms);
+        return;
+    }
+
+    if (ram_save_queue_pages(ms, rbname, start, len)) {
+        mark_source_rp_bad(ms);
+    }
+}
+
+/*
+ * Handles messages sent on the return path towards the source VM
+ *
+ */
+static void *source_return_path_thread(void *opaque)
+{
+    MigrationState *ms = opaque;
+    QEMUFile *rp = ms->rp_state.from_dst_file;
+    uint16_t header_len, header_type;
+    const int max_len = 512;
+    uint8_t buf[max_len];
+    uint32_t tmp32, sibling_error;
+    ram_addr_t start = 0; /* =0 to silence warning */
+    size_t  len = 0, expected_len;
+    int res;
+
+    trace_source_return_path_thread_entry();
+    while (!ms->rp_state.error && !qemu_file_get_error(rp) &&
+           migration_is_setup_or_active(ms->state)) {
+        trace_source_return_path_thread_loop_top();
+        header_type = qemu_get_be16(rp);
+        header_len = qemu_get_be16(rp);
+
+        if (header_type >= MIG_RP_MSG_MAX ||
+            header_type == MIG_RP_MSG_INVALID) {
+            error_report("RP: Received invalid message 0x%04x length 0x%04x",
+                    header_type, header_len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        if ((rp_cmd_args[header_type].len != -1 &&
+            header_len != rp_cmd_args[header_type].len) ||
+            header_len > max_len) {
+            error_report("RP: Received '%s' message (0x%04x) with"
+                    "incorrect length %d expecting %zu",
+                    rp_cmd_args[header_type].name, header_type, header_len,
+                    (size_t)rp_cmd_args[header_type].len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* We know we've got a valid header by this point */
+        res = qemu_get_buffer(rp, buf, header_len);
+        if (res != header_len) {
+            error_report("RP: Failed reading data for message 0x%04x"
+                         " read %d expected %d",
+                         header_type, res, header_len);
+            mark_source_rp_bad(ms);
+            goto out;
+        }
+
+        /* OK, we have the message and the data */
+        switch (header_type) {
+        case MIG_RP_MSG_SHUT:
+            sibling_error = be32_to_cpup((uint32_t *)buf);
+            trace_source_return_path_thread_shut(sibling_error);
+            if (sibling_error) {
+                error_report("RP: Sibling indicated error %d", sibling_error);
+                mark_source_rp_bad(ms);
+            }
+            /*
+             * We'll let the main thread deal with closing the RP
+             * we could do a shutdown(2) on it, but we're the only user
+             * anyway, so there's nothing gained.
+             */
+            goto out;
+
+        case MIG_RP_MSG_PONG:
+            tmp32 = be32_to_cpup((uint32_t *)buf);
+            trace_source_return_path_thread_pong(tmp32);
+            break;
+
+        case MIG_RP_MSG_REQ_PAGES:
+            start = be64_to_cpup((uint64_t *)buf);
+            len = be32_to_cpup((uint32_t *)(buf + 8));
+            migrate_handle_rp_req_pages(ms, NULL, start, len);
+            break;
+
+        case MIG_RP_MSG_REQ_PAGES_ID:
+            expected_len = 12 + 1; /* header + termination */
+
+            if (header_len >= expected_len) {
+                start = be64_to_cpup((uint64_t *)buf);
+                len = be32_to_cpup((uint32_t *)(buf + 8));
+                /* Now we expect an idstr */
+                tmp32 = buf[12]; /* Length of the following idstr */
+                buf[13 + tmp32] = '\0';
+                expected_len += tmp32;
+            }
+            if (header_len != expected_len) {
+                error_report("RP: Req_Page_id with length %d expecting %zd",
+                        header_len, expected_len);
+                mark_source_rp_bad(ms);
+                goto out;
+            }
+            migrate_handle_rp_req_pages(ms, (char *)&buf[13], start, len);
+            break;
+
+        default:
+            break;
+        }
+    }
+    if (rp && qemu_file_get_error(rp)) {
+        trace_source_return_path_thread_bad_end();
+        mark_source_rp_bad(ms);
+    }
+
+    trace_source_return_path_thread_end();
+out:
+    ms->rp_state.from_dst_file = NULL;
+    qemu_fclose(rp);
+    return NULL;
+}
+
+static int open_return_path_on_source(MigrationState *ms)
+{
+
+    ms->rp_state.from_dst_file = qemu_file_get_return_path(ms->file);
+    if (!ms->rp_state.from_dst_file) {
+        return -1;
+    }
+
+    trace_open_return_path_on_source();
+    qemu_thread_create(&ms->rp_state.rp_thread, "return path",
+                       source_return_path_thread, ms, QEMU_THREAD_JOINABLE);
+
+    trace_open_return_path_on_source_continue();
+
+    return 0;
+}
+
+/* Returns 0 if the RP was ok, otherwise there was an error on the RP */
+static int await_return_path_close_on_source(MigrationState *ms)
+{
+    /*
+     * If this is a normal exit then the destination will send a SHUT and the
+     * rp_thread will exit, however if there's an error we need to cause
+     * it to exit.
+     */
+    if (qemu_file_get_error(ms->file) && ms->rp_state.from_dst_file) {
+        /*
+         * shutdown(2), if we have it, will cause it to unblock if it's stuck
+         * waiting for the destination.
+         */
+        qemu_file_shutdown(ms->rp_state.from_dst_file);
+        mark_source_rp_bad(ms);
+    }
+    trace_await_return_path_close_on_source_joining();
+    qemu_thread_join(&ms->rp_state.rp_thread);
+    trace_await_return_path_close_on_source_close();
+    return ms->rp_state.error;
+}
+
+/*
+ * Switch from normal iteration to postcopy
+ * Returns non-0 on error
+ */
+static int postcopy_start(MigrationState *ms, bool *old_vm_running)
+{
+    int ret;
+    const QEMUSizedBuffer *qsb;
+    int64_t time_at_stop = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+    migrate_set_state(ms, MIGRATION_STATUS_ACTIVE,
+                      MIGRATION_STATUS_POSTCOPY_ACTIVE);
+
+    trace_postcopy_start();
+    qemu_mutex_lock_iothread();
+    trace_postcopy_start_set_run();
+
+    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+    *old_vm_running = runstate_is_running();
+    global_state_store();
+    ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+
+    if (ret < 0) {
+        goto fail;
+    }
+
+    /*
+     * Cause any non-postcopiable, but iterative devices to
+     * send out their final data.
+     */
+    qemu_savevm_state_complete_precopy(ms->file, true);
+
+    /*
+     * in Finish migrate and with the io-lock held everything should
+     * be quiet, but we've potentially still got dirty pages and we
+     * need to tell the destination to throw any pages it's already received
+     * that are dirty
+     */
+    if (ram_postcopy_send_discard_bitmap(ms)) {
+        error_report("postcopy send discard bitmap failed");
+        goto fail;
+    }
+
+    /*
+     * send rest of state - note things that are doing postcopy
+     * will notice we're in POSTCOPY_ACTIVE and not actually
+     * wrap their state up here
+     */
+    qemu_file_set_rate_limit(ms->file, INT64_MAX);
+    /* Ping just for debugging, helps line traces up */
+    qemu_savevm_send_ping(ms->file, 2);
+
+    /*
+     * While loading the device state we may trigger page transfer
+     * requests and the fd must be free to process those, and thus
+     * the destination must read the whole device state off the fd before
+     * it starts processing it.  Unfortunately the ad-hoc migration format
+     * doesn't allow the destination to know the size to read without fully
+     * parsing it through each devices load-state code (especially the open
+     * coded devices that use get/put).
+     * So we wrap the device state up in a package with a length at the start;
+     * to do this we use a qemu_buf to hold the whole of the device state.
+     */
+    QEMUFile *fb = qemu_bufopen("w", NULL);
+    if (!fb) {
+        error_report("Failed to create buffered file");
+        goto fail;
+    }
+
+    /*
+     * Make sure the receiver can get incoming pages before we send the rest
+     * of the state
+     */
+    qemu_savevm_send_postcopy_listen(fb);
+
+    qemu_savevm_state_complete_precopy(fb, false);
+    qemu_savevm_send_ping(fb, 3);
+
+    qemu_savevm_send_postcopy_run(fb);
+
+    /* <><> end of stuff going into the package */
+    qsb = qemu_buf_get(fb);
+
+    /* Now send that blob */
+    if (qemu_savevm_send_packaged(ms->file, qsb)) {
+        goto fail_closefb;
+    }
+    qemu_fclose(fb);
+    ms->downtime =  qemu_clock_get_ms(QEMU_CLOCK_REALTIME) - time_at_stop;
+
+    qemu_mutex_unlock_iothread();
+
+    /*
+     * Although this ping is just for debug, it could potentially be
+     * used for getting a better measurement of downtime at the source.
+     */
+    qemu_savevm_send_ping(ms->file, 4);
+
+    ret = qemu_file_get_error(ms->file);
+    if (ret) {
+        error_report("postcopy_start: Migration stream errored");
+        migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                              MIGRATION_STATUS_FAILED);
+    }
+
+    return ret;
+
+fail_closefb:
+    qemu_fclose(fb);
+fail:
+    migrate_set_state(ms, MIGRATION_STATUS_POSTCOPY_ACTIVE,
+                          MIGRATION_STATUS_FAILED);
+    qemu_mutex_unlock_iothread();
+    return -1;
+}
+
 /**
  * migration_completion: Used by migration_thread when there's not much left.
  *   The caller 'breaks' the loop when this returns.
  *
  * @s: Current migration state
+ * @current_active_state: The migration state we expect to be in
  * @*old_vm_running: Pointer to old_vm_running flag
  * @*start_time: Pointer to time to update
  */
-static void migration_completion(MigrationState *s, bool *old_vm_running,
+static void migration_completion(MigrationState *s, int current_active_state,
+                                 bool *old_vm_running,
                                  int64_t *start_time)
 {
     int ret;
 
-    qemu_mutex_lock_iothread();
-    *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
-    qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
-    *old_vm_running = runstate_is_running();
+    if (s->state == MIGRATION_STATUS_ACTIVE) {
+        qemu_mutex_lock_iothread();
+        *start_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+        qemu_system_wakeup_request(QEMU_WAKEUP_REASON_OTHER);
+        *old_vm_running = runstate_is_running();
+        ret = global_state_store();
+
+        if (!ret) {
+            ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
+            if (ret >= 0) {
+                qemu_file_set_rate_limit(s->file, INT64_MAX);
+                qemu_savevm_state_complete_precopy(s->file, false);
+            }
+        }
+        qemu_mutex_unlock_iothread();
 
-    ret = global_state_store();
-    if (!ret) {
-        ret = vm_stop_force_state(RUN_STATE_FINISH_MIGRATE);
-        if (ret >= 0) {
-            qemu_file_set_rate_limit(s->file, INT64_MAX);
-            qemu_savevm_state_complete(s->file);
+        if (ret < 0) {
+            goto fail;
         }
+    } else if (s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
+        trace_migration_completion_postcopy_end();
+
+        qemu_savevm_state_complete_postcopy(s->file);
+        trace_migration_completion_postcopy_end_after_complete();
     }
-    qemu_mutex_unlock_iothread();
 
-    if (ret < 0) {
-        goto fail;
+    /*
+     * If rp was opened we must clean up the thread before
+     * cleaning everything else up (since if there are no failures
+     * it will wait for the destination to send it's status in
+     * a SHUT command).
+     * Postcopy opens rp if enabled (even if it's not avtivated)
+     */
+    if (migrate_postcopy_ram()) {
+        int rp_error;
+        trace_migration_completion_postcopy_end_before_rp();
+        rp_error = await_return_path_close_on_source(s);
+        trace_migration_completion_postcopy_end_after_rp(rp_error);
+        if (rp_error) {
+            goto fail;
+        }
     }
 
     if (qemu_file_get_error(s->file)) {
@@ -1011,52 +1574,101 @@ static void migration_completion(MigrationState *s, bool *old_vm_running,
         goto fail;
     }
 
-    migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_COMPLETED);
+    migrate_set_state(s, current_active_state, MIGRATION_STATUS_COMPLETED);
     return;
 
 fail:
-    migrate_set_state(s, MIGRATION_STATUS_ACTIVE, MIGRATION_STATUS_FAILED);
+    migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
 }
 
-/* migration thread support */
-
+/*
+ * Master migration thread on the source VM.
+ * It drives the migration and pumps the data down the outgoing channel.
+ */
 static void *migration_thread(void *opaque)
 {
     MigrationState *s = opaque;
+    /* Used by the bandwidth calcs, updated later */
     int64_t initial_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
     int64_t setup_start = qemu_clock_get_ms(QEMU_CLOCK_HOST);
     int64_t initial_bytes = 0;
     int64_t max_size = 0;
     int64_t start_time = initial_time;
+    int64_t end_time;
     bool old_vm_running = false;
+    bool entered_postcopy = false;
+    /* The active state we expect to be in; ACTIVE or POSTCOPY_ACTIVE */
+    enum MigrationStatus current_active_state = MIGRATION_STATUS_ACTIVE;
 
     rcu_register_thread();
 
     qemu_savevm_state_header(s->file);
+
+    if (migrate_postcopy_ram()) {
+        /* Now tell the dest that it should open its end so it can reply */
+        qemu_savevm_send_open_return_path(s->file);
+
+        /* And do a ping that will make stuff easier to debug */
+        qemu_savevm_send_ping(s->file, 1);
+
+        /*
+         * Tell the destination that we *might* want to do postcopy later;
+         * if the other end can't do postcopy it should fail now, nice and
+         * early.
+         */
+        qemu_savevm_send_postcopy_advise(s->file);
+    }
+
     qemu_savevm_state_begin(s->file, &s->params);
 
     s->setup_time = qemu_clock_get_ms(QEMU_CLOCK_HOST) - setup_start;
+    current_active_state = MIGRATION_STATUS_ACTIVE;
     migrate_set_state(s, MIGRATION_STATUS_SETUP, MIGRATION_STATUS_ACTIVE);
 
-    while (s->state == MIGRATION_STATUS_ACTIVE) {
+    trace_migration_thread_setup_complete();
+
+    while (s->state == MIGRATION_STATUS_ACTIVE ||
+           s->state == MIGRATION_STATUS_POSTCOPY_ACTIVE) {
         int64_t current_time;
         uint64_t pending_size;
 
         if (!qemu_file_rate_limit(s->file)) {
-            pending_size = qemu_savevm_state_pending(s->file, max_size);
-            trace_migrate_pending(pending_size, max_size);
+            uint64_t pend_post, pend_nonpost;
+
+            qemu_savevm_state_pending(s->file, max_size, &pend_nonpost,
+                                      &pend_post);
+            pending_size = pend_nonpost + pend_post;
+            trace_migrate_pending(pending_size, max_size,
+                                  pend_post, pend_nonpost);
             if (pending_size && pending_size >= max_size) {
-                qemu_savevm_state_iterate(s->file);
+                /* Still a significant amount to transfer */
+
+                current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
+                if (migrate_postcopy_ram() &&
+                    s->state != MIGRATION_STATUS_POSTCOPY_ACTIVE &&
+                    pend_nonpost <= max_size &&
+                    atomic_read(&s->start_postcopy)) {
+
+                    if (!postcopy_start(s, &old_vm_running)) {
+                        current_active_state = MIGRATION_STATUS_POSTCOPY_ACTIVE;
+                        entered_postcopy = true;
+                    }
+
+                    continue;
+                }
+                /* Just another iteration step */
+                qemu_savevm_state_iterate(s->file, entered_postcopy);
             } else {
                 trace_migration_thread_low_pending(pending_size);
-                migration_completion(s, &old_vm_running, &start_time);
+                migration_completion(s, current_active_state,
+                                     &old_vm_running, &start_time);
                 break;
             }
         }
 
         if (qemu_file_get_error(s->file)) {
-            migrate_set_state(s, MIGRATION_STATUS_ACTIVE,
-                              MIGRATION_STATUS_FAILED);
+            migrate_set_state(s, current_active_state, MIGRATION_STATUS_FAILED);
+            trace_migration_thread_file_err();
             break;
         }
         current_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
@@ -1087,22 +1699,26 @@ static void *migration_thread(void *opaque)
         }
     }
 
+    trace_migration_thread_after_loop();
     /* If we enabled cpu throttling for auto-converge, turn it off. */
     cpu_throttle_stop();
+    end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
 
     qemu_mutex_lock_iothread();
+    qemu_savevm_state_cleanup();
     if (s->state == MIGRATION_STATUS_COMPLETED) {
-        int64_t end_time = qemu_clock_get_ms(QEMU_CLOCK_REALTIME);
         uint64_t transferred_bytes = qemu_ftell(s->file);
         s->total_time = end_time - s->total_time;
-        s->downtime = end_time - start_time;
+        if (!entered_postcopy) {
+            s->downtime = end_time - start_time;
+        }
         if (s->total_time) {
             s->mbps = (((double) transferred_bytes * 8.0) /
                        ((double) s->total_time)) / 1000;
         }
         runstate_set(RUN_STATE_POSTMIGRATE);
     } else {
-        if (old_vm_running) {
+        if (old_vm_running && !entered_postcopy) {
             vm_start();
         }
     }
@@ -1125,7 +1741,34 @@ void migrate_fd_connect(MigrationState *s)
     /* Notify before starting migration thread */
     notifier_list_notify(&migration_state_notifiers, s);
 
+    /*
+     * Open the return path; currently for postcopy but other things might
+     * also want it.
+     */
+    if (migrate_postcopy_ram()) {
+        if (open_return_path_on_source(s)) {
+            error_report("Unable to open return-path for postcopy");
+            migrate_set_state(s, MIGRATION_STATUS_SETUP,
+                              MIGRATION_STATUS_FAILED);
+            migrate_fd_cleanup(s);
+            return;
+        }
+    }
+
     migrate_compress_threads_create();
     qemu_thread_create(&s->thread, "migration", migration_thread, s,
                        QEMU_THREAD_JOINABLE);
+    s->migration_thread_running = true;
+}
+
+PostcopyState  postcopy_state_get(void)
+{
+    return atomic_mb_read(&incoming_postcopy_state);
 }
+
+/* Set the state and return the old state */
+PostcopyState postcopy_state_set(PostcopyState new_state)
+{
+    return atomic_xchg(&incoming_postcopy_state, new_state);
+}
+
diff --git a/migration/postcopy-ram.c b/migration/postcopy-ram.c
new file mode 100644
index 0000000..22d6b18
--- /dev/null
+++ b/migration/postcopy-ram.c
@@ -0,0 +1,767 @@
+/*
+ * Postcopy migration for RAM
+ *
+ * Copyright 2013-2015 Red Hat, Inc. and/or its affiliates
+ *
+ * Authors:
+ *  Dave Gilbert  <dgilbert@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+/*
+ * Postcopy is a migration technique where the execution flips from the
+ * source to the destination before all the data has been copied.
+ */
+
+#include <glib.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "qemu-common.h"
+#include "migration/migration.h"
+#include "migration/postcopy-ram.h"
+#include "sysemu/sysemu.h"
+#include "sysemu/balloon.h"
+#include "qemu/error-report.h"
+#include "trace.h"
+
+/* Arbitrary limit on size of each discard command,
+ * keeps them around ~200 bytes
+ */
+#define MAX_DISCARDS_PER_COMMAND 12
+
+struct PostcopyDiscardState {
+    const char *ramblock_name;
+    uint64_t offset; /* Bitmap entry for the 1st bit of this RAMBlock */
+    uint16_t cur_entry;
+    /*
+     * Start and length of a discard range (bytes)
+     */
+    uint64_t start_list[MAX_DISCARDS_PER_COMMAND];
+    uint64_t length_list[MAX_DISCARDS_PER_COMMAND];
+    unsigned int nsentwords;
+    unsigned int nsentcmds;
+};
+
+/* Postcopy needs to detect accesses to pages that haven't yet been copied
+ * across, and efficiently map new pages in, the techniques for doing this
+ * are target OS specific.
+ */
+#if defined(__linux__)
+
+#include <poll.h>
+#include <sys/eventfd.h>
+#include <sys/mman.h>
+#include <sys/ioctl.h>
+#include <sys/syscall.h>
+#include <sys/types.h>
+#include <asm/types.h> /* for __u64 */
+#endif
+
+#if defined(__linux__) && defined(__NR_userfaultfd)
+#include <linux/userfaultfd.h>
+
+static bool ufd_version_check(int ufd)
+{
+    struct uffdio_api api_struct;
+    uint64_t ioctl_mask;
+
+    api_struct.api = UFFD_API;
+    api_struct.features = 0;
+    if (ioctl(ufd, UFFDIO_API, &api_struct)) {
+        error_report("postcopy_ram_supported_by_host: UFFDIO_API failed: %s",
+                     strerror(errno));
+        return false;
+    }
+
+    ioctl_mask = (__u64)1 << _UFFDIO_REGISTER |
+                 (__u64)1 << _UFFDIO_UNREGISTER;
+    if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
+        error_report("Missing userfault features: %" PRIx64,
+                     (uint64_t)(~api_struct.ioctls & ioctl_mask));
+        return false;
+    }
+
+    return true;
+}
+
+/*
+ * Note: This has the side effect of munlock'ing all of RAM, that's
+ * normally fine since if the postcopy succeeds it gets turned back on at the
+ * end.
+ */
+bool postcopy_ram_supported_by_host(void)
+{
+    long pagesize = getpagesize();
+    int ufd = -1;
+    bool ret = false; /* Error unless we change it */
+    void *testarea = NULL;
+    struct uffdio_register reg_struct;
+    struct uffdio_range range_struct;
+    uint64_t feature_mask;
+
+    if ((1ul << qemu_target_page_bits()) > pagesize) {
+        error_report("Target page size bigger than host page size");
+        goto out;
+    }
+
+    ufd = syscall(__NR_userfaultfd, O_CLOEXEC);
+    if (ufd == -1) {
+        error_report("%s: userfaultfd not available: %s", __func__,
+                     strerror(errno));
+        goto out;
+    }
+
+    /* Version and features check */
+    if (!ufd_version_check(ufd)) {
+        goto out;
+    }
+
+    /*
+     * userfault and mlock don't go together; we'll put it back later if
+     * it was enabled.
+     */
+    if (munlockall()) {
+        error_report("%s: munlockall: %s", __func__,  strerror(errno));
+        return -1;
+    }
+
+    /*
+     *  We need to check that the ops we need are supported on anon memory
+     *  To do that we need to register a chunk and see the flags that
+     *  are returned.
+     */
+    testarea = mmap(NULL, pagesize, PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                                    MAP_ANONYMOUS, -1, 0);
+    if (testarea == MAP_FAILED) {
+        error_report("%s: Failed to map test area: %s", __func__,
+                     strerror(errno));
+        goto out;
+    }
+    g_assert(((size_t)testarea & (pagesize-1)) == 0);
+
+    reg_struct.range.start = (uintptr_t)testarea;
+    reg_struct.range.len = pagesize;
+    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+    if (ioctl(ufd, UFFDIO_REGISTER, &reg_struct)) {
+        error_report("%s userfault register: %s", __func__, strerror(errno));
+        goto out;
+    }
+
+    range_struct.start = (uintptr_t)testarea;
+    range_struct.len = pagesize;
+    if (ioctl(ufd, UFFDIO_UNREGISTER, &range_struct)) {
+        error_report("%s userfault unregister: %s", __func__, strerror(errno));
+        goto out;
+    }
+
+    feature_mask = (__u64)1 << _UFFDIO_WAKE |
+                   (__u64)1 << _UFFDIO_COPY |
+                   (__u64)1 << _UFFDIO_ZEROPAGE;
+    if ((reg_struct.ioctls & feature_mask) != feature_mask) {
+        error_report("Missing userfault map features: %" PRIx64,
+                     (uint64_t)(~reg_struct.ioctls & feature_mask));
+        goto out;
+    }
+
+    /* Success! */
+    ret = true;
+out:
+    if (testarea) {
+        munmap(testarea, pagesize);
+    }
+    if (ufd != -1) {
+        close(ufd);
+    }
+    return ret;
+}
+
+/**
+ * postcopy_ram_discard_range: Discard a range of memory.
+ * We can assume that if we've been called postcopy_ram_hosttest returned true.
+ *
+ * @mis: Current incoming migration state.
+ * @start, @length: range of memory to discard.
+ *
+ * returns: 0 on success.
+ */
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               size_t length)
+{
+    trace_postcopy_ram_discard_range(start, length);
+    if (madvise(start, length, MADV_DONTNEED)) {
+        error_report("%s MADV_DONTNEED: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Setup an area of RAM so that it *can* be used for postcopy later; this
+ * must be done right at the start prior to pre-copy.
+ * opaque should be the MIS.
+ */
+static int init_range(const char *block_name, void *host_addr,
+                      ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+
+    trace_postcopy_init_range(block_name, host_addr, offset, length);
+
+    /*
+     * We need the whole of RAM to be truly empty for postcopy, so things
+     * like ROMs and any data tables built during init must be zero'd
+     * - we're going to get the copy from the source anyway.
+     * (Precopy will just overwrite this data, so doesn't need the discard)
+     */
+    if (postcopy_ram_discard_range(mis, host_addr, length)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of migration, undo the effects of init_range
+ * opaque should be the MIS.
+ */
+static int cleanup_range(const char *block_name, void *host_addr,
+                        ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffdio_range range_struct;
+    trace_postcopy_cleanup_range(block_name, host_addr, offset, length);
+
+    /*
+     * We turned off hugepage for the precopy stage with postcopy enabled
+     * we can turn it back on now.
+     */
+    if (qemu_madvise(host_addr, length, QEMU_MADV_HUGEPAGE)) {
+        error_report("%s HUGEPAGE: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    /*
+     * We can also turn off userfault now since we should have all the
+     * pages.   It can be useful to leave it on to debug postcopy
+     * if you're not sure it's always getting every page.
+     */
+    range_struct.start = (uintptr_t)host_addr;
+    range_struct.len = length;
+
+    if (ioctl(mis->userfault_fd, UFFDIO_UNREGISTER, &range_struct)) {
+        error_report("%s: userfault unregister %s", __func__, strerror(errno));
+
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Initialise postcopy-ram, setting the RAM to a state where we can go into
+ * postcopy later; must be called prior to any precopy.
+ * called from arch_init's similarly named ram_postcopy_incoming_init
+ */
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    if (qemu_ram_foreach_block(init_range, mis)) {
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * At the end of a migration where postcopy_ram_incoming_init was called.
+ */
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    trace_postcopy_ram_incoming_cleanup_entry();
+
+    if (mis->have_fault_thread) {
+        uint64_t tmp64;
+
+        if (qemu_ram_foreach_block(cleanup_range, mis)) {
+            return -1;
+        }
+        /*
+         * Tell the fault_thread to exit, it's an eventfd that should
+         * currently be at 0, we're going to increment it to 1
+         */
+        tmp64 = 1;
+        if (write(mis->userfault_quit_fd, &tmp64, 8) == 8) {
+            trace_postcopy_ram_incoming_cleanup_join();
+            qemu_thread_join(&mis->fault_thread);
+        } else {
+            /* Not much we can do here, but may as well report it */
+            error_report("%s: incrementing userfault_quit_fd: %s", __func__,
+                         strerror(errno));
+        }
+        trace_postcopy_ram_incoming_cleanup_closeuf();
+        close(mis->userfault_fd);
+        close(mis->userfault_quit_fd);
+        mis->have_fault_thread = false;
+    }
+
+    qemu_balloon_inhibit(false);
+
+    if (enable_mlock) {
+        if (os_mlock() < 0) {
+            error_report("mlock: %s", strerror(errno));
+            /*
+             * It doesn't feel right to fail at this point, we have a valid
+             * VM state.
+             */
+        }
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_END);
+    migrate_send_rp_shut(mis, qemu_file_get_error(mis->from_src_file) != 0);
+
+    if (mis->postcopy_tmp_page) {
+        munmap(mis->postcopy_tmp_page, getpagesize());
+        mis->postcopy_tmp_page = NULL;
+    }
+    trace_postcopy_ram_incoming_cleanup_exit();
+    return 0;
+}
+
+/*
+ * Disable huge pages on an area
+ */
+static int nhp_range(const char *block_name, void *host_addr,
+                    ram_addr_t offset, ram_addr_t length, void *opaque)
+{
+    trace_postcopy_nhp_range(block_name, host_addr, offset, length);
+
+    /*
+     * Before we do discards we need to ensure those discards really
+     * do delete areas of the page, even if THP thinks a hugepage would
+     * be a good idea, so force hugepages off.
+     */
+    if (qemu_madvise(host_addr, length, QEMU_MADV_NOHUGEPAGE)) {
+        error_report("%s: NOHUGEPAGE: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Userfault requires us to mark RAM as NOHUGEPAGE prior to discard
+ * however leaving it until after precopy means that most of the precopy
+ * data is still THPd
+ */
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+    if (qemu_ram_foreach_block(nhp_range, mis)) {
+        return -1;
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_DISCARD);
+
+    return 0;
+}
+
+/*
+ * Mark the given area of RAM as requiring notification to unwritten areas
+ * Used as a  callback on qemu_ram_foreach_block.
+ *   host_addr: Base of area to mark
+ *   offset: Offset in the whole ram arena
+ *   length: Length of the section
+ *   opaque: MigrationIncomingState pointer
+ * Returns 0 on success
+ */
+static int ram_block_enable_notify(const char *block_name, void *host_addr,
+                                   ram_addr_t offset, ram_addr_t length,
+                                   void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffdio_register reg_struct;
+
+    reg_struct.range.start = (uintptr_t)host_addr;
+    reg_struct.range.len = length;
+    reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
+
+    /* Now tell our userfault_fd that it's responsible for this area */
+    if (ioctl(mis->userfault_fd, UFFDIO_REGISTER, &reg_struct)) {
+        error_report("%s userfault register: %s", __func__, strerror(errno));
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * Handle faults detected by the USERFAULT markings
+ */
+static void *postcopy_ram_fault_thread(void *opaque)
+{
+    MigrationIncomingState *mis = opaque;
+    struct uffd_msg msg;
+    int ret;
+    size_t hostpagesize = getpagesize();
+    RAMBlock *rb = NULL;
+    RAMBlock *last_rb = NULL; /* last RAMBlock we sent part of */
+
+    trace_postcopy_ram_fault_thread_entry();
+    qemu_sem_post(&mis->fault_thread_sem);
+
+    while (true) {
+        ram_addr_t rb_offset;
+        ram_addr_t in_raspace;
+        struct pollfd pfd[2];
+
+        /*
+         * We're mainly waiting for the kernel to give us a faulting HVA,
+         * however we can be told to quit via userfault_quit_fd which is
+         * an eventfd
+         */
+        pfd[0].fd = mis->userfault_fd;
+        pfd[0].events = POLLIN;
+        pfd[0].revents = 0;
+        pfd[1].fd = mis->userfault_quit_fd;
+        pfd[1].events = POLLIN; /* Waiting for eventfd to go positive */
+        pfd[1].revents = 0;
+
+        if (poll(pfd, 2, -1 /* Wait forever */) == -1) {
+            error_report("%s: userfault poll: %s", __func__, strerror(errno));
+            break;
+        }
+
+        if (pfd[1].revents) {
+            trace_postcopy_ram_fault_thread_quit();
+            break;
+        }
+
+        ret = read(mis->userfault_fd, &msg, sizeof(msg));
+        if (ret != sizeof(msg)) {
+            if (errno == EAGAIN) {
+                /*
+                 * if a wake up happens on the other thread just after
+                 * the poll, there is nothing to read.
+                 */
+                continue;
+            }
+            if (ret < 0) {
+                error_report("%s: Failed to read full userfault message: %s",
+                             __func__, strerror(errno));
+                break;
+            } else {
+                error_report("%s: Read %d bytes from userfaultfd expected %zd",
+                             __func__, ret, sizeof(msg));
+                break; /* Lost alignment, don't know what we'd read next */
+            }
+        }
+        if (msg.event != UFFD_EVENT_PAGEFAULT) {
+            error_report("%s: Read unexpected event %ud from userfaultfd",
+                         __func__, msg.event);
+            continue; /* It's not a page fault, shouldn't happen */
+        }
+
+        rb = qemu_ram_block_from_host(
+                 (void *)(uintptr_t)msg.arg.pagefault.address,
+                 true, &in_raspace, &rb_offset);
+        if (!rb) {
+            error_report("postcopy_ram_fault_thread: Fault outside guest: %"
+                         PRIx64, (uint64_t)msg.arg.pagefault.address);
+            break;
+        }
+
+        rb_offset &= ~(hostpagesize - 1);
+        trace_postcopy_ram_fault_thread_request(msg.arg.pagefault.address,
+                                                qemu_ram_get_idstr(rb),
+                                                rb_offset);
+
+        /*
+         * Send the request to the source - we want to request one
+         * of our host page sizes (which is >= TPS)
+         */
+        if (rb != last_rb) {
+            last_rb = rb;
+            migrate_send_rp_req_pages(mis, qemu_ram_get_idstr(rb),
+                                     rb_offset, hostpagesize);
+        } else {
+            /* Save some space */
+            migrate_send_rp_req_pages(mis, NULL,
+                                     rb_offset, hostpagesize);
+        }
+    }
+    trace_postcopy_ram_fault_thread_exit();
+    return NULL;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+    /* Open the fd for the kernel to give us userfaults */
+    mis->userfault_fd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
+    if (mis->userfault_fd == -1) {
+        error_report("%s: Failed to open userfault fd: %s", __func__,
+                     strerror(errno));
+        return -1;
+    }
+
+    /*
+     * Although the host check already tested the API, we need to
+     * do the check again as an ABI handshake on the new fd.
+     */
+    if (!ufd_version_check(mis->userfault_fd)) {
+        return -1;
+    }
+
+    /* Now an eventfd we use to tell the fault-thread to quit */
+    mis->userfault_quit_fd = eventfd(0, EFD_CLOEXEC);
+    if (mis->userfault_quit_fd == -1) {
+        error_report("%s: Opening userfault_quit_fd: %s", __func__,
+                     strerror(errno));
+        close(mis->userfault_fd);
+        return -1;
+    }
+
+    qemu_sem_init(&mis->fault_thread_sem, 0);
+    qemu_thread_create(&mis->fault_thread, "postcopy/fault",
+                       postcopy_ram_fault_thread, mis, QEMU_THREAD_JOINABLE);
+    qemu_sem_wait(&mis->fault_thread_sem);
+    qemu_sem_destroy(&mis->fault_thread_sem);
+    mis->have_fault_thread = true;
+
+    /* Mark so that we get notified of accesses to unwritten areas */
+    if (qemu_ram_foreach_block(ram_block_enable_notify, mis)) {
+        return -1;
+    }
+
+    /*
+     * Ballooning can mark pages as absent while we're postcopying
+     * that would cause false userfaults.
+     */
+    qemu_balloon_inhibit(true);
+
+    trace_postcopy_ram_enable_notify();
+
+    return 0;
+}
+
+/*
+ * Place a host page (from) at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+    struct uffdio_copy copy_struct;
+
+    copy_struct.dst = (uint64_t)(uintptr_t)host;
+    copy_struct.src = (uint64_t)(uintptr_t)from;
+    copy_struct.len = getpagesize();
+    copy_struct.mode = 0;
+
+    /* copy also acks to the kernel waking the stalled thread up
+     * TODO: We can inhibit that ack and only do it if it was requested
+     * which would be slightly cheaper, but we'd have to be careful
+     * of the order of updating our page state.
+     */
+    if (ioctl(mis->userfault_fd, UFFDIO_COPY, &copy_struct)) {
+        int e = errno;
+        error_report("%s: %s copy host: %p from: %p",
+                     __func__, strerror(e), host, from);
+
+        return -e;
+    }
+
+    trace_postcopy_place_page(host);
+    return 0;
+}
+
+/*
+ * Place a zero page at (host) atomically
+ * returns 0 on success
+ */
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+    struct uffdio_zeropage zero_struct;
+
+    zero_struct.range.start = (uint64_t)(uintptr_t)host;
+    zero_struct.range.len = getpagesize();
+    zero_struct.mode = 0;
+
+    if (ioctl(mis->userfault_fd, UFFDIO_ZEROPAGE, &zero_struct)) {
+        int e = errno;
+        error_report("%s: %s zero host: %p",
+                     __func__, strerror(e), host);
+
+        return -e;
+    }
+
+    trace_postcopy_place_page_zero(host);
+    return 0;
+}
+
+/*
+ * Returns a target page of memory that can be mapped at a later point in time
+ * using postcopy_place_page
+ * The same address is used repeatedly, postcopy_place_page just takes the
+ * backing page away.
+ * Returns: Pointer to allocated page
+ *
+ */
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+    if (!mis->postcopy_tmp_page) {
+        mis->postcopy_tmp_page = mmap(NULL, getpagesize(),
+                             PROT_READ | PROT_WRITE, MAP_PRIVATE |
+                             MAP_ANONYMOUS, -1, 0);
+        if (!mis->postcopy_tmp_page) {
+            error_report("%s: %s", __func__, strerror(errno));
+            return NULL;
+        }
+    }
+
+    return mis->postcopy_tmp_page;
+}
+
+#else
+/* No target OS support, stubs just fail */
+bool postcopy_ram_supported_by_host(void)
+{
+    error_report("%s: No OS support", __func__);
+    return false;
+}
+
+int postcopy_ram_incoming_init(MigrationIncomingState *mis, size_t ram_pages)
+{
+    error_report("postcopy_ram_incoming_init: No OS support");
+    return -1;
+}
+
+int postcopy_ram_incoming_cleanup(MigrationIncomingState *mis)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_ram_discard_range(MigrationIncomingState *mis, uint8_t *start,
+                               size_t length)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_ram_prepare_discard(MigrationIncomingState *mis)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_ram_enable_notify(MigrationIncomingState *mis)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_place_page(MigrationIncomingState *mis, void *host, void *from)
+{
+    assert(0);
+    return -1;
+}
+
+int postcopy_place_page_zero(MigrationIncomingState *mis, void *host)
+{
+    assert(0);
+    return -1;
+}
+
+void *postcopy_get_tmp_page(MigrationIncomingState *mis)
+{
+    assert(0);
+    return NULL;
+}
+
+#endif
+
+/* ------------------------------------------------------------------------- */
+
+/**
+ * postcopy_discard_send_init: Called at the start of each RAMBlock before
+ *   asking to discard individual ranges.
+ *
+ * @ms: The current migration state.
+ * @offset: the bitmap offset of the named RAMBlock in the migration
+ *   bitmap.
+ * @name: RAMBlock that discards will operate on.
+ *
+ * returns: a new PDS.
+ */
+PostcopyDiscardState *postcopy_discard_send_init(MigrationState *ms,
+                                                 unsigned long offset,
+                                                 const char *name)
+{
+    PostcopyDiscardState *res = g_malloc0(sizeof(PostcopyDiscardState));
+
+    if (res) {
+        res->ramblock_name = name;
+        res->offset = offset;
+    }
+
+    return res;
+}
+
+/**
+ * postcopy_discard_send_range: Called by the bitmap code for each chunk to
+ *   discard. May send a discard message, may just leave it queued to
+ *   be sent later.
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ * @start,@length: a range of pages in the migration bitmap in the
+ *   RAM block passed to postcopy_discard_send_init() (length=1 is one page)
+ */
+void postcopy_discard_send_range(MigrationState *ms, PostcopyDiscardState *pds,
+                                unsigned long start, unsigned long length)
+{
+    size_t tp_bits = qemu_target_page_bits();
+    /* Convert to byte offsets within the RAM block */
+    pds->start_list[pds->cur_entry] = (start - pds->offset) << tp_bits;
+    pds->length_list[pds->cur_entry] = length << tp_bits;
+    trace_postcopy_discard_send_range(pds->ramblock_name, start, length);
+    pds->cur_entry++;
+    pds->nsentwords++;
+
+    if (pds->cur_entry == MAX_DISCARDS_PER_COMMAND) {
+        /* Full set, ship it! */
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+                                              pds->cur_entry,
+                                              pds->start_list,
+                                              pds->length_list);
+        pds->nsentcmds++;
+        pds->cur_entry = 0;
+    }
+}
+
+/**
+ * postcopy_discard_send_finish: Called at the end of each RAMBlock by the
+ * bitmap code. Sends any outstanding discard messages, frees the PDS
+ *
+ * @ms: Current migration state.
+ * @pds: Structure initialised by postcopy_discard_send_init().
+ */
+void postcopy_discard_send_finish(MigrationState *ms, PostcopyDiscardState *pds)
+{
+    /* Anything unsent? */
+    if (pds->cur_entry) {
+        qemu_savevm_send_postcopy_ram_discard(ms->file, pds->ramblock_name,
+                                              pds->cur_entry,
+                                              pds->start_list,
+                                              pds->length_list);
+        pds->nsentcmds++;
+    }
+
+    trace_postcopy_discard_send_finish(pds->ramblock_name, pds->nsentwords,
+                                       pds->nsentcmds);
+
+    g_free(pds);
+}
diff --git a/migration/qemu-file-unix.c b/migration/qemu-file-unix.c
index 809bf07..c503b02 100644
--- a/migration/qemu-file-unix.c
+++ b/migration/qemu-file-unix.c
@@ -22,6 +22,7 @@
  * THE SOFTWARE.
  */
 #include "qemu-common.h"
+#include "qemu/error-report.h"
 #include "qemu/iov.h"
 #include "qemu/sockets.h"
 #include "qemu/coroutine.h"
@@ -39,12 +40,43 @@ static ssize_t socket_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
     QEMUFileSocket *s = opaque;
     ssize_t len;
     ssize_t size = iov_size(iov, iovcnt);
+    ssize_t offset = 0;
+    int     err;
 
-    len = iov_send(s->fd, iov, iovcnt, 0, size);
-    if (len < size) {
-        len = -socket_error();
-    }
-    return len;
+    while (size > 0) {
+        len = iov_send(s->fd, iov, iovcnt, offset, size);
+
+        if (len > 0) {
+            size -= len;
+            offset += len;
+        }
+
+        if (size > 0) {
+            err = socket_error();
+
+            if (err != EAGAIN && err != EWOULDBLOCK) {
+                error_report("socket_writev_buffer: Got err=%d for (%zu/%zu)",
+                             err, (size_t)size, (size_t)len);
+                /*
+                 * If I've already sent some but only just got the error, I
+                 * could return the amount validly sent so far and wait for the
+                 * next call to report the error, but I'd rather flag the error
+                 * immediately.
+                 */
+                return -err;
+            }
+
+            /* Emulate blocking */
+            GPollFD pfd;
+
+            pfd.fd = s->fd;
+            pfd.events = G_IO_OUT | G_IO_ERR;
+            pfd.revents = 0;
+            g_poll(&pfd, 1 /* 1 fd */, -1 /* no timeout */);
+        }
+     }
+
+    return offset;
 }
 
 static int socket_get_fd(void *opaque)
@@ -97,6 +129,56 @@ static int socket_shutdown(void *opaque, bool rd, bool wr)
     }
 }
 
+static int socket_return_close(void *opaque)
+{
+    QEMUFileSocket *s = opaque;
+    /*
+     * Note: We don't close the socket, that should be done by the forward
+     * path.
+     */
+    g_free(s);
+    return 0;
+}
+
+static const QEMUFileOps socket_return_read_ops = {
+    .get_fd          = socket_get_fd,
+    .get_buffer      = socket_get_buffer,
+    .close           = socket_return_close,
+    .shut_down       = socket_shutdown,
+};
+
+static const QEMUFileOps socket_return_write_ops = {
+    .get_fd          = socket_get_fd,
+    .writev_buffer   = socket_writev_buffer,
+    .close           = socket_return_close,
+    .shut_down       = socket_shutdown,
+};
+
+/*
+ * Give a QEMUFile* off the same socket but data in the opposite
+ * direction.
+ */
+static QEMUFile *socket_get_return_path(void *opaque)
+{
+    QEMUFileSocket *forward = opaque;
+    QEMUFileSocket *reverse;
+
+    if (qemu_file_get_error(forward->file)) {
+        /* If the forward file is in error, don't try and open a return */
+        return NULL;
+    }
+
+    reverse = g_malloc0(sizeof(QEMUFileSocket));
+    reverse->fd = forward->fd;
+    /* I don't think there's a better way to tell which direction 'this' is */
+    if (forward->file->ops->get_buffer != NULL) {
+        /* being called from the read side, so we need to be able to write */
+        return qemu_fopen_ops(reverse, &socket_return_write_ops);
+    } else {
+        return qemu_fopen_ops(reverse, &socket_return_read_ops);
+    }
+}
+
 static ssize_t unix_writev_buffer(void *opaque, struct iovec *iov, int iovcnt,
                                   int64_t pos)
 {
@@ -206,18 +288,19 @@ QEMUFile *qemu_fdopen(int fd, const char *mode)
 }
 
 static const QEMUFileOps socket_read_ops = {
-    .get_fd     = socket_get_fd,
-    .get_buffer = socket_get_buffer,
-    .close      = socket_close,
-    .shut_down  = socket_shutdown
-
+    .get_fd          = socket_get_fd,
+    .get_buffer      = socket_get_buffer,
+    .close           = socket_close,
+    .shut_down       = socket_shutdown,
+    .get_return_path = socket_get_return_path
 };
 
 static const QEMUFileOps socket_write_ops = {
-    .get_fd        = socket_get_fd,
-    .writev_buffer = socket_writev_buffer,
-    .close         = socket_close,
-    .shut_down     = socket_shutdown
+    .get_fd          = socket_get_fd,
+    .writev_buffer   = socket_writev_buffer,
+    .close           = socket_close,
+    .shut_down       = socket_shutdown,
+    .get_return_path = socket_get_return_path
 };
 
 QEMUFile *qemu_fopen_socket(int fd, const char *mode)
diff --git a/migration/qemu-file.c b/migration/qemu-file.c
index df49023..0bbd257 100644
--- a/migration/qemu-file.c
+++ b/migration/qemu-file.c
@@ -44,6 +44,18 @@ int qemu_file_shutdown(QEMUFile *f)
     return f->ops->shut_down(f->opaque, true, true);
 }
 
+/*
+ * Result: QEMUFile* for a 'return path' for comms in the opposite direction
+ *         NULL if not available
+ */
+QEMUFile *qemu_file_get_return_path(QEMUFile *f)
+{
+    if (!f->ops->get_return_path) {
+        return NULL;
+    }
+    return f->ops->get_return_path(f->opaque);
+}
+
 bool qemu_file_mode_is_not_valid(const char *mode)
 {
     if (mode == NULL ||
@@ -434,6 +446,43 @@ size_t qemu_get_buffer(QEMUFile *f, uint8_t *buf, size_t size)
 }
 
 /*
+ * Read 'size' bytes of data from the file.
+ * 'size' can be larger than the internal buffer.
+ *
+ * The data:
+ *   may be held on an internal buffer (in which case *buf is updated
+ *     to point to it) that is valid until the next qemu_file operation.
+ * OR
+ *   will be copied to the *buf that was passed in.
+ *
+ * The code tries to avoid the copy if possible.
+ *
+ * It will return size bytes unless there was an error, in which case it will
+ * return as many as it managed to read (assuming blocking fd's which
+ * all current QEMUFile are)
+ *
+ * Note: Since **buf may get changed, the caller should take care to
+ *       keep a pointer to the original buffer if it needs to deallocate it.
+ */
+size_t qemu_get_buffer_in_place(QEMUFile *f, uint8_t **buf, size_t size)
+{
+    if (size < IO_BUF_SIZE) {
+        size_t res;
+        uint8_t *src;
+
+        res = qemu_peek_buffer(f, &src, size, 0);
+
+        if (res == size) {
+            qemu_file_skip(f, res);
+            *buf = src;
+            return res;
+        }
+    }
+
+    return qemu_get_buffer(f, *buf, size);
+}
+
+/*
  * Peeks a single byte from the buffer; this isn't guaranteed to work if
  * offset leaves a gap after the previous read/peeked data.
  */
@@ -611,3 +660,18 @@ size_t qemu_get_counted_string(QEMUFile *f, char buf[256])
 
     return res == len ? res : 0;
 }
+
+/*
+ * Set the blocking state of the QEMUFile.
+ * Note: On some transports the OS only keeps a single blocking state for
+ *       both directions, and thus changing the blocking on the main
+ *       QEMUFile can also affect the return path.
+ */
+void qemu_file_set_blocking(QEMUFile *f, bool block)
+{
+    if (block) {
+        qemu_set_block(qemu_get_fd(f));
+    } else {
+        qemu_set_nonblock(qemu_get_fd(f));
+    }
+}
diff --git a/migration/ram.c b/migration/ram.c
index a25bcc7..7f32696 100644
--- a/migration/ram.c
+++ b/migration/ram.c
@@ -32,6 +32,7 @@
 #include "qemu/timer.h"
 #include "qemu/main-loop.h"
 #include "migration/migration.h"
+#include "migration/postcopy-ram.h"
 #include "exec/address-spaces.h"
 #include "migration/page_cache.h"
 #include "qemu/error-report.h"
@@ -237,7 +238,14 @@ typedef struct PageSearchStatus PageSearchStatus;
 
 static struct BitmapRcu {
     struct rcu_head rcu;
+    /* Main migration bitmap */
     unsigned long *bmap;
+    /* bitmap of pages that haven't been sent even once
+     * only maintained and used in postcopy at the moment
+     * where it's used to send the dirtymap at the start
+     * of the postcopy phase
+     */
+    unsigned long *unsentmap;
 } *migration_bitmap_rcu;
 
 struct CompressParam {
@@ -531,10 +539,18 @@ static int save_xbzrle_page(QEMUFile *f, uint8_t **current_data,
     return 1;
 }
 
-/* Called with rcu_read_lock() to protect migration_bitmap */
+/* Called with rcu_read_lock() to protect migration_bitmap
+ * rb: The RAMBlock  to search for dirty pages in
+ * start: Start address (typically so we can continue from previous page)
+ * ram_addr_abs: Pointer into which to store the address of the dirty page
+ *               within the global ram_addr space
+ *
+ * Returns: byte offset within memory region of the start of a dirty page
+ */
 static inline
-ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
-                                                 ram_addr_t start)
+ram_addr_t migration_bitmap_find_dirty(RAMBlock *rb,
+                                       ram_addr_t start,
+                                       ram_addr_t *ram_addr_abs)
 {
     unsigned long base = rb->offset >> TARGET_PAGE_BITS;
     unsigned long nr = base + (start >> TARGET_PAGE_BITS);
@@ -551,14 +567,24 @@ ram_addr_t migration_bitmap_find_and_reset_dirty(RAMBlock *rb,
         next = find_next_bit(bitmap, size, nr);
     }
 
-    if (next < size) {
-        clear_bit(next, bitmap);
+    *ram_addr_abs = next << TARGET_PAGE_BITS;
+    return (next - base) << TARGET_PAGE_BITS;
+}
+
+static inline bool migration_bitmap_clear_dirty(ram_addr_t addr)
+{
+    bool ret;
+    int nr = addr >> TARGET_PAGE_BITS;
+    unsigned long *bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+
+    ret = test_and_clear_bit(nr, bitmap);
+
+    if (ret) {
         migration_dirty_pages--;
     }
-    return (next - base) << TARGET_PAGE_BITS;
+    return ret;
 }
 
-/* Called with rcu_read_lock() to protect migration_bitmap */
 static void migration_bitmap_sync_range(ram_addr_t start, ram_addr_t length)
 {
     unsigned long *bitmap;
@@ -951,12 +977,14 @@ static int ram_save_compressed_page(QEMUFile *f, RAMBlock *block,
  * @f: Current migration stream.
  * @pss: Data about the state of the current dirty page scan.
  * @*again: Set to false if the search has scanned the whole of RAM
+ * *ram_addr_abs: Pointer into which to store the address of the dirty page
+ *               within the global ram_addr space
  */
 static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
-                             bool *again)
+                             bool *again, ram_addr_t *ram_addr_abs)
 {
-    pss->offset = migration_bitmap_find_and_reset_dirty(pss->block,
-                                                       pss->offset);
+    pss->offset = migration_bitmap_find_dirty(pss->block, pss->offset,
+                                              ram_addr_abs);
     if (pss->complete_round && pss->block == last_seen_block &&
         pss->offset >= last_offset) {
         /*
@@ -995,6 +1023,277 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
     }
 }
 
+/*
+ * Helper for 'get_queued_page' - gets a page off the queue
+ *      ms:      MigrationState in
+ * *offset:      Used to return the offset within the RAMBlock
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns:      block (or NULL if none available)
+ */
+static RAMBlock *unqueue_page(MigrationState *ms, ram_addr_t *offset,
+                              ram_addr_t *ram_addr_abs)
+{
+    RAMBlock *block = NULL;
+
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    if (!QSIMPLEQ_EMPTY(&ms->src_page_requests)) {
+        struct MigrationSrcPageRequest *entry =
+                                QSIMPLEQ_FIRST(&ms->src_page_requests);
+        block = entry->rb;
+        *offset = entry->offset;
+        *ram_addr_abs = (entry->offset + entry->rb->offset) &
+                        TARGET_PAGE_MASK;
+
+        if (entry->len > TARGET_PAGE_SIZE) {
+            entry->len -= TARGET_PAGE_SIZE;
+            entry->offset += TARGET_PAGE_SIZE;
+        } else {
+            memory_region_unref(block->mr);
+            QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+            g_free(entry);
+        }
+    }
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+
+    return block;
+}
+
+/*
+ * Unqueue a page from the queue fed by postcopy page requests; skips pages
+ * that are already sent (!dirty)
+ *
+ *      ms:      MigrationState in
+ *     pss:      PageSearchStatus structure updated with found block/offset
+ * ram_addr_abs: global offset in the dirty/sent bitmaps
+ *
+ * Returns:      true if a queued page is found
+ */
+static bool get_queued_page(MigrationState *ms, PageSearchStatus *pss,
+                            ram_addr_t *ram_addr_abs)
+{
+    RAMBlock  *block;
+    ram_addr_t offset;
+    bool dirty;
+
+    do {
+        block = unqueue_page(ms, &offset, ram_addr_abs);
+        /*
+         * We're sending this page, and since it's postcopy nothing else
+         * will dirty it, and we must make sure it doesn't get sent again
+         * even if this queue request was received after the background
+         * search already sent it.
+         */
+        if (block) {
+            unsigned long *bitmap;
+            bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+            dirty = test_bit(*ram_addr_abs >> TARGET_PAGE_BITS, bitmap);
+            if (!dirty) {
+                trace_get_queued_page_not_dirty(
+                    block->idstr, (uint64_t)offset,
+                    (uint64_t)*ram_addr_abs,
+                    test_bit(*ram_addr_abs >> TARGET_PAGE_BITS,
+                         atomic_rcu_read(&migration_bitmap_rcu)->unsentmap));
+            } else {
+                trace_get_queued_page(block->idstr,
+                                      (uint64_t)offset,
+                                      (uint64_t)*ram_addr_abs);
+            }
+        }
+
+    } while (block && !dirty);
+
+    if (block) {
+        /*
+         * As soon as we start servicing pages out of order, then we have
+         * to kill the bulk stage, since the bulk stage assumes
+         * in (migration_bitmap_find_and_reset_dirty) that every page is
+         * dirty, that's no longer true.
+         */
+        ram_bulk_stage = false;
+
+        /*
+         * We want the background search to continue from the queued page
+         * since the guest is likely to want other pages near to the page
+         * it just requested.
+         */
+        pss->block = block;
+        pss->offset = offset;
+    }
+
+    return !!block;
+}
+
+/**
+ * flush_page_queue: Flush any remaining pages in the ram request queue
+ *    it should be empty at the end anyway, but in error cases there may be
+ *    some left.
+ *
+ * ms: MigrationState
+ */
+void flush_page_queue(MigrationState *ms)
+{
+    struct MigrationSrcPageRequest *mspr, *next_mspr;
+    /* This queue generally should be empty - but in the case of a failed
+     * migration might have some droppings in.
+     */
+    rcu_read_lock();
+    QSIMPLEQ_FOREACH_SAFE(mspr, &ms->src_page_requests, next_req, next_mspr) {
+        memory_region_unref(mspr->rb->mr);
+        QSIMPLEQ_REMOVE_HEAD(&ms->src_page_requests, next_req);
+        g_free(mspr);
+    }
+    rcu_read_unlock();
+}
+
+/**
+ * Queue the pages for transmission, e.g. a request from postcopy destination
+ *   ms: MigrationStatus in which the queue is held
+ *   rbname: The RAMBlock the request is for - may be NULL (to mean reuse last)
+ *   start: Offset from the start of the RAMBlock
+ *   len: Length (in bytes) to send
+ *   Return: 0 on success
+ */
+int ram_save_queue_pages(MigrationState *ms, const char *rbname,
+                         ram_addr_t start, ram_addr_t len)
+{
+    RAMBlock *ramblock;
+
+    rcu_read_lock();
+    if (!rbname) {
+        /* Reuse last RAMBlock */
+        ramblock = ms->last_req_rb;
+
+        if (!ramblock) {
+            /*
+             * Shouldn't happen, we can't reuse the last RAMBlock if
+             * it's the 1st request.
+             */
+            error_report("ram_save_queue_pages no previous block");
+            goto err;
+        }
+    } else {
+        ramblock = qemu_ram_block_by_name(rbname);
+
+        if (!ramblock) {
+            /* We shouldn't be asked for a non-existent RAMBlock */
+            error_report("ram_save_queue_pages no block '%s'", rbname);
+            goto err;
+        }
+        ms->last_req_rb = ramblock;
+    }
+    trace_ram_save_queue_pages(ramblock->idstr, start, len);
+    if (start+len > ramblock->used_length) {
+        error_report("%s request overrun start=" RAM_ADDR_FMT " len="
+                     RAM_ADDR_FMT " blocklen=" RAM_ADDR_FMT,
+                     __func__, start, len, ramblock->used_length);
+        goto err;
+    }
+
+    struct MigrationSrcPageRequest *new_entry =
+        g_malloc0(sizeof(struct MigrationSrcPageRequest));
+    new_entry->rb = ramblock;
+    new_entry->offset = start;
+    new_entry->len = len;
+
+    memory_region_ref(ramblock->mr);
+    qemu_mutex_lock(&ms->src_page_req_mutex);
+    QSIMPLEQ_INSERT_TAIL(&ms->src_page_requests, new_entry, next_req);
+    qemu_mutex_unlock(&ms->src_page_req_mutex);
+    rcu_read_unlock();
+
+    return 0;
+
+err:
+    rcu_read_unlock();
+    return -1;
+}
+
+/**
+ * ram_save_target_page: Save one target page
+ *
+ *
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page;
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ *
+ * Returns: Number of pages written.
+ */
+static int ram_save_target_page(MigrationState *ms, QEMUFile *f,
+                                RAMBlock *block, ram_addr_t offset,
+                                bool last_stage,
+                                uint64_t *bytes_transferred,
+                                ram_addr_t dirty_ram_abs)
+{
+    int res = 0;
+
+    /* Check the pages is dirty and if it is send it */
+    if (migration_bitmap_clear_dirty(dirty_ram_abs)) {
+        unsigned long *unsentmap;
+        if (compression_switch && migrate_use_compression()) {
+            res = ram_save_compressed_page(f, block, offset,
+                                           last_stage,
+                                           bytes_transferred);
+        } else {
+            res = ram_save_page(f, block, offset, last_stage,
+                                bytes_transferred);
+        }
+
+        if (res < 0) {
+            return res;
+        }
+        unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+        if (unsentmap) {
+            clear_bit(dirty_ram_abs >> TARGET_PAGE_BITS, unsentmap);
+        }
+    }
+
+    return res;
+}
+
+/**
+ * ram_save_host_page: Starting at *offset send pages upto the end
+ *                     of the current host page.  It's valid for the initial
+ *                     offset to point into the middle of a host page
+ *                     in which case the remainder of the hostpage is sent.
+ *                     Only dirty target pages are sent.
+ *
+ * Returns: Number of pages written.
+ *
+ * @f: QEMUFile where to send the data
+ * @block: pointer to block that contains the page we want to send
+ * @offset: offset inside the block for the page; updated to last target page
+ *          sent
+ * @last_stage: if we are at the completion stage
+ * @bytes_transferred: increase it with the number of transferred bytes
+ * @dirty_ram_abs: Address of the start of the dirty page in ram_addr_t space
+ */
+static int ram_save_host_page(MigrationState *ms, QEMUFile *f, RAMBlock *block,
+                              ram_addr_t *offset, bool last_stage,
+                              uint64_t *bytes_transferred,
+                              ram_addr_t dirty_ram_abs)
+{
+    int tmppages, pages = 0;
+    do {
+        tmppages = ram_save_target_page(ms, f, block, *offset, last_stage,
+                                        bytes_transferred, dirty_ram_abs);
+        if (tmppages < 0) {
+            return tmppages;
+        }
+
+        pages += tmppages;
+        *offset += TARGET_PAGE_SIZE;
+        dirty_ram_abs += TARGET_PAGE_SIZE;
+    } while (*offset & (qemu_host_page_size - 1));
+
+    /* The offset we leave with is the last one we looked at */
+    *offset -= TARGET_PAGE_SIZE;
+    return pages;
+}
+
 /**
  * ram_find_and_save_block: Finds a dirty page and sends it to f
  *
@@ -1006,14 +1305,20 @@ static bool find_dirty_block(QEMUFile *f, PageSearchStatus *pss,
  * @f: QEMUFile where to send the data
  * @last_stage: if we are at the completion stage
  * @bytes_transferred: increase it with the number of transferred bytes
+ *
+ * On systems where host-page-size > target-page-size it will send all the
+ * pages in a host page that are dirty.
  */
 
 static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
                                    uint64_t *bytes_transferred)
 {
     PageSearchStatus pss;
+    MigrationState *ms = migrate_get_current();
     int pages = 0;
     bool again, found;
+    ram_addr_t dirty_ram_abs; /* Address of the start of the dirty page in
+                                 ram_addr_t space */
 
     pss.block = last_seen_block;
     pss.offset = last_offset;
@@ -1024,22 +1329,18 @@ static int ram_find_and_save_block(QEMUFile *f, bool last_stage,
     }
 
     do {
-        found = find_dirty_block(f, &pss, &again);
+        again = true;
+        found = get_queued_page(ms, &pss, &dirty_ram_abs);
 
-        if (found) {
-            if (compression_switch && migrate_use_compression()) {
-                pages = ram_save_compressed_page(f, pss.block, pss.offset,
-                                                 last_stage,
-                                                 bytes_transferred);
-            } else {
-                pages = ram_save_page(f, pss.block, pss.offset, last_stage,
-                                      bytes_transferred);
-            }
+        if (!found) {
+            /* priority queue empty, so just search for something dirty */
+            found = find_dirty_block(f, &pss, &again, &dirty_ram_abs);
+        }
 
-            /* if page is unmodified, continue to the next */
-            if (pages > 0) {
-                last_sent_block = pss.block;
-            }
+        if (found) {
+            pages = ram_save_host_page(ms, f, pss.block, &pss.offset,
+                                       last_stage, bytes_transferred,
+                                       dirty_ram_abs);
         }
     } while (!pages && again);
 
@@ -1097,10 +1398,11 @@ void free_xbzrle_decoded_buf(void)
 static void migration_bitmap_free(struct BitmapRcu *bmap)
 {
     g_free(bmap->bmap);
+    g_free(bmap->unsentmap);
     g_free(bmap);
 }
 
-static void migration_end(void)
+static void ram_migration_cleanup(void *opaque)
 {
     /* caller have hold iothread lock or is in a bh, so there is
      * no writing race against this migration_bitmap
@@ -1124,11 +1426,6 @@ static void migration_end(void)
     XBZRLE_cache_unlock();
 }
 
-static void ram_migration_cancel(void *opaque)
-{
-    migration_end();
-}
-
 static void reset_ram_globals(void)
 {
     last_seen_block = NULL;
@@ -1158,6 +1455,13 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
         qemu_mutex_lock(&migration_bitmap_mutex);
         bitmap_copy(bitmap->bmap, old_bitmap->bmap, old);
         bitmap_set(bitmap->bmap, old, new - old);
+
+        /* We don't have a way to safely extend the sentmap
+         * with RCU; so mark it as missing, entry to postcopy
+         * will fail.
+         */
+        bitmap->unsentmap = NULL;
+
         atomic_rcu_set(&migration_bitmap_rcu, bitmap);
         qemu_mutex_unlock(&migration_bitmap_mutex);
         migration_dirty_pages += new - old;
@@ -1165,6 +1469,394 @@ void migration_bitmap_extend(ram_addr_t old, ram_addr_t new)
     }
 }
 
+/*
+ * 'expected' is the value you expect the bitmap mostly to be full
+ * of; it won't bother printing lines that are all this value.
+ * If 'todump' is null the migration bitmap is dumped.
+ */
+void ram_debug_dump_bitmap(unsigned long *todump, bool expected)
+{
+    int64_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    int64_t cur;
+    int64_t linelen = 128;
+    char linebuf[129];
+
+    if (!todump) {
+        todump = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    }
+
+    for (cur = 0; cur < ram_pages; cur += linelen) {
+        int64_t curb;
+        bool found = false;
+        /*
+         * Last line; catch the case where the line length
+         * is longer than remaining ram
+         */
+        if (cur + linelen > ram_pages) {
+            linelen = ram_pages - cur;
+        }
+        for (curb = 0; curb < linelen; curb++) {
+            bool thisbit = test_bit(cur + curb, todump);
+            linebuf[curb] = thisbit ? '1' : '.';
+            found = found || (thisbit != expected);
+        }
+        if (found) {
+            linebuf[curb] = '\0';
+            fprintf(stderr,  "0x%08" PRIx64 " : %s\n", cur, linebuf);
+        }
+    }
+}
+
+/* **** functions for postcopy ***** */
+
+/*
+ * Callback from postcopy_each_ram_send_discard for each RAMBlock
+ * Note: At this point the 'unsentmap' is the processed bitmap combined
+ *       with the dirtymap; so a '1' means it's either dirty or unsent.
+ * start,length: Indexes into the bitmap for the first bit
+ *            representing the named block and length in target-pages
+ */
+static int postcopy_send_discard_bm_ram(MigrationState *ms,
+                                        PostcopyDiscardState *pds,
+                                        unsigned long start,
+                                        unsigned long length)
+{
+    unsigned long end = start + length; /* one after the end */
+    unsigned long current;
+    unsigned long *unsentmap;
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    for (current = start; current < end; ) {
+        unsigned long one = find_next_bit(unsentmap, end, current);
+
+        if (one <= end) {
+            unsigned long zero = find_next_zero_bit(unsentmap, end, one + 1);
+            unsigned long discard_length;
+
+            if (zero >= end) {
+                discard_length = end - one;
+            } else {
+                discard_length = zero - one;
+            }
+            postcopy_discard_send_range(ms, pds, one, discard_length);
+            current = one + discard_length;
+        } else {
+            current = one;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ *   Calls postcopy_send_discard_bm_ram for each RAMBlock
+ *   passing it bitmap indexes and name.
+ * Returns: 0 on success
+ * (qemu_ram_foreach_block ends up passing unscaled lengths
+ *  which would mean postcopy code would have to deal with target page)
+ */
+static int postcopy_each_ram_send_discard(MigrationState *ms)
+{
+    struct RAMBlock *block;
+    int ret;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+        PostcopyDiscardState *pds = postcopy_discard_send_init(ms,
+                                                               first,
+                                                               block->idstr);
+
+        /*
+         * Postcopy sends chunks of bitmap over the wire, but it
+         * just needs indexes at this point, avoids it having
+         * target page specific code.
+         */
+        ret = postcopy_send_discard_bm_ram(ms, pds, first,
+                                    block->used_length >> TARGET_PAGE_BITS);
+        postcopy_discard_send_finish(ms, pds);
+        if (ret) {
+            return ret;
+        }
+    }
+
+    return 0;
+}
+
+/*
+ * Helper for postcopy_chunk_hostpages; it's called twice to cleanup
+ *   the two bitmaps, that are similar, but one is inverted.
+ *
+ * We search for runs of target-pages that don't start or end on a
+ * host page boundary;
+ * unsent_pass=true: Cleans up partially unsent host pages by searching
+ *                 the unsentmap
+ * unsent_pass=false: Cleans up partially dirty host pages by searching
+ *                 the main migration bitmap
+ *
+ */
+static void postcopy_chunk_hostpages_pass(MigrationState *ms, bool unsent_pass,
+                                          RAMBlock *block,
+                                          PostcopyDiscardState *pds)
+{
+    unsigned long *bitmap;
+    unsigned long *unsentmap;
+    unsigned int host_ratio = qemu_host_page_size / TARGET_PAGE_SIZE;
+    unsigned long first = block->offset >> TARGET_PAGE_BITS;
+    unsigned long len = block->used_length >> TARGET_PAGE_BITS;
+    unsigned long last = first + (len - 1);
+    unsigned long run_start;
+
+    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+
+    if (unsent_pass) {
+        /* Find a sent page */
+        run_start = find_next_zero_bit(unsentmap, last + 1, first);
+    } else {
+        /* Find a dirty page */
+        run_start = find_next_bit(bitmap, last + 1, first);
+    }
+
+    while (run_start <= last) {
+        bool do_fixup = false;
+        unsigned long fixup_start_addr;
+        unsigned long host_offset;
+
+        /*
+         * If the start of this run of pages is in the middle of a host
+         * page, then we need to fixup this host page.
+         */
+        host_offset = run_start % host_ratio;
+        if (host_offset) {
+            do_fixup = true;
+            run_start -= host_offset;
+            fixup_start_addr = run_start;
+            /* For the next pass */
+            run_start = run_start + host_ratio;
+        } else {
+            /* Find the end of this run */
+            unsigned long run_end;
+            if (unsent_pass) {
+                run_end = find_next_bit(unsentmap, last + 1, run_start + 1);
+            } else {
+                run_end = find_next_zero_bit(bitmap, last + 1, run_start + 1);
+            }
+            /*
+             * If the end isn't at the start of a host page, then the
+             * run doesn't finish at the end of a host page
+             * and we need to discard.
+             */
+            host_offset = run_end % host_ratio;
+            if (host_offset) {
+                do_fixup = true;
+                fixup_start_addr = run_end - host_offset;
+                /*
+                 * This host page has gone, the next loop iteration starts
+                 * from after the fixup
+                 */
+                run_start = fixup_start_addr + host_ratio;
+            } else {
+                /*
+                 * No discards on this iteration, next loop starts from
+                 * next sent/dirty page
+                 */
+                run_start = run_end + 1;
+            }
+        }
+
+        if (do_fixup) {
+            unsigned long page;
+
+            /* Tell the destination to discard this page */
+            if (unsent_pass || !test_bit(fixup_start_addr, unsentmap)) {
+                /* For the unsent_pass we:
+                 *     discard partially sent pages
+                 * For the !unsent_pass (dirty) we:
+                 *     discard partially dirty pages that were sent
+                 *     (any partially sent pages were already discarded
+                 *     by the previous unsent_pass)
+                 */
+                postcopy_discard_send_range(ms, pds, fixup_start_addr,
+                                            host_ratio);
+            }
+
+            /* Clean up the bitmap */
+            for (page = fixup_start_addr;
+                 page < fixup_start_addr + host_ratio; page++) {
+                /* All pages in this host page are now not sent */
+                set_bit(page, unsentmap);
+
+                /*
+                 * Remark them as dirty, updating the count for any pages
+                 * that weren't previously dirty.
+                 */
+                migration_dirty_pages += !test_and_set_bit(page, bitmap);
+            }
+        }
+
+        if (unsent_pass) {
+            /* Find the next sent page for the next iteration */
+            run_start = find_next_zero_bit(unsentmap, last + 1,
+                                           run_start);
+        } else {
+            /* Find the next dirty page for the next iteration */
+            run_start = find_next_bit(bitmap, last + 1, run_start);
+        }
+    }
+}
+
+/*
+ * Utility for the outgoing postcopy code.
+ *
+ * Discard any partially sent host-page size chunks, mark any partially
+ * dirty host-page size chunks as all dirty.
+ *
+ * Returns: 0 on success
+ */
+static int postcopy_chunk_hostpages(MigrationState *ms)
+{
+    struct RAMBlock *block;
+
+    if (qemu_host_page_size == TARGET_PAGE_SIZE) {
+        /* Easy case - TPS==HPS - nothing to be done */
+        return 0;
+    }
+
+    /* Easiest way to make sure we don't resume in the middle of a host-page */
+    last_seen_block = NULL;
+    last_sent_block = NULL;
+    last_offset     = 0;
+
+    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
+        unsigned long first = block->offset >> TARGET_PAGE_BITS;
+
+        PostcopyDiscardState *pds =
+                         postcopy_discard_send_init(ms, first, block->idstr);
+
+        /* First pass: Discard all partially sent host pages */
+        postcopy_chunk_hostpages_pass(ms, true, block, pds);
+        /*
+         * Second pass: Ensure that all partially dirty host pages are made
+         * fully dirty.
+         */
+        postcopy_chunk_hostpages_pass(ms, false, block, pds);
+
+        postcopy_discard_send_finish(ms, pds);
+    } /* ram_list loop */
+
+    return 0;
+}
+
+/*
+ * Transmit the set of pages to be discarded after precopy to the target
+ * these are pages that:
+ *     a) Have been previously transmitted but are now dirty again
+ *     b) Pages that have never been transmitted, this ensures that
+ *        any pages on the destination that have been mapped by background
+ *        tasks get discarded (transparent huge pages is the specific concern)
+ * Hopefully this is pretty sparse
+ */
+int ram_postcopy_send_discard_bitmap(MigrationState *ms)
+{
+    int ret;
+    unsigned long *bitmap, *unsentmap;
+
+    rcu_read_lock();
+
+    /* This should be our last sync, the src is now paused */
+    migration_bitmap_sync();
+
+    unsentmap = atomic_rcu_read(&migration_bitmap_rcu)->unsentmap;
+    if (!unsentmap) {
+        /* We don't have a safe way to resize the sentmap, so
+         * if the bitmap was resized it will be NULL at this
+         * point.
+         */
+        error_report("migration ram resized during precopy phase");
+        rcu_read_unlock();
+        return -EINVAL;
+    }
+
+    /* Deal with TPS != HPS */
+    ret = postcopy_chunk_hostpages(ms);
+    if (ret) {
+        rcu_read_unlock();
+        return ret;
+    }
+
+    /*
+     * Update the unsentmap to be unsentmap = unsentmap | dirty
+     */
+    bitmap = atomic_rcu_read(&migration_bitmap_rcu)->bmap;
+    bitmap_or(unsentmap, unsentmap, bitmap,
+               last_ram_offset() >> TARGET_PAGE_BITS);
+
+
+    trace_ram_postcopy_send_discard_bitmap();
+#ifdef DEBUG_POSTCOPY
+    ram_debug_dump_bitmap(unsentmap, true);
+#endif
+
+    ret = postcopy_each_ram_send_discard(ms);
+    rcu_read_unlock();
+
+    return ret;
+}
+
+/*
+ * At the start of the postcopy phase of migration, any now-dirty
+ * precopied pages are discarded.
+ *
+ * start, length describe a byte address range within the RAMBlock
+ *
+ * Returns 0 on success.
+ */
+int ram_discard_range(MigrationIncomingState *mis,
+                      const char *block_name,
+                      uint64_t start, size_t length)
+{
+    int ret = -1;
+
+    rcu_read_lock();
+    RAMBlock *rb = qemu_ram_block_by_name(block_name);
+
+    if (!rb) {
+        error_report("ram_discard_range: Failed to find block '%s'",
+                     block_name);
+        goto err;
+    }
+
+    uint8_t *host_startaddr = rb->host + start;
+
+    if ((uintptr_t)host_startaddr & (qemu_host_page_size - 1)) {
+        error_report("ram_discard_range: Unaligned start address: %p",
+                     host_startaddr);
+        goto err;
+    }
+
+    if ((start + length) <= rb->used_length) {
+        uint8_t *host_endaddr = host_startaddr + length;
+        if ((uintptr_t)host_endaddr & (qemu_host_page_size - 1)) {
+            error_report("ram_discard_range: Unaligned end address: %p",
+                         host_endaddr);
+            goto err;
+        }
+        ret = postcopy_ram_discard_range(mis, host_startaddr, length);
+    } else {
+        error_report("ram_discard_range: Overrun block '%s' (%" PRIu64
+                     "/%zx/" RAM_ADDR_FMT")",
+                     block_name, start, length, rb->used_length);
+    }
+
+err:
+    rcu_read_unlock();
+
+    return ret;
+}
+
+
 /* Each of ram_save_setup, ram_save_iterate and ram_save_complete has
  * long-running RCU critical section.  When rcu-reclaims in the code
  * start to become numerous it will be necessary to reduce the
@@ -1219,10 +1911,15 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     reset_ram_globals();
 
     ram_bitmap_pages = last_ram_offset() >> TARGET_PAGE_BITS;
-    migration_bitmap_rcu = g_new(struct BitmapRcu, 1);
+    migration_bitmap_rcu = g_new0(struct BitmapRcu, 1);
     migration_bitmap_rcu->bmap = bitmap_new(ram_bitmap_pages);
     bitmap_set(migration_bitmap_rcu->bmap, 0, ram_bitmap_pages);
 
+    if (migrate_postcopy_ram()) {
+        migration_bitmap_rcu->unsentmap = bitmap_new(ram_bitmap_pages);
+        bitmap_set(migration_bitmap_rcu->unsentmap, 0, ram_bitmap_pages);
+    }
+
     /*
      * Count the total number of pages used by ram blocks not including any
      * gaps due to alignment or unplugs.
@@ -1322,7 +2019,9 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 {
     rcu_read_lock();
 
-    migration_bitmap_sync();
+    if (!migration_in_postcopy(migrate_get_current())) {
+        migration_bitmap_sync();
+    }
 
     ram_control_before_iterate(f, RAM_CONTROL_FINISH);
 
@@ -1344,19 +2043,21 @@ static int ram_save_complete(QEMUFile *f, void *opaque)
 
     rcu_read_unlock();
 
-    migration_end();
     qemu_put_be64(f, RAM_SAVE_FLAG_EOS);
 
     return 0;
 }
 
-static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
+static void ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size,
+                             uint64_t *non_postcopiable_pending,
+                             uint64_t *postcopiable_pending)
 {
     uint64_t remaining_size;
 
     remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
 
-    if (remaining_size < max_size) {
+    if (!migration_in_postcopy(migrate_get_current()) &&
+        remaining_size < max_size) {
         qemu_mutex_lock_iothread();
         rcu_read_lock();
         migration_bitmap_sync();
@@ -1364,7 +2065,9 @@ static uint64_t ram_save_pending(QEMUFile *f, void *opaque, uint64_t max_size)
         qemu_mutex_unlock_iothread();
         remaining_size = ram_save_remaining() * TARGET_PAGE_SIZE;
     }
-    return remaining_size;
+
+    /* We can do postcopy, and all the data is postcopiable */
+    *postcopiable_pending += remaining_size;
 }
 
 static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
@@ -1405,6 +2108,14 @@ static int load_xbzrle(QEMUFile *f, ram_addr_t addr, void *host)
 /* Must be called from within a rcu critical section.
  * Returns a pointer from within the RCU-protected ram_list.
  */
+/*
+ * Read a RAMBlock ID from the stream f, find the host address of the
+ * start of that block and add on 'offset'
+ *
+ * f: Stream to read from
+ * offset: Offset within the block
+ * flags: Page flags (mostly to see if it's a continuation of previous block)
+ */
 static inline void *host_from_stream_offset(QEMUFile *f,
                                             ram_addr_t offset,
                                             int flags)
@@ -1426,14 +2137,12 @@ static inline void *host_from_stream_offset(QEMUFile *f,
     qemu_get_buffer(f, (uint8_t *)id, len);
     id[len] = 0;
 
-    QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-        if (!strncmp(id, block->idstr, sizeof(id)) &&
-            block->max_length > offset) {
-            return block->host + offset;
-        }
+    block = qemu_ram_block_by_name(id);
+    if (block && block->max_length > offset) {
+        return block->host + offset;
     }
 
-    error_report("Can't find block %s!", id);
+    error_report("Can't find block %s", id);
     return NULL;
 }
 
@@ -1541,11 +2250,148 @@ static void decompress_data_with_multi_threads(uint8_t *compbuf,
     }
 }
 
+/*
+ * Allocate data structures etc needed by incoming migration with postcopy-ram
+ * postcopy-ram's similarly names postcopy_ram_incoming_init does the work
+ */
+int ram_postcopy_incoming_init(MigrationIncomingState *mis)
+{
+    size_t ram_pages = last_ram_offset() >> TARGET_PAGE_BITS;
+
+    return postcopy_ram_incoming_init(mis, ram_pages);
+}
+
+/*
+ * Called in postcopy mode by ram_load().
+ * rcu_read_lock is taken prior to this being called.
+ */
+static int ram_load_postcopy(QEMUFile *f)
+{
+    int flags = 0, ret = 0;
+    bool place_needed = false;
+    bool matching_page_sizes = qemu_host_page_size == TARGET_PAGE_SIZE;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    /* Temporary page that is later 'placed' */
+    void *postcopy_host_page = postcopy_get_tmp_page(mis);
+    void *last_host = NULL;
+    bool all_zero = false;
+
+    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+        ram_addr_t addr;
+        void *host = NULL;
+        void *page_buffer = NULL;
+        void *place_source = NULL;
+        uint8_t ch;
+
+        addr = qemu_get_be64(f);
+        flags = addr & ~TARGET_PAGE_MASK;
+        addr &= TARGET_PAGE_MASK;
+
+        trace_ram_load_postcopy_loop((uint64_t)addr, flags);
+        place_needed = false;
+        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE)) {
+            host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+                ret = -EINVAL;
+                break;
+            }
+            page_buffer = host;
+            /*
+             * Postcopy requires that we place whole host pages atomically.
+             * To make it atomic, the data is read into a temporary page
+             * that's moved into place later.
+             * The migration protocol uses,  possibly smaller, target-pages
+             * however the source ensures it always sends all the components
+             * of a host page in order.
+             */
+            page_buffer = postcopy_host_page +
+                          ((uintptr_t)host & ~qemu_host_page_mask);
+            /* If all TP are zero then we can optimise the place */
+            if (!((uintptr_t)host & ~qemu_host_page_mask)) {
+                all_zero = true;
+            } else {
+                /* not the 1st TP within the HP */
+                if (host != (last_host + TARGET_PAGE_SIZE)) {
+                    error_report("Non-sequential target page %p/%p\n",
+                                  host, last_host);
+                    ret = -EINVAL;
+                    break;
+                }
+            }
+
+
+            /*
+             * If it's the last part of a host page then we place the host
+             * page
+             */
+            place_needed = (((uintptr_t)host + TARGET_PAGE_SIZE) &
+                                     ~qemu_host_page_mask) == 0;
+            place_source = postcopy_host_page;
+        }
+        last_host = host;
+
+        switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
+        case RAM_SAVE_FLAG_COMPRESS:
+            ch = qemu_get_byte(f);
+            memset(page_buffer, ch, TARGET_PAGE_SIZE);
+            if (ch) {
+                all_zero = false;
+            }
+            break;
+
+        case RAM_SAVE_FLAG_PAGE:
+            all_zero = false;
+            if (!place_needed || !matching_page_sizes) {
+                qemu_get_buffer(f, page_buffer, TARGET_PAGE_SIZE);
+            } else {
+                /* Avoids the qemu_file copy during postcopy, which is
+                 * going to do a copy later; can only do it when we
+                 * do this read in one go (matching page sizes)
+                 */
+                qemu_get_buffer_in_place(f, (uint8_t **)&place_source,
+                                         TARGET_PAGE_SIZE);
+            }
+            break;
+        case RAM_SAVE_FLAG_EOS:
+            /* normal exit */
+            break;
+        default:
+            error_report("Unknown combination of migration flags: %#x"
+                         " (postcopy mode)", flags);
+            ret = -EINVAL;
+        }
+
+        if (place_needed) {
+            /* This gets called at the last target page in the host page */
+            if (all_zero) {
+                ret = postcopy_place_page_zero(mis,
+                                               host + TARGET_PAGE_SIZE -
+                                               qemu_host_page_size);
+            } else {
+                ret = postcopy_place_page(mis, host + TARGET_PAGE_SIZE -
+                                               qemu_host_page_size,
+                                               place_source);
+            }
+        }
+        if (!ret) {
+            ret = qemu_file_get_error(f);
+        }
+    }
+
+    return ret;
+}
+
 static int ram_load(QEMUFile *f, void *opaque, int version_id)
 {
     int flags = 0, ret = 0;
     static uint64_t seq_iter;
     int len = 0;
+    /*
+     * If system is running in postcopy mode, page inserts to host memory must
+     * be atomic
+     */
+    bool postcopy_running = postcopy_state_get() >= POSTCOPY_INCOMING_LISTENING;
 
     seq_iter++;
 
@@ -1559,15 +2405,30 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
      * critical section.
      */
     rcu_read_lock();
-    while (!ret && !(flags & RAM_SAVE_FLAG_EOS)) {
+
+    if (postcopy_running) {
+        ret = ram_load_postcopy(f);
+    }
+
+    while (!postcopy_running && !ret && !(flags & RAM_SAVE_FLAG_EOS)) {
         ram_addr_t addr, total_ram_bytes;
-        void *host;
+        void *host = NULL;
         uint8_t ch;
 
         addr = qemu_get_be64(f);
         flags = addr & ~TARGET_PAGE_MASK;
         addr &= TARGET_PAGE_MASK;
 
+        if (flags & (RAM_SAVE_FLAG_COMPRESS | RAM_SAVE_FLAG_PAGE |
+                     RAM_SAVE_FLAG_COMPRESS_PAGE | RAM_SAVE_FLAG_XBZRLE)) {
+            host = host_from_stream_offset(f, addr, flags);
+            if (!host) {
+                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
+                ret = -EINVAL;
+                break;
+            }
+        }
+
         switch (flags & ~RAM_SAVE_FLAG_CONTINUE) {
         case RAM_SAVE_FLAG_MEM_SIZE:
             /* Synchronize RAM block list */
@@ -1582,23 +2443,20 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                 id[len] = 0;
                 length = qemu_get_be64(f);
 
-                QLIST_FOREACH_RCU(block, &ram_list.blocks, next) {
-                    if (!strncmp(id, block->idstr, sizeof(id))) {
-                        if (length != block->used_length) {
-                            Error *local_err = NULL;
+                block = qemu_ram_block_by_name(id);
+                if (block) {
+                    if (length != block->used_length) {
+                        Error *local_err = NULL;
 
-                            ret = qemu_ram_resize(block->offset, length, &local_err);
-                            if (local_err) {
-                                error_report_err(local_err);
-                            }
+                        ret = qemu_ram_resize(block->offset, length,
+                                              &local_err);
+                        if (local_err) {
+                            error_report_err(local_err);
                         }
-                        ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
-                                              block->idstr);
-                        break;
                     }
-                }
-
-                if (!block) {
+                    ram_control_load_hook(f, RAM_CONTROL_BLOCK_REG,
+                                          block->idstr);
+                } else {
                     error_report("Unknown ramblock \"%s\", cannot "
                                  "accept migration", id);
                     ret = -EINVAL;
@@ -1607,33 +2465,17 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
                 total_ram_bytes -= length;
             }
             break;
+
         case RAM_SAVE_FLAG_COMPRESS:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
             ch = qemu_get_byte(f);
             ram_handle_compressed(host, ch, TARGET_PAGE_SIZE);
             break;
+
         case RAM_SAVE_FLAG_PAGE:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
             qemu_get_buffer(f, host, TARGET_PAGE_SIZE);
             break;
-        case RAM_SAVE_FLAG_COMPRESS_PAGE:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Invalid RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
 
+        case RAM_SAVE_FLAG_COMPRESS_PAGE:
             len = qemu_get_be32(f);
             if (len < 0 || len > compressBound(TARGET_PAGE_SIZE)) {
                 error_report("Invalid compressed data length: %d", len);
@@ -1643,13 +2485,8 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
             qemu_get_buffer(f, compressed_data_buf, len);
             decompress_data_with_multi_threads(compressed_data_buf, host, len);
             break;
+
         case RAM_SAVE_FLAG_XBZRLE:
-            host = host_from_stream_offset(f, addr, flags);
-            if (!host) {
-                error_report("Illegal RAM offset " RAM_ADDR_FMT, addr);
-                ret = -EINVAL;
-                break;
-            }
             if (load_xbzrle(f, addr, host) < 0) {
                 error_report("Failed to decompress XBZRLE page at "
                              RAM_ADDR_FMT, addr);
@@ -1683,10 +2520,11 @@ static int ram_load(QEMUFile *f, void *opaque, int version_id)
 static SaveVMHandlers savevm_ram_handlers = {
     .save_live_setup = ram_save_setup,
     .save_live_iterate = ram_save_iterate,
-    .save_live_complete = ram_save_complete,
+    .save_live_complete_postcopy = ram_save_complete,
+    .save_live_complete_precopy = ram_save_complete,
     .save_live_pending = ram_save_pending,
     .load_state = ram_load,
-    .cancel = ram_migration_cancel,
+    .cleanup = ram_migration_cleanup,
 };
 
 void ram_mig_init(void)
diff --git a/migration/rdma.c b/migration/rdma.c
index 553fbd7..dcabb91 100644
--- a/migration/rdma.c
+++ b/migration/rdma.c
@@ -577,7 +577,7 @@ static int rdma_add_block(RDMAContext *rdma, const char *block_name,
     block->is_ram_block = local->init ? false : true;
 
     if (rdma->blockmap) {
-        g_hash_table_insert(rdma->blockmap, (void *) block_offset, block);
+        g_hash_table_insert(rdma->blockmap, (void *)(uintptr_t)block_offset, block);
     }
 
     trace_rdma_add_block(block_name, local->nb_blocks,
diff --git a/migration/savevm.c b/migration/savevm.c
index dbcc39a..d90e228 100644
--- a/migration/savevm.c
+++ b/migration/savevm.c
@@ -37,6 +37,7 @@
 #include "qemu/timer.h"
 #include "audio/audio.h"
 #include "migration/migration.h"
+#include "migration/postcopy-ram.h"
 #include "qapi/qmp/qerror.h"
 #include "qemu/error-report.h"
 #include "qemu/sockets.h"
@@ -45,6 +46,7 @@
 #include "exec/memory.h"
 #include "qmp-commands.h"
 #include "trace.h"
+#include "qemu/bitops.h"
 #include "qemu/iov.h"
 #include "block/snapshot.h"
 #include "block/qapi.h"
@@ -57,8 +59,26 @@
 #define ARP_PTYPE_IP 0x0800
 #define ARP_OP_REQUEST_REV 0x3
 
+const unsigned int postcopy_ram_discard_version = 0;
+
 static bool skip_section_footers;
 
+static struct mig_cmd_args {
+    ssize_t     len; /* -1 = variable */
+    const char *name;
+} mig_cmd_args[] = {
+    [MIG_CMD_INVALID]          = { .len = -1, .name = "INVALID" },
+    [MIG_CMD_OPEN_RETURN_PATH] = { .len =  0, .name = "OPEN_RETURN_PATH" },
+    [MIG_CMD_PING]             = { .len = sizeof(uint32_t), .name = "PING" },
+    [MIG_CMD_POSTCOPY_ADVISE]  = { .len = 16, .name = "POSTCOPY_ADVISE" },
+    [MIG_CMD_POSTCOPY_LISTEN]  = { .len =  0, .name = "POSTCOPY_LISTEN" },
+    [MIG_CMD_POSTCOPY_RUN]     = { .len =  0, .name = "POSTCOPY_RUN" },
+    [MIG_CMD_POSTCOPY_RAM_DISCARD] = {
+                                   .len = -1, .name = "POSTCOPY_RAM_DISCARD" },
+    [MIG_CMD_PACKAGED]         = { .len =  4, .name = "PACKAGED" },
+    [MIG_CMD_MAX]              = { .len = -1, .name = "MAX" },
+};
+
 static int announce_self_create(uint8_t *buf,
                                 uint8_t *mac_addr)
 {
@@ -694,6 +714,156 @@ static void save_section_footer(QEMUFile *f, SaveStateEntry *se)
     }
 }
 
+/**
+ * qemu_savevm_command_send: Send a 'QEMU_VM_COMMAND' type element with the
+ *                           command and associated data.
+ *
+ * @f: File to send command on
+ * @command: Command type to send
+ * @len: Length of associated data
+ * @data: Data associated with command.
+ */
+void qemu_savevm_command_send(QEMUFile *f,
+                              enum qemu_vm_cmd command,
+                              uint16_t len,
+                              uint8_t *data)
+{
+    trace_savevm_command_send(command, len);
+    qemu_put_byte(f, QEMU_VM_COMMAND);
+    qemu_put_be16(f, (uint16_t)command);
+    qemu_put_be16(f, len);
+    qemu_put_buffer(f, data, len);
+    qemu_fflush(f);
+}
+
+void qemu_savevm_send_ping(QEMUFile *f, uint32_t value)
+{
+    uint32_t buf;
+
+    trace_savevm_send_ping(value);
+    buf = cpu_to_be32(value);
+    qemu_savevm_command_send(f, MIG_CMD_PING, sizeof(value), (uint8_t *)&buf);
+}
+
+void qemu_savevm_send_open_return_path(QEMUFile *f)
+{
+    trace_savevm_send_open_return_path();
+    qemu_savevm_command_send(f, MIG_CMD_OPEN_RETURN_PATH, 0, NULL);
+}
+
+/* We have a buffer of data to send; we don't want that all to be loaded
+ * by the command itself, so the command contains just the length of the
+ * extra buffer that we then send straight after it.
+ * TODO: Must be a better way to organise that
+ *
+ * Returns:
+ *    0 on success
+ *    -ve on error
+ */
+int qemu_savevm_send_packaged(QEMUFile *f, const QEMUSizedBuffer *qsb)
+{
+    size_t cur_iov;
+    size_t len = qsb_get_length(qsb);
+    uint32_t tmp;
+
+    if (len > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("%s: Unreasonably large packaged state: %zu",
+                     __func__, len);
+        return -1;
+    }
+
+    tmp = cpu_to_be32(len);
+
+    trace_qemu_savevm_send_packaged();
+    qemu_savevm_command_send(f, MIG_CMD_PACKAGED, 4, (uint8_t *)&tmp);
+
+    /* all the data follows (concatinating the iov's) */
+    for (cur_iov = 0; cur_iov < qsb->n_iov; cur_iov++) {
+        /* The iov entries are partially filled */
+        size_t towrite = MIN(qsb->iov[cur_iov].iov_len, len);
+        len -= towrite;
+
+        if (!towrite) {
+            break;
+        }
+
+        qemu_put_buffer(f, qsb->iov[cur_iov].iov_base, towrite);
+    }
+
+    return 0;
+}
+
+/* Send prior to any postcopy transfer */
+void qemu_savevm_send_postcopy_advise(QEMUFile *f)
+{
+    uint64_t tmp[2];
+    tmp[0] = cpu_to_be64(getpagesize());
+    tmp[1] = cpu_to_be64(1ul << qemu_target_page_bits());
+
+    trace_qemu_savevm_send_postcopy_advise();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_ADVISE, 16, (uint8_t *)tmp);
+}
+
+/* Sent prior to starting the destination running in postcopy, discard pages
+ * that have already been sent but redirtied on the source.
+ * CMD_POSTCOPY_RAM_DISCARD consist of:
+ *      byte   version (0)
+ *      byte   Length of name field (not including 0)
+ *  n x byte   RAM block name
+ *      byte   0 terminator (just for safety)
+ *  n x        Byte ranges within the named RAMBlock
+ *      be64   Start of the range
+ *      be64   Length
+ *
+ *  name:  RAMBlock name that these entries are part of
+ *  len: Number of page entries
+ *  start_list: 'len' addresses
+ *  length_list: 'len' addresses
+ *
+ */
+void qemu_savevm_send_postcopy_ram_discard(QEMUFile *f, const char *name,
+                                           uint16_t len,
+                                           uint64_t *start_list,
+                                           uint64_t *length_list)
+{
+    uint8_t *buf;
+    uint16_t tmplen;
+    uint16_t t;
+    size_t name_len = strlen(name);
+
+    trace_qemu_savevm_send_postcopy_ram_discard(name, len);
+    assert(name_len < 256);
+    buf = g_malloc0(1 + 1 + name_len + 1 + (8 + 8) * len);
+    buf[0] = postcopy_ram_discard_version;
+    buf[1] = name_len;
+    memcpy(buf + 2, name, name_len);
+    tmplen = 2 + name_len;
+    buf[tmplen++] = '\0';
+
+    for (t = 0; t < len; t++) {
+        cpu_to_be64w((uint64_t *)(buf + tmplen), start_list[t]);
+        tmplen += 8;
+        cpu_to_be64w((uint64_t *)(buf + tmplen), length_list[t]);
+        tmplen += 8;
+    }
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RAM_DISCARD, tmplen, buf);
+    g_free(buf);
+}
+
+/* Get the destination into a state where it can receive postcopy data. */
+void qemu_savevm_send_postcopy_listen(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_listen();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_LISTEN, 0, NULL);
+}
+
+/* Kick the destination into running */
+void qemu_savevm_send_postcopy_run(QEMUFile *f)
+{
+    trace_savevm_send_postcopy_run();
+    qemu_savevm_command_send(f, MIG_CMD_POSTCOPY_RUN, 0, NULL);
+}
+
 bool qemu_savevm_state_blocked(Error **errp)
 {
     SaveStateEntry *se;
@@ -713,6 +883,12 @@ void qemu_savevm_state_header(QEMUFile *f)
     trace_savevm_state_header();
     qemu_put_be32(f, QEMU_VM_FILE_MAGIC);
     qemu_put_be32(f, QEMU_VM_FILE_VERSION);
+
+    if (!savevm_state.skip_configuration) {
+        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
+        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
+    }
+
 }
 
 void qemu_savevm_state_begin(QEMUFile *f,
@@ -729,11 +905,6 @@ void qemu_savevm_state_begin(QEMUFile *f,
         se->ops->set_params(params, se->opaque);
     }
 
-    if (!savevm_state.skip_configuration) {
-        qemu_put_byte(f, QEMU_VM_CONFIGURATION);
-        vmstate_save_state(f, &vmstate_configuration, &savevm_state, 0);
-    }
-
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops || !se->ops->save_live_setup) {
             continue;
@@ -760,7 +931,7 @@ void qemu_savevm_state_begin(QEMUFile *f,
  *   0 : We haven't finished, caller have to go again
  *   1 : We have finished, we can go to complete phase
  */
-int qemu_savevm_state_iterate(QEMUFile *f)
+int qemu_savevm_state_iterate(QEMUFile *f, bool postcopy)
 {
     SaveStateEntry *se;
     int ret = 1;
@@ -775,6 +946,15 @@ int qemu_savevm_state_iterate(QEMUFile *f)
                 continue;
             }
         }
+        /*
+         * In the postcopy phase, any device that doesn't know how to
+         * do postcopy should have saved it's state in the _complete
+         * call that's already run, it might get confused if we call
+         * iterate afterwards.
+         */
+        if (postcopy && !se->ops->save_live_complete_postcopy) {
+            continue;
+        }
         if (qemu_file_rate_limit(f)) {
             return 0;
         }
@@ -803,24 +983,69 @@ int qemu_savevm_state_iterate(QEMUFile *f)
 static bool should_send_vmdesc(void)
 {
     MachineState *machine = MACHINE(qdev_get_machine());
-    return !machine->suppress_vmdesc;
+    bool in_postcopy = migration_in_postcopy(migrate_get_current());
+    return !machine->suppress_vmdesc && !in_postcopy;
 }
 
-void qemu_savevm_state_complete(QEMUFile *f)
+/*
+ * Calls the save_live_complete_postcopy methods
+ * causing the last few pages to be sent immediately and doing any associated
+ * cleanup.
+ * Note postcopy also calls qemu_savevm_state_complete_precopy to complete
+ * all the other devices, but that happens at the point we switch to postcopy.
+ */
+void qemu_savevm_state_complete_postcopy(QEMUFile *f)
+{
+    SaveStateEntry *se;
+    int ret;
+
+    QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
+        if (!se->ops || !se->ops->save_live_complete_postcopy) {
+            continue;
+        }
+        if (se->ops && se->ops->is_active) {
+            if (!se->ops->is_active(se->opaque)) {
+                continue;
+            }
+        }
+        trace_savevm_section_start(se->idstr, se->section_id);
+        /* Section type */
+        qemu_put_byte(f, QEMU_VM_SECTION_END);
+        qemu_put_be32(f, se->section_id);
+
+        ret = se->ops->save_live_complete_postcopy(f, se->opaque);
+        trace_savevm_section_end(se->idstr, se->section_id, ret);
+        save_section_footer(f, se);
+        if (ret < 0) {
+            qemu_file_set_error(f, ret);
+            return;
+        }
+    }
+
+    qemu_put_byte(f, QEMU_VM_EOF);
+    qemu_fflush(f);
+}
+
+void qemu_savevm_state_complete_precopy(QEMUFile *f, bool iterable_only)
 {
     QJSON *vmdesc;
     int vmdesc_len;
     SaveStateEntry *se;
     int ret;
+    bool in_postcopy = migration_in_postcopy(migrate_get_current());
 
-    trace_savevm_state_complete();
+    trace_savevm_state_complete_precopy();
 
     cpu_synchronize_all_states();
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
-        if (!se->ops || !se->ops->save_live_complete) {
+        if (!se->ops ||
+            (in_postcopy && se->ops->save_live_complete_postcopy) ||
+            (in_postcopy && !iterable_only) ||
+            !se->ops->save_live_complete_precopy) {
             continue;
         }
+
         if (se->ops && se->ops->is_active) {
             if (!se->ops->is_active(se->opaque)) {
                 continue;
@@ -830,7 +1055,7 @@ void qemu_savevm_state_complete(QEMUFile *f)
 
         save_section_header(f, se, QEMU_VM_SECTION_END);
 
-        ret = se->ops->save_live_complete(f, se->opaque);
+        ret = se->ops->save_live_complete_precopy(f, se->opaque);
         trace_savevm_section_end(se->idstr, se->section_id, ret);
         save_section_footer(f, se);
         if (ret < 0) {
@@ -839,6 +1064,10 @@ void qemu_savevm_state_complete(QEMUFile *f)
         }
     }
 
+    if (iterable_only) {
+        return;
+    }
+
     vmdesc = qjson_new();
     json_prop_int(vmdesc, "page_size", TARGET_PAGE_SIZE);
     json_start_array(vmdesc, "devices");
@@ -867,7 +1096,10 @@ void qemu_savevm_state_complete(QEMUFile *f)
         save_section_footer(f, se);
     }
 
-    qemu_put_byte(f, QEMU_VM_EOF);
+    if (!in_postcopy) {
+        /* Postcopy stream will still be going */
+        qemu_put_byte(f, QEMU_VM_EOF);
+    }
 
     json_end_array(vmdesc);
     qjson_finish(vmdesc);
@@ -883,10 +1115,19 @@ void qemu_savevm_state_complete(QEMUFile *f)
     qemu_fflush(f);
 }
 
-uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
+/* Give an estimate of the amount left to be transferred,
+ * the result is split into the amount for units that can and
+ * for units that can't do postcopy.
+ */
+void qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size,
+                               uint64_t *res_non_postcopiable,
+                               uint64_t *res_postcopiable)
 {
     SaveStateEntry *se;
-    uint64_t ret = 0;
+
+    *res_non_postcopiable = 0;
+    *res_postcopiable = 0;
+
 
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
         if (!se->ops || !se->ops->save_live_pending) {
@@ -897,19 +1138,19 @@ uint64_t qemu_savevm_state_pending(QEMUFile *f, uint64_t max_size)
                 continue;
             }
         }
-        ret += se->ops->save_live_pending(f, se->opaque, max_size);
+        se->ops->save_live_pending(f, se->opaque, max_size,
+                                   res_non_postcopiable, res_postcopiable);
     }
-    return ret;
 }
 
-void qemu_savevm_state_cancel(void)
+void qemu_savevm_state_cleanup(void)
 {
     SaveStateEntry *se;
 
-    trace_savevm_state_cancel();
+    trace_savevm_state_cleanup();
     QTAILQ_FOREACH(se, &savevm_state.handlers, entry) {
-        if (se->ops && se->ops->cancel) {
-            se->ops->cancel(se->opaque);
+        if (se->ops && se->ops->cleanup) {
+            se->ops->cleanup(se->opaque);
         }
     }
 }
@@ -921,6 +1162,8 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
         .blk = 0,
         .shared = 0
     };
+    MigrationState *ms = migrate_init(&params);
+    ms->file = f;
 
     if (qemu_savevm_state_blocked(errp)) {
         return -EINVAL;
@@ -932,18 +1175,18 @@ static int qemu_savevm_state(QEMUFile *f, Error **errp)
     qemu_mutex_lock_iothread();
 
     while (qemu_file_get_error(f) == 0) {
-        if (qemu_savevm_state_iterate(f) > 0) {
+        if (qemu_savevm_state_iterate(f, false) > 0) {
             break;
         }
     }
 
     ret = qemu_file_get_error(f);
     if (ret == 0) {
-        qemu_savevm_state_complete(f);
+        qemu_savevm_state_complete_precopy(f, false);
         ret = qemu_file_get_error(f);
     }
+    qemu_savevm_state_cleanup();
     if (ret != 0) {
-        qemu_savevm_state_cancel();
         error_setg_errno(errp, -ret, "Error while writing VM state");
     }
     return ret;
@@ -1001,6 +1244,420 @@ static SaveStateEntry *find_se(const char *idstr, int instance_id)
     return NULL;
 }
 
+enum LoadVMExitCodes {
+    /* Allow a command to quit all layers of nested loadvm loops */
+    LOADVM_QUIT     =  1,
+};
+
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis);
+
+/* ------ incoming postcopy messages ------ */
+/* 'advise' arrives before any transfers just to tell us that a postcopy
+ * *might* happen - it might be skipped if precopy transferred everything
+ * quickly.
+ */
+static int loadvm_postcopy_handle_advise(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+    uint64_t remote_hps, remote_tps;
+
+    trace_loadvm_postcopy_handle_advise();
+    if (ps != POSTCOPY_INCOMING_NONE) {
+        error_report("CMD_POSTCOPY_ADVISE in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    if (!postcopy_ram_supported_by_host()) {
+        return -1;
+    }
+
+    remote_hps = qemu_get_be64(mis->from_src_file);
+    if (remote_hps != getpagesize())  {
+        /*
+         * Some combinations of mismatch are probably possible but it gets
+         * a bit more complicated.  In particular we need to place whole
+         * host pages on the dest at once, and we need to ensure that we
+         * handle dirtying to make sure we never end up sending part of
+         * a hostpage on it's own.
+         */
+        error_report("Postcopy needs matching host page sizes (s=%d d=%d)",
+                     (int)remote_hps, getpagesize());
+        return -1;
+    }
+
+    remote_tps = qemu_get_be64(mis->from_src_file);
+    if (remote_tps != (1ul << qemu_target_page_bits())) {
+        /*
+         * Again, some differences could be dealt with, but for now keep it
+         * simple.
+         */
+        error_report("Postcopy needs matching target page sizes (s=%d d=%d)",
+                     (int)remote_tps, 1 << qemu_target_page_bits());
+        return -1;
+    }
+
+    if (ram_postcopy_incoming_init(mis)) {
+        return -1;
+    }
+
+    postcopy_state_set(POSTCOPY_INCOMING_ADVISE);
+
+    return 0;
+}
+
+/* After postcopy we will be told to throw some pages away since they're
+ * dirty and will have to be demand fetched.  Must happen before CPU is
+ * started.
+ * There can be 0..many of these messages, each encoding multiple pages.
+ */
+static int loadvm_postcopy_ram_handle_discard(MigrationIncomingState *mis,
+                                              uint16_t len)
+{
+    int tmp;
+    char ramid[256];
+    PostcopyState ps = postcopy_state_get();
+
+    trace_loadvm_postcopy_ram_handle_discard();
+
+    switch (ps) {
+    case POSTCOPY_INCOMING_ADVISE:
+        /* 1st discard */
+        tmp = postcopy_ram_prepare_discard(mis);
+        if (tmp) {
+            return tmp;
+        }
+        break;
+
+    case POSTCOPY_INCOMING_DISCARD:
+        /* Expected state */
+        break;
+
+    default:
+        error_report("CMD_POSTCOPY_RAM_DISCARD in wrong postcopy state (%d)",
+                     ps);
+        return -1;
+    }
+    /* We're expecting a
+     *    Version (0)
+     *    a RAM ID string (length byte, name, 0 term)
+     *    then at least 1 16 byte chunk
+    */
+    if (len < (1 + 1 + 1 + 1 + 2 * 8)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+
+    tmp = qemu_get_byte(mis->from_src_file);
+    if (tmp != postcopy_ram_discard_version) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid version (%d)", tmp);
+        return -1;
+    }
+
+    if (!qemu_get_counted_string(mis->from_src_file, ramid)) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD Failed to read RAMBlock ID");
+        return -1;
+    }
+    tmp = qemu_get_byte(mis->from_src_file);
+    if (tmp != 0) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD missing nil (%d)", tmp);
+        return -1;
+    }
+
+    len -= 3 + strlen(ramid);
+    if (len % 16) {
+        error_report("CMD_POSTCOPY_RAM_DISCARD invalid length (%d)", len);
+        return -1;
+    }
+    trace_loadvm_postcopy_ram_handle_discard_header(ramid, len);
+    while (len) {
+        uint64_t start_addr, block_length;
+        start_addr = qemu_get_be64(mis->from_src_file);
+        block_length = qemu_get_be64(mis->from_src_file);
+
+        len -= 16;
+        int ret = ram_discard_range(mis, ramid, start_addr,
+                                    block_length);
+        if (ret) {
+            return ret;
+        }
+    }
+    trace_loadvm_postcopy_ram_handle_discard_end();
+
+    return 0;
+}
+
+/*
+ * Triggered by a postcopy_listen command; this thread takes over reading
+ * the input stream, leaving the main thread free to carry on loading the rest
+ * of the device state (from RAM).
+ * (TODO:This could do with being in a postcopy file - but there again it's
+ * just another input loop, not that postcopy specific)
+ */
+static void *postcopy_ram_listen_thread(void *opaque)
+{
+    QEMUFile *f = opaque;
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    int load_res;
+
+    qemu_sem_post(&mis->listen_thread_sem);
+    trace_postcopy_ram_listen_thread_start();
+
+    /*
+     * Because we're a thread and not a coroutine we can't yield
+     * in qemu_file, and thus we must be blocking now.
+     */
+    qemu_file_set_blocking(f, true);
+    load_res = qemu_loadvm_state_main(f, mis);
+    /* And non-blocking again so we don't block in any cleanup */
+    qemu_file_set_blocking(f, false);
+
+    trace_postcopy_ram_listen_thread_exit();
+    if (load_res < 0) {
+        error_report("%s: loadvm failed: %d", __func__, load_res);
+        qemu_file_set_error(f, load_res);
+    } else {
+        /*
+         * This looks good, but it's possible that the device loading in the
+         * main thread hasn't finished yet, and so we might not be in 'RUN'
+         * state yet; wait for the end of the main thread.
+         */
+        qemu_event_wait(&mis->main_thread_load_event);
+    }
+    postcopy_ram_incoming_cleanup(mis);
+    /*
+     * If everything has worked fine, then the main thread has waited
+     * for us to start, and we're the last use of the mis.
+     * (If something broke then qemu will have to exit anyway since it's
+     * got a bad migration state).
+     */
+    migration_incoming_state_destroy();
+
+    if (load_res < 0) {
+        /*
+         * If something went wrong then we have a bad state so exit;
+         * depending how far we got it might be possible at this point
+         * to leave the guest running and fire MCEs for pages that never
+         * arrived as a desperate recovery step.
+         */
+        exit(EXIT_FAILURE);
+    }
+
+    return NULL;
+}
+
+/* After this message we must be able to immediately receive postcopy data */
+static int loadvm_postcopy_handle_listen(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_LISTENING);
+    trace_loadvm_postcopy_handle_listen();
+    if (ps != POSTCOPY_INCOMING_ADVISE && ps != POSTCOPY_INCOMING_DISCARD) {
+        error_report("CMD_POSTCOPY_LISTEN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+    if (ps == POSTCOPY_INCOMING_ADVISE) {
+        /*
+         * A rare case, we entered listen without having to do any discards,
+         * so do the setup that's normally done at the time of the 1st discard.
+         */
+        postcopy_ram_prepare_discard(mis);
+    }
+
+    /*
+     * Sensitise RAM - can now generate requests for blocks that don't exist
+     * However, at this point the CPU shouldn't be running, and the IO
+     * shouldn't be doing anything yet so don't actually expect requests
+     */
+    if (postcopy_ram_enable_notify(mis)) {
+        return -1;
+    }
+
+    if (mis->have_listen_thread) {
+        error_report("CMD_POSTCOPY_RAM_LISTEN already has a listen thread");
+        return -1;
+    }
+
+    mis->have_listen_thread = true;
+    /* Start up the listening thread and wait for it to signal ready */
+    qemu_sem_init(&mis->listen_thread_sem, 0);
+    qemu_thread_create(&mis->listen_thread, "postcopy/listen",
+                       postcopy_ram_listen_thread, mis->from_src_file,
+                       QEMU_THREAD_JOINABLE);
+    qemu_sem_wait(&mis->listen_thread_sem);
+    qemu_sem_destroy(&mis->listen_thread_sem);
+
+    return 0;
+}
+
+/* After all discards we can start running and asking for pages */
+static int loadvm_postcopy_handle_run(MigrationIncomingState *mis)
+{
+    PostcopyState ps = postcopy_state_set(POSTCOPY_INCOMING_RUNNING);
+    Error *local_err = NULL;
+
+    trace_loadvm_postcopy_handle_run();
+    if (ps != POSTCOPY_INCOMING_LISTENING) {
+        error_report("CMD_POSTCOPY_RUN in wrong postcopy state (%d)", ps);
+        return -1;
+    }
+
+    /* TODO we should move all of this lot into postcopy_ram.c or a shared code
+     * in migration.c
+     */
+    cpu_synchronize_all_post_init();
+
+    qemu_announce_self();
+
+    /* Make sure all file formats flush their mutable metadata */
+    bdrv_invalidate_cache_all(&local_err);
+    if (local_err) {
+        error_report_err(local_err);
+        return -1;
+    }
+
+    trace_loadvm_postcopy_handle_run_cpu_sync();
+    cpu_synchronize_all_post_init();
+
+    trace_loadvm_postcopy_handle_run_vmstart();
+
+    if (autostart) {
+        /* Hold onto your hats, starting the CPU */
+        vm_start();
+    } else {
+        /* leave it paused and let management decide when to start the CPU */
+        runstate_set(RUN_STATE_PAUSED);
+    }
+
+    /* We need to finish reading the stream from the package
+     * and also stop reading anything more from the stream that loaded the
+     * package (since it's now being read by the listener thread).
+     * LOADVM_QUIT will quit all the layers of nested loadvm loops.
+     */
+    return LOADVM_QUIT;
+}
+
+/**
+ * Immediately following this command is a blob of data containing an embedded
+ * chunk of migration stream; read it and load it.
+ *
+ * @mis: Incoming state
+ * @length: Length of packaged data to read
+ *
+ * Returns: Negative values on error
+ *
+ */
+static int loadvm_handle_cmd_packaged(MigrationIncomingState *mis)
+{
+    int ret;
+    uint8_t *buffer;
+    uint32_t length;
+    QEMUSizedBuffer *qsb;
+
+    length = qemu_get_be32(mis->from_src_file);
+    trace_loadvm_handle_cmd_packaged(length);
+
+    if (length > MAX_VM_CMD_PACKAGED_SIZE) {
+        error_report("Unreasonably large packaged state: %u", length);
+        return -1;
+    }
+    buffer = g_malloc0(length);
+    ret = qemu_get_buffer(mis->from_src_file, buffer, (int)length);
+    if (ret != length) {
+        g_free(buffer);
+        error_report("CMD_PACKAGED: Buffer receive fail ret=%d length=%d\n",
+                ret, length);
+        return (ret < 0) ? ret : -EAGAIN;
+    }
+    trace_loadvm_handle_cmd_packaged_received(ret);
+
+    /* Setup a dummy QEMUFile that actually reads from the buffer */
+    qsb = qsb_create(buffer, length);
+    g_free(buffer); /* Because qsb_create copies */
+    if (!qsb) {
+        error_report("Unable to create qsb");
+    }
+    QEMUFile *packf = qemu_bufopen("r", qsb);
+
+    ret = qemu_loadvm_state_main(packf, mis);
+    trace_loadvm_handle_cmd_packaged_main(ret);
+    qemu_fclose(packf);
+    qsb_free(qsb);
+
+    return ret;
+}
+
+/*
+ * Process an incoming 'QEMU_VM_COMMAND'
+ * 0           just a normal return
+ * LOADVM_QUIT All good, but exit the loop
+ * <0          Error
+ */
+static int loadvm_process_command(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    uint16_t cmd;
+    uint16_t len;
+    uint32_t tmp32;
+
+    cmd = qemu_get_be16(f);
+    len = qemu_get_be16(f);
+
+    trace_loadvm_process_command(cmd, len);
+    if (cmd >= MIG_CMD_MAX || cmd == MIG_CMD_INVALID) {
+        error_report("MIG_CMD 0x%x unknown (len 0x%x)", cmd, len);
+        return -EINVAL;
+    }
+
+    if (mig_cmd_args[cmd].len != -1 && mig_cmd_args[cmd].len != len) {
+        error_report("%s received with bad length - expecting %zu, got %d",
+                     mig_cmd_args[cmd].name,
+                     (size_t)mig_cmd_args[cmd].len, len);
+        return -ERANGE;
+    }
+
+    switch (cmd) {
+    case MIG_CMD_OPEN_RETURN_PATH:
+        if (mis->to_src_file) {
+            error_report("CMD_OPEN_RETURN_PATH called when RP already open");
+            /* Not really a problem, so don't give up */
+            return 0;
+        }
+        mis->to_src_file = qemu_file_get_return_path(f);
+        if (!mis->to_src_file) {
+            error_report("CMD_OPEN_RETURN_PATH failed");
+            return -1;
+        }
+        break;
+
+    case MIG_CMD_PING:
+        tmp32 = qemu_get_be32(f);
+        trace_loadvm_process_command_ping(tmp32);
+        if (!mis->to_src_file) {
+            error_report("CMD_PING (0x%x) received with no return path",
+                         tmp32);
+            return -1;
+        }
+        migrate_send_rp_pong(mis, tmp32);
+        break;
+
+    case MIG_CMD_PACKAGED:
+        return loadvm_handle_cmd_packaged(mis);
+
+    case MIG_CMD_POSTCOPY_ADVISE:
+        return loadvm_postcopy_handle_advise(mis);
+
+    case MIG_CMD_POSTCOPY_LISTEN:
+        return loadvm_postcopy_handle_listen(mis);
+
+    case MIG_CMD_POSTCOPY_RUN:
+        return loadvm_postcopy_handle_run(mis);
+
+    case MIG_CMD_POSTCOPY_RAM_DISCARD:
+        return loadvm_postcopy_ram_handle_discard(mis, len);
+    }
+
+    return 0;
+}
+
 struct LoadStateEntry {
     QLIST_ENTRY(LoadStateEntry) entry;
     SaveStateEntry *se;
@@ -1053,47 +1710,10 @@ void loadvm_free_handlers(MigrationIncomingState *mis)
     }
 }
 
-int qemu_loadvm_state(QEMUFile *f)
+static int qemu_loadvm_state_main(QEMUFile *f, MigrationIncomingState *mis)
 {
-    MigrationIncomingState *mis = migration_incoming_get_current();
-    Error *local_err = NULL;
     uint8_t section_type;
-    unsigned int v;
     int ret;
-    int file_error_after_eof = -1;
-
-    if (qemu_savevm_state_blocked(&local_err)) {
-        error_report_err(local_err);
-        return -EINVAL;
-    }
-
-    v = qemu_get_be32(f);
-    if (v != QEMU_VM_FILE_MAGIC) {
-        error_report("Not a migration stream");
-        return -EINVAL;
-    }
-
-    v = qemu_get_be32(f);
-    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
-        error_report("SaveVM v2 format is obsolete and don't work anymore");
-        return -ENOTSUP;
-    }
-    if (v != QEMU_VM_FILE_VERSION) {
-        error_report("Unsupported migration stream version");
-        return -ENOTSUP;
-    }
-
-    if (!savevm_state.skip_configuration) {
-        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
-            error_report("Configuration section missing");
-            return -EINVAL;
-        }
-        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
-
-        if (ret) {
-            return ret;
-        }
-    }
 
     while ((section_type = qemu_get_byte(f)) != QEMU_VM_EOF) {
         uint32_t instance_id, version_id, section_id;
@@ -1122,16 +1742,14 @@ int qemu_loadvm_state(QEMUFile *f)
             if (se == NULL) {
                 error_report("Unknown savevm section or instance '%s' %d",
                              idstr, instance_id);
-                ret = -EINVAL;
-                goto out;
+                return -EINVAL;
             }
 
             /* Validate version */
             if (version_id > se->version_id) {
                 error_report("savevm: unsupported version %d for '%s' v%d",
                              version_id, idstr, se->version_id);
-                ret = -EINVAL;
-                goto out;
+                return -EINVAL;
             }
 
             /* Add entry */
@@ -1146,11 +1764,10 @@ int qemu_loadvm_state(QEMUFile *f)
             if (ret < 0) {
                 error_report("error while loading state for instance 0x%x of"
                              " device '%s'", instance_id, idstr);
-                goto out;
+                return ret;
             }
             if (!check_section_footer(f, le)) {
-                ret = -EINVAL;
-                goto out;
+                return -EINVAL;
             }
             break;
         case QEMU_VM_SECTION_PART:
@@ -1165,29 +1782,88 @@ int qemu_loadvm_state(QEMUFile *f)
             }
             if (le == NULL) {
                 error_report("Unknown savevm section %d", section_id);
-                ret = -EINVAL;
-                goto out;
+                return -EINVAL;
             }
 
             ret = vmstate_load(f, le->se, le->version_id);
             if (ret < 0) {
                 error_report("error while loading state section id %d(%s)",
                              section_id, le->se->idstr);
-                goto out;
+                return ret;
             }
             if (!check_section_footer(f, le)) {
-                ret = -EINVAL;
-                goto out;
+                return -EINVAL;
+            }
+            break;
+        case QEMU_VM_COMMAND:
+            ret = loadvm_process_command(f);
+            trace_qemu_loadvm_state_section_command(ret);
+            if ((ret < 0) || (ret & LOADVM_QUIT)) {
+                return ret;
             }
             break;
         default:
             error_report("Unknown savevm section type %d", section_type);
-            ret = -EINVAL;
-            goto out;
+            return -EINVAL;
+        }
+    }
+
+    return 0;
+}
+
+int qemu_loadvm_state(QEMUFile *f)
+{
+    MigrationIncomingState *mis = migration_incoming_get_current();
+    Error *local_err = NULL;
+    unsigned int v;
+    int ret;
+
+    if (qemu_savevm_state_blocked(&local_err)) {
+        error_report_err(local_err);
+        return -EINVAL;
+    }
+
+    v = qemu_get_be32(f);
+    if (v != QEMU_VM_FILE_MAGIC) {
+        error_report("Not a migration stream");
+        return -EINVAL;
+    }
+
+    v = qemu_get_be32(f);
+    if (v == QEMU_VM_FILE_VERSION_COMPAT) {
+        error_report("SaveVM v2 format is obsolete and don't work anymore");
+        return -ENOTSUP;
+    }
+    if (v != QEMU_VM_FILE_VERSION) {
+        error_report("Unsupported migration stream version");
+        return -ENOTSUP;
+    }
+
+    if (!savevm_state.skip_configuration) {
+        if (qemu_get_byte(f) != QEMU_VM_CONFIGURATION) {
+            error_report("Configuration section missing");
+            return -EINVAL;
+        }
+        ret = vmstate_load_state(f, &vmstate_configuration, &savevm_state, 0);
+
+        if (ret) {
+            return ret;
         }
     }
 
-    file_error_after_eof = qemu_file_get_error(f);
+    ret = qemu_loadvm_state_main(f, mis);
+    qemu_event_set(&mis->main_thread_load_event);
+
+    trace_qemu_loadvm_state_post_main(ret);
+
+    if (mis->have_listen_thread) {
+        /* Listen thread still going, can't clean up yet */
+        return ret;
+    }
+
+    if (ret == 0) {
+        ret = qemu_file_get_error(f);
+    }
 
     /*
      * Try to read in the VMDESC section as well, so that dumping tools that
@@ -1199,10 +1875,10 @@ int qemu_loadvm_state(QEMUFile *f)
      * We also mustn't read data that isn't there; some transports (RDMA)
      * will stall waiting for that data when the source has already closed.
      */
-    if (should_send_vmdesc()) {
+    if (ret == 0 && should_send_vmdesc()) {
         uint8_t *buf;
         uint32_t size;
-        section_type = qemu_get_byte(f);
+        uint8_t  section_type = qemu_get_byte(f);
 
         if (section_type != QEMU_VM_VMDESCRIPTION) {
             error_report("Expected vmdescription section, but got %d",
@@ -1226,14 +1902,6 @@ int qemu_loadvm_state(QEMUFile *f)
 
     cpu_synchronize_all_post_init();
 
-    ret = 0;
-
-out:
-    if (ret == 0) {
-        /* We may not have a VMDESC section, so ignore relative errors */
-        ret = file_error_after_eof;
-    }
-
     return ret;
 }
 
diff --git a/monitor.c b/monitor.c
index 6cd747f..e4cf34e 100644
--- a/monitor.c
+++ b/monitor.c
@@ -2136,6 +2136,8 @@ static int get_monitor_def(target_long *pval, const char *name)
 {
     const MonitorDef *md = target_monitor_defs();
     void *ptr;
+    uint64_t tmp = 0;
+    int ret;
 
     if (md == NULL) {
         return -1;
@@ -2163,7 +2165,13 @@ static int get_monitor_def(target_long *pval, const char *name)
             return 0;
         }
     }
-    return -1;
+
+    ret = target_get_monitor_def(mon_get_cpu(), name, &tmp);
+    if (!ret) {
+        *pval = (target_long) tmp;
+    }
+
+    return ret;
 }
 
 static void next(void)
@@ -3408,13 +3416,18 @@ static void vm_completion(ReadLineState *rs, const char *str)
     readline_set_completion_index(rs, len);
     while ((bs = bdrv_next(bs))) {
         SnapshotInfoList *snapshots, *snapshot;
+        AioContext *ctx = bdrv_get_aio_context(bs);
+        bool ok = false;
 
-        if (!bdrv_can_snapshot(bs)) {
-            continue;
+        aio_context_acquire(ctx);
+        if (bdrv_can_snapshot(bs)) {
+            ok = bdrv_query_snapshot_info_list(bs, &snapshots, NULL) == 0;
         }
-        if (bdrv_query_snapshot_info_list(bs, &snapshots, NULL)) {
+        aio_context_release(ctx);
+        if (!ok) {
             continue;
         }
+
         snapshot = snapshots;
         while (snapshot) {
             char *completion = snapshot->value->name;
diff --git a/net/netmap.c b/net/netmap.c
index 508b829..5558368 100644
--- a/net/netmap.c
+++ b/net/netmap.c
@@ -90,7 +90,7 @@ pkt_copy(const void *_src, void *_dst, int l)
  * Open a netmap device. We assume there is only one queue
  * (which is the case for the VALE bridge).
  */
-static int netmap_open(NetmapPriv *me)
+static void netmap_open(NetmapPriv *me, Error **errp)
 {
     int fd;
     int err;
@@ -99,9 +99,8 @@ static int netmap_open(NetmapPriv *me)
 
     me->fd = fd = open(me->fdname, O_RDWR);
     if (fd < 0) {
-        error_report("Unable to open netmap device '%s' (%s)",
-                        me->fdname, strerror(errno));
-        return -1;
+        error_setg_file_open(errp, errno, me->fdname);
+        return;
     }
     memset(&req, 0, sizeof(req));
     pstrcpy(req.nr_name, sizeof(req.nr_name), me->ifname);
@@ -109,15 +108,14 @@ static int netmap_open(NetmapPriv *me)
     req.nr_version = NETMAP_API;
     err = ioctl(fd, NIOCREGIF, &req);
     if (err) {
-        error_report("Unable to register %s: %s", me->ifname, strerror(errno));
+        error_setg_errno(errp, errno, "Unable to register %s", me->ifname);
         goto error;
     }
     l = me->memsize = req.nr_memsize;
 
     me->mem = mmap(0, l, PROT_WRITE | PROT_READ, MAP_SHARED, fd, 0);
     if (me->mem == MAP_FAILED) {
-        error_report("Unable to mmap netmap shared memory: %s",
-                        strerror(errno));
+        error_setg_errno(errp, errno, "Unable to mmap netmap shared memory");
         me->mem = NULL;
         goto error;
     }
@@ -125,11 +123,11 @@ static int netmap_open(NetmapPriv *me)
     me->nifp = NETMAP_IF(me->mem, req.nr_offset);
     me->tx = NETMAP_TXRING(me->nifp, 0);
     me->rx = NETMAP_RXRING(me->nifp, 0);
-    return 0;
+
+    return;
 
 error:
     close(me->fd);
-    return -1;
 }
 
 static void netmap_send(void *opaque);
@@ -438,9 +436,9 @@ static NetClientInfo net_netmap_info = {
 int net_init_netmap(const NetClientOptions *opts,
                     const char *name, NetClientState *peer, Error **errp)
 {
-    /* FIXME error_setg(errp, ...) on failure */
-    const NetdevNetmapOptions *netmap_opts = opts->netmap;
+    const NetdevNetmapOptions *netmap_opts = opts->u.netmap;
     NetClientState *nc;
+    Error *err = NULL;
     NetmapPriv me;
     NetmapState *s;
 
@@ -448,7 +446,9 @@ int net_init_netmap(const NetClientOptions *opts,
         netmap_opts->has_devname ? netmap_opts->devname : "/dev/netmap");
     /* Set default name for the port if not supplied. */
     pstrcpy(me.ifname, sizeof(me.ifname), netmap_opts->ifname);
-    if (netmap_open(&me)) {
+    netmap_open(&me, &err);
+    if (err) {
+        error_propagate(errp, err);
         return -1;
     }
     /* Create the object. */
diff --git a/net/tap-bsd.c b/net/tap-bsd.c
index 7028d9b..0103a97 100644
--- a/net/tap-bsd.c
+++ b/net/tap-bsd.c
@@ -109,8 +109,7 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
 
 #define PATH_NET_TAP "/dev/tap"
 
-int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
-             int vnet_hdr_required, int mq_required, Error **errp)
+static int tap_open_clone(char *ifname, int ifname_size, Error **errp)
 {
     int fd, s, ret;
     struct ifreq ifr;
@@ -126,7 +125,8 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
     ret = ioctl(fd, TAPGIFNAME, (void *)&ifr);
     if (ret < 0) {
         error_setg_errno(errp, errno, "could not get tap interface name");
-        goto error;
+        close(fd);
+        return -1;
     }
 
     if (ifname[0] != '\0') {
@@ -135,19 +135,47 @@ int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
         if (s < 0) {
             error_setg_errno(errp, errno,
                              "could not open socket to set interface name");
-            goto error;
+            close(fd);
+            return -1;
         }
         ifr.ifr_data = ifname;
         ret = ioctl(s, SIOCSIFNAME, (void *)&ifr);
         close(s);
         if (ret < 0) {
             error_setg(errp, "could not set tap interface name");
-            goto error;
+            close(fd);
+            return -1;
         }
     } else {
         pstrcpy(ifname, ifname_size, ifr.ifr_name);
     }
 
+    return fd;
+}
+
+int tap_open(char *ifname, int ifname_size, int *vnet_hdr,
+             int vnet_hdr_required, int mq_required, Error **errp)
+{
+    int fd = -1;
+
+    /* If the specified tap device already exists just use it. */
+    if (ifname[0] != '\0') {
+        char dname[100];
+        snprintf(dname, sizeof dname, "/dev/%s", ifname);
+        TFR(fd = open(dname, O_RDWR));
+        if (fd < 0 && errno != ENOENT) {
+            error_setg_errno(errp, errno, "could not open %s", dname);
+            return -1;
+        }
+    }
+
+    if (fd < 0) {
+        /* Tap device not specified or does not exist. */
+        if ((fd = tap_open_clone(ifname, ifname_size, errp)) < 0) {
+            return -1;
+        }
+    }
+
     if (*vnet_hdr) {
         /* BSD doesn't have IFF_VNET_HDR */
         *vnet_hdr = 0;
diff --git a/pc-bios/README b/pc-bios/README
index e4154ab..d260c1b 100644
--- a/pc-bios/README
+++ b/pc-bios/README
@@ -17,7 +17,7 @@
 - SLOF (Slimline Open Firmware) is a free IEEE 1275 Open Firmware
   implementation for certain IBM POWER hardware.  The sources are at
   https://github.com/aik/SLOF, and the image currently in qemu is
-  built from git tag qemu-slof-20150813.
+  built from git tag qemu-slof-20151103.
 
 - sgabios (the Serial Graphics Adapter option ROM) provides a means for
   legacy x86 software to communicate with an attached serial console as
diff --git a/pc-bios/s390-ccw.img b/pc-bios/s390-ccw.img
index 0c540a1..e0d9452 100644
--- a/pc-bios/s390-ccw.img
+++ b/pc-bios/s390-ccw.img
diff --git a/pc-bios/s390-ccw/bootmap.c b/pc-bios/s390-ccw/bootmap.c
index b678d5e..415508b 100644
--- a/pc-bios/s390-ccw/bootmap.c
+++ b/pc-bios/s390-ccw/bootmap.c
@@ -445,6 +445,206 @@ static void ipl_scsi(void)
 }
 
 /***********************************************************************
+ * IPL El Torito ISO9660 image or DVD
+ */
+
+static bool is_iso_bc_entry_compatible(IsoBcSection *s)
+{
+    uint8_t *magic_sec = (uint8_t *)(sec + ISO_SECTOR_SIZE);
+
+    if (s->unused || !s->sector_count) {
+        return false;
+    }
+    read_iso_sector(bswap32(s->load_rba), magic_sec,
+                    "Failed to read image sector 0");
+
+    /* Checking bytes 8 - 32 for S390 Linux magic */
+    return !_memcmp(magic_sec + 8, linux_s390_magic, 24);
+}
+
+/* Location of the current sector of the directory */
+static uint32_t sec_loc[ISO9660_MAX_DIR_DEPTH];
+/* Offset in the current sector of the directory */
+static uint32_t sec_offset[ISO9660_MAX_DIR_DEPTH];
+/* Remained directory space in bytes */
+static uint32_t dir_rem[ISO9660_MAX_DIR_DEPTH];
+
+static inline uint32_t iso_get_file_size(uint32_t load_rba)
+{
+    IsoVolDesc *vd = (IsoVolDesc *)sec;
+    IsoDirHdr *cur_record = &vd->vd.primary.rootdir;
+    uint8_t *temp = sec + ISO_SECTOR_SIZE;
+    int level = 0;
+
+    read_iso_sector(ISO_PRIMARY_VD_SECTOR, sec,
+                    "Failed to read ISO primary descriptor");
+    sec_loc[0] = iso_733_to_u32(cur_record->ext_loc);
+    dir_rem[0] = 0;
+    sec_offset[0] = 0;
+
+    while (level >= 0) {
+        IPL_assert(sec_offset[level] <= ISO_SECTOR_SIZE,
+                   "Directory tree structure violation");
+
+        cur_record = (IsoDirHdr *)(temp + sec_offset[level]);
+
+        if (sec_offset[level] == 0) {
+            read_iso_sector(sec_loc[level], temp,
+                            "Failed to read ISO directory");
+            if (dir_rem[level] == 0) {
+                /* Skip self and parent records */
+                dir_rem[level] = iso_733_to_u32(cur_record->data_len) -
+                                 cur_record->dr_len;
+                sec_offset[level] += cur_record->dr_len;
+
+                cur_record = (IsoDirHdr *)(temp + sec_offset[level]);
+                dir_rem[level] -= cur_record->dr_len;
+                sec_offset[level] += cur_record->dr_len;
+                continue;
+            }
+        }
+
+        if (!cur_record->dr_len || sec_offset[level] == ISO_SECTOR_SIZE) {
+            /* Zero-padding and/or the end of current sector */
+            dir_rem[level] -= ISO_SECTOR_SIZE - sec_offset[level];
+            sec_offset[level] = 0;
+            sec_loc[level]++;
+        } else {
+            /* The directory record is valid */
+            if (load_rba == iso_733_to_u32(cur_record->ext_loc)) {
+                return iso_733_to_u32(cur_record->data_len);
+            }
+
+            dir_rem[level] -= cur_record->dr_len;
+            sec_offset[level] += cur_record->dr_len;
+
+            if (cur_record->file_flags & 0x2) {
+                /* Subdirectory */
+                if (level == ISO9660_MAX_DIR_DEPTH - 1) {
+                    sclp_print("ISO-9660 directory depth limit exceeded\n");
+                } else {
+                    level++;
+                    sec_loc[level] = iso_733_to_u32(cur_record->ext_loc);
+                    sec_offset[level] = 0;
+                    dir_rem[level] = 0;
+                    continue;
+                }
+            }
+        }
+
+        if (dir_rem[level] == 0) {
+            /* Nothing remaining */
+            level--;
+            read_iso_sector(sec_loc[level], temp,
+                            "Failed to read ISO directory");
+        }
+    }
+
+    return 0;
+}
+
+static void load_iso_bc_entry(IsoBcSection *load)
+{
+    IsoBcSection s = *load;
+    /*
+     * According to spec, extent for each file
+     * is padded and ISO_SECTOR_SIZE bytes aligned
+     */
+    uint32_t blks_to_load = bswap16(s.sector_count) >> ET_SECTOR_SHIFT;
+    uint32_t real_size = iso_get_file_size(bswap32(s.load_rba));
+
+    if (real_size) {
+        /* Round up blocks to load */
+        blks_to_load = (real_size + ISO_SECTOR_SIZE - 1) / ISO_SECTOR_SIZE;
+        sclp_print("ISO boot image size verified\n");
+    } else {
+        sclp_print("ISO boot image size could not be verified\n");
+    }
+
+    read_iso_boot_image(bswap32(s.load_rba),
+                        (void *)((uint64_t)bswap16(s.load_segment)),
+                        blks_to_load);
+
+    /* Trying to get PSW at zero address */
+    if (*((uint64_t *)0) & IPL_PSW_MASK) {
+        jump_to_IPL_code((*((uint64_t *)0)) & 0x7fffffff);
+    }
+
+    /* Try default linux start address */
+    jump_to_IPL_code(KERN_IMAGE_START);
+}
+
+static uint32_t find_iso_bc(void)
+{
+    IsoVolDesc *vd = (IsoVolDesc *)sec;
+    uint32_t block_num = ISO_PRIMARY_VD_SECTOR;
+
+    if (virtio_read_many(block_num++, sec, 1)) {
+        /* If primary vd cannot be read, there is no boot catalog */
+        return 0;
+    }
+
+    while (is_iso_vd_valid(vd) && vd->type != VOL_DESC_TERMINATOR) {
+        if (vd->type == VOL_DESC_TYPE_BOOT) {
+            IsoVdElTorito *et = &vd->vd.boot;
+
+            if (!_memcmp(&et->el_torito[0], el_torito_magic, 32)) {
+                return bswap32(et->bc_offset);
+            }
+        }
+        read_iso_sector(block_num++, sec,
+                        "Failed to read ISO volume descriptor");
+    }
+
+    return 0;
+}
+
+static IsoBcSection *find_iso_bc_entry(void)
+{
+    IsoBcEntry *e = (IsoBcEntry *)sec;
+    uint32_t offset = find_iso_bc();
+    int i;
+
+    if (!offset) {
+        return NULL;
+    }
+
+    read_iso_sector(offset, sec, "Failed to read El Torito boot catalog");
+
+    if (!is_iso_bc_valid(e)) {
+        /* The validation entry is mandatory */
+        virtio_panic("No valid boot catalog found!\n");
+        return NULL;
+    }
+
+    /*
+     * Each entry has 32 bytes size, so one sector cannot contain > 64 entries.
+     * We consider only boot catalogs with no more than 64 entries.
+     */
+    for (i = 1; i < ISO_BC_ENTRY_PER_SECTOR; i++) {
+        if (e[i].id == ISO_BC_BOOTABLE_SECTION) {
+            if (is_iso_bc_entry_compatible(&e[i].body.sect)) {
+                return &e[i].body.sect;
+            }
+        }
+    }
+
+    virtio_panic("No suitable boot entry found on ISO-9660 media!\n");
+
+    return NULL;
+}
+
+static void ipl_iso_el_torito(void)
+{
+    IsoBcSection *s = find_iso_bc_entry();
+
+    if (s) {
+        load_iso_bc_entry(s);
+        /* no return */
+    }
+}
+
+/***********************************************************************
  * IPL starts here
  */
 
@@ -463,6 +663,12 @@ void zipl_load(void)
         ipl_scsi(); /* no return */
     }
 
+    /* Check if we can boot as ISO media */
+    if (virtio_guessed_disk_nature()) {
+        virtio_assume_iso9660();
+    }
+    ipl_iso_el_torito();
+
     /* We have failed to follow the SCSI scheme, so */
     if (virtio_guessed_disk_nature()) {
         sclp_print("Using guessed DASD geometry.\n");
diff --git a/pc-bios/s390-ccw/bootmap.h b/pc-bios/s390-ccw/bootmap.h
index ab132e3..f98765b 100644
--- a/pc-bios/s390-ccw/bootmap.h
+++ b/pc-bios/s390-ccw/bootmap.h
@@ -341,4 +341,210 @@ static inline bool magic_match(const void *data, const void *magic)
     return *((uint32_t *)data) == *((uint32_t *)magic);
 }
 
+static inline int _memcmp(const void *s1, const void *s2, size_t n)
+{
+    int i;
+    const uint8_t *p1 = s1, *p2 = s2;
+
+    for (i = 0; i < n; i++) {
+        if (p1[i] != p2[i]) {
+            return p1[i] > p2[i] ? 1 : -1;
+        }
+    }
+
+    return 0;
+}
+
+/* from include/qemu/bswap.h */
+
+/* El Torito is always little-endian */
+static inline uint16_t bswap16(uint16_t x)
+{
+    return ((x & 0x00ff) << 8) | ((x & 0xff00) >> 8);
+}
+
+static inline uint32_t bswap32(uint32_t x)
+{
+    return ((x & 0x000000ffU) << 24) | ((x & 0x0000ff00U) <<  8) |
+           ((x & 0x00ff0000U) >>  8) | ((x & 0xff000000U) >> 24);
+}
+
+static inline uint64_t bswap64(uint64_t x)
+{
+    return ((x & 0x00000000000000ffULL) << 56) |
+           ((x & 0x000000000000ff00ULL) << 40) |
+           ((x & 0x0000000000ff0000ULL) << 24) |
+           ((x & 0x00000000ff000000ULL) <<  8) |
+           ((x & 0x000000ff00000000ULL) >>  8) |
+           ((x & 0x0000ff0000000000ULL) >> 24) |
+           ((x & 0x00ff000000000000ULL) >> 40) |
+           ((x & 0xff00000000000000ULL) >> 56);
+}
+
+static inline uint32_t iso_733_to_u32(uint64_t x)
+{
+    return (uint32_t)x;
+}
+
+#define ISO_SECTOR_SIZE 2048
+/* El Torito specifies boot image size in 512 byte blocks */
+#define ET_SECTOR_SHIFT 2
+#define KERN_IMAGE_START 0x010000UL
+#define PSW_MASK_64 0x0000000100000000ULL
+#define PSW_MASK_32 0x0000000080000000ULL
+#define IPL_PSW_MASK (PSW_MASK_32 | PSW_MASK_64)
+
+#define ISO_PRIMARY_VD_SECTOR 16
+
+static inline void read_iso_sector(uint32_t block_offset, void *buf,
+                                   const char *errmsg)
+{
+    IPL_assert(virtio_read_many(block_offset, buf, 1) == 0, errmsg);
+}
+
+static inline void read_iso_boot_image(uint32_t block_offset, void *load_addr,
+                                       uint32_t blks_to_load)
+{
+    IPL_assert(virtio_read_many(block_offset, load_addr, blks_to_load) == 0,
+               "Failed to read boot image!");
+}
+
+const uint8_t el_torito_magic[] = "EL TORITO SPECIFICATION"
+                                  "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
+
+#define ISO9660_MAX_DIR_DEPTH 8
+
+typedef struct IsoDirHdr {
+    uint8_t dr_len;
+    uint8_t ear_len;
+    uint64_t ext_loc;
+    uint64_t data_len;
+    uint8_t recording_datetime[7];
+    uint8_t file_flags;
+    uint8_t file_unit_size;
+    uint8_t gap_size;
+    uint32_t vol_seqnum;
+    uint8_t fileid_len;
+} __attribute__((packed)) IsoDirHdr;
+
+typedef struct IsoVdElTorito {
+    uint8_t el_torito[32]; /* must contain el_torito_magic value */
+    uint8_t unused0[32];
+    uint32_t bc_offset;
+    uint8_t unused1[1974];
+} __attribute__((packed)) IsoVdElTorito;
+
+typedef struct IsoVdPrimary {
+    uint8_t unused1;
+    uint8_t sys_id[32];
+    uint8_t vol_id[32];
+    uint8_t unused2[8];
+    uint64_t vol_space_size;
+    uint8_t unused3[32];
+    uint32_t vol_set_size;
+    uint32_t vol_seqnum;
+    uint32_t log_block_size;
+    uint64_t path_table_size;
+    uint32_t l_path_table;
+    uint32_t opt_l_path_table;
+    uint32_t m_path_table;
+    uint32_t opt_m_path_table;
+    IsoDirHdr rootdir;
+    uint8_t root_null;
+    uint8_t reserved2[1858];
+} __attribute__((packed)) IsoVdPrimary;
+
+typedef struct IsoVolDesc {
+    uint8_t type;
+    uint8_t ident[5];
+    uint8_t version;
+    union {
+        IsoVdElTorito boot;
+        IsoVdPrimary primary;
+    } vd;
+} __attribute__((packed)) IsoVolDesc;
+
+const uint8_t vol_desc_magic[] = "CD001";
+#define VOL_DESC_TYPE_BOOT 0
+#define VOL_DESC_TYPE_PRIMARY 1
+#define VOL_DESC_TYPE_SUPPLEMENT 2
+#define VOL_DESC_TYPE_PARTITION 3
+#define VOL_DESC_TERMINATOR 255
+
+static inline bool is_iso_vd_valid(IsoVolDesc *vd)
+{
+    return !_memcmp(&vd->ident[0], vol_desc_magic, 5) &&
+           vd->version == 0x1 &&
+           vd->type <= VOL_DESC_TYPE_PARTITION;
+}
+
+typedef struct IsoBcValid {
+    uint8_t platform_id;
+    uint16_t reserved;
+    uint8_t id[24];
+    uint16_t checksum;
+    uint8_t key[2];
+} __attribute__((packed)) IsoBcValid;
+
+typedef struct IsoBcSection {
+    uint8_t boot_type;
+    uint16_t load_segment;
+    uint8_t sys_type;
+    uint8_t unused;
+    uint16_t sector_count;
+    uint32_t load_rba;
+    uint8_t selection[20];
+} __attribute__((packed)) IsoBcSection;
+
+typedef struct IsoBcHdr {
+    uint8_t platform_id;
+    uint16_t sect_num;
+    uint8_t id[28];
+} __attribute__((packed)) IsoBcHdr;
+
+/*
+ * Match two CCWs located after PSW and eight filler bytes.
+ * From libmagic and arch/s390/kernel/head.S.
+ */
+const uint8_t linux_s390_magic[] = "\x02\x00\x00\x18\x60\x00\x00\x50\x02\x00"
+                                   "\x00\x68\x60\x00\x00\x50\x40\x40\x40\x40"
+                                   "\x40\x40\x40\x40";
+
+typedef struct IsoBcEntry {
+    uint8_t id;
+    union {
+        IsoBcValid valid; /* id == 0x01 */
+        IsoBcSection sect; /* id == 0x88 || id == 0x0 */
+        IsoBcHdr hdr; /* id == 0x90 || id == 0x91 */
+    } body;
+} __attribute__((packed)) IsoBcEntry;
+
+#define ISO_BC_ENTRY_PER_SECTOR (ISO_SECTOR_SIZE / sizeof(IsoBcEntry))
+#define ISO_BC_HDR_VALIDATION 0x01
+#define ISO_BC_BOOTABLE_SECTION 0x88
+#define ISO_BC_MAGIC_55 0x55
+#define ISO_BC_MAGIC_AA 0xaa
+#define ISO_BC_PLATFORM_X86 0x0
+#define ISO_BC_PLATFORM_PPC 0x1
+#define ISO_BC_PLATFORM_MAC 0x2
+
+static inline bool is_iso_bc_valid(IsoBcEntry *e)
+{
+    IsoBcValid *v = &e->body.valid;
+
+    if (e->id != ISO_BC_HDR_VALIDATION) {
+        return false;
+    }
+
+    if (v->platform_id != ISO_BC_PLATFORM_X86 &&
+        v->platform_id != ISO_BC_PLATFORM_PPC &&
+        v->platform_id != ISO_BC_PLATFORM_MAC) {
+        return false;
+    }
+
+    return v->key[0] == ISO_BC_MAGIC_55 &&
+           v->key[1] == ISO_BC_MAGIC_AA &&
+           v->reserved == 0x0;
+}
+
 #endif /* _PC_BIOS_S390_CCW_BOOTMAP_H */
diff --git a/pc-bios/s390-ccw/virtio.c b/pc-bios/s390-ccw/virtio.c
index 57ff1b0..87aed38 100644
--- a/pc-bios/s390-ccw/virtio.c
+++ b/pc-bios/s390-ccw/virtio.c
@@ -278,6 +278,13 @@ void virtio_assume_scsi(void)
     blk_cfg.physical_block_exp = 0;
 }
 
+void virtio_assume_iso9660(void)
+{
+    guessed_disk_nature = true;
+    blk_cfg.blk_size = 2048;
+    blk_cfg.physical_block_exp = 0;
+}
+
 void virtio_assume_eckd(void)
 {
     guessed_disk_nature = true;
diff --git a/pc-bios/s390-ccw/virtio.h b/pc-bios/s390-ccw/virtio.h
index c23466b..afa01a8 100644
--- a/pc-bios/s390-ccw/virtio.h
+++ b/pc-bios/s390-ccw/virtio.h
@@ -187,6 +187,7 @@ typedef struct VirtioBlkConfig {
 bool virtio_guessed_disk_nature(void);
 void virtio_assume_scsi(void);
 void virtio_assume_eckd(void);
+void virtio_assume_iso9660(void);
 
 extern bool virtio_disk_is_scsi(void);
 extern bool virtio_disk_is_eckd(void);
@@ -199,14 +200,9 @@ extern int virtio_read_many(ulong sector, void *load_addr, int sec_num);
 
 #define VIRTIO_SECTOR_SIZE 512
 
-static inline ulong virtio_eckd_sector_adjust(ulong sector)
-{
-     return sector * (virtio_get_block_size() / VIRTIO_SECTOR_SIZE);
-}
-
 static inline ulong virtio_sector_adjust(ulong sector)
 {
-    return virtio_disk_is_eckd() ? virtio_eckd_sector_adjust(sector) : sector;
+    return sector * (virtio_get_block_size() / VIRTIO_SECTOR_SIZE);
 }
 
 #endif /* VIRTIO_H */
diff --git a/pc-bios/slof.bin b/pc-bios/slof.bin
index 701933f..90f3099 100644
--- a/pc-bios/slof.bin
+++ b/pc-bios/slof.bin
diff --git a/qapi-schema.json b/qapi-schema.json
index 702b7b5..8b1a423 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -430,6 +430,8 @@
 #
 # @active: in the process of doing migration.
 #
+# @postcopy-active: like active, but now in postcopy mode. (since 2.5)
+#
 # @completed: migration is finished.
 #
 # @failed: some error occurred during migration process.
@@ -439,7 +441,7 @@
 ##
 { 'enum': 'MigrationStatus',
   'data': [ 'none', 'setup', 'cancelling', 'cancelled',
-            'active', 'completed', 'failed' ] }
+            'active', 'postcopy-active', 'completed', 'failed' ] }
 
 ##
 # @MigrationInfo
@@ -540,11 +542,15 @@
 # @auto-converge: If enabled, QEMU will automatically throttle down the guest
 #          to speed up convergence of RAM migration. (since 1.6)
 #
+# @x-postcopy-ram: Start executing on the migration target before all of RAM has
+#          been migrated, pulling the remaining pages along as needed. NOTE: If
+#          the migration fails during postcopy the VM will fail.  (since 2.5)
+#
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
   'data': ['xbzrle', 'rdma-pin-all', 'auto-converge', 'zero-blocks',
-           'compress', 'events'] }
+           'compress', 'events', 'x-postcopy-ram'] }
 
 ##
 # @MigrationCapabilityStatus
@@ -698,6 +704,16 @@
             '*tls-port': 'int', '*cert-subject': 'str' } }
 
 ##
+# @migrate-start-postcopy
+#
+# Followup to a migration command to switch the migration to postcopy mode.
+# The x-postcopy-ram capability must be set before the original migration
+# command.
+#
+# Since: 2.5
+{ 'command': 'migrate-start-postcopy' }
+
+##
 # @MouseInfo:
 #
 # Information about a mouse device.
@@ -1520,6 +1536,26 @@
   'data': { } }
 
 ##
+# @ActionCompletionMode
+#
+# An enumeration of Transactional completion modes.
+#
+# @individual: Do not attempt to cancel any other Actions if any Actions fail
+#              after the Transaction request succeeds. All Actions that
+#              can complete successfully will do so without waiting on others.
+#              This is the default.
+#
+# @grouped: If any Action fails after the Transaction succeeds, cancel all
+#           Actions. Actions do not complete until all Actions are ready to
+#           complete. May be rejected by Actions that do not support this
+#           completion mode.
+#
+# Since: 2.5
+##
+{ 'enum': 'ActionCompletionMode',
+  'data': [ 'individual', 'grouped' ] }
+
+##
 # @TransactionAction
 #
 # A discriminated record of operations that can be performed with
@@ -1531,25 +1567,52 @@
 # abort since 1.6
 # blockdev-snapshot-internal-sync since 1.7
 # blockdev-backup since 2.3
+# blockdev-snapshot since 2.5
+# block-dirty-bitmap-add since 2.5
+# block-dirty-bitmap-clear since 2.5
 ##
 { 'union': 'TransactionAction',
   'data': {
-       'blockdev-snapshot-sync': 'BlockdevSnapshot',
+       'blockdev-snapshot': 'BlockdevSnapshot',
+       'blockdev-snapshot-sync': 'BlockdevSnapshotSync',
        'drive-backup': 'DriveBackup',
        'blockdev-backup': 'BlockdevBackup',
        'abort': 'Abort',
-       'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal'
+       'blockdev-snapshot-internal-sync': 'BlockdevSnapshotInternal',
+       'block-dirty-bitmap-add': 'BlockDirtyBitmapAdd',
+       'block-dirty-bitmap-clear': 'BlockDirtyBitmap'
    } }
 
 ##
+# @TransactionProperties
+#
+# Optional arguments to modify the behavior of a Transaction.
+#
+# @completion-mode: #optional Controls how jobs launched asynchronously by
+#                   Actions will complete or fail as a group.
+#                   See @ActionCompletionMode for details.
+#
+# Since: 2.5
+##
+{ 'struct': 'TransactionProperties',
+  'data': {
+       '*completion-mode': 'ActionCompletionMode'
+  }
+}
+
+##
 # @transaction
 #
 # Executes a number of transactionable QMP commands atomically. If any
 # operation fails, then the entire set of actions will be abandoned and the
 # appropriate error returned.
 #
-#  List of:
-#  @TransactionAction: information needed for the respective operation
+# @actions: List of @TransactionAction;
+#           information needed for the respective operations.
+#
+# @properties: #optional structure of additional options to control the
+#              execution of the transaction. See @TransactionProperties
+#              for additional detail.
 #
 # Returns: nothing on success
 #          Errors depend on the operations of the transaction
@@ -1561,7 +1624,10 @@
 # Since 1.1
 ##
 { 'command': 'transaction',
-  'data': { 'actions': [ 'TransactionAction' ] } }
+  'data': { 'actions': [ 'TransactionAction' ],
+            '*properties': 'TransactionProperties'
+          }
+}
 
 ##
 # @human-monitor-command:
@@ -1842,8 +1908,10 @@
 #          device's password.  The behavior of reads and writes to the block
 #          device between when these calls are executed is undefined.
 #
-# Notes:  It is strongly recommended that this interface is not used especially
-#         for changing block devices.
+# Notes:  This interface is deprecated, and it is strongly recommended that you
+#         avoid using it.  For changing block devices, use
+#         blockdev-change-medium; for changing VNC parameters, use
+#         change-vnc-password.
 #
 # Since: 0.14.0
 ##
@@ -3511,16 +3579,22 @@
 # Button of a pointer input device (mouse, tablet).
 #
 # Since: 2.0
+#
+# Note that the spelling of these values may change when the
+# x-input-send-event is promoted out of experimental status.
 ##
 { 'enum'  : 'InputButton',
   'data'  : [ 'Left', 'Middle', 'Right', 'WheelUp', 'WheelDown' ] }
 
 ##
-# @InputButton
+# @InputAxis
 #
 # Position axis of a pointer input device (mouse, tablet).
 #
 # Since: 2.0
+#
+# Note that the spelling of these values may change when the
+# x-input-send-event is promoted out of experimental status.
 ##
 { 'enum'  : 'InputAxis',
   'data'  : [ 'X', 'Y' ] }
@@ -3611,7 +3685,10 @@
 #
 # Since: 2.2
 #
-# Note: this command is experimental, and not a stable API.
+# Note: this command is experimental, and not a stable API.  Things that
+# might change before it becomes stable include the spelling of enum
+# values for InputButton and InputAxis, and the notion of how to designate
+# which console will receive the event.
 #
 ##
 { 'command': 'x-input-send-event',
@@ -3876,3 +3953,21 @@
 
 # Rocker ethernet network switch
 { 'include': 'qapi/rocker.json' }
+
+##
+# ReplayMode:
+#
+# Mode of the replay subsystem.
+#
+# @none: normal execution mode. Replay or record are not enabled.
+#
+# @record: record mode. All non-deterministic data is written into the
+#          replay log.
+#
+# @play: replay mode. Non-deterministic data required for system execution
+#        is read from the log.
+#
+# Since: 2.5
+##
+{ 'enum': 'ReplayMode',
+  'data': [ 'none', 'record', 'play' ] }
diff --git a/qapi/block-core.json b/qapi/block-core.json
index 425fdab..a07b13f 100644
--- a/qapi/block-core.json
+++ b/qapi/block-core.json
@@ -414,6 +414,59 @@
 ##
 { 'command': 'query-block', 'returns': ['BlockInfo'] }
 
+
+##
+# @BlockDeviceTimedStats:
+#
+# Statistics of a block device during a given interval of time.
+#
+# @interval_length: Interval used for calculating the statistics,
+#                   in seconds.
+#
+# @min_rd_latency_ns: Minimum latency of read operations in the
+#                     defined interval, in nanoseconds.
+#
+# @min_wr_latency_ns: Minimum latency of write operations in the
+#                     defined interval, in nanoseconds.
+#
+# @min_flush_latency_ns: Minimum latency of flush operations in the
+#                        defined interval, in nanoseconds.
+#
+# @max_rd_latency_ns: Maximum latency of read operations in the
+#                     defined interval, in nanoseconds.
+#
+# @max_wr_latency_ns: Maximum latency of write operations in the
+#                     defined interval, in nanoseconds.
+#
+# @max_flush_latency_ns: Maximum latency of flush operations in the
+#                        defined interval, in nanoseconds.
+#
+# @avg_rd_latency_ns: Average latency of read operations in the
+#                     defined interval, in nanoseconds.
+#
+# @avg_wr_latency_ns: Average latency of write operations in the
+#                     defined interval, in nanoseconds.
+#
+# @avg_flush_latency_ns: Average latency of flush operations in the
+#                        defined interval, in nanoseconds.
+#
+# @avg_rd_queue_depth: Average number of pending read operations
+#                      in the defined interval.
+#
+# @avg_wr_queue_depth: Average number of pending write operations
+#                      in the defined interval.
+#
+# Since: 2.5
+##
+
+{ 'struct': 'BlockDeviceTimedStats',
+  'data': { 'interval_length': 'int', 'min_rd_latency_ns': 'int',
+            'max_rd_latency_ns': 'int', 'avg_rd_latency_ns': 'int',
+            'min_wr_latency_ns': 'int', 'max_wr_latency_ns': 'int',
+            'avg_wr_latency_ns': 'int', 'min_flush_latency_ns': 'int',
+            'max_flush_latency_ns': 'int', 'avg_flush_latency_ns': 'int',
+            'avg_rd_queue_depth': 'number', 'avg_wr_queue_depth': 'number' } }
+
 ##
 # @BlockDeviceStats:
 #
@@ -448,6 +501,37 @@
 # @wr_merged: Number of write requests that have been merged into another
 #             request (Since 2.3).
 #
+# @idle_time_ns: #optional Time since the last I/O operation, in
+#                nanoseconds. If the field is absent it means that
+#                there haven't been any operations yet (Since 2.5).
+#
+# @failed_rd_operations: The number of failed read operations
+#                        performed by the device (Since 2.5)
+#
+# @failed_wr_operations: The number of failed write operations
+#                        performed by the device (Since 2.5)
+#
+# @failed_flush_operations: The number of failed flush operations
+#                           performed by the device (Since 2.5)
+#
+# @invalid_rd_operations: The number of invalid read operations
+#                          performed by the device (Since 2.5)
+#
+# @invalid_wr_operations: The number of invalid write operations
+#                         performed by the device (Since 2.5)
+#
+# @invalid_flush_operations: The number of invalid flush operations
+#                            performed by the device (Since 2.5)
+#
+# @account_invalid: Whether invalid operations are included in the
+#                   last access statistics (Since 2.5)
+#
+# @account_failed: Whether failed operations are included in the
+#                  latency and last access statistics (Since 2.5)
+#
+# @timed_stats: Statistics specific to the set of previously defined
+#               intervals of time (Since 2.5)
+#
 # Since: 0.14.0
 ##
 { 'struct': 'BlockDeviceStats',
@@ -455,7 +539,12 @@
            'wr_operations': 'int', 'flush_operations': 'int',
            'flush_total_time_ns': 'int', 'wr_total_time_ns': 'int',
            'rd_total_time_ns': 'int', 'wr_highest_offset': 'int',
-           'rd_merged': 'int', 'wr_merged': 'int' } }
+           'rd_merged': 'int', 'wr_merged': 'int', '*idle_time_ns': 'int',
+           'failed_rd_operations': 'int', 'failed_wr_operations': 'int',
+           'failed_flush_operations': 'int', 'invalid_rd_operations': 'int',
+           'invalid_wr_operations': 'int', 'invalid_flush_operations': 'int',
+           'account_invalid': 'bool', 'account_failed': 'bool',
+           'timed_stats': ['BlockDeviceTimedStats'] } }
 
 ##
 # @BlockStats:
@@ -682,7 +771,7 @@
   'data': [ 'existing', 'absolute-paths' ] }
 
 ##
-# @BlockdevSnapshot
+# @BlockdevSnapshotSync
 #
 # Either @device or @node-name must be set but not both.
 #
@@ -699,12 +788,27 @@
 # @mode: #optional whether and how QEMU should create a new image, default is
 #        'absolute-paths'.
 ##
-{ 'struct': 'BlockdevSnapshot',
+{ 'struct': 'BlockdevSnapshotSync',
   'data': { '*device': 'str', '*node-name': 'str',
             'snapshot-file': 'str', '*snapshot-node-name': 'str',
             '*format': 'str', '*mode': 'NewImageMode' } }
 
 ##
+# @BlockdevSnapshot
+#
+# @node: device or node name that will have a snapshot created.
+#
+# @overlay: reference to the existing block device that will become
+#           the overlay of @node, as part of creating the snapshot.
+#           It must not have a current backing file (this can be
+#           achieved by passing "backing": "" to blockdev-add).
+#
+# Since 2.5
+##
+{ 'struct': 'BlockdevSnapshot',
+  'data': { 'node': 'str', 'overlay': 'str' } }
+
+##
 # @DriveBackup
 #
 # @device: the name of the device which should be copied.
@@ -790,7 +894,7 @@
 #
 # Generates a synchronous snapshot of a block device.
 #
-# For the arguments, see the documentation of BlockdevSnapshot.
+# For the arguments, see the documentation of BlockdevSnapshotSync.
 #
 # Returns: nothing on success
 #          If @device is not a valid block device, DeviceNotFound
@@ -798,6 +902,19 @@
 # Since 0.14.0
 ##
 { 'command': 'blockdev-snapshot-sync',
+  'data': 'BlockdevSnapshotSync' }
+
+
+##
+# @blockdev-snapshot
+#
+# Generates a snapshot of a block device.
+#
+# For the arguments, see the documentation of BlockdevSnapshot.
+#
+# Since 2.5
+##
+{ 'command': 'blockdev-snapshot',
   'data': 'BlockdevSnapshot' }
 
 ##
@@ -1408,6 +1525,14 @@
 #                 (default: enospc)
 # @read-only:     #optional whether the block device should be read-only
 #                 (default: false)
+# @stats-account-invalid: #optional whether to include invalid
+#                         operations when computing last access statistics
+#                         (default: true) (Since 2.5)
+# @stats-account-failed: #optional whether to include failed
+#                         operations when computing latency and last
+#                         access statistics (default: true) (Since 2.5)
+# @stats-intervals: #optional list of intervals for collecting I/O
+#                   statistics, in seconds (default: none) (Since 2.5)
 # @detect-zeroes: #optional detect and optimize zero writes (Since 2.1)
 #                 (default: off)
 #
@@ -1423,6 +1548,9 @@
             '*rerror': 'BlockdevOnError',
             '*werror': 'BlockdevOnError',
             '*read-only': 'bool',
+            '*stats-account-invalid': 'bool',
+            '*stats-account-failed': 'bool',
+            '*stats-intervals': ['int'],
             '*detect-zeroes': 'BlockdevDetectZeroesOptions' } }
 
 ##
@@ -1867,8 +1995,8 @@
 # level and no BlockBackend will be created.
 #
 # This command is still a work in progress.  It doesn't support all
-# block drivers, it lacks a matching blockdev-del, and more.  Stay
-# away from it unless you want to help with its development.
+# block drivers among other things.  Stay away from it unless you want
+# to help with its development.
 #
 # @options: block device options for the new device
 #
@@ -1876,6 +2004,160 @@
 ##
 { 'command': 'blockdev-add', 'data': { 'options': 'BlockdevOptions' } }
 
+##
+# @x-blockdev-del:
+#
+# Deletes a block device that has been added using blockdev-add.
+# The selected device can be either a block backend or a graph node.
+#
+# In the former case the backend will be destroyed, along with its
+# inserted medium if there's any. The command will fail if the backend
+# or its medium are in use.
+#
+# In the latter case the node will be destroyed. The command will fail
+# if the node is attached to a block backend or is otherwise being
+# used.
+#
+# One of @id or @node-name must be specified, but not both.
+#
+# This command is still a work in progress and is considered
+# experimental. Stay away from it unless you want to help with its
+# development.
+#
+# @id: #optional Name of the block backend device to delete.
+#
+# @node-name: #optional Name of the graph node to delete.
+#
+# Since: 2.5
+##
+{ 'command': 'x-blockdev-del', 'data': { '*id': 'str', '*node-name': 'str' } }
+
+##
+# @blockdev-open-tray:
+#
+# Opens a block device's tray. If there is a block driver state tree inserted as
+# a medium, it will become inaccessible to the guest (but it will remain
+# associated to the block device, so closing the tray will make it accessible
+# again).
+#
+# If the tray was already open before, this will be a no-op.
+#
+# Once the tray opens, a DEVICE_TRAY_MOVED event is emitted. There are cases in
+# which no such event will be generated, these include:
+# - if the guest has locked the tray, @force is false and the guest does not
+#   respond to the eject request
+# - if the BlockBackend denoted by @device does not have a guest device attached
+#   to it
+# - if the guest device does not have an actual tray and is empty, for instance
+#   for floppy disk drives
+#
+# @device: block device name
+#
+# @force:  #optional if false (the default), an eject request will be sent to
+#          the guest if it has locked the tray (and the tray will not be opened
+#          immediately); if true, the tray will be opened regardless of whether
+#          it is locked
+#
+# Since: 2.5
+##
+{ 'command': 'blockdev-open-tray',
+  'data': { 'device': 'str',
+            '*force': 'bool' } }
+
+##
+# @blockdev-close-tray:
+#
+# Closes a block device's tray. If there is a block driver state tree associated
+# with the block device (which is currently ejected), that tree will be loaded
+# as the medium.
+#
+# If the tray was already closed before, this will be a no-op.
+#
+# @device: block device name
+#
+# Since: 2.5
+##
+{ 'command': 'blockdev-close-tray',
+  'data': { 'device': 'str' } }
+
+##
+# @blockdev-remove-medium:
+#
+# Removes a medium (a block driver state tree) from a block device. That block
+# device's tray must currently be open (unless there is no attached guest
+# device).
+#
+# If the tray is open and there is no medium inserted, this will be a no-op.
+#
+# @device: block device name
+#
+# Since: 2.5
+##
+{ 'command': 'blockdev-remove-medium',
+  'data': { 'device': 'str' } }
+
+##
+# @blockdev-insert-medium:
+#
+# Inserts a medium (a block driver state tree) into a block device. That block
+# device's tray must currently be open (unless there is no attached guest
+# device) and there must be no medium inserted already.
+#
+# @device:    block device name
+#
+# @node-name: name of a node in the block driver state graph
+#
+# Since: 2.5
+##
+{ 'command': 'blockdev-insert-medium',
+  'data': { 'device': 'str',
+            'node-name': 'str'} }
+
+
+##
+# @BlockdevChangeReadOnlyMode:
+#
+# Specifies the new read-only mode of a block device subject to the
+# @blockdev-change-medium command.
+#
+# @retain:      Retains the current read-only mode
+#
+# @read-only:   Makes the device read-only
+#
+# @read-write:  Makes the device writable
+#
+# Since: 2.3
+##
+{ 'enum': 'BlockdevChangeReadOnlyMode',
+  'data': ['retain', 'read-only', 'read-write'] }
+
+
+##
+# @blockdev-change-medium:
+#
+# Changes the medium inserted into a block device by ejecting the current medium
+# and loading a new image file which is inserted as the new medium (this command
+# combines blockdev-open-tray, blockdev-remove-medium, blockdev-insert-medium
+# and blockdev-close-tray).
+#
+# @device:          block device name
+#
+# @filename:        filename of the new image to be loaded
+#
+# @format:          #optional, format to open the new image with (defaults to
+#                   the probed format)
+#
+# @read-only-mode:  #optional, change the read-only mode of the device; defaults
+#                   to 'retain'
+#
+# Since: 2.5
+##
+{ 'command': 'blockdev-change-medium',
+  'data': { 'device': 'str',
+            'filename': 'str',
+            '*format': 'str',
+            '*read-only-mode': 'BlockdevChangeReadOnlyMode' } }
+
 
 ##
 # @BlockErrorAction
diff --git a/qapi/introspect.json b/qapi/introspect.json
index cc50dc6..9e9369e 100644
--- a/qapi/introspect.json
+++ b/qapi/introspect.json
@@ -22,9 +22,22 @@
 # what's there), not interface specification.  The specification is in
 # the QAPI schema.
 #
+# Furthermore, while we strive to keep the QMP wire format
+# backwards-compatible across qemu versions, the introspection output
+# is not guaranteed to have the same stability.  For example, one
+# version of qemu may list an object member as an optional
+# non-variant, while another lists the same member only through the
+# object's variants; or the type of a member may change from a generic
+# string into a specific enum or from one specific type into an
+# alternate that includes the original type alongside something else.
+#
 # Returns: array of @SchemaInfo, where each element describes an
 # entity in the ABI: command, event, type, ...
 #
+# The order of the various SchemaInfo is unspecified; however, all
+# names are guaranteed to be unique (no name will be duplicated with
+# different meta-types).
+#
 # Note: the QAPI schema is also used to help define *internal*
 # interfaces, by defining QAPI types.  These are not part of the QMP
 # wire ABI, and therefore not returned by this command.
@@ -78,7 +91,8 @@
 #        Commands and events have the name defined in the QAPI schema.
 #        Unlike command and event names, type names are not part of
 #        the wire ABI.  Consequently, type names are meaningless
-#        strings here.
+#        strings here, although they are still guaranteed unique
+#        regardless of @meta-type.
 #
 # All references to other SchemaInfo are by name.
 #
@@ -130,7 +144,7 @@
 #
 # Additional SchemaInfo members for meta-type 'enum'.
 #
-# @values: the enumeration type's values.
+# @values: the enumeration type's values, in no particular order.
 #
 # Values of this type are JSON string on the wire.
 #
@@ -158,14 +172,16 @@
 #
 # Additional SchemaInfo members for meta-type 'object'.
 #
-# @members: the object type's (non-variant) members.
+# @members: the object type's (non-variant) members, in no particular order.
 #
 # @tag: #optional the name of the member serving as type tag.
 #       An element of @members with this name must exist.
 #
 # @variants: #optional variant members, i.e. additional members that
 #            depend on the type tag's value.  Present exactly when
-#            @tag is present.
+#            @tag is present.  The variants are in no particular order,
+#            and may even differ from the order of the values of the
+#            enum type of the @tag.
 #
 # Values of this type are JSON object on the wire.
 #
@@ -219,7 +235,7 @@
 #
 # Additional SchemaInfo members for meta-type 'alternate'.
 #
-# @members: the alternate type's members.
+# @members: the alternate type's members, in no particular order.
 #           The members' wire encoding is distinct, see
 #           docs/qapi-code-gen.txt section Alternate types.
 #
diff --git a/qemu-doc.texi b/qemu-doc.texi
index 3126abd..460ab71 100644
--- a/qemu-doc.texi
+++ b/qemu-doc.texi
@@ -1299,7 +1299,7 @@ Instead of specifying the <shm size> using POSIX shm, you may specify
 a memory backend that has hugepage support:
 
 @example
-qemu-system-i386 -object memory-backend-file,size=1G,mem-path=/mnt/hugepages,id=mb1
+qemu-system-i386 -object memory-backend-file,size=1G,mem-path=/mnt/hugepages/my-shmem-file,id=mb1
                  -device ivshmem,memdev=mb1
 @end example
 
diff --git a/qemu-img.c b/qemu-img.c
index 3025776..033011c 100644
--- a/qemu-img.c
+++ b/qemu-img.c
@@ -645,9 +645,6 @@ static void common_block_job_cb(void *opaque, int ret)
     if (ret < 0) {
         error_setg_errno(cbi->errp, -ret, "Block job failed");
     }
-
-    /* Drop this block job's reference */
-    bdrv_unref(cbi->bs);
 }
 
 static void run_block_job(BlockJob *job, Error **errp)
@@ -656,7 +653,8 @@ static void run_block_job(BlockJob *job, Error **errp)
 
     do {
         aio_poll(aio_context, true);
-        qemu_progress_print((float)job->offset / job->len * 100.f, 0);
+        qemu_progress_print(job->len ?
+                            ((float)job->offset / job->len * 100.f) : 0.0f, 0);
     } while (!job->ready);
 
     block_job_complete_sync(job, errp);
diff --git a/qemu-io-cmds.c b/qemu-io-cmds.c
index 6e5d1e4..18fc2bd 100644
--- a/qemu-io-cmds.c
+++ b/qemu-io-cmds.c
@@ -136,7 +136,29 @@ static char **breakline(char *input, int *count)
 static int64_t cvtnum(const char *s)
 {
     char *end;
-    return qemu_strtosz_suffix(s, &end, QEMU_STRTOSZ_DEFSUFFIX_B);
+    int64_t ret;
+
+    ret = qemu_strtosz_suffix(s, &end, QEMU_STRTOSZ_DEFSUFFIX_B);
+    if (*end != '\0') {
+        /* Detritus at the end of the string */
+        return -EINVAL;
+    }
+    return ret;
+}
+
+static void print_cvtnum_err(int64_t rc, const char *arg)
+{
+    switch (rc) {
+    case -EINVAL:
+        printf("Parsing error: non-numeric argument,"
+               " or extraneous/unrecognized suffix -- %s\n", arg);
+        break;
+    case -ERANGE:
+        printf("Parsing error: argument too large -- %s\n", arg);
+        break;
+    default:
+        printf("Parsing error: %s\n", arg);
+    }
 }
 
 #define EXABYTES(x)     ((long long)(x) << 60)
@@ -294,9 +316,10 @@ static void qemu_io_free(void *p)
     qemu_vfree(p);
 }
 
-static void dump_buffer(const void *buffer, int64_t offset, int len)
+static void dump_buffer(const void *buffer, int64_t offset, int64_t len)
 {
-    int i, j;
+    uint64_t i;
+    int j;
     const uint8_t *p;
 
     for (i = 0, p = buffer; i < len; i += 16) {
@@ -319,7 +342,7 @@ static void dump_buffer(const void *buffer, int64_t offset, int len)
 }
 
 static void print_report(const char *op, struct timeval *t, int64_t offset,
-                         int count, int total, int cnt, int Cflag)
+                         int64_t count, int64_t total, int cnt, int Cflag)
 {
     char s1[64], s2[64], ts[64];
 
@@ -327,12 +350,12 @@ static void print_report(const char *op, struct timeval *t, int64_t offset,
     if (!Cflag) {
         cvtstr((double)total, s1, sizeof(s1));
         cvtstr(tdiv((double)total, *t), s2, sizeof(s2));
-        printf("%s %d/%d bytes at offset %" PRId64 "\n",
+        printf("%s %"PRId64"/%"PRId64" bytes at offset %" PRId64 "\n",
                op, total, count, offset);
         printf("%s, %d ops; %s (%s/sec and %.4f ops/sec)\n",
                s1, cnt, ts, s2, tdiv((double)cnt, *t));
     } else {/* bytes,ops,time,bytes/sec,ops/sec */
-        printf("%d,%d,%s,%.3f,%.3f\n",
+        printf("%"PRId64",%d,%s,%.3f,%.3f\n",
             total, cnt, ts,
             tdiv((double)total, *t),
             tdiv((double)cnt, *t));
@@ -359,13 +382,13 @@ create_iovec(BlockBackend *blk, QEMUIOVector *qiov, char **argv, int nr_iov,
 
         len = cvtnum(arg);
         if (len < 0) {
-            printf("non-numeric length argument -- %s\n", arg);
+            print_cvtnum_err(len, arg);
             goto fail;
         }
 
         /* should be SIZE_T_MAX, but that doesn't exist */
         if (len > INT_MAX) {
-            printf("too large length argument -- %s\n", arg);
+            printf("Argument '%s' exceeds maximum size %d\n", arg, INT_MAX);
             goto fail;
         }
 
@@ -393,11 +416,15 @@ fail:
     return buf;
 }
 
-static int do_read(BlockBackend *blk, char *buf, int64_t offset, int count,
-                   int *total)
+static int do_read(BlockBackend *blk, char *buf, int64_t offset, int64_t count,
+                   int64_t *total)
 {
     int ret;
 
+    if (count >> 9 > INT_MAX) {
+        return -ERANGE;
+    }
+
     ret = blk_read(blk, offset >> 9, (uint8_t *)buf, count >> 9);
     if (ret < 0) {
         return ret;
@@ -406,11 +433,15 @@ static int do_read(BlockBackend *blk, char *buf, int64_t offset, int count,
     return 1;
 }
 
-static int do_write(BlockBackend *blk, char *buf, int64_t offset, int count,
-                    int *total)
+static int do_write(BlockBackend *blk, char *buf, int64_t offset, int64_t count,
+                    int64_t *total)
 {
     int ret;
 
+    if (count >> 9 > INT_MAX) {
+        return -ERANGE;
+    }
+
     ret = blk_write(blk, offset >> 9, (uint8_t *)buf, count >> 9);
     if (ret < 0) {
         return ret;
@@ -419,9 +450,13 @@ static int do_write(BlockBackend *blk, char *buf, int64_t offset, int count,
     return 1;
 }
 
-static int do_pread(BlockBackend *blk, char *buf, int64_t offset, int count,
-                    int *total)
+static int do_pread(BlockBackend *blk, char *buf, int64_t offset,
+                    int64_t count, int64_t *total)
 {
+    if (count > INT_MAX) {
+        return -ERANGE;
+    }
+
     *total = blk_pread(blk, offset, (uint8_t *)buf, count);
     if (*total < 0) {
         return *total;
@@ -429,9 +464,13 @@ static int do_pread(BlockBackend *blk, char *buf, int64_t offset, int count,
     return 1;
 }
 
-static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset, int count,
-                     int *total)
+static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset,
+                     int64_t count, int64_t *total)
 {
+    if (count > INT_MAX) {
+        return -ERANGE;
+    }
+
     *total = blk_pwrite(blk, offset, (uint8_t *)buf, count);
     if (*total < 0) {
         return *total;
@@ -442,8 +481,8 @@ static int do_pwrite(BlockBackend *blk, char *buf, int64_t offset, int count,
 typedef struct {
     BlockBackend *blk;
     int64_t offset;
-    int count;
-    int *total;
+    int64_t count;
+    int64_t *total;
     int ret;
     bool done;
 } CoWriteZeroes;
@@ -463,8 +502,8 @@ static void coroutine_fn co_write_zeroes_entry(void *opaque)
     *data->total = data->count;
 }
 
-static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int count,
-                              int *total)
+static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int64_t count,
+                              int64_t *total)
 {
     Coroutine *co;
     CoWriteZeroes data = {
@@ -475,6 +514,10 @@ static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int count,
         .done   = false,
     };
 
+    if (count >> BDRV_SECTOR_BITS > INT_MAX) {
+        return -ERANGE;
+    }
+
     co = qemu_coroutine_create(co_write_zeroes_entry);
     qemu_coroutine_enter(co, &data);
     while (!data.done) {
@@ -488,10 +531,14 @@ static int do_co_write_zeroes(BlockBackend *blk, int64_t offset, int count,
 }
 
 static int do_write_compressed(BlockBackend *blk, char *buf, int64_t offset,
-                               int count, int *total)
+                               int64_t count, int64_t *total)
 {
     int ret;
 
+    if (count >> 9 > INT_MAX) {
+        return -ERANGE;
+    }
+
     ret = blk_write_compressed(blk, offset >> 9, (uint8_t *)buf, count >> 9);
     if (ret < 0) {
         return ret;
@@ -501,8 +548,12 @@ static int do_write_compressed(BlockBackend *blk, char *buf, int64_t offset,
 }
 
 static int do_load_vmstate(BlockBackend *blk, char *buf, int64_t offset,
-                           int count, int *total)
+                           int64_t count, int64_t *total)
 {
+    if (count > INT_MAX) {
+        return -ERANGE;
+    }
+
     *total = blk_load_vmstate(blk, (uint8_t *)buf, offset, count);
     if (*total < 0) {
         return *total;
@@ -511,8 +562,12 @@ static int do_load_vmstate(BlockBackend *blk, char *buf, int64_t offset,
 }
 
 static int do_save_vmstate(BlockBackend *blk, char *buf, int64_t offset,
-                           int count, int *total)
+                           int64_t count, int64_t *total)
 {
+    if (count > INT_MAX) {
+        return -ERANGE;
+    }
+
     *total = blk_save_vmstate(blk, (uint8_t *)buf, offset, count);
     if (*total < 0) {
         return *total;
@@ -642,10 +697,11 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
     int c, cnt;
     char *buf;
     int64_t offset;
-    int count;
+    int64_t count;
     /* Some compilers get confused and warn if this is not initialized.  */
-    int total = 0;
-    int pattern = 0, pattern_offset = 0, pattern_count = 0;
+    int64_t total = 0;
+    int pattern = 0;
+    int64_t pattern_offset = 0, pattern_count = 0;
 
     while ((c = getopt(argc, argv, "bCl:pP:qs:v")) != -1) {
         switch (c) {
@@ -659,7 +715,7 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
             lflag = 1;
             pattern_count = cvtnum(optarg);
             if (pattern_count < 0) {
-                printf("non-numeric length argument -- %s\n", optarg);
+                print_cvtnum_err(pattern_count, optarg);
                 return 0;
             }
             break;
@@ -680,7 +736,7 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
             sflag = 1;
             pattern_offset = cvtnum(optarg);
             if (pattern_offset < 0) {
-                printf("non-numeric length argument -- %s\n", optarg);
+                print_cvtnum_err(pattern_offset, optarg);
                 return 0;
             }
             break;
@@ -703,14 +759,18 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
 
     offset = cvtnum(argv[optind]);
     if (offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(offset, argv[optind]);
         return 0;
     }
 
     optind++;
     count = cvtnum(argv[optind]);
     if (count < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(count, argv[optind]);
+        return 0;
+    } else if (count > SIZE_MAX) {
+        printf("length cannot exceed %" PRIu64 ", given %s\n",
+               (uint64_t) SIZE_MAX, argv[optind]);
         return 0;
     }
 
@@ -734,7 +794,7 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
             return 0;
         }
         if (count & 0x1ff) {
-            printf("count %d is not sector aligned\n",
+            printf("count %"PRId64" is not sector aligned\n",
                    count);
             return 0;
         }
@@ -762,7 +822,7 @@ static int read_f(BlockBackend *blk, int argc, char **argv)
         memset(cmp_buf, pattern, pattern_count);
         if (memcmp(buf + pattern_offset, cmp_buf, pattern_count)) {
             printf("Pattern verification failed at offset %"
-                   PRId64 ", %d bytes\n",
+                   PRId64 ", %"PRId64" bytes\n",
                    offset + pattern_offset, pattern_count);
         }
         g_free(cmp_buf);
@@ -861,7 +921,7 @@ static int readv_f(BlockBackend *blk, int argc, char **argv)
 
     offset = cvtnum(argv[optind]);
     if (offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(offset, argv[optind]);
         return 0;
     }
     optind++;
@@ -957,9 +1017,9 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
     int c, cnt;
     char *buf = NULL;
     int64_t offset;
-    int count;
+    int64_t count;
     /* Some compilers get confused and warn if this is not initialized.  */
-    int total = 0;
+    int64_t total = 0;
     int pattern = 0xcd;
 
     while ((c = getopt(argc, argv, "bcCpP:qz")) != -1) {
@@ -1010,14 +1070,18 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
 
     offset = cvtnum(argv[optind]);
     if (offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(offset, argv[optind]);
         return 0;
     }
 
     optind++;
     count = cvtnum(argv[optind]);
     if (count < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(count, argv[optind]);
+        return 0;
+    } else if (count > SIZE_MAX) {
+        printf("length cannot exceed %" PRIu64 ", given %s\n",
+               (uint64_t) SIZE_MAX, argv[optind]);
         return 0;
     }
 
@@ -1029,7 +1093,7 @@ static int write_f(BlockBackend *blk, int argc, char **argv)
         }
 
         if (count & 0x1ff) {
-            printf("count %d is not sector aligned\n",
+            printf("count %"PRId64" is not sector aligned\n",
                    count);
             return 0;
         }
@@ -1142,7 +1206,7 @@ static int writev_f(BlockBackend *blk, int argc, char **argv)
 
     offset = cvtnum(argv[optind]);
     if (offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(offset, argv[optind]);
         return 0;
     }
     optind++;
@@ -1269,7 +1333,7 @@ static int multiwrite_f(BlockBackend *blk, int argc, char **argv)
         /* Read the offset of the request */
         offset = cvtnum(argv[optind]);
         if (offset < 0) {
-            printf("non-numeric offset argument -- %s\n", argv[optind]);
+            print_cvtnum_err(offset, argv[optind]);
             goto out;
         }
         optind++;
@@ -1364,6 +1428,7 @@ static void aio_write_done(void *opaque, int ret)
 
     if (ret < 0) {
         printf("aio_write failed: %s\n", strerror(-ret));
+        block_acct_failed(blk_get_stats(ctx->blk), &ctx->acct);
         goto out;
     }
 
@@ -1392,6 +1457,7 @@ static void aio_read_done(void *opaque, int ret)
 
     if (ret < 0) {
         printf("readv failed: %s\n", strerror(-ret));
+        block_acct_failed(blk_get_stats(ctx->blk), &ctx->acct);
         goto out;
     }
 
@@ -1496,7 +1562,7 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv)
 
     ctx->offset = cvtnum(argv[optind]);
     if (ctx->offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(ctx->offset, argv[optind]);
         g_free(ctx);
         return 0;
     }
@@ -1505,6 +1571,7 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv)
     if (ctx->offset & 0x1ff) {
         printf("offset %" PRId64 " is not sector aligned\n",
                ctx->offset);
+        block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
         g_free(ctx);
         return 0;
     }
@@ -1512,6 +1579,7 @@ static int aio_read_f(BlockBackend *blk, int argc, char **argv)
     nr_iov = argc - optind;
     ctx->buf = create_iovec(blk, &ctx->qiov, &argv[optind], nr_iov, 0xab);
     if (ctx->buf == NULL) {
+        block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_READ);
         g_free(ctx);
         return 0;
     }
@@ -1591,7 +1659,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv)
 
     ctx->offset = cvtnum(argv[optind]);
     if (ctx->offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(ctx->offset, argv[optind]);
         g_free(ctx);
         return 0;
     }
@@ -1600,6 +1668,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv)
     if (ctx->offset & 0x1ff) {
         printf("offset %" PRId64 " is not sector aligned\n",
                ctx->offset);
+        block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
         g_free(ctx);
         return 0;
     }
@@ -1607,6 +1676,7 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv)
     nr_iov = argc - optind;
     ctx->buf = create_iovec(blk, &ctx->qiov, &argv[optind], nr_iov, pattern);
     if (ctx->buf == NULL) {
+        block_acct_invalid(blk_get_stats(blk), BLOCK_ACCT_WRITE);
         g_free(ctx);
         return 0;
     }
@@ -1621,7 +1691,10 @@ static int aio_write_f(BlockBackend *blk, int argc, char **argv)
 
 static int aio_flush_f(BlockBackend *blk, int argc, char **argv)
 {
+    BlockAcctCookie cookie;
+    block_acct_start(blk_get_stats(blk), &cookie, 0, BLOCK_ACCT_FLUSH);
     blk_drain_all();
+    block_acct_done(blk_get_stats(blk), &cookie);
     return 0;
 }
 
@@ -1651,7 +1724,7 @@ static int truncate_f(BlockBackend *blk, int argc, char **argv)
 
     offset = cvtnum(argv[1]);
     if (offset < 0) {
-        printf("non-numeric truncate argument -- %s\n", argv[1]);
+        print_cvtnum_err(offset, argv[1]);
         return 0;
     }
 
@@ -1777,8 +1850,7 @@ static int discard_f(BlockBackend *blk, int argc, char **argv)
     struct timeval t1, t2;
     int Cflag = 0, qflag = 0;
     int c, ret;
-    int64_t offset;
-    int count;
+    int64_t offset, count;
 
     while ((c = getopt(argc, argv, "Cq")) != -1) {
         switch (c) {
@@ -1799,14 +1871,19 @@ static int discard_f(BlockBackend *blk, int argc, char **argv)
 
     offset = cvtnum(argv[optind]);
     if (offset < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(offset, argv[optind]);
         return 0;
     }
 
     optind++;
     count = cvtnum(argv[optind]);
     if (count < 0) {
-        printf("non-numeric length argument -- %s\n", argv[optind]);
+        print_cvtnum_err(count, argv[optind]);
+        return 0;
+    } else if (count >> BDRV_SECTOR_BITS > INT_MAX) {
+        printf("length cannot exceed %"PRIu64", given %s\n",
+               (uint64_t)INT_MAX << BDRV_SECTOR_BITS,
+               argv[optind]);
         return 0;
     }
 
@@ -1833,15 +1910,14 @@ out:
 static int alloc_f(BlockBackend *blk, int argc, char **argv)
 {
     BlockDriverState *bs = blk_bs(blk);
-    int64_t offset, sector_num;
-    int nb_sectors, remaining;
+    int64_t offset, sector_num, nb_sectors, remaining;
     char s1[64];
-    int num, sum_alloc;
-    int ret;
+    int num, ret;
+    int64_t sum_alloc;
 
     offset = cvtnum(argv[1]);
     if (offset < 0) {
-        printf("non-numeric offset argument -- %s\n", argv[1]);
+        print_cvtnum_err(offset, argv[1]);
         return 0;
     } else if (offset & 0x1ff) {
         printf("offset %" PRId64 " is not sector aligned\n",
@@ -1852,7 +1928,11 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
     if (argc == 3) {
         nb_sectors = cvtnum(argv[2]);
         if (nb_sectors < 0) {
-            printf("non-numeric length argument -- %s\n", argv[2]);
+            print_cvtnum_err(nb_sectors, argv[2]);
+            return 0;
+        } else if (nb_sectors > INT_MAX) {
+            printf("length argument cannot exceed %d, given %s\n",
+                   INT_MAX, argv[2]);
             return 0;
         }
     } else {
@@ -1881,7 +1961,7 @@ static int alloc_f(BlockBackend *blk, int argc, char **argv)
 
     cvtstr(offset, s1, sizeof(s1));
 
-    printf("%d/%d sectors allocated at offset %s\n",
+    printf("%"PRId64"/%"PRId64" sectors allocated at offset %s\n",
            sum_alloc, nb_sectors, s1);
     return 0;
 }
@@ -2191,9 +2271,13 @@ static const cmdinfo_t sigraise_cmd = {
 
 static int sigraise_f(BlockBackend *blk, int argc, char **argv)
 {
-    int sig = cvtnum(argv[1]);
+    int64_t sig = cvtnum(argv[1]);
     if (sig < 0) {
-        printf("non-numeric signal number argument -- %s\n", argv[1]);
+        print_cvtnum_err(sig, argv[1]);
+        return 0;
+    } else if (sig > NSIG) {
+        printf("signal argument '%s' is too large to be a valid signal\n",
+               argv[1]);
         return 0;
     }
 
diff --git a/qemu-log.c b/qemu-log.c
index efd07c8..7cb01a8 100644
--- a/qemu-log.c
+++ b/qemu-log.c
@@ -112,8 +112,6 @@ const QEMULogItem qemu_log_items[] = {
       "x86 only: show protected mode far calls/returns/exceptions" },
     { CPU_LOG_RESET, "cpu_reset",
       "show CPU state before CPU resets" },
-    { CPU_LOG_IOPORT, "ioport",
-      "show all i/o ports accesses" },
     { LOG_UNIMP, "unimp",
       "log unimplemented functionality" },
     { LOG_GUEST_ERROR, "guest_errors",
diff --git a/qemu-options.hx b/qemu-options.hx
index 949db7f..0eea4ee 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -3157,12 +3157,12 @@ re-inject them.
 ETEXI
 
 DEF("icount", HAS_ARG, QEMU_OPTION_icount, \
-    "-icount [shift=N|auto][,align=on|off][,sleep=no]\n" \
+    "-icount [shift=N|auto][,align=on|off][,sleep=no,rr=record|replay,rrfile=<filename>]\n" \
     "                enable virtual instruction counter with 2^N clock ticks per\n" \
     "                instruction, enable aligning the host and virtual clocks\n" \
     "                or disable real time cpu sleeping\n", QEMU_ARCH_ALL)
 STEXI
-@item -icount [shift=@var{N}|auto]
+@item -icount [shift=@var{N}|auto][,rr=record|replay,rrfile=@var{filename}]
 @findex -icount
 Enable virtual instruction counter.  The virtual cpu will execute one
 instruction every 2^@var{N} ns of virtual time.  If @code{auto} is specified
@@ -3191,6 +3191,10 @@ Currently this option does not work when @option{shift} is @code{auto}.
 Note: The sync algorithm will work for those shift values for which
 the guest clock runs ahead of the host clock. Typically this happens
 when the shift value is high (how high depends on the host machine).
+
+When @option{rr} option is specified deterministic record/replay is enabled.
+Replay log is written into @var{filename} file in record mode and
+read from this file in replay mode.
 ETEXI
 
 DEF("watchdog", HAS_ARG, QEMU_OPTION_watchdog, \
diff --git a/qemu-seccomp.c b/qemu-seccomp.c
index 80d034a..c831fe8 100644
--- a/qemu-seccomp.c
+++ b/qemu-seccomp.c
@@ -16,6 +16,14 @@
 #include <seccomp.h>
 #include "sysemu/seccomp.h"
 
+#if SCMP_VER_MAJOR >= 3
+  #define HAVE_CACHEFLUSH
+#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR >= 3
+  #define HAVE_CACHEFLUSH
+#elif SCMP_VER_MAJOR == 2 && SCMP_VER_MINOR == 2 && SCMP_VER_MICRO >= 3
+  #define HAVE_CACHEFLUSH
+#endif
+
 struct QemuSeccompSyscall {
     int32_t num;
     uint8_t priority;
@@ -238,7 +246,10 @@ static const struct QemuSeccompSyscall seccomp_whitelist[] = {
     { SCMP_SYS(inotify_init1), 240 },
     { SCMP_SYS(inotify_add_watch), 240 },
     { SCMP_SYS(mbind), 240 },
-    { SCMP_SYS(memfd_create), 240 }
+    { SCMP_SYS(memfd_create), 240 },
+#ifdef HAVE_CACHEFLUSH
+    { SCMP_SYS(cacheflush), 240 },
+#endif
 };
 
 int seccomp_start(void)
diff --git a/qemu-timer.c b/qemu-timer.c
index 2463fe6..f16e422 100644
--- a/qemu-timer.c
+++ b/qemu-timer.c
@@ -24,6 +24,8 @@
 
 #include "qemu/main-loop.h"
 #include "qemu/timer.h"
+#include "sysemu/replay.h"
+#include "sysemu/sysemu.h"
 
 #ifdef CONFIG_POSIX
 #include <pthread.h>
@@ -477,10 +479,31 @@ bool timerlist_run_timers(QEMUTimerList *timer_list)
     void *opaque;
 
     qemu_event_reset(&timer_list->timers_done_ev);
-    if (!timer_list->clock->enabled) {
+    if (!timer_list->clock->enabled || !timer_list->active_timers) {
         goto out;
     }
 
+    switch (timer_list->clock->type) {
+    case QEMU_CLOCK_REALTIME:
+        break;
+    default:
+    case QEMU_CLOCK_VIRTUAL:
+        if (!replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL)) {
+            goto out;
+        }
+        break;
+    case QEMU_CLOCK_HOST:
+        if (!replay_checkpoint(CHECKPOINT_CLOCK_HOST)) {
+            goto out;
+        }
+        break;
+    case QEMU_CLOCK_VIRTUAL_RT:
+        if (!replay_checkpoint(CHECKPOINT_CLOCK_VIRTUAL_RT)) {
+            goto out;
+        }
+        break;
+    }
+
     current_time = qemu_clock_get_ns(timer_list->clock->type);
     for(;;) {
         qemu_mutex_lock(&timer_list->active_timers_lock);
@@ -544,11 +567,17 @@ int64_t timerlistgroup_deadline_ns(QEMUTimerListGroup *tlg)
 {
     int64_t deadline = -1;
     QEMUClockType type;
+    bool play = replay_mode == REPLAY_MODE_PLAY;
     for (type = 0; type < QEMU_CLOCK_MAX; type++) {
-        if (qemu_clock_use_for_deadline(tlg->tl[type]->clock->type)) {
-            deadline = qemu_soonest_timeout(deadline,
-                                            timerlist_deadline_ns(
-                                                tlg->tl[type]));
+        if (qemu_clock_use_for_deadline(type)) {
+            if (!play || type == QEMU_CLOCK_REALTIME) {
+                deadline = qemu_soonest_timeout(deadline,
+                                                timerlist_deadline_ns(tlg->tl[type]));
+            } else {
+                /* Read clock from the replay file and
+                   do not calculate the deadline, based on virtual clock. */
+                qemu_clock_get_ns(type);
+            }
         }
     }
     return deadline;
@@ -570,7 +599,7 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
             return cpu_get_clock();
         }
     case QEMU_CLOCK_HOST:
-        now = get_clock_realtime();
+        now = REPLAY_CLOCK(REPLAY_CLOCK_HOST, get_clock_realtime());
         last = clock->last;
         clock->last = now;
         if (now < last || now > (last + get_max_clock_jump())) {
@@ -578,7 +607,7 @@ int64_t qemu_clock_get_ns(QEMUClockType type)
         }
         return now;
     case QEMU_CLOCK_VIRTUAL_RT:
-        return cpu_get_clock();
+        return REPLAY_CLOCK(REPLAY_CLOCK_VIRTUAL_RT, cpu_get_clock());
     }
 }
 
diff --git a/qga/commands-posix.c b/qga/commands-posix.c
index 67a173a..0ebd473 100644
--- a/qga/commands-posix.c
+++ b/qga/commands-posix.c
@@ -28,6 +28,7 @@
 #include "qapi/qmp/qerror.h"
 #include "qemu/queue.h"
 #include "qemu/host-utils.h"
+#include "qemu/sockets.h"
 
 #ifndef CONFIG_HAS_ENVIRON
 #ifdef __APPLE__
@@ -385,27 +386,6 @@ safe_open_or_create(const char *path, const char *mode, Error **errp)
     return NULL;
 }
 
-static int guest_file_toggle_flags(int fd, int flags, bool set, Error **err)
-{
-    int ret, old_flags;
-
-    old_flags = fcntl(fd, F_GETFL);
-    if (old_flags == -1) {
-        error_setg_errno(err, errno, QERR_QGA_COMMAND_FAILED,
-                         "failed to fetch filehandle flags");
-        return -1;
-    }
-
-    ret = fcntl(fd, F_SETFL, set ? (old_flags | flags) : (old_flags & ~flags));
-    if (ret == -1) {
-        error_setg_errno(err, errno, QERR_QGA_COMMAND_FAILED,
-                         "failed to set filehandle flags");
-        return -1;
-    }
-
-    return ret;
-}
-
 int64_t qmp_guest_file_open(const char *path, bool has_mode, const char *mode,
                             Error **errp)
 {
@@ -426,10 +406,7 @@ int64_t qmp_guest_file_open(const char *path, bool has_mode, const char *mode,
     /* set fd non-blocking to avoid common use cases (like reading from a
      * named pipe) from hanging the agent
      */
-    if (guest_file_toggle_flags(fileno(fh), O_NONBLOCK, true, errp) < 0) {
-        fclose(fh);
-        return -1;
-    }
+    qemu_set_nonblock(fileno(fh));
 
     handle = guest_file_handle_add(fh, errp);
     if (handle < 0) {
diff --git a/qga/commands-win32.c b/qga/commands-win32.c
index d9de23b..41f6dd9 100644
--- a/qga/commands-win32.c
+++ b/qga/commands-win32.c
@@ -59,6 +59,7 @@ static struct {
     .filehandles = QTAILQ_HEAD_INITIALIZER(guest_file_state.filehandles),
 };
 
+#define FILE_GENERIC_APPEND (FILE_GENERIC_WRITE & ~FILE_WRITE_DATA)
 
 typedef struct OpenFlags {
     const char *forms;
@@ -66,20 +67,20 @@ typedef struct OpenFlags {
     DWORD creation_disposition;
 } OpenFlags;
 static OpenFlags guest_file_open_modes[] = {
-    {"r",   GENERIC_READ,               OPEN_EXISTING},
-    {"rb",  GENERIC_READ,               OPEN_EXISTING},
-    {"w",   GENERIC_WRITE,              CREATE_ALWAYS},
-    {"wb",  GENERIC_WRITE,              CREATE_ALWAYS},
-    {"a",   GENERIC_WRITE,              OPEN_ALWAYS  },
-    {"r+",  GENERIC_WRITE|GENERIC_READ, OPEN_EXISTING},
-    {"rb+", GENERIC_WRITE|GENERIC_READ, OPEN_EXISTING},
-    {"r+b", GENERIC_WRITE|GENERIC_READ, OPEN_EXISTING},
-    {"w+",  GENERIC_WRITE|GENERIC_READ, CREATE_ALWAYS},
-    {"wb+", GENERIC_WRITE|GENERIC_READ, CREATE_ALWAYS},
-    {"w+b", GENERIC_WRITE|GENERIC_READ, CREATE_ALWAYS},
-    {"a+",  GENERIC_WRITE|GENERIC_READ, OPEN_ALWAYS  },
-    {"ab+", GENERIC_WRITE|GENERIC_READ, OPEN_ALWAYS  },
-    {"a+b", GENERIC_WRITE|GENERIC_READ, OPEN_ALWAYS  }
+    {"r",   GENERIC_READ,                     OPEN_EXISTING},
+    {"rb",  GENERIC_READ,                     OPEN_EXISTING},
+    {"w",   GENERIC_WRITE,                    CREATE_ALWAYS},
+    {"wb",  GENERIC_WRITE,                    CREATE_ALWAYS},
+    {"a",   FILE_GENERIC_APPEND,              OPEN_ALWAYS  },
+    {"r+",  GENERIC_WRITE|GENERIC_READ,       OPEN_EXISTING},
+    {"rb+", GENERIC_WRITE|GENERIC_READ,       OPEN_EXISTING},
+    {"r+b", GENERIC_WRITE|GENERIC_READ,       OPEN_EXISTING},
+    {"w+",  GENERIC_WRITE|GENERIC_READ,       CREATE_ALWAYS},
+    {"wb+", GENERIC_WRITE|GENERIC_READ,       CREATE_ALWAYS},
+    {"w+b", GENERIC_WRITE|GENERIC_READ,       CREATE_ALWAYS},
+    {"a+",  FILE_GENERIC_APPEND|GENERIC_READ, OPEN_ALWAYS  },
+    {"ab+", FILE_GENERIC_APPEND|GENERIC_READ, OPEN_ALWAYS  },
+    {"a+b", FILE_GENERIC_APPEND|GENERIC_READ, OPEN_ALWAYS  }
 };
 
 static OpenFlags *find_open_flag(const char *mode_str)
@@ -128,6 +129,28 @@ static GuestFileHandle *guest_file_handle_find(int64_t id, Error **errp)
     return NULL;
 }
 
+static void handle_set_nonblocking(HANDLE fh)
+{
+    DWORD file_type, pipe_state;
+    file_type = GetFileType(fh);
+    if (file_type != FILE_TYPE_PIPE) {
+        return;
+    }
+    /* If file_type == FILE_TYPE_PIPE, according to MSDN
+     * the specified file is socket or named pipe */
+    if (!GetNamedPipeHandleState(fh, &pipe_state, NULL,
+                                 NULL, NULL, NULL, 0)) {
+        return;
+    }
+    /* The fd is named pipe fd */
+    if (pipe_state & PIPE_NOWAIT) {
+        return;
+    }
+
+    pipe_state |= PIPE_NOWAIT;
+    SetNamedPipeHandleState(fh, &pipe_state, NULL, NULL);
+}
+
 int64_t qmp_guest_file_open(const char *path, bool has_mode,
                             const char *mode, Error **errp)
 {
@@ -158,9 +181,14 @@ int64_t qmp_guest_file_open(const char *path, bool has_mode,
         return -1;
     }
 
+    /* set fd non-blocking to avoid common use cases (like reading from a
+     * named pipe) from hanging the agent
+     */
+    handle_set_nonblocking(fh);
+
     fd = guest_file_handle_add(fh, errp);
     if (fd < 0) {
-        CloseHandle(&fh);
+        CloseHandle(fh);
         error_setg(errp, "failed to add handle to qmp handle table");
         return -1;
     }
diff --git a/qmp-commands.hx b/qmp-commands.hx
index d7cf0ff..9d8b42f 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -718,6 +718,25 @@ Example:
 
 EQMP
     {
+        .name       = "migrate-start-postcopy",
+        .args_type  = "",
+        .mhandler.cmd_new = qmp_marshal_migrate_start_postcopy,
+    },
+
+SQMP
+migrate-start-postcopy
+----------------------
+
+Switch an in-progress migration to postcopy mode. Ignored after the end of
+migration (or once already in postcopy).
+
+Example:
+-> { "execute": "migrate-start-postcopy" }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "query-migrate-cache-size",
         .args_type  = "",
         .mhandler.cmd_new = qmp_marshal_query_migrate_cache_size,
@@ -1262,7 +1281,7 @@ EQMP
     },
     {
         .name       = "transaction",
-        .args_type  = "actions:q",
+        .args_type  = "actions:q,properties:q?",
         .mhandler.cmd_new = qmp_marshal_transaction,
     },
 
@@ -1476,6 +1495,44 @@ Example:
 EQMP
 
     {
+        .name       = "blockdev-snapshot",
+        .args_type  = "node:s,overlay:s",
+        .mhandler.cmd_new = qmp_marshal_blockdev_snapshot,
+    },
+
+SQMP
+blockdev-snapshot
+-----------------
+Since 2.5
+
+Create a snapshot, by installing 'node' as the backing image of
+'overlay'. Additionally, if 'node' is associated with a block
+device, the block device changes to using 'overlay' as its new active
+image.
+
+Arguments:
+
+- "node": device that will have a snapshot created (json-string)
+- "overlay": device that will have 'node' as its backing image (json-string)
+
+Example:
+
+-> { "execute": "blockdev-add",
+                "arguments": { "options": { "driver": "qcow2",
+                                            "node-name": "node1534",
+                                            "file": { "driver": "file",
+                                                      "filename": "hd1.qcow2" },
+                                            "backing": "" } } }
+
+<- { "return": {} }
+
+-> { "execute": "blockdev-snapshot", "arguments": { "node": "ide-hd0",
+                                                    "overlay": "node1534" } }
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "blockdev-snapshot-internal-sync",
         .args_type  = "device:B,name:s",
         .mhandler.cmd_new = qmp_marshal_blockdev_snapshot_internal_sync,
@@ -2526,6 +2583,64 @@ Each json-object contain the following:
                    another request (json-int)
     - "wr_merged": number of write requests that have been merged into
                    another request (json-int)
+    - "idle_time_ns": time since the last I/O operation, in
+                      nanoseconds. If the field is absent it means
+                      that there haven't been any operations yet
+                      (json-int, optional)
+    - "failed_rd_operations": number of failed read operations
+                              (json-int)
+    - "failed_wr_operations": number of failed write operations
+                              (json-int)
+    - "failed_flush_operations": number of failed flush operations
+                               (json-int)
+    - "invalid_rd_operations": number of invalid read operations
+                               (json-int)
+    - "invalid_wr_operations": number of invalid write operations
+                               (json-int)
+    - "invalid_flush_operations": number of invalid flush operations
+                                  (json-int)
+    - "account_invalid": whether invalid operations are included in
+                         the last access statistics (json-bool)
+    - "account_failed": whether failed operations are included in the
+                         latency and last access statistics
+                         (json-bool)
+    - "timed_stats": A json-array containing statistics collected in
+                     specific intervals, with the following members:
+        - "interval_length": interval used for calculating the
+                             statistics, in seconds (json-int)
+        - "min_rd_latency_ns": minimum latency of read operations in
+                               the defined interval, in nanoseconds
+                               (json-int)
+        - "min_wr_latency_ns": minimum latency of write operations in
+                               the defined interval, in nanoseconds
+                               (json-int)
+        - "min_flush_latency_ns": minimum latency of flush operations
+                                  in the defined interval, in
+                                  nanoseconds (json-int)
+        - "max_rd_latency_ns": maximum latency of read operations in
+                               the defined interval, in nanoseconds
+                               (json-int)
+        - "max_wr_latency_ns": maximum latency of write operations in
+                               the defined interval, in nanoseconds
+                               (json-int)
+        - "max_flush_latency_ns": maximum latency of flush operations
+                                  in the defined interval, in
+                                  nanoseconds (json-int)
+        - "avg_rd_latency_ns": average latency of read operations in
+                               the defined interval, in nanoseconds
+                               (json-int)
+        - "avg_wr_latency_ns": average latency of write operations in
+                               the defined interval, in nanoseconds
+                               (json-int)
+        - "avg_flush_latency_ns": average latency of flush operations
+                                  in the defined interval, in
+                                  nanoseconds (json-int)
+        - "avg_rd_queue_depth": average number of pending read
+                                operations in the defined interval
+                                (json-number)
+        - "avg_wr_queue_depth": average number of pending write
+                                operations in the defined interval
+                                (json-number).
 - "parent": Contains recursively the statistics of the underlying
             protocol (e.g. the host file for a qcow2 image). If there is
             no underlying protocol, this field is omitted
@@ -2550,7 +2665,10 @@ Example:
                   "flush_total_times_ns":49653
                   "flush_operations":61,
                   "rd_merged":0,
-                  "wr_merged":0
+                  "wr_merged":0,
+                  "idle_time_ns":2953431879,
+                  "account_invalid":true,
+                  "account_failed":false
                }
             },
             "stats":{
@@ -2564,7 +2682,10 @@ Example:
                "rd_total_times_ns":3465673657
                "flush_total_times_ns":49653,
                "rd_merged":0,
-               "wr_merged":0
+               "wr_merged":0,
+               "idle_time_ns":2953431879,
+               "account_invalid":true,
+               "account_failed":false
             }
          },
          {
@@ -2580,7 +2701,9 @@ Example:
                "rd_total_times_ns":0
                "flush_total_times_ns":0,
                "rd_merged":0,
-               "wr_merged":0
+               "wr_merged":0,
+               "account_invalid":false,
+               "account_failed":false
             }
          },
          {
@@ -2596,7 +2719,9 @@ Example:
                "rd_total_times_ns":0
                "flush_total_times_ns":0,
                "rd_merged":0,
-               "wr_merged":0
+               "wr_merged":0,
+               "account_invalid":false,
+               "account_failed":false
             }
          },
          {
@@ -2612,7 +2737,9 @@ Example:
                "rd_total_times_ns":0
                "flush_total_times_ns":0,
                "rd_merged":0,
-               "wr_merged":0
+               "wr_merged":0,
+               "account_invalid":false,
+               "account_failed":false
             }
          }
       ]
@@ -3889,8 +4016,8 @@ blockdev-add
 Add a block device.
 
 This command is still a work in progress.  It doesn't support all
-block drivers, it lacks a matching blockdev-del, and more.  Stay away
-from it unless you want to help with its development.
+block drivers among other things.  Stay away from it unless you want
+to help with its development.
 
 Arguments:
 
@@ -3936,6 +4063,228 @@ Example (2):
 EQMP
 
     {
+        .name       = "x-blockdev-del",
+        .args_type  = "id:s?,node-name:s?",
+        .mhandler.cmd_new = qmp_marshal_x_blockdev_del,
+    },
+
+SQMP
+x-blockdev-del
+------------
+Since 2.5
+
+Deletes a block device thas has been added using blockdev-add.
+The selected device can be either a block backend or a graph node.
+
+In the former case the backend will be destroyed, along with its
+inserted medium if there's any. The command will fail if the backend
+or its medium are in use.
+
+In the latter case the node will be destroyed. The command will fail
+if the node is attached to a block backend or is otherwise being
+used.
+
+One of "id" or "node-name" must be specified, but not both.
+
+This command is still a work in progress and is considered
+experimental. Stay away from it unless you want to help with its
+development.
+
+Arguments:
+
+- "id": Name of the block backend device to delete (json-string, optional)
+- "node-name": Name of the graph node to delete (json-string, optional)
+
+Example:
+
+-> { "execute": "blockdev-add",
+     "arguments": {
+         "options": {
+             "driver": "qcow2",
+             "id": "drive0",
+             "file": {
+                 "driver": "file",
+                 "filename": "test.qcow2"
+             }
+         }
+     }
+   }
+
+<- { "return": {} }
+
+-> { "execute": "x-blockdev-del",
+     "arguments": { "id": "drive0" }
+   }
+<- { "return": {} }
+
+EQMP
+
+    {
+        .name       = "blockdev-open-tray",
+        .args_type  = "device:s,force:b?",
+        .mhandler.cmd_new = qmp_marshal_blockdev_open_tray,
+    },
+
+SQMP
+blockdev-open-tray
+------------------
+
+Opens a block device's tray. If there is a block driver state tree inserted as a
+medium, it will become inaccessible to the guest (but it will remain associated
+to the block device, so closing the tray will make it accessible again).
+
+If the tray was already open before, this will be a no-op.
+
+Once the tray opens, a DEVICE_TRAY_MOVED event is emitted. There are cases in
+which no such event will be generated, these include:
+- if the guest has locked the tray, @force is false and the guest does not
+  respond to the eject request
+- if the BlockBackend denoted by @device does not have a guest device attached
+  to it
+- if the guest device does not have an actual tray and is empty, for instance
+  for floppy disk drives
+
+Arguments:
+
+- "device": block device name (json-string)
+- "force": if false (the default), an eject request will be sent to the guest if
+           it has locked the tray (and the tray will not be opened immediately);
+           if true, the tray will be opened regardless of whether it is locked
+           (json-bool, optional)
+
+Example:
+
+-> { "execute": "blockdev-open-tray",
+     "arguments": { "device": "ide1-cd0" } }
+
+<- { "timestamp": { "seconds": 1418751016,
+                    "microseconds": 716996 },
+     "event": "DEVICE_TRAY_MOVED",
+     "data": { "device": "ide1-cd0",
+               "tray-open": true } }
+
+<- { "return": {} }
+
+EQMP
+
+    {
+        .name       = "blockdev-close-tray",
+        .args_type  = "device:s",
+        .mhandler.cmd_new = qmp_marshal_blockdev_close_tray,
+    },
+
+SQMP
+blockdev-close-tray
+-------------------
+
+Closes a block device's tray. If there is a block driver state tree associated
+with the block device (which is currently ejected), that tree will be loaded as
+the medium.
+
+If the tray was already closed before, this will be a no-op.
+
+Arguments:
+
+- "device": block device name (json-string)
+
+Example:
+
+-> { "execute": "blockdev-close-tray",
+     "arguments": { "device": "ide1-cd0" } }
+
+<- { "timestamp": { "seconds": 1418751345,
+                    "microseconds": 272147 },
+     "event": "DEVICE_TRAY_MOVED",
+     "data": { "device": "ide1-cd0",
+               "tray-open": false } }
+
+<- { "return": {} }
+
+EQMP
+
+    {
+        .name       = "blockdev-remove-medium",
+        .args_type  = "device:s",
+        .mhandler.cmd_new = qmp_marshal_blockdev_remove_medium,
+    },
+
+SQMP
+blockdev-remove-medium
+----------------------
+
+Removes a medium (a block driver state tree) from a block device. That block
+device's tray must currently be open (unless there is no attached guest device).
+
+If the tray is open and there is no medium inserted, this will be a no-op.
+
+Arguments:
+
+- "device": block device name (json-string)
+
+Example:
+
+-> { "execute": "blockdev-remove-medium",
+     "arguments": { "device": "ide1-cd0" } }
+
+<- { "error": { "class": "GenericError",
+                "desc": "Tray of device 'ide1-cd0' is not open" } }
+
+-> { "execute": "blockdev-open-tray",
+     "arguments": { "device": "ide1-cd0" } }
+
+<- { "timestamp": { "seconds": 1418751627,
+                    "microseconds": 549958 },
+     "event": "DEVICE_TRAY_MOVED",
+     "data": { "device": "ide1-cd0",
+               "tray-open": true } }
+
+<- { "return": {} }
+
+-> { "execute": "blockdev-remove-medium",
+     "arguments": { "device": "ide1-cd0" } }
+
+<- { "return": {} }
+
+EQMP
+
+    {
+        .name       = "blockdev-insert-medium",
+        .args_type  = "device:s,node-name:s",
+        .mhandler.cmd_new = qmp_marshal_blockdev_insert_medium,
+    },
+
+SQMP
+blockdev-insert-medium
+----------------------
+
+Inserts a medium (a block driver state tree) into a block device. That block
+device's tray must currently be open (unless there is no attached guest device)
+and there must be no medium inserted already.
+
+Arguments:
+
+- "device": block device name (json-string)
+- "node-name": root node of the BDS tree to insert into the block device
+
+Example:
+
+-> { "execute": "blockdev-add",
+     "arguments": { "options": { "node-name": "node0",
+                                 "driver": "raw",
+                                 "file": { "driver": "file",
+                                           "filename": "fedora.iso" } } } }
+
+<- { "return": {} }
+
+-> { "execute": "blockdev-insert-medium",
+     "arguments": { "device": "ide1-cd0",
+                    "node-name": "node0" } }
+
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "query-named-block-nodes",
         .args_type  = "",
         .mhandler.cmd_new = qmp_marshal_query_named_block_nodes,
@@ -3998,6 +4347,59 @@ Example:
 EQMP
 
     {
+        .name       = "blockdev-change-medium",
+        .args_type  = "device:B,filename:F,format:s?,read-only-mode:s?",
+        .mhandler.cmd_new = qmp_marshal_blockdev_change_medium,
+    },
+
+SQMP
+blockdev-change-medium
+----------------------
+
+Changes the medium inserted into a block device by ejecting the current medium
+and loading a new image file which is inserted as the new medium.
+
+Arguments:
+
+- "device": device name (json-string)
+- "filename": filename of the new image (json-string)
+- "format": format of the new image (json-string, optional)
+- "read-only-mode": new read-only mode (json-string, optional)
+          - Possible values: "retain" (default), "read-only", "read-write"
+
+Examples:
+
+1. Change a removable medium
+
+-> { "execute": "blockdev-change-medium",
+             "arguments": { "device": "ide1-cd0",
+                            "filename": "/srv/images/Fedora-12-x86_64-DVD.iso",
+                            "format": "raw" } }
+<- { "return": {} }
+
+2. Load a read-only medium into a writable drive
+
+-> { "execute": "blockdev-change-medium",
+             "arguments": { "device": "isa-fd0",
+                            "filename": "/srv/images/ro.img",
+                            "format": "raw",
+                            "read-only-mode": "retain" } }
+
+<- { "error":
+     { "class": "GenericError",
+       "desc": "Could not open '/srv/images/ro.img': Permission denied" } }
+
+-> { "execute": "blockdev-change-medium",
+             "arguments": { "device": "isa-fd0",
+                            "filename": "/srv/images/ro.img",
+                            "format": "raw",
+                            "read-only-mode": "read-only" } }
+
+<- { "return": {} }
+
+EQMP
+
+    {
         .name       = "query-memdev",
         .args_type  = "",
         .mhandler.cmd_new = qmp_marshal_query_memdev,
diff --git a/qmp.c b/qmp.c
index ff54e5a..ddc63ea 100644
--- a/qmp.c
+++ b/qmp.c
@@ -414,7 +414,8 @@ void qmp_change(const char *device, const char *target,
     if (strcmp(device, "vnc") == 0) {
         qmp_change_vnc(target, has_arg, arg, errp);
     } else {
-        qmp_change_blockdev(device, target, arg, errp);
+        qmp_blockdev_change_medium(device, target, has_arg, arg, false, 0,
+                                   errp);
     }
 }
 
diff --git a/qom/object.c b/qom/object.c
index 11cd86b..c0decb6 100644
--- a/qom/object.c
+++ b/qom/object.c
@@ -204,7 +204,7 @@ static bool type_is_ancestor(TypeImpl *type, TypeImpl *target_type)
 {
     assert(target_type);
 
-    /* Check if typename is a direct ancestor of type */
+    /* Check if target_type is a direct ancestor of type */
     while (type) {
         if (type == target_type) {
             return true;
@@ -1330,8 +1330,8 @@ static Object *object_resolve_link(Object *obj, const char *name,
     target = object_resolve_path_type(path, target_type, &ambiguous);
 
     if (ambiguous) {
-        error_set(errp, ERROR_CLASS_GENERIC_ERROR,
-                  "Path '%s' does not uniquely identify an object", path);
+        error_setg(errp, "Path '%s' does not uniquely identify an object",
+                   path);
     } else if (!target) {
         target = object_resolve_path(path, &ambiguous);
         if (target || ambiguous) {
diff --git a/qtest.c b/qtest.c
index 8e10340..05cefd2 100644
--- a/qtest.c
+++ b/qtest.c
@@ -657,7 +657,6 @@ void qtest_init(const char *qtest_chrdev, const char *qtest_log, Error **errp)
 
     inbuf = g_string_new("");
     qtest_chr = chr;
-    page_size_init();
 }
 
 bool qtest_driver(void)
diff --git a/replay/Makefile.objs b/replay/Makefile.objs
new file mode 100644
index 0000000..232193a
--- /dev/null
+++ b/replay/Makefile.objs
@@ -0,0 +1,5 @@
+common-obj-y += replay.o
+common-obj-y += replay-internal.o
+common-obj-y += replay-events.o
+common-obj-y += replay-time.o
+common-obj-y += replay-input.o
diff --git a/replay/replay-events.c b/replay/replay-events.c
new file mode 100644
index 0000000..402f644
--- /dev/null
+++ b/replay/replay-events.c
@@ -0,0 +1,279 @@
+/*
+ * replay-events.c
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "qemu/error-report.h"
+#include "sysemu/replay.h"
+#include "replay-internal.h"
+#include "block/aio.h"
+#include "ui/input.h"
+
+typedef struct Event {
+    ReplayAsyncEventKind event_kind;
+    void *opaque;
+    void *opaque2;
+    uint64_t id;
+
+    QTAILQ_ENTRY(Event) events;
+} Event;
+
+static QTAILQ_HEAD(, Event) events_list = QTAILQ_HEAD_INITIALIZER(events_list);
+static unsigned int read_event_kind = -1;
+static uint64_t read_id = -1;
+static int read_checkpoint = -1;
+
+static bool events_enabled;
+
+/* Functions */
+
+static void replay_run_event(Event *event)
+{
+    switch (event->event_kind) {
+    case REPLAY_ASYNC_EVENT_BH:
+        aio_bh_call(event->opaque);
+        break;
+    case REPLAY_ASYNC_EVENT_INPUT:
+        qemu_input_event_send_impl(NULL, (InputEvent *)event->opaque);
+        qapi_free_InputEvent((InputEvent *)event->opaque);
+        break;
+    case REPLAY_ASYNC_EVENT_INPUT_SYNC:
+        qemu_input_event_sync_impl();
+        break;
+    default:
+        error_report("Replay: invalid async event ID (%d) in the queue",
+                    event->event_kind);
+        exit(1);
+        break;
+    }
+}
+
+void replay_enable_events(void)
+{
+    events_enabled = true;
+}
+
+bool replay_has_events(void)
+{
+    return !QTAILQ_EMPTY(&events_list);
+}
+
+void replay_flush_events(void)
+{
+    replay_mutex_lock();
+    while (!QTAILQ_EMPTY(&events_list)) {
+        Event *event = QTAILQ_FIRST(&events_list);
+        replay_mutex_unlock();
+        replay_run_event(event);
+        replay_mutex_lock();
+        QTAILQ_REMOVE(&events_list, event, events);
+        g_free(event);
+    }
+    replay_mutex_unlock();
+}
+
+void replay_disable_events(void)
+{
+    if (replay_mode != REPLAY_MODE_NONE) {
+        events_enabled = false;
+        /* Flush events queue before waiting of completion */
+        replay_flush_events();
+    }
+}
+
+void replay_clear_events(void)
+{
+    replay_mutex_lock();
+    while (!QTAILQ_EMPTY(&events_list)) {
+        Event *event = QTAILQ_FIRST(&events_list);
+        QTAILQ_REMOVE(&events_list, event, events);
+
+        g_free(event);
+    }
+    replay_mutex_unlock();
+}
+
+/*! Adds specified async event to the queue */
+static void replay_add_event(ReplayAsyncEventKind event_kind,
+                             void *opaque,
+                             void *opaque2, uint64_t id)
+{
+    assert(event_kind < REPLAY_ASYNC_COUNT);
+
+    if (!replay_file || replay_mode == REPLAY_MODE_NONE
+        || !events_enabled) {
+        Event e;
+        e.event_kind = event_kind;
+        e.opaque = opaque;
+        e.opaque2 = opaque2;
+        e.id = id;
+        replay_run_event(&e);
+        return;
+    }
+
+    Event *event = g_malloc0(sizeof(Event));
+    event->event_kind = event_kind;
+    event->opaque = opaque;
+    event->opaque2 = opaque2;
+    event->id = id;
+
+    replay_mutex_lock();
+    QTAILQ_INSERT_TAIL(&events_list, event, events);
+    replay_mutex_unlock();
+}
+
+void replay_bh_schedule_event(QEMUBH *bh)
+{
+    if (replay_mode != REPLAY_MODE_NONE) {
+        uint64_t id = replay_get_current_step();
+        replay_add_event(REPLAY_ASYNC_EVENT_BH, bh, NULL, id);
+    } else {
+        qemu_bh_schedule(bh);
+    }
+}
+
+void replay_add_input_event(struct InputEvent *event)
+{
+    replay_add_event(REPLAY_ASYNC_EVENT_INPUT, event, NULL, 0);
+}
+
+void replay_add_input_sync_event(void)
+{
+    replay_add_event(REPLAY_ASYNC_EVENT_INPUT_SYNC, NULL, NULL, 0);
+}
+
+static void replay_save_event(Event *event, int checkpoint)
+{
+    if (replay_mode != REPLAY_MODE_PLAY) {
+        /* put the event into the file */
+        replay_put_event(EVENT_ASYNC);
+        replay_put_byte(checkpoint);
+        replay_put_byte(event->event_kind);
+
+        /* save event-specific data */
+        switch (event->event_kind) {
+        case REPLAY_ASYNC_EVENT_BH:
+            replay_put_qword(event->id);
+            break;
+        case REPLAY_ASYNC_EVENT_INPUT:
+            replay_save_input_event(event->opaque);
+            break;
+        case REPLAY_ASYNC_EVENT_INPUT_SYNC:
+            break;
+        default:
+            error_report("Unknown ID %d of replay event", read_event_kind);
+            exit(1);
+        }
+    }
+}
+
+/* Called with replay mutex locked */
+void replay_save_events(int checkpoint)
+{
+    while (!QTAILQ_EMPTY(&events_list)) {
+        Event *event = QTAILQ_FIRST(&events_list);
+        replay_save_event(event, checkpoint);
+
+        replay_mutex_unlock();
+        replay_run_event(event);
+        replay_mutex_lock();
+        QTAILQ_REMOVE(&events_list, event, events);
+        g_free(event);
+    }
+}
+
+static Event *replay_read_event(int checkpoint)
+{
+    Event *event;
+    if (read_event_kind == -1) {
+        read_checkpoint = replay_get_byte();
+        read_event_kind = replay_get_byte();
+        read_id = -1;
+        replay_check_error();
+    }
+
+    if (checkpoint != read_checkpoint) {
+        return NULL;
+    }
+
+    /* Events that has not to be in the queue */
+    switch (read_event_kind) {
+    case REPLAY_ASYNC_EVENT_BH:
+        if (read_id == -1) {
+            read_id = replay_get_qword();
+        }
+        break;
+    case REPLAY_ASYNC_EVENT_INPUT:
+        event = g_malloc0(sizeof(Event));
+        event->event_kind = read_event_kind;
+        event->opaque = replay_read_input_event();
+        return event;
+    case REPLAY_ASYNC_EVENT_INPUT_SYNC:
+        event = g_malloc0(sizeof(Event));
+        event->event_kind = read_event_kind;
+        event->opaque = 0;
+        return event;
+    default:
+        error_report("Unknown ID %d of replay event", read_event_kind);
+        exit(1);
+        break;
+    }
+
+    QTAILQ_FOREACH(event, &events_list, events) {
+        if (event->event_kind == read_event_kind
+            && (read_id == -1 || read_id == event->id)) {
+            break;
+        }
+    }
+
+    if (event) {
+        QTAILQ_REMOVE(&events_list, event, events);
+    } else {
+        return NULL;
+    }
+
+    /* Read event-specific data */
+
+    return event;
+}
+
+/* Called with replay mutex locked */
+void replay_read_events(int checkpoint)
+{
+    while (replay_data_kind == EVENT_ASYNC) {
+        Event *event = replay_read_event(checkpoint);
+        if (!event) {
+            break;
+        }
+        replay_mutex_unlock();
+        replay_run_event(event);
+        replay_mutex_lock();
+
+        g_free(event);
+        replay_finish_event();
+        read_event_kind = -1;
+    }
+}
+
+void replay_init_events(void)
+{
+    read_event_kind = -1;
+}
+
+void replay_finish_events(void)
+{
+    events_enabled = false;
+    replay_clear_events();
+}
+
+bool replay_events_enabled(void)
+{
+    return events_enabled;
+}
diff --git a/replay/replay-input.c b/replay/replay-input.c
new file mode 100644
index 0000000..9879895
--- /dev/null
+++ b/replay/replay-input.c
@@ -0,0 +1,160 @@
+/*
+ * replay-input.c
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "sysemu/replay.h"
+#include "replay-internal.h"
+#include "qemu/notify.h"
+#include "ui/input.h"
+#include "qapi/qmp-output-visitor.h"
+#include "qapi/qmp-input-visitor.h"
+#include "qapi-visit.h"
+
+static InputEvent *qapi_clone_InputEvent(InputEvent *src)
+{
+    QmpOutputVisitor *qov;
+    QmpInputVisitor *qiv;
+    Visitor *ov, *iv;
+    QObject *obj;
+    InputEvent *dst = NULL;
+
+    qov = qmp_output_visitor_new();
+    ov = qmp_output_get_visitor(qov);
+    visit_type_InputEvent(ov, &src, NULL, &error_abort);
+    obj = qmp_output_get_qobject(qov);
+    qmp_output_visitor_cleanup(qov);
+    if (!obj) {
+        return NULL;
+    }
+
+    qiv = qmp_input_visitor_new(obj);
+    iv = qmp_input_get_visitor(qiv);
+    visit_type_InputEvent(iv, &dst, NULL, &error_abort);
+    qmp_input_visitor_cleanup(qiv);
+    qobject_decref(obj);
+
+    return dst;
+}
+
+void replay_save_input_event(InputEvent *evt)
+{
+    replay_put_dword(evt->type);
+
+    switch (evt->type) {
+    case INPUT_EVENT_KIND_KEY:
+        replay_put_dword(evt->u.key->key->type);
+
+        switch (evt->u.key->key->type) {
+        case KEY_VALUE_KIND_NUMBER:
+            replay_put_qword(evt->u.key->key->u.number);
+            replay_put_byte(evt->u.key->down);
+            break;
+        case KEY_VALUE_KIND_QCODE:
+            replay_put_dword(evt->u.key->key->u.qcode);
+            replay_put_byte(evt->u.key->down);
+            break;
+        case KEY_VALUE_KIND_MAX:
+            /* keep gcc happy */
+            break;
+        }
+        break;
+    case INPUT_EVENT_KIND_BTN:
+        replay_put_dword(evt->u.btn->button);
+        replay_put_byte(evt->u.btn->down);
+        break;
+    case INPUT_EVENT_KIND_REL:
+        replay_put_dword(evt->u.rel->axis);
+        replay_put_qword(evt->u.rel->value);
+        break;
+    case INPUT_EVENT_KIND_ABS:
+        replay_put_dword(evt->u.abs->axis);
+        replay_put_qword(evt->u.abs->value);
+        break;
+    case INPUT_EVENT_KIND_MAX:
+        /* keep gcc happy */
+        break;
+    }
+}
+
+InputEvent *replay_read_input_event(void)
+{
+    InputEvent evt;
+    KeyValue keyValue;
+    InputKeyEvent key;
+    key.key = &keyValue;
+    InputBtnEvent btn;
+    InputMoveEvent rel;
+    InputMoveEvent abs;
+
+    evt.type = replay_get_dword();
+    switch (evt.type) {
+    case INPUT_EVENT_KIND_KEY:
+        evt.u.key = &key;
+        evt.u.key->key->type = replay_get_dword();
+
+        switch (evt.u.key->key->type) {
+        case KEY_VALUE_KIND_NUMBER:
+            evt.u.key->key->u.number = replay_get_qword();
+            evt.u.key->down = replay_get_byte();
+            break;
+        case KEY_VALUE_KIND_QCODE:
+            evt.u.key->key->u.qcode = (QKeyCode)replay_get_dword();
+            evt.u.key->down = replay_get_byte();
+            break;
+        case KEY_VALUE_KIND_MAX:
+            /* keep gcc happy */
+            break;
+        }
+        break;
+    case INPUT_EVENT_KIND_BTN:
+        evt.u.btn = &btn;
+        evt.u.btn->button = (InputButton)replay_get_dword();
+        evt.u.btn->down = replay_get_byte();
+        break;
+    case INPUT_EVENT_KIND_REL:
+        evt.u.rel = &rel;
+        evt.u.rel->axis = (InputAxis)replay_get_dword();
+        evt.u.rel->value = replay_get_qword();
+        break;
+    case INPUT_EVENT_KIND_ABS:
+        evt.u.abs = &abs;
+        evt.u.abs->axis = (InputAxis)replay_get_dword();
+        evt.u.abs->value = replay_get_qword();
+        break;
+    case INPUT_EVENT_KIND_MAX:
+        /* keep gcc happy */
+        break;
+    }
+
+    return qapi_clone_InputEvent(&evt);
+}
+
+void replay_input_event(QemuConsole *src, InputEvent *evt)
+{
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        /* Nothing */
+    } else if (replay_mode == REPLAY_MODE_RECORD) {
+        replay_add_input_event(qapi_clone_InputEvent(evt));
+    } else {
+        qemu_input_event_send_impl(src, evt);
+    }
+}
+
+void replay_input_sync_event(void)
+{
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        /* Nothing */
+    } else if (replay_mode == REPLAY_MODE_RECORD) {
+        replay_add_input_sync_event();
+    } else {
+        qemu_input_event_sync_impl();
+    }
+}
diff --git a/replay/replay-internal.c b/replay/replay-internal.c
new file mode 100644
index 0000000..35cff44
--- /dev/null
+++ b/replay/replay-internal.c
@@ -0,0 +1,206 @@
+/*
+ * replay-internal.c
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "sysemu/replay.h"
+#include "replay-internal.h"
+#include "qemu/error-report.h"
+#include "sysemu/sysemu.h"
+
+unsigned int replay_data_kind = -1;
+static unsigned int replay_has_unread_data;
+
+/* Mutex to protect reading and writing events to the log.
+   replay_data_kind and replay_has_unread_data are also protected
+   by this mutex.
+   It also protects replay events queue which stores events to be
+   written or read to the log. */
+static QemuMutex lock;
+
+/* File for replay writing */
+FILE *replay_file;
+
+void replay_put_byte(uint8_t byte)
+{
+    if (replay_file) {
+        putc(byte, replay_file);
+    }
+}
+
+void replay_put_event(uint8_t event)
+{
+    assert(event < EVENT_COUNT);
+    replay_put_byte(event);
+}
+
+
+void replay_put_word(uint16_t word)
+{
+    replay_put_byte(word >> 8);
+    replay_put_byte(word);
+}
+
+void replay_put_dword(uint32_t dword)
+{
+    replay_put_word(dword >> 16);
+    replay_put_word(dword);
+}
+
+void replay_put_qword(int64_t qword)
+{
+    replay_put_dword(qword >> 32);
+    replay_put_dword(qword);
+}
+
+void replay_put_array(const uint8_t *buf, size_t size)
+{
+    if (replay_file) {
+        replay_put_dword(size);
+        fwrite(buf, 1, size, replay_file);
+    }
+}
+
+uint8_t replay_get_byte(void)
+{
+    uint8_t byte = 0;
+    if (replay_file) {
+        byte = getc(replay_file);
+    }
+    return byte;
+}
+
+uint16_t replay_get_word(void)
+{
+    uint16_t word = 0;
+    if (replay_file) {
+        word = replay_get_byte();
+        word = (word << 8) + replay_get_byte();
+    }
+
+    return word;
+}
+
+uint32_t replay_get_dword(void)
+{
+    uint32_t dword = 0;
+    if (replay_file) {
+        dword = replay_get_word();
+        dword = (dword << 16) + replay_get_word();
+    }
+
+    return dword;
+}
+
+int64_t replay_get_qword(void)
+{
+    int64_t qword = 0;
+    if (replay_file) {
+        qword = replay_get_dword();
+        qword = (qword << 32) + replay_get_dword();
+    }
+
+    return qword;
+}
+
+void replay_get_array(uint8_t *buf, size_t *size)
+{
+    if (replay_file) {
+        *size = replay_get_dword();
+        if (fread(buf, 1, *size, replay_file) != *size) {
+            error_report("replay read error");
+        }
+    }
+}
+
+void replay_get_array_alloc(uint8_t **buf, size_t *size)
+{
+    if (replay_file) {
+        *size = replay_get_dword();
+        *buf = g_malloc(*size);
+        if (fread(*buf, 1, *size, replay_file) != *size) {
+            error_report("replay read error");
+        }
+    }
+}
+
+void replay_check_error(void)
+{
+    if (replay_file) {
+        if (feof(replay_file)) {
+            error_report("replay file is over");
+            qemu_system_vmstop_request_prepare();
+            qemu_system_vmstop_request(RUN_STATE_PAUSED);
+        } else if (ferror(replay_file)) {
+            error_report("replay file is over or something goes wrong");
+            qemu_system_vmstop_request_prepare();
+            qemu_system_vmstop_request(RUN_STATE_INTERNAL_ERROR);
+        }
+    }
+}
+
+void replay_fetch_data_kind(void)
+{
+    if (replay_file) {
+        if (!replay_has_unread_data) {
+            replay_data_kind = replay_get_byte();
+            if (replay_data_kind == EVENT_INSTRUCTION) {
+                replay_state.instructions_count = replay_get_dword();
+            }
+            replay_check_error();
+            replay_has_unread_data = 1;
+            if (replay_data_kind >= EVENT_COUNT) {
+                error_report("Replay: unknown event kind %d", replay_data_kind);
+                exit(1);
+            }
+        }
+    }
+}
+
+void replay_finish_event(void)
+{
+    replay_has_unread_data = 0;
+    replay_fetch_data_kind();
+}
+
+void replay_mutex_init(void)
+{
+    qemu_mutex_init(&lock);
+}
+
+void replay_mutex_destroy(void)
+{
+    qemu_mutex_destroy(&lock);
+}
+
+void replay_mutex_lock(void)
+{
+    qemu_mutex_lock(&lock);
+}
+
+void replay_mutex_unlock(void)
+{
+    qemu_mutex_unlock(&lock);
+}
+
+/*! Saves cached instructions. */
+void replay_save_instructions(void)
+{
+    if (replay_file && replay_mode == REPLAY_MODE_RECORD) {
+        replay_mutex_lock();
+        int diff = (int)(replay_get_current_step() - replay_state.current_step);
+        if (diff > 0) {
+            replay_put_event(EVENT_INSTRUCTION);
+            replay_put_dword(diff);
+            replay_state.current_step += diff;
+        }
+        replay_mutex_unlock();
+    }
+}
diff --git a/replay/replay-internal.h b/replay/replay-internal.h
new file mode 100644
index 0000000..77e0d29
--- /dev/null
+++ b/replay/replay-internal.h
@@ -0,0 +1,140 @@
+#ifndef REPLAY_INTERNAL_H
+#define REPLAY_INTERNAL_H
+
+/*
+ * replay-internal.h
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include <stdio.h>
+
+enum ReplayEvents {
+    /* for instruction event */
+    EVENT_INSTRUCTION,
+    /* for software interrupt */
+    EVENT_INTERRUPT,
+    /* for emulated exceptions */
+    EVENT_EXCEPTION,
+    /* for async events */
+    EVENT_ASYNC,
+    /* for shutdown request */
+    EVENT_SHUTDOWN,
+    /* for clock read/writes */
+    /* some of greater codes are reserved for clocks */
+    EVENT_CLOCK,
+    EVENT_CLOCK_LAST = EVENT_CLOCK + REPLAY_CLOCK_COUNT - 1,
+    /* for checkpoint event */
+    /* some of greater codes are reserved for checkpoints */
+    EVENT_CHECKPOINT,
+    EVENT_CHECKPOINT_LAST = EVENT_CHECKPOINT + CHECKPOINT_COUNT - 1,
+    /* end of log event */
+    EVENT_END,
+    EVENT_COUNT
+};
+
+/* Asynchronous events IDs */
+
+enum ReplayAsyncEventKind {
+    REPLAY_ASYNC_EVENT_BH,
+    REPLAY_ASYNC_EVENT_INPUT,
+    REPLAY_ASYNC_EVENT_INPUT_SYNC,
+    REPLAY_ASYNC_COUNT
+};
+
+typedef enum ReplayAsyncEventKind ReplayAsyncEventKind;
+
+typedef struct ReplayState {
+    /*! Cached clock values. */
+    int64_t cached_clock[REPLAY_CLOCK_COUNT];
+    /*! Current step - number of processed instructions and timer events. */
+    uint64_t current_step;
+    /*! Number of instructions to be executed before other events happen. */
+    int instructions_count;
+} ReplayState;
+extern ReplayState replay_state;
+
+extern unsigned int replay_data_kind;
+
+/* File for replay writing */
+extern FILE *replay_file;
+
+void replay_put_byte(uint8_t byte);
+void replay_put_event(uint8_t event);
+void replay_put_word(uint16_t word);
+void replay_put_dword(uint32_t dword);
+void replay_put_qword(int64_t qword);
+void replay_put_array(const uint8_t *buf, size_t size);
+
+uint8_t replay_get_byte(void);
+uint16_t replay_get_word(void);
+uint32_t replay_get_dword(void);
+int64_t replay_get_qword(void);
+void replay_get_array(uint8_t *buf, size_t *size);
+void replay_get_array_alloc(uint8_t **buf, size_t *size);
+
+/* Mutex functions for protecting replay log file */
+
+void replay_mutex_init(void);
+void replay_mutex_destroy(void);
+void replay_mutex_lock(void);
+void replay_mutex_unlock(void);
+
+/*! Checks error status of the file. */
+void replay_check_error(void);
+
+/*! Finishes processing of the replayed event and fetches
+    the next event from the log. */
+void replay_finish_event(void);
+/*! Reads data type from the file and stores it in the
+    replay_data_kind variable. */
+void replay_fetch_data_kind(void);
+
+/*! Saves queued events (like instructions and sound). */
+void replay_save_instructions(void);
+
+/*! Skips async events until some sync event will be found.
+    \return true, if event was found */
+bool replay_next_event_is(int event);
+
+/*! Reads next clock value from the file.
+    If clock kind read from the file is different from the parameter,
+    the value is not used. */
+void replay_read_next_clock(unsigned int kind);
+
+/* Asynchronous events queue */
+
+/*! Initializes events' processing internals */
+void replay_init_events(void);
+/*! Clears internal data structures for events handling */
+void replay_finish_events(void);
+/*! Enables storing events in the queue */
+void replay_enable_events(void);
+/*! Flushes events queue */
+void replay_flush_events(void);
+/*! Clears events list before loading new VM state */
+void replay_clear_events(void);
+/*! Returns true if there are any unsaved events in the queue */
+bool replay_has_events(void);
+/*! Saves events from queue into the file */
+void replay_save_events(int checkpoint);
+/*! Read events from the file into the input queue */
+void replay_read_events(int checkpoint);
+
+/* Input events */
+
+/*! Saves input event to the log */
+void replay_save_input_event(InputEvent *evt);
+/*! Reads input event from the log */
+InputEvent *replay_read_input_event(void);
+/*! Adds input event to the queue */
+void replay_add_input_event(struct InputEvent *event);
+/*! Adds input sync event to the queue */
+void replay_add_input_sync_event(void);
+
+#endif
diff --git a/replay/replay-time.c b/replay/replay-time.c
new file mode 100644
index 0000000..6d06951
--- /dev/null
+++ b/replay/replay-time.c
@@ -0,0 +1,64 @@
+/*
+ * replay-time.c
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "sysemu/replay.h"
+#include "replay-internal.h"
+#include "qemu/error-report.h"
+
+int64_t replay_save_clock(ReplayClockKind kind, int64_t clock)
+{
+    replay_save_instructions();
+
+    if (replay_file) {
+        replay_mutex_lock();
+        replay_put_event(EVENT_CLOCK + kind);
+        replay_put_qword(clock);
+        replay_mutex_unlock();
+    }
+
+    return clock;
+}
+
+void replay_read_next_clock(ReplayClockKind kind)
+{
+    unsigned int read_kind = replay_data_kind - EVENT_CLOCK;
+
+    assert(read_kind == kind);
+
+    int64_t clock = replay_get_qword();
+
+    replay_check_error();
+    replay_finish_event();
+
+    replay_state.cached_clock[read_kind] = clock;
+}
+
+/*! Reads next clock event from the input. */
+int64_t replay_read_clock(ReplayClockKind kind)
+{
+    replay_account_executed_instructions();
+
+    if (replay_file) {
+        int64_t ret;
+        replay_mutex_lock();
+        if (replay_next_event_is(EVENT_CLOCK + kind)) {
+            replay_read_next_clock(kind);
+        }
+        ret = replay_state.cached_clock[kind];
+        replay_mutex_unlock();
+
+        return ret;
+    }
+
+    error_report("REPLAY INTERNAL ERROR %d", __LINE__);
+    exit(1);
+}
diff --git a/replay/replay.c b/replay/replay.c
new file mode 100644
index 0000000..0d33e82
--- /dev/null
+++ b/replay/replay.c
@@ -0,0 +1,342 @@
+/*
+ * replay.c
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "qemu-common.h"
+#include "sysemu/replay.h"
+#include "replay-internal.h"
+#include "qemu/timer.h"
+#include "qemu/main-loop.h"
+#include "sysemu/sysemu.h"
+#include "qemu/error-report.h"
+
+/* Current version of the replay mechanism.
+   Increase it when file format changes. */
+#define REPLAY_VERSION              0xe02002
+/* Size of replay log header */
+#define HEADER_SIZE                 (sizeof(uint32_t) + sizeof(uint64_t))
+
+ReplayMode replay_mode = REPLAY_MODE_NONE;
+
+/* Name of replay file  */
+static char *replay_filename;
+ReplayState replay_state;
+static GSList *replay_blockers;
+
+bool replay_next_event_is(int event)
+{
+    bool res = false;
+
+    /* nothing to skip - not all instructions used */
+    if (replay_state.instructions_count != 0) {
+        assert(replay_data_kind == EVENT_INSTRUCTION);
+        return event == EVENT_INSTRUCTION;
+    }
+
+    while (true) {
+        if (event == replay_data_kind) {
+            res = true;
+        }
+        switch (replay_data_kind) {
+        case EVENT_SHUTDOWN:
+            replay_finish_event();
+            qemu_system_shutdown_request();
+            break;
+        default:
+            /* clock, time_t, checkpoint and other events */
+            return res;
+        }
+    }
+    return res;
+}
+
+uint64_t replay_get_current_step(void)
+{
+    return cpu_get_icount_raw();
+}
+
+int replay_get_instructions(void)
+{
+    int res = 0;
+    replay_mutex_lock();
+    if (replay_next_event_is(EVENT_INSTRUCTION)) {
+        res = replay_state.instructions_count;
+    }
+    replay_mutex_unlock();
+    return res;
+}
+
+void replay_account_executed_instructions(void)
+{
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        replay_mutex_lock();
+        if (replay_state.instructions_count > 0) {
+            int count = (int)(replay_get_current_step()
+                              - replay_state.current_step);
+            replay_state.instructions_count -= count;
+            replay_state.current_step += count;
+            if (replay_state.instructions_count == 0) {
+                assert(replay_data_kind == EVENT_INSTRUCTION);
+                replay_finish_event();
+                /* Wake up iothread. This is required because
+                   timers will not expire until clock counters
+                   will be read from the log. */
+                qemu_notify_event();
+            }
+        }
+        replay_mutex_unlock();
+    }
+}
+
+bool replay_exception(void)
+{
+    if (replay_mode == REPLAY_MODE_RECORD) {
+        replay_save_instructions();
+        replay_mutex_lock();
+        replay_put_event(EVENT_EXCEPTION);
+        replay_mutex_unlock();
+        return true;
+    } else if (replay_mode == REPLAY_MODE_PLAY) {
+        bool res = replay_has_exception();
+        if (res) {
+            replay_mutex_lock();
+            replay_finish_event();
+            replay_mutex_unlock();
+        }
+        return res;
+    }
+
+    return true;
+}
+
+bool replay_has_exception(void)
+{
+    bool res = false;
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        replay_account_executed_instructions();
+        replay_mutex_lock();
+        res = replay_next_event_is(EVENT_EXCEPTION);
+        replay_mutex_unlock();
+    }
+
+    return res;
+}
+
+bool replay_interrupt(void)
+{
+    if (replay_mode == REPLAY_MODE_RECORD) {
+        replay_save_instructions();
+        replay_mutex_lock();
+        replay_put_event(EVENT_INTERRUPT);
+        replay_mutex_unlock();
+        return true;
+    } else if (replay_mode == REPLAY_MODE_PLAY) {
+        bool res = replay_has_interrupt();
+        if (res) {
+            replay_mutex_lock();
+            replay_finish_event();
+            replay_mutex_unlock();
+        }
+        return res;
+    }
+
+    return true;
+}
+
+bool replay_has_interrupt(void)
+{
+    bool res = false;
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        replay_account_executed_instructions();
+        replay_mutex_lock();
+        res = replay_next_event_is(EVENT_INTERRUPT);
+        replay_mutex_unlock();
+    }
+    return res;
+}
+
+void replay_shutdown_request(void)
+{
+    if (replay_mode == REPLAY_MODE_RECORD) {
+        replay_mutex_lock();
+        replay_put_event(EVENT_SHUTDOWN);
+        replay_mutex_unlock();
+    }
+}
+
+bool replay_checkpoint(ReplayCheckpoint checkpoint)
+{
+    bool res = false;
+    assert(EVENT_CHECKPOINT + checkpoint <= EVENT_CHECKPOINT_LAST);
+    replay_save_instructions();
+
+    if (!replay_file) {
+        return true;
+    }
+
+    replay_mutex_lock();
+
+    if (replay_mode == REPLAY_MODE_PLAY) {
+        if (replay_next_event_is(EVENT_CHECKPOINT + checkpoint)) {
+            replay_finish_event();
+        } else if (replay_data_kind != EVENT_ASYNC) {
+            res = false;
+            goto out;
+        }
+        replay_read_events(checkpoint);
+        /* replay_read_events may leave some unread events.
+           Return false if not all of the events associated with
+           checkpoint were processed */
+        res = replay_data_kind != EVENT_ASYNC;
+    } else if (replay_mode == REPLAY_MODE_RECORD) {
+        replay_put_event(EVENT_CHECKPOINT + checkpoint);
+        replay_save_events(checkpoint);
+        res = true;
+    }
+out:
+    replay_mutex_unlock();
+    return res;
+}
+
+static void replay_enable(const char *fname, int mode)
+{
+    const char *fmode = NULL;
+    assert(!replay_file);
+
+    switch (mode) {
+    case REPLAY_MODE_RECORD:
+        fmode = "wb";
+        break;
+    case REPLAY_MODE_PLAY:
+        fmode = "rb";
+        break;
+    default:
+        fprintf(stderr, "Replay: internal error: invalid replay mode\n");
+        exit(1);
+    }
+
+    atexit(replay_finish);
+
+    replay_mutex_init();
+
+    replay_file = fopen(fname, fmode);
+    if (replay_file == NULL) {
+        fprintf(stderr, "Replay: open %s: %s\n", fname, strerror(errno));
+        exit(1);
+    }
+
+    replay_filename = g_strdup(fname);
+
+    replay_mode = mode;
+    replay_data_kind = -1;
+    replay_state.instructions_count = 0;
+    replay_state.current_step = 0;
+
+    /* skip file header for RECORD and check it for PLAY */
+    if (replay_mode == REPLAY_MODE_RECORD) {
+        fseek(replay_file, HEADER_SIZE, SEEK_SET);
+    } else if (replay_mode == REPLAY_MODE_PLAY) {
+        unsigned int version = replay_get_dword();
+        if (version != REPLAY_VERSION) {
+            fprintf(stderr, "Replay: invalid input log file version\n");
+            exit(1);
+        }
+        /* go to the beginning */
+        fseek(replay_file, HEADER_SIZE, SEEK_SET);
+        replay_fetch_data_kind();
+    }
+
+    replay_init_events();
+}
+
+void replay_configure(QemuOpts *opts)
+{
+    const char *fname;
+    const char *rr;
+    ReplayMode mode = REPLAY_MODE_NONE;
+
+    rr = qemu_opt_get(opts, "rr");
+    if (!rr) {
+        /* Just enabling icount */
+        return;
+    } else if (!strcmp(rr, "record")) {
+        mode = REPLAY_MODE_RECORD;
+    } else if (!strcmp(rr, "replay")) {
+        mode = REPLAY_MODE_PLAY;
+    } else {
+        error_report("Invalid icount rr option: %s", rr);
+        exit(1);
+    }
+
+    fname = qemu_opt_get(opts, "rrfile");
+    if (!fname) {
+        error_report("File name not specified for replay");
+        exit(1);
+    }
+
+    replay_enable(fname, mode);
+}
+
+void replay_start(void)
+{
+    if (replay_mode == REPLAY_MODE_NONE) {
+        return;
+    }
+
+    if (replay_blockers) {
+        error_report("Record/replay: %s",
+                     error_get_pretty(replay_blockers->data));
+        exit(1);
+    }
+    if (!use_icount) {
+        error_report("Please enable icount to use record/replay");
+        exit(1);
+    }
+
+    /* Timer for snapshotting will be set up here. */
+
+    replay_enable_events();
+}
+
+void replay_finish(void)
+{
+    if (replay_mode == REPLAY_MODE_NONE) {
+        return;
+    }
+
+    replay_save_instructions();
+
+    /* finalize the file */
+    if (replay_file) {
+        if (replay_mode == REPLAY_MODE_RECORD) {
+            /* write end event */
+            replay_put_event(EVENT_END);
+
+            /* write header */
+            fseek(replay_file, 0, SEEK_SET);
+            replay_put_dword(REPLAY_VERSION);
+        }
+
+        fclose(replay_file);
+        replay_file = NULL;
+    }
+    if (replay_filename) {
+        g_free(replay_filename);
+        replay_filename = NULL;
+    }
+
+    replay_finish_events();
+    replay_mutex_destroy();
+}
+
+void replay_add_blocker(Error *reason)
+{
+    replay_blockers = g_slist_prepend(replay_blockers, reason);
+}
diff --git a/roms/SLOF b/roms/SLOF
-Subproject 811277ac91f674a9273e2b529791e9b75350f3e
+Subproject b4c93802a5b2c72f096649c497ec9ff5708e445
diff --git a/scripts/analyze-migration.py b/scripts/analyze-migration.py
index f6894be..1455387 100755
--- a/scripts/analyze-migration.py
+++ b/scripts/analyze-migration.py
@@ -252,6 +252,15 @@ class HTABSection(object):
     def getDict(self):
         return ""
 
+
+class ConfigurationSection(object):
+    def __init__(self, file):
+        self.file = file
+
+    def read(self):
+        name_len = self.file.read32()
+        name = self.file.readstr(len = name_len)
+
 class VMSDFieldGeneric(object):
     def __init__(self, desc, file):
         self.file = file
@@ -474,6 +483,7 @@ class MigrationDump(object):
     QEMU_VM_SECTION_FULL  = 0x04
     QEMU_VM_SUBSECTION    = 0x05
     QEMU_VM_VMDESCRIPTION = 0x06
+    QEMU_VM_CONFIGURATION = 0x07
     QEMU_VM_SECTION_FOOTER= 0x7e
 
     def __init__(self, filename):
@@ -514,6 +524,9 @@ class MigrationDump(object):
             section_type = file.read8()
             if section_type == self.QEMU_VM_EOF:
                 break
+            elif section_type == self.QEMU_VM_CONFIGURATION:
+                section = ConfigurationSection(file)
+                section.read()
             elif section_type == self.QEMU_VM_SECTION_START or section_type == self.QEMU_VM_SECTION_FULL:
                 section_id = file.read32()
                 name = file.readstr()
diff --git a/scripts/qapi-introspect.py b/scripts/qapi-introspect.py
index c0dad66..64f2cd0 100644
--- a/scripts/qapi-introspect.py
+++ b/scripts/qapi-introspect.py
@@ -107,10 +107,12 @@ const char %(c_name)s[] = %(c_string)s;
         # characters.
         if isinstance(typ, QAPISchemaBuiltinType):
             return typ.name
+        if isinstance(typ, QAPISchemaArrayType):
+            return '[' + self._use_type(typ.element_type) + ']'
         return self._name(typ.name)
 
     def _gen_json(self, name, mtype, obj):
-        if mtype != 'command' and mtype != 'event' and mtype != 'builtin':
+        if mtype not in ('command', 'event', 'builtin', 'array'):
             name = self._name(name)
         obj['name'] = name
         obj['meta-type'] = mtype
@@ -136,8 +138,8 @@ const char %(c_name)s[] = %(c_string)s;
         self._gen_json(name, 'enum', {'values': values})
 
     def visit_array_type(self, name, info, element_type):
-        self._gen_json(name, 'array',
-                       {'element-type': self._use_type(element_type)})
+        element = self._use_type(element_type)
+        self._gen_json('[' + element + ']', 'array', {'element-type': element})
 
     def visit_object_type_flat(self, name, info, members, variants):
         obj = {'members': [self._gen_member(m) for m in members]}
diff --git a/scripts/qapi-visit.py b/scripts/qapi-visit.py
index f40c3c7..3ef5c16 100644
--- a/scripts/qapi-visit.py
+++ b/scripts/qapi-visit.py
@@ -138,6 +138,10 @@ void visit_type_%(c_name)s(Visitor *v, %(c_name)s **obj, const char *name, Error
 
 
 def gen_visit_list(name, element_type):
+    # FIXME: if *obj is NULL on entry, and the first visit_next_list()
+    # assigns to *obj, while a later one fails, we should clean up *obj
+    # rather than leaving it non-NULL. As currently written, the caller must
+    # call qapi_free_FOOList() to avoid a memory leak of the partial FOOList.
     return mcgen('''
 
 void visit_type_%(c_name)s(Visitor *v, %(c_name)s **obj, const char *name, Error **errp)
diff --git a/scripts/texi2pod.pl b/scripts/texi2pod.pl
index 94097fb..8767662 100755
--- a/scripts/texi2pod.pl
+++ b/scripts/texi2pod.pl
@@ -317,7 +317,7 @@ while(<$inf>) {
 	@columns = ();
 	for $column (split (/\s*\@tab\s*/, $1)) {
 	    # @strong{...} is used a @headitem work-alike
-	    $column =~ s/^\@strong{(.*)}$/$1/;
+	    $column =~ s/^\@strong\{(.*)\}$/$1/;
 	    push @columns, $column;
 	}
 	$_ = "\n=item ".join (" : ", @columns)."\n";
diff --git a/slirp/bootp.c b/slirp/bootp.c
index b7db9fa..1baaab1 100644
--- a/slirp/bootp.c
+++ b/slirp/bootp.c
@@ -23,6 +23,12 @@
  */
 #include <slirp.h>
 
+#if defined(_WIN32)
+/* Windows ntohl() returns an u_long value.
+ * Add a type cast to match the format strings. */
+# define ntohl(n) ((uint32_t)ntohl(n))
+#endif
+
 /* XXX: only DHCP is supported */
 
 #define LEASE_TIME (24 * 3600)
@@ -155,7 +161,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp)
     dhcp_decode(bp, &dhcp_msg_type, &preq_addr);
     DPRINTF("bootp packet op=%d msgtype=%d", bp->bp_op, dhcp_msg_type);
     if (preq_addr.s_addr != htonl(0L))
-        DPRINTF(" req_addr=%08x\n", ntohl(preq_addr.s_addr));
+        DPRINTF(" req_addr=%08" PRIx32 "\n", ntohl(preq_addr.s_addr));
     else
         DPRINTF("\n");
 
@@ -234,7 +240,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp)
     q += 4;
 
     if (bc) {
-        DPRINTF("%s addr=%08x\n",
+        DPRINTF("%s addr=%08" PRIx32 "\n",
                 (dhcp_msg_type == DHCPDISCOVER) ? "offered" : "ack'ed",
                 ntohl(daddr.sin_addr.s_addr));
 
@@ -302,7 +308,7 @@ static void bootp_reply(Slirp *slirp, const struct bootp_t *bp)
     } else {
         static const char nak_msg[] = "requested address not available";
 
-        DPRINTF("nak'ed addr=%08x\n", ntohl(preq_addr.s_addr));
+        DPRINTF("nak'ed addr=%08" PRIx32 "\n", ntohl(preq_addr.s_addr));
 
         *q++ = RFC2132_MSG_TYPE;
         *q++ = 1;
diff --git a/slirp/if.c b/slirp/if.c
index fb7acf8..8325a2a 100644
--- a/slirp/if.c
+++ b/slirp/if.c
@@ -53,8 +53,8 @@ if_output(struct socket *so, struct mbuf *ifm)
 	int on_fastq = 1;
 
 	DEBUG_CALL("if_output");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("ifm = %lx", (long)ifm);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("ifm = %p", ifm);
 
 	/*
 	 * First remove the mbuf from m_usedlist,
diff --git a/slirp/ip_icmp.c b/slirp/ip_icmp.c
index 9f1cb08..23b9f0f 100644
--- a/slirp/ip_icmp.c
+++ b/slirp/ip_icmp.c
@@ -125,7 +125,7 @@ icmp_input(struct mbuf *m, int hlen)
   Slirp *slirp = m->slirp;
 
   DEBUG_CALL("icmp_input");
-  DEBUG_ARG("m = %lx", (long )m);
+  DEBUG_ARG("m = %p", m);
   DEBUG_ARG("m_len = %d", m->m_len);
 
   /*
@@ -252,7 +252,7 @@ icmp_error(struct mbuf *msrc, u_char type, u_char code, int minsize,
   register struct mbuf *m;
 
   DEBUG_CALL("icmp_error");
-  DEBUG_ARG("msrc = %lx", (long )msrc);
+  DEBUG_ARG("msrc = %p", msrc);
   DEBUG_ARG("msrc_len = %d", msrc->m_len);
 
   if(type!=ICMP_UNREACH && type!=ICMP_TIMXCEED) goto end_error;
diff --git a/slirp/ip_input.c b/slirp/ip_input.c
index 880bdfd..7d436e6 100644
--- a/slirp/ip_input.c
+++ b/slirp/ip_input.c
@@ -80,7 +80,7 @@ ip_input(struct mbuf *m)
 	int hlen;
 
 	DEBUG_CALL("ip_input");
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("m = %p", m);
 	DEBUG_ARG("m_len = %d", m->m_len);
 
 	if (m->m_len < sizeof (struct ip)) {
@@ -232,9 +232,9 @@ ip_reass(Slirp *slirp, struct ip *ip, struct ipq *fp)
 	int i, next;
 
 	DEBUG_CALL("ip_reass");
-	DEBUG_ARG("ip = %lx", (long)ip);
-	DEBUG_ARG("fp = %lx", (long)fp);
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("ip = %p", ip);
+	DEBUG_ARG("fp = %p", fp);
+	DEBUG_ARG("m = %p", m);
 
 	/*
 	 * Presence of header sizes in mbufs
@@ -400,7 +400,7 @@ static void
 ip_enq(register struct ipasfrag *p, register struct ipasfrag *prev)
 {
 	DEBUG_CALL("ip_enq");
-	DEBUG_ARG("prev = %lx", (long)prev);
+	DEBUG_ARG("prev = %p", prev);
 	p->ipf_prev =  prev;
 	p->ipf_next = prev->ipf_next;
 	((struct ipasfrag *)(prev->ipf_next))->ipf_prev = p;
diff --git a/slirp/ip_output.c b/slirp/ip_output.c
index c82830f..1254d0d 100644
--- a/slirp/ip_output.c
+++ b/slirp/ip_output.c
@@ -60,8 +60,8 @@ ip_output(struct socket *so, struct mbuf *m0)
 	int len, off, error = 0;
 
 	DEBUG_CALL("ip_output");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("m0 = %lx", (long)m0);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("m0 = %p", m0);
 
 	ip = mtod(m, struct ip *);
 	/*
diff --git a/slirp/mbuf.c b/slirp/mbuf.c
index 4fefb04..795fc29 100644
--- a/slirp/mbuf.c
+++ b/slirp/mbuf.c
@@ -94,7 +94,7 @@ m_get(Slirp *slirp)
         m->arp_requested = false;
         m->expiration_date = (uint64_t)-1;
 end_error:
-	DEBUG_ARG("m = %lx", (long )m);
+	DEBUG_ARG("m = %p", m);
 	return m;
 }
 
@@ -103,7 +103,7 @@ m_free(struct mbuf *m)
 {
 
   DEBUG_CALL("m_free");
-  DEBUG_ARG("m = %lx", (long )m);
+  DEBUG_ARG("m = %p", m);
 
   if(m) {
 	/* Remove from m_usedlist */
@@ -221,7 +221,7 @@ dtom(Slirp *slirp, void *dat)
 	struct mbuf *m;
 
 	DEBUG_CALL("dtom");
-	DEBUG_ARG("dat = %lx", (long )dat);
+	DEBUG_ARG("dat = %p", dat);
 
 	/* bug corrected for M_EXT buffers */
 	for (m = slirp->m_usedlist.m_next; m != &slirp->m_usedlist;
diff --git a/slirp/misc.c b/slirp/misc.c
index 578e8b2..5497161 100644
--- a/slirp/misc.c
+++ b/slirp/misc.c
@@ -123,9 +123,9 @@ fork_exec(struct socket *so, const char *ex, int do_pty)
 	pid_t pid;
 
 	DEBUG_CALL("fork_exec");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("ex = %lx", (long)ex);
-	DEBUG_ARG("do_pty = %lx", (long)do_pty);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("ex = %p", ex);
+	DEBUG_ARG("do_pty = %x", do_pty);
 
 	if (do_pty == 2) {
                 return 0;
diff --git a/slirp/sbuf.c b/slirp/sbuf.c
index 08ec2b4..b8c3db7 100644
--- a/slirp/sbuf.c
+++ b/slirp/sbuf.c
@@ -72,8 +72,8 @@ sbappend(struct socket *so, struct mbuf *m)
 	int ret = 0;
 
 	DEBUG_CALL("sbappend");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("m = %p", m);
 	DEBUG_ARG("m->m_len = %d", m->m_len);
 
 	/* Shouldn't happen, but...  e.g. foreign host closes connection */
diff --git a/slirp/socket.c b/slirp/socket.c
index 37ac5cf..1673e3a 100644
--- a/slirp/socket.c
+++ b/slirp/socket.c
@@ -91,7 +91,7 @@ size_t sopreprbuf(struct socket *so, struct iovec *iov, int *np)
 	int mss = so->so_tcpcb->t_maxseg;
 
 	DEBUG_CALL("sopreprbuf");
-	DEBUG_ARG("so = %lx", (long )so);
+	DEBUG_ARG("so = %p", so);
 
 	if (len <= 0)
 		return 0;
@@ -155,7 +155,7 @@ soread(struct socket *so)
 	struct iovec iov[2];
 
 	DEBUG_CALL("soread");
-	DEBUG_ARG("so = %lx", (long )so);
+	DEBUG_ARG("so = %p", so);
 
 	/*
 	 * No need to check if there's enough room to read.
@@ -215,7 +215,7 @@ int soreadbuf(struct socket *so, const char *buf, int size)
 	struct iovec iov[2];
 
 	DEBUG_CALL("soreadbuf");
-	DEBUG_ARG("so = %lx", (long )so);
+	DEBUG_ARG("so = %p", so);
 
 	/*
 	 * No need to check if there's enough room to read.
@@ -263,7 +263,7 @@ sorecvoob(struct socket *so)
 	struct tcpcb *tp = sototcpcb(so);
 
 	DEBUG_CALL("sorecvoob");
-	DEBUG_ARG("so = %lx", (long)so);
+	DEBUG_ARG("so = %p", so);
 
 	/*
 	 * We take a guess at how much urgent data has arrived.
@@ -293,7 +293,7 @@ sosendoob(struct socket *so)
 	int n, len;
 
 	DEBUG_CALL("sosendoob");
-	DEBUG_ARG("so = %lx", (long)so);
+	DEBUG_ARG("so = %p", so);
 	DEBUG_ARG("sb->sb_cc = %d", sb->sb_cc);
 
 	if (so->so_urgc > 2048)
@@ -351,7 +351,7 @@ sowrite(struct socket *so)
 	struct iovec iov[2];
 
 	DEBUG_CALL("sowrite");
-	DEBUG_ARG("so = %lx", (long)so);
+	DEBUG_ARG("so = %p", so);
 
 	if (so->so_urgc) {
 		sosendoob(so);
@@ -441,7 +441,7 @@ sorecvfrom(struct socket *so)
 	socklen_t addrlen = sizeof(struct sockaddr_in);
 
 	DEBUG_CALL("sorecvfrom");
-	DEBUG_ARG("so = %lx", (long)so);
+	DEBUG_ARG("so = %p", so);
 
 	if (so->so_type == IPPROTO_ICMP) {   /* This is a "ping" reply */
 	  char buff[256];
@@ -543,8 +543,8 @@ sosendto(struct socket *so, struct mbuf *m)
 	struct sockaddr_in addr;
 
 	DEBUG_CALL("sosendto");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("m = %p", m);
 
         addr.sin_family = AF_INET;
 	if ((so->so_faddr.s_addr & slirp->vnetwork_mask.s_addr) ==
diff --git a/slirp/tcp_input.c b/slirp/tcp_input.c
index 00a77b4..6b096ec 100644
--- a/slirp/tcp_input.c
+++ b/slirp/tcp_input.c
@@ -231,8 +231,8 @@ tcp_input(struct mbuf *m, int iphlen, struct socket *inso)
     Slirp *slirp;
 
 	DEBUG_CALL("tcp_input");
-	DEBUG_ARGS((dfd, " m = %8lx  iphlen = %2d  inso = %lx\n",
-		    (long )m, iphlen, (long )inso ));
+	DEBUG_ARGS((dfd, " m = %p  iphlen = %2d  inso = %p\n",
+		    m, iphlen, inso));
 
 	/*
 	 * If called with m == 0, then we're continuing the connect
@@ -923,8 +923,8 @@ trimthenstep6:
 
 		if (SEQ_LEQ(ti->ti_ack, tp->snd_una)) {
 			if (ti->ti_len == 0 && tiwin == tp->snd_wnd) {
-			  DEBUG_MISC((dfd, " dup ack  m = %lx  so = %lx\n",
-				      (long )m, (long )so));
+			  DEBUG_MISC((dfd, " dup ack  m = %p  so = %p\n",
+				      m, so));
 				/*
 				 * If we have outstanding data (other than
 				 * a window probe), this is a completely
@@ -1302,7 +1302,7 @@ tcp_dooptions(struct tcpcb *tp, u_char *cp, int cnt, struct tcpiphdr *ti)
 	int opt, optlen;
 
 	DEBUG_CALL("tcp_dooptions");
-	DEBUG_ARGS((dfd, " tp = %lx  cnt=%i\n", (long)tp, cnt));
+	DEBUG_ARGS((dfd, " tp = %p  cnt=%i\n", tp, cnt));
 
 	for (; cnt > 0; cnt -= optlen, cp += optlen) {
 		opt = cp[0];
@@ -1383,7 +1383,7 @@ tcp_xmit_timer(register struct tcpcb *tp, int rtt)
 	register short delta;
 
 	DEBUG_CALL("tcp_xmit_timer");
-	DEBUG_ARG("tp = %lx", (long)tp);
+	DEBUG_ARG("tp = %p", tp);
 	DEBUG_ARG("rtt = %d", rtt);
 
 	if (tp->t_srtt != 0) {
@@ -1471,7 +1471,7 @@ tcp_mss(struct tcpcb *tp, u_int offer)
 	int mss;
 
 	DEBUG_CALL("tcp_mss");
-	DEBUG_ARG("tp = %lx", (long)tp);
+	DEBUG_ARG("tp = %p", tp);
 	DEBUG_ARG("offer = %d", offer);
 
 	mss = min(IF_MTU, IF_MRU) - sizeof(struct tcpiphdr);
diff --git a/slirp/tcp_output.c b/slirp/tcp_output.c
index 8aa3d90..fafca58 100644
--- a/slirp/tcp_output.c
+++ b/slirp/tcp_output.c
@@ -66,7 +66,7 @@ tcp_output(struct tcpcb *tp)
 	int idle, sendalot;
 
 	DEBUG_CALL("tcp_output");
-	DEBUG_ARG("tp = %lx", (long )tp);
+	DEBUG_ARG("tp = %p", tp);
 
 	/*
 	 * Determine length of data that should be transmitted,
diff --git a/slirp/tcp_subr.c b/slirp/tcp_subr.c
index 7571c5a..e161ed2 100644
--- a/slirp/tcp_subr.c
+++ b/slirp/tcp_subr.c
@@ -224,7 +224,7 @@ tcp_newtcpcb(struct socket *so)
 struct tcpcb *tcp_drop(struct tcpcb *tp, int err)
 {
 	DEBUG_CALL("tcp_drop");
-	DEBUG_ARG("tp = %lx", (long)tp);
+	DEBUG_ARG("tp = %p", tp);
 	DEBUG_ARG("errno = %d", errno);
 
 	if (TCPS_HAVERCVDSYN(tp->t_state)) {
@@ -249,7 +249,7 @@ tcp_close(struct tcpcb *tp)
 	register struct mbuf *m;
 
 	DEBUG_CALL("tcp_close");
-	DEBUG_ARG("tp = %lx", (long )tp);
+	DEBUG_ARG("tp = %p", tp);
 
 	/* free the reassembly queue, if any */
 	t = tcpfrag_list_first(tp);
@@ -290,7 +290,7 @@ tcp_sockclosed(struct tcpcb *tp)
 {
 
 	DEBUG_CALL("tcp_sockclosed");
-	DEBUG_ARG("tp = %lx", (long)tp);
+	DEBUG_ARG("tp = %p", tp);
 
 	switch (tp->t_state) {
 
@@ -330,7 +330,7 @@ int tcp_fconnect(struct socket *so)
   int ret=0;
 
   DEBUG_CALL("tcp_fconnect");
-  DEBUG_ARG("so = %lx", (long )so);
+  DEBUG_ARG("so = %p", so);
 
   if( (ret = so->s = qemu_socket(AF_INET,SOCK_STREAM,0)) >= 0) {
     int opt, s=so->s;
@@ -393,7 +393,7 @@ void tcp_connect(struct socket *inso)
     int s, opt;
 
     DEBUG_CALL("tcp_connect");
-    DEBUG_ARG("inso = %lx", (long)inso);
+    DEBUG_ARG("inso = %p", inso);
 
     /*
      * If it's an SS_ACCEPTONCE socket, no need to socreate()
@@ -564,8 +564,8 @@ tcp_emu(struct socket *so, struct mbuf *m)
 	char *bptr;
 
 	DEBUG_CALL("tcp_emu");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("m = %p", m);
 
 	switch(so->so_emu) {
 		int x, i;
@@ -900,7 +900,7 @@ int tcp_ctl(struct socket *so)
     int do_pty;
 
     DEBUG_CALL("tcp_ctl");
-    DEBUG_ARG("so = %lx", (long )so);
+    DEBUG_ARG("so = %p", so);
 
     if (so->so_faddr.s_addr != slirp->vhost_addr.s_addr) {
         /* Check if it's pty_exec */
diff --git a/slirp/udp.c b/slirp/udp.c
index f77e00f..fee13b4 100644
--- a/slirp/udp.c
+++ b/slirp/udp.c
@@ -72,7 +72,7 @@ udp_input(register struct mbuf *m, int iphlen)
 	struct socket *so;
 
 	DEBUG_CALL("udp_input");
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("m = %p", m);
 	DEBUG_ARG("iphlen = %d", iphlen);
 
 	/*
@@ -241,8 +241,8 @@ int udp_output2(struct socket *so, struct mbuf *m,
 	int error = 0;
 
 	DEBUG_CALL("udp_output");
-	DEBUG_ARG("so = %lx", (long)so);
-	DEBUG_ARG("m = %lx", (long)m);
+	DEBUG_ARG("so = %p", so);
+	DEBUG_ARG("m = %p", m);
 	DEBUG_ARG("saddr = %lx", (long)saddr->sin_addr.s_addr);
 	DEBUG_ARG("daddr = %lx", (long)daddr->sin_addr.s_addr);
 
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 251443b..d7898a0 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -21,6 +21,8 @@ stub-obj-y += mon-printf.o
 stub-obj-y += monitor-init.o
 stub-obj-y += notify-event.o
 stub-obj-y += qtest.o
+stub-obj-y += replay.o
+stub-obj-y += replay-user.o
 stub-obj-y += reset.o
 stub-obj-y += runstate-check.o
 stub-obj-y += set-fd-handler.o
@@ -34,4 +36,5 @@ stub-obj-y += cpus.o
 stub-obj-y += kvm.o
 stub-obj-y += qmp_pc_dimm_device_list.o
 stub-obj-y += target-monitor-defs.o
+stub-obj-y += target-get-monitor-def.o
 stub-obj-y += vhost.o
diff --git a/stubs/replay-user.c b/stubs/replay-user.c
new file mode 100644
index 0000000..cf33072
--- /dev/null
+++ b/stubs/replay-user.c
@@ -0,0 +1,32 @@
+/*
+ * replay.c
+ *
+ * Copyright (c) 2010-2015 Institute for System Programming
+ *                         of the Russian Academy of Sciences.
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or later.
+ * See the COPYING file in the top-level directory.
+ *
+ */
+
+#include "sysemu/replay.h"
+
+bool replay_exception(void)
+{
+    return true;
+}
+
+bool replay_has_exception(void)
+{
+    return false;
+}
+
+bool replay_interrupt(void)
+{
+    return true;
+}
+
+bool replay_has_interrupt(void)
+{
+    return false;
+}
diff --git a/stubs/replay.c b/stubs/replay.c
new file mode 100644
index 0000000..8f98790
--- /dev/null
+++ b/stubs/replay.c
@@ -0,0 +1,31 @@
+#include "sysemu/replay.h"
+#include <stdlib.h>
+#include "sysemu/sysemu.h"
+
+ReplayMode replay_mode;
+
+int64_t replay_save_clock(unsigned int kind, int64_t clock)
+{
+    abort();
+    return 0;
+}
+
+int64_t replay_read_clock(unsigned int kind)
+{
+    abort();
+    return 0;
+}
+
+bool replay_checkpoint(ReplayCheckpoint checkpoint)
+{
+    return true;
+}
+
+bool replay_events_enabled(void)
+{
+    return false;
+}
+
+void replay_finish(void)
+{
+}
diff --git a/stubs/target-get-monitor-def.c b/stubs/target-get-monitor-def.c
new file mode 100644
index 0000000..711a9ae
--- /dev/null
+++ b/stubs/target-get-monitor-def.c
@@ -0,0 +1,31 @@
+/*
+ *  Stub for target_get_monitor_def.
+ *
+ *  Copyright IBM Corp., 2015
+ *
+ *  Author: Alexey Kardashevskiy <aik@ozlabs.ru>
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License as published by
+ *  the Free Software Foundation; either version 2 of the License,
+ *  or (at your option) any later version.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "stdint.h"
+
+typedef struct CPUState CPUState;
+
+int target_get_monitor_def(CPUState *cs, const char *name, uint64_t *pval);
+
+int target_get_monitor_def(CPUState *cs, const char *name, uint64_t *pval)
+{
+    return -1;
+}
diff --git a/target-alpha/translate.c b/target-alpha/translate.c
index 87950c6..9909c70 100644
--- a/target-alpha/translate.c
+++ b/target-alpha/translate.c
@@ -2916,7 +2916,7 @@ void gen_intermediate_code(CPUAlphaState *env, struct TranslationBlock *tb)
         num_insns++;
 
         if (unlikely(cpu_breakpoint_test(cs, ctx.pc, BP_ANY))) {
-            gen_excp(&ctx, EXCP_DEBUG, 0);
+            ret = gen_excp(&ctx, EXCP_DEBUG, 0);
             /* The address covered by the breakpoint must be included in
                [tb->pc, tb->pc + tb->size) in order to for it to be
                properly cleared -- thus we increment the PC here so that
diff --git a/target-arm/op_helper.c b/target-arm/op_helper.c
index b5db345..6cd54c8 100644
--- a/target-arm/op_helper.c
+++ b/target-arm/op_helper.c
@@ -917,7 +917,13 @@ void arm_debug_excp_handler(CPUState *cs)
         uint64_t pc = is_a64(env) ? env->pc : env->regs[15];
         bool same_el = (arm_debug_target_el(env) == arm_current_el(env));
 
-        if (cpu_breakpoint_test(cs, pc, BP_GDB)) {
+        /* (1) GDB breakpoints should be handled first.
+         * (2) Do not raise a CPU exception if no CPU breakpoint has fired,
+         * since singlestep is also done by generating a debug internal
+         * exception.
+         */
+        if (cpu_breakpoint_test(cs, pc, BP_GDB)
+            || !cpu_breakpoint_test(cs, pc, BP_CPU)) {
             return;
         }
 
diff --git a/target-arm/translate-a64.c b/target-arm/translate-a64.c
index d7e0954..fe485a4 100644
--- a/target-arm/translate-a64.c
+++ b/target-arm/translate-a64.c
@@ -11102,6 +11102,7 @@ void gen_intermediate_code_a64(ARMCPU *cpu, TranslationBlock *tb)
             QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
                 if (bp->pc == dc->pc) {
                     if (bp->flags & BP_CPU) {
+                        gen_a64_set_pc_im(dc->pc);
                         gen_helper_check_breakpoints(cpu_env);
                         /* End the TB early; it likely won't be executed */
                         dc->is_jmp = DISAS_UPDATE;
diff --git a/target-arm/translate.c b/target-arm/translate.c
index ff262a2..4351854 100644
--- a/target-arm/translate.c
+++ b/target-arm/translate.c
@@ -870,7 +870,7 @@ static inline void gen_bx_im(DisasContext *s, uint32_t addr)
 {
     TCGv_i32 tmp;
 
-    s->is_jmp = DISAS_UPDATE;
+    s->is_jmp = DISAS_JUMP;
     if (s->thumb != (addr & 1)) {
         tmp = tcg_temp_new_i32();
         tcg_gen_movi_i32(tmp, addr & 1);
@@ -883,7 +883,7 @@ static inline void gen_bx_im(DisasContext *s, uint32_t addr)
 /* Set PC and Thumb state from var.  var is marked as dead.  */
 static inline void gen_bx(DisasContext *s, TCGv_i32 var)
 {
-    s->is_jmp = DISAS_UPDATE;
+    s->is_jmp = DISAS_JUMP;
     tcg_gen_andi_i32(cpu_R[15], var, ~1);
     tcg_gen_andi_i32(var, var, 1);
     store_cpu_field(var, thumb);
@@ -1062,7 +1062,7 @@ static void gen_exception_insn(DisasContext *s, int offset, int excp,
 static inline void gen_lookup_tb(DisasContext *s)
 {
     tcg_gen_movi_i32(cpu_R[15], s->pc & ~1);
-    s->is_jmp = DISAS_UPDATE;
+    s->is_jmp = DISAS_JUMP;
 }
 
 static inline void gen_add_data_offset(DisasContext *s, unsigned int insn,
@@ -4096,7 +4096,7 @@ static void gen_exception_return(DisasContext *s, TCGv_i32 pc)
     tmp = load_cpu_field(spsr);
     gen_set_cpsr(tmp, CPSR_ERET_MASK);
     tcg_temp_free_i32(tmp);
-    s->is_jmp = DISAS_UPDATE;
+    s->is_jmp = DISAS_JUMP;
 }
 
 /* Generate a v6 exception return.  Marks both values as dead.  */
@@ -4105,7 +4105,7 @@ static void gen_rfe(DisasContext *s, TCGv_i32 pc, TCGv_i32 cpsr)
     gen_set_cpsr(cpsr, CPSR_ERET_MASK);
     tcg_temp_free_i32(cpsr);
     store_reg(s, 15, pc);
-    s->is_jmp = DISAS_UPDATE;
+    s->is_jmp = DISAS_JUMP;
 }
 
 static void gen_nop_hint(DisasContext *s, int val)
@@ -9035,7 +9035,7 @@ static void disas_arm_insn(DisasContext *s, unsigned int insn)
                     tmp = load_cpu_field(spsr);
                     gen_set_cpsr(tmp, CPSR_ERET_MASK);
                     tcg_temp_free_i32(tmp);
-                    s->is_jmp = DISAS_UPDATE;
+                    s->is_jmp = DISAS_JUMP;
                 }
             }
             break;
@@ -11355,7 +11355,7 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
             /* We always get here via a jump, so know we are not in a
                conditional execution block.  */
             gen_exception_internal(EXCP_KERNEL_TRAP);
-            dc->is_jmp = DISAS_UPDATE;
+            dc->is_jmp = DISAS_EXC;
             break;
         }
 #else
@@ -11363,7 +11363,7 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
             /* We always get here via a jump, so know we are not in a
                conditional execution block.  */
             gen_exception_internal(EXCP_EXCEPTION_EXIT);
-            dc->is_jmp = DISAS_UPDATE;
+            dc->is_jmp = DISAS_EXC;
             break;
         }
 #endif
@@ -11373,6 +11373,7 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
             QTAILQ_FOREACH(bp, &cs->breakpoints, entry) {
                 if (bp->pc == dc->pc) {
                     if (bp->flags & BP_CPU) {
+                        gen_set_pc_im(dc, dc->pc);
                         gen_helper_check_breakpoints(cpu_env);
                         /* End the TB early; it's likely not going to be executed */
                         dc->is_jmp = DISAS_UPDATE;
@@ -11497,7 +11498,8 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
             }
             gen_set_label(dc->condlabel);
         }
-        if (dc->condjmp || !dc->is_jmp) {
+        if (dc->condjmp || dc->is_jmp == DISAS_NEXT ||
+            dc->is_jmp == DISAS_UPDATE) {
             gen_set_pc_im(dc, dc->pc);
             dc->condjmp = 0;
         }
@@ -11533,9 +11535,11 @@ void gen_intermediate_code(CPUARMState *env, TranslationBlock *tb)
         case DISAS_NEXT:
             gen_goto_tb(dc, 1, dc->pc);
             break;
-        default:
-        case DISAS_JUMP:
         case DISAS_UPDATE:
+            gen_set_pc_im(dc, dc->pc);
+            /* fall through */
+        case DISAS_JUMP:
+        default:
             /* indicate that the hash table must be used to find the next TB */
             tcg_gen_exit_tb(0);
             break;
diff --git a/target-i386/cpu.c b/target-i386/cpu.c
index 9280bfc..e5f1c5b 100644
--- a/target-i386/cpu.c
+++ b/target-i386/cpu.c
@@ -259,8 +259,8 @@ static const char *svm_feature_name[] = {
 static const char *cpuid_7_0_ebx_feature_name[] = {
     "fsgsbase", "tsc_adjust", NULL, "bmi1", "hle", "avx2", NULL, "smep",
     "bmi2", "erms", "invpcid", "rtm", NULL, NULL, "mpx", NULL,
-    "avx512f", NULL, "rdseed", "adx", "smap", NULL, NULL, NULL,
-    NULL, NULL, "avx512pf", "avx512er", "avx512cd", NULL, NULL, NULL,
+    "avx512f", NULL, "rdseed", "adx", "smap", NULL, "pcommit", "clflushopt",
+    "clwb", NULL, "avx512pf", "avx512er", "avx512cd", NULL, NULL, NULL,
 };
 
 static const char *cpuid_apm_edx_feature_name[] = {
@@ -345,7 +345,9 @@ static const char *cpuid_6_feature_name[] = {
 #define TCG_SVM_FEATURES 0
 #define TCG_KVM_FEATURES 0
 #define TCG_7_0_EBX_FEATURES (CPUID_7_0_EBX_SMEP | CPUID_7_0_EBX_SMAP | \
-          CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX)
+          CPUID_7_0_EBX_BMI1 | CPUID_7_0_EBX_BMI2 | CPUID_7_0_EBX_ADX | \
+          CPUID_7_0_EBX_PCOMMIT | CPUID_7_0_EBX_CLFLUSHOPT |            \
+          CPUID_7_0_EBX_CLWB)
           /* missing:
           CPUID_7_0_EBX_FSGSBASE, CPUID_7_0_EBX_HLE, CPUID_7_0_EBX_AVX2,
           CPUID_7_0_EBX_ERMS, CPUID_7_0_EBX_INVPCID, CPUID_7_0_EBX_RTM,
@@ -671,12 +673,11 @@ static X86CPUDefinition builtin_x86_defs[] = {
             CPUID_MTRR | CPUID_CLFLUSH | CPUID_MCA |
             CPUID_PSE36,
         .features[FEAT_1_ECX] =
-            CPUID_EXT_SSE3 | CPUID_EXT_CX16 | CPUID_EXT_POPCNT,
+            CPUID_EXT_SSE3 | CPUID_EXT_CX16,
         .features[FEAT_8000_0001_EDX] =
             CPUID_EXT2_LM | CPUID_EXT2_SYSCALL | CPUID_EXT2_NX,
         .features[FEAT_8000_0001_ECX] =
-            CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM |
-            CPUID_EXT3_ABM | CPUID_EXT3_SSE4A,
+            CPUID_EXT3_LAHF_LM | CPUID_EXT3_SVM,
         .xlevel = 0x8000000A,
     },
     {
@@ -772,7 +773,7 @@ static X86CPUDefinition builtin_x86_defs[] = {
         .features[FEAT_1_EDX] =
             PPRO_FEATURES,
         .features[FEAT_1_ECX] =
-            CPUID_EXT_SSE3 | CPUID_EXT_POPCNT,
+            CPUID_EXT_SSE3,
         .xlevel = 0x80000004,
     },
     {
@@ -2243,7 +2244,7 @@ void x86_cpudef_setup(void)
                 pstrcpy(def->model_id, sizeof(def->model_id),
                         "QEMU Virtual CPU version ");
                 pstrcat(def->model_id, sizeof(def->model_id),
-                        qemu_get_version());
+                        qemu_hw_version());
                 break;
             }
         }
diff --git a/target-i386/cpu.h b/target-i386/cpu.h
index 62f7879..fc4a605 100644
--- a/target-i386/cpu.h
+++ b/target-i386/cpu.h
@@ -576,6 +576,9 @@ typedef uint32_t FeatureWordArray[FEATURE_WORDS];
 #define CPUID_7_0_EBX_RDSEED   (1U << 18)
 #define CPUID_7_0_EBX_ADX      (1U << 19)
 #define CPUID_7_0_EBX_SMAP     (1U << 20)
+#define CPUID_7_0_EBX_PCOMMIT  (1U << 22) /* Persistent Commit */
+#define CPUID_7_0_EBX_CLFLUSHOPT (1U << 23) /* Flush a Cache Line Optimized */
+#define CPUID_7_0_EBX_CLWB     (1U << 24) /* Cache Line Write Back */
 #define CPUID_7_0_EBX_AVX512PF (1U << 26) /* AVX-512 Prefetch */
 #define CPUID_7_0_EBX_AVX512ER (1U << 27) /* AVX-512 Exponential and Reciprocal */
 #define CPUID_7_0_EBX_AVX512CD (1U << 28) /* AVX-512 Conflict Detection */
diff --git a/target-i386/kvm.c b/target-i386/kvm.c
index 64046cb..2a9953b 100644
--- a/target-i386/kvm.c
+++ b/target-i386/kvm.c
@@ -111,6 +111,51 @@ bool kvm_allows_irq0_override(void)
     return !kvm_irqchip_in_kernel() || kvm_has_gsi_routing();
 }
 
+static int kvm_get_tsc(CPUState *cs)
+{
+    X86CPU *cpu = X86_CPU(cs);
+    CPUX86State *env = &cpu->env;
+    struct {
+        struct kvm_msrs info;
+        struct kvm_msr_entry entries[1];
+    } msr_data;
+    int ret;
+
+    if (env->tsc_valid) {
+        return 0;
+    }
+
+    msr_data.info.nmsrs = 1;
+    msr_data.entries[0].index = MSR_IA32_TSC;
+    env->tsc_valid = !runstate_is_running();
+
+    ret = kvm_vcpu_ioctl(CPU(cpu), KVM_GET_MSRS, &msr_data);
+    if (ret < 0) {
+        return ret;
+    }
+
+    env->tsc = msr_data.entries[0].data;
+    return 0;
+}
+
+static inline void do_kvm_synchronize_tsc(void *arg)
+{
+    CPUState *cpu = arg;
+
+    kvm_get_tsc(cpu);
+}
+
+void kvm_synchronize_all_tsc(void)
+{
+    CPUState *cpu;
+
+    if (kvm_enabled()) {
+        CPU_FOREACH(cpu) {
+            run_on_cpu(cpu, do_kvm_synchronize_tsc, cpu);
+        }
+    }
+}
+
 static struct kvm_cpuid2 *try_get_cpuid(KVMState *s, int max)
 {
     struct kvm_cpuid2 *cpuid;
diff --git a/target-i386/kvm_i386.h b/target-i386/kvm_i386.h
index e557e94..c1b312b 100644
--- a/target-i386/kvm_i386.h
+++ b/target-i386/kvm_i386.h
@@ -15,6 +15,7 @@
 
 bool kvm_allows_irq0_override(void);
 bool kvm_has_smm(void);
+void kvm_synchronize_all_tsc(void);
 void kvm_arch_reset_vcpu(X86CPU *cs);
 void kvm_arch_do_init_vcpu(X86CPU *cs);
 
diff --git a/target-i386/ops_sse.h b/target-i386/ops_sse.h
index 7aa693a..1780d1d 100644
--- a/target-i386/ops_sse.h
+++ b/target-i386/ops_sse.h
@@ -2037,10 +2037,10 @@ static inline unsigned pcmpxstrx(CPUX86State *env, Reg *d, Reg *s,
         }
         break;
     case 3:
-        for (j = valids - validd; j >= 0; j--) {
+        for (j = valids; j >= 0; j--) {
             res <<= 1;
             v = 1;
-            for (i = MIN(upper - j, validd); i >= 0; i--) {
+            for (i = MIN(valids - j, validd); i >= 0; i--) {
                 v &= (pcmp_val(s, ctrl, i + j) == pcmp_val(d, ctrl, i));
             }
             res |= v;
diff --git a/target-i386/translate.c b/target-i386/translate.c
index b400d24..fbe4f80 100644
--- a/target-i386/translate.c
+++ b/target-i386/translate.c
@@ -7716,20 +7716,43 @@ static target_ulong disas_insn(CPUX86State *env, DisasContext *s,
             }
             break;
         case 5: /* lfence */
-        case 6: /* mfence */
             if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE2))
                 goto illegal_op;
             break;
-        case 7: /* sfence / clflush */
-            if ((modrm & 0xc7) == 0xc0) {
-                /* sfence */
-                /* XXX: also check for cpuid_ext2_features & CPUID_EXT2_EMMX */
-                if (!(s->cpuid_features & CPUID_SSE))
+        case 6: /* mfence/clwb */
+            if (s->prefix & PREFIX_DATA) {
+                /* clwb */
+                if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLWB))
                     goto illegal_op;
+                gen_nop_modrm(env, s, modrm);
             } else {
-                /* clflush */
-                if (!(s->cpuid_features & CPUID_CLFLUSH))
+                /* mfence */
+                if ((modrm & 0xc7) != 0xc0 || !(s->cpuid_features & CPUID_SSE2))
                     goto illegal_op;
+            }
+            break;
+        case 7: /* sfence / clflush / clflushopt / pcommit */
+            if ((modrm & 0xc7) == 0xc0) {
+                if (s->prefix & PREFIX_DATA) {
+                    /* pcommit */
+                    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_PCOMMIT))
+                        goto illegal_op;
+                } else {
+                    /* sfence */
+                    /* XXX: also check for cpuid_ext2_features & CPUID_EXT2_EMMX */
+                    if (!(s->cpuid_features & CPUID_SSE))
+                        goto illegal_op;
+                }
+            } else {
+                if (s->prefix & PREFIX_DATA) {
+                    /* clflushopt */
+                    if (!(s->cpuid_7_0_ebx_features & CPUID_7_0_EBX_CLFLUSHOPT))
+                        goto illegal_op;
+                } else {
+                    /* clflush */
+                    if (!(s->cpuid_features & CPUID_CLFLUSH))
+                        goto illegal_op;
+                }
                 gen_lea_modrm(env, s, modrm);
             }
             break;
diff --git a/target-ppc/cpu-qom.h b/target-ppc/cpu-qom.h
index 6967a80..bc20504 100644
--- a/target-ppc/cpu-qom.h
+++ b/target-ppc/cpu-qom.h
@@ -118,6 +118,8 @@ void ppc_cpu_dump_state(CPUState *cpu, FILE *f, fprintf_function cpu_fprintf,
                         int flags);
 void ppc_cpu_dump_statistics(CPUState *cpu, FILE *f,
                              fprintf_function cpu_fprintf, int flags);
+int ppc_cpu_get_monitor_def(CPUState *cs, const char *name,
+                            uint64_t *pval);
 hwaddr ppc_cpu_get_phys_page_debug(CPUState *cpu, vaddr addr);
 int ppc_cpu_gdb_read_register(CPUState *cpu, uint8_t *buf, int reg);
 int ppc_cpu_gdb_read_register_apple(CPUState *cpu, uint8_t *buf, int reg);
diff --git a/target-ppc/cpu.h b/target-ppc/cpu.h
index b34aed6..31c6fee 100644
--- a/target-ppc/cpu.h
+++ b/target-ppc/cpu.h
@@ -122,9 +122,15 @@ enum powerpc_mmu_t {
     /* Architecture 2.06 variant                               */
     POWERPC_MMU_2_06       = POWERPC_MMU_64 | POWERPC_MMU_1TSEG
                              | POWERPC_MMU_AMR | 0x00000003,
+    /* Architecture 2.06 "degraded" (no 1T segments)           */
+    POWERPC_MMU_2_06a      = POWERPC_MMU_64 | POWERPC_MMU_AMR
+                             | 0x00000003,
     /* Architecture 2.07 variant                               */
     POWERPC_MMU_2_07       = POWERPC_MMU_64 | POWERPC_MMU_1TSEG
                              | POWERPC_MMU_AMR | 0x00000004,
+    /* Architecture 2.07 "degraded" (no 1T segments)           */
+    POWERPC_MMU_2_07a      = POWERPC_MMU_64 | POWERPC_MMU_AMR
+                             | 0x00000004,
 #endif /* defined(TARGET_PPC64) */
 };
 
diff --git a/target-ppc/kvm_ppc.h b/target-ppc/kvm_ppc.h
index 309cbe0..5e1333d 100644
--- a/target-ppc/kvm_ppc.h
+++ b/target-ppc/kvm_ppc.h
@@ -180,7 +180,7 @@ static inline int kvmppc_remove_spapr_tce(void *table, int pfd,
 
 static inline int kvmppc_reset_htab(int shift_hint)
 {
-    return -1;
+    return 0;
 }
 
 static inline uint64_t kvmppc_rma_size(uint64_t current_size,
diff --git a/target-ppc/mem_helper.c b/target-ppc/mem_helper.c
index 6d37dae..7e1f234 100644
--- a/target-ppc/mem_helper.c
+++ b/target-ppc/mem_helper.c
@@ -100,8 +100,9 @@ void helper_lswx(CPUPPCState *env, target_ulong addr, uint32_t reg,
                  uint32_t ra, uint32_t rb)
 {
     if (likely(xer_bc != 0)) {
-        if (unlikely((ra != 0 && reg < ra && (reg + xer_bc) > ra) ||
-                     (reg < rb && (reg + xer_bc) > rb))) {
+        int num_used_regs = (xer_bc + 3) / 4;
+        if (unlikely((ra != 0 && reg < ra && (reg + num_used_regs) > ra) ||
+                     (reg < rb && (reg + num_used_regs) > rb))) {
             helper_raise_exception_err(env, POWERPC_EXCP_PROGRAM,
                                        POWERPC_EXCP_INVAL |
                                        POWERPC_EXCP_INVAL_LSWX);
diff --git a/target-ppc/mmu_helper.c b/target-ppc/mmu_helper.c
index e52d0e5..30298d8 100644
--- a/target-ppc/mmu_helper.c
+++ b/target-ppc/mmu_helper.c
@@ -1295,7 +1295,9 @@ void dump_mmu(FILE *f, fprintf_function cpu_fprintf, CPUPPCState *env)
     case POWERPC_MMU_64B:
     case POWERPC_MMU_2_03:
     case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06a:
     case POWERPC_MMU_2_07:
+    case POWERPC_MMU_2_07a:
         dump_slb(f, cpu_fprintf, env);
         break;
 #endif
@@ -1435,7 +1437,9 @@ hwaddr ppc_cpu_get_phys_page_debug(CPUState *cs, vaddr addr)
     case POWERPC_MMU_64B:
     case POWERPC_MMU_2_03:
     case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06a:
     case POWERPC_MMU_2_07:
+    case POWERPC_MMU_2_07a:
         return ppc_hash64_get_phys_page_debug(env, addr);
 #endif
 
@@ -1939,7 +1943,9 @@ void ppc_tlb_invalidate_all(CPUPPCState *env)
     case POWERPC_MMU_64B:
     case POWERPC_MMU_2_03:
     case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06a:
     case POWERPC_MMU_2_07:
+    case POWERPC_MMU_2_07a:
 #endif /* defined(TARGET_PPC64) */
         tlb_flush(CPU(cpu), 1);
         break;
@@ -2013,7 +2019,9 @@ void ppc_tlb_invalidate_one(CPUPPCState *env, target_ulong addr)
     case POWERPC_MMU_64B:
     case POWERPC_MMU_2_03:
     case POWERPC_MMU_2_06:
+    case POWERPC_MMU_2_06a:
     case POWERPC_MMU_2_07:
+    case POWERPC_MMU_2_07a:
         /* tlbie invalidate TLBs for all segments */
         /* XXX: given the fact that there are too many segments to invalidate,
          *      and we still don't have a tlb_flush_mask(env, n, mask) in QEMU,
diff --git a/target-ppc/monitor.c b/target-ppc/monitor.c
index 9cb1fe9..bc571b8 100644
--- a/target-ppc/monitor.c
+++ b/target-ppc/monitor.c
@@ -39,18 +39,6 @@ static target_long monitor_get_ccr (const struct MonitorDef *md, int val)
     return u;
 }
 
-static target_long monitor_get_msr (const struct MonitorDef *md, int val)
-{
-    CPUArchState *env = mon_get_cpu_env();
-    return env->msr;
-}
-
-static target_long monitor_get_xer (const struct MonitorDef *md, int val)
-{
-    CPUArchState *env = mon_get_cpu_env();
-    return env->xer;
-}
-
 static target_long monitor_get_decr (const struct MonitorDef *md, int val)
 {
     CPUArchState *env = mon_get_cpu_env();
@@ -76,176 +64,19 @@ void hmp_info_tlb(Monitor *mon, const QDict *qdict)
     dump_mmu((FILE*)mon, (fprintf_function)monitor_printf, env1);
 }
 
-
 const MonitorDef monitor_defs[] = {
-    /* General purpose registers */
-    { "r0", offsetof(CPUPPCState, gpr[0]) },
-    { "r1", offsetof(CPUPPCState, gpr[1]) },
-    { "r2", offsetof(CPUPPCState, gpr[2]) },
-    { "r3", offsetof(CPUPPCState, gpr[3]) },
-    { "r4", offsetof(CPUPPCState, gpr[4]) },
-    { "r5", offsetof(CPUPPCState, gpr[5]) },
-    { "r6", offsetof(CPUPPCState, gpr[6]) },
-    { "r7", offsetof(CPUPPCState, gpr[7]) },
-    { "r8", offsetof(CPUPPCState, gpr[8]) },
-    { "r9", offsetof(CPUPPCState, gpr[9]) },
-    { "r10", offsetof(CPUPPCState, gpr[10]) },
-    { "r11", offsetof(CPUPPCState, gpr[11]) },
-    { "r12", offsetof(CPUPPCState, gpr[12]) },
-    { "r13", offsetof(CPUPPCState, gpr[13]) },
-    { "r14", offsetof(CPUPPCState, gpr[14]) },
-    { "r15", offsetof(CPUPPCState, gpr[15]) },
-    { "r16", offsetof(CPUPPCState, gpr[16]) },
-    { "r17", offsetof(CPUPPCState, gpr[17]) },
-    { "r18", offsetof(CPUPPCState, gpr[18]) },
-    { "r19", offsetof(CPUPPCState, gpr[19]) },
-    { "r20", offsetof(CPUPPCState, gpr[20]) },
-    { "r21", offsetof(CPUPPCState, gpr[21]) },
-    { "r22", offsetof(CPUPPCState, gpr[22]) },
-    { "r23", offsetof(CPUPPCState, gpr[23]) },
-    { "r24", offsetof(CPUPPCState, gpr[24]) },
-    { "r25", offsetof(CPUPPCState, gpr[25]) },
-    { "r26", offsetof(CPUPPCState, gpr[26]) },
-    { "r27", offsetof(CPUPPCState, gpr[27]) },
-    { "r28", offsetof(CPUPPCState, gpr[28]) },
-    { "r29", offsetof(CPUPPCState, gpr[29]) },
-    { "r30", offsetof(CPUPPCState, gpr[30]) },
-    { "r31", offsetof(CPUPPCState, gpr[31]) },
-    /* Floating point registers */
-    { "f0", offsetof(CPUPPCState, fpr[0]) },
-    { "f1", offsetof(CPUPPCState, fpr[1]) },
-    { "f2", offsetof(CPUPPCState, fpr[2]) },
-    { "f3", offsetof(CPUPPCState, fpr[3]) },
-    { "f4", offsetof(CPUPPCState, fpr[4]) },
-    { "f5", offsetof(CPUPPCState, fpr[5]) },
-    { "f6", offsetof(CPUPPCState, fpr[6]) },
-    { "f7", offsetof(CPUPPCState, fpr[7]) },
-    { "f8", offsetof(CPUPPCState, fpr[8]) },
-    { "f9", offsetof(CPUPPCState, fpr[9]) },
-    { "f10", offsetof(CPUPPCState, fpr[10]) },
-    { "f11", offsetof(CPUPPCState, fpr[11]) },
-    { "f12", offsetof(CPUPPCState, fpr[12]) },
-    { "f13", offsetof(CPUPPCState, fpr[13]) },
-    { "f14", offsetof(CPUPPCState, fpr[14]) },
-    { "f15", offsetof(CPUPPCState, fpr[15]) },
-    { "f16", offsetof(CPUPPCState, fpr[16]) },
-    { "f17", offsetof(CPUPPCState, fpr[17]) },
-    { "f18", offsetof(CPUPPCState, fpr[18]) },
-    { "f19", offsetof(CPUPPCState, fpr[19]) },
-    { "f20", offsetof(CPUPPCState, fpr[20]) },
-    { "f21", offsetof(CPUPPCState, fpr[21]) },
-    { "f22", offsetof(CPUPPCState, fpr[22]) },
-    { "f23", offsetof(CPUPPCState, fpr[23]) },
-    { "f24", offsetof(CPUPPCState, fpr[24]) },
-    { "f25", offsetof(CPUPPCState, fpr[25]) },
-    { "f26", offsetof(CPUPPCState, fpr[26]) },
-    { "f27", offsetof(CPUPPCState, fpr[27]) },
-    { "f28", offsetof(CPUPPCState, fpr[28]) },
-    { "f29", offsetof(CPUPPCState, fpr[29]) },
-    { "f30", offsetof(CPUPPCState, fpr[30]) },
-    { "f31", offsetof(CPUPPCState, fpr[31]) },
     { "fpscr", offsetof(CPUPPCState, fpscr) },
     /* Next instruction pointer */
     { "nip|pc", offsetof(CPUPPCState, nip) },
     { "lr", offsetof(CPUPPCState, lr) },
     { "ctr", offsetof(CPUPPCState, ctr) },
     { "decr", 0, &monitor_get_decr, },
-    { "ccr", 0, &monitor_get_ccr, },
+    { "ccr|cr", 0, &monitor_get_ccr, },
     /* Machine state register */
-    { "msr", 0, &monitor_get_msr, },
-    { "xer", 0, &monitor_get_xer, },
+    { "xer", offsetof(CPUPPCState, xer) },
+    { "msr", offsetof(CPUPPCState, msr) },
     { "tbu", 0, &monitor_get_tbu, },
     { "tbl", 0, &monitor_get_tbl, },
-    /* Segment registers */
-    { "sdr1", offsetof(CPUPPCState, spr[SPR_SDR1]) },
-    { "sr0", offsetof(CPUPPCState, sr[0]) },
-    { "sr1", offsetof(CPUPPCState, sr[1]) },
-    { "sr2", offsetof(CPUPPCState, sr[2]) },
-    { "sr3", offsetof(CPUPPCState, sr[3]) },
-    { "sr4", offsetof(CPUPPCState, sr[4]) },
-    { "sr5", offsetof(CPUPPCState, sr[5]) },
-    { "sr6", offsetof(CPUPPCState, sr[6]) },
-    { "sr7", offsetof(CPUPPCState, sr[7]) },
-    { "sr8", offsetof(CPUPPCState, sr[8]) },
-    { "sr9", offsetof(CPUPPCState, sr[9]) },
-    { "sr10", offsetof(CPUPPCState, sr[10]) },
-    { "sr11", offsetof(CPUPPCState, sr[11]) },
-    { "sr12", offsetof(CPUPPCState, sr[12]) },
-    { "sr13", offsetof(CPUPPCState, sr[13]) },
-    { "sr14", offsetof(CPUPPCState, sr[14]) },
-    { "sr15", offsetof(CPUPPCState, sr[15]) },
-    /* Too lazy to put BATs... */
-    { "pvr", offsetof(CPUPPCState, spr[SPR_PVR]) },
-
-    { "srr0", offsetof(CPUPPCState, spr[SPR_SRR0]) },
-    { "srr1", offsetof(CPUPPCState, spr[SPR_SRR1]) },
-    { "dar", offsetof(CPUPPCState, spr[SPR_DAR]) },
-    { "dsisr", offsetof(CPUPPCState, spr[SPR_DSISR]) },
-    { "cfar", offsetof(CPUPPCState, spr[SPR_CFAR]) },
-    { "sprg0", offsetof(CPUPPCState, spr[SPR_SPRG0]) },
-    { "sprg1", offsetof(CPUPPCState, spr[SPR_SPRG1]) },
-    { "sprg2", offsetof(CPUPPCState, spr[SPR_SPRG2]) },
-    { "sprg3", offsetof(CPUPPCState, spr[SPR_SPRG3]) },
-    { "sprg4", offsetof(CPUPPCState, spr[SPR_SPRG4]) },
-    { "sprg5", offsetof(CPUPPCState, spr[SPR_SPRG5]) },
-    { "sprg6", offsetof(CPUPPCState, spr[SPR_SPRG6]) },
-    { "sprg7", offsetof(CPUPPCState, spr[SPR_SPRG7]) },
-    { "pid", offsetof(CPUPPCState, spr[SPR_BOOKE_PID]) },
-    { "csrr0", offsetof(CPUPPCState, spr[SPR_BOOKE_CSRR0]) },
-    { "csrr1", offsetof(CPUPPCState, spr[SPR_BOOKE_CSRR1]) },
-    { "esr", offsetof(CPUPPCState, spr[SPR_BOOKE_ESR]) },
-    { "dear", offsetof(CPUPPCState, spr[SPR_BOOKE_DEAR]) },
-    { "mcsr", offsetof(CPUPPCState, spr[SPR_BOOKE_MCSR]) },
-    { "tsr", offsetof(CPUPPCState, spr[SPR_BOOKE_TSR]) },
-    { "tcr", offsetof(CPUPPCState, spr[SPR_BOOKE_TCR]) },
-    { "vrsave", offsetof(CPUPPCState, spr[SPR_VRSAVE]) },
-    { "pir", offsetof(CPUPPCState, spr[SPR_BOOKE_PIR]) },
-    { "mcsrr0", offsetof(CPUPPCState, spr[SPR_BOOKE_MCSRR0]) },
-    { "mcsrr1", offsetof(CPUPPCState, spr[SPR_BOOKE_MCSRR1]) },
-    { "decar", offsetof(CPUPPCState, spr[SPR_BOOKE_DECAR]) },
-    { "ivpr", offsetof(CPUPPCState, spr[SPR_BOOKE_IVPR]) },
-    { "epcr", offsetof(CPUPPCState, spr[SPR_BOOKE_EPCR]) },
-    { "sprg8", offsetof(CPUPPCState, spr[SPR_BOOKE_SPRG8]) },
-    { "ivor0", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR0]) },
-    { "ivor1", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR1]) },
-    { "ivor2", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR2]) },
-    { "ivor3", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR3]) },
-    { "ivor4", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR4]) },
-    { "ivor5", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR5]) },
-    { "ivor6", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR6]) },
-    { "ivor7", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR7]) },
-    { "ivor8", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR8]) },
-    { "ivor9", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR9]) },
-    { "ivor10", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR10]) },
-    { "ivor11", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR11]) },
-    { "ivor12", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR12]) },
-    { "ivor13", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR13]) },
-    { "ivor14", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR14]) },
-    { "ivor15", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR15]) },
-    { "ivor32", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR32]) },
-    { "ivor33", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR33]) },
-    { "ivor34", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR34]) },
-    { "ivor35", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR35]) },
-    { "ivor36", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR36]) },
-    { "ivor37", offsetof(CPUPPCState, spr[SPR_BOOKE_IVOR37]) },
-    { "mas0", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS0]) },
-    { "mas1", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS1]) },
-    { "mas2", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS2]) },
-    { "mas3", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS3]) },
-    { "mas4", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS4]) },
-    { "mas6", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS6]) },
-    { "mas7", offsetof(CPUPPCState, spr[SPR_BOOKE_MAS7]) },
-    { "mmucfg", offsetof(CPUPPCState, spr[SPR_MMUCFG]) },
-    { "tlb0cfg", offsetof(CPUPPCState, spr[SPR_BOOKE_TLB0CFG]) },
-    { "tlb1cfg", offsetof(CPUPPCState, spr[SPR_BOOKE_TLB1CFG]) },
-    { "epr", offsetof(CPUPPCState, spr[SPR_BOOKE_EPR]) },
-    { "eplc", offsetof(CPUPPCState, spr[SPR_BOOKE_EPLC]) },
-    { "epsc", offsetof(CPUPPCState, spr[SPR_BOOKE_EPSC]) },
-    { "svr", offsetof(CPUPPCState, spr[SPR_E500_SVR]) },
-    { "mcar", offsetof(CPUPPCState, spr[SPR_Exxx_MCAR]) },
-    { "pid1", offsetof(CPUPPCState, spr[SPR_BOOKE_PID1]) },
-    { "pid2", offsetof(CPUPPCState, spr[SPR_BOOKE_PID2]) },
-    { "hid0", offsetof(CPUPPCState, spr[SPR_HID0]) },
     { NULL },
 };
 
@@ -253,3 +84,63 @@ const MonitorDef *target_monitor_defs(void)
 {
     return monitor_defs;
 }
+
+static int ppc_cpu_get_reg_num(const char *numstr, int maxnum, int *pregnum)
+{
+    int regnum;
+    char *endptr = NULL;
+
+    if (!*numstr) {
+        return false;
+    }
+
+    regnum = strtoul(numstr, &endptr, 10);
+    if (*endptr || (regnum >= maxnum)) {
+        return false;
+    }
+    *pregnum = regnum;
+
+    return true;
+}
+
+int target_get_monitor_def(CPUState *cs, const char *name, uint64_t *pval)
+{
+    int i, regnum;
+    PowerPCCPU *cpu = POWERPC_CPU(cs);
+    CPUPPCState *env = &cpu->env;
+
+    /* General purpose registers */
+    if ((tolower(name[0]) == 'r') &&
+        ppc_cpu_get_reg_num(name + 1, ARRAY_SIZE(env->gpr), &regnum)) {
+        *pval = env->gpr[regnum];
+        return 0;
+    }
+
+    /* Floating point registers */
+    if ((tolower(name[0]) == 'f') &&
+        ppc_cpu_get_reg_num(name + 1, ARRAY_SIZE(env->fpr), &regnum)) {
+        *pval = env->fpr[regnum];
+        return 0;
+    }
+
+    /* Special purpose registers */
+    for (i = 0; i < ARRAY_SIZE(env->spr_cb); ++i) {
+        ppc_spr_t *spr = &env->spr_cb[i];
+
+        if (spr->name && (strcasecmp(name, spr->name) == 0)) {
+            *pval = env->spr[i];
+            return 0;
+        }
+    }
+
+    /* Segment registers */
+#if !defined(CONFIG_USER_ONLY)
+    if ((strncasecmp(name, "sr", 2) == 0) &&
+        ppc_cpu_get_reg_num(name + 2, ARRAY_SIZE(env->sr), &regnum)) {
+        *pval = env->sr[regnum];
+        return 0;
+    }
+#endif
+
+    return -EINVAL;
+}
diff --git a/target-ppc/translate.c b/target-ppc/translate.c
index 308ad68..41a7258 100644
--- a/target-ppc/translate.c
+++ b/target-ppc/translate.c
@@ -9884,7 +9884,7 @@ GEN_HANDLER(mtcrf, 0x1F, 0x10, 0x04, 0x00000801, PPC_MISC),
 GEN_HANDLER(mtmsrd, 0x1F, 0x12, 0x05, 0x001EF801, PPC_64B),
 #endif
 GEN_HANDLER(mtmsr, 0x1F, 0x12, 0x04, 0x001FF801, PPC_MISC),
-GEN_HANDLER(mtspr, 0x1F, 0x13, 0x0E, 0x00000001, PPC_MISC),
+GEN_HANDLER(mtspr, 0x1F, 0x13, 0x0E, 0x00000000, PPC_MISC),
 GEN_HANDLER(dcbf, 0x1F, 0x16, 0x02, 0x03C00001, PPC_CACHE),
 GEN_HANDLER(dcbi, 0x1F, 0x16, 0x0E, 0x03E00001, PPC_CACHE),
 GEN_HANDLER(dcbst, 0x1F, 0x16, 0x01, 0x03E00001, PPC_CACHE),
diff --git a/target-ppc/translate_init.c b/target-ppc/translate_init.c
index 4934c80..e88dc7f 100644
--- a/target-ppc/translate_init.c
+++ b/target-ppc/translate_init.c
@@ -305,7 +305,7 @@ static void spr_read_ibat (DisasContext *ctx, int gprn, int sprn)
 
 static void spr_read_ibat_h (DisasContext *ctx, int gprn, int sprn)
 {
-    tcg_gen_ld_tl(cpu_gpr[gprn], cpu_env, offsetof(CPUPPCState, IBAT[sprn & 1][(sprn - SPR_IBAT4U) / 2]));
+    tcg_gen_ld_tl(cpu_gpr[gprn], cpu_env, offsetof(CPUPPCState, IBAT[sprn & 1][((sprn - SPR_IBAT4U) / 2) + 4]));
 }
 
 static void spr_write_ibatu (DisasContext *ctx, int sprn, int gprn)
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index c3be180..75a0e5d 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -258,7 +258,9 @@ int kvm_arch_init(MachineState *ms, KVMState *s)
     cap_mem_op = kvm_check_extension(s, KVM_CAP_S390_MEM_OP);
     cap_s390_irq = kvm_check_extension(s, KVM_CAP_S390_INJECT_IRQ);
 
-    kvm_s390_enable_cmma(s);
+    if (!mem_path) {
+        kvm_s390_enable_cmma(s);
+    }
 
     if (!kvm_check_extension(s, KVM_CAP_S390_GMAP)
         || !kvm_check_extension(s, KVM_CAP_S390_COW)) {
diff --git a/tests/.gitignore b/tests/.gitignore
index 65496aa..1e55722 100644
--- a/tests/.gitignore
+++ b/tests/.gitignore
@@ -9,6 +9,7 @@ check-qom-proplist
 rcutorture
 test-aio
 test-bitops
+test-blockjob-txn
 test-coroutine
 test-crypto-cipher
 test-crypto-hash
@@ -30,6 +31,7 @@ test-qapi-types.[ch]
 test-qapi-visit.[ch]
 test-qdev-global-props
 test-qemu-opts
+test-qga
 test-qmp-commands
 test-qmp-commands.h
 test-qmp-event
@@ -44,6 +46,7 @@ test-string-input-visitor
 test-string-output-visitor
 test-thread-pool
 test-throttle
+test-timed-average
 test-visitor-serialization
 test-vmstate
 test-write-threshold
diff --git a/tests/Makefile b/tests/Makefile
index 92969e8..90c4141 100644
--- a/tests/Makefile
+++ b/tests/Makefile
@@ -47,6 +47,8 @@ check-unit-y += tests/test-thread-pool$(EXESUF)
 gcov-files-test-thread-pool-y = thread-pool.c
 gcov-files-test-hbitmap-y = util/hbitmap.c
 check-unit-y += tests/test-hbitmap$(EXESUF)
+gcov-files-test-hbitmap-y = blockjob.c
+check-unit-y += tests/test-blockjob-txn$(EXESUF)
 check-unit-y += tests/test-x86-cpuid$(EXESUF)
 # all code tested by test-x86-cpuid is inside topology.h
 gcov-files-test-x86-cpuid-y =
@@ -81,6 +83,7 @@ check-unit-y += tests/test-crypto-cipher$(EXESUF)
 check-unit-$(CONFIG_GNUTLS) += tests/test-crypto-tlscredsx509$(EXESUF)
 check-unit-$(CONFIG_GNUTLS) += tests/test-crypto-tlssession$(EXESUF)
 check-unit-$(CONFIG_LINUX) += tests/test-qga$(EXESUF)
+check-unit-y += tests/test-timed-average$(EXESUF)
 
 check-block-$(CONFIG_POSIX) += tests/qemu-iotests-quick.sh
 
@@ -390,6 +393,7 @@ tests/test-coroutine$(EXESUF): tests/test-coroutine.o $(test-block-obj-y)
 tests/test-aio$(EXESUF): tests/test-aio.o $(test-block-obj-y)
 tests/test-rfifolock$(EXESUF): tests/test-rfifolock.o $(test-util-obj-y)
 tests/test-throttle$(EXESUF): tests/test-throttle.o $(test-block-obj-y)
+tests/test-blockjob-txn$(EXESUF): tests/test-blockjob-txn.o $(test-block-obj-y) $(test-util-obj-y)
 tests/test-thread-pool$(EXESUF): tests/test-thread-pool.o $(test-block-obj-y)
 tests/test-iov$(EXESUF): tests/test-iov.o $(test-util-obj-y)
 tests/test-hbitmap$(EXESUF): tests/test-hbitmap.o $(test-util-obj-y)
@@ -409,6 +413,9 @@ tests/test-vmstate$(EXESUF): tests/test-vmstate.o \
 	migration/vmstate.o migration/qemu-file.o migration/qemu-file-buf.o \
         migration/qemu-file-unix.o qjson.o \
 	$(test-qom-obj-y)
+tests/test-timed-average$(EXESUF): tests/test-timed-average.o qemu-timer.o \
+	libqemuutil.a stubs/clock-warp.o stubs/cpu-get-icount.o \
+	stubs/notify-event.o stubs/replay.o
 
 tests/test-qapi-types.c tests/test-qapi-types.h :\
 $(SRC_PATH)/tests/qapi-schema/qapi-schema-test.json $(SRC_PATH)/scripts/qapi-types.py $(qapi-py)
diff --git a/tests/ahci-test.c b/tests/ahci-test.c
index 59d387c..0888506 100644
--- a/tests/ahci-test.c
+++ b/tests/ahci-test.c
@@ -39,15 +39,17 @@
 #include "hw/pci/pci_ids.h"
 #include "hw/pci/pci_regs.h"
 
-/* Test-specific defines -- in MiB */
-#define TEST_IMAGE_SIZE_MB (200 * 1024)
-#define TEST_IMAGE_SECTORS ((TEST_IMAGE_SIZE_MB / AHCI_SECTOR_SIZE)     \
-                            * 1024 * 1024)
+/* Test images sizes in MB */
+#define TEST_IMAGE_SIZE_MB_LARGE (200 * 1024)
+#define TEST_IMAGE_SIZE_MB_SMALL 64
 
 /*** Globals ***/
 static char tmp_path[] = "/tmp/qtest.XXXXXX";
 static char debug_path[] = "/tmp/qtest-blkdebug.XXXXXX";
+static char mig_socket[] = "/tmp/qtest-migration.XXXXXX";
 static bool ahci_pedantic;
+static const char *imgfmt;
+static unsigned test_image_size_mb;
 
 /*** Function Declarations ***/
 static void ahci_test_port_spec(AHCIQState *ahci, uint8_t port);
@@ -60,6 +62,11 @@ static void ahci_test_pmcap(AHCIQState *ahci, uint8_t offset);
 
 /*** Utilities ***/
 
+static uint64_t mb_to_sectors(uint64_t image_size_mb)
+{
+    return (image_size_mb * 1024 * 1024) / AHCI_SECTOR_SIZE;
+}
+
 static void string_bswap16(uint16_t *s, size_t bytes)
 {
     g_assert_cmphex((bytes & 1), ==, 0);
@@ -114,8 +121,11 @@ static void ahci_migrate(AHCIQState *from, AHCIQState *to, const char *uri)
 {
     QOSState *tmp = to->parent;
     QPCIDevice *dev = to->dev;
+    char *uri_local = NULL;
+
     if (uri == NULL) {
-        uri = "tcp:127.0.0.1:1234";
+        uri_local = g_strdup_printf("%s%s", "unix:", mig_socket);
+        uri = uri_local;
     }
 
     /* context will be 'to' after completion. */
@@ -135,6 +145,7 @@ static void ahci_migrate(AHCIQState *from, AHCIQState *to, const char *uri)
     from->dev = dev;
 
     verify_state(to);
+    g_free(uri_local);
 }
 
 /*** Test Setup & Teardown ***/
@@ -170,11 +181,11 @@ static AHCIQState *ahci_boot(const char *cli, ...)
         va_end(ap);
     } else {
         cli = "-drive if=none,id=drive0,file=%s,cache=writeback,serial=%s"
-            ",format=qcow2"
+            ",format=%s"
             " -M q35 "
             "-device ide-hd,drive=drive0 "
             "-global ide-hd.ver=%s";
-        s = ahci_boot(cli, tmp_path, "testdisk", "version");
+        s = ahci_boot(cli, tmp_path, "testdisk", imgfmt, "version");
     }
 
     return s;
@@ -900,7 +911,7 @@ static void ahci_test_max(AHCIQState *ahci)
     uint64_t nsect;
     uint8_t port;
     uint8_t cmd;
-    uint64_t config_sect = TEST_IMAGE_SECTORS - 1;
+    uint64_t config_sect = mb_to_sectors(test_image_size_mb) - 1;
 
     if (config_sect > 0xFFFFFF) {
         cmd = CMD_READ_MAX_EXT;
@@ -1073,12 +1084,12 @@ static void test_flush_retry(void)
 
     prepare_blkdebug_script(debug_path, "flush_to_disk");
     ahci = ahci_boot_and_enable("-drive file=blkdebug:%s:%s,if=none,id=drive0,"
-                                "format=qcow2,cache=writeback,"
+                                "format=%s,cache=writeback,"
                                 "rerror=stop,werror=stop "
                                 "-M q35 "
                                 "-device ide-hd,drive=drive0 ",
                                 debug_path,
-                                tmp_path);
+                                tmp_path, imgfmt);
 
     /* Issue Flush Command and wait for error */
     port = ahci_port_select(ahci);
@@ -1105,18 +1116,19 @@ static void test_flush_retry(void)
 static void test_migrate_sanity(void)
 {
     AHCIQState *src, *dst;
-    const char *uri = "tcp:127.0.0.1:1234";
+    char *uri = g_strdup_printf("unix:%s", mig_socket);
 
     src = ahci_boot("-m 1024 -M q35 "
-                    "-hda %s ", tmp_path);
+                    "-drive if=ide,file=%s,format=%s ", tmp_path, imgfmt);
     dst = ahci_boot("-m 1024 -M q35 "
-                    "-hda %s "
-                    "-incoming %s", tmp_path, uri);
+                    "-drive if=ide,file=%s,format=%s "
+                    "-incoming %s", tmp_path, imgfmt, uri);
 
     ahci_migrate(src, dst, uri);
 
     ahci_shutdown(src);
     ahci_shutdown(dst);
+    g_free(uri);
 }
 
 /**
@@ -1129,13 +1141,14 @@ static void ahci_migrate_simple(uint8_t cmd_read, uint8_t cmd_write)
     size_t bufsize = 4096;
     unsigned char *tx = g_malloc(bufsize);
     unsigned char *rx = g_malloc0(bufsize);
-    const char *uri = "tcp:127.0.0.1:1234";
+    char *uri = g_strdup_printf("unix:%s", mig_socket);
 
     src = ahci_boot_and_enable("-m 1024 -M q35 "
-                               "-hda %s ", tmp_path);
+                               "-drive if=ide,format=%s,file=%s ",
+                               imgfmt, tmp_path);
     dst = ahci_boot("-m 1024 -M q35 "
-                    "-hda %s "
-                    "-incoming %s", tmp_path, uri);
+                    "-drive if=ide,format=%s,file=%s "
+                    "-incoming %s", imgfmt, tmp_path, uri);
 
     set_context(src->parent);
 
@@ -1158,6 +1171,7 @@ static void ahci_migrate_simple(uint8_t cmd_read, uint8_t cmd_write)
     ahci_shutdown(dst);
     g_free(rx);
     g_free(tx);
+    g_free(uri);
 }
 
 static void test_migrate_dma(void)
@@ -1190,12 +1204,12 @@ static void ahci_halted_io_test(uint8_t cmd_read, uint8_t cmd_write)
     prepare_blkdebug_script(debug_path, "write_aio");
 
     ahci = ahci_boot_and_enable("-drive file=blkdebug:%s:%s,if=none,id=drive0,"
-                                "format=qcow2,cache=writeback,"
+                                "format=%s,cache=writeback,"
                                 "rerror=stop,werror=stop "
                                 "-M q35 "
                                 "-device ide-hd,drive=drive0 ",
                                 debug_path,
-                                tmp_path);
+                                tmp_path, imgfmt);
 
     /* Initialize and prepare */
     port = ahci_port_select(ahci);
@@ -1251,25 +1265,25 @@ static void ahci_migrate_halted_io(uint8_t cmd_read, uint8_t cmd_write)
     unsigned char *rx = g_malloc0(bufsize);
     uint64_t ptr;
     AHCICommand *cmd;
-    const char *uri = "tcp:127.0.0.1:1234";
+    char *uri = g_strdup_printf("unix:%s", mig_socket);
 
     prepare_blkdebug_script(debug_path, "write_aio");
 
     src = ahci_boot_and_enable("-drive file=blkdebug:%s:%s,if=none,id=drive0,"
-                               "format=qcow2,cache=writeback,"
+                               "format=%s,cache=writeback,"
                                "rerror=stop,werror=stop "
                                "-M q35 "
                                "-device ide-hd,drive=drive0 ",
                                debug_path,
-                               tmp_path);
+                               tmp_path, imgfmt);
 
     dst = ahci_boot("-drive file=%s,if=none,id=drive0,"
-                    "format=qcow2,cache=writeback,"
+                    "format=%s,cache=writeback,"
                     "rerror=stop,werror=stop "
                     "-M q35 "
                     "-device ide-hd,drive=drive0 "
                     "-incoming %s",
-                    tmp_path, uri);
+                    tmp_path, imgfmt, uri);
 
     set_context(src->parent);
 
@@ -1301,6 +1315,7 @@ static void ahci_migrate_halted_io(uint8_t cmd_read, uint8_t cmd_write)
     ahci_shutdown(dst);
     g_free(rx);
     g_free(tx);
+    g_free(uri);
 }
 
 static void test_migrate_halted_dma(void)
@@ -1322,20 +1337,22 @@ static void test_flush_migrate(void)
     AHCICommand *cmd;
     uint8_t px;
     const char *s;
-    const char *uri = "tcp:127.0.0.1:1234";
+    char *uri = g_strdup_printf("unix:%s", mig_socket);
 
     prepare_blkdebug_script(debug_path, "flush_to_disk");
 
     src = ahci_boot_and_enable("-drive file=blkdebug:%s:%s,if=none,id=drive0,"
-                               "cache=writeback,rerror=stop,werror=stop "
+                               "cache=writeback,rerror=stop,werror=stop,"
+                               "format=%s "
                                "-M q35 "
                                "-device ide-hd,drive=drive0 ",
-                               debug_path, tmp_path);
+                               debug_path, tmp_path, imgfmt);
     dst = ahci_boot("-drive file=%s,if=none,id=drive0,"
-                    "cache=writeback,rerror=stop,werror=stop "
+                    "cache=writeback,rerror=stop,werror=stop,"
+                    "format=%s "
                     "-M q35 "
                     "-device ide-hd,drive=drive0 "
-                    "-incoming %s", tmp_path, uri);
+                    "-incoming %s", tmp_path, imgfmt, uri);
 
     set_context(src->parent);
 
@@ -1360,6 +1377,7 @@ static void test_flush_migrate(void)
     ahci_command_free(cmd);
     ahci_shutdown(src);
     ahci_shutdown(dst);
+    g_free(uri);
 }
 
 static void test_max(void)
@@ -1476,7 +1494,7 @@ static uint64_t offset_sector(enum OffsetType ofst,
         return 1;
     case OFFSET_HIGH:
         ceil = (addr_type == ADDR_MODE_LBA28) ? 0xfffffff : 0xffffffffffff;
-        ceil = MIN(ceil, TEST_IMAGE_SECTORS - 1);
+        ceil = MIN(ceil, mb_to_sectors(test_image_size_mb) - 1);
         nsectors = buffsize / AHCI_SECTOR_SIZE;
         return ceil - nsectors + 1;
     default:
@@ -1558,8 +1576,9 @@ static void create_ahci_io_test(enum IOMode type, enum AddrMode addr,
                                 enum BuffLen len, enum OffsetType offset)
 {
     char *name;
-    AHCIIOTestOptions *opts = g_malloc(sizeof(AHCIIOTestOptions));
+    AHCIIOTestOptions *opts;
 
+    opts = g_malloc(sizeof(AHCIIOTestOptions));
     opts->length = len;
     opts->address_type = addr;
     opts->io_type = type;
@@ -1571,6 +1590,13 @@ static void create_ahci_io_test(enum IOMode type, enum AddrMode addr,
                            buff_len_str[len],
                            offset_str[offset]);
 
+    if ((addr == ADDR_MODE_LBA48) && (offset == OFFSET_HIGH) &&
+        (mb_to_sectors(test_image_size_mb) <= 0xFFFFFFF)) {
+        g_test_message("%s: skipped; test image too small", name);
+        g_free(name);
+        return;
+    }
+
     qtest_add_data_func(name, opts, test_io_interface);
     g_free(name);
 }
@@ -1617,15 +1643,33 @@ int main(int argc, char **argv)
         return 0;
     }
 
-    /* Create a temporary qcow2 image */
-    close(mkstemp(tmp_path));
-    mkqcow2(tmp_path, TEST_IMAGE_SIZE_MB);
+    /* Create a temporary image */
+    fd = mkstemp(tmp_path);
+    g_assert(fd >= 0);
+    if (have_qemu_img()) {
+        imgfmt = "qcow2";
+        test_image_size_mb = TEST_IMAGE_SIZE_MB_LARGE;
+        mkqcow2(tmp_path, TEST_IMAGE_SIZE_MB_LARGE);
+    } else {
+        g_test_message("QTEST_QEMU_IMG not set or qemu-img missing; "
+                       "skipping LBA48 high-sector tests");
+        imgfmt = "raw";
+        test_image_size_mb = TEST_IMAGE_SIZE_MB_SMALL;
+        ret = ftruncate(fd, test_image_size_mb * 1024 * 1024);
+        g_assert(ret == 0);
+    }
+    close(fd);
 
     /* Create temporary blkdebug instructions */
     fd = mkstemp(debug_path);
     g_assert(fd >= 0);
     close(fd);
 
+    /* Reserve a hollow file to use as a socket for migration tests */
+    fd = mkstemp(mig_socket);
+    g_assert(fd >= 0);
+    close(fd);
+
     /* Run the tests */
     qtest_add_func("/ahci/sanity",     test_sanity);
     qtest_add_func("/ahci/pci_spec",   test_pci_spec);
@@ -1668,6 +1712,7 @@ int main(int argc, char **argv)
     /* Cleanup */
     unlink(tmp_path);
     unlink(debug_path);
+    unlink(mig_socket);
 
     return ret;
 }
diff --git a/tests/i440fx-test.c b/tests/i440fx-test.c
index d0bc8de..7fa1709 100644
--- a/tests/i440fx-test.c
+++ b/tests/i440fx-test.c
@@ -191,7 +191,7 @@ static void write_area(uint32_t start, uint32_t end, uint8_t value)
     uint32_t size = end - start + 1;
     uint8_t *data;
 
-    data = g_malloc0(size);
+    data = g_malloc(size);
     memset(data, value, size);
     memwrite(start, data, size);
 
diff --git a/tests/ivshmem-test.c b/tests/ivshmem-test.c
index c8f0cf0..f1793ba 100644
--- a/tests/ivshmem-test.c
+++ b/tests/ivshmem-test.c
@@ -478,10 +478,12 @@ int main(int argc, char **argv)
     tmpserver = g_strconcat(tmpdir, "/server", NULL);
 
     qtest_add_func("/ivshmem/single", test_ivshmem_single);
-    qtest_add_func("/ivshmem/pair", test_ivshmem_pair);
-    qtest_add_func("/ivshmem/server", test_ivshmem_server);
     qtest_add_func("/ivshmem/hotplug", test_ivshmem_hotplug);
     qtest_add_func("/ivshmem/memdev", test_ivshmem_memdev);
+    if (g_test_slow()) {
+        qtest_add_func("/ivshmem/pair", test_ivshmem_pair);
+        qtest_add_func("/ivshmem/server", test_ivshmem_server);
+    }
 
     ret = g_test_run();
 
diff --git a/tests/libqos/libqos.c b/tests/libqos/libqos.c
index 8d7c5a9..2d1a802 100644
--- a/tests/libqos/libqos.c
+++ b/tests/libqos/libqos.c
@@ -147,6 +147,23 @@ void migrate(QOSState *from, QOSState *to, const char *uri)
     set_context(to);
 }
 
+bool have_qemu_img(void)
+{
+    char *rpath;
+    const char *path = getenv("QTEST_QEMU_IMG");
+    if (!path) {
+        return false;
+    }
+
+    rpath = realpath(path, NULL);
+    if (!rpath) {
+        return false;
+    } else {
+        free(rpath);
+        return true;
+    }
+}
+
 void mkimg(const char *file, const char *fmt, unsigned size_mb)
 {
     gchar *cli;
@@ -155,13 +172,14 @@ void mkimg(const char *file, const char *fmt, unsigned size_mb)
     GError *err = NULL;
     char *qemu_img_path;
     gchar *out, *out2;
-    char *abs_path;
+    char *qemu_img_abs_path;
 
     qemu_img_path = getenv("QTEST_QEMU_IMG");
-    abs_path = realpath(qemu_img_path, NULL);
-    assert(qemu_img_path);
+    g_assert(qemu_img_path);
+    qemu_img_abs_path = realpath(qemu_img_path, NULL);
+    g_assert(qemu_img_abs_path);
 
-    cli = g_strdup_printf("%s create -f %s %s %uM", abs_path,
+    cli = g_strdup_printf("%s create -f %s %s %uM", qemu_img_abs_path,
                           fmt, file, size_mb);
     ret = g_spawn_command_line_sync(cli, &out, &out2, &rc, &err);
     if (err) {
@@ -183,7 +201,7 @@ void mkimg(const char *file, const char *fmt, unsigned size_mb)
     g_free(out);
     g_free(out2);
     g_free(cli);
-    free(abs_path);
+    free(qemu_img_abs_path);
 }
 
 void mkqcow2(const char *file, unsigned size_mb)
diff --git a/tests/libqos/libqos.h b/tests/libqos/libqos.h
index 492a651..ca14d2e 100644
--- a/tests/libqos/libqos.h
+++ b/tests/libqos/libqos.h
@@ -19,6 +19,7 @@ typedef struct QOSState {
 QOSState *qtest_vboot(QOSOps *ops, const char *cmdline_fmt, va_list ap);
 QOSState *qtest_boot(QOSOps *ops, const char *cmdline_fmt, ...);
 void qtest_shutdown(QOSState *qs);
+bool have_qemu_img(void);
 void mkimg(const char *file, const char *fmt, unsigned size_mb);
 void mkqcow2(const char *file, unsigned size_mb);
 void set_context(QOSState *s);
diff --git a/tests/qapi-schema/qapi-schema-test.json b/tests/qapi-schema/qapi-schema-test.json
index 48e104b..44638da 100644
--- a/tests/qapi-schema/qapi-schema-test.json
+++ b/tests/qapi-schema/qapi-schema-test.json
@@ -3,6 +3,9 @@
 # This file is a stress test of supported qapi constructs that must
 # parse and compile correctly.
 
+{ 'struct': 'TestStruct',
+  'data': { 'integer': 'int', 'boolean': 'bool', 'string': 'str' } }
+
 # for testing enums
 { 'struct': 'NestedEnumsOne',
   'data': { 'enum1': 'EnumOne',   # Intentional forward reference
@@ -46,7 +49,8 @@
 
 # dummy struct to force generation of array types not otherwise mentioned
 { 'struct': 'ForceArrays',
-  'data': { 'unused1':['UserDefOne'], 'unused2':['UserDefTwo'] } }
+  'data': { 'unused1':['UserDefOne'], 'unused2':['UserDefTwo'],
+            'unused3':['TestStruct'] } }
 
 # for testing unions
 # Among other things, test that a name collision between branches does
diff --git a/tests/qapi-schema/qapi-schema-test.out b/tests/qapi-schema/qapi-schema-test.out
index a7e9aab..e20a823 100644
--- a/tests/qapi-schema/qapi-schema-test.out
+++ b/tests/qapi-schema/qapi-schema-test.out
@@ -92,6 +92,7 @@ object EventStructOne
 object ForceArrays
     member unused1: UserDefOneList optional=False
     member unused2: UserDefTwoList optional=False
+    member unused3: TestStructList optional=False
 enum MyEnum []
 object NestedEnumsOne
     member enum1: EnumOne optional=False
@@ -100,6 +101,10 @@ object NestedEnumsOne
     member enum4: EnumOne optional=True
 enum QEnumTwo ['value1', 'value2']
     prefix QENUM_TWO
+object TestStruct
+    member integer: int optional=False
+    member boolean: bool optional=False
+    member string: str optional=False
 object UserDefA
     member boolean: bool optional=False
     member a_b: int optional=True
diff --git a/tests/qemu-iotests/039.out b/tests/qemu-iotests/039.out
index 03a31c5..32c8846 100644
--- a/tests/qemu-iotests/039.out
+++ b/tests/qemu-iotests/039.out
@@ -11,7 +11,11 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 incompatible_features     0x1
 ERROR cluster 5 refcount=0 reference=1
 ERROR OFLAG_COPIED data cluster: l2_entry=8000000000050000 refcount=0
@@ -46,7 +50,11 @@ read 512/512 bytes at offset 0
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 incompatible_features     0x1
 ERROR cluster 5 refcount=0 reference=1
 Rebuilding refcount structure
@@ -60,7 +68,11 @@ incompatible_features     0x0
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 incompatible_features     0x0
 No errors were found on the image.
 
@@ -79,7 +91,11 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 incompatible_features     0x1
 ERROR cluster 5 refcount=0 reference=1
 ERROR OFLAG_COPIED data cluster: l2_entry=8000000000050000 refcount=0
@@ -89,7 +105,11 @@ Data may be corrupted, or further writes to the image may corrupt it.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 incompatible_features     0x0
 No errors were found on the image.
 *** done
diff --git a/tests/qemu-iotests/040 b/tests/qemu-iotests/040
index ea2f98e..5bdaf3d 100755
--- a/tests/qemu-iotests/040
+++ b/tests/qemu-iotests/040
@@ -41,6 +41,7 @@ class ImageCommitTestCase(iotests.QMPTestCase):
         while not completed:
             for event in self.vm.get_qmp_events(wait=True):
                 if event['event'] == 'BLOCK_JOB_COMPLETED':
+                    self.assert_qmp_absent(event, 'data/error')
                     self.assert_qmp(event, 'data/type', 'commit')
                     self.assert_qmp(event, 'data/device', 'drive0')
                     self.assert_qmp(event, 'data/offset', event['data']['len'])
@@ -251,5 +252,34 @@ class TestSetSpeed(ImageCommitTestCase):
 class TestActiveZeroLengthImage(TestSingleDrive):
     image_len = 0
 
+class TestReopenOverlay(ImageCommitTestCase):
+    image_len = 1024 * 1024
+    img0 = os.path.join(iotests.test_dir, '0.img')
+    img1 = os.path.join(iotests.test_dir, '1.img')
+    img2 = os.path.join(iotests.test_dir, '2.img')
+    img3 = os.path.join(iotests.test_dir, '3.img')
+
+    def setUp(self):
+        iotests.create_image(self.img0, self.image_len)
+        qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % self.img0, self.img1)
+        qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % self.img1, self.img2)
+        qemu_img('create', '-f', iotests.imgfmt, '-o', 'backing_file=%s' % self.img2, self.img3)
+        qemu_io('-f', iotests.imgfmt, '-c', 'write -P 0xab 0 128K', self.img1)
+        self.vm = iotests.VM().add_drive(self.img3)
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(self.img0)
+        os.remove(self.img1)
+        os.remove(self.img2)
+        os.remove(self.img3)
+
+    # This tests what happens when the overlay image of the 'top' node
+    # needs to be reopened in read-write mode in order to update the
+    # backing image string.
+    def test_reopen_overlay(self):
+        self.run_commit_test(self.img1, self.img0)
+
 if __name__ == '__main__':
     iotests.main(supported_fmts=['qcow2', 'qed'])
diff --git a/tests/qemu-iotests/040.out b/tests/qemu-iotests/040.out
index 42314e9..4fd1c2d 100644
--- a/tests/qemu-iotests/040.out
+++ b/tests/qemu-iotests/040.out
@@ -1,5 +1,5 @@
-........................
+.........................
 ----------------------------------------------------------------------
-Ran 24 tests
+Ran 25 tests
 
 OK
diff --git a/tests/qemu-iotests/058 b/tests/qemu-iotests/058
index f2bdd0b..63a6598 100755
--- a/tests/qemu-iotests/058
+++ b/tests/qemu-iotests/058
@@ -32,11 +32,17 @@ status=1	# failure is the default!
 
 nbd_unix_socket=$TEST_DIR/test_qemu_nbd_socket
 nbd_snapshot_img="nbd:unix:$nbd_unix_socket"
+rm -f "${TEST_DIR}/qemu-nbd.pid"
 
 _cleanup_nbd()
 {
-    if [ -n "$NBD_SNAPSHOT_PID" ]; then
-        kill "$NBD_SNAPSHOT_PID"
+    local NBD_SNAPSHOT_PID
+    if [ -f "${TEST_DIR}/qemu-nbd.pid" ]; then
+        read NBD_SNAPSHOT_PID < "${TEST_DIR}/qemu-nbd.pid"
+        rm -f "${TEST_DIR}/qemu-nbd.pid"
+        if [ -n "$NBD_SNAPSHOT_PID" ]; then
+            kill "$NBD_SNAPSHOT_PID"
+        fi
     fi
     rm -f "$nbd_unix_socket"
 }
@@ -60,7 +66,6 @@ _export_nbd_snapshot()
 {
     _cleanup_nbd
     $QEMU_NBD -v -t -k "$nbd_unix_socket" "$TEST_IMG" -l $1 &
-    NBD_SNAPSHOT_PID=$!
     _wait_for_nbd
 }
 
@@ -68,7 +73,6 @@ _export_nbd_snapshot1()
 {
     _cleanup_nbd
     $QEMU_NBD -v -t -k "$nbd_unix_socket" "$TEST_IMG" -l snapshot.name=$1 &
-    NBD_SNAPSHOT_PID=$!
     _wait_for_nbd
 }
 
diff --git a/tests/qemu-iotests/061.out b/tests/qemu-iotests/061.out
index b16bea9..f2598a8 100644
--- a/tests/qemu-iotests/061.out
+++ b/tests/qemu-iotests/061.out
@@ -57,7 +57,11 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 magic                     0x514649fb
 version                   3
 backing_file_offset       0x0
@@ -215,7 +219,11 @@ No errors were found on the image.
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 131072/131072 bytes at offset 0
 128 KiB, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 magic                     0x514649fb
 version                   3
 backing_file_offset       0x0
diff --git a/tests/qemu-iotests/085 b/tests/qemu-iotests/085
index 56cd6f8..aa77eca 100755
--- a/tests/qemu-iotests/085
+++ b/tests/qemu-iotests/085
@@ -7,6 +7,7 @@
 # snapshots are performed.
 #
 # Copyright (C) 2014 Red Hat, Inc.
+# Copyright (C) 2015 Igalia, S.L.
 #
 # This program is free software; you can redistribute it and/or modify
 # it under the terms of the GNU General Public License as published by
@@ -34,17 +35,17 @@ status=1	# failure is the default!
 snapshot_virt0="snapshot-v0.qcow2"
 snapshot_virt1="snapshot-v1.qcow2"
 
-MAX_SNAPSHOTS=10
+SNAPSHOTS=10
 
 _cleanup()
 {
     _cleanup_qemu
-    for i in $(seq 1 ${MAX_SNAPSHOTS})
+    for i in $(seq 1 ${SNAPSHOTS})
     do
         rm -f "${TEST_DIR}/${i}-${snapshot_virt0}"
         rm -f "${TEST_DIR}/${i}-${snapshot_virt1}"
     done
-	_cleanup_test_img
+    rm -f "${TEST_IMG}.1" "${TEST_IMG}.2"
 
 }
 trap "_cleanup; exit \$status" 0 1 2 3 15
@@ -64,7 +65,7 @@ function create_single_snapshot()
 {
     cmd="{ 'execute': 'blockdev-snapshot-sync',
                       'arguments': { 'device': 'virtio0',
-                                     'snapshot-file':'"${TEST_DIR}/${1}-${snapshot_virt0}"',
+                                     'snapshot-file':'${TEST_DIR}/${1}-${snapshot_virt0}',
                                      'format': 'qcow2' } }"
     _send_qemu_cmd $h "${cmd}" "return"
 }
@@ -76,27 +77,60 @@ function create_group_snapshot()
            {'actions': [
                { 'type': 'blockdev-snapshot-sync', 'data' :
                    { 'device': 'virtio0',
-                      'snapshot-file': '"${TEST_DIR}/${1}-${snapshot_virt0}"' } },
+                      'snapshot-file': '${TEST_DIR}/${1}-${snapshot_virt0}' } },
                { 'type': 'blockdev-snapshot-sync', 'data' :
                    { 'device': 'virtio1',
-                       'snapshot-file': '"${TEST_DIR}/${1}-${snapshot_virt1}"' } } ]
+                       'snapshot-file': '${TEST_DIR}/${1}-${snapshot_virt1}' } } ]
              } }"
 
     _send_qemu_cmd $h "${cmd}" "return"
 }
 
+# ${1}: unique identifier for the snapshot filename
+# ${2}: true: open backing images; false: don't open them (default)
+function add_snapshot_image()
+{
+    if [ "${2}" = "true" ]; then
+        extra_params=""
+    else
+        extra_params="'backing': '', "
+    fi
+    base_image="${TEST_DIR}/$((${1}-1))-${snapshot_virt0}"
+    snapshot_file="${TEST_DIR}/${1}-${snapshot_virt0}"
+    _make_test_img -b "${base_image}" "$size"
+    mv "${TEST_IMG}" "${snapshot_file}"
+    cmd="{ 'execute': 'blockdev-add', 'arguments':
+           { 'options':
+             { 'driver': 'qcow2', 'node-name': 'snap_${1}', ${extra_params}
+               'file':
+               { 'driver': 'file', 'filename': '${snapshot_file}',
+                 'node-name': 'file_${1}' } } } }"
+    _send_qemu_cmd $h "${cmd}" "return"
+}
+
+# ${1}: unique identifier for the snapshot filename
+# ${2}: expected response, defaults to 'return'
+function blockdev_snapshot()
+{
+    cmd="{ 'execute': 'blockdev-snapshot',
+                      'arguments': { 'node': 'virtio0',
+                                     'overlay':'snap_${1}' } }"
+    _send_qemu_cmd $h "${cmd}" "${2:-return}"
+}
+
 size=128M
 
 _make_test_img $size
-mv "${TEST_IMG}" "${TEST_IMG}.orig"
+mv "${TEST_IMG}" "${TEST_IMG}.1"
 _make_test_img $size
+mv "${TEST_IMG}" "${TEST_IMG}.2"
 
 echo
 echo === Running QEMU ===
 echo
 
 qemu_comm_method="qmp"
-_launch_qemu -drive file="${TEST_IMG}.orig",if=virtio -drive file="${TEST_IMG}",if=virtio
+_launch_qemu -drive file="${TEST_IMG}.1",if=virtio -drive file="${TEST_IMG}.2",if=virtio
 h=$QEMU_HANDLE
 
 echo
@@ -105,6 +139,8 @@ echo
 
 _send_qemu_cmd $h "{ 'execute': 'qmp_capabilities' }" "return"
 
+# Tests for the blockdev-snapshot-sync command
+
 echo
 echo === Create a single snapshot on virtio0 ===
 echo
@@ -117,7 +153,7 @@ echo === Invalid command - missing device and nodename ===
 echo
 
 _send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot-sync',
-                         'arguments': { 'snapshot-file':'"${TEST_DIR}/1-${snapshot_virt0}"',
+                         'arguments': { 'snapshot-file':'${TEST_DIR}/1-${snapshot_virt0}',
                                      'format': 'qcow2' } }" "error"
 
 echo
@@ -132,11 +168,75 @@ echo
 echo === Create several transactional group snapshots ===
 echo
 
-for i in $(seq 2 ${MAX_SNAPSHOTS})
+for i in $(seq 2 ${SNAPSHOTS})
 do
     create_group_snapshot ${i}
 done
 
+# Tests for the blockdev-snapshot command
+
+echo
+echo === Create a couple of snapshots using blockdev-snapshot ===
+echo
+
+SNAPSHOTS=$((${SNAPSHOTS}+1))
+add_snapshot_image ${SNAPSHOTS}
+blockdev_snapshot ${SNAPSHOTS}
+
+SNAPSHOTS=$((${SNAPSHOTS}+1))
+add_snapshot_image ${SNAPSHOTS}
+blockdev_snapshot ${SNAPSHOTS}
+
+echo
+echo === Invalid command - cannot create a snapshot using a file BDS ===
+echo
+
+_send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot',
+                     'arguments': { 'node':'virtio0',
+                                    'overlay':'file_${SNAPSHOTS}' }
+                   }" "error"
+
+echo
+echo === Invalid command - snapshot node used as active layer ===
+echo
+
+blockdev_snapshot ${SNAPSHOTS} error
+
+_send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot',
+                     'arguments': { 'node':'virtio0',
+                                    'overlay':'virtio0' }
+                   }" "error"
+
+_send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot',
+                     'arguments': { 'node':'virtio0',
+                                    'overlay':'virtio1' }
+                   }" "error"
+
+echo
+echo === Invalid command - snapshot node used as backing hd ===
+echo
+
+blockdev_snapshot $((${SNAPSHOTS}-1)) error
+
+echo
+echo === Invalid command - snapshot node has a backing image ===
+echo
+
+SNAPSHOTS=$((${SNAPSHOTS}+1))
+add_snapshot_image ${SNAPSHOTS} true
+blockdev_snapshot ${SNAPSHOTS} error
+
+echo
+echo === Invalid command - The node does not exist ===
+echo
+
+blockdev_snapshot $((${SNAPSHOTS}+1)) error
+
+_send_qemu_cmd $h "{ 'execute': 'blockdev-snapshot',
+                     'arguments': { 'node':'nodevice',
+                                    'overlay':'snap_${SNAPSHOTS}' }
+                   }" "error"
+
 # success, all done
 echo "*** done"
 rm -f $seq.full
diff --git a/tests/qemu-iotests/085.out b/tests/qemu-iotests/085.out
index a6cf19e..01c78d6 100644
--- a/tests/qemu-iotests/085.out
+++ b/tests/qemu-iotests/085.out
@@ -11,7 +11,7 @@ Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728
 
 === Create a single snapshot on virtio0 ===
 
-Formatting 'TEST_DIR/1-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/t.qcow2.orig backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
+Formatting 'TEST_DIR/1-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/t.qcow2.1 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
 {"return": {}}
 
 === Invalid command - missing device and nodename ===
@@ -26,7 +26,7 @@ Formatting 'TEST_DIR/1-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file
 === Create several transactional group snapshots ===
 
 Formatting 'TEST_DIR/2-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/1-snapshot-v0.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
-Formatting 'TEST_DIR/2-snapshot-v1.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/t.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
+Formatting 'TEST_DIR/2-snapshot-v1.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/t.qcow2.2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
 {"return": {}}
 Formatting 'TEST_DIR/3-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/2-snapshot-v0.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
 Formatting 'TEST_DIR/3-snapshot-v1.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/2-snapshot-v1.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
@@ -52,4 +52,38 @@ Formatting 'TEST_DIR/9-snapshot-v1.qcow2', fmt=qcow2 size=134217728 backing_file
 Formatting 'TEST_DIR/10-snapshot-v0.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/9-snapshot-v0.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
 Formatting 'TEST_DIR/10-snapshot-v1.qcow2', fmt=qcow2 size=134217728 backing_file=TEST_DIR/9-snapshot-v1.qcow2 backing_fmt=qcow2 encryption=off cluster_size=65536 lazy_refcounts=off refcount_bits=16
 {"return": {}}
+
+=== Create a couple of snapshots using blockdev-snapshot ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/10-snapshot-v0.IMGFMT
+{"return": {}}
+{"return": {}}
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/11-snapshot-v0.IMGFMT
+{"return": {}}
+{"return": {}}
+
+=== Invalid command - cannot create a snapshot using a file BDS ===
+
+{"error": {"class": "GenericError", "desc": "The snapshot does not support backing images"}}
+
+=== Invalid command - snapshot node used as active layer ===
+
+{"error": {"class": "GenericError", "desc": "The snapshot is already in use by virtio0"}}
+{"error": {"class": "GenericError", "desc": "The snapshot is already in use by virtio0"}}
+{"error": {"class": "GenericError", "desc": "The snapshot is already in use by virtio1"}}
+
+=== Invalid command - snapshot node used as backing hd ===
+
+{"error": {"class": "GenericError", "desc": "Node 'snap_11' is busy: node is used as backing hd of 'virtio0'"}}
+
+=== Invalid command - snapshot node has a backing image ===
+
+Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=134217728 backing_file=TEST_DIR/12-snapshot-v0.IMGFMT
+{"return": {}}
+{"error": {"class": "GenericError", "desc": "The snapshot already has a backing image"}}
+
+=== Invalid command - The node does not exist ===
+
+{"error": {"class": "GenericError", "desc": "Cannot find device=snap_14 nor node_name=snap_14"}}
+{"error": {"class": "GenericError", "desc": "Cannot find device=nodevice nor node_name=nodevice"}}
 *** done
diff --git a/tests/qemu-iotests/118 b/tests/qemu-iotests/118
new file mode 100755
index 0000000..a2bcd54
--- /dev/null
+++ b/tests/qemu-iotests/118
@@ -0,0 +1,720 @@
+#!/usr/bin/env python
+#
+# Test case for the QMP 'change' command and all other associated
+# commands
+#
+# Copyright (C) 2015 Red Hat, Inc.
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import stat
+import time
+import iotests
+from iotests import qemu_img
+
+old_img = os.path.join(iotests.test_dir, 'test0.img')
+new_img = os.path.join(iotests.test_dir, 'test1.img')
+
+class ChangeBaseClass(iotests.QMPTestCase):
+    has_opened = False
+    has_closed = False
+
+    def process_events(self):
+        for event in self.vm.get_qmp_events(wait=False):
+            if (event['event'] == 'DEVICE_TRAY_MOVED' and
+                event['data']['device'] == 'drive0'):
+                if event['data']['tray-open'] == False:
+                    self.has_closed = True
+                else:
+                    self.has_opened = True
+
+    def wait_for_open(self):
+        timeout = time.clock() + 3
+        while not self.has_opened and time.clock() < timeout:
+            self.process_events()
+        if not self.has_opened:
+            self.fail('Timeout while waiting for the tray to open')
+
+    def wait_for_close(self):
+        timeout = time.clock() + 3
+        while not self.has_closed and time.clock() < timeout:
+            self.process_events()
+        if not self.has_opened:
+            self.fail('Timeout while waiting for the tray to close')
+
+class GeneralChangeTestsBaseClass(ChangeBaseClass):
+    def test_change(self):
+        result = self.vm.qmp('change', device='drive0', target=new_img,
+                                       arg=iotests.imgfmt)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_blockdev_change_medium(self):
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_eject(self):
+        result = self.vm.qmp('eject', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+    def test_tray_eject_change(self):
+        result = self.vm.qmp('eject', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_tray_open_close(self):
+        result = self.vm.qmp('blockdev-open-tray', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        if self.was_empty == True:
+            self.assert_qmp_absent(result, 'return[0]/inserted')
+        else:
+            self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-close-tray', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        if self.has_real_tray or not self.was_empty:
+            self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        if self.has_real_tray or not self.was_empty:
+            self.assert_qmp(result, 'return[0]/tray_open', False)
+        else:
+            self.assert_qmp(result, 'return[0]/tray_open', True)
+        if self.was_empty == True:
+            self.assert_qmp_absent(result, 'return[0]/inserted')
+        else:
+            self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+    def test_tray_eject_close(self):
+        result = self.vm.qmp('eject', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+        result = self.vm.qmp('blockdev-close-tray', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        if self.has_real_tray:
+            self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        if self.has_real_tray:
+            self.assert_qmp(result, 'return[0]/tray_open', False)
+        else:
+            self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+    def test_tray_open_change(self):
+        result = self.vm.qmp('blockdev-open-tray', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        if self.was_empty == True:
+            self.assert_qmp_absent(result, 'return[0]/inserted')
+        else:
+            self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_cycle(self):
+        result = self.vm.qmp('blockdev-add',
+                             options={'node-name': 'new',
+                                      'driver': iotests.imgfmt,
+                                      'file': {'filename': new_img,
+                                               'driver': 'file'}})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('blockdev-open-tray', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        if self.was_empty == True:
+            self.assert_qmp_absent(result, 'return[0]/inserted')
+        else:
+            self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-remove-medium', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+        result = self.vm.qmp('blockdev-insert-medium', device='drive0',
+                                                       node_name='new')
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+        result = self.vm.qmp('blockdev-close-tray', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_close_on_closed(self):
+        result = self.vm.qmp('blockdev-close-tray', device='drive0')
+        # Should be a no-op
+        self.assert_qmp(result, 'return', {})
+        self.assertEquals(self.vm.get_qmp_events(wait=False), [])
+
+    def test_remove_on_closed(self):
+        if self.has_opened:
+            # Empty floppy drive
+            return
+
+        result = self.vm.qmp('blockdev-remove-medium', device='drive0')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+    def test_insert_on_closed(self):
+        if self.has_opened:
+            # Empty floppy drive
+            return
+
+        result = self.vm.qmp('blockdev-add',
+                             options={'node-name': 'new',
+                                      'driver': iotests.imgfmt,
+                                      'file': {'filename': new_img,
+                                               'driver': 'file'}})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('blockdev-insert-medium', device='drive0',
+                                                       node_name='new')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+class TestInitiallyFilled(GeneralChangeTestsBaseClass):
+    was_empty = False
+
+    def setUp(self, media, interface):
+        qemu_img('create', '-f', iotests.imgfmt, old_img, '1440k')
+        qemu_img('create', '-f', iotests.imgfmt, new_img, '1440k')
+        self.vm = iotests.VM().add_drive(old_img, 'media=%s' % media, interface)
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(old_img)
+        os.remove(new_img)
+
+    def test_insert_on_filled(self):
+        result = self.vm.qmp('blockdev-add',
+                             options={'node-name': 'new',
+                                      'driver': iotests.imgfmt,
+                                      'file': {'filename': new_img,
+                                               'driver': 'file'}})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('blockdev-open-tray', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('blockdev-insert-medium', device='drive0',
+                                                       node_name='new')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+class TestInitiallyEmpty(GeneralChangeTestsBaseClass):
+    was_empty = True
+
+    def setUp(self, media, interface):
+        qemu_img('create', '-f', iotests.imgfmt, new_img, '1440k')
+        self.vm = iotests.VM().add_drive(None, 'media=%s' % media, interface)
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(new_img)
+
+    def test_remove_on_empty(self):
+        result = self.vm.qmp('blockdev-open-tray', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('blockdev-remove-medium', device='drive0')
+        # Should be a no-op
+        self.assert_qmp(result, 'return', {})
+
+class TestCDInitiallyFilled(TestInitiallyFilled):
+    TestInitiallyFilled = TestInitiallyFilled
+    has_real_tray = True
+
+    def setUp(self):
+        self.TestInitiallyFilled.setUp(self, 'cdrom', 'ide')
+
+class TestCDInitiallyEmpty(TestInitiallyEmpty):
+    TestInitiallyEmpty = TestInitiallyEmpty
+    has_real_tray = True
+
+    def setUp(self):
+        self.TestInitiallyEmpty.setUp(self, 'cdrom', 'ide')
+
+class TestFloppyInitiallyFilled(TestInitiallyFilled):
+    TestInitiallyFilled = TestInitiallyFilled
+    has_real_tray = False
+
+    def setUp(self):
+        self.TestInitiallyFilled.setUp(self, 'disk', 'floppy')
+
+class TestFloppyInitiallyEmpty(TestInitiallyEmpty):
+    TestInitiallyEmpty = TestInitiallyEmpty
+    has_real_tray = False
+
+    def setUp(self):
+        self.TestInitiallyEmpty.setUp(self, 'disk', 'floppy')
+        # FDDs not having a real tray and there not being a medium inside the
+        # tray at startup means the tray will be considered open
+        self.has_opened = True
+
+class TestChangeReadOnly(ChangeBaseClass):
+    def setUp(self):
+        qemu_img('create', '-f', iotests.imgfmt, old_img, '1440k')
+        qemu_img('create', '-f', iotests.imgfmt, new_img, '1440k')
+        self.vm = iotests.VM()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.chmod(old_img, 0666)
+        os.chmod(new_img, 0666)
+        os.remove(old_img)
+        os.remove(new_img)
+
+    def test_ro_ro_retain(self):
+        os.chmod(old_img, 0444)
+        os.chmod(new_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk,read-only=on', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt,
+                                                       read_only_mode='retain')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_ro_rw_retain(self):
+        os.chmod(old_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk,read-only=on', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt,
+                                                       read_only_mode='retain')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_rw_ro_retain(self):
+        os.chmod(new_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt,
+                                                       read_only_mode='retain')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        self.assertEquals(self.vm.get_qmp_events(wait=False), [])
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+    def test_ro_rw(self):
+        os.chmod(old_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk,read-only=on', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium',
+                             device='drive0',
+                             filename=new_img,
+                             format=iotests.imgfmt,
+                             read_only_mode='read-write')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_rw_ro(self):
+        os.chmod(new_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium',
+                             device='drive0',
+                             filename=new_img,
+                             format=iotests.imgfmt,
+                             read_only_mode='read-only')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_make_rw_ro(self):
+        self.vm.add_drive(old_img, 'media=disk', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium',
+                             device='drive0',
+                             filename=new_img,
+                             format=iotests.imgfmt,
+                             read_only_mode='read-only')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_make_ro_rw(self):
+        os.chmod(new_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium',
+                             device='drive0',
+                             filename=new_img,
+                             format=iotests.imgfmt,
+                             read_only_mode='read-write')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        self.assertEquals(self.vm.get_qmp_events(wait=False), [])
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+    def test_make_rw_ro_by_retain(self):
+        os.chmod(old_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk,read-only=on', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt,
+                                                       read_only_mode='retain')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+    def test_make_ro_rw_by_retain(self):
+        os.chmod(new_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-change-medium', device='drive0',
+                                                       filename=new_img,
+                                                       format=iotests.imgfmt,
+                                                       read_only_mode='retain')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+
+        self.assertEquals(self.vm.get_qmp_events(wait=False), [])
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+    def test_rw_ro_cycle(self):
+        os.chmod(new_img, 0444)
+        self.vm.add_drive(old_img, 'media=disk', 'floppy')
+        self.vm.launch()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-add',
+                             options={'node-name': 'new',
+                                      'driver': iotests.imgfmt,
+                                      'read-only': True,
+                                      'file': {'filename': new_img,
+                                               'driver': 'file'}})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('blockdev-open-tray', device='drive0', force=True)
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_open()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp(result, 'return[0]/inserted/ro', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+        result = self.vm.qmp('blockdev-remove-medium', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+        result = self.vm.qmp('blockdev-insert-medium', device='drive0',
+                                                       node_name='new')
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', True)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+        result = self.vm.qmp('blockdev-close-tray', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.wait_for_close()
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/ro', True)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+
+GeneralChangeTestsBaseClass = None
+TestInitiallyFilled = None
+TestInitiallyEmpty = None
+
+
+class TestBlockJobsAfterCycle(ChangeBaseClass):
+    def setUp(self):
+        qemu_img('create', '-f', iotests.imgfmt, old_img, '1M')
+
+        self.vm = iotests.VM()
+        self.vm.launch()
+
+        result = self.vm.qmp('blockdev-add',
+                             options={'id': 'drive0',
+                                      'driver': 'null-co'})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/format', 'null-co')
+
+        # For device-less BBs, calling blockdev-open-tray or blockdev-close-tray
+        # is not necessary
+        result = self.vm.qmp('blockdev-remove-medium', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp_absent(result, 'return[0]/inserted')
+
+        result = self.vm.qmp('blockdev-add',
+                             options={'node-name': 'node0',
+                                      'driver': iotests.imgfmt,
+                                      'file': {'filename': old_img,
+                                               'driver': 'file'}})
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('blockdev-insert-medium', device='drive0',
+                                                       node_name='node0')
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/tray_open', False)
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', old_img)
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(old_img)
+        try:
+            os.remove(new_img)
+        except OSError:
+            pass
+
+    def test_snapshot_and_commit(self):
+        # We need backing file support
+        if iotests.imgfmt != 'qcow2' and iotests.imgfmt != 'qed':
+            return
+
+        result = self.vm.qmp('blockdev-snapshot-sync', device='drive0',
+                                                       snapshot_file=new_img,
+                                                       format=iotests.imgfmt)
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('query-block')
+        self.assert_qmp(result, 'return[0]/inserted/image/filename', new_img)
+        self.assert_qmp(result,
+                        'return[0]/inserted/image/backing-image/filename',
+                        old_img)
+
+        result = self.vm.qmp('block-commit', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.vm.event_wait(name='BLOCK_JOB_READY')
+
+        result = self.vm.qmp('query-block-jobs')
+        self.assert_qmp(result, 'return[0]/device', 'drive0')
+
+        result = self.vm.qmp('block-job-complete', device='drive0')
+        self.assert_qmp(result, 'return', {})
+
+        self.vm.event_wait(name='BLOCK_JOB_COMPLETED')
+
+
+if __name__ == '__main__':
+    if iotests.qemu_default_machine != 'pc':
+        # We need floppy and IDE CD-ROM
+        iotests.notrun('not suitable for this machine type: %s' %
+                       iotests.qemu_default_machine)
+    iotests.main()
diff --git a/tests/qemu-iotests/118.out b/tests/qemu-iotests/118.out
new file mode 100644
index 0000000..6a91713
--- /dev/null
+++ b/tests/qemu-iotests/118.out
@@ -0,0 +1,5 @@
+...........................................................
+----------------------------------------------------------------------
+Ran 59 tests
+
+OK
diff --git a/tests/qemu-iotests/124 b/tests/qemu-iotests/124
index 9ccd118..c928f01 100644
--- a/tests/qemu-iotests/124
+++ b/tests/qemu-iotests/124
@@ -36,6 +36,23 @@ def try_remove(img):
         pass
 
 
+def transaction_action(action, **kwargs):
+    return {
+        'type': action,
+        'data': dict((k.replace('_', '-'), v) for k, v in kwargs.iteritems())
+    }
+
+
+def transaction_bitmap_clear(node, name, **kwargs):
+    return transaction_action('block-dirty-bitmap-clear',
+                              node=node, name=name, **kwargs)
+
+
+def transaction_drive_backup(device, target, **kwargs):
+    return transaction_action('drive-backup', device=device, target=target,
+                              **kwargs)
+
+
 class Bitmap:
     def __init__(self, name, drive):
         self.name = name
@@ -122,9 +139,12 @@ class TestIncrementalBackup(iotests.QMPTestCase):
     def do_qmp_backup(self, error='Input/output error', **kwargs):
         res = self.vm.qmp('drive-backup', **kwargs)
         self.assert_qmp(res, 'return', {})
+        return self.wait_qmp_backup(kwargs['device'], error)
+
 
+    def wait_qmp_backup(self, device, error='Input/output error'):
         event = self.vm.event_wait(name="BLOCK_JOB_COMPLETED",
-                                   match={'data': {'device': kwargs['device']}})
+                                   match={'data': {'device': device}})
         self.assertNotEqual(event, None)
 
         try:
@@ -139,6 +159,12 @@ class TestIncrementalBackup(iotests.QMPTestCase):
             return False
 
 
+    def wait_qmp_backup_cancelled(self, device):
+        event = self.vm.event_wait(name='BLOCK_JOB_CANCELLED',
+                                   match={'data': {'device': device}})
+        self.assertNotEqual(event, None)
+
+
     def create_anchor_backup(self, drive=None):
         if drive is None:
             drive = self.drives[-1]
@@ -264,6 +290,43 @@ class TestIncrementalBackup(iotests.QMPTestCase):
         return self.do_incremental_simple(granularity=131072)
 
 
+    def test_incremental_transaction(self):
+        '''Test: Verify backups made from transactionally created bitmaps.
+
+        Create a bitmap "before" VM execution begins, then create a second
+        bitmap AFTER writes have already occurred. Use transactions to create
+        a full backup and synchronize both bitmaps to this backup.
+        Create an incremental backup through both bitmaps and verify that
+        both backups match the current drive0 image.
+        '''
+
+        drive0 = self.drives[0]
+        bitmap0 = self.add_bitmap('bitmap0', drive0)
+        self.hmp_io_writes(drive0['id'], (('0xab', 0, 512),
+                                          ('0xfe', '16M', '256k'),
+                                          ('0x64', '32736k', '64k')))
+        bitmap1 = self.add_bitmap('bitmap1', drive0)
+
+        result = self.vm.qmp('transaction', actions=[
+            transaction_bitmap_clear(bitmap0.drive['id'], bitmap0.name),
+            transaction_bitmap_clear(bitmap1.drive['id'], bitmap1.name),
+            transaction_drive_backup(drive0['id'], drive0['backup'],
+                                     sync='full', format=drive0['fmt'])
+        ])
+        self.assert_qmp(result, 'return', {})
+        self.wait_until_completed(drive0['id'])
+        self.files.append(drive0['backup'])
+
+        self.hmp_io_writes(drive0['id'], (('0x9a', 0, 512),
+                                          ('0x55', '8M', '352k'),
+                                          ('0x78', '15872k', '1M')))
+        # Both bitmaps should be correctly in sync.
+        self.create_incremental(bitmap0)
+        self.create_incremental(bitmap1)
+        self.vm.shutdown()
+        self.check_backups()
+
+
     def test_incremental_failure(self):
         '''Test: Verify backups made after a failure are correct.
 
@@ -321,6 +384,123 @@ class TestIncrementalBackup(iotests.QMPTestCase):
         self.check_backups()
 
 
+    def test_transaction_failure(self):
+        '''Test: Verify backups made from a transaction that partially fails.
+
+        Add a second drive with its own unique pattern, and add a bitmap to each
+        drive. Use blkdebug to interfere with the backup on just one drive and
+        attempt to create a coherent incremental backup across both drives.
+
+        verify a failure in one but not both, then delete the failed stubs and
+        re-run the same transaction.
+
+        verify that both incrementals are created successfully.
+        '''
+
+        # Create a second drive, with pattern:
+        drive1 = self.add_node('drive1')
+        self.img_create(drive1['file'], drive1['fmt'])
+        io_write_patterns(drive1['file'], (('0x14', 0, 512),
+                                           ('0x5d', '1M', '32k'),
+                                           ('0xcd', '32M', '124k')))
+
+        # Create a blkdebug interface to this img as 'drive1'
+        result = self.vm.qmp('blockdev-add', options={
+            'id': drive1['id'],
+            'driver': drive1['fmt'],
+            'file': {
+                'driver': 'blkdebug',
+                'image': {
+                    'driver': 'file',
+                    'filename': drive1['file']
+                },
+                'set-state': [{
+                    'event': 'flush_to_disk',
+                    'state': 1,
+                    'new_state': 2
+                }],
+                'inject-error': [{
+                    'event': 'read_aio',
+                    'errno': 5,
+                    'state': 2,
+                    'immediately': False,
+                    'once': True
+                }],
+            }
+        })
+        self.assert_qmp(result, 'return', {})
+
+        # Create bitmaps and full backups for both drives
+        drive0 = self.drives[0]
+        dr0bm0 = self.add_bitmap('bitmap0', drive0)
+        dr1bm0 = self.add_bitmap('bitmap0', drive1)
+        self.create_anchor_backup(drive0)
+        self.create_anchor_backup(drive1)
+        self.assert_no_active_block_jobs()
+        self.assertFalse(self.vm.get_qmp_events(wait=False))
+
+        # Emulate some writes
+        self.hmp_io_writes(drive0['id'], (('0xab', 0, 512),
+                                          ('0xfe', '16M', '256k'),
+                                          ('0x64', '32736k', '64k')))
+        self.hmp_io_writes(drive1['id'], (('0xba', 0, 512),
+                                          ('0xef', '16M', '256k'),
+                                          ('0x46', '32736k', '64k')))
+
+        # Create incremental backup targets
+        target0 = self.prepare_backup(dr0bm0)
+        target1 = self.prepare_backup(dr1bm0)
+
+        # Ask for a new incremental backup per-each drive,
+        # expecting drive1's backup to fail:
+        transaction = [
+            transaction_drive_backup(drive0['id'], target0, sync='incremental',
+                                     format=drive0['fmt'], mode='existing',
+                                     bitmap=dr0bm0.name),
+            transaction_drive_backup(drive1['id'], target1, sync='incremental',
+                                     format=drive1['fmt'], mode='existing',
+                                     bitmap=dr1bm0.name)
+        ]
+        result = self.vm.qmp('transaction', actions=transaction,
+                             properties={'completion-mode': 'grouped'} )
+        self.assert_qmp(result, 'return', {})
+
+        # Observe that drive0's backup is cancelled and drive1 completes with
+        # an error.
+        self.wait_qmp_backup_cancelled(drive0['id'])
+        self.assertFalse(self.wait_qmp_backup(drive1['id']))
+        error = self.vm.event_wait('BLOCK_JOB_ERROR')
+        self.assert_qmp(error, 'data', {'device': drive1['id'],
+                                        'action': 'report',
+                                        'operation': 'read'})
+        self.assertFalse(self.vm.get_qmp_events(wait=False))
+        self.assert_no_active_block_jobs()
+
+        # Delete drive0's successful target and eliminate our record of the
+        # unsuccessful drive1 target. Then re-run the same transaction.
+        dr0bm0.del_target()
+        dr1bm0.del_target()
+        target0 = self.prepare_backup(dr0bm0)
+        target1 = self.prepare_backup(dr1bm0)
+
+        # Re-run the exact same transaction.
+        result = self.vm.qmp('transaction', actions=transaction,
+                             properties={'completion-mode':'grouped'})
+        self.assert_qmp(result, 'return', {})
+
+        # Both should complete successfully this time.
+        self.assertTrue(self.wait_qmp_backup(drive0['id']))
+        self.assertTrue(self.wait_qmp_backup(drive1['id']))
+        self.make_reference_backup(dr0bm0)
+        self.make_reference_backup(dr1bm0)
+        self.assertFalse(self.vm.get_qmp_events(wait=False))
+        self.assert_no_active_block_jobs()
+
+        # And the images should of course validate.
+        self.vm.shutdown()
+        self.check_backups()
+
+
     def test_sync_dirty_bitmap_missing(self):
         self.assert_no_active_block_jobs()
         self.files.append(self.err_img)
diff --git a/tests/qemu-iotests/124.out b/tests/qemu-iotests/124.out
index 2f7d390..dae404e 100644
--- a/tests/qemu-iotests/124.out
+++ b/tests/qemu-iotests/124.out
@@ -1,5 +1,5 @@
-.......
+.........
 ----------------------------------------------------------------------
-Ran 7 tests
+Ran 9 tests
 
 OK
diff --git a/tests/qemu-iotests/136 b/tests/qemu-iotests/136
new file mode 100644
index 0000000..e8c6937
--- /dev/null
+++ b/tests/qemu-iotests/136
@@ -0,0 +1,349 @@
+#!/usr/bin/env python
+#
+# Tests for block device statistics
+#
+# Copyright (C) 2015 Igalia, S.L.
+# Author: Alberto Garcia <berto@igalia.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import iotests
+import os
+
+interval_length = 10
+nsec_per_sec = 1000000000
+op_latency = nsec_per_sec / 1000 # See qtest_latency_ns in accounting.c
+bad_sector = 8192
+bad_offset = bad_sector * 512
+blkdebug_file = os.path.join(iotests.test_dir, 'blkdebug.conf')
+
+class BlockDeviceStatsTestCase(iotests.QMPTestCase):
+    test_img = "null-aio://"
+    total_rd_bytes = 0
+    total_rd_ops = 0
+    total_wr_bytes = 0
+    total_wr_ops = 0
+    total_wr_merged = 0
+    total_flush_ops = 0
+    failed_rd_ops = 0
+    failed_wr_ops = 0
+    invalid_rd_ops = 0
+    invalid_wr_ops = 0
+    wr_highest_offset = 0
+    account_invalid = False
+    account_failed = False
+
+    def blockstats(self, device):
+        result = self.vm.qmp("query-blockstats")
+        for r in result['return']:
+            if r['device'] == device:
+                return r['stats']
+        raise Exception("Device not found for blockstats: %s" % device)
+
+    def create_blkdebug_file(self):
+        file = open(blkdebug_file, 'w')
+        file.write('''
+[inject-error]
+event = "read_aio"
+errno = "5"
+sector = "%d"
+
+[inject-error]
+event = "write_aio"
+errno = "5"
+sector = "%d"
+''' % (bad_sector, bad_sector))
+        file.close()
+
+    def setUp(self):
+        drive_args = []
+        drive_args.append("stats-intervals.0=%d" % interval_length)
+        drive_args.append("stats-account-invalid=%s" %
+                          (self.account_invalid and "on" or "off"))
+        drive_args.append("stats-account-failed=%s" %
+                          (self.account_failed and "on" or "off"))
+        self.create_blkdebug_file()
+        self.vm = iotests.VM().add_drive('blkdebug:%s:%s ' %
+                                         (blkdebug_file, self.test_img),
+                                         ','.join(drive_args))
+        self.vm.launch()
+        # Set an initial value for the clock
+        self.vm.qtest("clock_step %d" % nsec_per_sec)
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(blkdebug_file)
+
+    def accounted_ops(self, read = False, write = False, flush = False):
+        ops = 0
+        if write:
+            ops += self.total_wr_ops
+            if self.account_failed:
+                ops += self.failed_wr_ops
+            if self.account_invalid:
+                ops += self.invalid_wr_ops
+        if read:
+            ops += self.total_rd_ops
+            if self.account_failed:
+                ops += self.failed_rd_ops
+            if self.account_invalid:
+                ops += self.invalid_rd_ops
+        if flush:
+            ops += self.total_flush_ops
+        return ops
+
+    def accounted_latency(self, read = False, write = False, flush = False):
+        latency = 0
+        if write:
+            latency += self.total_wr_ops * op_latency
+            if self.account_failed:
+                latency += self.failed_wr_ops * op_latency
+        if read:
+            latency += self.total_rd_ops * op_latency
+            if self.account_failed:
+                latency += self.failed_rd_ops * op_latency
+        if flush:
+            latency += self.total_flush_ops * op_latency
+        return latency
+
+    def check_values(self):
+        stats = self.blockstats('drive0')
+
+        # Check that the totals match with what we have calculated
+        self.assertEqual(self.total_rd_bytes, stats['rd_bytes'])
+        self.assertEqual(self.total_wr_bytes, stats['wr_bytes'])
+        self.assertEqual(self.total_rd_ops, stats['rd_operations'])
+        self.assertEqual(self.total_wr_ops, stats['wr_operations'])
+        self.assertEqual(self.total_flush_ops, stats['flush_operations'])
+        self.assertEqual(self.wr_highest_offset, stats['wr_highest_offset'])
+        self.assertEqual(self.failed_rd_ops, stats['failed_rd_operations'])
+        self.assertEqual(self.failed_wr_ops, stats['failed_wr_operations'])
+        self.assertEqual(self.invalid_rd_ops, stats['invalid_rd_operations'])
+        self.assertEqual(self.invalid_wr_ops, stats['invalid_wr_operations'])
+        self.assertEqual(self.account_invalid, stats['account_invalid'])
+        self.assertEqual(self.account_failed, stats['account_failed'])
+        self.assertEqual(self.total_wr_merged, stats['wr_merged'])
+
+        # Check that there's exactly one interval with the length we defined
+        self.assertEqual(1, len(stats['timed_stats']))
+        timed_stats = stats['timed_stats'][0]
+        self.assertEqual(interval_length, timed_stats['interval_length'])
+
+        total_rd_latency = self.accounted_latency(read = True)
+        if (total_rd_latency != 0):
+            self.assertEqual(total_rd_latency, stats['rd_total_time_ns'])
+            self.assertEqual(op_latency, timed_stats['min_rd_latency_ns'])
+            self.assertEqual(op_latency, timed_stats['max_rd_latency_ns'])
+            self.assertEqual(op_latency, timed_stats['avg_rd_latency_ns'])
+            self.assertLess(0, timed_stats['avg_rd_queue_depth'])
+        else:
+            self.assertEqual(0, stats['rd_total_time_ns'])
+            self.assertEqual(0, timed_stats['min_rd_latency_ns'])
+            self.assertEqual(0, timed_stats['max_rd_latency_ns'])
+            self.assertEqual(0, timed_stats['avg_rd_latency_ns'])
+            self.assertEqual(0, timed_stats['avg_rd_queue_depth'])
+
+        # min read latency <= avg read latency <= max read latency
+        self.assertLessEqual(timed_stats['min_rd_latency_ns'],
+                             timed_stats['avg_rd_latency_ns'])
+        self.assertLessEqual(timed_stats['avg_rd_latency_ns'],
+                             timed_stats['max_rd_latency_ns'])
+
+        total_wr_latency = self.accounted_latency(write = True)
+        if (total_wr_latency != 0):
+            self.assertEqual(total_wr_latency, stats['wr_total_time_ns'])
+            self.assertEqual(op_latency, timed_stats['min_wr_latency_ns'])
+            self.assertEqual(op_latency, timed_stats['max_wr_latency_ns'])
+            self.assertEqual(op_latency, timed_stats['avg_wr_latency_ns'])
+            self.assertLess(0, timed_stats['avg_wr_queue_depth'])
+        else:
+            self.assertEqual(0, stats['wr_total_time_ns'])
+            self.assertEqual(0, timed_stats['min_wr_latency_ns'])
+            self.assertEqual(0, timed_stats['max_wr_latency_ns'])
+            self.assertEqual(0, timed_stats['avg_wr_latency_ns'])
+            self.assertEqual(0, timed_stats['avg_wr_queue_depth'])
+
+        # min write latency <= avg write latency <= max write latency
+        self.assertLessEqual(timed_stats['min_wr_latency_ns'],
+                             timed_stats['avg_wr_latency_ns'])
+        self.assertLessEqual(timed_stats['avg_wr_latency_ns'],
+                             timed_stats['max_wr_latency_ns'])
+
+        total_flush_latency = self.accounted_latency(flush = True)
+        if (total_flush_latency != 0):
+            self.assertEqual(total_flush_latency, stats['flush_total_time_ns'])
+            self.assertEqual(op_latency, timed_stats['min_flush_latency_ns'])
+            self.assertEqual(op_latency, timed_stats['max_flush_latency_ns'])
+            self.assertEqual(op_latency, timed_stats['avg_flush_latency_ns'])
+        else:
+            self.assertEqual(0, stats['flush_total_time_ns'])
+            self.assertEqual(0, timed_stats['min_flush_latency_ns'])
+            self.assertEqual(0, timed_stats['max_flush_latency_ns'])
+            self.assertEqual(0, timed_stats['avg_flush_latency_ns'])
+
+        # min flush latency <= avg flush latency <= max flush latency
+        self.assertLessEqual(timed_stats['min_flush_latency_ns'],
+                             timed_stats['avg_flush_latency_ns'])
+        self.assertLessEqual(timed_stats['avg_flush_latency_ns'],
+                             timed_stats['max_flush_latency_ns'])
+
+        # idle_time_ns must be > 0 if we have performed any operation
+        if (self.accounted_ops(read = True, write = True, flush = True) != 0):
+            self.assertLess(0, stats['idle_time_ns'])
+        else:
+            self.assertFalse(stats.has_key('idle_time_ns'))
+
+        # This test does not alter these, so they must be all 0
+        self.assertEqual(0, stats['rd_merged'])
+        self.assertEqual(0, stats['failed_flush_operations'])
+        self.assertEqual(0, stats['invalid_flush_operations'])
+
+    def do_test_stats(self, rd_size = 0, rd_ops = 0, wr_size = 0, wr_ops = 0,
+                      flush_ops = 0, invalid_rd_ops = 0, invalid_wr_ops = 0,
+                      failed_rd_ops = 0, failed_wr_ops = 0, wr_merged = 0):
+        # The 'ops' list will contain all the requested I/O operations
+        ops = []
+        for i in range(rd_ops):
+            ops.append("aio_read %d %d" % (i * rd_size, rd_size))
+
+        for i in range(wr_ops):
+            ops.append("aio_write %d %d" % (i * wr_size, wr_size))
+
+        for i in range(flush_ops):
+            ops.append("aio_flush")
+
+        highest_offset = wr_ops * wr_size
+
+        # Two types of invalid operations: unaligned length and unaligned offset
+        for i in range(invalid_rd_ops / 2):
+            ops.append("aio_read 0 511")
+
+        for i in range(invalid_rd_ops / 2, invalid_rd_ops):
+            ops.append("aio_read 13 512")
+
+        for i in range(invalid_wr_ops / 2):
+            ops.append("aio_write 0 511")
+
+        for i in range(invalid_wr_ops / 2, invalid_wr_ops):
+            ops.append("aio_write 13 512")
+
+        for i in range(failed_rd_ops):
+            ops.append("aio_read %d 512" % bad_offset)
+
+        for i in range(failed_wr_ops):
+            ops.append("aio_write %d 512" % bad_offset)
+
+        if failed_wr_ops > 0:
+            highest_offset = max(highest_offset, bad_offset + 512)
+
+        for i in range(wr_merged):
+            first = i * wr_size * 2
+            second = first + wr_size
+            ops.append("multiwrite %d %d ; %d %d" %
+                       (first, wr_size, second, wr_size))
+
+        highest_offset = max(highest_offset, wr_merged * wr_size * 2)
+
+        # Now perform all operations
+        for op in ops:
+            self.vm.hmp_qemu_io("drive0", op)
+
+        # Update the expected totals
+        self.total_rd_bytes += rd_ops * rd_size
+        self.total_rd_ops += rd_ops
+        self.total_wr_bytes += wr_ops * wr_size
+        self.total_wr_ops += wr_ops
+        self.total_wr_merged += wr_merged
+        self.total_flush_ops += flush_ops
+        self.invalid_rd_ops += invalid_rd_ops
+        self.invalid_wr_ops += invalid_wr_ops
+        self.failed_rd_ops += failed_rd_ops
+        self.failed_wr_ops += failed_wr_ops
+
+        self.wr_highest_offset = max(self.wr_highest_offset, highest_offset)
+
+        # Advance the clock so idle_time_ns has a meaningful value
+        self.vm.qtest("clock_step %d" % nsec_per_sec)
+
+        # And check that the actual statistics match the expected ones
+        self.check_values()
+
+    def test_read_only(self):
+        test_values = [[512,    1],
+                       [65536,  1],
+                       [512,   12],
+                       [65536, 12]]
+        for i in test_values:
+            self.do_test_stats(rd_size = i[0], rd_ops = i[1])
+
+    def test_write_only(self):
+        test_values = [[512,    1],
+                       [65536,  1],
+                       [512,   12],
+                       [65536, 12]]
+        for i in test_values:
+            self.do_test_stats(wr_size = i[0], wr_ops = i[1])
+
+    def test_invalid(self):
+        self.do_test_stats(invalid_rd_ops = 7)
+        self.do_test_stats(invalid_wr_ops = 3)
+        self.do_test_stats(invalid_rd_ops = 4, invalid_wr_ops = 5)
+
+    def test_failed(self):
+        self.do_test_stats(failed_rd_ops = 8)
+        self.do_test_stats(failed_wr_ops = 6)
+        self.do_test_stats(failed_rd_ops = 5, failed_wr_ops = 12)
+
+    def test_flush(self):
+        self.do_test_stats(flush_ops = 8)
+
+    def test_merged(self):
+        for i in range(5):
+            self.do_test_stats(wr_merged = i * 3)
+
+    def test_all(self):
+        # rd_size, rd_ops, wr_size, wr_ops, flush_ops
+        # invalid_rd_ops,  invalid_wr_ops,
+        # failed_rd_ops,   failed_wr_ops
+        # wr_merged
+        test_values = [[512,    1, 512,   1, 1, 4, 7, 5, 2, 1],
+                       [65536,  1, 2048, 12, 7, 7, 5, 2, 5, 5],
+                       [32768,  9, 8192,  1, 4, 3, 2, 4, 6, 4],
+                       [16384, 11, 3584, 16, 9, 8, 6, 7, 3, 4]]
+        for i in test_values:
+            self.do_test_stats(*i)
+
+    def test_no_op(self):
+        # All values must be sane before doing any I/O
+        self.check_values()
+
+
+class BlockDeviceStatsTestAccountInvalid(BlockDeviceStatsTestCase):
+    account_invalid = True
+    account_failed = False
+
+class BlockDeviceStatsTestAccountFailed(BlockDeviceStatsTestCase):
+    account_invalid = False
+    account_failed = True
+
+class BlockDeviceStatsTestAccountBoth(BlockDeviceStatsTestCase):
+    account_invalid = True
+    account_failed = True
+
+class BlockDeviceStatsTestCoroutine(BlockDeviceStatsTestCase):
+    test_img = "null-co://"
+
+if __name__ == '__main__':
+    iotests.main(supported_fmts=["raw"])
diff --git a/tests/qemu-iotests/136.out b/tests/qemu-iotests/136.out
new file mode 100644
index 0000000..0a5e958
--- /dev/null
+++ b/tests/qemu-iotests/136.out
@@ -0,0 +1,5 @@
+........................................
+----------------------------------------------------------------------
+Ran 40 tests
+
+OK
diff --git a/tests/qemu-iotests/137.out b/tests/qemu-iotests/137.out
index cf55a41..88c702c 100644
--- a/tests/qemu-iotests/137.out
+++ b/tests/qemu-iotests/137.out
@@ -31,7 +31,11 @@ Cache clean interval too big
 Unsupported value 'blubb' for qcow2 option 'overlap-check'. Allowed are any of the following: none, constant, cached, all
 wrote 512/512 bytes at offset 0
 512 bytes, X ops; XX:XX:XX.X (XXX YYY/sec and XXX ops/sec)
-./common.config: Killed                  ( exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@" )
+./common.config: Killed                  ( if [ "${VALGRIND_QEMU}" == "y" ]; then
+    exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+else
+    exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@";
+fi )
 incompatible_features     0x0
 Formatting 'TEST_DIR/t.IMGFMT', fmt=IMGFMT size=67108864
 wrote 65536/65536 bytes at offset 0
diff --git a/tests/qemu-iotests/139 b/tests/qemu-iotests/139
new file mode 100644
index 0000000..42f78c7
--- /dev/null
+++ b/tests/qemu-iotests/139
@@ -0,0 +1,416 @@
+#!/usr/bin/env python
+#
+# Test cases for the QMP 'x-blockdev-del' command
+#
+# Copyright (C) 2015 Igalia, S.L.
+# Author: Alberto Garcia <berto@igalia.com>
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+#
+
+import os
+import iotests
+import time
+
+base_img = os.path.join(iotests.test_dir, 'base.img')
+new_img = os.path.join(iotests.test_dir, 'new.img')
+
+class TestBlockdevDel(iotests.QMPTestCase):
+
+    def setUp(self):
+        iotests.qemu_img('create', '-f', iotests.imgfmt, base_img, '1M')
+        self.vm = iotests.VM()
+        self.vm.launch()
+
+    def tearDown(self):
+        self.vm.shutdown()
+        os.remove(base_img)
+        if os.path.isfile(new_img):
+            os.remove(new_img)
+
+    # Check whether a BlockBackend exists
+    def checkBlockBackend(self, backend, node, must_exist = True):
+        result = self.vm.qmp('query-block')
+        backends = filter(lambda x: x['device'] == backend, result['return'])
+        self.assertLessEqual(len(backends), 1)
+        self.assertEqual(must_exist, len(backends) == 1)
+        if must_exist:
+            if node:
+                self.assertEqual(backends[0]['inserted']['node-name'], node)
+            else:
+                self.assertFalse(backends[0].has_key('inserted'))
+
+    # Check whether a BlockDriverState exists
+    def checkBlockDriverState(self, node, must_exist = True):
+        result = self.vm.qmp('query-named-block-nodes')
+        nodes = filter(lambda x: x['node-name'] == node, result['return'])
+        self.assertLessEqual(len(nodes), 1)
+        self.assertEqual(must_exist, len(nodes) == 1)
+
+    # Add a new BlockBackend (with its attached BlockDriverState)
+    def addBlockBackend(self, backend, node):
+        file_node = '%s_file' % node
+        self.checkBlockBackend(backend, node, False)
+        self.checkBlockDriverState(node, False)
+        self.checkBlockDriverState(file_node, False)
+        opts = {'driver': iotests.imgfmt,
+                'id': backend,
+                'node-name': node,
+                'file': {'driver': 'file',
+                         'node-name': file_node,
+                         'filename': base_img}}
+        result = self.vm.qmp('blockdev-add', conv_keys = False, options = opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockBackend(backend, node)
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(file_node)
+
+    # Add a BlockDriverState without a BlockBackend
+    def addBlockDriverState(self, node):
+        file_node = '%s_file' % node
+        self.checkBlockDriverState(node, False)
+        self.checkBlockDriverState(file_node, False)
+        opts = {'driver': iotests.imgfmt,
+                'node-name': node,
+                'file': {'driver': 'file',
+                         'node-name': file_node,
+                         'filename': base_img}}
+        result = self.vm.qmp('blockdev-add', conv_keys = False, options = opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(file_node)
+
+    # Add a BlockDriverState that will be used as overlay for the base_img BDS
+    def addBlockDriverStateOverlay(self, node):
+        self.checkBlockDriverState(node, False)
+        iotests.qemu_img('create', '-f', iotests.imgfmt,
+                         '-b', base_img, new_img, '1M')
+        opts = {'driver': iotests.imgfmt,
+                'node-name': node,
+                'backing': '',
+                'file': {'driver': 'file',
+                         'filename': new_img}}
+        result = self.vm.qmp('blockdev-add', conv_keys = False, options = opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(node)
+
+    # Delete a BlockBackend
+    def delBlockBackend(self, backend, node, expect_error = False,
+                        destroys_media = True):
+        self.checkBlockBackend(backend, node)
+        if node:
+            self.checkBlockDriverState(node)
+        result = self.vm.qmp('x-blockdev-del', id = backend)
+        if expect_error:
+            self.assert_qmp(result, 'error/class', 'GenericError')
+            if node:
+                self.checkBlockDriverState(node)
+        else:
+            self.assert_qmp(result, 'return', {})
+            if node:
+                self.checkBlockDriverState(node, not destroys_media)
+        self.checkBlockBackend(backend, node, must_exist = expect_error)
+
+    # Delete a BlockDriverState
+    def delBlockDriverState(self, node, expect_error = False):
+        self.checkBlockDriverState(node)
+        result = self.vm.qmp('x-blockdev-del', node_name = node)
+        if expect_error:
+            self.assert_qmp(result, 'error/class', 'GenericError')
+        else:
+            self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(node, expect_error)
+
+    # Add a device model
+    def addDeviceModel(self, device, backend):
+        result = self.vm.qmp('device_add', id = device,
+                             driver = 'virtio-blk-pci', drive = backend)
+        self.assert_qmp(result, 'return', {})
+
+    # Delete a device model
+    def delDeviceModel(self, device):
+        result = self.vm.qmp('device_del', id = device)
+        self.assert_qmp(result, 'return', {})
+
+        result = self.vm.qmp('system_reset')
+        self.assert_qmp(result, 'return', {})
+
+        device_path = '/machine/peripheral/%s/virtio-backend' % device
+        event = self.vm.event_wait(name="DEVICE_DELETED",
+                                   match={'data': {'path': device_path}})
+        self.assertNotEqual(event, None)
+
+        event = self.vm.event_wait(name="DEVICE_DELETED",
+                                   match={'data': {'device': device}})
+        self.assertNotEqual(event, None)
+
+    # Remove a BlockDriverState
+    def ejectDrive(self, backend, node, expect_error = False,
+                   destroys_media = True):
+        self.checkBlockBackend(backend, node)
+        self.checkBlockDriverState(node)
+        result = self.vm.qmp('eject', device = backend)
+        if expect_error:
+            self.assert_qmp(result, 'error/class', 'GenericError')
+            self.checkBlockDriverState(node)
+            self.checkBlockBackend(backend, node)
+        else:
+            self.assert_qmp(result, 'return', {})
+            self.checkBlockDriverState(node, not destroys_media)
+            self.checkBlockBackend(backend, None)
+
+    # Insert a BlockDriverState
+    def insertDrive(self, backend, node):
+        self.checkBlockBackend(backend, None)
+        self.checkBlockDriverState(node)
+        result = self.vm.qmp('blockdev-insert-medium',
+                             device = backend, node_name = node)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockBackend(backend, node)
+        self.checkBlockDriverState(node)
+
+    # Create a snapshot using 'blockdev-snapshot-sync'
+    def createSnapshotSync(self, node, overlay):
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(overlay, False)
+        opts = {'node-name': node,
+                'snapshot-file': new_img,
+                'snapshot-node-name': overlay,
+                'format': iotests.imgfmt}
+        result = self.vm.qmp('blockdev-snapshot-sync', conv_keys=False, **opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(overlay)
+
+    # Create a snapshot using 'blockdev-snapshot'
+    def createSnapshot(self, node, overlay):
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(overlay)
+        result = self.vm.qmp('blockdev-snapshot',
+                             node = node, overlay = overlay)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(overlay)
+
+    # Create a mirror
+    def createMirror(self, backend, node, new_node):
+        self.checkBlockBackend(backend, node)
+        self.checkBlockDriverState(new_node, False)
+        opts = {'device': backend,
+                'target': new_img,
+                'node-name': new_node,
+                'sync': 'top',
+                'format': iotests.imgfmt}
+        result = self.vm.qmp('drive-mirror', conv_keys=False, **opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockBackend(backend, node)
+        self.checkBlockDriverState(new_node)
+
+    # Complete an existing block job
+    def completeBlockJob(self, backend, node_before, node_after):
+        self.checkBlockBackend(backend, node_before)
+        result = self.vm.qmp('block-job-complete', device=backend)
+        self.assert_qmp(result, 'return', {})
+        self.wait_until_completed(backend)
+        self.checkBlockBackend(backend, node_after)
+
+    # Add a BlkDebug node
+    # Note that the purpose of this is to test the x-blockdev-del
+    # sanity checks, not to create a usable blkdebug drive
+    def addBlkDebug(self, debug, node):
+        self.checkBlockDriverState(node, False)
+        self.checkBlockDriverState(debug, False)
+        image = {'driver': iotests.imgfmt,
+                 'node-name': node,
+                 'file': {'driver': 'file',
+                          'filename': base_img}}
+        opts = {'driver': 'blkdebug',
+                'node-name': debug,
+                'image': image}
+        result = self.vm.qmp('blockdev-add', conv_keys = False, options = opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(node)
+        self.checkBlockDriverState(debug)
+
+    # Add a BlkVerify node
+    # Note that the purpose of this is to test the x-blockdev-del
+    # sanity checks, not to create a usable blkverify drive
+    def addBlkVerify(self, blkverify, test, raw):
+        self.checkBlockDriverState(test, False)
+        self.checkBlockDriverState(raw, False)
+        self.checkBlockDriverState(blkverify, False)
+        iotests.qemu_img('create', '-f', iotests.imgfmt, new_img, '1M')
+        node_0 = {'driver': iotests.imgfmt,
+                  'node-name': test,
+                  'file': {'driver': 'file',
+                           'filename': base_img}}
+        node_1 = {'driver': iotests.imgfmt,
+                  'node-name': raw,
+                  'file': {'driver': 'file',
+                           'filename': new_img}}
+        opts = {'driver': 'blkverify',
+                'node-name': blkverify,
+                'test': node_0,
+                'raw': node_1}
+        result = self.vm.qmp('blockdev-add', conv_keys = False, options = opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(test)
+        self.checkBlockDriverState(raw)
+        self.checkBlockDriverState(blkverify)
+
+    # Add a Quorum node
+    def addQuorum(self, quorum, child0, child1):
+        self.checkBlockDriverState(child0, False)
+        self.checkBlockDriverState(child1, False)
+        self.checkBlockDriverState(quorum, False)
+        iotests.qemu_img('create', '-f', iotests.imgfmt, new_img, '1M')
+        child_0 = {'driver': iotests.imgfmt,
+                   'node-name': child0,
+                   'file': {'driver': 'file',
+                            'filename': base_img}}
+        child_1 = {'driver': iotests.imgfmt,
+                   'node-name': child1,
+                   'file': {'driver': 'file',
+                            'filename': new_img}}
+        opts = {'driver': 'quorum',
+                'node-name': quorum,
+                'vote-threshold': 1,
+                'children': [ child_0, child_1 ]}
+        result = self.vm.qmp('blockdev-add', conv_keys = False, options = opts)
+        self.assert_qmp(result, 'return', {})
+        self.checkBlockDriverState(child0)
+        self.checkBlockDriverState(child1)
+        self.checkBlockDriverState(quorum)
+
+    ########################
+    # The tests start here #
+    ########################
+
+    def testWrongParameters(self):
+        self.addBlockBackend('drive0', 'node0')
+        result = self.vm.qmp('x-blockdev-del')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+        result = self.vm.qmp('x-blockdev-del', id='drive0', node_name='node0')
+        self.assert_qmp(result, 'error/class', 'GenericError')
+        self.delBlockBackend('drive0', 'node0')
+
+    def testBlockBackend(self):
+        self.addBlockBackend('drive0', 'node0')
+        # You cannot delete a BDS that is attached to a backend
+        self.delBlockDriverState('node0', expect_error = True)
+        self.delBlockBackend('drive0', 'node0')
+
+    def testBlockDriverState(self):
+        self.addBlockDriverState('node0')
+        # You cannot delete a file BDS directly
+        self.delBlockDriverState('node0_file', expect_error = True)
+        self.delBlockDriverState('node0')
+
+    def testEject(self):
+        self.addBlockBackend('drive0', 'node0')
+        self.ejectDrive('drive0', 'node0')
+        self.delBlockBackend('drive0', None)
+
+    def testDeviceModel(self):
+        self.addBlockBackend('drive0', 'node0')
+        self.addDeviceModel('device0', 'drive0')
+        self.ejectDrive('drive0', 'node0', expect_error = True)
+        self.delBlockBackend('drive0', 'node0', expect_error = True)
+        self.delDeviceModel('device0')
+        self.delBlockBackend('drive0', 'node0')
+
+    def testAttachMedia(self):
+        # This creates a BlockBackend and removes its media
+        self.addBlockBackend('drive0', 'node0')
+        self.ejectDrive('drive0', 'node0')
+        # This creates a new BlockDriverState and inserts it into the backend
+        self.addBlockDriverState('node1')
+        self.insertDrive('drive0', 'node1')
+        # The backend can't be removed: the new BDS has an extra reference
+        self.delBlockBackend('drive0', 'node1', expect_error = True)
+        self.delBlockDriverState('node1', expect_error = True)
+        # The BDS still exists after being ejected, but now it can be removed
+        self.ejectDrive('drive0', 'node1', destroys_media = False)
+        self.delBlockDriverState('node1')
+        self.delBlockBackend('drive0', None)
+
+    def testSnapshotSync(self):
+        self.addBlockBackend('drive0', 'node0')
+        self.createSnapshotSync('node0', 'overlay0')
+        # This fails because node0 is now being used as a backing image
+        self.delBlockDriverState('node0', expect_error = True)
+        # This succeeds because overlay0 only has the backend reference
+        self.delBlockBackend('drive0', 'overlay0')
+        self.checkBlockDriverState('node0', False)
+
+    def testSnapshot(self):
+        self.addBlockBackend('drive0', 'node0')
+        self.addBlockDriverStateOverlay('overlay0')
+        self.createSnapshot('node0', 'overlay0')
+        self.delBlockBackend('drive0', 'overlay0', expect_error = True)
+        self.delBlockDriverState('node0', expect_error = True)
+        self.delBlockDriverState('overlay0', expect_error = True)
+        self.ejectDrive('drive0', 'overlay0', destroys_media = False)
+        self.delBlockBackend('drive0', None)
+        self.delBlockDriverState('node0', expect_error = True)
+        self.delBlockDriverState('overlay0')
+        self.checkBlockDriverState('node0', False)
+
+    def testMirror(self):
+        self.addBlockBackend('drive0', 'node0')
+        self.createMirror('drive0', 'node0', 'mirror0')
+        # The block job prevents removing the device
+        self.delBlockBackend('drive0', 'node0', expect_error = True)
+        self.delBlockDriverState('node0', expect_error = True)
+        self.delBlockDriverState('mirror0', expect_error = True)
+        self.wait_ready('drive0')
+        self.completeBlockJob('drive0', 'node0', 'mirror0')
+        self.assert_no_active_block_jobs()
+        self.checkBlockDriverState('node0', False)
+        # This succeeds because the backend now points to mirror0
+        self.delBlockBackend('drive0', 'mirror0')
+
+    def testBlkDebug(self):
+        self.addBlkDebug('debug0', 'node0')
+        # 'node0' is used by the blkdebug node
+        self.delBlockDriverState('node0', expect_error = True)
+        # But we can remove the blkdebug node directly
+        self.delBlockDriverState('debug0')
+        self.checkBlockDriverState('node0', False)
+
+    def testBlkVerify(self):
+        self.addBlkVerify('verify0', 'node0', 'node1')
+        # We cannot remove the children of a blkverify device
+        self.delBlockDriverState('node0', expect_error = True)
+        self.delBlockDriverState('node1', expect_error = True)
+        # But we can remove the blkverify node directly
+        self.delBlockDriverState('verify0')
+        self.checkBlockDriverState('node0', False)
+        self.checkBlockDriverState('node1', False)
+
+    def testQuorum(self):
+        if not 'quorum' in iotests.qemu_img_pipe('--help'):
+            return
+        self.addQuorum('quorum0', 'node0', 'node1')
+        # We cannot remove the children of a Quorum device
+        self.delBlockDriverState('node0', expect_error = True)
+        self.delBlockDriverState('node1', expect_error = True)
+        # But we can remove the Quorum node directly
+        self.delBlockDriverState('quorum0')
+        self.checkBlockDriverState('node0', False)
+        self.checkBlockDriverState('node1', False)
+
+
+if __name__ == '__main__':
+    iotests.main(supported_fmts=["qcow2"])
diff --git a/tests/qemu-iotests/139.out b/tests/qemu-iotests/139.out
new file mode 100644
index 0000000..281b69e
--- /dev/null
+++ b/tests/qemu-iotests/139.out
@@ -0,0 +1,5 @@
+............
+----------------------------------------------------------------------
+Ran 12 tests
+
+OK
diff --git a/tests/qemu-iotests/common b/tests/qemu-iotests/common
index 25c351b..ff84f4b 100644
--- a/tests/qemu-iotests/common
+++ b/tests/qemu-iotests/common
@@ -41,7 +41,6 @@ sortme=false
 expunge=true
 have_test_arg=false
 randomize=false
-valgrind=false
 cachemode=false
 rm -f $tmp.list $tmp.tmp $tmp.sed
 
@@ -53,6 +52,7 @@ export CACHEMODE="writeback"
 export QEMU_IO_OPTIONS=""
 export CACHEMODE_IS_DEFAULT=true
 export QEMU_OPTIONS="-nodefaults"
+export VALGRIND_QEMU=
 
 for r
 do
@@ -278,7 +278,7 @@ testlist options
             ;;
 
         -valgrind)
-            valgrind=true
+            VALGRIND_QEMU='y'
             xpand=false
             ;;
 
@@ -436,8 +436,3 @@ fi
 if [ "$IMGPROTO" = "nbd" ] ; then
     [ "$QEMU_NBD" = "" ] && _fatal "qemu-nbd not found"
 fi
-
-if $valgrind; then
-    export REAL_QEMU_IO="$QEMU_IO_PROG"
-    export QEMU_IO_PROG=valgrind_qemu_io
-fi
diff --git a/tests/qemu-iotests/common.config b/tests/qemu-iotests/common.config
index 596bb2b..3ed51b8 100644
--- a/tests/qemu-iotests/common.config
+++ b/tests/qemu-iotests/common.config
@@ -44,6 +44,8 @@ export HOST_OPTIONS=${HOST_OPTIONS:=local.config}
 export CHECK_OPTIONS=${CHECK_OPTIONS:="-g auto"}
 export PWD=`pwd`
 
+export _QEMU_HANDLE=0
+
 # $1 = prog to look for, $2* = default pathnames if not found in $PATH
 set_prog_path()
 {
@@ -105,7 +107,12 @@ fi
 
 _qemu_wrapper()
 {
-    (exec "$QEMU_PROG" $QEMU_OPTIONS "$@")
+    (
+        if [ -n "${QEMU_NEED_PID}" ]; then
+            echo $BASHPID > "${TEST_DIR}/qemu-${_QEMU_HANDLE}.pid"
+        fi
+        exec "$QEMU_PROG" $QEMU_OPTIONS "$@"
+    )
 }
 
 _qemu_img_wrapper()
@@ -115,12 +122,31 @@ _qemu_img_wrapper()
 
 _qemu_io_wrapper()
 {
-    (exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@")
+    local VALGRIND_LOGFILE=/tmp/$$.valgrind
+    local RETVAL
+    (
+        if [ "${VALGRIND_QEMU}" == "y" ]; then
+            exec valgrind --log-file="${VALGRIND_LOGFILE}" --error-exitcode=99 "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@"
+        else
+            exec "$QEMU_IO_PROG" $QEMU_IO_OPTIONS "$@"
+        fi
+    )
+    RETVAL=$?
+    if [ "${VALGRIND_QEMU}" == "y" ]; then
+        if [ $RETVAL == 99 ]; then
+            cat "${VALGRIND_LOGFILE}"
+        fi
+        rm -f "${VALGRIND_LOGFILE}"
+    fi
+    (exit $RETVAL)
 }
 
 _qemu_nbd_wrapper()
 {
-    (exec "$QEMU_NBD_PROG" $QEMU_NBD_OPTIONS "$@")
+    (
+        echo $BASHPID > "${TEST_DIR}/qemu-nbd.pid"
+        exec "$QEMU_NBD_PROG" $QEMU_NBD_OPTIONS "$@"
+    )
 }
 
 export QEMU=_qemu_wrapper
diff --git a/tests/qemu-iotests/common.qemu b/tests/qemu-iotests/common.qemu
index e3faa53..8bf3969 100644
--- a/tests/qemu-iotests/common.qemu
+++ b/tests/qemu-iotests/common.qemu
@@ -30,8 +30,6 @@ QEMU_COMM_TIMEOUT=10
 QEMU_FIFO_IN="${TEST_DIR}/qmp-in-$$"
 QEMU_FIFO_OUT="${TEST_DIR}/qmp-out-$$"
 
-QEMU_PID=
-_QEMU_HANDLE=0
 QEMU_HANDLE=0
 
 # If bash version is >= 4.1, these will be overwritten and dynamic
@@ -153,11 +151,11 @@ function _launch_qemu()
     mkfifo "${fifo_out}"
     mkfifo "${fifo_in}"
 
+    QEMU_NEED_PID='y'\
     ${QEMU} -nographic -serial none ${comm} -machine accel=qtest "${@}" \
                                                                 >"${fifo_out}" \
                                                                 2>&1 \
                                                                 <"${fifo_in}" &
-    QEMU_PID[${_QEMU_HANDLE}]=$!
 
     if [[ "${BASH_VERSINFO[0]}" -ge "5" ||
         ("${BASH_VERSINFO[0]}" -ge "4"  &&  "${BASH_VERSINFO[1]}" -ge "1") ]]
@@ -196,10 +194,18 @@ function _cleanup_qemu()
     # QEMU_PID[], QEMU_IN[], QEMU_OUT[] all use same indices
     for i in "${!QEMU_OUT[@]}"
     do
-        if [ -z "${wait}" ]; then
-            kill -KILL ${QEMU_PID[$i]} 2>/dev/null
+        local QEMU_PID
+        if [ -f "${TEST_DIR}/qemu-${i}.pid" ]; then
+            read QEMU_PID < "${TEST_DIR}/qemu-${i}.pid"
+            rm -f "${TEST_DIR}/qemu-${i}.pid"
+            if [ -z "${wait}" ] && [ -n "${QEMU_PID}" ]; then
+                kill -KILL ${QEMU_PID} 2>/dev/null
+            fi
+            if [ -n "${QEMU_PID}" ]; then
+                wait ${QEMU_PID} 2>/dev/null # silent kill
+            fi
         fi
-        wait ${QEMU_PID[$i]} 2>/dev/null # silent kill
+
         if [ -n "${wait}" ]; then
             cat <&${QEMU_OUT[$i]} | _filter_testdir | _filter_qemu \
                                   | _filter_qemu_io | _filter_qmp
diff --git a/tests/qemu-iotests/common.rc b/tests/qemu-iotests/common.rc
index 28e4bea..d9913f8 100644
--- a/tests/qemu-iotests/common.rc
+++ b/tests/qemu-iotests/common.rc
@@ -70,16 +70,6 @@ else
     TEST_IMG=$IMGPROTO:$TEST_DIR/t.$IMGFMT
 fi
 
-function valgrind_qemu_io()
-{
-    valgrind --log-file=/tmp/$$.valgrind --error-exitcode=99 $REAL_QEMU_IO "$@"
-    if [ $? != 0 ]; then
-        cat /tmp/$$.valgrind
-    fi
-    rm -f /tmp/$$.valgrind
-}
-
-
 _optstr_add()
 {
     if [ -n "$1" ]; then
@@ -154,7 +144,6 @@ _make_test_img()
     # Start an NBD server on the image file, which is what we'll be talking to
     if [ $IMGPROTO = "nbd" ]; then
         eval "$QEMU_NBD -v -t -b 127.0.0.1 -p 10810 -f $IMGFMT  $TEST_IMG_FILE &"
-        QEMU_NBD_PID=$!
         sleep 1 # FIXME: qemu-nbd needs to be listening before we continue
     fi
 }
@@ -175,8 +164,11 @@ _cleanup_test_img()
     case "$IMGPROTO" in
 
         nbd)
-            if [ -n "$QEMU_NBD_PID" ]; then
-                kill $QEMU_NBD_PID
+            if [ -f "${TEST_DIR}/qemu-nbd.pid" ]; then
+                local QEMU_NBD_PID
+                read QEMU_NBD_PID < "${TEST_DIR}/qemu-nbd.pid"
+                kill ${QEMU_NBD_PID}
+                rm -f "${TEST_DIR}/qemu-nbd.pid"
             fi
             rm -f "$TEST_IMG_FILE"
             ;;
diff --git a/tests/qemu-iotests/group b/tests/qemu-iotests/group
index 30c784e..5a08808 100644
--- a/tests/qemu-iotests/group
+++ b/tests/qemu-iotests/group
@@ -122,6 +122,7 @@
 114 rw auto quick
 115 rw auto
 116 rw auto quick
+118 rw auto
 119 rw auto quick
 120 rw auto quick
 121 rw auto
@@ -135,5 +136,7 @@
 132 rw auto quick
 134 rw auto quick
 135 rw auto
+136 rw auto
 137 rw auto
 138 rw auto quick
+139 rw auto quick
diff --git a/tests/test-blockjob-txn.c b/tests/test-blockjob-txn.c
new file mode 100644
index 0000000..34747e9
--- /dev/null
+++ b/tests/test-blockjob-txn.c
@@ -0,0 +1,250 @@
+/*
+ * Blockjob transactions tests
+ *
+ * Copyright Red Hat, Inc. 2015
+ *
+ * Authors:
+ *  Stefan Hajnoczi    <stefanha@redhat.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include <glib.h>
+#include "qapi/error.h"
+#include "qemu/main-loop.h"
+#include "block/blockjob.h"
+
+typedef struct {
+    BlockJob common;
+    unsigned int iterations;
+    bool use_timer;
+    int rc;
+    int *result;
+} TestBlockJob;
+
+static const BlockJobDriver test_block_job_driver = {
+    .instance_size = sizeof(TestBlockJob),
+};
+
+static void test_block_job_complete(BlockJob *job, void *opaque)
+{
+    BlockDriverState *bs = job->bs;
+    int rc = (intptr_t)opaque;
+
+    if (block_job_is_cancelled(job)) {
+        rc = -ECANCELED;
+    }
+
+    block_job_completed(job, rc);
+    bdrv_unref(bs);
+}
+
+static void coroutine_fn test_block_job_run(void *opaque)
+{
+    TestBlockJob *s = opaque;
+    BlockJob *job = &s->common;
+
+    while (s->iterations--) {
+        if (s->use_timer) {
+            block_job_sleep_ns(job, QEMU_CLOCK_REALTIME, 0);
+        } else {
+            block_job_yield(job);
+        }
+
+        if (block_job_is_cancelled(job)) {
+            break;
+        }
+    }
+
+    block_job_defer_to_main_loop(job, test_block_job_complete,
+                                 (void *)(intptr_t)s->rc);
+}
+
+typedef struct {
+    TestBlockJob *job;
+    int *result;
+} TestBlockJobCBData;
+
+static void test_block_job_cb(void *opaque, int ret)
+{
+    TestBlockJobCBData *data = opaque;
+    if (!ret && block_job_is_cancelled(&data->job->common)) {
+        ret = -ECANCELED;
+    }
+    *data->result = ret;
+    g_free(data);
+}
+
+/* Create a block job that completes with a given return code after a given
+ * number of event loop iterations.  The return code is stored in the given
+ * result pointer.
+ *
+ * The event loop iterations can either be handled automatically with a 0 delay
+ * timer, or they can be stepped manually by entering the coroutine.
+ */
+static BlockJob *test_block_job_start(unsigned int iterations,
+                                      bool use_timer,
+                                      int rc, int *result)
+{
+    BlockDriverState *bs;
+    TestBlockJob *s;
+    TestBlockJobCBData *data;
+
+    data = g_new0(TestBlockJobCBData, 1);
+    bs = bdrv_new();
+    s = block_job_create(&test_block_job_driver, bs, 0, test_block_job_cb,
+                         data, &error_abort);
+    s->iterations = iterations;
+    s->use_timer = use_timer;
+    s->rc = rc;
+    s->result = result;
+    s->common.co = qemu_coroutine_create(test_block_job_run);
+    data->job = s;
+    data->result = result;
+    qemu_coroutine_enter(s->common.co, s);
+    return &s->common;
+}
+
+static void test_single_job(int expected)
+{
+    BlockJob *job;
+    BlockJobTxn *txn;
+    int result = -EINPROGRESS;
+
+    txn = block_job_txn_new();
+    job = test_block_job_start(1, true, expected, &result);
+    block_job_txn_add_job(txn, job);
+
+    if (expected == -ECANCELED) {
+        block_job_cancel(job);
+    }
+
+    while (result == -EINPROGRESS) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+    g_assert_cmpint(result, ==, expected);
+
+    block_job_txn_unref(txn);
+}
+
+static void test_single_job_success(void)
+{
+    test_single_job(0);
+}
+
+static void test_single_job_failure(void)
+{
+    test_single_job(-EIO);
+}
+
+static void test_single_job_cancel(void)
+{
+    test_single_job(-ECANCELED);
+}
+
+static void test_pair_jobs(int expected1, int expected2)
+{
+    BlockJob *job1;
+    BlockJob *job2;
+    BlockJobTxn *txn;
+    int result1 = -EINPROGRESS;
+    int result2 = -EINPROGRESS;
+
+    txn = block_job_txn_new();
+    job1 = test_block_job_start(1, true, expected1, &result1);
+    block_job_txn_add_job(txn, job1);
+    job2 = test_block_job_start(2, true, expected2, &result2);
+    block_job_txn_add_job(txn, job2);
+
+    if (expected1 == -ECANCELED) {
+        block_job_cancel(job1);
+    }
+    if (expected2 == -ECANCELED) {
+        block_job_cancel(job2);
+    }
+
+    while (result1 == -EINPROGRESS || result2 == -EINPROGRESS) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+
+    /* Failure or cancellation of one job cancels the other job */
+    if (expected1 != 0) {
+        expected2 = -ECANCELED;
+    } else if (expected2 != 0) {
+        expected1 = -ECANCELED;
+    }
+
+    g_assert_cmpint(result1, ==, expected1);
+    g_assert_cmpint(result2, ==, expected2);
+
+    block_job_txn_unref(txn);
+}
+
+static void test_pair_jobs_success(void)
+{
+    test_pair_jobs(0, 0);
+}
+
+static void test_pair_jobs_failure(void)
+{
+    /* Test both orderings.  The two jobs run for a different number of
+     * iterations so the code path is different depending on which job fails
+     * first.
+     */
+    test_pair_jobs(-EIO, 0);
+    test_pair_jobs(0, -EIO);
+}
+
+static void test_pair_jobs_cancel(void)
+{
+    test_pair_jobs(-ECANCELED, 0);
+    test_pair_jobs(0, -ECANCELED);
+}
+
+static void test_pair_jobs_fail_cancel_race(void)
+{
+    BlockJob *job1;
+    BlockJob *job2;
+    BlockJobTxn *txn;
+    int result1 = -EINPROGRESS;
+    int result2 = -EINPROGRESS;
+
+    txn = block_job_txn_new();
+    job1 = test_block_job_start(1, true, -ECANCELED, &result1);
+    block_job_txn_add_job(txn, job1);
+    job2 = test_block_job_start(2, false, 0, &result2);
+    block_job_txn_add_job(txn, job2);
+
+    block_job_cancel(job1);
+
+    /* Now make job2 finish before the main loop kicks jobs.  This simulates
+     * the race between a pending kick and another job completing.
+     */
+    block_job_enter(job2);
+    block_job_enter(job2);
+
+    while (result1 == -EINPROGRESS || result2 == -EINPROGRESS) {
+        aio_poll(qemu_get_aio_context(), true);
+    }
+
+    g_assert_cmpint(result1, ==, -ECANCELED);
+    g_assert_cmpint(result2, ==, -ECANCELED);
+
+    block_job_txn_unref(txn);
+}
+
+int main(int argc, char **argv)
+{
+    qemu_init_main_loop(&error_abort);
+
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/single/success", test_single_job_success);
+    g_test_add_func("/single/failure", test_single_job_failure);
+    g_test_add_func("/single/cancel", test_single_job_cancel);
+    g_test_add_func("/pair/success", test_pair_jobs_success);
+    g_test_add_func("/pair/failure", test_pair_jobs_failure);
+    g_test_add_func("/pair/cancel", test_pair_jobs_cancel);
+    g_test_add_func("/pair/fail-cancel-race", test_pair_jobs_fail_cancel_race);
+    return g_test_run();
+}
diff --git a/tests/test-qmp-commands.c b/tests/test-qmp-commands.c
index f23d8ea..888fb5f 100644
--- a/tests/test-qmp-commands.c
+++ b/tests/test-qmp-commands.c
@@ -225,8 +225,7 @@ static void test_dealloc_partial(void)
     assert(ud2->dict1 == NULL);
 
     /* confirm & release construction error */
-    assert(err != NULL);
-    error_free(err);
+    error_free_or_abort(&err);
 
     /* tear down partial object */
     qapi_free_UserDefTwo(ud2);
diff --git a/tests/test-qmp-input-strict.c b/tests/test-qmp-input-strict.c
index 53a7693..f1c2e3b 100644
--- a/tests/test-qmp-input-strict.c
+++ b/tests/test-qmp-input-strict.c
@@ -40,31 +40,42 @@ static void validate_teardown(TestInputVisitorData *data,
     }
 }
 
-/* This is provided instead of a test setup function so that the JSON
-   string used by the tests are kept in the test functions (and not
-   int main()) */
-static GCC_FMT_ATTR(2, 3)
-Visitor *validate_test_init(TestInputVisitorData *data,
-                             const char *json_string, ...)
+/* The various test_init functions are provided instead of a test setup
+   function so that the JSON string used by the tests are kept in the test
+   functions (and not in main()). */
+static Visitor *validate_test_init_internal(TestInputVisitorData *data,
+                                            const char *json_string,
+                                            va_list *ap)
 {
     Visitor *v;
-    va_list ap;
 
-    va_start(ap, json_string);
-    data->obj = qobject_from_jsonv(json_string, &ap);
-    va_end(ap);
+    validate_teardown(data, NULL);
 
-    g_assert(data->obj != NULL);
+    data->obj = qobject_from_jsonv(json_string, ap);
+    g_assert(data->obj);
 
     data->qiv = qmp_input_visitor_new_strict(data->obj);
-    g_assert(data->qiv != NULL);
+    g_assert(data->qiv);
 
     v = qmp_input_get_visitor(data->qiv);
-    g_assert(v != NULL);
+    g_assert(v);
 
     return v;
 }
 
+static GCC_FMT_ATTR(2, 3)
+Visitor *validate_test_init(TestInputVisitorData *data,
+                             const char *json_string, ...)
+{
+    Visitor *v;
+    va_list ap;
+
+    va_start(ap, json_string);
+    v = validate_test_init_internal(data, json_string, &ap);
+    va_end(ap);
+    return v;
+}
+
 /* similar to validate_test_init(), but does not expect a string
  * literal/format json_string argument and so can be used for
  * programatically generated strings (and we can't pass in programatically
@@ -75,67 +86,19 @@ Visitor *validate_test_init(TestInputVisitorData *data,
 static Visitor *validate_test_init_raw(TestInputVisitorData *data,
                                        const char *json_string)
 {
-    Visitor *v;
-
-    data->obj = qobject_from_json(json_string);
-    g_assert(data->obj != NULL);
-
-    data->qiv = qmp_input_visitor_new_strict(data->obj);
-    g_assert(data->qiv != NULL);
-
-    v = qmp_input_get_visitor(data->qiv);
-    g_assert(v != NULL);
-
-    return v;
+    return validate_test_init_internal(data, json_string, NULL);
 }
 
-typedef struct TestStruct
-{
-    int64_t integer;
-    bool boolean;
-    char *string;
-} TestStruct;
-
-static void visit_type_TestStruct(Visitor *v, TestStruct **obj,
-                                  const char *name, Error **errp)
-{
-    Error *err = NULL;
-
-    visit_start_struct(v, (void **)obj, "TestStruct", name, sizeof(TestStruct),
-                       &err);
-    if (err) {
-        goto out;
-    }
-
-    visit_type_int(v, &(*obj)->integer, "integer", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_bool(v, &(*obj)->boolean, "boolean", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_str(v, &(*obj)->string, "string", &err);
-
-out_end:
-    error_propagate(errp, err);
-    err = NULL;
-    visit_end_struct(v, &err);
-out:
-    error_propagate(errp, err);
-}
 
 static void test_validate_struct(TestInputVisitorData *data,
                                   const void *unused)
 {
     TestStruct *p = NULL;
-    Error *err = NULL;
     Visitor *v;
 
     v = validate_test_init(data, "{ 'integer': -42, 'boolean': true, 'string': 'foo' }");
 
-    visit_type_TestStruct(v, &p, NULL, &err);
-    g_assert(!err);
+    visit_type_TestStruct(v, &p, NULL, &error_abort);
     g_free(p->string);
     g_free(p);
 }
@@ -144,7 +107,6 @@ static void test_validate_struct_nested(TestInputVisitorData *data,
                                          const void *unused)
 {
     UserDefTwo *udp = NULL;
-    Error *err = NULL;
     Visitor *v;
 
     v = validate_test_init(data, "{ 'string0': 'string0', "
@@ -152,8 +114,7 @@ static void test_validate_struct_nested(TestInputVisitorData *data,
                            "'dict2': { 'userdef': { 'integer': 42, "
                            "'string': 'string' }, 'string': 'string2'}}}");
 
-    visit_type_UserDefTwo(v, &udp, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefTwo(v, &udp, NULL, &error_abort);
     qapi_free_UserDefTwo(udp);
 }
 
@@ -161,13 +122,11 @@ static void test_validate_list(TestInputVisitorData *data,
                                 const void *unused)
 {
     UserDefOneList *head = NULL;
-    Error *err = NULL;
     Visitor *v;
 
     v = validate_test_init(data, "[ { 'string': 'string0', 'integer': 42 }, { 'string': 'string1', 'integer': 43 }, { 'string': 'string2', 'integer': 44 } ]");
 
-    visit_type_UserDefOneList(v, &head, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefOneList(v, &head, NULL, &error_abort);
     qapi_free_UserDefOneList(head);
 }
 
@@ -176,12 +135,10 @@ static void test_validate_union_native_list(TestInputVisitorData *data,
 {
     UserDefNativeListUnion *tmp = NULL;
     Visitor *v;
-    Error *err = NULL;
 
     v = validate_test_init(data, "{ 'type': 'integer', 'data' : [ 1, 2 ] }");
 
-    visit_type_UserDefNativeListUnion(v, &tmp, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefNativeListUnion(v, &tmp, NULL, &error_abort);
     qapi_free_UserDefNativeListUnion(tmp);
 }
 
@@ -190,7 +147,6 @@ static void test_validate_union_flat(TestInputVisitorData *data,
 {
     UserDefFlatUnion *tmp = NULL;
     Visitor *v;
-    Error *err = NULL;
 
     v = validate_test_init(data,
                            "{ 'enum1': 'value1', "
@@ -198,8 +154,7 @@ static void test_validate_union_flat(TestInputVisitorData *data,
                            "'string': 'str', "
                            "'boolean': true }");
 
-    visit_type_UserDefFlatUnion(v, &tmp, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefFlatUnion(v, &tmp, NULL, &error_abort);
     qapi_free_UserDefFlatUnion(tmp);
 }
 
@@ -208,12 +163,10 @@ static void test_validate_alternate(TestInputVisitorData *data,
 {
     UserDefAlternate *tmp = NULL;
     Visitor *v;
-    Error *err = NULL;
 
     v = validate_test_init(data, "42");
 
-    visit_type_UserDefAlternate(v, &tmp, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefAlternate(v, &tmp, NULL, &error_abort);
     qapi_free_UserDefAlternate(tmp);
 }
 
@@ -227,7 +180,7 @@ static void test_validate_fail_struct(TestInputVisitorData *data,
     v = validate_test_init(data, "{ 'integer': -42, 'boolean': true, 'string': 'foo', 'extra': 42 }");
 
     visit_type_TestStruct(v, &p, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     if (p) {
         g_free(p->string);
     }
@@ -244,7 +197,7 @@ static void test_validate_fail_struct_nested(TestInputVisitorData *data,
     v = validate_test_init(data, "{ 'string0': 'string0', 'dict1': { 'string1': 'string1', 'dict2': { 'userdef1': { 'integer': 42, 'string': 'string', 'extra': [42, 23, {'foo':'bar'}] }, 'string2': 'string2'}}}");
 
     visit_type_UserDefTwo(v, &udp, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     qapi_free_UserDefTwo(udp);
 }
 
@@ -258,7 +211,7 @@ static void test_validate_fail_list(TestInputVisitorData *data,
     v = validate_test_init(data, "[ { 'string': 'string0', 'integer': 42 }, { 'string': 'string1', 'integer': 43 }, { 'string': 'string2', 'integer': 44, 'extra': 'ggg' } ]");
 
     visit_type_UserDefOneList(v, &head, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     qapi_free_UserDefOneList(head);
 }
 
@@ -273,7 +226,7 @@ static void test_validate_fail_union_native_list(TestInputVisitorData *data,
                            "{ 'type': 'integer', 'data' : [ 'string' ] }");
 
     visit_type_UserDefNativeListUnion(v, &tmp, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     qapi_free_UserDefNativeListUnion(tmp);
 }
 
@@ -287,7 +240,7 @@ static void test_validate_fail_union_flat(TestInputVisitorData *data,
     v = validate_test_init(data, "{ 'string': 'c', 'integer': 41, 'boolean': true }");
 
     visit_type_UserDefFlatUnion(v, &tmp, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     qapi_free_UserDefFlatUnion(tmp);
 }
 
@@ -302,7 +255,7 @@ static void test_validate_fail_union_flat_no_discrim(TestInputVisitorData *data,
     v = validate_test_init(data, "{ 'integer': 42, 'string': 'c', 'string1': 'd', 'string2': 'e' }");
 
     visit_type_UserDefFlatUnion2(v, &tmp, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     qapi_free_UserDefFlatUnion2(tmp);
 }
 
@@ -316,7 +269,7 @@ static void test_validate_fail_alternate(TestInputVisitorData *data,
     v = validate_test_init(data, "3.14");
 
     visit_type_UserDefAlternate(v, &tmp, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     qapi_free_UserDefAlternate(tmp);
 }
 
@@ -324,16 +277,11 @@ static void do_test_validate_qmp_introspect(TestInputVisitorData *data,
                                             const char *schema_json)
 {
     SchemaInfoList *schema = NULL;
-    Error *err = NULL;
     Visitor *v;
 
     v = validate_test_init_raw(data, schema_json);
 
-    visit_type_SchemaInfoList(v, &schema, NULL, &err);
-    if (err) {
-        fprintf(stderr, "%s", error_get_pretty(err));
-    }
-    g_assert(!err);
+    visit_type_SchemaInfoList(v, &schema, NULL, &error_abort);
     g_assert(schema);
 
     qapi_free_SchemaInfoList(schema);
diff --git a/tests/test-qmp-input-visitor.c b/tests/test-qmp-input-visitor.c
index de65982..d48ebdd 100644
--- a/tests/test-qmp-input-visitor.c
+++ b/tests/test-qmp-input-visitor.c
@@ -36,28 +36,39 @@ static void visitor_input_teardown(TestInputVisitorData *data,
     }
 }
 
-/* This is provided instead of a test setup function so that the JSON
-   string used by the tests are kept in the test functions (and not
-   int main()) */
-static GCC_FMT_ATTR(2, 3)
-Visitor *visitor_input_test_init(TestInputVisitorData *data,
-                                 const char *json_string, ...)
+/* The various test_init functions are provided instead of a test setup
+   function so that the JSON string used by the tests are kept in the test
+   functions (and not in main()). */
+static Visitor *visitor_input_test_init_internal(TestInputVisitorData *data,
+                                                 const char *json_string,
+                                                 va_list *ap)
 {
     Visitor *v;
-    va_list ap;
 
-    va_start(ap, json_string);
-    data->obj = qobject_from_jsonv(json_string, &ap);
-    va_end(ap);
+    visitor_input_teardown(data, NULL);
 
-    g_assert(data->obj != NULL);
+    data->obj = qobject_from_jsonv(json_string, ap);
+    g_assert(data->obj);
 
     data->qiv = qmp_input_visitor_new(data->obj);
-    g_assert(data->qiv != NULL);
+    g_assert(data->qiv);
 
     v = qmp_input_get_visitor(data->qiv);
-    g_assert(v != NULL);
+    g_assert(v);
+
+    return v;
+}
+
+static GCC_FMT_ATTR(2, 3)
+Visitor *visitor_input_test_init(TestInputVisitorData *data,
+                                 const char *json_string, ...)
+{
+    Visitor *v;
+    va_list ap;
 
+    va_start(ap, json_string);
+    v = visitor_input_test_init_internal(data, json_string, &ap);
+    va_end(ap);
     return v;
 }
 
@@ -71,32 +82,18 @@ Visitor *visitor_input_test_init(TestInputVisitorData *data,
 static Visitor *visitor_input_test_init_raw(TestInputVisitorData *data,
                                             const char *json_string)
 {
-    Visitor *v;
-
-    data->obj = qobject_from_json(json_string);
-
-    g_assert(data->obj != NULL);
-
-    data->qiv = qmp_input_visitor_new(data->obj);
-    g_assert(data->qiv != NULL);
-
-    v = qmp_input_get_visitor(data->qiv);
-    g_assert(v != NULL);
-
-    return v;
+    return visitor_input_test_init_internal(data, json_string, NULL);
 }
 
 static void test_visitor_in_int(TestInputVisitorData *data,
                                 const void *unused)
 {
     int64_t res = 0, value = -42;
-    Error *err = NULL;
     Visitor *v;
 
     v = visitor_input_test_init(data, "%" PRId64, value);
 
-    visit_type_int(v, &res, NULL, &err);
-    g_assert(!err);
+    visit_type_int(v, &res, NULL, &error_abort);
     g_assert_cmpint(res, ==, value);
 }
 
@@ -114,21 +111,18 @@ static void test_visitor_in_int_overflow(TestInputVisitorData *data,
     v = visitor_input_test_init(data, "%f", DBL_MAX);
 
     visit_type_int(v, &res, NULL, &err);
-    g_assert(err);
-    error_free(err);
+    error_free_or_abort(&err);
 }
 
 static void test_visitor_in_bool(TestInputVisitorData *data,
                                  const void *unused)
 {
-    Error *err = NULL;
     bool res = false;
     Visitor *v;
 
     v = visitor_input_test_init(data, "true");
 
-    visit_type_bool(v, &res, NULL, &err);
-    g_assert(!err);
+    visit_type_bool(v, &res, NULL, &error_abort);
     g_assert_cmpint(res, ==, true);
 }
 
@@ -136,13 +130,11 @@ static void test_visitor_in_number(TestInputVisitorData *data,
                                    const void *unused)
 {
     double res = 0, value = 3.14;
-    Error *err = NULL;
     Visitor *v;
 
     v = visitor_input_test_init(data, "%f", value);
 
-    visit_type_number(v, &res, NULL, &err);
-    g_assert(!err);
+    visit_type_number(v, &res, NULL, &error_abort);
     g_assert_cmpfloat(res, ==, value);
 }
 
@@ -150,13 +142,11 @@ static void test_visitor_in_string(TestInputVisitorData *data,
                                    const void *unused)
 {
     char *res = NULL, *value = (char *) "Q E M U";
-    Error *err = NULL;
     Visitor *v;
 
     v = visitor_input_test_init(data, "%s", value);
 
-    visit_type_str(v, &res, NULL, &err);
-    g_assert(!err);
+    visit_type_str(v, &res, NULL, &error_abort);
     g_assert_cmpstr(res, ==, value);
 
     g_free(res);
@@ -165,7 +155,6 @@ static void test_visitor_in_string(TestInputVisitorData *data,
 static void test_visitor_in_enum(TestInputVisitorData *data,
                                  const void *unused)
 {
-    Error *err = NULL;
     Visitor *v;
     EnumOne i;
 
@@ -174,63 +163,21 @@ static void test_visitor_in_enum(TestInputVisitorData *data,
 
         v = visitor_input_test_init(data, "%s", EnumOne_lookup[i]);
 
-        visit_type_EnumOne(v, &res, NULL, &err);
-        g_assert(!err);
+        visit_type_EnumOne(v, &res, NULL, &error_abort);
         g_assert_cmpint(i, ==, res);
-
-        visitor_input_teardown(data, NULL);
     }
-
-    data->obj = NULL;
-    data->qiv = NULL;
 }
 
-typedef struct TestStruct
-{
-    int64_t integer;
-    bool boolean;
-    char *string;
-} TestStruct;
-
-static void visit_type_TestStruct(Visitor *v, TestStruct **obj,
-                                  const char *name, Error **errp)
-{
-    Error *err = NULL;
-
-    visit_start_struct(v, (void **)obj, "TestStruct", name, sizeof(TestStruct),
-                       &err);
-    if (err) {
-        goto out;
-    }
-    visit_type_int(v, &(*obj)->integer, "integer", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_bool(v, &(*obj)->boolean, "boolean", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_str(v, &(*obj)->string, "string", &err);
-
-out_end:
-    error_propagate(errp, err);
-    err = NULL;
-    visit_end_struct(v, &err);
-out:
-    error_propagate(errp, err);
-}
 
 static void test_visitor_in_struct(TestInputVisitorData *data,
                                    const void *unused)
 {
     TestStruct *p = NULL;
-    Error *err = NULL;
     Visitor *v;
 
     v = visitor_input_test_init(data, "{ 'integer': -42, 'boolean': true, 'string': 'foo' }");
 
-    visit_type_TestStruct(v, &p, NULL, &err);
-    g_assert(!err);
+    visit_type_TestStruct(v, &p, NULL, &error_abort);
     g_assert_cmpint(p->integer, ==, -42);
     g_assert(p->boolean == true);
     g_assert_cmpstr(p->string, ==, "foo");
@@ -239,17 +186,10 @@ static void test_visitor_in_struct(TestInputVisitorData *data,
     g_free(p);
 }
 
-static void check_and_free_str(char *str, const char *cmp)
-{
-    g_assert_cmpstr(str, ==, cmp);
-    g_free(str);
-}
-
 static void test_visitor_in_struct_nested(TestInputVisitorData *data,
                                           const void *unused)
 {
     UserDefTwo *udp = NULL;
-    Error *err = NULL;
     Visitor *v;
 
     v = visitor_input_test_init(data, "{ 'string0': 'string0', "
@@ -257,34 +197,28 @@ static void test_visitor_in_struct_nested(TestInputVisitorData *data,
                                 "'dict2': { 'userdef': { 'integer': 42, "
                                 "'string': 'string' }, 'string': 'string2'}}}");
 
-    visit_type_UserDefTwo(v, &udp, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefTwo(v, &udp, NULL, &error_abort);
 
-    check_and_free_str(udp->string0, "string0");
-    check_and_free_str(udp->dict1->string1, "string1");
+    g_assert_cmpstr(udp->string0, ==, "string0");
+    g_assert_cmpstr(udp->dict1->string1, ==, "string1");
     g_assert_cmpint(udp->dict1->dict2->userdef->integer, ==, 42);
-    check_and_free_str(udp->dict1->dict2->userdef->string, "string");
-    check_and_free_str(udp->dict1->dict2->string, "string2");
+    g_assert_cmpstr(udp->dict1->dict2->userdef->string, ==, "string");
+    g_assert_cmpstr(udp->dict1->dict2->string, ==, "string2");
     g_assert(udp->dict1->has_dict3 == false);
 
-    g_free(udp->dict1->dict2->userdef);
-    g_free(udp->dict1->dict2);
-    g_free(udp->dict1);
-    g_free(udp);
+    qapi_free_UserDefTwo(udp);
 }
 
 static void test_visitor_in_list(TestInputVisitorData *data,
                                  const void *unused)
 {
     UserDefOneList *item, *head = NULL;
-    Error *err = NULL;
     Visitor *v;
     int i;
 
     v = visitor_input_test_init(data, "[ { 'string': 'string0', 'integer': 42 }, { 'string': 'string1', 'integer': 43 }, { 'string': 'string2', 'integer': 44 } ]");
 
-    visit_type_UserDefOneList(v, &head, NULL, &err);
-    g_assert(!err);
+    visit_type_UserDefOneList(v, &head, NULL, &error_abort);
     g_assert(head != NULL);
 
     for (i = 0, item = head; item; item = item->next, i++) {
@@ -296,13 +230,18 @@ static void test_visitor_in_list(TestInputVisitorData *data,
     }
 
     qapi_free_UserDefOneList(head);
+    head = NULL;
+
+    /* An empty list is valid */
+    v = visitor_input_test_init(data, "[]");
+    visit_type_UserDefOneList(v, &head, NULL, &error_abort);
+    g_assert(!head);
 }
 
 static void test_visitor_in_any(TestInputVisitorData *data,
                                 const void *unused)
 {
     QObject *res = NULL;
-    Error *err = NULL;
     Visitor *v;
     QInt *qint;
     QBool *qbool;
@@ -311,16 +250,14 @@ static void test_visitor_in_any(TestInputVisitorData *data,
     QObject *qobj;
 
     v = visitor_input_test_init(data, "-42");
-    visit_type_any(v, &res, NULL, &err);
-    g_assert(!err);
+    visit_type_any(v, &res, NULL, &error_abort);
     qint = qobject_to_qint(res);
     g_assert(qint);
     g_assert_cmpint(qint_get_int(qint), ==, -42);
     qobject_decref(res);
 
     v = visitor_input_test_init(data, "{ 'integer': -42, 'boolean': true, 'string': 'foo' }");
-    visit_type_any(v, &res, NULL, &err);
-    g_assert(!err);
+    visit_type_any(v, &res, NULL, &error_abort);
     qdict = qobject_to_qdict(res);
     g_assert(qdict && qdict_size(qdict) == 3);
     qobj = qdict_get(qdict, "integer");
@@ -345,7 +282,6 @@ static void test_visitor_in_union_flat(TestInputVisitorData *data,
                                        const void *unused)
 {
     Visitor *v;
-    Error *err = NULL;
     UserDefFlatUnion *tmp;
     UserDefUnionBase *base;
 
@@ -355,8 +291,7 @@ static void test_visitor_in_union_flat(TestInputVisitorData *data,
                                 "'string': 'str', "
                                 "'boolean': true }");
 
-    visit_type_UserDefFlatUnion(v, &tmp, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefFlatUnion(v, &tmp, NULL, &error_abort);
     g_assert_cmpint(tmp->enum1, ==, ENUM_ONE_VALUE1);
     g_assert_cmpstr(tmp->string, ==, "str");
     g_assert_cmpint(tmp->integer, ==, 41);
@@ -380,22 +315,17 @@ static void test_visitor_in_alternate(TestInputVisitorData *data,
     g_assert_cmpint(tmp->type, ==, USER_DEF_ALTERNATE_KIND_I);
     g_assert_cmpint(tmp->u.i, ==, 42);
     qapi_free_UserDefAlternate(tmp);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "'string'");
     visit_type_UserDefAlternate(v, &tmp, NULL, &error_abort);
     g_assert_cmpint(tmp->type, ==, USER_DEF_ALTERNATE_KIND_S);
     g_assert_cmpstr(tmp->u.s, ==, "string");
     qapi_free_UserDefAlternate(tmp);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "false");
     visit_type_UserDefAlternate(v, &tmp, NULL, &err);
-    g_assert(err);
-    error_free(err);
-    err = NULL;
+    error_free_or_abort(&err);
     qapi_free_UserDefAlternate(tmp);
-    visitor_input_teardown(data, NULL);
 }
 
 static void test_visitor_in_alternate_number(TestInputVisitorData *data,
@@ -414,11 +344,8 @@ static void test_visitor_in_alternate_number(TestInputVisitorData *data,
 
     v = visitor_input_test_init(data, "42");
     visit_type_AltStrBool(v, &asb, NULL, &err);
-    g_assert(err);
-    error_free(err);
-    err = NULL;
+    error_free_or_abort(&err);
     qapi_free_AltStrBool(asb);
-    visitor_input_teardown(data, NULL);
 
     /* FIXME: Order of alternate should not affect semantics; asn should
      * parse the same as ans */
@@ -426,85 +353,68 @@ static void test_visitor_in_alternate_number(TestInputVisitorData *data,
     visit_type_AltStrNum(v, &asn, NULL, &err);
     /* FIXME g_assert_cmpint(asn->type, == ALT_STR_NUM_KIND_N); */
     /* FIXME g_assert_cmpfloat(asn->u.n, ==, 42); */
-    g_assert(err);
-    error_free(err);
-    err = NULL;
+    error_free_or_abort(&err);
     qapi_free_AltStrNum(asn);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42");
     visit_type_AltNumStr(v, &ans, NULL, &error_abort);
     g_assert_cmpint(ans->type, ==, ALT_NUM_STR_KIND_N);
     g_assert_cmpfloat(ans->u.n, ==, 42);
     qapi_free_AltNumStr(ans);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42");
     visit_type_AltStrInt(v, &asi, NULL, &error_abort);
     g_assert_cmpint(asi->type, ==, ALT_STR_INT_KIND_I);
     g_assert_cmpint(asi->u.i, ==, 42);
     qapi_free_AltStrInt(asi);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42");
     visit_type_AltIntNum(v, &ain, NULL, &error_abort);
     g_assert_cmpint(ain->type, ==, ALT_INT_NUM_KIND_I);
     g_assert_cmpint(ain->u.i, ==, 42);
     qapi_free_AltIntNum(ain);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42");
     visit_type_AltNumInt(v, &ani, NULL, &error_abort);
     g_assert_cmpint(ani->type, ==, ALT_NUM_INT_KIND_I);
     g_assert_cmpint(ani->u.i, ==, 42);
     qapi_free_AltNumInt(ani);
-    visitor_input_teardown(data, NULL);
 
     /* Parsing a double */
 
     v = visitor_input_test_init(data, "42.5");
     visit_type_AltStrBool(v, &asb, NULL, &err);
-    g_assert(err);
-    error_free(err);
-    err = NULL;
+    error_free_or_abort(&err);
     qapi_free_AltStrBool(asb);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42.5");
     visit_type_AltStrNum(v, &asn, NULL, &error_abort);
     g_assert_cmpint(asn->type, ==, ALT_STR_NUM_KIND_N);
     g_assert_cmpfloat(asn->u.n, ==, 42.5);
     qapi_free_AltStrNum(asn);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42.5");
     visit_type_AltNumStr(v, &ans, NULL, &error_abort);
     g_assert_cmpint(ans->type, ==, ALT_NUM_STR_KIND_N);
     g_assert_cmpfloat(ans->u.n, ==, 42.5);
     qapi_free_AltNumStr(ans);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42.5");
     visit_type_AltStrInt(v, &asi, NULL, &err);
-    g_assert(err);
-    error_free(err);
-    err = NULL;
+    error_free_or_abort(&err);
     qapi_free_AltStrInt(asi);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42.5");
     visit_type_AltIntNum(v, &ain, NULL, &error_abort);
     g_assert_cmpint(ain->type, ==, ALT_INT_NUM_KIND_N);
     g_assert_cmpfloat(ain->u.n, ==, 42.5);
     qapi_free_AltIntNum(ain);
-    visitor_input_teardown(data, NULL);
 
     v = visitor_input_test_init(data, "42.5");
     visit_type_AltNumInt(v, &ani, NULL, &error_abort);
     g_assert_cmpint(ani->type, ==, ALT_NUM_INT_KIND_N);
     g_assert_cmpfloat(ani->u.n, ==, 42.5);
     qapi_free_AltNumInt(ani);
-    visitor_input_teardown(data, NULL);
 }
 
 static void test_native_list_integer_helper(TestInputVisitorData *data,
@@ -512,7 +422,6 @@ static void test_native_list_integer_helper(TestInputVisitorData *data,
                                             UserDefNativeListUnionKind kind)
 {
     UserDefNativeListUnion *cvalue = NULL;
-    Error *err = NULL;
     Visitor *v;
     GString *gstr_list = g_string_new("");
     GString *gstr_union = g_string_new("");
@@ -529,8 +438,7 @@ static void test_native_list_integer_helper(TestInputVisitorData *data,
                            gstr_list->str);
     v = visitor_input_test_init_raw(data,  gstr_union->str);
 
-    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &error_abort);
     g_assert(cvalue != NULL);
     g_assert_cmpint(cvalue->type, ==, kind);
 
@@ -675,7 +583,6 @@ static void test_visitor_in_native_list_bool(TestInputVisitorData *data,
 {
     UserDefNativeListUnion *cvalue = NULL;
     boolList *elem = NULL;
-    Error *err = NULL;
     Visitor *v;
     GString *gstr_list = g_string_new("");
     GString *gstr_union = g_string_new("");
@@ -692,8 +599,7 @@ static void test_visitor_in_native_list_bool(TestInputVisitorData *data,
                            gstr_list->str);
     v = visitor_input_test_init_raw(data,  gstr_union->str);
 
-    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &error_abort);
     g_assert(cvalue != NULL);
     g_assert_cmpint(cvalue->type, ==, USER_DEF_NATIVE_LIST_UNION_KIND_BOOLEAN);
 
@@ -711,7 +617,6 @@ static void test_visitor_in_native_list_string(TestInputVisitorData *data,
 {
     UserDefNativeListUnion *cvalue = NULL;
     strList *elem = NULL;
-    Error *err = NULL;
     Visitor *v;
     GString *gstr_list = g_string_new("");
     GString *gstr_union = g_string_new("");
@@ -727,8 +632,7 @@ static void test_visitor_in_native_list_string(TestInputVisitorData *data,
                            gstr_list->str);
     v = visitor_input_test_init_raw(data,  gstr_union->str);
 
-    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &error_abort);
     g_assert(cvalue != NULL);
     g_assert_cmpint(cvalue->type, ==, USER_DEF_NATIVE_LIST_UNION_KIND_STRING);
 
@@ -750,7 +654,6 @@ static void test_visitor_in_native_list_number(TestInputVisitorData *data,
 {
     UserDefNativeListUnion *cvalue = NULL;
     numberList *elem = NULL;
-    Error *err = NULL;
     Visitor *v;
     GString *gstr_list = g_string_new("");
     GString *gstr_union = g_string_new("");
@@ -766,8 +669,7 @@ static void test_visitor_in_native_list_number(TestInputVisitorData *data,
                            gstr_list->str);
     v = visitor_input_test_init_raw(data,  gstr_union->str);
 
-    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefNativeListUnion(v, &cvalue, NULL, &error_abort);
     g_assert(cvalue != NULL);
     g_assert_cmpint(cvalue->type, ==, USER_DEF_NATIVE_LIST_UNION_KIND_NUMBER);
 
@@ -802,18 +704,69 @@ static void test_visitor_in_errors(TestInputVisitorData *data,
     TestStruct *p = NULL;
     Error *err = NULL;
     Visitor *v;
+    strList *q = NULL;
 
-    v = visitor_input_test_init(data, "{ 'integer': false, 'boolean': 'foo', 'string': -42 }");
+    v = visitor_input_test_init(data, "{ 'integer': false, 'boolean': 'foo', "
+                                "'string': -42 }");
 
     visit_type_TestStruct(v, &p, NULL, &err);
-    g_assert(err);
+    error_free_or_abort(&err);
     /* FIXME - a failed parse should not leave a partially-allocated p
      * for us to clean up; this could cause callers to leak memory. */
     g_assert(p->string == NULL);
 
-    error_free(err);
     g_free(p->string);
     g_free(p);
+
+    v = visitor_input_test_init(data, "[ '1', '2', false, '3' ]");
+    visit_type_strList(v, &q, NULL, &err);
+    error_free_or_abort(&err);
+    assert(q);
+    qapi_free_strList(q);
+}
+
+static void test_visitor_in_wrong_type(TestInputVisitorData *data,
+                                       const void *unused)
+{
+    TestStruct *p = NULL;
+    Visitor *v;
+    strList *q = NULL;
+    int64_t i;
+    Error *err = NULL;
+
+    /* Make sure arrays and structs cannot be confused */
+
+    v = visitor_input_test_init(data, "[]");
+    visit_type_TestStruct(v, &p, NULL, &err);
+    error_free_or_abort(&err);
+    g_assert(!p);
+
+    v = visitor_input_test_init(data, "{}");
+    visit_type_strList(v, &q, NULL, &err);
+    error_free_or_abort(&err);
+    assert(!q);
+
+    /* Make sure primitives and struct cannot be confused */
+
+    v = visitor_input_test_init(data, "1");
+    visit_type_TestStruct(v, &p, NULL, &err);
+    error_free_or_abort(&err);
+    g_assert(!p);
+
+    v = visitor_input_test_init(data, "{}");
+    visit_type_int(v, &i, NULL, &err);
+    error_free_or_abort(&err);
+
+    /* Make sure primitives and arrays cannot be confused */
+
+    v = visitor_input_test_init(data, "1");
+    visit_type_strList(v, &q, NULL, &err);
+    error_free_or_abort(&err);
+    assert(!q);
+
+    v = visitor_input_test_init(data, "[]");
+    visit_type_int(v, &i, NULL, &err);
+    error_free_or_abort(&err);
 }
 
 int main(int argc, char **argv)
@@ -848,6 +801,8 @@ int main(int argc, char **argv)
                            &in_visitor_data, test_visitor_in_alternate);
     input_visitor_test_add("/visitor/input/errors",
                            &in_visitor_data, test_visitor_in_errors);
+    input_visitor_test_add("/visitor/input/wrong-type",
+                           &in_visitor_data, test_visitor_in_wrong_type);
     input_visitor_test_add("/visitor/input/alternate-number",
                            &in_visitor_data, test_visitor_in_alternate_number);
     input_visitor_test_add("/visitor/input/native_list/int",
diff --git a/tests/test-qmp-output-visitor.c b/tests/test-qmp-output-visitor.c
index 09d0dd8..0d0c859 100644
--- a/tests/test-qmp-output-visitor.c
+++ b/tests/test-qmp-output-visitor.c
@@ -45,11 +45,9 @@ static void test_visitor_out_int(TestOutputVisitorData *data,
                                  const void *unused)
 {
     int64_t value = -42;
-    Error *err = NULL;
     QObject *obj;
 
-    visit_type_int(data->ov, &value, NULL, &err);
-    g_assert(!err);
+    visit_type_int(data->ov, &value, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -62,12 +60,10 @@ static void test_visitor_out_int(TestOutputVisitorData *data,
 static void test_visitor_out_bool(TestOutputVisitorData *data,
                                   const void *unused)
 {
-    Error *err = NULL;
     bool value = true;
     QObject *obj;
 
-    visit_type_bool(data->ov, &value, NULL, &err);
-    g_assert(!err);
+    visit_type_bool(data->ov, &value, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -81,11 +77,9 @@ static void test_visitor_out_number(TestOutputVisitorData *data,
                                     const void *unused)
 {
     double value = 3.14;
-    Error *err = NULL;
     QObject *obj;
 
-    visit_type_number(data->ov, &value, NULL, &err);
-    g_assert(!err);
+    visit_type_number(data->ov, &value, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -99,11 +93,9 @@ static void test_visitor_out_string(TestOutputVisitorData *data,
                                     const void *unused)
 {
     char *string = (char *) "Q E M U";
-    Error *err = NULL;
     QObject *obj;
 
-    visit_type_str(data->ov, &string, NULL, &err);
-    g_assert(!err);
+    visit_type_str(data->ov, &string, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -117,12 +109,10 @@ static void test_visitor_out_no_string(TestOutputVisitorData *data,
                                        const void *unused)
 {
     char *string = NULL;
-    Error *err = NULL;
     QObject *obj;
 
     /* A null string should return "" */
-    visit_type_str(data->ov, &string, NULL, &err);
-    g_assert(!err);
+    visit_type_str(data->ov, &string, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -135,13 +125,11 @@ static void test_visitor_out_no_string(TestOutputVisitorData *data,
 static void test_visitor_out_enum(TestOutputVisitorData *data,
                                   const void *unused)
 {
-    Error *err = NULL;
     QObject *obj;
     EnumOne i;
 
     for (i = 0; i < ENUM_ONE_MAX; i++) {
-        visit_type_EnumOne(data->ov, &i, "unused", &err);
-        g_assert(!err);
+        visit_type_EnumOne(data->ov, &i, "unused", &error_abort);
 
         obj = qmp_output_get_qobject(data->qov);
         g_assert(obj != NULL);
@@ -166,41 +154,6 @@ static void test_visitor_out_enum_errors(TestOutputVisitorData *data,
     }
 }
 
-typedef struct TestStruct
-{
-    int64_t integer;
-    bool boolean;
-    char *string;
-} TestStruct;
-
-static void visit_type_TestStruct(Visitor *v, TestStruct **obj,
-                                  const char *name, Error **errp)
-{
-    Error *err = NULL;
-
-    visit_start_struct(v, (void **)obj, "TestStruct", name, sizeof(TestStruct),
-                       &err);
-    if (err) {
-        goto out;
-    }
-
-    visit_type_int(v, &(*obj)->integer, "integer", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_bool(v, &(*obj)->boolean, "boolean", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_str(v, &(*obj)->string, "string", &err);
-
-out_end:
-    error_propagate(errp, err);
-    err = NULL;
-    visit_end_struct(v, &err);
-out:
-    error_propagate(errp, err);
-}
 
 static void test_visitor_out_struct(TestOutputVisitorData *data,
                                     const void *unused)
@@ -209,12 +162,10 @@ static void test_visitor_out_struct(TestOutputVisitorData *data,
                                .boolean = false,
                                .string = (char *) "foo"};
     TestStruct *p = &test_struct;
-    Error *err = NULL;
     QObject *obj;
     QDict *qdict;
 
-    visit_type_TestStruct(data->ov, &p, NULL, &err);
-    g_assert(!err);
+    visit_type_TestStruct(data->ov, &p, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -233,7 +184,6 @@ static void test_visitor_out_struct_nested(TestOutputVisitorData *data,
                                            const void *unused)
 {
     int64_t value = 42;
-    Error *err = NULL;
     UserDefTwo *ud2;
     QObject *obj;
     QDict *qdict, *dict1, *dict2, *dict3, *userdef;
@@ -260,8 +210,7 @@ static void test_visitor_out_struct_nested(TestOutputVisitorData *data,
     ud2->dict1->dict3->userdef->integer = value;
     ud2->dict1->dict3->string = g_strdup(strings[3]);
 
-    visit_type_UserDefTwo(data->ov, &ud2, "unused", &err);
-    g_assert(!err);
+    visit_type_UserDefTwo(data->ov, &ud2, "unused", &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -314,57 +263,33 @@ static void test_visitor_out_struct_errors(TestOutputVisitorData *data,
     }
 }
 
-typedef struct TestStructList
-{
-    union {
-        TestStruct *value;
-        uint64_t padding;
-    };
-    struct TestStructList *next;
-} TestStructList;
-
-static void visit_type_TestStructList(Visitor *v, TestStructList **obj,
-                                      const char *name, Error **errp)
-{
-    GenericList *i, **head = (GenericList **)obj;
-
-    visit_start_list(v, name, errp);
-
-    for (*head = i = visit_next_list(v, head, errp); i; i = visit_next_list(v, &i, errp)) {
-        TestStructList *native_i = (TestStructList *)i;
-        visit_type_TestStruct(v, &native_i->value, NULL, errp);
-    }
-
-    visit_end_list(v, errp);
-}
 
 static void test_visitor_out_list(TestOutputVisitorData *data,
                                   const void *unused)
 {
-    char *value_str = (char *) "list value";
+    const char *value_str = "list value";
     TestStructList *p, *head = NULL;
     const int max_items = 10;
     bool value_bool = true;
     int value_int = 10;
-    Error *err = NULL;
     QListEntry *entry;
     QObject *obj;
     QList *qlist;
     int i;
 
+    /* Build the list in reverse order... */
     for (i = 0; i < max_items; i++) {
         p = g_malloc0(sizeof(*p));
         p->value = g_malloc0(sizeof(*p->value));
-        p->value->integer = value_int;
+        p->value->integer = value_int + (max_items - i - 1);
         p->value->boolean = value_bool;
-        p->value->string = value_str;
+        p->value->string = g_strdup(value_str);
 
         p->next = head;
         head = p;
     }
 
-    visit_type_TestStructList(data->ov, &head, NULL, &err);
-    g_assert(!err);
+    visit_type_TestStructList(data->ov, &head, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
@@ -373,6 +298,7 @@ static void test_visitor_out_list(TestOutputVisitorData *data,
     qlist = qobject_to_qlist(obj);
     g_assert(!qlist_empty(qlist));
 
+    /* ...and ensure that the visitor sees it in order */
     i = 0;
     QLIST_FOREACH_ENTRY(qlist, entry) {
         QDict *qdict;
@@ -380,7 +306,7 @@ static void test_visitor_out_list(TestOutputVisitorData *data,
         g_assert(qobject_type(entry->value) == QTYPE_QDICT);
         qdict = qobject_to_qdict(entry->value);
         g_assert_cmpint(qdict_size(qdict), ==, 3);
-        g_assert_cmpint(qdict_get_int(qdict, "integer"), ==, value_int);
+        g_assert_cmpint(qdict_get_int(qdict, "integer"), ==, value_int + i);
         g_assert_cmpint(qdict_get_bool(qdict, "boolean"), ==, value_bool);
         g_assert_cmpstr(qdict_get_str(qdict, "string"), ==, value_str);
         i++;
@@ -388,13 +314,7 @@ static void test_visitor_out_list(TestOutputVisitorData *data,
     g_assert_cmpint(i, ==, max_items);
 
     QDECREF(qlist);
-
-    for (p = head; p;) {
-        TestStructList *tmp = p->next;
-        g_free(p->value);
-        g_free(p);
-        p = tmp;
-    }
+    qapi_free_TestStructList(head);
 }
 
 static void test_visitor_out_list_qapi_free(TestOutputVisitorData *data,
@@ -429,7 +349,6 @@ static void test_visitor_out_any(TestOutputVisitorData *data,
                                  const void *unused)
 {
     QObject *qobj;
-    Error *err = NULL;
     QInt *qint;
     QBool *qbool;
     QString *qstring;
@@ -437,8 +356,7 @@ static void test_visitor_out_any(TestOutputVisitorData *data,
     QObject *obj;
 
     qobj = QOBJECT(qint_from_int(-42));
-    visit_type_any(data->ov, &qobj, NULL, &err);
-    g_assert(!err);
+    visit_type_any(data->ov, &qobj, NULL, &error_abort);
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
     g_assert(qobject_type(obj) == QTYPE_QINT);
@@ -451,8 +369,8 @@ static void test_visitor_out_any(TestOutputVisitorData *data,
     qdict_put(qdict, "boolean", qbool_from_bool(true));
     qdict_put(qdict, "string", qstring_from_str("foo"));
     qobj = QOBJECT(qdict);
-    visit_type_any(data->ov, &qobj, NULL, &err);
-    g_assert(!err);
+    visit_type_any(data->ov, &qobj, NULL, &error_abort);
+    qobject_decref(qobj);
     obj = qmp_output_get_qobject(data->qov);
     g_assert(obj != NULL);
     qdict = qobject_to_qdict(obj);
@@ -473,7 +391,6 @@ static void test_visitor_out_any(TestOutputVisitorData *data,
     g_assert(qstring);
     g_assert_cmpstr(qstring_get_str(qstring), ==, "foo");
     qobject_decref(obj);
-    qobject_decref(qobj);
 }
 
 static void test_visitor_out_union_flat(TestOutputVisitorData *data,
@@ -482,8 +399,6 @@ static void test_visitor_out_union_flat(TestOutputVisitorData *data,
     QObject *arg;
     QDict *qdict;
 
-    Error *err = NULL;
-
     UserDefFlatUnion *tmp = g_malloc0(sizeof(UserDefFlatUnion));
     tmp->enum1 = ENUM_ONE_VALUE1;
     tmp->string = g_strdup("str");
@@ -491,8 +406,7 @@ static void test_visitor_out_union_flat(TestOutputVisitorData *data,
     tmp->integer = 41;
     tmp->u.value1->boolean = true;
 
-    visit_type_UserDefFlatUnion(data->ov, &tmp, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefFlatUnion(data->ov, &tmp, NULL, &error_abort);
     arg = qmp_output_get_qobject(data->qov);
 
     g_assert(qobject_type(arg) == QTYPE_QDICT);
@@ -511,20 +425,33 @@ static void test_visitor_out_alternate(TestOutputVisitorData *data,
                                        const void *unused)
 {
     QObject *arg;
-    Error *err = NULL;
+    UserDefAlternate *tmp;
 
-    UserDefAlternate *tmp = g_malloc0(sizeof(UserDefAlternate));
+    tmp = g_new0(UserDefAlternate, 1);
     tmp->type = USER_DEF_ALTERNATE_KIND_I;
     tmp->u.i = 42;
 
-    visit_type_UserDefAlternate(data->ov, &tmp, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefAlternate(data->ov, &tmp, NULL, &error_abort);
     arg = qmp_output_get_qobject(data->qov);
 
     g_assert(qobject_type(arg) == QTYPE_QINT);
     g_assert_cmpint(qint_get_int(qobject_to_qint(arg)), ==, 42);
 
     qapi_free_UserDefAlternate(tmp);
+    qobject_decref(arg);
+
+    tmp = g_new0(UserDefAlternate, 1);
+    tmp->type = USER_DEF_ALTERNATE_KIND_S;
+    tmp->u.s = g_strdup("hello");
+
+    visit_type_UserDefAlternate(data->ov, &tmp, NULL, &error_abort);
+    arg = qmp_output_get_qobject(data->qov);
+
+    g_assert(qobject_type(arg) == QTYPE_QSTRING);
+    g_assert_cmpstr(qstring_get_str(qobject_to_qstring(arg)), ==, "hello");
+
+    qapi_free_UserDefAlternate(tmp);
+    qobject_decref(arg);
 }
 
 static void test_visitor_out_empty(TestOutputVisitorData *data,
@@ -758,14 +685,12 @@ static void test_native_list(TestOutputVisitorData *data,
                              UserDefNativeListUnionKind kind)
 {
     UserDefNativeListUnion *cvalue = g_new0(UserDefNativeListUnion, 1);
-    Error *err = NULL;
     QObject *obj;
 
     cvalue->type = kind;
     init_native_list(cvalue);
 
-    visit_type_UserDefNativeListUnion(data->ov, &cvalue, NULL, &err);
-    g_assert(err == NULL);
+    visit_type_UserDefNativeListUnion(data->ov, &cvalue, NULL, &error_abort);
 
     obj = qmp_output_get_qobject(data->qov);
     check_native_list(obj, cvalue->type);
diff --git a/tests/test-timed-average.c b/tests/test-timed-average.c
new file mode 100644
index 0000000..a049799
--- /dev/null
+++ b/tests/test-timed-average.c
@@ -0,0 +1,90 @@
+/*
+ * Timed average computation tests
+ *
+ * Copyright Nodalink, EURL. 2014
+ *
+ * Authors:
+ *  Benoît Canet     <benoit.canet@nodalink.com>
+ *
+ * This work is licensed under the terms of the GNU LGPL, version 2 or later.
+ * See the COPYING.LIB file in the top-level directory.
+ */
+
+#include <glib.h>
+#include <unistd.h>
+
+#include "qemu/timed-average.h"
+
+/* This is the clock for QEMU_CLOCK_VIRTUAL */
+static int64_t my_clock_value;
+
+int64_t cpu_get_clock(void)
+{
+    return my_clock_value;
+}
+
+static void account(TimedAverage *ta)
+{
+    timed_average_account(ta, 1);
+    timed_average_account(ta, 5);
+    timed_average_account(ta, 2);
+    timed_average_account(ta, 4);
+    timed_average_account(ta, 3);
+}
+
+static void test_average(void)
+{
+    TimedAverage ta;
+    uint64_t result;
+    int i;
+
+    /* we will compute some average on a period of 1 second */
+    timed_average_init(&ta, QEMU_CLOCK_VIRTUAL, NANOSECONDS_PER_SECOND);
+
+    result = timed_average_min(&ta);
+    g_assert(result == 0);
+    result = timed_average_avg(&ta);
+    g_assert(result == 0);
+    result = timed_average_max(&ta);
+    g_assert(result == 0);
+
+    for (i = 0; i < 100; i++) {
+        account(&ta);
+        result = timed_average_min(&ta);
+        g_assert(result == 1);
+        result = timed_average_avg(&ta);
+        g_assert(result == 3);
+        result = timed_average_max(&ta);
+        g_assert(result == 5);
+        my_clock_value += NANOSECONDS_PER_SECOND / 10;
+    }
+
+    my_clock_value += NANOSECONDS_PER_SECOND * 100;
+
+    result = timed_average_min(&ta);
+    g_assert(result == 0);
+    result = timed_average_avg(&ta);
+    g_assert(result == 0);
+    result = timed_average_max(&ta);
+    g_assert(result == 0);
+
+    for (i = 0; i < 100; i++) {
+        account(&ta);
+        result = timed_average_min(&ta);
+        g_assert(result == 1);
+        result = timed_average_avg(&ta);
+        g_assert(result == 3);
+        result = timed_average_max(&ta);
+        g_assert(result == 5);
+        my_clock_value += NANOSECONDS_PER_SECOND / 10;
+    }
+}
+
+int main(int argc, char **argv)
+{
+    /* tests in the same order as the header function declarations */
+    g_test_init(&argc, &argv, NULL);
+    g_test_add_func("/timed-average/average", test_average);
+    return g_test_run();
+}
+
diff --git a/tests/test-visitor-serialization.c b/tests/test-visitor-serialization.c
index 634563b..9f67f9e 100644
--- a/tests/test-visitor-serialization.c
+++ b/tests/test-visitor-serialization.c
@@ -186,40 +186,6 @@ static void visit_primitive_list(Visitor *v, void **native, Error **errp)
     }
 }
 
-typedef struct TestStruct
-{
-    int64_t integer;
-    bool boolean;
-    char *string;
-} TestStruct;
-
-static void visit_type_TestStruct(Visitor *v, TestStruct **obj,
-                                  const char *name, Error **errp)
-{
-    Error *err = NULL;
-
-    visit_start_struct(v, (void **)obj, NULL, name, sizeof(TestStruct), &err);
-    if (err) {
-        goto out;
-    }
-
-    visit_type_int(v, &(*obj)->integer, "integer", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_bool(v, &(*obj)->boolean, "boolean", &err);
-    if (err) {
-        goto out_end;
-    }
-    visit_type_str(v, &(*obj)->string, "string", &err);
-
-out_end:
-    error_propagate(errp, err);
-    err = NULL;
-    visit_end_struct(v, &err);
-out:
-    error_propagate(errp, err);
-}
 
 static TestStruct *struct_create(void)
 {
@@ -336,14 +302,13 @@ static void test_primitives(gconstpointer opaque)
     const SerializeOps *ops = args->ops;
     PrimitiveType *pt = args->test_data;
     PrimitiveType *pt_copy = g_malloc0(sizeof(*pt_copy));
-    Error *err = NULL;
     void *serialize_data;
 
     pt_copy->type = pt->type;
-    ops->serialize(pt, &serialize_data, visit_primitive_type, &err);
-    ops->deserialize((void **)&pt_copy, serialize_data, visit_primitive_type, &err);
+    ops->serialize(pt, &serialize_data, visit_primitive_type, &error_abort);
+    ops->deserialize((void **)&pt_copy, serialize_data, visit_primitive_type,
+                     &error_abort);
 
-    g_assert(err == NULL);
     g_assert(pt_copy != NULL);
     if (pt->type == PTYPE_STRING) {
         g_assert_cmpstr(pt->value.string, ==, pt_copy->value.string);
@@ -379,7 +344,6 @@ static void test_primitive_lists(gconstpointer opaque)
     PrimitiveList pl = { .value = { NULL } };
     PrimitiveList pl_copy = { .value = { NULL } };
     PrimitiveList *pl_copy_ptr = &pl_copy;
-    Error *err = NULL;
     void *serialize_data;
     void *cur_head = NULL;
     int i;
@@ -526,10 +490,11 @@ static void test_primitive_lists(gconstpointer opaque)
         }
     }
 
-    ops->serialize((void **)&pl, &serialize_data, visit_primitive_list, &err);
-    ops->deserialize((void **)&pl_copy_ptr, serialize_data, visit_primitive_list, &err);
+    ops->serialize((void **)&pl, &serialize_data, visit_primitive_list,
+                   &error_abort);
+    ops->deserialize((void **)&pl_copy_ptr, serialize_data,
+                     visit_primitive_list, &error_abort);
 
-    g_assert(err == NULL);
     i = 0;
 
     /* compare our deserialized list of primitives to the original */
@@ -686,10 +651,8 @@ static void test_primitive_lists(gconstpointer opaque)
     g_assert_cmpint(i, ==, 33);
 
     ops->cleanup(serialize_data);
-    dealloc_helper(&pl, visit_primitive_list, &err);
-    g_assert(!err);
-    dealloc_helper(&pl_copy, visit_primitive_list, &err);
-    g_assert(!err);
+    dealloc_helper(&pl, visit_primitive_list, &error_abort);
+    dealloc_helper(&pl_copy, visit_primitive_list, &error_abort);
     g_free(args);
 }
 
@@ -699,13 +662,12 @@ static void test_struct(gconstpointer opaque)
     const SerializeOps *ops = args->ops;
     TestStruct *ts = struct_create();
     TestStruct *ts_copy = NULL;
-    Error *err = NULL;
     void *serialize_data;
 
-    ops->serialize(ts, &serialize_data, visit_struct, &err);
-    ops->deserialize((void **)&ts_copy, serialize_data, visit_struct, &err); 
+    ops->serialize(ts, &serialize_data, visit_struct, &error_abort);
+    ops->deserialize((void **)&ts_copy, serialize_data, visit_struct,
+                     &error_abort);
 
-    g_assert(err == NULL);
     struct_compare(ts, ts_copy);
 
     struct_cleanup(ts);
@@ -721,14 +683,12 @@ static void test_nested_struct(gconstpointer opaque)
     const SerializeOps *ops = args->ops;
     UserDefTwo *udnp = nested_struct_create();
     UserDefTwo *udnp_copy = NULL;
-    Error *err = NULL;
     void *serialize_data;
 
-    ops->serialize(udnp, &serialize_data, visit_nested_struct, &err);
+    ops->serialize(udnp, &serialize_data, visit_nested_struct, &error_abort);
     ops->deserialize((void **)&udnp_copy, serialize_data, visit_nested_struct,
-                     &err);
+                     &error_abort);
 
-    g_assert(err == NULL);
     nested_struct_compare(udnp, udnp_copy);
 
     nested_struct_cleanup(udnp);
@@ -743,7 +703,6 @@ static void test_nested_struct_list(gconstpointer opaque)
     TestArgs *args = (TestArgs *) opaque;
     const SerializeOps *ops = args->ops;
     UserDefTwoList *listp = NULL, *tmp, *tmp_copy, *listp_copy = NULL;
-    Error *err = NULL;
     void *serialize_data;
     int i = 0;
 
@@ -754,11 +713,10 @@ static void test_nested_struct_list(gconstpointer opaque)
         listp = tmp;
     }
 
-    ops->serialize(listp, &serialize_data, visit_nested_struct_list, &err);
+    ops->serialize(listp, &serialize_data, visit_nested_struct_list,
+                   &error_abort);
     ops->deserialize((void **)&listp_copy, serialize_data,
-                     visit_nested_struct_list, &err); 
-
-    g_assert(err == NULL);
+                     visit_nested_struct_list, &error_abort);
 
     tmp = listp;
     tmp_copy = listp_copy;
diff --git a/tests/vhost-user-bridge.c b/tests/vhost-user-bridge.c
index fa18ad5..864f69e 100644
--- a/tests/vhost-user-bridge.c
+++ b/tests/vhost-user-bridge.c
@@ -188,7 +188,7 @@ typedef enum VhostUserRequest {
     VHOST_USER_GET_FEATURES = 1,
     VHOST_USER_SET_FEATURES = 2,
     VHOST_USER_SET_OWNER = 3,
-    VHOST_USER_RESET_DEVICE = 4,
+    VHOST_USER_RESET_OWNER = 4,
     VHOST_USER_SET_MEM_TABLE = 5,
     VHOST_USER_SET_LOG_BASE = 6,
     VHOST_USER_SET_LOG_FD = 7,
@@ -274,7 +274,7 @@ static const char *vubr_request_str[] = {
     [VHOST_USER_GET_FEATURES]           =  "VHOST_USER_GET_FEATURES",
     [VHOST_USER_SET_FEATURES]           =  "VHOST_USER_SET_FEATURES",
     [VHOST_USER_SET_OWNER]              =  "VHOST_USER_SET_OWNER",
-    [VHOST_USER_RESET_DEVICE]           =  "VHOST_USER_RESET_DEVICE",
+    [VHOST_USER_RESET_OWNER]           =  "VHOST_USER_RESET_OWNER",
     [VHOST_USER_SET_MEM_TABLE]          =  "VHOST_USER_SET_MEM_TABLE",
     [VHOST_USER_SET_LOG_BASE]           =  "VHOST_USER_SET_LOG_BASE",
     [VHOST_USER_SET_LOG_FD]             =  "VHOST_USER_SET_LOG_FD",
@@ -921,7 +921,7 @@ vubr_execute_request(VubrDev *dev, VhostUserMsg *vmsg)
         return vubr_set_features_exec(dev, vmsg);
     case VHOST_USER_SET_OWNER:
         return vubr_set_owner_exec(dev, vmsg);
-    case VHOST_USER_RESET_DEVICE:
+    case VHOST_USER_RESET_OWNER:
         return vubr_reset_device_exec(dev, vmsg);
     case VHOST_USER_SET_MEM_TABLE:
         return vubr_set_mem_table_exec(dev, vmsg);
diff --git a/tests/vhost-user-test.c b/tests/vhost-user-test.c
index b6dde75..01cfc7e 100644
--- a/tests/vhost-user-test.c
+++ b/tests/vhost-user-test.c
@@ -57,7 +57,7 @@ typedef enum VhostUserRequest {
     VHOST_USER_GET_FEATURES = 1,
     VHOST_USER_SET_FEATURES = 2,
     VHOST_USER_SET_OWNER = 3,
-    VHOST_USER_RESET_DEVICE = 4,
+    VHOST_USER_RESET_OWNER = 4,
     VHOST_USER_SET_MEM_TABLE = 5,
     VHOST_USER_SET_LOG_BASE = 6,
     VHOST_USER_SET_LOG_FD = 7,
@@ -86,6 +86,11 @@ typedef struct VhostUserMemory {
     VhostUserMemoryRegion regions[VHOST_MEMORY_MAX_NREGIONS];
 } VhostUserMemory;
 
+typedef struct VhostUserLog {
+    uint64_t mmap_size;
+    uint64_t mmap_offset;
+} VhostUserLog;
+
 typedef struct VhostUserMsg {
     VhostUserRequest request;
 
@@ -94,10 +99,13 @@ typedef struct VhostUserMsg {
     uint32_t flags;
     uint32_t size; /* the following payload size */
     union {
+#define VHOST_USER_VRING_IDX_MASK   (0xff)
+#define VHOST_USER_VRING_NOFD_MASK  (0x1<<8)
         uint64_t u64;
         struct vhost_vring_state state;
         struct vhost_vring_addr addr;
         VhostUserMemory memory;
+        VhostUserLog log;
     } payload;
 } QEMU_PACKED VhostUserMsg;
 
@@ -307,7 +315,7 @@ static void chr_read(void *opaque, const uint8_t *buf, int size)
         g_cond_signal(&s->data_cond);
         break;
 
-    case VHOST_USER_RESET_DEVICE:
+    case VHOST_USER_RESET_OWNER:
         s->fds_num = 0;
         break;
 
diff --git a/trace-events b/trace-events
index e67ad81..0b0ff02 100644
--- a/trace-events
+++ b/trace-events
@@ -139,8 +139,8 @@ paio_submit_co(int64_t sector_num, int nb_sectors, int type) "sector_num %"PRId6
 paio_submit(void *acb, void *opaque, int64_t sector_num, int nb_sectors, int type) "acb %p opaque %p sector_num %"PRId64" nb_sectors %d type %d"
 
 # ioport.c
-cpu_in(unsigned int addr, unsigned int val) "addr %#x value %u"
-cpu_out(unsigned int addr, unsigned int val) "addr %#x value %u"
+cpu_in(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
+cpu_out(unsigned int addr, char size, unsigned int val) "addr %#x(%c) value %u"
 
 # balloon.c
 # Since requests are raised via monitor, not many tracepoints are needed.
@@ -1202,16 +1202,43 @@ virtio_gpu_fence_resp(uint64_t fence) "fence 0x%" PRIx64
 
 # migration/savevm.c
 qemu_loadvm_state_section(unsigned int section_type) "%d"
+qemu_loadvm_state_section_command(int ret) "%d"
 qemu_loadvm_state_section_partend(uint32_t section_id) "%u"
+qemu_loadvm_state_main(void) ""
+qemu_loadvm_state_main_quit_parent(void) ""
+qemu_loadvm_state_post_main(int ret) "%d"
 qemu_loadvm_state_section_startfull(uint32_t section_id, const char *idstr, uint32_t instance_id, uint32_t version_id) "%u(%s) %u %u"
+qemu_savevm_send_packaged(void) ""
+loadvm_handle_cmd_packaged(unsigned int length) "%u"
+loadvm_handle_cmd_packaged_main(int ret) "%d"
+loadvm_handle_cmd_packaged_received(int ret) "%d"
+loadvm_postcopy_handle_advise(void) ""
+loadvm_postcopy_handle_listen(void) ""
+loadvm_postcopy_handle_run(void) ""
+loadvm_postcopy_handle_run_cpu_sync(void) ""
+loadvm_postcopy_handle_run_vmstart(void) ""
+loadvm_postcopy_ram_handle_discard(void) ""
+loadvm_postcopy_ram_handle_discard_end(void) ""
+loadvm_postcopy_ram_handle_discard_header(const char *ramid, uint16_t len) "%s: %ud"
+loadvm_process_command(uint16_t com, uint16_t len) "com=0x%x len=%d"
+loadvm_process_command_ping(uint32_t val) "%x"
+postcopy_ram_listen_thread_exit(void) ""
+postcopy_ram_listen_thread_start(void) ""
+qemu_savevm_send_postcopy_advise(void) ""
+qemu_savevm_send_postcopy_ram_discard(const char *id, uint16_t len) "%s: %ud"
+savevm_command_send(uint16_t command, uint16_t len) "com=0x%x len=%d"
 savevm_section_start(const char *id, unsigned int section_id) "%s, section_id %u"
 savevm_section_end(const char *id, unsigned int section_id, int ret) "%s, section_id %u -> %d"
 savevm_section_skip(const char *id, unsigned int section_id) "%s, section_id %u"
+savevm_send_open_return_path(void) ""
+savevm_send_ping(uint32_t val) "%x"
+savevm_send_postcopy_listen(void) ""
+savevm_send_postcopy_run(void) ""
 savevm_state_begin(void) ""
 savevm_state_header(void) ""
 savevm_state_iterate(void) ""
-savevm_state_complete(void) ""
-savevm_state_cancel(void) ""
+savevm_state_cleanup(void) ""
+savevm_state_complete_precopy(void) ""
 vmstate_save(const char *idstr, const char *vmsd_name) "%s, %s"
 vmstate_load(const char *idstr, const char *vmsd_name) "%s, %s"
 qemu_announce_self_iter(const char *mac) "%s"
@@ -1229,9 +1256,14 @@ vmstate_subsection_load_good(const char *parent) "%s"
 qemu_file_fclose(void) ""
 
 # migration/ram.c
+get_queued_page(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr) "%s/%" PRIx64 " ram_addr=%" PRIx64
+get_queued_page_not_dirty(const char *block_name, uint64_t tmp_offset, uint64_t ram_addr, int sent) "%s/%" PRIx64 " ram_addr=%" PRIx64 " (sent=%d)"
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
 migration_throttle(void) ""
+ram_load_postcopy_loop(uint64_t addr, int flags) "@%" PRIx64 " %x"
+ram_postcopy_send_discard_bitmap(void) ""
+ram_save_queue_pages(const char *rbname, size_t start, size_t len) "%s: start: %zx len: %zx"
 
 # hw/display/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
@@ -1427,17 +1459,40 @@ flic_no_device_api(int err) "flic: no Device Contral API support %d"
 flic_reset_failed(int err) "flic: reset failed %d"
 
 # migration.c
+await_return_path_close_on_source_close(void) ""
+await_return_path_close_on_source_joining(void) ""
 migrate_set_state(int new_state) "new state %d"
 migrate_fd_cleanup(void) ""
 migrate_fd_error(void) ""
 migrate_fd_cancel(void) ""
-migrate_pending(uint64_t size, uint64_t max) "pending size %" PRIu64 " max %" PRIu64
-migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
-migrate_state_too_big(void) ""
+migrate_handle_rp_req_pages(const char *rbname, size_t start, size_t len) "in %s at %zx len %zx"
+migrate_pending(uint64_t size, uint64_t max, uint64_t post, uint64_t nonpost) "pending size %" PRIu64 " max %" PRIu64 " (post=%" PRIu64 " nonpost=%" PRIu64 ")"
+migrate_send_rp_message(int msg_type, uint16_t len) "%d: len %d"
+migration_completion_file_err(void) ""
+migration_completion_postcopy_end(void) ""
+migration_completion_postcopy_end_after_complete(void) ""
+migration_completion_postcopy_end_before_rp(void) ""
+migration_completion_postcopy_end_after_rp(int rp_error) "%d"
+migration_thread_after_loop(void) ""
+migration_thread_file_err(void) ""
+migration_thread_setup_complete(void) ""
+open_return_path_on_source(void) ""
+open_return_path_on_source_continue(void) ""
+postcopy_start(void) ""
+postcopy_start_set_run(void) ""
+source_return_path_thread_bad_end(void) ""
+source_return_path_thread_end(void) ""
+source_return_path_thread_entry(void) ""
+source_return_path_thread_loop_top(void) ""
+source_return_path_thread_pong(uint32_t val) "%x"
+source_return_path_thread_shut(uint32_t val) "%x"
 migrate_global_state_post_load(const char *state) "loaded state: %s"
 migrate_global_state_pre_save(const char *state) "saved state: %s"
-migration_completion_file_err(void) ""
 migration_thread_low_pending(uint64_t pending) "%" PRIu64
+migrate_state_too_big(void) ""
+migrate_transferred(uint64_t tranferred, uint64_t time_spent, double bandwidth, uint64_t size) "transferred %" PRIu64 " time_spent %" PRIu64 " bandwidth %g max_size %" PRId64
+process_incoming_migration_co_end(int ret, int ps) "ret=%d postcopy-state=%d"
+process_incoming_migration_co_postcopy_end_main(void) ""
 
 # migration/rdma.c
 qemu_rdma_accept_incoming_migration(void) ""
@@ -1503,6 +1558,25 @@ rdma_start_incoming_migration_after_rdma_listen(void) ""
 rdma_start_outgoing_migration_after_rdma_connect(void) ""
 rdma_start_outgoing_migration_after_rdma_source_init(void) ""
 
+# migration/postcopy-ram.c
+postcopy_discard_send_finish(const char *ramblock, int nwords, int ncmds) "%s mask words sent=%d in %d commands"
+postcopy_discard_send_range(const char *ramblock, unsigned long start, unsigned long length) "%s:%lx/%lx"
+postcopy_ram_discard_range(void *start, size_t length) "%p,+%zx"
+postcopy_cleanup_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_init_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_nhp_range(const char *ramblock, void *host_addr, size_t offset, size_t length) "%s: %p offset=%zx length=%zx"
+postcopy_place_page(void *host_addr) "host=%p"
+postcopy_place_page_zero(void *host_addr) "host=%p"
+postcopy_ram_enable_notify(void) ""
+postcopy_ram_fault_thread_entry(void) ""
+postcopy_ram_fault_thread_exit(void) ""
+postcopy_ram_fault_thread_quit(void) ""
+postcopy_ram_fault_thread_request(uint64_t hostaddr, const char *ramblock, size_t offset) "Request for HVA=%" PRIx64 " rb=%s offset=%zx"
+postcopy_ram_incoming_cleanup_closeuf(void) ""
+postcopy_ram_incoming_cleanup_entry(void) ""
+postcopy_ram_incoming_cleanup_exit(void) ""
+postcopy_ram_incoming_cleanup_join(void) ""
+
 # kvm-all.c
 kvm_ioctl(int type, void *arg) "type 0x%x, arg %p"
 kvm_vm_ioctl(int type, void *arg) "type 0x%x, arg %p"
diff --git a/translate-all.c b/translate-all.c
index 20ce40e..a940bd2 100644
--- a/translate-all.c
+++ b/translate-all.c
@@ -1069,7 +1069,7 @@ TranslationBlock *tb_gen_code(CPUState *cpu,
 #endif
 
     phys_pc = get_page_addr_code(env, pc);
-    if (use_icount) {
+    if (use_icount && !(cflags & CF_IGNORE_ICOUNT)) {
         cflags |= CF_USE_ICOUNT;
     }
 
diff --git a/ui/cocoa.m b/ui/cocoa.m
index c0d6bb2..1554331 100644
--- a/ui/cocoa.m
+++ b/ui/cocoa.m
@@ -1113,10 +1113,13 @@ QemuCocoaView *cocoaView;
         }
 
         Error *err = NULL;
-        qmp_change_blockdev([drive cStringUsingEncoding: NSASCIIStringEncoding],
-                            [file cStringUsingEncoding: NSASCIIStringEncoding],
-                            "raw",
-                            &err);
+        qmp_blockdev_change_medium([drive cStringUsingEncoding:
+                                          NSASCIIStringEncoding],
+                                   [file cStringUsingEncoding:
+                                         NSASCIIStringEncoding],
+                                   true, "raw",
+                                   false, 0,
+                                   &err);
         handleAnyDeviceErrors(err);
     }
 }
diff --git a/ui/console.c b/ui/console.c
index f26544e..745c354 100644
--- a/ui/console.c
+++ b/ui/console.c
@@ -450,7 +450,7 @@ static void text_console_resize(QemuConsole *s)
     if (s->width < w1)
         w1 = s->width;
 
-    cells = g_malloc(s->width * s->total_height * sizeof(TextCell));
+    cells = g_new(TextCell, s->width * s->total_height);
     for(y = 0; y < s->total_height; y++) {
         c = &cells[y * s->width];
         if (w1 > 0) {
diff --git a/ui/curses.c b/ui/curses.c
index 266260a..7e7e402 100644
--- a/ui/curses.c
+++ b/ui/curses.c
@@ -431,7 +431,7 @@ void curses_display_init(DisplayState *ds, int full_screen)
 
     curses_winch_init();
 
-    dcl = (DisplayChangeListener *) g_malloc0(sizeof(DisplayChangeListener));
+    dcl = g_new0(DisplayChangeListener, 1);
     dcl->ops = &dcl_ops;
     register_displaychangelistener(dcl);
 
diff --git a/ui/input-legacy.c b/ui/input-legacy.c
index a67ed32..e0a39f0 100644
--- a/ui/input-legacy.c
+++ b/ui/input-legacy.c
@@ -206,7 +206,7 @@ QEMUPutMouseEntry *qemu_add_mouse_event_handler(QEMUPutMouseEvent *func,
 {
     QEMUPutMouseEntry *s;
 
-    s = g_malloc0(sizeof(QEMUPutMouseEntry));
+    s = g_new0(QEMUPutMouseEntry, 1);
 
     s->qemu_put_mouse_event = func;
     s->qemu_put_mouse_event_opaque = opaque;
@@ -240,7 +240,7 @@ QEMUPutLEDEntry *qemu_add_led_event_handler(QEMUPutLEDEvent *func,
 {
     QEMUPutLEDEntry *s;
 
-    s = g_malloc0(sizeof(QEMUPutLEDEntry));
+    s = g_new0(QEMUPutLEDEntry, 1);
 
     s->put_led = func;
     s->opaque = opaque;
diff --git a/ui/input.c b/ui/input.c
index 643f885..a0f9873 100644
--- a/ui/input.c
+++ b/ui/input.c
@@ -6,6 +6,7 @@
 #include "trace.h"
 #include "ui/input.h"
 #include "ui/console.h"
+#include "sysemu/replay.h"
 
 struct QemuInputHandlerState {
     DeviceState       *dev;
@@ -300,14 +301,10 @@ static void qemu_input_queue_sync(struct QemuInputEventQueueHead *queue)
     QTAILQ_INSERT_TAIL(queue, item, node);
 }
 
-void qemu_input_event_send(QemuConsole *src, InputEvent *evt)
+void qemu_input_event_send_impl(QemuConsole *src, InputEvent *evt)
 {
     QemuInputHandlerState *s;
 
-    if (!runstate_is_running() && !runstate_check(RUN_STATE_SUSPENDED)) {
-        return;
-    }
-
     qemu_input_event_trace(src, evt);
 
     /* pre processing */
@@ -324,14 +321,19 @@ void qemu_input_event_send(QemuConsole *src, InputEvent *evt)
     s->events++;
 }
 
-void qemu_input_event_sync(void)
+void qemu_input_event_send(QemuConsole *src, InputEvent *evt)
 {
-    QemuInputHandlerState *s;
-
     if (!runstate_is_running() && !runstate_check(RUN_STATE_SUSPENDED)) {
         return;
     }
 
+    replay_input_event(src, evt);
+}
+
+void qemu_input_event_sync_impl(void)
+{
+    QemuInputHandlerState *s;
+
     trace_input_event_sync();
 
     QTAILQ_FOREACH(s, &handlers, node) {
@@ -345,6 +347,15 @@ void qemu_input_event_sync(void)
     }
 }
 
+void qemu_input_event_sync(void)
+{
+    if (!runstate_is_running() && !runstate_check(RUN_STATE_SUSPENDED)) {
+        return;
+    }
+
+    replay_input_sync_event();
+}
+
 InputEvent *qemu_input_event_new_key(KeyValue *key, bool down)
 {
     InputEvent *evt = g_new0(InputEvent, 1);
diff --git a/ui/keymaps.c b/ui/keymaps.c
index 49410ae..1b9ba3f 100644
--- a/ui/keymaps.c
+++ b/ui/keymaps.c
@@ -109,7 +109,7 @@ static kbd_layout_t *parse_keyboard_layout(const name2keysym_t *table,
     }
 
     if (!k) {
-        k = g_malloc0(sizeof(kbd_layout_t));
+        k = g_new0(kbd_layout_t, 1);
     }
 
     for(;;) {
diff --git a/ui/sdl.c b/ui/sdl.c
index 3be2910..570cb99 100644
--- a/ui/sdl.c
+++ b/ui/sdl.c
@@ -985,7 +985,7 @@ void sdl_display_init(DisplayState *ds, int full_screen, int no_frame)
         sdl_grab_start();
     }
 
-    dcl = g_malloc0(sizeof(DisplayChangeListener));
+    dcl = g_new0(DisplayChangeListener, 1);
     dcl->ops = &dcl_ops;
     register_displaychangelistener(dcl);
 
diff --git a/ui/vnc-jobs.c b/ui/vnc-jobs.c
index 08f0163..aa21191 100644
--- a/ui/vnc-jobs.c
+++ b/ui/vnc-jobs.c
@@ -79,7 +79,7 @@ static void vnc_unlock_queue(VncJobQueue *queue)
 
 VncJob *vnc_job_new(VncState *vs)
 {
-    VncJob *job = g_malloc0(sizeof(VncJob));
+    VncJob *job = g_new0(VncJob, 1);
 
     job->vs = vs;
     vnc_lock_queue(queue);
@@ -90,7 +90,7 @@ VncJob *vnc_job_new(VncState *vs)
 
 int vnc_job_add_rect(VncJob *job, int x, int y, int w, int h)
 {
-    VncRectEntry *entry = g_malloc0(sizeof(VncRectEntry));
+    VncRectEntry *entry = g_new0(VncRectEntry, 1);
 
     entry->rect.x = x;
     entry->rect.y = y;
@@ -305,7 +305,7 @@ disconnected:
 
 static VncJobQueue *vnc_queue_init(void)
 {
-    VncJobQueue *queue = g_malloc0(sizeof(VncJobQueue));
+    VncJobQueue *queue = g_new0(VncJobQueue, 1);
 
     qemu_cond_init(&queue->cond);
     qemu_mutex_init(&queue->mutex);
diff --git a/ui/vnc.c b/ui/vnc.c
index 369ad7b..c9f2fed 100644
--- a/ui/vnc.c
+++ b/ui/vnc.c
@@ -3004,7 +3004,7 @@ static void vnc_refresh(DisplayChangeListener *dcl)
 static void vnc_connect(VncDisplay *vd, int csock,
                         bool skipauth, bool websocket)
 {
-    VncState *vs = g_malloc0(sizeof(VncState));
+    VncState *vs = g_new0(VncState, 1);
     int i;
 
     vs->csock = csock;
@@ -3047,7 +3047,7 @@ static void vnc_connect(VncDisplay *vd, int csock,
 
     vs->lossy_rect = g_malloc0(VNC_STAT_ROWS * sizeof (*vs->lossy_rect));
     for (i = 0; i < VNC_STAT_ROWS; ++i) {
-        vs->lossy_rect[i] = g_malloc0(VNC_STAT_COLS * sizeof (uint8_t));
+        vs->lossy_rect[i] = g_new0(uint8_t, VNC_STAT_COLS);
     }
 
     VNC_DEBUG("New client on socket %d\n", csock);
diff --git a/util/Makefile.objs b/util/Makefile.objs
index d7cc399..89dd80e 100644
--- a/util/Makefile.objs
+++ b/util/Makefile.objs
@@ -29,3 +29,4 @@ util-obj-y += qemu-coroutine.o qemu-coroutine-lock.o qemu-coroutine-io.o
 util-obj-y += qemu-coroutine-sleep.o
 util-obj-y += coroutine-$(CONFIG_COROUTINE_BACKEND).o
 util-obj-y += buffer.o
+util-obj-y += timed-average.o
diff --git a/util/error.c b/util/error.c
index 8b86490..80c89a2 100644
--- a/util/error.c
+++ b/util/error.c
@@ -220,6 +220,13 @@ void error_free(Error *err)
     }
 }
 
+void error_free_or_abort(Error **errp)
+{
+    assert(errp && *errp);
+    error_free(*errp);
+    *errp = NULL;
+}
+
 void error_propagate(Error **dst_errp, Error *local_err)
 {
     if (!local_err) {
diff --git a/util/osdep.c b/util/osdep.c
index 0092bb6..80c6bfe 100644
--- a/util/osdep.c
+++ b/util/osdep.c
@@ -52,7 +52,7 @@ extern int madvise(caddr_t, size_t, int);
 
 static bool fips_enabled = false;
 
-static const char *qemu_version = QEMU_VERSION;
+static const char *hw_version = QEMU_VERSION;
 
 int socket_set_cork(int fd, int v)
 {
@@ -311,14 +311,14 @@ int qemu_accept(int s, struct sockaddr *addr, socklen_t *addrlen)
     return ret;
 }
 
-void qemu_set_version(const char *version)
+void qemu_set_hw_version(const char *version)
 {
-    qemu_version = version;
+    hw_version = version;
 }
 
-const char *qemu_get_version(void)
+const char *qemu_hw_version(void)
 {
-    return qemu_version;
+    return hw_version;
 }
 
 void fips_set_state(bool requested)
diff --git a/util/qemu-sockets.c b/util/qemu-sockets.c
index dfe4587..5a31d16 100644
--- a/util/qemu-sockets.c
+++ b/util/qemu-sockets.c
@@ -751,8 +751,7 @@ int unix_listen_opts(QemuOpts *opts, Error **errp)
         qemu_opt_set(opts, "path", un.sun_path, &error_abort);
     }
 
-    if ((access(un.sun_path, F_OK) == 0) &&
-        unlink(un.sun_path) < 0) {
+    if (unlink(un.sun_path) < 0 && errno != ENOENT) {
         error_setg_errno(errp, errno,
                          "Failed to unlink socket %s", un.sun_path);
         goto err;
diff --git a/util/timed-average.c b/util/timed-average.c
new file mode 100644
index 0000000..a2dfb48
--- /dev/null
+++ b/util/timed-average.c
@@ -0,0 +1,231 @@
+/*
+ * QEMU timed average computation
+ *
+ * Copyright (C) Nodalink, EURL. 2014
+ * Copyright (C) Igalia, S.L. 2015
+ *
+ * Authors:
+ *   Benoît Canet <benoit.canet@nodalink.com>
+ *   Alberto Garcia <berto@igalia.com>
+ *
+ * This program is free sofware: you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Sofware Foundation, either version 2 of the License, or
+ * (at your option) version 3 or any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include <string.h>
+
+#include "qemu/timed-average.h"
+
+/* This module computes an average of a set of values within a time
+ * window.
+ *
+ * Algorithm:
+ *
+ * - Create two windows with a certain expiration period, and
+ *   offsetted by period / 2.
+ * - Each time you want to account a new value, do it in both windows.
+ * - The minimum / maximum / average values are always returned from
+ *   the oldest window.
+ *
+ * Example:
+ *
+ *        t=0          |t=0.5           |t=1          |t=1.5            |t=2
+ *        wnd0: [0,0.5)|wnd0: [0.5,1.5) |             |wnd0: [1.5,2.5)  |
+ *        wnd1: [0,1)  |                |wnd1: [1,2)  |                 |
+ *
+ * Values are returned from:
+ *
+ *        wnd0---------|wnd1------------|wnd0---------|wnd1-------------|
+ */
+
+/* Update the expiration of a time window
+ *
+ * @w:      the window used
+ * @now:    the current time in nanoseconds
+ * @period: the expiration period in nanoseconds
+ */
+static void update_expiration(TimedAverageWindow *w, int64_t now,
+                              int64_t period)
+{
+    /* time elapsed since the last theoretical expiration */
+    int64_t elapsed = (now - w->expiration) % period;
+    /* time remaininging until the next expiration */
+    int64_t remaining = period - elapsed;
+    /* compute expiration */
+    w->expiration = now + remaining;
+}
+
+/* Reset a window
+ *
+ * @w: the window to reset
+ */
+static void window_reset(TimedAverageWindow *w)
+{
+    w->min = UINT64_MAX;
+    w->max = 0;
+    w->sum = 0;
+    w->count = 0;
+}
+
+/* Get the current window (that is, the one with the earliest
+ * expiration time).
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: a pointer to the current window
+ */
+static TimedAverageWindow *current_window(TimedAverage *ta)
+{
+     return &ta->windows[ta->current];
+}
+
+/* Initialize a TimedAverage structure
+ *
+ * @ta:         the TimedAverage structure
+ * @clock_type: the type of clock to use
+ * @period:     the time window period in nanoseconds
+ */
+void timed_average_init(TimedAverage *ta, QEMUClockType clock_type,
+                        uint64_t period)
+{
+    int64_t now = qemu_clock_get_ns(clock_type);
+
+    /* Returned values are from the oldest window, so they belong to
+     * the interval [ta->period/2,ta->period). By adjusting the
+     * requested period by 4/3, we guarantee that they're in the
+     * interval [2/3 period,4/3 period), closer to the requested
+     * period on average */
+    ta->period = (uint64_t) period * 4 / 3;
+    ta->clock_type = clock_type;
+    ta->current = 0;
+
+    window_reset(&ta->windows[0]);
+    window_reset(&ta->windows[1]);
+
+    /* Both windows are offsetted by half a period */
+    ta->windows[0].expiration = now + ta->period / 2;
+    ta->windows[1].expiration = now + ta->period;
+}
+
+/* Check if the time windows have expired, updating their counters and
+ * expiration time if that's the case.
+ *
+ * @ta: the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) within the current
+ *           window will be stored here
+ */
+static void check_expirations(TimedAverage *ta, uint64_t *elapsed)
+{
+    int64_t now = qemu_clock_get_ns(ta->clock_type);
+    int i;
+
+    assert(ta->period != 0);
+
+    /* Check if the windows have expired */
+    for (i = 0; i < 2; i++) {
+        TimedAverageWindow *w = &ta->windows[i];
+        if (w->expiration <= now) {
+            window_reset(w);
+            update_expiration(w, now, ta->period);
+        }
+    }
+
+    /* Make ta->current point to the oldest window */
+    if (ta->windows[0].expiration < ta->windows[1].expiration) {
+        ta->current = 0;
+    } else {
+        ta->current = 1;
+    }
+
+    /* Calculate the elapsed time within the current window */
+    if (elapsed) {
+        int64_t remaining = ta->windows[ta->current].expiration - now;
+        *elapsed = ta->period - remaining;
+    }
+}
+
+/* Account a value
+ *
+ * @ta:    the TimedAverage structure
+ * @value: the value to account
+ */
+void timed_average_account(TimedAverage *ta, uint64_t value)
+{
+    int i;
+    check_expirations(ta, NULL);
+
+    /* Do the accounting in both windows at the same time */
+    for (i = 0; i < 2; i++) {
+        TimedAverageWindow *w = &ta->windows[i];
+
+        w->sum += value;
+        w->count++;
+
+        if (value < w->min) {
+            w->min = value;
+        }
+
+        if (value > w->max) {
+            w->max = value;
+        }
+    }
+}
+
+/* Get the minimum value
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: the minimum value
+ */
+uint64_t timed_average_min(TimedAverage *ta)
+{
+    TimedAverageWindow *w;
+    check_expirations(ta, NULL);
+    w = current_window(ta);
+    return w->min < UINT64_MAX ? w->min : 0;
+}
+
+/* Get the average value
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: the average value
+ */
+uint64_t timed_average_avg(TimedAverage *ta)
+{
+    TimedAverageWindow *w;
+    check_expirations(ta, NULL);
+    w = current_window(ta);
+    return w->count > 0 ? w->sum / w->count : 0;
+}
+
+/* Get the maximum value
+ *
+ * @ta:  the TimedAverage structure
+ * @ret: the maximum value
+ */
+uint64_t timed_average_max(TimedAverage *ta)
+{
+    check_expirations(ta, NULL);
+    return current_window(ta)->max;
+}
+
+/* Get the sum of all accounted values
+ * @ta:      the TimedAverage structure
+ * @elapsed: if non-NULL, the elapsed time (in ns) will be stored here
+ * @ret:     the sum of all accounted values
+ */
+uint64_t timed_average_sum(TimedAverage *ta, uint64_t *elapsed)
+{
+    TimedAverageWindow *w;
+    check_expirations(ta, elapsed);
+    w = current_window(ta);
+    return w->sum;
+}
diff --git a/vl.c b/vl.c
index f5f7c3f..7d993a5 100644
--- a/vl.c
+++ b/vl.c
@@ -122,6 +122,8 @@ int main(int argc, char **argv)
 #include "qapi-event.h"
 #include "exec/semihost.h"
 #include "crypto/init.h"
+#include "sysemu/replay.h"
+#include "qapi/qmp/qerror.h"
 
 #define MAX_VIRTIO_CONSOLES 1
 #define MAX_SCLP_CONSOLES 1
@@ -474,6 +476,12 @@ static QemuOptsList qemu_icount_opts = {
         }, {
             .name = "sleep",
             .type = QEMU_OPT_BOOL,
+        }, {
+            .name = "rr",
+            .type = QEMU_OPT_STRING,
+        }, {
+            .name = "rrfile",
+            .type = QEMU_OPT_STRING,
         },
         { /* end of list */ }
     },
@@ -674,9 +682,9 @@ void runstate_set(RunState new_state)
     assert(new_state < RUN_STATE_MAX);
 
     if (!runstate_valid_transitions[current_run_state][new_state]) {
-        fprintf(stderr, "ERROR: invalid runstate transition: '%s' -> '%s'\n",
-                RunState_lookup[current_run_state],
-                RunState_lookup[new_state]);
+        error_report("invalid runstate transition: '%s' -> '%s'",
+                     RunState_lookup[current_run_state],
+                     RunState_lookup[new_state]);
         abort();
     }
     trace_runstate_set(new_state);
@@ -828,8 +836,9 @@ static void configure_rtc_date_offset(const char *startdate, int legacy)
         rtc_start_date = mktimegm(&tm);
         if (rtc_start_date == -1) {
         date_fail:
-            fprintf(stderr, "Invalid date format. Valid formats are:\n"
-                            "'2006-06-17T16:01:21' or '2006-06-17'\n");
+            error_report("invalid date format");
+            error_printf("valid formats: "
+                         "'2006-06-17T16:01:21' or '2006-06-17'\n");
             exit(1);
         }
         rtc_date_offset = qemu_time() - rtc_start_date;
@@ -845,7 +854,11 @@ static void configure_rtc(QemuOpts *opts)
         if (!strcmp(value, "utc")) {
             rtc_utc = 1;
         } else if (!strcmp(value, "localtime")) {
+            Error *blocker = NULL;
             rtc_utc = 0;
+            error_setg(&blocker, QERR_REPLAY_NOT_SUPPORTED,
+                      "-rtc base=localtime");
+            replay_add_blocker(blocker);
         } else {
             configure_rtc_date_offset(value, 0);
         }
@@ -859,7 +872,7 @@ static void configure_rtc(QemuOpts *opts)
         } else if (!strcmp(value, "vm")) {
             rtc_clock = QEMU_CLOCK_VIRTUAL;
         } else {
-            fprintf(stderr, "qemu: invalid option value '%s'\n", value);
+            error_report("invalid option value '%s'", value);
             exit(1);
         }
     }
@@ -879,7 +892,7 @@ static void configure_rtc(QemuOpts *opts)
         } else if (!strcmp(value, "none")) {
             /* discard is default */
         } else {
-            fprintf(stderr, "qemu: invalid option value '%s'\n", value);
+            error_report("invalid option value '%s'", value);
             exit(1);
         }
     }
@@ -905,7 +918,7 @@ static int bt_hci_parse(const char *str)
     bdaddr_t bdaddr;
 
     if (nb_hcis >= MAX_NICS) {
-        fprintf(stderr, "qemu: Too many bluetooth HCIs (max %i).\n", MAX_NICS);
+        error_report("too many bluetooth HCIs (max %i)", MAX_NICS);
         return -1;
     }
 
@@ -931,8 +944,8 @@ static void bt_vhci_add(int vlan_id)
     struct bt_scatternet_s *vlan = qemu_find_bt_vlan(vlan_id);
 
     if (!vlan->slave)
-        fprintf(stderr, "qemu: warning: adding a VHCI to "
-                        "an empty scatternet %i\n", vlan_id);
+        error_report("warning: adding a VHCI to an empty scatternet %i",
+                     vlan_id);
 
     bt_vhci_init(bt_new_hci(vlan));
 }
@@ -950,7 +963,7 @@ static struct bt_device_s *bt_device_add(const char *opt)
     if (endp) {
         vlan_id = strtol(endp + 6, &endp, 0);
         if (*endp) {
-            fprintf(stderr, "qemu: unrecognised bluetooth vlan Id\n");
+            error_report("unrecognised bluetooth vlan Id");
             return 0;
         }
     }
@@ -958,13 +971,13 @@ static struct bt_device_s *bt_device_add(const char *opt)
     vlan = qemu_find_bt_vlan(vlan_id);
 
     if (!vlan->slave)
-        fprintf(stderr, "qemu: warning: adding a slave device to "
-                        "an empty scatternet %i\n", vlan_id);
+        error_report("warning: adding a slave device to an empty scatternet %i",
+                     vlan_id);
 
     if (!strcmp(devname, "keyboard"))
         return bt_keyboard_init(vlan);
 
-    fprintf(stderr, "qemu: unsupported bluetooth device `%s'\n", devname);
+    error_report("unsupported bluetooth device '%s'", devname);
     return 0;
 }
 
@@ -987,11 +1000,11 @@ static int bt_parse(const char *opt)
                 if (strstart(endp, ",vlan=", &p)) {
                     vlan = strtol(p, (char **) &endp, 0);
                     if (*endp) {
-                        fprintf(stderr, "qemu: bad scatternet '%s'\n", p);
+                        error_report("bad scatternet '%s'", p);
                         return 1;
                     }
                 } else {
-                    fprintf(stderr, "qemu: bad parameter '%s'\n", endp + 1);
+                    error_report("bad parameter '%s'", endp + 1);
                     return 1;
                 }
             } else
@@ -1003,7 +1016,7 @@ static int bt_parse(const char *opt)
     } else if (strstart(opt, "device:", &endp))
         return !bt_device_add(endp);
 
-    fprintf(stderr, "qemu: bad bluetooth parameter '%s'\n", opt);
+    error_report("bad bluetooth parameter '%s'", opt);
     return 1;
 }
 
@@ -1018,8 +1031,7 @@ static int parse_sandbox(void *opaque, QemuOpts *opts, Error **errp)
             return -1;
         }
 #else
-        error_report("sandboxing request but seccomp is not compiled "
-                     "into this build");
+        error_report("seccomp support is disabled");
         return -1;
 #endif
     }
@@ -1100,7 +1112,7 @@ static int parse_add_fd(void *opaque, QemuOpts *opts, Error **errp)
     }
 #endif
     if (dupfd == -1) {
-        error_report("Error duplicating fd: %s", strerror(errno));
+        error_report("error duplicating fd: %s", strerror(errno));
         return -1;
     }
 
@@ -1220,18 +1232,19 @@ static void smp_parse(QemuOpts *opts)
         } else if (threads == 0) {
             threads = cpus / (cores * sockets);
         } else if (sockets * cores * threads < cpus) {
-            fprintf(stderr, "cpu topology: error: "
-                    "sockets (%u) * cores (%u) * threads (%u) < "
-                    "smp_cpus (%u)\n",
-                    sockets, cores, threads, cpus);
+            error_report("cpu topology: "
+                         "sockets (%u) * cores (%u) * threads (%u) < "
+                         "smp_cpus (%u)",
+                         sockets, cores, threads, cpus);
             exit(1);
         }
 
         max_cpus = qemu_opt_get_number(opts, "maxcpus", cpus);
         if (sockets * cores * threads > max_cpus) {
-            fprintf(stderr, "cpu topology: error: "
-                    "sockets (%u) * cores (%u) * threads (%u) > maxcpus (%u)\n",
-                    sockets, cores, threads, max_cpus);
+            error_report("cpu topology: "
+                         "sockets (%u) * cores (%u) * threads (%u) > "
+                         "maxcpus (%u)",
+                         sockets, cores, threads, max_cpus);
             exit(1);
         }
 
@@ -1246,21 +1259,26 @@ static void smp_parse(QemuOpts *opts)
     }
 
     if (max_cpus > MAX_CPUMASK_BITS) {
-        fprintf(stderr, "Unsupported number of maxcpus\n");
+        error_report("unsupported number of maxcpus");
         exit(1);
     }
     if (max_cpus < smp_cpus) {
-        fprintf(stderr, "maxcpus must be equal to or greater than smp\n");
+        error_report("maxcpus must be equal to or greater than smp");
         exit(1);
     }
 
+    if (smp_cpus > 1 || smp_cores > 1 || smp_threads > 1) {
+        Error *blocker = NULL;
+        error_setg(&blocker, QERR_REPLAY_NOT_SUPPORTED, "smp");
+        replay_add_blocker(blocker);
+    }
 }
 
 static void realtime_init(void)
 {
     if (enable_mlock) {
         if (os_mlock() < 0) {
-            fprintf(stderr, "qemu: locking memory failed\n");
+            error_report("locking memory failed");
             exit(1);
         }
     }
@@ -1414,7 +1432,7 @@ static int usb_parse(const char *cmdline)
     int r;
     r = usb_device_add(cmdline);
     if (r < 0) {
-        fprintf(stderr, "qemu: could not add USB device '%s'\n", cmdline);
+        error_report("could not add USB device '%s'", cmdline);
     }
     return r;
 }
@@ -1624,14 +1642,14 @@ static int qemu_shutdown_requested(void)
 static void qemu_kill_report(void)
 {
     if (!qtest_driver() && shutdown_signal != -1) {
-        fprintf(stderr, "qemu: terminating on signal %d", shutdown_signal);
         if (shutdown_pid == 0) {
             /* This happens for eg ^C at the terminal, so it's worth
              * avoiding printing an odd message in that case.
              */
-            fputc('\n', stderr);
+            error_report("terminating on signal %d", shutdown_signal);
         } else {
-            fprintf(stderr, " from pid " FMT_pid "\n", shutdown_pid);
+            error_report("terminating on signal %d from pid " FMT_pid,
+                         shutdown_signal, shutdown_pid);
         }
         shutdown_signal = -1;
     }
@@ -1640,15 +1658,21 @@ static void qemu_kill_report(void)
 static int qemu_reset_requested(void)
 {
     int r = reset_requested;
-    reset_requested = 0;
-    return r;
+    if (r && replay_checkpoint(CHECKPOINT_RESET_REQUESTED)) {
+        reset_requested = 0;
+        return r;
+    }
+    return false;
 }
 
 static int qemu_suspend_requested(void)
 {
     int r = suspend_requested;
-    suspend_requested = 0;
-    return r;
+    if (r && replay_checkpoint(CHECKPOINT_SUSPEND_REQUESTED)) {
+        suspend_requested = 0;
+        return r;
+    }
+    return false;
 }
 
 static WakeupReason qemu_wakeup_requested(void)
@@ -1796,12 +1820,18 @@ void qemu_system_killed(int signal, pid_t pid)
     shutdown_signal = signal;
     shutdown_pid = pid;
     no_shutdown = 0;
-    qemu_system_shutdown_request();
+
+    /* Cannot call qemu_system_shutdown_request directly because
+     * we are in a signal handler.
+     */
+    shutdown_requested = 1;
+    qemu_notify_event();
 }
 
 void qemu_system_shutdown_request(void)
 {
     trace_qemu_system_shutdown_request();
+    replay_shutdown_request();
     shutdown_requested = 1;
     qemu_notify_event();
 }
@@ -1980,28 +2010,28 @@ static void select_vgahw (const char *p)
         if (vga_available()) {
             vga_interface_type = VGA_STD;
         } else {
-            fprintf(stderr, "Error: standard VGA not available\n");
+            error_report("standard VGA not available");
             exit(0);
         }
     } else if (strstart(p, "cirrus", &opts)) {
         if (cirrus_vga_available()) {
             vga_interface_type = VGA_CIRRUS;
         } else {
-            fprintf(stderr, "Error: Cirrus VGA not available\n");
+            error_report("Cirrus VGA not available");
             exit(0);
         }
     } else if (strstart(p, "vmware", &opts)) {
         if (vmware_vga_available()) {
             vga_interface_type = VGA_VMWARE;
         } else {
-            fprintf(stderr, "Error: VMWare SVGA not available\n");
+            error_report("VMWare SVGA not available");
             exit(0);
         }
     } else if (strstart(p, "virtio", &opts)) {
         if (virtio_vga_available()) {
             vga_interface_type = VGA_VIRTIO;
         } else {
-            fprintf(stderr, "Error: Virtio VGA not available\n");
+            error_report("Virtio VGA not available");
             exit(0);
         }
     } else if (strstart(p, "xenfb", &opts)) {
@@ -2010,26 +2040,26 @@ static void select_vgahw (const char *p)
         if (qxl_vga_available()) {
             vga_interface_type = VGA_QXL;
         } else {
-            fprintf(stderr, "Error: QXL VGA not available\n");
+            error_report("QXL VGA not available");
             exit(0);
         }
     } else if (strstart(p, "tcx", &opts)) {
         if (tcx_vga_available()) {
             vga_interface_type = VGA_TCX;
         } else {
-            fprintf(stderr, "Error: TCX framebuffer not available\n");
+            error_report("TCX framebuffer not available");
             exit(0);
         }
     } else if (strstart(p, "cg3", &opts)) {
         if (cg3_vga_available()) {
             vga_interface_type = VGA_CG3;
         } else {
-            fprintf(stderr, "Error: CG3 framebuffer not available\n");
+            error_report("CG3 framebuffer not available");
             exit(0);
         }
     } else if (!strstart(p, "none", &opts)) {
     invalid_vga:
-        fprintf(stderr, "Unknown vga type: %s\n", p);
+        error_report("unknown vga type: %s", p);
         exit(1);
     }
     while (*opts) {
@@ -2105,7 +2135,7 @@ static DisplayType select_display(const char *p)
                 }
             } else {
             invalid_sdl_args:
-                error_report("Invalid SDL option string");
+                error_report("invalid SDL option string");
                 exit(1);
             }
             opts = nextopt;
@@ -2134,7 +2164,7 @@ static DisplayType select_display(const char *p)
 #ifdef CONFIG_CURSES
         display = DT_CURSES;
 #else
-        error_report("Curses support is disabled");
+        error_report("curses support is disabled");
         exit(1);
 #endif
     } else if (strstart(p, "gtk", &opts)) {
@@ -2163,7 +2193,7 @@ static DisplayType select_display(const char *p)
                 }
             } else {
             invalid_gtk_args:
-                error_report("Invalid GTK option string");
+                error_report("invalid GTK option string");
                 exit(1);
             }
             opts = nextopt;
@@ -2175,7 +2205,7 @@ static DisplayType select_display(const char *p)
     } else if (strstart(p, "none", &opts)) {
         display = DT_NONE;
     } else {
-        error_report("Unknown display type");
+        error_report("unknown display type");
         exit(1);
     }
 
@@ -2276,8 +2306,8 @@ static int parse_fw_cfg(void *opaque, QemuOpts *opts, Error **errp)
         return -1;
     }
     if (strncmp(name, "opt/", 4) != 0) {
-        error_report("WARNING: externally provided fw_cfg item names "
-                     "should be prefixed with \"opt/\"!");
+        error_report("warning: externally provided fw_cfg item names "
+                     "should be prefixed with \"opt/\"");
     }
     if (nonempty_str(str)) {
         size = strlen(str); /* NUL terminator NOT included in fw_cfg blob */
@@ -2349,7 +2379,7 @@ static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
     } else if (strcmp(mode, "control") == 0) {
         flags = MONITOR_USE_CONTROL;
     } else {
-        fprintf(stderr, "unknown monitor mode \"%s\"\n", mode);
+        error_report("unknown monitor mode \"%s\"", mode);
         exit(1);
     }
 
@@ -2362,7 +2392,7 @@ static int mon_init_func(void *opaque, QemuOpts *opts, Error **errp)
     chardev = qemu_opt_get(opts, "chardev");
     chr = qemu_chr_find(chardev);
     if (chr == NULL) {
-        fprintf(stderr, "chardev \"%s\" not found\n", chardev);
+        error_report("chardev \"%s\" not found", chardev);
         exit(1);
     }
 
@@ -2390,7 +2420,7 @@ static void monitor_parse(const char *optarg, const char *mode, bool pretty)
         }
         opts = qemu_chr_parse_compat(label, optarg);
         if (!opts) {
-            fprintf(stderr, "parse error: %s\n", optarg);
+            error_report("parse error: %s", optarg);
             exit(1);
         }
     }
@@ -2464,14 +2494,14 @@ static int serial_parse(const char *devname)
     if (strcmp(devname, "none") == 0)
         return 0;
     if (index == MAX_SERIAL_PORTS) {
-        fprintf(stderr, "qemu: too many serial ports\n");
+        error_report("too many serial ports");
         exit(1);
     }
     snprintf(label, sizeof(label), "serial%d", index);
     serial_hds[index] = qemu_chr_new(label, devname, NULL);
     if (!serial_hds[index]) {
-        fprintf(stderr, "qemu: could not connect serial device"
-                " to character backend '%s'\n", devname);
+        error_report("could not connect serial device"
+                     " to character backend '%s'", devname);
         return -1;
     }
     index++;
@@ -2486,14 +2516,14 @@ static int parallel_parse(const char *devname)
     if (strcmp(devname, "none") == 0)
         return 0;
     if (index == MAX_PARALLEL_PORTS) {
-        fprintf(stderr, "qemu: too many parallel ports\n");
+        error_report("too many parallel ports");
         exit(1);
     }
     snprintf(label, sizeof(label), "parallel%d", index);
     parallel_hds[index] = qemu_chr_new(label, devname, NULL);
     if (!parallel_hds[index]) {
-        fprintf(stderr, "qemu: could not connect parallel device"
-                " to character backend '%s'\n", devname);
+        error_report("could not connect parallel device"
+                     " to character backend '%s'", devname);
         return -1;
     }
     index++;
@@ -2510,7 +2540,7 @@ static int virtcon_parse(const char *devname)
     if (strcmp(devname, "none") == 0)
         return 0;
     if (index == MAX_VIRTIO_CONSOLES) {
-        fprintf(stderr, "qemu: too many virtio consoles\n");
+        error_report("too many virtio consoles");
         exit(1);
     }
 
@@ -2527,8 +2557,8 @@ static int virtcon_parse(const char *devname)
     snprintf(label, sizeof(label), "virtcon%d", index);
     virtcon_hds[index] = qemu_chr_new(label, devname, NULL);
     if (!virtcon_hds[index]) {
-        fprintf(stderr, "qemu: could not connect virtio console"
-                " to character backend '%s'\n", devname);
+        error_report("could not connect virtio console"
+                     " to character backend '%s'", devname);
         return -1;
     }
     qemu_opt_set(dev_opts, "chardev", label, &error_abort);
@@ -2548,7 +2578,7 @@ static int sclp_parse(const char *devname)
         return 0;
     }
     if (index == MAX_SCLP_CONSOLES) {
-        fprintf(stderr, "qemu: too many sclp consoles\n");
+        error_report("too many sclp consoles");
         exit(1);
     }
 
@@ -2560,8 +2590,8 @@ static int sclp_parse(const char *devname)
     snprintf(label, sizeof(label), "sclpcon%d", index);
     sclp_hds[index] = qemu_chr_new(label, devname, NULL);
     if (!sclp_hds[index]) {
-        fprintf(stderr, "qemu: could not connect sclp console"
-                " to character backend '%s'\n", devname);
+        error_report("could not connect sclp console"
+                     " to character backend '%s'", devname);
         return -1;
     }
     qemu_opt_set(dev_opts, "chardev", label, &error_abort);
@@ -2579,7 +2609,7 @@ static int debugcon_parse(const char *devname)
     }
     opts = qemu_opts_create(qemu_find_opts("device"), "debugcon", 1, NULL);
     if (!opts) {
-        fprintf(stderr, "qemu: already have a debugcon device\n");
+        error_report("already have a debugcon device");
         exit(1);
     }
     qemu_opt_set(opts, "driver", "isa-debugcon", &error_abort);
@@ -2634,8 +2664,8 @@ static gint machine_class_cmp(gconstpointer a, gconstpointer b)
         return mc;
     }
     if (name && !is_help_option(name)) {
-        error_report("Unsupported machine type");
-        error_printf("Use -machine help to list supported machines!\n");
+        error_report("unsupported machine type");
+        error_printf("Use -machine help to list supported machines\n");
     } else {
         printf("Supported machines are:\n");
         machines = g_slist_sort(machines, machine_class_cmp);
@@ -3010,8 +3040,7 @@ int main(int argc, char **argv, char **envp)
     runstate_init();
 
     if (qcrypto_init(&err) < 0) {
-        fprintf(stderr, "Cannot initialize crypto: %s\n",
-                error_get_pretty(err));
+        error_report("cannot initialize crypto: %s", error_get_pretty(err));
         exit(1);
     }
     rtc_clock = QEMU_CLOCK_HOST;
@@ -3169,7 +3198,7 @@ int main(int argc, char **argv, char **envp)
                         }
                     } else if (*p != '\0') {
                     chs_fail:
-                        fprintf(stderr, "qemu: invalid physical CHS format\n");
+                        error_report("invalid physical CHS format");
                         exit(1);
                     }
                     if (hda_opts != NULL) {
@@ -3212,7 +3241,7 @@ int main(int argc, char **argv, char **envp)
 #ifdef CONFIG_CURSES
                 display_type = DT_CURSES;
 #else
-                fprintf(stderr, "Curses support is disabled\n");
+                error_report("curses support is disabled");
                 exit(1);
 #endif
                 break;
@@ -3223,8 +3252,7 @@ int main(int argc, char **argv, char **envp)
                 graphic_rotate = strtol(optarg, (char **) &optarg, 10);
                 if (graphic_rotate != 0 && graphic_rotate != 90 &&
                     graphic_rotate != 180 && graphic_rotate != 270) {
-                    fprintf(stderr,
-                        "qemu: only 90, 180, 270 deg rotation is available\n");
+                    error_report("only 90, 180, 270 deg rotation is available");
                     exit(1);
                 }
                 break;
@@ -3375,7 +3403,7 @@ int main(int argc, char **argv, char **envp)
                     w = strtol(p, (char **)&p, 10);
                     if (w <= 0) {
                     graphic_error:
-                        fprintf(stderr, "qemu: invalid resolution or depth\n");
+                        error_report("invalid resolution or depth");
                         exit(1);
                     }
                     if (*p != 'x')
@@ -3441,7 +3469,7 @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_fsdev:
                 olist = qemu_find_opts("fsdev");
                 if (!olist) {
-                    fprintf(stderr, "fsdev is not supported by this qemu build.\n");
+                    error_report("fsdev support is disabled");
                     exit(1);
                 }
                 opts = qemu_opts_parse_noisily(olist, optarg, true);
@@ -3456,7 +3484,7 @@ int main(int argc, char **argv, char **envp)
 
                 olist = qemu_find_opts("virtfs");
                 if (!olist) {
-                    fprintf(stderr, "virtfs is not supported by this qemu build.\n");
+                    error_report("virtfs support is disabled");
                     exit(1);
                 }
                 opts = qemu_opts_parse_noisily(olist, optarg, true);
@@ -3466,15 +3494,15 @@ int main(int argc, char **argv, char **envp)
 
                 if (qemu_opt_get(opts, "fsdriver") == NULL ||
                     qemu_opt_get(opts, "mount_tag") == NULL) {
-                    fprintf(stderr, "Usage: -virtfs fsdriver,mount_tag=tag.\n");
+                    error_report("Usage: -virtfs fsdriver,mount_tag=tag");
                     exit(1);
                 }
                 fsdev = qemu_opts_create(qemu_find_opts("fsdev"),
                                          qemu_opt_get(opts, "mount_tag"),
                                          1, NULL);
                 if (!fsdev) {
-                    fprintf(stderr, "duplicate fsdev id: %s\n",
-                            qemu_opt_get(opts, "mount_tag"));
+                    error_report("duplicate fsdev id: %s",
+                                 qemu_opt_get(opts, "mount_tag"));
                     exit(1);
                 }
 
@@ -3483,8 +3511,8 @@ int main(int argc, char **argv, char **envp)
 #ifdef CONFIG_SYNC_FILE_RANGE
                     qemu_opt_set(fsdev, "writeout", writeout, &error_abort);
 #else
-                    fprintf(stderr, "writeout=immediate not supported on "
-                            "this platform\n");
+                    error_report("writeout=immediate not supported "
+                                 "on this platform");
                     exit(1);
 #endif
                 }
@@ -3523,7 +3551,7 @@ int main(int argc, char **argv, char **envp)
                 fsdev = qemu_opts_create(qemu_find_opts("fsdev"), "v_synth",
                                          1, NULL);
                 if (!fsdev) {
-                    fprintf(stderr, "duplicate option: %s\n", "virtfs_synth");
+                    error_report("duplicate option: %s", "virtfs_synth");
                     exit(1);
                 }
                 qemu_opt_set(fsdev, "fsdriver", "synth", &error_abort);
@@ -3544,15 +3572,14 @@ int main(int argc, char **argv, char **envp)
                 break;
             case QEMU_OPTION_watchdog:
                 if (watchdog) {
-                    fprintf(stderr,
-                            "qemu: only one watchdog option may be given\n");
+                    error_report("only one watchdog option may be given");
                     return 1;
                 }
                 watchdog = optarg;
                 break;
             case QEMU_OPTION_watchdog_action:
                 if (select_watchdog_action(optarg) == -1) {
-                    fprintf(stderr, "Unknown -watchdog-action parameter\n");
+                    error_report("unknown -watchdog-action parameter");
                     exit(1);
                 }
                 break;
@@ -3596,7 +3623,7 @@ int main(int argc, char **argv, char **envp)
                 display_type = DT_SDL;
                 break;
 #else
-                fprintf(stderr, "SDL support is disabled\n");
+                error_report("SDL support is disabled");
                 exit(1);
 #endif
             case QEMU_OPTION_pidfile:
@@ -3658,8 +3685,7 @@ int main(int argc, char **argv, char **envp)
                 qemu_opts_parse_noisily(olist, "accel=tcg", false);
                 break;
             case QEMU_OPTION_no_kvm_pit: {
-                fprintf(stderr, "Warning: KVM PIT can no longer be disabled "
-                                "separately.\n");
+                error_report("warning: ignoring deprecated option");
                 break;
             }
             case QEMU_OPTION_no_kvm_pit_reinjection: {
@@ -3672,8 +3698,8 @@ int main(int argc, char **argv, char **envp)
                     { /* end of list */ }
                 };
 
-                fprintf(stderr, "Warning: option deprecated, use "
-                        "lost_tick_policy property of kvm-pit instead.\n");
+                error_report("warning: deprecated, replaced by "
+                             "-global kvm-pit.lost_tick_policy=discard");
                 qdev_prop_register_global_list(kvm_pit_lost_tick_policy);
                 break;
             }
@@ -3708,7 +3734,7 @@ int main(int argc, char **argv, char **envp)
                     exit(1);
                 }
 #else
-                fprintf(stderr, "VNC support is disabled\n");
+                error_report("VNC support is disabled");
                 exit(1);
 #endif
                 break;
@@ -3721,7 +3747,7 @@ int main(int argc, char **argv, char **envp)
                 break;
             case QEMU_OPTION_balloon:
                 if (balloon_parse(optarg) < 0) {
-                    fprintf(stderr, "Unknown -balloon argument %s\n", optarg);
+                    error_report("unknown -balloon argument %s", optarg);
                     exit(1);
                 }
                 break;
@@ -3736,15 +3762,14 @@ int main(int argc, char **argv, char **envp)
                 break;
             case QEMU_OPTION_uuid:
                 if(qemu_uuid_parse(optarg, qemu_uuid) < 0) {
-                    fprintf(stderr, "Fail to parse UUID string."
-                            " Wrong format.\n");
+                    error_report("failed to parse UUID string: wrong format");
                     exit(1);
                 }
                 qemu_uuid_set = true;
                 break;
             case QEMU_OPTION_option_rom:
                 if (nb_option_roms >= MAX_OPTION_ROMS) {
-                    fprintf(stderr, "Too many option ROMs\n");
+                    error_report("too many option ROMs");
                     exit(1);
                 }
                 opts = qemu_opts_parse_noisily(qemu_find_opts("option-rom"),
@@ -3756,7 +3781,7 @@ int main(int argc, char **argv, char **envp)
                 option_rom[nb_option_roms].bootindex =
                     qemu_opt_get_number(opts, "bootindex", -1);
                 if (!option_rom[nb_option_roms].name) {
-                    fprintf(stderr, "Option ROM file is not specified\n");
+                    error_report("Option ROM file is not specified");
                     exit(1);
                 }
                 nb_option_roms++;
@@ -3781,9 +3806,8 @@ int main(int argc, char **argv, char **envp)
                         } else  if (strcmp("auto", target) == 0) {
                             semihosting.target = SEMIHOSTING_TARGET_AUTO;
                         } else {
-                            fprintf(stderr, "Unsupported semihosting-config"
-                                    " %s\n",
-                                optarg);
+                            error_report("unsupported semihosting-config %s",
+                                         optarg);
                             exit(1);
                         }
                     } else {
@@ -3793,14 +3817,12 @@ int main(int argc, char **argv, char **envp)
                     qemu_opt_foreach(opts, add_semihosting_arg,
                                      &semihosting, NULL);
                 } else {
-                    fprintf(stderr, "Unsupported semihosting-config %s\n",
-                            optarg);
+                    error_report("unsupported semihosting-config %s", optarg);
                     exit(1);
                 }
                 break;
             case QEMU_OPTION_tdf:
-                fprintf(stderr, "Warning: user space PIT time drift fix "
-                                "is no longer supported.\n");
+                error_report("warning: ignoring deprecated option");
                 break;
             case QEMU_OPTION_name:
                 opts = qemu_opts_parse_noisily(qemu_find_opts("name"),
@@ -3811,7 +3833,7 @@ int main(int argc, char **argv, char **envp)
                 break;
             case QEMU_OPTION_prom_env:
                 if (nb_prom_envs >= MAX_PROM_ENVS) {
-                    fprintf(stderr, "Too many prom variables\n");
+                    error_report("too many prom variables");
                     exit(1);
                 }
                 prom_envs[nb_prom_envs] = optarg;
@@ -3894,8 +3916,8 @@ int main(int argc, char **argv, char **envp)
                 {
                     int ret = qemu_read_config_file(optarg);
                     if (ret < 0) {
-                        fprintf(stderr, "read config %s: %s\n", optarg,
-                            strerror(-ret));
+                        error_report("read config %s: %s", optarg,
+                                     strerror(-ret));
                         exit(1);
                     }
                     break;
@@ -3903,7 +3925,7 @@ int main(int argc, char **argv, char **envp)
             case QEMU_OPTION_spice:
                 olist = qemu_find_opts("spice");
                 if (!olist) {
-                    fprintf(stderr, "spice is not supported by this qemu build.\n");
+                    error_report("spice support is disabled");
                     exit(1);
                 }
                 opts = qemu_opts_parse_noisily(olist, optarg, false);
@@ -3920,7 +3942,8 @@ int main(int argc, char **argv, char **envp)
                     } else {
                         fp = fopen(optarg, "w");
                         if (fp == NULL) {
-                            fprintf(stderr, "open %s: %s\n", optarg, strerror(errno));
+                            error_report("open %s: %s", optarg,
+                                         strerror(errno));
                             exit(1);
                         }
                     }
@@ -3981,13 +4004,13 @@ int main(int argc, char **argv, char **envp)
                 break;
             case QEMU_OPTION_dump_vmstate:
                 if (vmstate_dump_file) {
-                    fprintf(stderr, "qemu: only one '-dump-vmstate' "
-                            "option may be given\n");
+                    error_report("only one '-dump-vmstate' "
+                                 "option may be given");
                     exit(1);
                 }
                 vmstate_dump_file = fopen(optarg, "w");
                 if (vmstate_dump_file == NULL) {
-                    fprintf(stderr, "open %s: %s\n", optarg, strerror(errno));
+                    error_report("open %s: %s", optarg, strerror(errno));
                     exit(1);
                 }
                 break;
@@ -3997,6 +4020,8 @@ int main(int argc, char **argv, char **envp)
         }
     }
 
+    replay_configure(icount_opts);
+
     opts = qemu_get_machine_opts();
     optarg = qemu_opt_get(opts, "type");
     if (optarg) {
@@ -4004,8 +4029,8 @@ int main(int argc, char **argv, char **envp)
     }
 
     if (machine_class == NULL) {
-        fprintf(stderr, "No machine specified, and there is no default.\n"
-                "Use -machine help to list supported machines!\n");
+        error_report("No machine specified, and there is no default");
+        error_printf("Use -machine help to list supported machines\n");
         exit(1);
     }
 
@@ -4052,7 +4077,7 @@ int main(int argc, char **argv, char **envp)
     cpu_exec_init_all();
 
     if (machine_class->hw_version) {
-        qemu_set_version(machine_class->hw_version);
+        qemu_set_hw_version(machine_class->hw_version);
     }
 
     /* Init CPU def lists, based on config
@@ -4106,9 +4131,9 @@ int main(int argc, char **argv, char **envp)
 
     machine_class->max_cpus = machine_class->max_cpus ?: 1; /* Default to UP */
     if (max_cpus > machine_class->max_cpus) {
-        fprintf(stderr, "Number of SMP CPUs requested (%d) exceeds max CPUs "
-                "supported by machine '%s' (%d)\n", max_cpus,
-                machine_class->name, machine_class->max_cpus);
+        error_report("Number of SMP CPUs requested (%d) exceeds max CPUs "
+                     "supported by machine '%s' (%d)", max_cpus,
+                     machine_class->name, machine_class->max_cpus);
         exit(1);
     }
 
@@ -4169,12 +4194,12 @@ int main(int argc, char **argv, char **envp)
         if (display_type == DT_NOGRAPHIC
             && (default_parallel || default_serial
                 || default_monitor || default_virtcon)) {
-            fprintf(stderr, "-nographic can not be used with -daemonize\n");
+            error_report("-nographic cannot be used with -daemonize");
             exit(1);
         }
 #ifdef CONFIG_CURSES
         if (display_type == DT_CURSES) {
-            fprintf(stderr, "curses display can not be used with -daemonize\n");
+            error_report("curses display cannot be used with -daemonize");
             exit(1);
         }
 #endif
@@ -4233,12 +4258,12 @@ int main(int argc, char **argv, char **envp)
     }
 
     if ((no_frame || alt_grab || ctrl_grab) && display_type != DT_SDL) {
-        fprintf(stderr, "-no-frame, -alt-grab and -ctrl-grab are only valid "
-                        "for SDL, ignoring option\n");
+        error_report("-no-frame, -alt-grab and -ctrl-grab are only valid "
+                     "for SDL, ignoring option");
     }
     if (no_quit && (display_type != DT_GTK && display_type != DT_SDL)) {
-        fprintf(stderr, "-no-quit is only valid for GTK and SDL, "
-                        "ignoring option\n");
+        error_report("-no-quit is only valid for GTK and SDL, "
+                     "ignoring option");
     }
 
 #if defined(CONFIG_GTK)
@@ -4253,13 +4278,14 @@ int main(int argc, char **argv, char **envp)
 #endif
     if (request_opengl == 1 && display_opengl == 0) {
 #if defined(CONFIG_OPENGL)
-        fprintf(stderr, "OpenGL is not supported by the display.\n");
+        error_report("OpenGL is not supported by the display");
 #else
-        fprintf(stderr, "QEMU was built without opengl support.\n");
+        error_report("OpenGL support is disabled");
 #endif
         exit(1);
     }
 
+    page_size_init();
     socket_init();
 
     if (qemu_opts_foreach(qemu_find_opts("object"),
@@ -4281,7 +4307,7 @@ int main(int argc, char **argv, char **envp)
 #endif
 
     if (pid_file && qemu_create_pidfile(pid_file) != 0) {
-        fprintf(stderr, "Could not acquire pid file: %s\n", strerror(errno));
+        error_report("could not acquire pid file: %s", strerror(errno));
         exit(1);
     }
 
@@ -4352,17 +4378,17 @@ int main(int argc, char **argv, char **envp)
     linux_boot = (kernel_filename != NULL);
 
     if (!linux_boot && *kernel_cmdline != '\0') {
-        fprintf(stderr, "-append only allowed with -kernel option\n");
+        error_report("-append only allowed with -kernel option");
         exit(1);
     }
 
     if (!linux_boot && initrd_filename != NULL) {
-        fprintf(stderr, "-initrd only allowed with -kernel option\n");
+        error_report("-initrd only allowed with -kernel option");
         exit(1);
     }
 
     if (!linux_boot && qemu_opt_get(machine_opts, "dtb")) {
-        fprintf(stderr, "-dtb only allowed with -kernel option\n");
+        error_report("-dtb only allowed with -kernel option");
         exit(1);
     }
 
@@ -4381,7 +4407,7 @@ int main(int argc, char **argv, char **envp)
     cpu_ticks_init();
     if (icount_opts) {
         if (kvm_enabled() || xen_enabled()) {
-            fprintf(stderr, "-icount is not allowed with kvm or xen\n");
+            error_report("-icount is not allowed with kvm or xen");
             exit(1);
         }
         configure_icount(icount_opts, &error_abort);
@@ -4414,7 +4440,7 @@ int main(int argc, char **argv, char **envp)
     if (!xen_enabled()) {
         /* On 32-bit hosts, QEMU is limited by virtual address space */
         if (ram_size > (2047 << 20) && HOST_LONG_BITS == 32) {
-            fprintf(stderr, "qemu: at most 2047 MB RAM can be simulated\n");
+            error_report("at most 2047 MB RAM can be simulated");
             exit(1);
         }
     }
@@ -4430,9 +4456,10 @@ int main(int argc, char **argv, char **envp)
     }
 
     /* open the virtual block devices */
-    if (snapshot)
-        qemu_opts_foreach(qemu_find_opts("drive"),
-                          drive_enable_snapshot, NULL, NULL);
+    if (snapshot || replay_mode != REPLAY_MODE_NONE) {
+        qemu_opts_foreach(qemu_find_opts("drive"), drive_enable_snapshot,
+                          NULL, NULL);
+    }
     if (qemu_opts_foreach(qemu_find_opts("drive"), drive_init_func,
                           &machine_class->block_default_type, NULL)) {
         exit(1);
@@ -4487,6 +4514,10 @@ int main(int argc, char **argv, char **envp)
     }
     qemu_add_globals();
 
+    /* This checkpoint is required by replay to separate prior clock
+       reading from the other reads, because timer polling functions query
+       clock values from the log. */
+    replay_checkpoint(CHECKPOINT_INIT);
     qdev_machine_init();
 
     current_machine->ram_size = ram_size;
@@ -4579,7 +4610,7 @@ int main(int argc, char **argv, char **envp)
                       vnc_init_func, NULL, NULL);
     if (show_vnc_port) {
         char *ret = vnc_display_local_addr("default");
-        printf("VNC server running on `%s'\n", ret);
+        printf("VNC server running on '%s'\n", ret);
         g_free(ret);
     }
 #endif
@@ -4601,10 +4632,16 @@ int main(int argc, char **argv, char **envp)
     qemu_run_machine_init_done_notifiers();
 
     if (rom_check_and_register_reset() != 0) {
-        fprintf(stderr, "rom check and register reset failed\n");
+        error_report("rom check and register reset failed");
         exit(1);
     }
 
+    replay_start();
+
+    /* This checkpoint is required by replay to separate prior clock
+       reading from the other reads, because timer polling functions query
+       clock values from the log. */
+    replay_checkpoint(CHECKPOINT_RESET);
     qemu_system_reset(VMRESET_SILENT);
     register_global_state();
     if (loadvm) {
@@ -4642,6 +4679,8 @@ int main(int argc, char **argv, char **envp)
     }
 
     main_loop();
+    replay_disable_events();
+
     bdrv_close_all();
     pause_all_vcpus();
     res_free();