From 3c02270db980007424d797506301826310ce2db4 Mon Sep 17 00:00:00 2001
From: Chegu Vinod
Date: Mon, 24 Jun 2013 03:49:41 -0600
Subject: Introduce async_run_on_cpu()

Introduce an asynchronous version of run_on_cpu(), i.e. the caller
does not have to block until the callback routine finishes execution
on the target vCPU.

Signed-off-by: Chegu Vinod
Reviewed-by: Paolo Bonzini
Signed-off-by: Juan Quintela
---
 cpus.c                | 29 +++++++++++++++++++++++++++++
 include/qemu-common.h |  1 +
 include/qom/cpu.h     | 10 ++++++++++
 3 files changed, 40 insertions(+)

diff --git a/cpus.c b/cpus.c
index f141428..5123972 100644
--- a/cpus.c
+++ b/cpus.c
@@ -648,6 +648,7 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
 
     wi.func = func;
     wi.data = data;
+    wi.free = false;
     if (cpu->queued_work_first == NULL) {
         cpu->queued_work_first = &wi;
     } else {
@@ -666,6 +667,31 @@ void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
     }
 }
 
+void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data)
+{
+    struct qemu_work_item *wi;
+
+    if (qemu_cpu_is_self(cpu)) {
+        func(data);
+        return;
+    }
+
+    wi = g_malloc0(sizeof(struct qemu_work_item));
+    wi->func = func;
+    wi->data = data;
+    wi->free = true;
+    if (cpu->queued_work_first == NULL) {
+        cpu->queued_work_first = wi;
+    } else {
+        cpu->queued_work_last->next = wi;
+    }
+    cpu->queued_work_last = wi;
+    wi->next = NULL;
+    wi->done = false;
+
+    qemu_cpu_kick(cpu);
+}
+
 static void flush_queued_work(CPUState *cpu)
 {
     struct qemu_work_item *wi;
@@ -678,6 +704,9 @@ static void flush_queued_work(CPUState *cpu)
         cpu->queued_work_first = wi->next;
         wi->func(wi->data);
         wi->done = true;
+        if (wi->free) {
+            g_free(wi);
+        }
     }
     cpu->queued_work_last = NULL;
     qemu_cond_broadcast(&qemu_work_cond);

diff --git a/include/qemu-common.h b/include/qemu-common.h
index f439738..6948bb9 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -293,6 +293,7 @@ struct qemu_work_item {
     void (*func)(void *data);
     void *data;
     int done;
+    bool free;
 };
 

diff --git a/include/qom/cpu.h b/include/qom/cpu.h
index 147c256..dfd81a1 100644
--- a/include/qom/cpu.h
+++ b/include/qom/cpu.h
@@ -379,6 +379,16 @@ bool cpu_is_stopped(CPUState *cpu);
 void run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data);
 
 /**
+ * async_run_on_cpu:
+ * @cpu: The vCPU to run on.
+ * @func: The function to be executed.
+ * @data: Data to pass to the function.
+ *
+ * Schedules the function @func for execution on the vCPU @cpu asynchronously.
+ */
+void async_run_on_cpu(CPUState *cpu, void (*func)(void *data), void *data);
+
+/**
  * qemu_for_each_cpu:
  * @func: The function to be executed.
  * @data: Data to pass to the function.
--
cgit v1.1
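
For context, a minimal caller sketch (hypothetical names; not part of the
patch). Unlike run_on_cpu(), which blocks on qemu_work_cond until the work
item completes, async_run_on_cpu() returns as soon as the heap-allocated
work item is queued and the target vCPU has been kicked; flush_queued_work()
later runs the callback and, because wi->free is true, frees the item. Any
data handed over must therefore stay valid until the callback has run.

/* Hypothetical usage example, not from the patch. */
static bool work_done;          /* static, so it outlives the caller */

static void mark_done(void *data)
{
    /* Runs later, in the target vCPU's thread. */
    bool *done = data;
    *done = true;
}

static void example_caller(CPUState *cpu)
{
    /* Queues the work item and returns immediately, without blocking
       on qemu_work_cond as run_on_cpu() would. */
    async_run_on_cpu(cpu, mark_done, &work_done);
}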
From bde1e2ec2176c363c1783bf8887b6b1beb08dfee Mon Sep 17 00:00:00 2001
From: Chegu Vinod
Date: Mon, 24 Jun 2013 03:49:42 -0600
Subject: Add 'auto-converge' migration capability

The auto-converge migration capability lets the user specify whether the
live migration sequence should automatically detect a lack of convergence
and force the guest to converge.

Signed-off-by: Chegu Vinod
Reviewed-by: Paolo Bonzini
Reviewed-by: Eric Blake
Signed-off-by: Juan Quintela
---
 include/migration/migration.h | 2 ++
 migration.c                   | 9 +++++++++
 qapi-schema.json              | 2 +-
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/include/migration/migration.h b/include/migration/migration.h
index f0640e0..bc9fde0 100644
--- a/include/migration/migration.h
+++ b/include/migration/migration.h
@@ -125,6 +125,8 @@ void migrate_del_blocker(Error *reason);
 
 bool migrate_rdma_pin_all(void);
 
+bool migrate_auto_converge(void);
+
 int xbzrle_encode_buffer(uint8_t *old_buf, uint8_t *new_buf, int slen,
                          uint8_t *dst, int dlen);
 int xbzrle_decode_buffer(uint8_t *src, int slen, uint8_t *dst, int dlen);

diff --git a/migration.c b/migration.c
index 635a7e7..184ae3f 100644
--- a/migration.c
+++ b/migration.c
@@ -484,6 +484,15 @@ bool migrate_rdma_pin_all(void)
     return s->enabled_capabilities[MIGRATION_CAPABILITY_X_RDMA_PIN_ALL];
 }
 
+bool migrate_auto_converge(void)
+{
+    MigrationState *s;
+
+    s = migrate_get_current();
+
+    return s->enabled_capabilities[MIGRATION_CAPABILITY_AUTO_CONVERGE];
+}
+
 int migrate_use_xbzrle(void)
 {
     MigrationState *s;

diff --git a/qapi-schema.json b/qapi-schema.json
index b251d28..35095ff 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -616,7 +616,7 @@
 # Since: 1.2
 ##
 { 'enum': 'MigrationCapability',
-  'data': ['xbzrle', 'x-rdma-pin-all'] }
+  'data': ['xbzrle', 'x-rdma-pin-all', 'auto-converge'] }
 
 ##
 # @MigrationCapabilityStatus
--
cgit v1.1
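
Usage note (not part of the patch): the capability defaults to off and is
toggled per migration before the migration is started. A sketch, assuming
the standard HMP monitor commands of this QEMU generation and a
hypothetical destination address:

(qemu) migrate_set_capability auto-converge on
(qemu) migrate -d tcp:dest-host:4444

The setting can be confirmed with "info migrate_capabilities", and its
effect shows up in the "info migrate" output quoted in the next patch.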
From 7ca1dfad952d8a8655b32e78623edcc38a51b14a Mon Sep 17 00:00:00 2001
From: Chegu Vinod
Date: Mon, 24 Jun 2013 03:47:39 -0600
Subject: Force auto-convergence of live migration

If a user chooses to turn on the auto-converge migration capability,
these changes detect the lack of convergence and throttle down the
guest, i.e. force the vCPUs out of the guest for some duration and let
the migration thread catch up and help converge.

Convergence was verified using the following workloads:
- Java warehouse workload running on a 20-vCPU/256G guest (~80% busy)
- OLTP-like workload running on an 80-vCPU/512G guest (~80% busy)

Sample results with the Java warehouse workload (migrate speed set to
20Gb and migrate downtime set to 4 seconds):

(qemu) info migrate
capabilities: xbzrle: off auto-converge: off  <----
Migration status: active
total time: 1487503 milliseconds
expected downtime: 519 milliseconds
transferred ram: 383749347 kbytes
remaining ram: 2753372 kbytes
total ram: 268444224 kbytes
duplicate: 65461532 pages
skipped: 64901568 pages
normal: 95750218 pages
normal bytes: 383000872 kbytes
dirty pages rate: 67551 pages

---

(qemu) info migrate
capabilities: xbzrle: off auto-converge: on  <----
Migration status: completed
total time: 241161 milliseconds
downtime: 6373 milliseconds
transferred ram: 28235307 kbytes
remaining ram: 0 kbytes
total ram: 268444224 kbytes
duplicate: 64946416 pages
skipped: 64903523 pages
normal: 7044971 pages
normal bytes: 28179884 kbytes

Signed-off-by: Chegu Vinod
Signed-off-by: Juan Quintela
---
 arch_init.c  | 81 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 trace-events |  1 +
 2 files changed, 82 insertions(+)

diff --git a/arch_init.c b/arch_init.c
index 0e553c9..e9dd96f 100644
--- a/arch_init.c
+++ b/arch_init.c
@@ -104,6 +104,9 @@ int graphic_depth = 32;
 #endif
 
 const uint32_t arch_type = QEMU_ARCH;
+static bool mig_throttle_on;
+static int dirty_rate_high_cnt;
+static void check_guest_throttling(void);
 
 /***********************************************************/
 /* ram save/restore */
@@ -378,8 +381,14 @@ static void migration_bitmap_sync(void)
     uint64_t num_dirty_pages_init = migration_dirty_pages;
     MigrationState *s = migrate_get_current();
     static int64_t start_time;
+    static int64_t bytes_xfer_prev;
     static int64_t num_dirty_pages_period;
     int64_t end_time;
+    int64_t bytes_xfer_now;
+
+    if (!bytes_xfer_prev) {
+        bytes_xfer_prev = ram_bytes_transferred();
+    }
 
     if (!start_time) {
         start_time = qemu_get_clock_ms(rt_clock);
@@ -404,6 +413,25 @@ static void migration_bitmap_sync(void)
 
     /* more than 1 second = 1000 millisecons */
     if (end_time > start_time + 1000) {
+        if (migrate_auto_converge()) {
+            /* The following detection logic can be refined later. For now:
+               Check to see if the bytes dirtied exceed 50% of the approx.
+               amount of bytes that just got transferred since the last time
+               we were in this routine. If that happens >N times (for now
+               N==4) we turn on the throttle-down logic. */
+            bytes_xfer_now = ram_bytes_transferred();
+            if (s->dirty_pages_rate &&
+                (num_dirty_pages_period * TARGET_PAGE_SIZE >
+                    (bytes_xfer_now - bytes_xfer_prev) / 2) &&
+                (dirty_rate_high_cnt++ > 4)) {
+                trace_migration_throttle();
+                mig_throttle_on = true;
+                dirty_rate_high_cnt = 0;
+            }
+            bytes_xfer_prev = bytes_xfer_now;
+        } else {
+            mig_throttle_on = false;
+        }
         s->dirty_pages_rate = num_dirty_pages_period * 1000
             / (end_time - start_time);
         s->dirty_bytes_rate = s->dirty_pages_rate * TARGET_PAGE_SIZE;
@@ -573,6 +601,8 @@ static int ram_save_setup(QEMUFile *f, void *opaque)
     migration_bitmap = bitmap_new(ram_pages);
     bitmap_set(migration_bitmap, 0, ram_pages);
     migration_dirty_pages = ram_pages;
+    mig_throttle_on = false;
+    dirty_rate_high_cnt = 0;
 
     if (migrate_use_xbzrle()) {
         XBZRLE.cache = cache_init(migrate_xbzrle_cache_size() /
@@ -635,6 +665,7 @@ static int ram_save_iterate(QEMUFile *f, void *opaque)
         }
         total_sent += bytes_sent;
         acct_info.iterations++;
+        check_guest_throttling();
         /* we want to check in the 1st loop, just in case it was the 1st time
            and we had to sync the dirty bitmap.
            qemu_get_clock_ns() is a bit expensive, so we only check each some
@@ -1110,3 +1141,53 @@ TargetInfo *qmp_query_target(Error **errp)
 
     return info;
 }
+
+/* Stub function that gets run on the vcpu when it is brought out of the
+   VM to run inside qemu via async_run_on_cpu(). */
+static void mig_sleep_cpu(void *opq)
+{
+    qemu_mutex_unlock_iothread();
+    g_usleep(30*1000);
+    qemu_mutex_lock_iothread();
+}
+
+/* To reduce the dirty rate, explicitly disallow the VCPUs from spending
+   much time in the VM. The migration thread will try to catch up.
+   The workload will experience a performance drop.
+*/
+static void mig_throttle_cpu_down(CPUState *cpu, void *data)
+{
+    async_run_on_cpu(cpu, mig_sleep_cpu, NULL);
+}
+
+static void mig_throttle_guest_down(void)
+{
+    qemu_mutex_lock_iothread();
+    qemu_for_each_cpu(mig_throttle_cpu_down, NULL);
+    qemu_mutex_unlock_iothread();
+}
+
+static void check_guest_throttling(void)
+{
+    static int64_t t0;
+    int64_t t1;
+
+    if (!mig_throttle_on) {
+        return;
+    }
+
+    if (!t0) {
+        t0 = qemu_get_clock_ns(rt_clock);
+        return;
+    }
+
+    t1 = qemu_get_clock_ns(rt_clock);
+
+    /* If it has been more than 40 ms since the last time the guest
+     * was throttled then do it again.
+     */
+    if (40 < (t1 - t0) / 1000000) {
+        mig_throttle_guest_down();
+        t0 = t1;
+    }
+}

diff --git a/trace-events b/trace-events
index 0acce7b..7f6d962 100644
--- a/trace-events
+++ b/trace-events
@@ -1036,6 +1036,7 @@ savevm_section_end(unsigned int section_id) "section_id %u"
 # arch_init.c
 migration_bitmap_sync_start(void) ""
 migration_bitmap_sync_end(uint64_t dirty_pages) "dirty_pages %" PRIu64""
+migration_throttle(void) ""
 
 # hw/qxl.c
 disable qxl_interface_set_mm_time(int qid, uint32_t mm_time) "%d %d"
--
cgit v1.1
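
To restate the throttling heuristic compactly (a simplified sketch, not the
patch code; should_throttle() is a hypothetical standalone helper and, for
example, it omits the s->dirty_pages_rate guard): throttling turns on once
the bytes dirtied during a ~1 s sync period exceed half of the bytes
actually transferred in that period, over more than four such periods. Once
on, every vCPU is forced to sleep 30 ms whenever more than 40 ms have
passed since the last throttle, so the guest keeps only a fraction of its
normal CPU time while the migration thread catches up.

/* Hypothetical restatement of the detection logic in
   migration_bitmap_sync(); not part of the patch. */
static bool should_throttle(uint64_t dirty_pages_period,
                            uint64_t bytes_xfer_now,
                            uint64_t bytes_xfer_prev,
                            int *high_cnt)
{
    uint64_t dirtied = dirty_pages_period * TARGET_PAGE_SIZE;
    uint64_t transferred = bytes_xfer_now - bytes_xfer_prev;

    /* Dirtying outpaces transfer: count the period, and after more than
       four such periods request throttling. As in the patch, the counter
       only resets when throttling actually triggers. */
    if (dirtied > transferred / 2 && (*high_cnt)++ > 4) {
        *high_cnt = 0;
        return true;
    }
    return false;
}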