diff options
author | Timothy Pearson <tpearson@raptorengineering.com> | 2019-05-11 15:12:49 -0500 |
---|---|---|
committer | Timothy Pearson <tpearson@raptorengineering.com> | 2019-05-11 15:12:49 -0500 |
commit | 9e80202352dd49bdd9e67b8b906d86f058431505 (patch) | |
tree | 5673c17aad6e3833da8c4ff21b5a11f666ec9fbe /src/block | |
download | hqemu-9e80202352dd49bdd9e67b8b906d86f058431505.zip hqemu-9e80202352dd49bdd9e67b8b906d86f058431505.tar.gz |
Diffstat (limited to 'src/block')
59 files changed, 52052 insertions, 0 deletions
diff --git a/src/block/Makefile.objs b/src/block/Makefile.objs new file mode 100644 index 0000000..58ef2ef --- /dev/null +++ b/src/block/Makefile.objs @@ -0,0 +1,44 @@ +block-obj-y += raw_bsd.o qcow.o vdi.o vmdk.o cloop.o bochs.o vpc.o vvfat.o +block-obj-y += qcow2.o qcow2-refcount.o qcow2-cluster.o qcow2-snapshot.o qcow2-cache.o +block-obj-y += qed.o qed-gencb.o qed-l2-cache.o qed-table.o qed-cluster.o +block-obj-y += qed-check.o +block-obj-$(CONFIG_VHDX) += vhdx.o vhdx-endian.o vhdx-log.o +block-obj-y += quorum.o +block-obj-y += parallels.o blkdebug.o blkverify.o +block-obj-y += block-backend.o snapshot.o qapi.o +block-obj-$(CONFIG_WIN32) += raw-win32.o win32-aio.o +block-obj-$(CONFIG_POSIX) += raw-posix.o +block-obj-$(CONFIG_LINUX_AIO) += linux-aio.o +block-obj-y += null.o mirror.o io.o +block-obj-y += throttle-groups.o + +block-obj-y += nbd.o nbd-client.o sheepdog.o +block-obj-$(CONFIG_LIBISCSI) += iscsi.o +block-obj-$(CONFIG_LIBNFS) += nfs.o +block-obj-$(CONFIG_CURL) += curl.o +block-obj-$(CONFIG_RBD) += rbd.o +block-obj-$(CONFIG_GLUSTERFS) += gluster.o +block-obj-$(CONFIG_ARCHIPELAGO) += archipelago.o +block-obj-$(CONFIG_LIBSSH2) += ssh.o +block-obj-y += accounting.o +block-obj-y += write-threshold.o + +common-obj-y += stream.o +common-obj-y += commit.o +common-obj-y += backup.o + +iscsi.o-cflags := $(LIBISCSI_CFLAGS) +iscsi.o-libs := $(LIBISCSI_LIBS) +curl.o-cflags := $(CURL_CFLAGS) +curl.o-libs := $(CURL_LIBS) +rbd.o-cflags := $(RBD_CFLAGS) +rbd.o-libs := $(RBD_LIBS) +gluster.o-cflags := $(GLUSTERFS_CFLAGS) +gluster.o-libs := $(GLUSTERFS_LIBS) +ssh.o-cflags := $(LIBSSH2_CFLAGS) +ssh.o-libs := $(LIBSSH2_LIBS) +archipelago.o-libs := $(ARCHIPELAGO_LIBS) +block-obj-m += dmg.o +dmg.o-libs := $(BZIP2_LIBS) +qcow.o-libs := -lz +linux-aio.o-libs := -laio diff --git a/src/block/accounting.c b/src/block/accounting.c new file mode 100644 index 0000000..185025e --- /dev/null +++ b/src/block/accounting.c @@ -0,0 +1,172 @@ +/* + * QEMU System Emulator block accounting + * + * Copyright (c) 2011 Christoph Hellwig + * Copyright (c) 2015 Igalia, S.L. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/accounting.h" +#include "block/block_int.h" +#include "qemu/timer.h" +#include "sysemu/qtest.h" + +static QEMUClockType clock_type = QEMU_CLOCK_REALTIME; +static const int qtest_latency_ns = NANOSECONDS_PER_SECOND / 1000; + +void block_acct_init(BlockAcctStats *stats, bool account_invalid, + bool account_failed) +{ + stats->account_invalid = account_invalid; + stats->account_failed = account_failed; + + if (qtest_enabled()) { + clock_type = QEMU_CLOCK_VIRTUAL; + } +} + +void block_acct_cleanup(BlockAcctStats *stats) +{ + BlockAcctTimedStats *s, *next; + QSLIST_FOREACH_SAFE(s, &stats->intervals, entries, next) { + g_free(s); + } +} + +void block_acct_add_interval(BlockAcctStats *stats, unsigned interval_length) +{ + BlockAcctTimedStats *s; + unsigned i; + + s = g_new0(BlockAcctTimedStats, 1); + s->interval_length = interval_length; + QSLIST_INSERT_HEAD(&stats->intervals, s, entries); + + for (i = 0; i < BLOCK_MAX_IOTYPE; i++) { + timed_average_init(&s->latency[i], clock_type, + (uint64_t) interval_length * NANOSECONDS_PER_SECOND); + } +} + +BlockAcctTimedStats *block_acct_interval_next(BlockAcctStats *stats, + BlockAcctTimedStats *s) +{ + if (s == NULL) { + return QSLIST_FIRST(&stats->intervals); + } else { + return QSLIST_NEXT(s, entries); + } +} + +void block_acct_start(BlockAcctStats *stats, BlockAcctCookie *cookie, + int64_t bytes, enum BlockAcctType type) +{ + assert(type < BLOCK_MAX_IOTYPE); + + cookie->bytes = bytes; + cookie->start_time_ns = qemu_clock_get_ns(clock_type); + cookie->type = type; +} + +void block_acct_done(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + BlockAcctTimedStats *s; + int64_t time_ns = qemu_clock_get_ns(clock_type); + int64_t latency_ns = time_ns - cookie->start_time_ns; + + if (qtest_enabled()) { + latency_ns = qtest_latency_ns; + } + + assert(cookie->type < BLOCK_MAX_IOTYPE); + + stats->nr_bytes[cookie->type] += cookie->bytes; + stats->nr_ops[cookie->type]++; + stats->total_time_ns[cookie->type] += latency_ns; + stats->last_access_time_ns = time_ns; + + QSLIST_FOREACH(s, &stats->intervals, entries) { + timed_average_account(&s->latency[cookie->type], latency_ns); + } +} + +void block_acct_failed(BlockAcctStats *stats, BlockAcctCookie *cookie) +{ + assert(cookie->type < BLOCK_MAX_IOTYPE); + + stats->failed_ops[cookie->type]++; + + if (stats->account_failed) { + BlockAcctTimedStats *s; + int64_t time_ns = qemu_clock_get_ns(clock_type); + int64_t latency_ns = time_ns - cookie->start_time_ns; + + if (qtest_enabled()) { + latency_ns = qtest_latency_ns; + } + + stats->total_time_ns[cookie->type] += latency_ns; + stats->last_access_time_ns = time_ns; + + QSLIST_FOREACH(s, &stats->intervals, entries) { + timed_average_account(&s->latency[cookie->type], latency_ns); + } + } +} + +void block_acct_invalid(BlockAcctStats *stats, enum BlockAcctType type) +{ + assert(type < BLOCK_MAX_IOTYPE); + + /* block_acct_done() and block_acct_failed() update + * total_time_ns[], but this one does not. The reason is that + * invalid requests are accounted during their submission, + * therefore there's no actual I/O involved. */ + + stats->invalid_ops[type]++; + + if (stats->account_invalid) { + stats->last_access_time_ns = qemu_clock_get_ns(clock_type); + } +} + +void block_acct_merge_done(BlockAcctStats *stats, enum BlockAcctType type, + int num_requests) +{ + assert(type < BLOCK_MAX_IOTYPE); + stats->merged[type] += num_requests; +} + +int64_t block_acct_idle_time_ns(BlockAcctStats *stats) +{ + return qemu_clock_get_ns(clock_type) - stats->last_access_time_ns; +} + +double block_acct_queue_depth(BlockAcctTimedStats *stats, + enum BlockAcctType type) +{ + uint64_t sum, elapsed; + + assert(type < BLOCK_MAX_IOTYPE); + + sum = timed_average_sum(&stats->latency[type], &elapsed); + + return (double) sum / elapsed; +} diff --git a/src/block/archipelago.c b/src/block/archipelago.c new file mode 100644 index 0000000..855655c --- /dev/null +++ b/src/block/archipelago.c @@ -0,0 +1,1084 @@ +/* + * QEMU Block driver for Archipelago + * + * Copyright (C) 2014 Chrysostomos Nanakos <cnanakos@grnet.gr> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +/* + * VM Image on Archipelago volume is specified like this: + * + * file.driver=archipelago,file.volume=<volumename> + * [,file.mport=<mapperd_port>[,file.vport=<vlmcd_port>] + * [,file.segment=<segment_name>]] + * + * or + * + * file=archipelago:<volumename>[/mport=<mapperd_port>[:vport=<vlmcd_port>][: + * segment=<segment_name>]] + * + * 'archipelago' is the protocol. + * + * 'mport' is the port number on which mapperd is listening. This is optional + * and if not specified, QEMU will make Archipelago to use the default port. + * + * 'vport' is the port number on which vlmcd is listening. This is optional + * and if not specified, QEMU will make Archipelago to use the default port. + * + * 'segment' is the name of the shared memory segment Archipelago stack + * is using. This is optional and if not specified, QEMU will make Archipelago + * to use the default value, 'archipelago'. + * + * Examples: + * + * file.driver=archipelago,file.volume=my_vm_volume + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + * file.vport=1234 + * file.driver=archipelago,file.volume=my_vm_volume,file.mport=123, + * file.vport=1234,file.segment=my_segment + * + * or + * + * file=archipelago:my_vm_volume + * file=archipelago:my_vm_volume/mport=123 + * file=archipelago:my_vm_volume/mport=123:vport=1234 + * file=archipelago:my_vm_volume/mport=123:vport=1234:segment=my_segment + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/thread.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" +#include "qapi/qmp/qjson.h" +#include "qemu/atomic.h" + +#include <inttypes.h> +#include <xseg/xseg.h> +#include <xseg/protocol.h> + +#define MAX_REQUEST_SIZE 524288 + +#define ARCHIPELAGO_OPT_VOLUME "volume" +#define ARCHIPELAGO_OPT_SEGMENT "segment" +#define ARCHIPELAGO_OPT_MPORT "mport" +#define ARCHIPELAGO_OPT_VPORT "vport" +#define ARCHIPELAGO_DFL_MPORT 1001 +#define ARCHIPELAGO_DFL_VPORT 501 + +#define archipelagolog(fmt, ...) \ + do { \ + fprintf(stderr, "archipelago\t%-24s: " fmt, __func__, ##__VA_ARGS__); \ + } while (0) + +typedef enum { + ARCHIP_OP_READ, + ARCHIP_OP_WRITE, + ARCHIP_OP_FLUSH, + ARCHIP_OP_VOLINFO, + ARCHIP_OP_TRUNCATE, +} ARCHIPCmd; + +typedef struct ArchipelagoAIOCB { + BlockAIOCB common; + QEMUBH *bh; + struct BDRVArchipelagoState *s; + QEMUIOVector *qiov; + ARCHIPCmd cmd; + int status; + int64_t size; + int64_t ret; +} ArchipelagoAIOCB; + +typedef struct BDRVArchipelagoState { + ArchipelagoAIOCB *event_acb; + char *volname; + char *segment_name; + uint64_t size; + /* Archipelago specific */ + struct xseg *xseg; + struct xseg_port *port; + xport srcport; + xport sport; + xport mportno; + xport vportno; + QemuMutex archip_mutex; + QemuCond archip_cond; + bool is_signaled; + /* Request handler specific */ + QemuThread request_th; + QemuCond request_cond; + QemuMutex request_mutex; + bool th_is_signaled; + bool stopping; +} BDRVArchipelagoState; + +typedef struct ArchipelagoSegmentedRequest { + size_t count; + size_t total; + int ref; + int failed; +} ArchipelagoSegmentedRequest; + +typedef struct AIORequestData { + const char *volname; + off_t offset; + size_t size; + uint64_t bufidx; + int ret; + int op; + ArchipelagoAIOCB *aio_cb; + ArchipelagoSegmentedRequest *segreq; +} AIORequestData; + +static void qemu_archipelago_complete_aio(void *opaque); + +static void init_local_signal(struct xseg *xseg, xport sport, xport srcport) +{ + if (xseg && (sport != srcport)) { + xseg_init_local_signal(xseg, srcport); + sport = srcport; + } +} + +static void archipelago_finish_aiocb(AIORequestData *reqdata) +{ + if (reqdata->aio_cb->ret != reqdata->segreq->total) { + reqdata->aio_cb->ret = -EIO; + } else if (reqdata->aio_cb->ret == reqdata->segreq->total) { + reqdata->aio_cb->ret = 0; + } + reqdata->aio_cb->bh = aio_bh_new( + bdrv_get_aio_context(reqdata->aio_cb->common.bs), + qemu_archipelago_complete_aio, reqdata + ); + qemu_bh_schedule(reqdata->aio_cb->bh); +} + +static int wait_reply(struct xseg *xseg, xport srcport, struct xseg_port *port, + struct xseg_request *expected_req) +{ + struct xseg_request *req; + xseg_prepare_wait(xseg, srcport); + void *psd = xseg_get_signal_desc(xseg, port); + while (1) { + req = xseg_receive(xseg, srcport, X_NONBLOCK); + if (req) { + if (req != expected_req) { + archipelagolog("Unknown received request\n"); + xseg_put_request(xseg, req, srcport); + } else if (!(req->state & XS_SERVED)) { + return -1; + } else { + break; + } + } + xseg_wait_signal(xseg, psd, 100000UL); + } + xseg_cancel_wait(xseg, srcport); + return 0; +} + +static void xseg_request_handler(void *state) +{ + BDRVArchipelagoState *s = (BDRVArchipelagoState *) state; + void *psd = xseg_get_signal_desc(s->xseg, s->port); + qemu_mutex_lock(&s->request_mutex); + + while (!s->stopping) { + struct xseg_request *req; + void *data; + xseg_prepare_wait(s->xseg, s->srcport); + req = xseg_receive(s->xseg, s->srcport, X_NONBLOCK); + if (req) { + AIORequestData *reqdata; + ArchipelagoSegmentedRequest *segreq; + xseg_get_req_data(s->xseg, req, (void **)&reqdata); + + switch (reqdata->op) { + case ARCHIP_OP_READ: + data = xseg_get_data(s->xseg, req); + segreq = reqdata->segreq; + segreq->count += req->serviced; + + qemu_iovec_from_buf(reqdata->aio_cb->qiov, reqdata->bufidx, + data, + req->serviced); + + xseg_put_request(s->xseg, req, s->srcport); + + if (atomic_fetch_dec(&segreq->ref) == 1) { + if (!segreq->failed) { + reqdata->aio_cb->ret = segreq->count; + archipelago_finish_aiocb(reqdata); + g_free(segreq); + } else { + g_free(segreq); + g_free(reqdata); + } + } else { + g_free(reqdata); + } + break; + case ARCHIP_OP_WRITE: + case ARCHIP_OP_FLUSH: + segreq = reqdata->segreq; + segreq->count += req->serviced; + xseg_put_request(s->xseg, req, s->srcport); + + if (atomic_fetch_dec(&segreq->ref) == 1) { + if (!segreq->failed) { + reqdata->aio_cb->ret = segreq->count; + archipelago_finish_aiocb(reqdata); + g_free(segreq); + } else { + g_free(segreq); + g_free(reqdata); + } + } else { + g_free(reqdata); + } + break; + case ARCHIP_OP_VOLINFO: + case ARCHIP_OP_TRUNCATE: + s->is_signaled = true; + qemu_cond_signal(&s->archip_cond); + break; + } + } else { + xseg_wait_signal(s->xseg, psd, 100000UL); + } + xseg_cancel_wait(s->xseg, s->srcport); + } + + s->th_is_signaled = true; + qemu_cond_signal(&s->request_cond); + qemu_mutex_unlock(&s->request_mutex); + qemu_thread_exit(NULL); +} + +static int qemu_archipelago_xseg_init(BDRVArchipelagoState *s) +{ + if (xseg_initialize()) { + archipelagolog("Cannot initialize XSEG\n"); + goto err_exit; + } + + s->xseg = xseg_join("posix", s->segment_name, + "posixfd", NULL); + if (!s->xseg) { + archipelagolog("Cannot join XSEG shared memory segment\n"); + goto err_exit; + } + s->port = xseg_bind_dynport(s->xseg); + s->srcport = s->port->portno; + init_local_signal(s->xseg, s->sport, s->srcport); + return 0; + +err_exit: + return -1; +} + +static int qemu_archipelago_init(BDRVArchipelagoState *s) +{ + int ret; + + ret = qemu_archipelago_xseg_init(s); + if (ret < 0) { + error_report("Cannot initialize XSEG. Aborting..."); + goto err_exit; + } + + qemu_cond_init(&s->archip_cond); + qemu_mutex_init(&s->archip_mutex); + qemu_cond_init(&s->request_cond); + qemu_mutex_init(&s->request_mutex); + s->th_is_signaled = false; + qemu_thread_create(&s->request_th, "xseg_io_th", + (void *) xseg_request_handler, + (void *) s, QEMU_THREAD_JOINABLE); + +err_exit: + return ret; +} + +static void qemu_archipelago_complete_aio(void *opaque) +{ + AIORequestData *reqdata = (AIORequestData *) opaque; + ArchipelagoAIOCB *aio_cb = (ArchipelagoAIOCB *) reqdata->aio_cb; + + qemu_bh_delete(aio_cb->bh); + aio_cb->common.cb(aio_cb->common.opaque, aio_cb->ret); + aio_cb->status = 0; + + qemu_aio_unref(aio_cb); + g_free(reqdata); +} + +static void xseg_find_port(char *pstr, const char *needle, xport *aport) +{ + const char *a; + char *endptr = NULL; + unsigned long port; + if (strstart(pstr, needle, &a)) { + if (strlen(a) > 0) { + port = strtoul(a, &endptr, 10); + if (strlen(endptr)) { + *aport = -2; + return; + } + *aport = (xport) port; + } + } +} + +static void xseg_find_segment(char *pstr, const char *needle, + char **segment_name) +{ + const char *a; + if (strstart(pstr, needle, &a)) { + if (strlen(a) > 0) { + *segment_name = g_strdup(a); + } + } +} + +static void parse_filename_opts(const char *filename, Error **errp, + char **volume, char **segment_name, + xport *mport, xport *vport) +{ + const char *start; + char *tokens[4], *ds; + int idx; + xport lmport = NoPort, lvport = NoPort; + + strstart(filename, "archipelago:", &start); + + ds = g_strdup(start); + tokens[0] = strtok(ds, "/"); + tokens[1] = strtok(NULL, ":"); + tokens[2] = strtok(NULL, ":"); + tokens[3] = strtok(NULL, "\0"); + + if (!strlen(tokens[0])) { + error_setg(errp, "volume name must be specified first"); + g_free(ds); + return; + } + + for (idx = 1; idx < 4; idx++) { + if (tokens[idx] != NULL) { + if (strstart(tokens[idx], "mport=", NULL)) { + xseg_find_port(tokens[idx], "mport=", &lmport); + } + if (strstart(tokens[idx], "vport=", NULL)) { + xseg_find_port(tokens[idx], "vport=", &lvport); + } + if (strstart(tokens[idx], "segment=", NULL)) { + xseg_find_segment(tokens[idx], "segment=", segment_name); + } + } + } + + if ((lmport == -2) || (lvport == -2)) { + error_setg(errp, "mport and/or vport must be set"); + g_free(ds); + return; + } + *volume = g_strdup(tokens[0]); + *mport = lmport; + *vport = lvport; + g_free(ds); +} + +static void archipelago_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + const char *start; + char *volume = NULL, *segment_name = NULL; + xport mport = NoPort, vport = NoPort; + + if (qdict_haskey(options, ARCHIPELAGO_OPT_VOLUME) + || qdict_haskey(options, ARCHIPELAGO_OPT_SEGMENT) + || qdict_haskey(options, ARCHIPELAGO_OPT_MPORT) + || qdict_haskey(options, ARCHIPELAGO_OPT_VPORT)) { + error_setg(errp, "volume/mport/vport/segment and a file name may not" + " be specified at the same time"); + return; + } + + if (!strstart(filename, "archipelago:", &start)) { + error_setg(errp, "File name must start with 'archipelago:'"); + return; + } + + if (!strlen(start) || strstart(start, "/", NULL)) { + error_setg(errp, "volume name must be specified"); + return; + } + + parse_filename_opts(filename, errp, &volume, &segment_name, &mport, &vport); + + if (volume) { + qdict_put(options, ARCHIPELAGO_OPT_VOLUME, qstring_from_str(volume)); + g_free(volume); + } + if (segment_name) { + qdict_put(options, ARCHIPELAGO_OPT_SEGMENT, + qstring_from_str(segment_name)); + g_free(segment_name); + } + if (mport != NoPort) { + qdict_put(options, ARCHIPELAGO_OPT_MPORT, qint_from_int(mport)); + } + if (vport != NoPort) { + qdict_put(options, ARCHIPELAGO_OPT_VPORT, qint_from_int(vport)); + } +} + +static QemuOptsList archipelago_runtime_opts = { + .name = "archipelago", + .head = QTAILQ_HEAD_INITIALIZER(archipelago_runtime_opts.head), + .desc = { + { + .name = ARCHIPELAGO_OPT_VOLUME, + .type = QEMU_OPT_STRING, + .help = "Name of the volume image", + }, + { + .name = ARCHIPELAGO_OPT_SEGMENT, + .type = QEMU_OPT_STRING, + .help = "Name of the Archipelago shared memory segment", + }, + { + .name = ARCHIPELAGO_OPT_MPORT, + .type = QEMU_OPT_NUMBER, + .help = "Archipelago mapperd port number" + }, + { + .name = ARCHIPELAGO_OPT_VPORT, + .type = QEMU_OPT_NUMBER, + .help = "Archipelago vlmcd port number" + + }, + { /* end of list */ } + }, +}; + +static int qemu_archipelago_open(BlockDriverState *bs, + QDict *options, + int bdrv_flags, + Error **errp) +{ + int ret = 0; + const char *volume, *segment_name; + QemuOpts *opts; + Error *local_err = NULL; + BDRVArchipelagoState *s = bs->opaque; + + opts = qemu_opts_create(&archipelago_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto err_exit; + } + + s->mportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_MPORT, + ARCHIPELAGO_DFL_MPORT); + s->vportno = qemu_opt_get_number(opts, ARCHIPELAGO_OPT_VPORT, + ARCHIPELAGO_DFL_VPORT); + + segment_name = qemu_opt_get(opts, ARCHIPELAGO_OPT_SEGMENT); + if (segment_name == NULL) { + s->segment_name = g_strdup("archipelago"); + } else { + s->segment_name = g_strdup(segment_name); + } + + volume = qemu_opt_get(opts, ARCHIPELAGO_OPT_VOLUME); + if (volume == NULL) { + error_setg(errp, "archipelago block driver requires the 'volume'" + " option"); + ret = -EINVAL; + goto err_exit; + } + s->volname = g_strdup(volume); + + /* Initialize XSEG, join shared memory segment */ + ret = qemu_archipelago_init(s); + if (ret < 0) { + error_setg(errp, "cannot initialize XSEG and join shared " + "memory segment"); + goto err_exit; + } + + qemu_opts_del(opts); + return 0; + +err_exit: + g_free(s->volname); + g_free(s->segment_name); + qemu_opts_del(opts); + return ret; +} + +static void qemu_archipelago_close(BlockDriverState *bs) +{ + int r, targetlen; + char *target; + struct xseg_request *req; + BDRVArchipelagoState *s = bs->opaque; + + s->stopping = true; + + qemu_mutex_lock(&s->request_mutex); + while (!s->th_is_signaled) { + qemu_cond_wait(&s->request_cond, + &s->request_mutex); + } + qemu_mutex_unlock(&s->request_mutex); + qemu_thread_join(&s->request_th); + qemu_cond_destroy(&s->request_cond); + qemu_mutex_destroy(&s->request_mutex); + + qemu_cond_destroy(&s->archip_cond); + qemu_mutex_destroy(&s->archip_mutex); + + targetlen = strlen(s->volname); + req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit; + } + r = xseg_prep_request(s->xseg, req, targetlen, 0); + if (r < 0) { + xseg_put_request(s->xseg, req, s->srcport); + archipelagolog("Cannot prepare XSEG close request\n"); + goto err_exit; + } + + target = xseg_get_target(s->xseg, req); + memcpy(target, s->volname, targetlen); + req->size = req->datalen; + req->offset = 0; + req->op = X_CLOSE; + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + xseg_put_request(s->xseg, req, s->srcport); + archipelagolog("Cannot submit XSEG close request\n"); + goto err_exit; + } + + xseg_signal(s->xseg, p); + wait_reply(s->xseg, s->srcport, s->port, req); + + xseg_put_request(s->xseg, req, s->srcport); + +err_exit: + g_free(s->volname); + g_free(s->segment_name); + xseg_quit_local_signal(s->xseg, s->srcport); + xseg_leave_dynport(s->xseg, s->port); + xseg_leave(s->xseg); +} + +static int qemu_archipelago_create_volume(Error **errp, const char *volname, + char *segment_name, + uint64_t size, xport mportno, + xport vportno) +{ + int ret, targetlen; + struct xseg *xseg = NULL; + struct xseg_request *req; + struct xseg_request_clone *xclone; + struct xseg_port *port; + xport srcport = NoPort, sport = NoPort; + char *target; + + /* Try default values if none has been set */ + if (mportno == (xport) -1) { + mportno = ARCHIPELAGO_DFL_MPORT; + } + + if (vportno == (xport) -1) { + vportno = ARCHIPELAGO_DFL_VPORT; + } + + if (xseg_initialize()) { + error_setg(errp, "Cannot initialize XSEG"); + return -1; + } + + xseg = xseg_join("posix", segment_name, + "posixfd", NULL); + + if (!xseg) { + error_setg(errp, "Cannot join XSEG shared memory segment"); + return -1; + } + + port = xseg_bind_dynport(xseg); + srcport = port->portno; + init_local_signal(xseg, sport, srcport); + + req = xseg_get_request(xseg, srcport, mportno, X_ALLOC); + if (!req) { + error_setg(errp, "Cannot get XSEG request"); + return -1; + } + + targetlen = strlen(volname); + ret = xseg_prep_request(xseg, req, targetlen, + sizeof(struct xseg_request_clone)); + if (ret < 0) { + error_setg(errp, "Cannot prepare XSEG request"); + goto err_exit; + } + + target = xseg_get_target(xseg, req); + if (!target) { + error_setg(errp, "Cannot get XSEG target."); + goto err_exit; + } + memcpy(target, volname, targetlen); + xclone = (struct xseg_request_clone *) xseg_get_data(xseg, req); + memset(xclone->target, 0 , XSEG_MAX_TARGETLEN); + xclone->targetlen = 0; + xclone->size = size; + req->offset = 0; + req->size = req->datalen; + req->op = X_CLONE; + + xport p = xseg_submit(xseg, req, srcport, X_ALLOC); + if (p == NoPort) { + error_setg(errp, "Could not submit XSEG request"); + goto err_exit; + } + xseg_signal(xseg, p); + + ret = wait_reply(xseg, srcport, port, req); + if (ret < 0) { + error_setg(errp, "wait_reply() error."); + } + + xseg_put_request(xseg, req, srcport); + xseg_quit_local_signal(xseg, srcport); + xseg_leave_dynport(xseg, port); + xseg_leave(xseg); + return ret; + +err_exit: + xseg_put_request(xseg, req, srcport); + xseg_quit_local_signal(xseg, srcport); + xseg_leave_dynport(xseg, port); + xseg_leave(xseg); + return -1; +} + +static int qemu_archipelago_create(const char *filename, + QemuOpts *options, + Error **errp) +{ + int ret = 0; + uint64_t total_size = 0; + char *volname = NULL, *segment_name = NULL; + const char *start; + xport mport = NoPort, vport = NoPort; + + if (!strstart(filename, "archipelago:", &start)) { + error_setg(errp, "File name must start with 'archipelago:'"); + return -1; + } + + if (!strlen(start) || strstart(start, "/", NULL)) { + error_setg(errp, "volume name must be specified"); + return -1; + } + + parse_filename_opts(filename, errp, &volname, &segment_name, &mport, + &vport); + total_size = ROUND_UP(qemu_opt_get_size_del(options, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + + if (segment_name == NULL) { + segment_name = g_strdup("archipelago"); + } + + /* Create an Archipelago volume */ + ret = qemu_archipelago_create_volume(errp, volname, segment_name, + total_size, mport, + vport); + + g_free(volname); + g_free(segment_name); + return ret; +} + +static const AIOCBInfo archipelago_aiocb_info = { + .aiocb_size = sizeof(ArchipelagoAIOCB), +}; + +static int archipelago_submit_request(BDRVArchipelagoState *s, + uint64_t bufidx, + size_t count, + off_t offset, + ArchipelagoAIOCB *aio_cb, + ArchipelagoSegmentedRequest *segreq, + int op) +{ + int ret, targetlen; + char *target; + void *data = NULL; + struct xseg_request *req; + AIORequestData *reqdata = g_new(AIORequestData, 1); + + targetlen = strlen(s->volname); + req = xseg_get_request(s->xseg, s->srcport, s->vportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + ret = xseg_prep_request(s->xseg, req, targetlen, count); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, s->volname, targetlen); + req->size = count; + req->offset = offset; + + switch (op) { + case ARCHIP_OP_READ: + req->op = X_READ; + break; + case ARCHIP_OP_WRITE: + req->op = X_WRITE; + break; + case ARCHIP_OP_FLUSH: + req->op = X_FLUSH; + break; + } + reqdata->volname = s->volname; + reqdata->offset = offset; + reqdata->size = count; + reqdata->bufidx = bufidx; + reqdata->aio_cb = aio_cb; + reqdata->segreq = segreq; + reqdata->op = op; + + xseg_set_req_data(s->xseg, req, reqdata); + if (op == ARCHIP_OP_WRITE) { + data = xseg_get_data(s->xseg, req); + if (!data) { + archipelagolog("Cannot get XSEG data\n"); + goto err_exit; + } + qemu_iovec_to_buf(aio_cb->qiov, bufidx, data, count); + } + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Could not submit XSEG request\n"); + goto err_exit; + } + xseg_signal(s->xseg, p); + return 0; + +err_exit: + g_free(reqdata); + xseg_put_request(s->xseg, req, s->srcport); + return -EIO; +err_exit2: + g_free(reqdata); + return -EIO; +} + +static int archipelago_aio_segmented_rw(BDRVArchipelagoState *s, + size_t count, + off_t offset, + ArchipelagoAIOCB *aio_cb, + int op) +{ + int ret, segments_nr; + size_t pos = 0; + ArchipelagoSegmentedRequest *segreq; + + segreq = g_new0(ArchipelagoSegmentedRequest, 1); + + if (op == ARCHIP_OP_FLUSH) { + segments_nr = 1; + } else { + segments_nr = (int)(count / MAX_REQUEST_SIZE) + \ + ((count % MAX_REQUEST_SIZE) ? 1 : 0); + } + segreq->total = count; + atomic_mb_set(&segreq->ref, segments_nr); + + while (segments_nr > 1) { + ret = archipelago_submit_request(s, pos, + MAX_REQUEST_SIZE, + offset + pos, + aio_cb, segreq, op); + + if (ret < 0) { + goto err_exit; + } + count -= MAX_REQUEST_SIZE; + pos += MAX_REQUEST_SIZE; + segments_nr--; + } + ret = archipelago_submit_request(s, pos, count, offset + pos, + aio_cb, segreq, op); + + if (ret < 0) { + goto err_exit; + } + return 0; + +err_exit: + segreq->failed = 1; + if (atomic_fetch_sub(&segreq->ref, segments_nr) == segments_nr) { + g_free(segreq); + } + return ret; +} + +static BlockAIOCB *qemu_archipelago_aio_rw(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int op) +{ + ArchipelagoAIOCB *aio_cb; + BDRVArchipelagoState *s = bs->opaque; + int64_t size, off; + int ret; + + aio_cb = qemu_aio_get(&archipelago_aiocb_info, bs, cb, opaque); + aio_cb->cmd = op; + aio_cb->qiov = qiov; + + aio_cb->ret = 0; + aio_cb->s = s; + aio_cb->status = -EINPROGRESS; + + off = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + aio_cb->size = size; + + ret = archipelago_aio_segmented_rw(s, size, off, + aio_cb, op); + if (ret < 0) { + goto err_exit; + } + return &aio_cb->common; + +err_exit: + error_report("qemu_archipelago_aio_rw(): I/O Error"); + qemu_aio_unref(aio_cb); + return NULL; +} + +static BlockAIOCB *qemu_archipelago_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, ARCHIP_OP_READ); +} + +static BlockAIOCB *qemu_archipelago_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, sector_num, qiov, nb_sectors, cb, + opaque, ARCHIP_OP_WRITE); +} + +static int64_t archipelago_volume_info(BDRVArchipelagoState *s) +{ + uint64_t size; + int ret, targetlen; + struct xseg_request *req; + struct xseg_reply_info *xinfo; + AIORequestData *reqdata = g_new(AIORequestData, 1); + + const char *volname = s->volname; + targetlen = strlen(volname); + req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + ret = xseg_prep_request(s->xseg, req, targetlen, + sizeof(struct xseg_reply_info)); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + char *target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, volname, targetlen); + req->size = req->datalen; + req->offset = 0; + req->op = X_INFO; + + reqdata->op = ARCHIP_OP_VOLINFO; + reqdata->volname = volname; + xseg_set_req_data(s->xseg, req, reqdata); + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Cannot submit XSEG request\n"); + goto err_exit; + } + xseg_signal(s->xseg, p); + qemu_mutex_lock(&s->archip_mutex); + while (!s->is_signaled) { + qemu_cond_wait(&s->archip_cond, &s->archip_mutex); + } + s->is_signaled = false; + qemu_mutex_unlock(&s->archip_mutex); + + xinfo = (struct xseg_reply_info *) xseg_get_data(s->xseg, req); + size = xinfo->size; + xseg_put_request(s->xseg, req, s->srcport); + g_free(reqdata); + s->size = size; + return size; + +err_exit: + xseg_put_request(s->xseg, req, s->srcport); +err_exit2: + g_free(reqdata); + return -EIO; +} + +static int64_t qemu_archipelago_getlength(BlockDriverState *bs) +{ + int64_t ret; + BDRVArchipelagoState *s = bs->opaque; + + ret = archipelago_volume_info(s); + return ret; +} + +static int qemu_archipelago_truncate(BlockDriverState *bs, int64_t offset) +{ + int ret, targetlen; + struct xseg_request *req; + BDRVArchipelagoState *s = bs->opaque; + AIORequestData *reqdata = g_new(AIORequestData, 1); + + const char *volname = s->volname; + targetlen = strlen(volname); + req = xseg_get_request(s->xseg, s->srcport, s->mportno, X_ALLOC); + if (!req) { + archipelagolog("Cannot get XSEG request\n"); + goto err_exit2; + } + + ret = xseg_prep_request(s->xseg, req, targetlen, 0); + if (ret < 0) { + archipelagolog("Cannot prepare XSEG request\n"); + goto err_exit; + } + char *target = xseg_get_target(s->xseg, req); + if (!target) { + archipelagolog("Cannot get XSEG target\n"); + goto err_exit; + } + memcpy(target, volname, targetlen); + req->offset = offset; + req->op = X_TRUNCATE; + + reqdata->op = ARCHIP_OP_TRUNCATE; + reqdata->volname = volname; + + xseg_set_req_data(s->xseg, req, reqdata); + + xport p = xseg_submit(s->xseg, req, s->srcport, X_ALLOC); + if (p == NoPort) { + archipelagolog("Cannot submit XSEG request\n"); + goto err_exit; + } + + xseg_signal(s->xseg, p); + qemu_mutex_lock(&s->archip_mutex); + while (!s->is_signaled) { + qemu_cond_wait(&s->archip_cond, &s->archip_mutex); + } + s->is_signaled = false; + qemu_mutex_unlock(&s->archip_mutex); + xseg_put_request(s->xseg, req, s->srcport); + g_free(reqdata); + return 0; + +err_exit: + xseg_put_request(s->xseg, req, s->srcport); +err_exit2: + g_free(reqdata); + return -EIO; +} + +static QemuOptsList qemu_archipelago_create_opts = { + .name = "archipelago-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_archipelago_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +static BlockAIOCB *qemu_archipelago_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + return qemu_archipelago_aio_rw(bs, 0, NULL, 0, cb, opaque, + ARCHIP_OP_FLUSH); +} + +static BlockDriver bdrv_archipelago = { + .format_name = "archipelago", + .protocol_name = "archipelago", + .instance_size = sizeof(BDRVArchipelagoState), + .bdrv_parse_filename = archipelago_parse_filename, + .bdrv_file_open = qemu_archipelago_open, + .bdrv_close = qemu_archipelago_close, + .bdrv_create = qemu_archipelago_create, + .bdrv_getlength = qemu_archipelago_getlength, + .bdrv_truncate = qemu_archipelago_truncate, + .bdrv_aio_readv = qemu_archipelago_aio_readv, + .bdrv_aio_writev = qemu_archipelago_aio_writev, + .bdrv_aio_flush = qemu_archipelago_aio_flush, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .create_opts = &qemu_archipelago_create_opts, +}; + +static void bdrv_archipelago_init(void) +{ + bdrv_register(&bdrv_archipelago); +} + +block_init(bdrv_archipelago_init); diff --git a/src/block/backup.c b/src/block/backup.c new file mode 100644 index 0000000..705bb77 --- /dev/null +++ b/src/block/backup.c @@ -0,0 +1,586 @@ +/* + * QEMU backup + * + * Copyright (C) 2013 Proxmox Server Solutions + * + * Authors: + * Dietmar Maurer (dietmar@proxmox.com) + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ + +#include <stdio.h> +#include <errno.h> +#include <unistd.h> + +#include "trace.h" +#include "block/block.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" +#include "sysemu/block-backend.h" + +#define BACKUP_CLUSTER_BITS 16 +#define BACKUP_CLUSTER_SIZE (1 << BACKUP_CLUSTER_BITS) +#define BACKUP_SECTORS_PER_CLUSTER (BACKUP_CLUSTER_SIZE / BDRV_SECTOR_SIZE) + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct CowRequest { + int64_t start; + int64_t end; + QLIST_ENTRY(CowRequest) list; + CoQueue wait_queue; /* coroutines blocked on this request */ +} CowRequest; + +typedef struct BackupBlockJob { + BlockJob common; + BlockDriverState *target; + /* bitmap for sync=incremental */ + BdrvDirtyBitmap *sync_bitmap; + MirrorSyncMode sync_mode; + RateLimit limit; + BlockdevOnError on_source_error; + BlockdevOnError on_target_error; + CoRwlock flush_rwlock; + uint64_t sectors_read; + HBitmap *bitmap; + QLIST_HEAD(, CowRequest) inflight_reqs; +} BackupBlockJob; + +/* See if in-flight requests overlap and wait for them to complete */ +static void coroutine_fn wait_for_overlapping_requests(BackupBlockJob *job, + int64_t start, + int64_t end) +{ + CowRequest *req; + bool retry; + + do { + retry = false; + QLIST_FOREACH(req, &job->inflight_reqs, list) { + if (end > req->start && start < req->end) { + qemu_co_queue_wait(&req->wait_queue); + retry = true; + break; + } + } + } while (retry); +} + +/* Keep track of an in-flight request */ +static void cow_request_begin(CowRequest *req, BackupBlockJob *job, + int64_t start, int64_t end) +{ + req->start = start; + req->end = end; + qemu_co_queue_init(&req->wait_queue); + QLIST_INSERT_HEAD(&job->inflight_reqs, req, list); +} + +/* Forget about a completed request */ +static void cow_request_end(CowRequest *req) +{ + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +static int coroutine_fn backup_do_cow(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + bool *error_is_read, + bool is_write_notifier) +{ + BackupBlockJob *job = (BackupBlockJob *)bs->job; + CowRequest cow_request; + struct iovec iov; + QEMUIOVector bounce_qiov; + void *bounce_buffer = NULL; + int ret = 0; + int64_t start, end; + int n; + + qemu_co_rwlock_rdlock(&job->flush_rwlock); + + start = sector_num / BACKUP_SECTORS_PER_CLUSTER; + end = DIV_ROUND_UP(sector_num + nb_sectors, BACKUP_SECTORS_PER_CLUSTER); + + trace_backup_do_cow_enter(job, start, sector_num, nb_sectors); + + wait_for_overlapping_requests(job, start, end); + cow_request_begin(&cow_request, job, start, end); + + for (; start < end; start++) { + if (hbitmap_get(job->bitmap, start)) { + trace_backup_do_cow_skip(job, start); + continue; /* already copied */ + } + + trace_backup_do_cow_process(job, start); + + n = MIN(BACKUP_SECTORS_PER_CLUSTER, + job->common.len / BDRV_SECTOR_SIZE - + start * BACKUP_SECTORS_PER_CLUSTER); + + if (!bounce_buffer) { + bounce_buffer = qemu_blockalign(bs, BACKUP_CLUSTER_SIZE); + } + iov.iov_base = bounce_buffer; + iov.iov_len = n * BDRV_SECTOR_SIZE; + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + if (is_write_notifier) { + ret = bdrv_co_readv_no_serialising(bs, + start * BACKUP_SECTORS_PER_CLUSTER, + n, &bounce_qiov); + } else { + ret = bdrv_co_readv(bs, start * BACKUP_SECTORS_PER_CLUSTER, n, + &bounce_qiov); + } + if (ret < 0) { + trace_backup_do_cow_read_fail(job, start, ret); + if (error_is_read) { + *error_is_read = true; + } + goto out; + } + + if (buffer_is_zero(iov.iov_base, iov.iov_len)) { + ret = bdrv_co_write_zeroes(job->target, + start * BACKUP_SECTORS_PER_CLUSTER, + n, BDRV_REQ_MAY_UNMAP); + } else { + ret = bdrv_co_writev(job->target, + start * BACKUP_SECTORS_PER_CLUSTER, n, + &bounce_qiov); + } + if (ret < 0) { + trace_backup_do_cow_write_fail(job, start, ret); + if (error_is_read) { + *error_is_read = false; + } + goto out; + } + + hbitmap_set(job->bitmap, start, 1); + + /* Publish progress, guest I/O counts as progress too. Note that the + * offset field is an opaque progress value, it is not a disk offset. + */ + job->sectors_read += n; + job->common.offset += n * BDRV_SECTOR_SIZE; + } + +out: + if (bounce_buffer) { + qemu_vfree(bounce_buffer); + } + + cow_request_end(&cow_request); + + trace_backup_do_cow_return(job, sector_num, nb_sectors, ret); + + qemu_co_rwlock_unlock(&job->flush_rwlock); + + return ret; +} + +static int coroutine_fn backup_before_write_notify( + NotifierWithReturn *notifier, + void *opaque) +{ + BdrvTrackedRequest *req = opaque; + int64_t sector_num = req->offset >> BDRV_SECTOR_BITS; + int nb_sectors = req->bytes >> BDRV_SECTOR_BITS; + + assert((req->offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((req->bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + + return backup_do_cow(req->bs, sector_num, nb_sectors, NULL, true); +} + +static void backup_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + + if (speed < 0) { + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); + return; + } + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static void backup_iostatus_reset(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + + if (s->target->blk) { + blk_iostatus_reset(s->target->blk); + } +} + +static void backup_cleanup_sync_bitmap(BackupBlockJob *job, int ret) +{ + BdrvDirtyBitmap *bm; + BlockDriverState *bs = job->common.bs; + + if (ret < 0 || block_job_is_cancelled(&job->common)) { + /* Merge the successor back into the parent, delete nothing. */ + bm = bdrv_reclaim_dirty_bitmap(bs, job->sync_bitmap, NULL); + assert(bm); + } else { + /* Everything is fine, delete this bitmap and install the backup. */ + bm = bdrv_dirty_bitmap_abdicate(bs, job->sync_bitmap, NULL); + assert(bm); + } +} + +static void backup_commit(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + if (s->sync_bitmap) { + backup_cleanup_sync_bitmap(s, 0); + } +} + +static void backup_abort(BlockJob *job) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + if (s->sync_bitmap) { + backup_cleanup_sync_bitmap(s, -1); + } +} + +static const BlockJobDriver backup_job_driver = { + .instance_size = sizeof(BackupBlockJob), + .job_type = BLOCK_JOB_TYPE_BACKUP, + .set_speed = backup_set_speed, + .iostatus_reset = backup_iostatus_reset, + .commit = backup_commit, + .abort = backup_abort, +}; + +static BlockErrorAction backup_error_action(BackupBlockJob *job, + bool read, int error) +{ + if (read) { + return block_job_error_action(&job->common, job->common.bs, + job->on_source_error, true, error); + } else { + return block_job_error_action(&job->common, job->target, + job->on_target_error, false, error); + } +} + +typedef struct { + int ret; +} BackupCompleteData; + +static void backup_complete(BlockJob *job, void *opaque) +{ + BackupBlockJob *s = container_of(job, BackupBlockJob, common); + BackupCompleteData *data = opaque; + + bdrv_unref(s->target); + + block_job_completed(job, data->ret); + g_free(data); +} + +static bool coroutine_fn yield_and_check(BackupBlockJob *job) +{ + if (block_job_is_cancelled(&job->common)) { + return true; + } + + /* we need to yield so that bdrv_drain_all() returns. + * (without, VM does not reboot) + */ + if (job->common.speed) { + uint64_t delay_ns = ratelimit_calculate_delay(&job->limit, + job->sectors_read); + job->sectors_read = 0; + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, delay_ns); + } else { + block_job_sleep_ns(&job->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&job->common)) { + return true; + } + + return false; +} + +static int coroutine_fn backup_run_incremental(BackupBlockJob *job) +{ + bool error_is_read; + int ret = 0; + int clusters_per_iter; + uint32_t granularity; + int64_t sector; + int64_t cluster; + int64_t end; + int64_t last_cluster = -1; + BlockDriverState *bs = job->common.bs; + HBitmapIter hbi; + + granularity = bdrv_dirty_bitmap_granularity(job->sync_bitmap); + clusters_per_iter = MAX((granularity / BACKUP_CLUSTER_SIZE), 1); + bdrv_dirty_iter_init(job->sync_bitmap, &hbi); + + /* Find the next dirty sector(s) */ + while ((sector = hbitmap_iter_next(&hbi)) != -1) { + cluster = sector / BACKUP_SECTORS_PER_CLUSTER; + + /* Fake progress updates for any clusters we skipped */ + if (cluster != last_cluster + 1) { + job->common.offset += ((cluster - last_cluster - 1) * + BACKUP_CLUSTER_SIZE); + } + + for (end = cluster + clusters_per_iter; cluster < end; cluster++) { + do { + if (yield_and_check(job)) { + return ret; + } + ret = backup_do_cow(bs, cluster * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read, + false); + if ((ret < 0) && + backup_error_action(job, error_is_read, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + return ret; + } + } while (ret < 0); + } + + /* If the bitmap granularity is smaller than the backup granularity, + * we need to advance the iterator pointer to the next cluster. */ + if (granularity < BACKUP_CLUSTER_SIZE) { + bdrv_set_dirty_iter(&hbi, cluster * BACKUP_SECTORS_PER_CLUSTER); + } + + last_cluster = cluster - 1; + } + + /* Play some final catchup with the progress meter */ + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + if (last_cluster + 1 < end) { + job->common.offset += ((end - last_cluster - 1) * BACKUP_CLUSTER_SIZE); + } + + return ret; +} + +static void coroutine_fn backup_run(void *opaque) +{ + BackupBlockJob *job = opaque; + BackupCompleteData *data; + BlockDriverState *bs = job->common.bs; + BlockDriverState *target = job->target; + BlockdevOnError on_target_error = job->on_target_error; + NotifierWithReturn before_write = { + .notify = backup_before_write_notify, + }; + int64_t start, end; + int ret = 0; + + QLIST_INIT(&job->inflight_reqs); + qemu_co_rwlock_init(&job->flush_rwlock); + + start = 0; + end = DIV_ROUND_UP(job->common.len, BACKUP_CLUSTER_SIZE); + + job->bitmap = hbitmap_alloc(end, 0); + + bdrv_set_enable_write_cache(target, true); + if (target->blk) { + blk_set_on_error(target->blk, on_target_error, on_target_error); + blk_iostatus_enable(target->blk); + } + + bdrv_add_before_write_notifier(bs, &before_write); + + if (job->sync_mode == MIRROR_SYNC_MODE_NONE) { + while (!block_job_is_cancelled(&job->common)) { + /* Yield until the job is cancelled. We just let our before_write + * notify callback service CoW requests. */ + job->common.busy = false; + qemu_coroutine_yield(); + job->common.busy = true; + } + } else if (job->sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { + ret = backup_run_incremental(job); + } else { + /* Both FULL and TOP SYNC_MODE's require copying.. */ + for (; start < end; start++) { + bool error_is_read; + if (yield_and_check(job)) { + break; + } + + if (job->sync_mode == MIRROR_SYNC_MODE_TOP) { + int i, n; + int alloced = 0; + + /* Check to see if these blocks are already in the + * backing file. */ + + for (i = 0; i < BACKUP_SECTORS_PER_CLUSTER;) { + /* bdrv_is_allocated() only returns true/false based + * on the first set of sectors it comes across that + * are are all in the same state. + * For that reason we must verify each sector in the + * backup cluster length. We end up copying more than + * needed but at some point that is always the case. */ + alloced = + bdrv_is_allocated(bs, + start * BACKUP_SECTORS_PER_CLUSTER + i, + BACKUP_SECTORS_PER_CLUSTER - i, &n); + i += n; + + if (alloced == 1 || n == 0) { + break; + } + } + + /* If the above loop never found any sectors that are in + * the topmost image, skip this backup. */ + if (alloced == 0) { + continue; + } + } + /* FULL sync mode we copy the whole drive. */ + ret = backup_do_cow(bs, start * BACKUP_SECTORS_PER_CLUSTER, + BACKUP_SECTORS_PER_CLUSTER, &error_is_read, false); + if (ret < 0) { + /* Depending on error action, fail now or retry cluster */ + BlockErrorAction action = + backup_error_action(job, error_is_read, -ret); + if (action == BLOCK_ERROR_ACTION_REPORT) { + break; + } else { + start--; + continue; + } + } + } + } + + notifier_with_return_remove(&before_write); + + /* wait until pending backup_do_cow() calls have completed */ + qemu_co_rwlock_wrlock(&job->flush_rwlock); + qemu_co_rwlock_unlock(&job->flush_rwlock); + hbitmap_free(job->bitmap); + + if (target->blk) { + blk_iostatus_disable(target->blk); + } + bdrv_op_unblock_all(target, job->common.blocker); + + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&job->common, backup_complete, data); +} + +void backup_start(BlockDriverState *bs, BlockDriverState *target, + int64_t speed, MirrorSyncMode sync_mode, + BdrvDirtyBitmap *sync_bitmap, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + BlockCompletionFunc *cb, void *opaque, + BlockJobTxn *txn, Error **errp) +{ + int64_t len; + + assert(bs); + assert(target); + assert(cb); + + if (bs == target) { + error_setg(errp, "Source and target cannot be the same"); + return; + } + + if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || + on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { + error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); + return; + } + + if (!bdrv_is_inserted(bs)) { + error_setg(errp, "Device is not inserted: %s", + bdrv_get_device_name(bs)); + return; + } + + if (!bdrv_is_inserted(target)) { + error_setg(errp, "Device is not inserted: %s", + bdrv_get_device_name(target)); + return; + } + + if (bdrv_op_is_blocked(bs, BLOCK_OP_TYPE_BACKUP_SOURCE, errp)) { + return; + } + + if (bdrv_op_is_blocked(target, BLOCK_OP_TYPE_BACKUP_TARGET, errp)) { + return; + } + + if (sync_mode == MIRROR_SYNC_MODE_INCREMENTAL) { + if (!sync_bitmap) { + error_setg(errp, "must provide a valid bitmap name for " + "\"incremental\" sync mode"); + return; + } + + /* Create a new bitmap, and freeze/disable this one. */ + if (bdrv_dirty_bitmap_create_successor(bs, sync_bitmap, errp) < 0) { + return; + } + } else if (sync_bitmap) { + error_setg(errp, + "a sync_bitmap was provided to backup_run, " + "but received an incompatible sync_mode (%s)", + MirrorSyncMode_lookup[sync_mode]); + return; + } + + len = bdrv_getlength(bs); + if (len < 0) { + error_setg_errno(errp, -len, "unable to get length for '%s'", + bdrv_get_device_name(bs)); + goto error; + } + + BackupBlockJob *job = block_job_create(&backup_job_driver, bs, speed, + cb, opaque, errp); + if (!job) { + goto error; + } + + bdrv_op_block_all(target, job->common.blocker); + + job->on_source_error = on_source_error; + job->on_target_error = on_target_error; + job->target = target; + job->sync_mode = sync_mode; + job->sync_bitmap = sync_mode == MIRROR_SYNC_MODE_INCREMENTAL ? + sync_bitmap : NULL; + job->common.len = len; + job->common.co = qemu_coroutine_create(backup_run); + block_job_txn_add_job(txn, &job->common); + qemu_coroutine_enter(job->common.co, job); + return; + + error: + if (sync_bitmap) { + bdrv_reclaim_dirty_bitmap(bs, sync_bitmap, NULL); + } +} diff --git a/src/block/blkdebug.c b/src/block/blkdebug.c new file mode 100644 index 0000000..dee3a0e --- /dev/null +++ b/src/block/blkdebug.c @@ -0,0 +1,812 @@ +/* + * Block protocol for I/O error injection + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" +#include "sysemu/qtest.h" + +typedef struct BDRVBlkdebugState { + int state; + int new_state; + + QLIST_HEAD(, BlkdebugRule) rules[BLKDBG_EVENT_MAX]; + QSIMPLEQ_HEAD(, BlkdebugRule) active_rules; + QLIST_HEAD(, BlkdebugSuspendedReq) suspended_reqs; +} BDRVBlkdebugState; + +typedef struct BlkdebugAIOCB { + BlockAIOCB common; + QEMUBH *bh; + int ret; +} BlkdebugAIOCB; + +typedef struct BlkdebugSuspendedReq { + Coroutine *co; + char *tag; + QLIST_ENTRY(BlkdebugSuspendedReq) next; +} BlkdebugSuspendedReq; + +static const AIOCBInfo blkdebug_aiocb_info = { + .aiocb_size = sizeof(BlkdebugAIOCB), +}; + +enum { + ACTION_INJECT_ERROR, + ACTION_SET_STATE, + ACTION_SUSPEND, +}; + +typedef struct BlkdebugRule { + BlkDebugEvent event; + int action; + int state; + union { + struct { + int error; + int immediately; + int once; + int64_t sector; + } inject; + struct { + int new_state; + } set_state; + struct { + char *tag; + } suspend; + } options; + QLIST_ENTRY(BlkdebugRule) next; + QSIMPLEQ_ENTRY(BlkdebugRule) active_next; +} BlkdebugRule; + +static QemuOptsList inject_error_opts = { + .name = "inject-error", + .head = QTAILQ_HEAD_INITIALIZER(inject_error_opts.head), + .desc = { + { + .name = "event", + .type = QEMU_OPT_STRING, + }, + { + .name = "state", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "errno", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "sector", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "once", + .type = QEMU_OPT_BOOL, + }, + { + .name = "immediately", + .type = QEMU_OPT_BOOL, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList set_state_opts = { + .name = "set-state", + .head = QTAILQ_HEAD_INITIALIZER(set_state_opts.head), + .desc = { + { + .name = "event", + .type = QEMU_OPT_STRING, + }, + { + .name = "state", + .type = QEMU_OPT_NUMBER, + }, + { + .name = "new_state", + .type = QEMU_OPT_NUMBER, + }, + { /* end of list */ } + }, +}; + +static QemuOptsList *config_groups[] = { + &inject_error_opts, + &set_state_opts, + NULL +}; + +static const char *event_names[BLKDBG_EVENT_MAX] = { + [BLKDBG_L1_UPDATE] = "l1_update", + [BLKDBG_L1_GROW_ALLOC_TABLE] = "l1_grow.alloc_table", + [BLKDBG_L1_GROW_WRITE_TABLE] = "l1_grow.write_table", + [BLKDBG_L1_GROW_ACTIVATE_TABLE] = "l1_grow.activate_table", + + [BLKDBG_L2_LOAD] = "l2_load", + [BLKDBG_L2_UPDATE] = "l2_update", + [BLKDBG_L2_UPDATE_COMPRESSED] = "l2_update_compressed", + [BLKDBG_L2_ALLOC_COW_READ] = "l2_alloc.cow_read", + [BLKDBG_L2_ALLOC_WRITE] = "l2_alloc.write", + + [BLKDBG_READ_AIO] = "read_aio", + [BLKDBG_READ_BACKING_AIO] = "read_backing_aio", + [BLKDBG_READ_COMPRESSED] = "read_compressed", + + [BLKDBG_WRITE_AIO] = "write_aio", + [BLKDBG_WRITE_COMPRESSED] = "write_compressed", + + [BLKDBG_VMSTATE_LOAD] = "vmstate_load", + [BLKDBG_VMSTATE_SAVE] = "vmstate_save", + + [BLKDBG_COW_READ] = "cow_read", + [BLKDBG_COW_WRITE] = "cow_write", + + [BLKDBG_REFTABLE_LOAD] = "reftable_load", + [BLKDBG_REFTABLE_GROW] = "reftable_grow", + [BLKDBG_REFTABLE_UPDATE] = "reftable_update", + + [BLKDBG_REFBLOCK_LOAD] = "refblock_load", + [BLKDBG_REFBLOCK_UPDATE] = "refblock_update", + [BLKDBG_REFBLOCK_UPDATE_PART] = "refblock_update_part", + [BLKDBG_REFBLOCK_ALLOC] = "refblock_alloc", + [BLKDBG_REFBLOCK_ALLOC_HOOKUP] = "refblock_alloc.hookup", + [BLKDBG_REFBLOCK_ALLOC_WRITE] = "refblock_alloc.write", + [BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS] = "refblock_alloc.write_blocks", + [BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE] = "refblock_alloc.write_table", + [BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE] = "refblock_alloc.switch_table", + + [BLKDBG_CLUSTER_ALLOC] = "cluster_alloc", + [BLKDBG_CLUSTER_ALLOC_BYTES] = "cluster_alloc_bytes", + [BLKDBG_CLUSTER_FREE] = "cluster_free", + + [BLKDBG_FLUSH_TO_OS] = "flush_to_os", + [BLKDBG_FLUSH_TO_DISK] = "flush_to_disk", + + [BLKDBG_PWRITEV_RMW_HEAD] = "pwritev_rmw.head", + [BLKDBG_PWRITEV_RMW_AFTER_HEAD] = "pwritev_rmw.after_head", + [BLKDBG_PWRITEV_RMW_TAIL] = "pwritev_rmw.tail", + [BLKDBG_PWRITEV_RMW_AFTER_TAIL] = "pwritev_rmw.after_tail", + [BLKDBG_PWRITEV] = "pwritev", + [BLKDBG_PWRITEV_ZERO] = "pwritev_zero", + [BLKDBG_PWRITEV_DONE] = "pwritev_done", + + [BLKDBG_EMPTY_IMAGE_PREPARE] = "empty_image_prepare", +}; + +static int get_event_by_name(const char *name, BlkDebugEvent *event) +{ + int i; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + if (!strcmp(event_names[i], name)) { + *event = i; + return 0; + } + } + + return -1; +} + +struct add_rule_data { + BDRVBlkdebugState *s; + int action; +}; + +static int add_rule(void *opaque, QemuOpts *opts, Error **errp) +{ + struct add_rule_data *d = opaque; + BDRVBlkdebugState *s = d->s; + const char* event_name; + BlkDebugEvent event; + struct BlkdebugRule *rule; + + /* Find the right event for the rule */ + event_name = qemu_opt_get(opts, "event"); + if (!event_name) { + error_setg(errp, "Missing event name for rule"); + return -1; + } else if (get_event_by_name(event_name, &event) < 0) { + error_setg(errp, "Invalid event name \"%s\"", event_name); + return -1; + } + + /* Set attributes common for all actions */ + rule = g_malloc0(sizeof(*rule)); + *rule = (struct BlkdebugRule) { + .event = event, + .action = d->action, + .state = qemu_opt_get_number(opts, "state", 0), + }; + + /* Parse action-specific options */ + switch (d->action) { + case ACTION_INJECT_ERROR: + rule->options.inject.error = qemu_opt_get_number(opts, "errno", EIO); + rule->options.inject.once = qemu_opt_get_bool(opts, "once", 0); + rule->options.inject.immediately = + qemu_opt_get_bool(opts, "immediately", 0); + rule->options.inject.sector = qemu_opt_get_number(opts, "sector", -1); + break; + + case ACTION_SET_STATE: + rule->options.set_state.new_state = + qemu_opt_get_number(opts, "new_state", 0); + break; + + case ACTION_SUSPEND: + rule->options.suspend.tag = + g_strdup(qemu_opt_get(opts, "tag")); + break; + }; + + /* Add the rule */ + QLIST_INSERT_HEAD(&s->rules[event], rule, next); + + return 0; +} + +static void remove_rule(BlkdebugRule *rule) +{ + switch (rule->action) { + case ACTION_INJECT_ERROR: + case ACTION_SET_STATE: + break; + case ACTION_SUSPEND: + g_free(rule->options.suspend.tag); + break; + } + + QLIST_REMOVE(rule, next); + g_free(rule); +} + +static int read_config(BDRVBlkdebugState *s, const char *filename, + QDict *options, Error **errp) +{ + FILE *f = NULL; + int ret; + struct add_rule_data d; + Error *local_err = NULL; + + if (filename) { + f = fopen(filename, "r"); + if (f == NULL) { + error_setg_errno(errp, errno, "Could not read blkdebug config file"); + return -errno; + } + + ret = qemu_config_parse(f, config_groups, filename); + if (ret < 0) { + error_setg(errp, "Could not parse blkdebug config file"); + ret = -EINVAL; + goto fail; + } + } + + qemu_config_parse_qdict(options, config_groups, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + d.s = s; + d.action = ACTION_INJECT_ERROR; + qemu_opts_foreach(&inject_error_opts, add_rule, &d, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + d.action = ACTION_SET_STATE; + qemu_opts_foreach(&set_state_opts, add_rule, &d, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + ret = 0; +fail: + qemu_opts_reset(&inject_error_opts); + qemu_opts_reset(&set_state_opts); + if (f) { + fclose(f); + } + return ret; +} + +/* Valid blkdebug filenames look like blkdebug:path/to/config:path/to/image */ +static void blkdebug_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + const char *c; + + /* Parse the blkdebug: prefix */ + if (!strstart(filename, "blkdebug:", &filename)) { + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); + return; + } + + /* Parse config file path */ + c = strchr(filename, ':'); + if (c == NULL) { + error_setg(errp, "blkdebug requires both config file and image path"); + return; + } + + if (c != filename) { + QString *config_path; + config_path = qstring_from_substr(filename, 0, c - filename - 1); + qdict_put(options, "config", config_path); + } + + /* TODO Allow multi-level nesting and set file.filename here */ + filename = c + 1; + qdict_put(options, "x-image", qstring_from_str(filename)); +} + +static QemuOptsList runtime_opts = { + .name = "blkdebug", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "config", + .type = QEMU_OPT_STRING, + .help = "Path to the configuration file", + }, + { + .name = "x-image", + .type = QEMU_OPT_STRING, + .help = "[internal use only, will be removed]", + }, + { + .name = "align", + .type = QEMU_OPT_SIZE, + .help = "Required alignment in bytes", + }, + { /* end of list */ } + }, +}; + +static int blkdebug_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVBlkdebugState *s = bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + const char *config; + uint64_t align; + int ret; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + /* Read rules from config file or command line options */ + config = qemu_opt_get(opts, "config"); + ret = read_config(s, config, options, errp); + if (ret) { + goto out; + } + + /* Set initial state */ + s->state = 1; + + /* Open the image file */ + bs->file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, "image", + bs, &child_file, false, &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto out; + } + + /* Set request alignment */ + align = qemu_opt_get_size(opts, "align", bs->request_alignment); + if (align > 0 && align < INT_MAX && !(align & (align - 1))) { + bs->request_alignment = align; + } else { + error_setg(errp, "Invalid alignment"); + ret = -EINVAL; + goto fail_unref; + } + + ret = 0; + goto out; + +fail_unref: + bdrv_unref_child(bs, bs->file); +out: + qemu_opts_del(opts); + return ret; +} + +static void error_callback_bh(void *opaque) +{ + struct BlkdebugAIOCB *acb = opaque; + qemu_bh_delete(acb->bh); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_unref(acb); +} + +static BlockAIOCB *inject_error(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque, BlkdebugRule *rule) +{ + BDRVBlkdebugState *s = bs->opaque; + int error = rule->options.inject.error; + struct BlkdebugAIOCB *acb; + QEMUBH *bh; + bool immediately = rule->options.inject.immediately; + + if (rule->options.inject.once) { + QSIMPLEQ_REMOVE(&s->active_rules, rule, BlkdebugRule, active_next); + remove_rule(rule); + } + + if (immediately) { + return NULL; + } + + acb = qemu_aio_get(&blkdebug_aiocb_info, bs, cb, opaque); + acb->ret = -error; + + bh = aio_bh_new(bdrv_get_aio_context(bs), error_callback_bh, acb); + acb->bh = bh; + qemu_bh_schedule(bh); + + return &acb->common; +} + +static BlockAIOCB *blkdebug_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; + + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1 || + (rule->options.inject.sector >= sector_num && + rule->options.inject.sector < sector_num + nb_sectors)) { + break; + } + } + + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); + } + + return bdrv_aio_readv(bs->file->bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + +static BlockAIOCB *blkdebug_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; + + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1 || + (rule->options.inject.sector >= sector_num && + rule->options.inject.sector < sector_num + nb_sectors)) { + break; + } + } + + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); + } + + return bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors, + cb, opaque); +} + +static BlockAIOCB *blkdebug_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule = NULL; + + QSIMPLEQ_FOREACH(rule, &s->active_rules, active_next) { + if (rule->options.inject.sector == -1) { + break; + } + } + + if (rule && rule->options.inject.error) { + return inject_error(bs, cb, opaque, rule); + } + + return bdrv_aio_flush(bs->file->bs, cb, opaque); +} + + +static void blkdebug_close(BlockDriverState *bs) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugRule *rule, *next; + int i; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { + remove_rule(rule); + } + } +} + +static void suspend_request(BlockDriverState *bs, BlkdebugRule *rule) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq r; + + r = (BlkdebugSuspendedReq) { + .co = qemu_coroutine_self(), + .tag = g_strdup(rule->options.suspend.tag), + }; + + remove_rule(rule); + QLIST_INSERT_HEAD(&s->suspended_reqs, &r, next); + + if (!qtest_enabled()) { + printf("blkdebug: Suspended request '%s'\n", r.tag); + } + qemu_coroutine_yield(); + if (!qtest_enabled()) { + printf("blkdebug: Resuming request '%s'\n", r.tag); + } + + QLIST_REMOVE(&r, next); + g_free(r.tag); +} + +static bool process_rule(BlockDriverState *bs, struct BlkdebugRule *rule, + bool injected) +{ + BDRVBlkdebugState *s = bs->opaque; + + /* Only process rules for the current state */ + if (rule->state && rule->state != s->state) { + return injected; + } + + /* Take the action */ + switch (rule->action) { + case ACTION_INJECT_ERROR: + if (!injected) { + QSIMPLEQ_INIT(&s->active_rules); + injected = true; + } + QSIMPLEQ_INSERT_HEAD(&s->active_rules, rule, active_next); + break; + + case ACTION_SET_STATE: + s->new_state = rule->options.set_state.new_state; + break; + + case ACTION_SUSPEND: + suspend_request(bs, rule); + break; + } + return injected; +} + +static void blkdebug_debug_event(BlockDriverState *bs, BlkDebugEvent event) +{ + BDRVBlkdebugState *s = bs->opaque; + struct BlkdebugRule *rule, *next; + bool injected; + + assert((int)event >= 0 && event < BLKDBG_EVENT_MAX); + + injected = false; + s->new_state = s->state; + QLIST_FOREACH_SAFE(rule, &s->rules[event], next, next) { + injected = process_rule(bs, rule, injected); + } + s->state = s->new_state; +} + +static int blkdebug_debug_breakpoint(BlockDriverState *bs, const char *event, + const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + struct BlkdebugRule *rule; + BlkDebugEvent blkdebug_event; + + if (get_event_by_name(event, &blkdebug_event) < 0) { + return -ENOENT; + } + + + rule = g_malloc(sizeof(*rule)); + *rule = (struct BlkdebugRule) { + .event = blkdebug_event, + .action = ACTION_SUSPEND, + .state = 0, + .options.suspend.tag = g_strdup(tag), + }; + + QLIST_INSERT_HEAD(&s->rules[blkdebug_event], rule, next); + + return 0; +} + +static int blkdebug_debug_resume(BlockDriverState *bs, const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r, *next; + + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, next) { + if (!strcmp(r->tag, tag)) { + qemu_coroutine_enter(r->co, NULL); + return 0; + } + } + return -ENOENT; +} + +static int blkdebug_debug_remove_breakpoint(BlockDriverState *bs, + const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r, *r_next; + BlkdebugRule *rule, *next; + int i, ret = -ENOENT; + + for (i = 0; i < BLKDBG_EVENT_MAX; i++) { + QLIST_FOREACH_SAFE(rule, &s->rules[i], next, next) { + if (rule->action == ACTION_SUSPEND && + !strcmp(rule->options.suspend.tag, tag)) { + remove_rule(rule); + ret = 0; + } + } + } + QLIST_FOREACH_SAFE(r, &s->suspended_reqs, next, r_next) { + if (!strcmp(r->tag, tag)) { + qemu_coroutine_enter(r->co, NULL); + ret = 0; + } + } + return ret; +} + +static bool blkdebug_debug_is_suspended(BlockDriverState *bs, const char *tag) +{ + BDRVBlkdebugState *s = bs->opaque; + BlkdebugSuspendedReq *r; + + QLIST_FOREACH(r, &s->suspended_reqs, next) { + if (!strcmp(r->tag, tag)) { + return true; + } + } + return false; +} + +static int64_t blkdebug_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file->bs); +} + +static int blkdebug_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file->bs, offset); +} + +static void blkdebug_refresh_filename(BlockDriverState *bs) +{ + QDict *opts; + const QDictEntry *e; + bool force_json = false; + + for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) { + if (strcmp(qdict_entry_key(e), "config") && + strcmp(qdict_entry_key(e), "x-image") && + strcmp(qdict_entry_key(e), "image") && + strncmp(qdict_entry_key(e), "image.", strlen("image."))) + { + force_json = true; + break; + } + } + + if (force_json && !bs->file->bs->full_open_options) { + /* The config file cannot be recreated, so creating a plain filename + * is impossible */ + return; + } + + if (!force_json && bs->file->bs->exact_filename[0]) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "blkdebug:%s:%s", + qdict_get_try_str(bs->options, "config") ?: "", + bs->file->bs->exact_filename); + } + + opts = qdict_new(); + qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkdebug"))); + + QINCREF(bs->file->bs->full_open_options); + qdict_put_obj(opts, "image", QOBJECT(bs->file->bs->full_open_options)); + + for (e = qdict_first(bs->options); e; e = qdict_next(bs->options, e)) { + if (strcmp(qdict_entry_key(e), "x-image") && + strcmp(qdict_entry_key(e), "image") && + strncmp(qdict_entry_key(e), "image.", strlen("image."))) + { + qobject_incref(qdict_entry_value(e)); + qdict_put_obj(opts, qdict_entry_key(e), qdict_entry_value(e)); + } + } + + bs->full_open_options = opts; +} + +static BlockDriver bdrv_blkdebug = { + .format_name = "blkdebug", + .protocol_name = "blkdebug", + .instance_size = sizeof(BDRVBlkdebugState), + + .bdrv_parse_filename = blkdebug_parse_filename, + .bdrv_file_open = blkdebug_open, + .bdrv_close = blkdebug_close, + .bdrv_getlength = blkdebug_getlength, + .bdrv_truncate = blkdebug_truncate, + .bdrv_refresh_filename = blkdebug_refresh_filename, + + .bdrv_aio_readv = blkdebug_aio_readv, + .bdrv_aio_writev = blkdebug_aio_writev, + .bdrv_aio_flush = blkdebug_aio_flush, + + .bdrv_debug_event = blkdebug_debug_event, + .bdrv_debug_breakpoint = blkdebug_debug_breakpoint, + .bdrv_debug_remove_breakpoint + = blkdebug_debug_remove_breakpoint, + .bdrv_debug_resume = blkdebug_debug_resume, + .bdrv_debug_is_suspended = blkdebug_debug_is_suspended, +}; + +static void bdrv_blkdebug_init(void) +{ + bdrv_register(&bdrv_blkdebug); +} + +block_init(bdrv_blkdebug_init); diff --git a/src/block/blkverify.c b/src/block/blkverify.c new file mode 100644 index 0000000..c5f8e8d --- /dev/null +++ b/src/block/blkverify.c @@ -0,0 +1,369 @@ +/* + * Block protocol for block driver correctness testing + * + * Copyright (C) 2010 IBM, Corp. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include <stdarg.h> +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "block/block_int.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qstring.h" + +typedef struct { + BdrvChild *test_file; +} BDRVBlkverifyState; + +typedef struct BlkverifyAIOCB BlkverifyAIOCB; +struct BlkverifyAIOCB { + BlockAIOCB common; + QEMUBH *bh; + + /* Request metadata */ + bool is_write; + int64_t sector_num; + int nb_sectors; + + int ret; /* first completed request's result */ + unsigned int done; /* completion counter */ + + QEMUIOVector *qiov; /* user I/O vector */ + QEMUIOVector raw_qiov; /* cloned I/O vector for raw file */ + void *buf; /* buffer for raw file I/O */ + + void (*verify)(BlkverifyAIOCB *acb); +}; + +static const AIOCBInfo blkverify_aiocb_info = { + .aiocb_size = sizeof(BlkverifyAIOCB), +}; + +static void GCC_FMT_ATTR(2, 3) blkverify_err(BlkverifyAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "blkverify: %s sector_num=%" PRId64 " nb_sectors=%d ", + acb->is_write ? "write" : "read", acb->sector_num, + acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +/* Valid blkverify filenames look like blkverify:path/to/raw_image:path/to/image */ +static void blkverify_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + const char *c; + QString *raw_path; + + + /* Parse the blkverify: prefix */ + if (!strstart(filename, "blkverify:", &filename)) { + /* There was no prefix; therefore, all options have to be already + present in the QDict (except for the filename) */ + qdict_put(options, "x-image", qstring_from_str(filename)); + return; + } + + /* Parse the raw image filename */ + c = strchr(filename, ':'); + if (c == NULL) { + error_setg(errp, "blkverify requires raw copy and original image path"); + return; + } + + /* TODO Implement option pass-through and set raw.filename here */ + raw_path = qstring_from_substr(filename, 0, c - filename - 1); + qdict_put(options, "x-raw", raw_path); + + /* TODO Allow multi-level nesting and set file.filename here */ + filename = c + 1; + qdict_put(options, "x-image", qstring_from_str(filename)); +} + +static QemuOptsList runtime_opts = { + .name = "blkverify", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "x-raw", + .type = QEMU_OPT_STRING, + .help = "[internal use only, will be removed]", + }, + { + .name = "x-image", + .type = QEMU_OPT_STRING, + .help = "[internal use only, will be removed]", + }, + { /* end of list */ } + }, +}; + +static int blkverify_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVBlkverifyState *s = bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + int ret; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + /* Open the raw file */ + bs->file = bdrv_open_child(qemu_opt_get(opts, "x-raw"), options, "raw", + bs, &child_file, false, &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail; + } + + /* Open the test file */ + s->test_file = bdrv_open_child(qemu_opt_get(opts, "x-image"), options, + "test", bs, &child_format, false, + &local_err); + if (local_err) { + ret = -EINVAL; + error_propagate(errp, local_err); + goto fail; + } + + ret = 0; +fail: + if (ret < 0) { + bdrv_unref_child(bs, bs->file); + } + qemu_opts_del(opts); + return ret; +} + +static void blkverify_close(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_unref_child(bs, s->test_file); + s->test_file = NULL; +} + +static int64_t blkverify_getlength(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + return bdrv_getlength(s->test_file->bs); +} + +static BlkverifyAIOCB *blkverify_aio_get(BlockDriverState *bs, bool is_write, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + BlkverifyAIOCB *acb = qemu_aio_get(&blkverify_aiocb_info, bs, cb, opaque); + + acb->bh = NULL; + acb->is_write = is_write; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->ret = -EINPROGRESS; + acb->done = 0; + acb->qiov = qiov; + acb->buf = NULL; + acb->verify = NULL; + return acb; +} + +static void blkverify_aio_bh(void *opaque) +{ + BlkverifyAIOCB *acb = opaque; + + qemu_bh_delete(acb->bh); + if (acb->buf) { + qemu_iovec_destroy(&acb->raw_qiov); + qemu_vfree(acb->buf); + } + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_unref(acb); +} + +static void blkverify_aio_cb(void *opaque, int ret) +{ + BlkverifyAIOCB *acb = opaque; + + switch (++acb->done) { + case 1: + acb->ret = ret; + break; + + case 2: + if (acb->ret != ret) { + blkverify_err(acb, "return value mismatch %d != %d", acb->ret, ret); + } + + if (acb->verify) { + acb->verify(acb); + } + + acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), + blkverify_aio_bh, acb); + qemu_bh_schedule(acb->bh); + break; + } +} + +static void blkverify_verify_readv(BlkverifyAIOCB *acb) +{ + ssize_t offset = qemu_iovec_compare(acb->qiov, &acb->raw_qiov); + if (offset != -1) { + blkverify_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + (int64_t)(offset / BDRV_SECTOR_SIZE)); + } +} + +static BlockAIOCB *blkverify_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + BlkverifyAIOCB *acb = blkverify_aio_get(bs, false, sector_num, qiov, + nb_sectors, cb, opaque); + + acb->verify = blkverify_verify_readv; + acb->buf = qemu_blockalign(bs->file->bs, qiov->size); + qemu_iovec_init(&acb->raw_qiov, acb->qiov->niov); + qemu_iovec_clone(&acb->raw_qiov, qiov, acb->buf); + + bdrv_aio_readv(s->test_file->bs, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb); + bdrv_aio_readv(bs->file->bs, sector_num, &acb->raw_qiov, nb_sectors, + blkverify_aio_cb, acb); + return &acb->common; +} + +static BlockAIOCB *blkverify_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + BlkverifyAIOCB *acb = blkverify_aio_get(bs, true, sector_num, qiov, + nb_sectors, cb, opaque); + + bdrv_aio_writev(s->test_file->bs, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb); + bdrv_aio_writev(bs->file->bs, sector_num, qiov, nb_sectors, + blkverify_aio_cb, acb); + return &acb->common; +} + +static BlockAIOCB *blkverify_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, + void *opaque) +{ + BDRVBlkverifyState *s = bs->opaque; + + /* Only flush test file, the raw file is not important */ + return bdrv_aio_flush(s->test_file->bs, cb, opaque); +} + +static bool blkverify_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVBlkverifyState *s = bs->opaque; + + bool perm = bdrv_recurse_is_first_non_filter(bs->file->bs, candidate); + + if (perm) { + return true; + } + + return bdrv_recurse_is_first_non_filter(s->test_file->bs, candidate); +} + +/* Propagate AioContext changes to ->test_file */ +static void blkverify_detach_aio_context(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_detach_aio_context(s->test_file->bs); +} + +static void blkverify_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVBlkverifyState *s = bs->opaque; + + bdrv_attach_aio_context(s->test_file->bs, new_context); +} + +static void blkverify_refresh_filename(BlockDriverState *bs) +{ + BDRVBlkverifyState *s = bs->opaque; + + /* bs->file->bs has already been refreshed */ + bdrv_refresh_filename(s->test_file->bs); + + if (bs->file->bs->full_open_options + && s->test_file->bs->full_open_options) + { + QDict *opts = qdict_new(); + qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("blkverify"))); + + QINCREF(bs->file->bs->full_open_options); + qdict_put_obj(opts, "raw", QOBJECT(bs->file->bs->full_open_options)); + QINCREF(s->test_file->bs->full_open_options); + qdict_put_obj(opts, "test", + QOBJECT(s->test_file->bs->full_open_options)); + + bs->full_open_options = opts; + } + + if (bs->file->bs->exact_filename[0] + && s->test_file->bs->exact_filename[0]) + { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "blkverify:%s:%s", + bs->file->bs->exact_filename, + s->test_file->bs->exact_filename); + } +} + +static BlockDriver bdrv_blkverify = { + .format_name = "blkverify", + .protocol_name = "blkverify", + .instance_size = sizeof(BDRVBlkverifyState), + + .bdrv_parse_filename = blkverify_parse_filename, + .bdrv_file_open = blkverify_open, + .bdrv_close = blkverify_close, + .bdrv_getlength = blkverify_getlength, + .bdrv_refresh_filename = blkverify_refresh_filename, + + .bdrv_aio_readv = blkverify_aio_readv, + .bdrv_aio_writev = blkverify_aio_writev, + .bdrv_aio_flush = blkverify_aio_flush, + + .bdrv_attach_aio_context = blkverify_attach_aio_context, + .bdrv_detach_aio_context = blkverify_detach_aio_context, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = blkverify_recurse_is_first_non_filter, +}; + +static void bdrv_blkverify_init(void) +{ + bdrv_register(&bdrv_blkverify); +} + +block_init(bdrv_blkverify_init); diff --git a/src/block/block-backend.c b/src/block/block-backend.c new file mode 100644 index 0000000..419591f --- /dev/null +++ b/src/block/block-backend.c @@ -0,0 +1,1284 @@ +/* + * QEMU Block backends + * + * Copyright (C) 2014 Red Hat, Inc. + * + * Authors: + * Markus Armbruster <armbru@redhat.com>, + * + * This work is licensed under the terms of the GNU LGPL, version 2.1 + * or later. See the COPYING.LIB file in the top-level directory. + */ + +#include "sysemu/block-backend.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "block/throttle-groups.h" +#include "sysemu/blockdev.h" +#include "sysemu/sysemu.h" +#include "qapi-event.h" + +/* Number of coroutines to reserve per attached device model */ +#define COROUTINE_POOL_RESERVATION 64 + +static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb); + +struct BlockBackend { + char *name; + int refcnt; + BlockDriverState *bs; + DriveInfo *legacy_dinfo; /* null unless created by drive_new() */ + QTAILQ_ENTRY(BlockBackend) link; /* for blk_backends */ + + void *dev; /* attached device model, if any */ + /* TODO change to DeviceState when all users are qdevified */ + const BlockDevOps *dev_ops; + void *dev_opaque; + + /* the block size for which the guest device expects atomicity */ + int guest_block_size; + + /* If the BDS tree is removed, some of its options are stored here (which + * can be used to restore those options in the new BDS on insert) */ + BlockBackendRootState root_state; + + /* I/O stats (display with "info blockstats"). */ + BlockAcctStats stats; + + BlockdevOnError on_read_error, on_write_error; + bool iostatus_enabled; + BlockDeviceIoStatus iostatus; +}; + +typedef struct BlockBackendAIOCB { + BlockAIOCB common; + QEMUBH *bh; + BlockBackend *blk; + int ret; +} BlockBackendAIOCB; + +static const AIOCBInfo block_backend_aiocb_info = { + .get_aio_context = blk_aiocb_get_aio_context, + .aiocb_size = sizeof(BlockBackendAIOCB), +}; + +static void drive_info_del(DriveInfo *dinfo); + +/* All the BlockBackends (except for hidden ones) */ +static QTAILQ_HEAD(, BlockBackend) blk_backends = + QTAILQ_HEAD_INITIALIZER(blk_backends); + +/* + * Create a new BlockBackend with @name, with a reference count of one. + * @name must not be null or empty. + * Fail if a BlockBackend with this name already exists. + * Store an error through @errp on failure, unless it's null. + * Return the new BlockBackend on success, null on failure. + */ +BlockBackend *blk_new(const char *name, Error **errp) +{ + BlockBackend *blk; + + assert(name && name[0]); + if (!id_wellformed(name)) { + error_setg(errp, "Invalid device name"); + return NULL; + } + if (blk_by_name(name)) { + error_setg(errp, "Device with id '%s' already exists", name); + return NULL; + } + if (bdrv_find_node(name)) { + error_setg(errp, + "Device name '%s' conflicts with an existing node name", + name); + return NULL; + } + + blk = g_new0(BlockBackend, 1); + blk->name = g_strdup(name); + blk->refcnt = 1; + QTAILQ_INSERT_TAIL(&blk_backends, blk, link); + return blk; +} + +/* + * Create a new BlockBackend with a new BlockDriverState attached. + * Otherwise just like blk_new(), which see. + */ +BlockBackend *blk_new_with_bs(const char *name, Error **errp) +{ + BlockBackend *blk; + BlockDriverState *bs; + + blk = blk_new(name, errp); + if (!blk) { + return NULL; + } + + bs = bdrv_new_root(); + blk->bs = bs; + bs->blk = blk; + return blk; +} + +/* + * Calls blk_new_with_bs() and then calls bdrv_open() on the BlockDriverState. + * + * Just as with bdrv_open(), after having called this function the reference to + * @options belongs to the block layer (even on failure). + * + * TODO: Remove @filename and @flags; it should be possible to specify a whole + * BDS tree just by specifying the @options QDict (or @reference, + * alternatively). At the time of adding this function, this is not possible, + * though, so callers of this function have to be able to specify @filename and + * @flags. + */ +BlockBackend *blk_new_open(const char *name, const char *filename, + const char *reference, QDict *options, int flags, + Error **errp) +{ + BlockBackend *blk; + int ret; + + blk = blk_new_with_bs(name, errp); + if (!blk) { + QDECREF(options); + return NULL; + } + + ret = bdrv_open(&blk->bs, filename, reference, options, flags, errp); + if (ret < 0) { + blk_unref(blk); + return NULL; + } + + return blk; +} + +static void blk_delete(BlockBackend *blk) +{ + assert(!blk->refcnt); + assert(!blk->dev); + if (blk->bs) { + assert(blk->bs->blk == blk); + blk->bs->blk = NULL; + bdrv_unref(blk->bs); + blk->bs = NULL; + } + if (blk->root_state.throttle_state) { + g_free(blk->root_state.throttle_group); + throttle_group_unref(blk->root_state.throttle_state); + } + /* Avoid double-remove after blk_hide_on_behalf_of_hmp_drive_del() */ + if (blk->name[0]) { + QTAILQ_REMOVE(&blk_backends, blk, link); + } + g_free(blk->name); + drive_info_del(blk->legacy_dinfo); + block_acct_cleanup(&blk->stats); + g_free(blk); +} + +static void drive_info_del(DriveInfo *dinfo) +{ + if (!dinfo) { + return; + } + qemu_opts_del(dinfo->opts); + g_free(dinfo->serial); + g_free(dinfo); +} + +int blk_get_refcnt(BlockBackend *blk) +{ + return blk ? blk->refcnt : 0; +} + +/* + * Increment @blk's reference count. + * @blk must not be null. + */ +void blk_ref(BlockBackend *blk) +{ + blk->refcnt++; +} + +/* + * Decrement @blk's reference count. + * If this drops it to zero, destroy @blk. + * For convenience, do nothing if @blk is null. + */ +void blk_unref(BlockBackend *blk) +{ + if (blk) { + assert(blk->refcnt > 0); + if (!--blk->refcnt) { + blk_delete(blk); + } + } +} + +/* + * Return the BlockBackend after @blk. + * If @blk is null, return the first one. + * Else, return @blk's next sibling, which may be null. + * + * To iterate over all BlockBackends, do + * for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { + * ... + * } + */ +BlockBackend *blk_next(BlockBackend *blk) +{ + return blk ? QTAILQ_NEXT(blk, link) : QTAILQ_FIRST(&blk_backends); +} + +/* + * Return @blk's name, a non-null string. + * Wart: the name is empty iff @blk has been hidden with + * blk_hide_on_behalf_of_hmp_drive_del(). + */ +const char *blk_name(BlockBackend *blk) +{ + return blk->name; +} + +/* + * Return the BlockBackend with name @name if it exists, else null. + * @name must not be null. + */ +BlockBackend *blk_by_name(const char *name) +{ + BlockBackend *blk; + + assert(name); + QTAILQ_FOREACH(blk, &blk_backends, link) { + if (!strcmp(name, blk->name)) { + return blk; + } + } + return NULL; +} + +/* + * Return the BlockDriverState attached to @blk if any, else null. + */ +BlockDriverState *blk_bs(BlockBackend *blk) +{ + return blk->bs; +} + +/* + * Changes the BlockDriverState attached to @blk + */ +void blk_set_bs(BlockBackend *blk, BlockDriverState *bs) +{ + bdrv_ref(bs); + + if (blk->bs) { + blk->bs->blk = NULL; + bdrv_unref(blk->bs); + } + assert(bs->blk == NULL); + + blk->bs = bs; + bs->blk = blk; +} + +/* + * Return @blk's DriveInfo if any, else null. + */ +DriveInfo *blk_legacy_dinfo(BlockBackend *blk) +{ + return blk->legacy_dinfo; +} + +/* + * Set @blk's DriveInfo to @dinfo, and return it. + * @blk must not have a DriveInfo set already. + * No other BlockBackend may have the same DriveInfo set. + */ +DriveInfo *blk_set_legacy_dinfo(BlockBackend *blk, DriveInfo *dinfo) +{ + assert(!blk->legacy_dinfo); + return blk->legacy_dinfo = dinfo; +} + +/* + * Return the BlockBackend with DriveInfo @dinfo. + * It must exist. + */ +BlockBackend *blk_by_legacy_dinfo(DriveInfo *dinfo) +{ + BlockBackend *blk; + + QTAILQ_FOREACH(blk, &blk_backends, link) { + if (blk->legacy_dinfo == dinfo) { + return blk; + } + } + abort(); +} + +/* + * Hide @blk. + * @blk must not have been hidden already. + * Make attached BlockDriverState, if any, anonymous. + * Once hidden, @blk is invisible to all functions that don't receive + * it as argument. For example, blk_by_name() won't return it. + * Strictly for use by do_drive_del(). + * TODO get rid of it! + */ +void blk_hide_on_behalf_of_hmp_drive_del(BlockBackend *blk) +{ + QTAILQ_REMOVE(&blk_backends, blk, link); + blk->name[0] = 0; + if (blk->bs) { + bdrv_make_anon(blk->bs); + } +} + +/* + * Disassociates the currently associated BlockDriverState from @blk. + */ +void blk_remove_bs(BlockBackend *blk) +{ + blk_update_root_state(blk); + + blk->bs->blk = NULL; + bdrv_unref(blk->bs); + blk->bs = NULL; +} + +/* + * Associates a new BlockDriverState with @blk. + */ +void blk_insert_bs(BlockBackend *blk, BlockDriverState *bs) +{ + assert(!blk->bs && !bs->blk); + bdrv_ref(bs); + blk->bs = bs; + bs->blk = blk; +} + +/* + * Attach device model @dev to @blk. + * Return 0 on success, -EBUSY when a device model is attached already. + */ +int blk_attach_dev(BlockBackend *blk, void *dev) +/* TODO change to DeviceState *dev when all users are qdevified */ +{ + if (blk->dev) { + return -EBUSY; + } + blk_ref(blk); + blk->dev = dev; + blk_iostatus_reset(blk); + return 0; +} + +/* + * Attach device model @dev to @blk. + * @blk must not have a device model attached already. + * TODO qdevified devices don't use this, remove when devices are qdevified + */ +void blk_attach_dev_nofail(BlockBackend *blk, void *dev) +{ + if (blk_attach_dev(blk, dev) < 0) { + abort(); + } +} + +/* + * Detach device model @dev from @blk. + * @dev must be currently attached to @blk. + */ +void blk_detach_dev(BlockBackend *blk, void *dev) +/* TODO change to DeviceState *dev when all users are qdevified */ +{ + assert(blk->dev == dev); + blk->dev = NULL; + blk->dev_ops = NULL; + blk->dev_opaque = NULL; + blk->guest_block_size = 512; + blk_unref(blk); +} + +/* + * Return the device model attached to @blk if any, else null. + */ +void *blk_get_attached_dev(BlockBackend *blk) +/* TODO change to return DeviceState * when all users are qdevified */ +{ + return blk->dev; +} + +/* + * Set @blk's device model callbacks to @ops. + * @opaque is the opaque argument to pass to the callbacks. + * This is for use by device models. + */ +void blk_set_dev_ops(BlockBackend *blk, const BlockDevOps *ops, + void *opaque) +{ + blk->dev_ops = ops; + blk->dev_opaque = opaque; +} + +/* + * Notify @blk's attached device model of media change. + * If @load is true, notify of media load. + * Else, notify of media eject. + * Also send DEVICE_TRAY_MOVED events as appropriate. + */ +void blk_dev_change_media_cb(BlockBackend *blk, bool load) +{ + if (blk->dev_ops && blk->dev_ops->change_media_cb) { + bool tray_was_open, tray_is_open; + + tray_was_open = blk_dev_is_tray_open(blk); + blk->dev_ops->change_media_cb(blk->dev_opaque, load); + tray_is_open = blk_dev_is_tray_open(blk); + + if (tray_was_open != tray_is_open) { + qapi_event_send_device_tray_moved(blk_name(blk), tray_is_open, + &error_abort); + } + } +} + +/* + * Does @blk's attached device model have removable media? + * %true if no device model is attached. + */ +bool blk_dev_has_removable_media(BlockBackend *blk) +{ + return !blk->dev || (blk->dev_ops && blk->dev_ops->change_media_cb); +} + +/* + * Does @blk's attached device model have a tray? + */ +bool blk_dev_has_tray(BlockBackend *blk) +{ + return blk->dev_ops && blk->dev_ops->is_tray_open; +} + +/* + * Notify @blk's attached device model of a media eject request. + * If @force is true, the medium is about to be yanked out forcefully. + */ +void blk_dev_eject_request(BlockBackend *blk, bool force) +{ + if (blk->dev_ops && blk->dev_ops->eject_request_cb) { + blk->dev_ops->eject_request_cb(blk->dev_opaque, force); + } +} + +/* + * Does @blk's attached device model have a tray, and is it open? + */ +bool blk_dev_is_tray_open(BlockBackend *blk) +{ + if (blk_dev_has_tray(blk)) { + return blk->dev_ops->is_tray_open(blk->dev_opaque); + } + return false; +} + +/* + * Does @blk's attached device model have the medium locked? + * %false if the device model has no such lock. + */ +bool blk_dev_is_medium_locked(BlockBackend *blk) +{ + if (blk->dev_ops && blk->dev_ops->is_medium_locked) { + return blk->dev_ops->is_medium_locked(blk->dev_opaque); + } + return false; +} + +/* + * Notify @blk's attached device model of a backend size change. + */ +void blk_dev_resize_cb(BlockBackend *blk) +{ + if (blk->dev_ops && blk->dev_ops->resize_cb) { + blk->dev_ops->resize_cb(blk->dev_opaque); + } +} + +void blk_iostatus_enable(BlockBackend *blk) +{ + blk->iostatus_enabled = true; + blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; +} + +/* The I/O status is only enabled if the drive explicitly + * enables it _and_ the VM is configured to stop on errors */ +bool blk_iostatus_is_enabled(const BlockBackend *blk) +{ + return (blk->iostatus_enabled && + (blk->on_write_error == BLOCKDEV_ON_ERROR_ENOSPC || + blk->on_write_error == BLOCKDEV_ON_ERROR_STOP || + blk->on_read_error == BLOCKDEV_ON_ERROR_STOP)); +} + +BlockDeviceIoStatus blk_iostatus(const BlockBackend *blk) +{ + return blk->iostatus; +} + +void blk_iostatus_disable(BlockBackend *blk) +{ + blk->iostatus_enabled = false; +} + +void blk_iostatus_reset(BlockBackend *blk) +{ + if (blk_iostatus_is_enabled(blk)) { + blk->iostatus = BLOCK_DEVICE_IO_STATUS_OK; + if (blk->bs && blk->bs->job) { + block_job_iostatus_reset(blk->bs->job); + } + } +} + +void blk_iostatus_set_err(BlockBackend *blk, int error) +{ + assert(blk_iostatus_is_enabled(blk)); + if (blk->iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + blk->iostatus = error == ENOSPC ? BLOCK_DEVICE_IO_STATUS_NOSPACE : + BLOCK_DEVICE_IO_STATUS_FAILED; + } +} + +static int blk_check_byte_request(BlockBackend *blk, int64_t offset, + size_t size) +{ + int64_t len; + + if (size > INT_MAX) { + return -EIO; + } + + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + len = blk_getlength(blk); + if (len < 0) { + return len; + } + + if (offset < 0) { + return -EIO; + } + + if (offset > len || len - offset < size) { + return -EIO; + } + + return 0; +} + +static int blk_check_request(BlockBackend *blk, int64_t sector_num, + int nb_sectors) +{ + if (sector_num < 0 || sector_num > INT64_MAX / BDRV_SECTOR_SIZE) { + return -EIO; + } + + if (nb_sectors < 0 || nb_sectors > INT_MAX / BDRV_SECTOR_SIZE) { + return -EIO; + } + + return blk_check_byte_request(blk, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +int blk_read(BlockBackend *blk, int64_t sector_num, uint8_t *buf, + int nb_sectors) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_read(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_read_unthrottled(BlockBackend *blk, int64_t sector_num, uint8_t *buf, + int nb_sectors) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_read_unthrottled(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_write(BlockBackend *blk, int64_t sector_num, const uint8_t *buf, + int nb_sectors) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + +static void error_callback_bh(void *opaque) +{ + struct BlockBackendAIOCB *acb = opaque; + qemu_bh_delete(acb->bh); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_unref(acb); +} + +BlockAIOCB *blk_abort_aio_request(BlockBackend *blk, + BlockCompletionFunc *cb, + void *opaque, int ret) +{ + struct BlockBackendAIOCB *acb; + QEMUBH *bh; + + acb = blk_aio_get(&block_backend_aiocb_info, blk, cb, opaque); + acb->blk = blk; + acb->ret = ret; + + bh = aio_bh_new(blk_get_aio_context(blk), error_callback_bh, acb); + acb->bh = bh; + qemu_bh_schedule(bh); + + return &acb->common; +} + +BlockAIOCB *blk_aio_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return blk_abort_aio_request(blk, cb, opaque, ret); + } + + return bdrv_aio_write_zeroes(blk->bs, sector_num, nb_sectors, flags, + cb, opaque); +} + +int blk_pread(BlockBackend *blk, int64_t offset, void *buf, int count) +{ + int ret = blk_check_byte_request(blk, offset, count); + if (ret < 0) { + return ret; + } + + return bdrv_pread(blk->bs, offset, buf, count); +} + +int blk_pwrite(BlockBackend *blk, int64_t offset, const void *buf, int count) +{ + int ret = blk_check_byte_request(blk, offset, count); + if (ret < 0) { + return ret; + } + + return bdrv_pwrite(blk->bs, offset, buf, count); +} + +int64_t blk_getlength(BlockBackend *blk) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_getlength(blk->bs); +} + +void blk_get_geometry(BlockBackend *blk, uint64_t *nb_sectors_ptr) +{ + if (!blk->bs) { + *nb_sectors_ptr = 0; + } else { + bdrv_get_geometry(blk->bs, nb_sectors_ptr); + } +} + +int64_t blk_nb_sectors(BlockBackend *blk) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_nb_sectors(blk->bs); +} + +BlockAIOCB *blk_aio_readv(BlockBackend *blk, int64_t sector_num, + QEMUIOVector *iov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return blk_abort_aio_request(blk, cb, opaque, ret); + } + + return bdrv_aio_readv(blk->bs, sector_num, iov, nb_sectors, cb, opaque); +} + +BlockAIOCB *blk_aio_writev(BlockBackend *blk, int64_t sector_num, + QEMUIOVector *iov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return blk_abort_aio_request(blk, cb, opaque, ret); + } + + return bdrv_aio_writev(blk->bs, sector_num, iov, nb_sectors, cb, opaque); +} + +BlockAIOCB *blk_aio_flush(BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque) +{ + if (!blk_is_available(blk)) { + return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM); + } + + return bdrv_aio_flush(blk->bs, cb, opaque); +} + +BlockAIOCB *blk_aio_discard(BlockBackend *blk, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return blk_abort_aio_request(blk, cb, opaque, ret); + } + + return bdrv_aio_discard(blk->bs, sector_num, nb_sectors, cb, opaque); +} + +void blk_aio_cancel(BlockAIOCB *acb) +{ + bdrv_aio_cancel(acb); +} + +void blk_aio_cancel_async(BlockAIOCB *acb) +{ + bdrv_aio_cancel_async(acb); +} + +int blk_aio_multiwrite(BlockBackend *blk, BlockRequest *reqs, int num_reqs) +{ + int i, ret; + + for (i = 0; i < num_reqs; i++) { + ret = blk_check_request(blk, reqs[i].sector, reqs[i].nb_sectors); + if (ret < 0) { + return ret; + } + } + + return bdrv_aio_multiwrite(blk->bs, reqs, num_reqs); +} + +int blk_ioctl(BlockBackend *blk, unsigned long int req, void *buf) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_ioctl(blk->bs, req, buf); +} + +BlockAIOCB *blk_aio_ioctl(BlockBackend *blk, unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + if (!blk_is_available(blk)) { + return blk_abort_aio_request(blk, cb, opaque, -ENOMEDIUM); + } + + return bdrv_aio_ioctl(blk->bs, req, buf, cb, opaque); +} + +int blk_co_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_co_discard(blk->bs, sector_num, nb_sectors); +} + +int blk_co_flush(BlockBackend *blk) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_co_flush(blk->bs); +} + +int blk_flush(BlockBackend *blk) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_flush(blk->bs); +} + +int blk_flush_all(void) +{ + return bdrv_flush_all(); +} + +void blk_drain(BlockBackend *blk) +{ + if (blk->bs) { + bdrv_drain(blk->bs); + } +} + +void blk_drain_all(void) +{ + bdrv_drain_all(); +} + +void blk_set_on_error(BlockBackend *blk, BlockdevOnError on_read_error, + BlockdevOnError on_write_error) +{ + blk->on_read_error = on_read_error; + blk->on_write_error = on_write_error; +} + +BlockdevOnError blk_get_on_error(BlockBackend *blk, bool is_read) +{ + return is_read ? blk->on_read_error : blk->on_write_error; +} + +BlockErrorAction blk_get_error_action(BlockBackend *blk, bool is_read, + int error) +{ + BlockdevOnError on_err = blk_get_on_error(blk, is_read); + + switch (on_err) { + case BLOCKDEV_ON_ERROR_ENOSPC: + return (error == ENOSPC) ? + BLOCK_ERROR_ACTION_STOP : BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_STOP: + return BLOCK_ERROR_ACTION_STOP; + case BLOCKDEV_ON_ERROR_REPORT: + return BLOCK_ERROR_ACTION_REPORT; + case BLOCKDEV_ON_ERROR_IGNORE: + return BLOCK_ERROR_ACTION_IGNORE; + default: + abort(); + } +} + +static void send_qmp_error_event(BlockBackend *blk, + BlockErrorAction action, + bool is_read, int error) +{ + IoOperationType optype; + + optype = is_read ? IO_OPERATION_TYPE_READ : IO_OPERATION_TYPE_WRITE; + qapi_event_send_block_io_error(blk_name(blk), optype, action, + blk_iostatus_is_enabled(blk), + error == ENOSPC, strerror(error), + &error_abort); +} + +/* This is done by device models because, while the block layer knows + * about the error, it does not know whether an operation comes from + * the device or the block layer (from a job, for example). + */ +void blk_error_action(BlockBackend *blk, BlockErrorAction action, + bool is_read, int error) +{ + assert(error >= 0); + + if (action == BLOCK_ERROR_ACTION_STOP) { + /* First set the iostatus, so that "info block" returns an iostatus + * that matches the events raised so far (an additional error iostatus + * is fine, but not a lost one). + */ + blk_iostatus_set_err(blk, error); + + /* Then raise the request to stop the VM and the event. + * qemu_system_vmstop_request_prepare has two effects. First, + * it ensures that the STOP event always comes after the + * BLOCK_IO_ERROR event. Second, it ensures that even if management + * can observe the STOP event and do a "cont" before the STOP + * event is issued, the VM will not stop. In this case, vm_start() + * also ensures that the STOP/RESUME pair of events is emitted. + */ + qemu_system_vmstop_request_prepare(); + send_qmp_error_event(blk, action, is_read, error); + qemu_system_vmstop_request(RUN_STATE_IO_ERROR); + } else { + send_qmp_error_event(blk, action, is_read, error); + } +} + +int blk_is_read_only(BlockBackend *blk) +{ + if (blk->bs) { + return bdrv_is_read_only(blk->bs); + } else { + return blk->root_state.read_only; + } +} + +int blk_is_sg(BlockBackend *blk) +{ + if (!blk->bs) { + return 0; + } + + return bdrv_is_sg(blk->bs); +} + +int blk_enable_write_cache(BlockBackend *blk) +{ + if (blk->bs) { + return bdrv_enable_write_cache(blk->bs); + } else { + return !!(blk->root_state.open_flags & BDRV_O_CACHE_WB); + } +} + +void blk_set_enable_write_cache(BlockBackend *blk, bool wce) +{ + if (blk->bs) { + bdrv_set_enable_write_cache(blk->bs, wce); + } else { + if (wce) { + blk->root_state.open_flags |= BDRV_O_CACHE_WB; + } else { + blk->root_state.open_flags &= ~BDRV_O_CACHE_WB; + } + } +} + +void blk_invalidate_cache(BlockBackend *blk, Error **errp) +{ + if (!blk->bs) { + error_setg(errp, "Device '%s' has no medium", blk->name); + return; + } + + bdrv_invalidate_cache(blk->bs, errp); +} + +bool blk_is_inserted(BlockBackend *blk) +{ + return blk->bs && bdrv_is_inserted(blk->bs); +} + +bool blk_is_available(BlockBackend *blk) +{ + return blk_is_inserted(blk) && !blk_dev_is_tray_open(blk); +} + +void blk_lock_medium(BlockBackend *blk, bool locked) +{ + if (blk->bs) { + bdrv_lock_medium(blk->bs, locked); + } +} + +void blk_eject(BlockBackend *blk, bool eject_flag) +{ + if (blk->bs) { + bdrv_eject(blk->bs, eject_flag); + } +} + +int blk_get_flags(BlockBackend *blk) +{ + if (blk->bs) { + return bdrv_get_flags(blk->bs); + } else { + return blk->root_state.open_flags; + } +} + +int blk_get_max_transfer_length(BlockBackend *blk) +{ + if (blk->bs) { + return blk->bs->bl.max_transfer_length; + } else { + return 0; + } +} + +void blk_set_guest_block_size(BlockBackend *blk, int align) +{ + blk->guest_block_size = align; +} + +void *blk_blockalign(BlockBackend *blk, size_t size) +{ + return qemu_blockalign(blk ? blk->bs : NULL, size); +} + +bool blk_op_is_blocked(BlockBackend *blk, BlockOpType op, Error **errp) +{ + if (!blk->bs) { + return false; + } + + return bdrv_op_is_blocked(blk->bs, op, errp); +} + +void blk_op_unblock(BlockBackend *blk, BlockOpType op, Error *reason) +{ + if (blk->bs) { + bdrv_op_unblock(blk->bs, op, reason); + } +} + +void blk_op_block_all(BlockBackend *blk, Error *reason) +{ + if (blk->bs) { + bdrv_op_block_all(blk->bs, reason); + } +} + +void blk_op_unblock_all(BlockBackend *blk, Error *reason) +{ + if (blk->bs) { + bdrv_op_unblock_all(blk->bs, reason); + } +} + +AioContext *blk_get_aio_context(BlockBackend *blk) +{ + if (blk->bs) { + return bdrv_get_aio_context(blk->bs); + } else { + return qemu_get_aio_context(); + } +} + +static AioContext *blk_aiocb_get_aio_context(BlockAIOCB *acb) +{ + BlockBackendAIOCB *blk_acb = DO_UPCAST(BlockBackendAIOCB, common, acb); + return blk_get_aio_context(blk_acb->blk); +} + +void blk_set_aio_context(BlockBackend *blk, AioContext *new_context) +{ + if (blk->bs) { + bdrv_set_aio_context(blk->bs, new_context); + } +} + +void blk_add_aio_context_notifier(BlockBackend *blk, + void (*attached_aio_context)(AioContext *new_context, void *opaque), + void (*detach_aio_context)(void *opaque), void *opaque) +{ + if (blk->bs) { + bdrv_add_aio_context_notifier(blk->bs, attached_aio_context, + detach_aio_context, opaque); + } +} + +void blk_remove_aio_context_notifier(BlockBackend *blk, + void (*attached_aio_context)(AioContext *, + void *), + void (*detach_aio_context)(void *), + void *opaque) +{ + if (blk->bs) { + bdrv_remove_aio_context_notifier(blk->bs, attached_aio_context, + detach_aio_context, opaque); + } +} + +void blk_add_close_notifier(BlockBackend *blk, Notifier *notify) +{ + if (blk->bs) { + bdrv_add_close_notifier(blk->bs, notify); + } +} + +void blk_io_plug(BlockBackend *blk) +{ + if (blk->bs) { + bdrv_io_plug(blk->bs); + } +} + +void blk_io_unplug(BlockBackend *blk) +{ + if (blk->bs) { + bdrv_io_unplug(blk->bs); + } +} + +BlockAcctStats *blk_get_stats(BlockBackend *blk) +{ + return &blk->stats; +} + +void *blk_aio_get(const AIOCBInfo *aiocb_info, BlockBackend *blk, + BlockCompletionFunc *cb, void *opaque) +{ + return qemu_aio_get(aiocb_info, blk_bs(blk), cb, opaque); +} + +int coroutine_fn blk_co_write_zeroes(BlockBackend *blk, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_co_write_zeroes(blk->bs, sector_num, nb_sectors, flags); +} + +int blk_write_compressed(BlockBackend *blk, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_write_compressed(blk->bs, sector_num, buf, nb_sectors); +} + +int blk_truncate(BlockBackend *blk, int64_t offset) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_truncate(blk->bs, offset); +} + +int blk_discard(BlockBackend *blk, int64_t sector_num, int nb_sectors) +{ + int ret = blk_check_request(blk, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + return bdrv_discard(blk->bs, sector_num, nb_sectors); +} + +int blk_save_vmstate(BlockBackend *blk, const uint8_t *buf, + int64_t pos, int size) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_save_vmstate(blk->bs, buf, pos, size); +} + +int blk_load_vmstate(BlockBackend *blk, uint8_t *buf, int64_t pos, int size) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_load_vmstate(blk->bs, buf, pos, size); +} + +int blk_probe_blocksizes(BlockBackend *blk, BlockSizes *bsz) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_probe_blocksizes(blk->bs, bsz); +} + +int blk_probe_geometry(BlockBackend *blk, HDGeometry *geo) +{ + if (!blk_is_available(blk)) { + return -ENOMEDIUM; + } + + return bdrv_probe_geometry(blk->bs, geo); +} + +/* + * Updates the BlockBackendRootState object with data from the currently + * attached BlockDriverState. + */ +void blk_update_root_state(BlockBackend *blk) +{ + assert(blk->bs); + + blk->root_state.open_flags = blk->bs->open_flags; + blk->root_state.read_only = blk->bs->read_only; + blk->root_state.detect_zeroes = blk->bs->detect_zeroes; + + if (blk->root_state.throttle_group) { + g_free(blk->root_state.throttle_group); + throttle_group_unref(blk->root_state.throttle_state); + } + if (blk->bs->throttle_state) { + const char *name = throttle_group_get_name(blk->bs); + blk->root_state.throttle_group = g_strdup(name); + blk->root_state.throttle_state = throttle_group_incref(name); + } else { + blk->root_state.throttle_group = NULL; + blk->root_state.throttle_state = NULL; + } +} + +/* + * Applies the information in the root state to the given BlockDriverState. This + * does not include the flags which have to be specified for bdrv_open(), use + * blk_get_open_flags_from_root_state() to inquire them. + */ +void blk_apply_root_state(BlockBackend *blk, BlockDriverState *bs) +{ + bs->detect_zeroes = blk->root_state.detect_zeroes; + if (blk->root_state.throttle_group) { + bdrv_io_limits_enable(bs, blk->root_state.throttle_group); + } +} + +/* + * Returns the flags to be used for bdrv_open() of a BlockDriverState which is + * supposed to inherit the root state. + */ +int blk_get_open_flags_from_root_state(BlockBackend *blk) +{ + int bs_flags; + + bs_flags = blk->root_state.read_only ? 0 : BDRV_O_RDWR; + bs_flags |= blk->root_state.open_flags & ~BDRV_O_RDWR; + + return bs_flags; +} + +BlockBackendRootState *blk_get_root_state(BlockBackend *blk) +{ + return &blk->root_state; +} diff --git a/src/block/bochs.c b/src/block/bochs.c new file mode 100644 index 0000000..18949b9 --- /dev/null +++ b/src/block/bochs.c @@ -0,0 +1,277 @@ +/* + * Block driver for the various disk image formats used by Bochs + * Currently only for "growing" type in read-only mode + * + * Copyright (c) 2005 Alex Beregszaszi + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" + +/**************************************************************/ + +#define HEADER_MAGIC "Bochs Virtual HD Image" +#define HEADER_VERSION 0x00020000 +#define HEADER_V1 0x00010000 +#define HEADER_SIZE 512 + +#define REDOLOG_TYPE "Redolog" +#define GROWING_TYPE "Growing" + +// not allocated: 0xffffffff + +// always little-endian +struct bochs_header { + char magic[32]; /* "Bochs Virtual HD Image" */ + char type[16]; /* "Redolog" */ + char subtype[16]; /* "Undoable" / "Volatile" / "Growing" */ + uint32_t version; + uint32_t header; /* size of header */ + + uint32_t catalog; /* num of entries */ + uint32_t bitmap; /* bitmap size */ + uint32_t extent; /* extent size */ + + union { + struct { + uint32_t reserved; /* for ??? */ + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 12]; + } QEMU_PACKED redolog; + struct { + uint64_t disk; /* disk size */ + char padding[HEADER_SIZE - 64 - 20 - 8]; + } QEMU_PACKED redolog_v1; + char padding[HEADER_SIZE - 64 - 20]; + } extra; +} QEMU_PACKED; + +typedef struct BDRVBochsState { + CoMutex lock; + uint32_t *catalog_bitmap; + uint32_t catalog_size; + + uint32_t data_offset; + + uint32_t bitmap_blocks; + uint32_t extent_blocks; + uint32_t extent_size; +} BDRVBochsState; + +static int bochs_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const struct bochs_header *bochs = (const void *)buf; + + if (buf_size < HEADER_SIZE) + return 0; + + if (!strcmp(bochs->magic, HEADER_MAGIC) && + !strcmp(bochs->type, REDOLOG_TYPE) && + !strcmp(bochs->subtype, GROWING_TYPE) && + ((le32_to_cpu(bochs->version) == HEADER_VERSION) || + (le32_to_cpu(bochs->version) == HEADER_V1))) + return 100; + + return 0; +} + +static int bochs_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVBochsState *s = bs->opaque; + uint32_t i; + struct bochs_header bochs; + int ret; + + bs->read_only = 1; // no write support yet + + ret = bdrv_pread(bs->file->bs, 0, &bochs, sizeof(bochs)); + if (ret < 0) { + return ret; + } + + if (strcmp(bochs.magic, HEADER_MAGIC) || + strcmp(bochs.type, REDOLOG_TYPE) || + strcmp(bochs.subtype, GROWING_TYPE) || + ((le32_to_cpu(bochs.version) != HEADER_VERSION) && + (le32_to_cpu(bochs.version) != HEADER_V1))) { + error_setg(errp, "Image not in Bochs format"); + return -EINVAL; + } + + if (le32_to_cpu(bochs.version) == HEADER_V1) { + bs->total_sectors = le64_to_cpu(bochs.extra.redolog_v1.disk) / 512; + } else { + bs->total_sectors = le64_to_cpu(bochs.extra.redolog.disk) / 512; + } + + /* Limit to 1M entries to avoid unbounded allocation. This is what is + * needed for the largest image that bximage can create (~8 TB). */ + s->catalog_size = le32_to_cpu(bochs.catalog); + if (s->catalog_size > 0x100000) { + error_setg(errp, "Catalog size is too large"); + return -EFBIG; + } + + s->catalog_bitmap = g_try_new(uint32_t, s->catalog_size); + if (s->catalog_size && s->catalog_bitmap == NULL) { + error_setg(errp, "Could not allocate memory for catalog"); + return -ENOMEM; + } + + ret = bdrv_pread(bs->file->bs, le32_to_cpu(bochs.header), s->catalog_bitmap, + s->catalog_size * 4); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < s->catalog_size; i++) + le32_to_cpus(&s->catalog_bitmap[i]); + + s->data_offset = le32_to_cpu(bochs.header) + (s->catalog_size * 4); + + s->bitmap_blocks = 1 + (le32_to_cpu(bochs.bitmap) - 1) / 512; + s->extent_blocks = 1 + (le32_to_cpu(bochs.extent) - 1) / 512; + + s->extent_size = le32_to_cpu(bochs.extent); + if (s->extent_size < BDRV_SECTOR_SIZE) { + /* bximage actually never creates extents smaller than 4k */ + error_setg(errp, "Extent size must be at least 512"); + ret = -EINVAL; + goto fail; + } else if (!is_power_of_2(s->extent_size)) { + error_setg(errp, "Extent size %" PRIu32 " is not a power of two", + s->extent_size); + ret = -EINVAL; + goto fail; + } else if (s->extent_size > 0x800000) { + error_setg(errp, "Extent size %" PRIu32 " is too large", + s->extent_size); + ret = -EINVAL; + goto fail; + } + + if (s->catalog_size < DIV_ROUND_UP(bs->total_sectors, + s->extent_size / BDRV_SECTOR_SIZE)) + { + error_setg(errp, "Catalog size is too small for this disk size"); + ret = -EINVAL; + goto fail; + } + + qemu_co_mutex_init(&s->lock); + return 0; + +fail: + g_free(s->catalog_bitmap); + return ret; +} + +static int64_t seek_to_sector(BlockDriverState *bs, int64_t sector_num) +{ + BDRVBochsState *s = bs->opaque; + uint64_t offset = sector_num * 512; + uint64_t extent_index, extent_offset, bitmap_offset; + char bitmap_entry; + int ret; + + // seek to sector + extent_index = offset / s->extent_size; + extent_offset = (offset % s->extent_size) / 512; + + if (s->catalog_bitmap[extent_index] == 0xffffffff) { + return 0; /* not allocated */ + } + + bitmap_offset = s->data_offset + + (512 * (uint64_t) s->catalog_bitmap[extent_index] * + (s->extent_blocks + s->bitmap_blocks)); + + /* read in bitmap for current extent */ + ret = bdrv_pread(bs->file->bs, bitmap_offset + (extent_offset / 8), + &bitmap_entry, 1); + if (ret < 0) { + return ret; + } + + if (!((bitmap_entry >> (extent_offset % 8)) & 1)) { + return 0; /* not allocated */ + } + + return bitmap_offset + (512 * (s->bitmap_blocks + extent_offset)); +} + +static int bochs_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + + while (nb_sectors > 0) { + int64_t block_offset = seek_to_sector(bs, sector_num); + if (block_offset < 0) { + return block_offset; + } else if (block_offset > 0) { + ret = bdrv_pread(bs->file->bs, block_offset, buf, 512); + if (ret < 0) { + return ret; + } + } else { + memset(buf, 0, 512); + } + nb_sectors--; + sector_num++; + buf += 512; + } + return 0; +} + +static coroutine_fn int bochs_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVBochsState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = bochs_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void bochs_close(BlockDriverState *bs) +{ + BDRVBochsState *s = bs->opaque; + g_free(s->catalog_bitmap); +} + +static BlockDriver bdrv_bochs = { + .format_name = "bochs", + .instance_size = sizeof(BDRVBochsState), + .bdrv_probe = bochs_probe, + .bdrv_open = bochs_open, + .bdrv_read = bochs_co_read, + .bdrv_close = bochs_close, +}; + +static void bdrv_bochs_init(void) +{ + bdrv_register(&bdrv_bochs); +} + +block_init(bdrv_bochs_init); diff --git a/src/block/cloop.c b/src/block/cloop.c new file mode 100644 index 0000000..4190ae0 --- /dev/null +++ b/src/block/cloop.c @@ -0,0 +1,283 @@ +/* + * QEMU Block driver for CLOOP images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> + +/* Maximum compressed block size */ +#define MAX_BLOCK_SIZE (64 * 1024 * 1024) + +typedef struct BDRVCloopState { + CoMutex lock; + uint32_t block_size; + uint32_t n_blocks; + uint64_t *offsets; + uint32_t sectors_per_block; + uint32_t current_block; + uint8_t *compressed_block; + uint8_t *uncompressed_block; + z_stream zstream; +} BDRVCloopState; + +static int cloop_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const char *magic_version_2_0 = "#!/bin/sh\n" + "#V2.0 Format\n" + "modprobe cloop file=$0 && mount -r -t iso9660 /dev/cloop $1\n"; + int length = strlen(magic_version_2_0); + if (length > buf_size) { + length = buf_size; + } + if (!memcmp(magic_version_2_0, buf, length)) { + return 2; + } + return 0; +} + +static int cloop_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVCloopState *s = bs->opaque; + uint32_t offsets_size, max_compressed_block_size = 1, i; + int ret; + + bs->read_only = 1; + + /* read header */ + ret = bdrv_pread(bs->file->bs, 128, &s->block_size, 4); + if (ret < 0) { + return ret; + } + s->block_size = be32_to_cpu(s->block_size); + if (s->block_size % 512) { + error_setg(errp, "block_size %" PRIu32 " must be a multiple of 512", + s->block_size); + return -EINVAL; + } + if (s->block_size == 0) { + error_setg(errp, "block_size cannot be zero"); + return -EINVAL; + } + + /* cloop's create_compressed_fs.c warns about block sizes beyond 256 KB but + * we can accept more. Prevent ridiculous values like 4 GB - 1 since we + * need a buffer this big. + */ + if (s->block_size > MAX_BLOCK_SIZE) { + error_setg(errp, "block_size %" PRIu32 " must be %u MB or less", + s->block_size, + MAX_BLOCK_SIZE / (1024 * 1024)); + return -EINVAL; + } + + ret = bdrv_pread(bs->file->bs, 128 + 4, &s->n_blocks, 4); + if (ret < 0) { + return ret; + } + s->n_blocks = be32_to_cpu(s->n_blocks); + + /* read offsets */ + if (s->n_blocks > (UINT32_MAX - 1) / sizeof(uint64_t)) { + /* Prevent integer overflow */ + error_setg(errp, "n_blocks %" PRIu32 " must be %zu or less", + s->n_blocks, + (UINT32_MAX - 1) / sizeof(uint64_t)); + return -EINVAL; + } + offsets_size = (s->n_blocks + 1) * sizeof(uint64_t); + if (offsets_size > 512 * 1024 * 1024) { + /* Prevent ridiculous offsets_size which causes memory allocation to + * fail or overflows bdrv_pread() size. In practice the 512 MB + * offsets[] limit supports 16 TB images at 256 KB block size. + */ + error_setg(errp, "image requires too many offsets, " + "try increasing block size"); + return -EINVAL; + } + + s->offsets = g_try_malloc(offsets_size); + if (s->offsets == NULL) { + error_setg(errp, "Could not allocate offsets table"); + return -ENOMEM; + } + + ret = bdrv_pread(bs->file->bs, 128 + 4 + 4, s->offsets, offsets_size); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < s->n_blocks + 1; i++) { + uint64_t size; + + s->offsets[i] = be64_to_cpu(s->offsets[i]); + if (i == 0) { + continue; + } + + if (s->offsets[i] < s->offsets[i - 1]) { + error_setg(errp, "offsets not monotonically increasing at " + "index %" PRIu32 ", image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + size = s->offsets[i] - s->offsets[i - 1]; + + /* Compressed blocks should be smaller than the uncompressed block size + * but maybe compression performed poorly so the compressed block is + * actually bigger. Clamp down on unrealistic values to prevent + * ridiculous s->compressed_block allocation. + */ + if (size > 2 * MAX_BLOCK_SIZE) { + error_setg(errp, "invalid compressed block size at index %" PRIu32 + ", image file is corrupt", i); + ret = -EINVAL; + goto fail; + } + + if (size > max_compressed_block_size) { + max_compressed_block_size = size; + } + } + + /* initialize zlib engine */ + s->compressed_block = g_try_malloc(max_compressed_block_size + 1); + if (s->compressed_block == NULL) { + error_setg(errp, "Could not allocate compressed_block"); + ret = -ENOMEM; + goto fail; + } + + s->uncompressed_block = g_try_malloc(s->block_size); + if (s->uncompressed_block == NULL) { + error_setg(errp, "Could not allocate uncompressed_block"); + ret = -ENOMEM; + goto fail; + } + + if (inflateInit(&s->zstream) != Z_OK) { + ret = -EINVAL; + goto fail; + } + s->current_block = s->n_blocks; + + s->sectors_per_block = s->block_size/512; + bs->total_sectors = s->n_blocks * s->sectors_per_block; + qemu_co_mutex_init(&s->lock); + return 0; + +fail: + g_free(s->offsets); + g_free(s->compressed_block); + g_free(s->uncompressed_block); + return ret; +} + +static inline int cloop_read_block(BlockDriverState *bs, int block_num) +{ + BDRVCloopState *s = bs->opaque; + + if (s->current_block != block_num) { + int ret; + uint32_t bytes = s->offsets[block_num + 1] - s->offsets[block_num]; + + ret = bdrv_pread(bs->file->bs, s->offsets[block_num], + s->compressed_block, bytes); + if (ret != bytes) { + return -1; + } + + s->zstream.next_in = s->compressed_block; + s->zstream.avail_in = bytes; + s->zstream.next_out = s->uncompressed_block; + s->zstream.avail_out = s->block_size; + ret = inflateReset(&s->zstream); + if (ret != Z_OK) { + return -1; + } + ret = inflate(&s->zstream, Z_FINISH); + if (ret != Z_STREAM_END || s->zstream.total_out != s->block_size) { + return -1; + } + + s->current_block = block_num; + } + return 0; +} + +static int cloop_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVCloopState *s = bs->opaque; + int i; + + for (i = 0; i < nb_sectors; i++) { + uint32_t sector_offset_in_block = + ((sector_num + i) % s->sectors_per_block), + block_num = (sector_num + i) / s->sectors_per_block; + if (cloop_read_block(bs, block_num) != 0) { + return -1; + } + memcpy(buf + i * 512, + s->uncompressed_block + sector_offset_in_block * 512, 512); + } + return 0; +} + +static coroutine_fn int cloop_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVCloopState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = cloop_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void cloop_close(BlockDriverState *bs) +{ + BDRVCloopState *s = bs->opaque; + g_free(s->offsets); + g_free(s->compressed_block); + g_free(s->uncompressed_block); + inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_cloop = { + .format_name = "cloop", + .instance_size = sizeof(BDRVCloopState), + .bdrv_probe = cloop_probe, + .bdrv_open = cloop_open, + .bdrv_read = cloop_co_read, + .bdrv_close = cloop_close, +}; + +static void bdrv_cloop_init(void) +{ + bdrv_register(&bdrv_cloop); +} + +block_init(bdrv_cloop_init); diff --git a/src/block/commit.c b/src/block/commit.c new file mode 100644 index 0000000..a5d02aa --- /dev/null +++ b/src/block/commit.c @@ -0,0 +1,275 @@ +/* + * Live block commit + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * Based on stream.c by Stefan Hajnoczi + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" +#include "sysemu/block-backend.h" + +enum { + /* + * Size of data buffer for populating the image file. This should be large + * enough to process multiple clusters in a single call, so that populating + * contiguous regions of the image is efficient. + */ + COMMIT_BUFFER_SIZE = 512 * 1024, /* in bytes */ +}; + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct CommitBlockJob { + BlockJob common; + RateLimit limit; + BlockDriverState *active; + BlockDriverState *top; + BlockDriverState *base; + BlockdevOnError on_error; + int base_flags; + int orig_overlay_flags; + char *backing_file_str; +} CommitBlockJob; + +static int coroutine_fn commit_populate(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, int nb_sectors, + void *buf) +{ + int ret = 0; + + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + if (ret) { + return ret; + } + + ret = bdrv_write(base, sector_num, buf, nb_sectors); + if (ret) { + return ret; + } + + return 0; +} + +typedef struct { + int ret; +} CommitCompleteData; + +static void commit_complete(BlockJob *job, void *opaque) +{ + CommitBlockJob *s = container_of(job, CommitBlockJob, common); + CommitCompleteData *data = opaque; + BlockDriverState *active = s->active; + BlockDriverState *top = s->top; + BlockDriverState *base = s->base; + BlockDriverState *overlay_bs; + int ret = data->ret; + + if (!block_job_is_cancelled(&s->common) && ret == 0) { + /* success */ + ret = bdrv_drop_intermediate(active, top, base, s->backing_file_str); + } + + /* restore base open flags here if appropriate (e.g., change the base back + * to r/o). These reopens do not need to be atomic, since we won't abort + * even on failure here */ + if (s->base_flags != bdrv_get_flags(base)) { + bdrv_reopen(base, s->base_flags, NULL); + } + overlay_bs = bdrv_find_overlay(active, top); + if (overlay_bs && s->orig_overlay_flags != bdrv_get_flags(overlay_bs)) { + bdrv_reopen(overlay_bs, s->orig_overlay_flags, NULL); + } + g_free(s->backing_file_str); + block_job_completed(&s->common, ret); + g_free(data); +} + +static void coroutine_fn commit_run(void *opaque) +{ + CommitBlockJob *s = opaque; + CommitCompleteData *data; + BlockDriverState *top = s->top; + BlockDriverState *base = s->base; + int64_t sector_num, end; + int ret = 0; + int n = 0; + void *buf = NULL; + int bytes_written = 0; + int64_t base_len; + + ret = s->common.len = bdrv_getlength(top); + + + if (s->common.len < 0) { + goto out; + } + + ret = base_len = bdrv_getlength(base); + if (base_len < 0) { + goto out; + } + + if (base_len < s->common.len) { + ret = bdrv_truncate(base, s->common.len); + if (ret) { + goto out; + } + } + + end = s->common.len >> BDRV_SECTOR_BITS; + buf = qemu_blockalign(top, COMMIT_BUFFER_SIZE); + + for (sector_num = 0; sector_num < end; sector_num += n) { + uint64_t delay_ns = 0; + bool copy; + +wait: + /* Note that even when no rate limit is applied we need to yield + * with no pending I/O here so that bdrv_drain_all() returns. + */ + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + if (block_job_is_cancelled(&s->common)) { + break; + } + /* Copy if allocated above the base */ + ret = bdrv_is_allocated_above(top, base, sector_num, + COMMIT_BUFFER_SIZE / BDRV_SECTOR_SIZE, + &n); + copy = (ret == 1); + trace_commit_one_iteration(s, sector_num, n, ret); + if (copy) { + if (s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + if (delay_ns > 0) { + goto wait; + } + } + ret = commit_populate(top, base, sector_num, n, buf); + bytes_written += n * BDRV_SECTOR_SIZE; + } + if (ret < 0) { + if (s->on_error == BLOCKDEV_ON_ERROR_STOP || + s->on_error == BLOCKDEV_ON_ERROR_REPORT|| + (s->on_error == BLOCKDEV_ON_ERROR_ENOSPC && ret == -ENOSPC)) { + goto out; + } else { + n = 0; + continue; + } + } + /* Publish progress */ + s->common.offset += n * BDRV_SECTOR_SIZE; + } + + ret = 0; + +out: + qemu_vfree(buf); + + data = g_malloc(sizeof(*data)); + data->ret = ret; + block_job_defer_to_main_loop(&s->common, commit_complete, data); +} + +static void commit_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ + CommitBlockJob *s = container_of(job, CommitBlockJob, common); + + if (speed < 0) { + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); + return; + } + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static const BlockJobDriver commit_job_driver = { + .instance_size = sizeof(CommitBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = commit_set_speed, +}; + +void commit_start(BlockDriverState *bs, BlockDriverState *base, + BlockDriverState *top, int64_t speed, + BlockdevOnError on_error, BlockCompletionFunc *cb, + void *opaque, const char *backing_file_str, Error **errp) +{ + CommitBlockJob *s; + BlockReopenQueue *reopen_queue = NULL; + int orig_overlay_flags; + int orig_base_flags; + BlockDriverState *overlay_bs; + Error *local_err = NULL; + + if ((on_error == BLOCKDEV_ON_ERROR_STOP || + on_error == BLOCKDEV_ON_ERROR_ENOSPC) && + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { + error_setg(errp, "Invalid parameter combination"); + return; + } + + assert(top != bs); + if (top == base) { + error_setg(errp, "Invalid files for merge: top and base are the same"); + return; + } + + overlay_bs = bdrv_find_overlay(bs, top); + + if (overlay_bs == NULL) { + error_setg(errp, "Could not find overlay image for %s:", top->filename); + return; + } + + orig_base_flags = bdrv_get_flags(base); + orig_overlay_flags = bdrv_get_flags(overlay_bs); + + /* convert base & overlay_bs to r/w, if necessary */ + if (!(orig_overlay_flags & BDRV_O_RDWR)) { + reopen_queue = bdrv_reopen_queue(reopen_queue, overlay_bs, NULL, + orig_overlay_flags | BDRV_O_RDWR); + } + if (!(orig_base_flags & BDRV_O_RDWR)) { + reopen_queue = bdrv_reopen_queue(reopen_queue, base, NULL, + orig_base_flags | BDRV_O_RDWR); + } + if (reopen_queue) { + bdrv_reopen_multiple(reopen_queue, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + return; + } + } + + + s = block_job_create(&commit_job_driver, bs, speed, cb, opaque, errp); + if (!s) { + return; + } + + s->base = base; + s->top = top; + s->active = bs; + + s->base_flags = orig_base_flags; + s->orig_overlay_flags = orig_overlay_flags; + + s->backing_file_str = g_strdup(backing_file_str); + + s->on_error = on_error; + s->common.co = qemu_coroutine_create(commit_run); + + trace_commit_start(bs, base, top, s, s->common.co, opaque); + qemu_coroutine_enter(s->common.co, s); +} diff --git a/src/block/curl.c b/src/block/curl.c new file mode 100644 index 0000000..8994182 --- /dev/null +++ b/src/block/curl.c @@ -0,0 +1,827 @@ +/* + * QEMU Block driver for CURL images + * + * Copyright (c) 2009 Alexander Graf <agraf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qstring.h" +#include <curl/curl.h> + +// #define DEBUG_CURL +// #define DEBUG_VERBOSE + +#ifdef DEBUG_CURL +#define DPRINTF(fmt, ...) do { printf(fmt, ## __VA_ARGS__); } while (0) +#else +#define DPRINTF(fmt, ...) do { } while (0) +#endif + +#if LIBCURL_VERSION_NUM >= 0x071000 +/* The multi interface timer callback was introduced in 7.16.0 */ +#define NEED_CURL_TIMER_CALLBACK +#define HAVE_SOCKET_ACTION +#endif + +#ifndef HAVE_SOCKET_ACTION +/* If curl_multi_socket_action isn't available, define it statically here in + * terms of curl_multi_socket. Note that ev_bitmask will be ignored, which is + * less efficient but still safe. */ +static CURLMcode __curl_multi_socket_action(CURLM *multi_handle, + curl_socket_t sockfd, + int ev_bitmask, + int *running_handles) +{ + return curl_multi_socket(multi_handle, sockfd, running_handles); +} +#define curl_multi_socket_action __curl_multi_socket_action +#endif + +#define PROTOCOLS (CURLPROTO_HTTP | CURLPROTO_HTTPS | \ + CURLPROTO_FTP | CURLPROTO_FTPS | \ + CURLPROTO_TFTP) + +#define CURL_NUM_STATES 8 +#define CURL_NUM_ACB 8 +#define SECTOR_SIZE 512 +#define READ_AHEAD_DEFAULT (256 * 1024) +#define CURL_TIMEOUT_DEFAULT 5 +#define CURL_TIMEOUT_MAX 10000 + +#define FIND_RET_NONE 0 +#define FIND_RET_OK 1 +#define FIND_RET_WAIT 2 + +#define CURL_BLOCK_OPT_URL "url" +#define CURL_BLOCK_OPT_READAHEAD "readahead" +#define CURL_BLOCK_OPT_SSLVERIFY "sslverify" +#define CURL_BLOCK_OPT_TIMEOUT "timeout" +#define CURL_BLOCK_OPT_COOKIE "cookie" + +struct BDRVCURLState; + +typedef struct CURLAIOCB { + BlockAIOCB common; + QEMUBH *bh; + QEMUIOVector *qiov; + + int64_t sector_num; + int nb_sectors; + + size_t start; + size_t end; +} CURLAIOCB; + +typedef struct CURLState +{ + struct BDRVCURLState *s; + CURLAIOCB *acb[CURL_NUM_ACB]; + CURL *curl; + curl_socket_t sock_fd; + char *orig_buf; + size_t buf_start; + size_t buf_off; + size_t buf_len; + char range[128]; + char errmsg[CURL_ERROR_SIZE]; + char in_use; +} CURLState; + +typedef struct BDRVCURLState { + CURLM *multi; + QEMUTimer timer; + size_t len; + CURLState states[CURL_NUM_STATES]; + char *url; + size_t readahead_size; + bool sslverify; + uint64_t timeout; + char *cookie; + bool accept_range; + AioContext *aio_context; +} BDRVCURLState; + +static void curl_clean_state(CURLState *s); +static void curl_multi_do(void *arg); +static void curl_multi_read(void *arg); + +#ifdef NEED_CURL_TIMER_CALLBACK +static int curl_timer_cb(CURLM *multi, long timeout_ms, void *opaque) +{ + BDRVCURLState *s = opaque; + + DPRINTF("CURL: timer callback timeout_ms %ld\n", timeout_ms); + if (timeout_ms == -1) { + timer_del(&s->timer); + } else { + int64_t timeout_ns = (int64_t)timeout_ms * 1000 * 1000; + timer_mod(&s->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + timeout_ns); + } + return 0; +} +#endif + +static int curl_sock_cb(CURL *curl, curl_socket_t fd, int action, + void *userp, void *sp) +{ + BDRVCURLState *s; + CURLState *state = NULL; + curl_easy_getinfo(curl, CURLINFO_PRIVATE, (char **)&state); + state->sock_fd = fd; + s = state->s; + + DPRINTF("CURL (AIO): Sock action %d on fd %d\n", action, fd); + switch (action) { + case CURL_POLL_IN: + aio_set_fd_handler(s->aio_context, fd, false, + curl_multi_read, NULL, state); + break; + case CURL_POLL_OUT: + aio_set_fd_handler(s->aio_context, fd, false, + NULL, curl_multi_do, state); + break; + case CURL_POLL_INOUT: + aio_set_fd_handler(s->aio_context, fd, false, + curl_multi_read, curl_multi_do, state); + break; + case CURL_POLL_REMOVE: + aio_set_fd_handler(s->aio_context, fd, false, + NULL, NULL, NULL); + break; + } + + return 0; +} + +static size_t curl_header_cb(void *ptr, size_t size, size_t nmemb, void *opaque) +{ + BDRVCURLState *s = opaque; + size_t realsize = size * nmemb; + const char *accept_line = "Accept-Ranges: bytes"; + + if (realsize >= strlen(accept_line) + && strncmp((char *)ptr, accept_line, strlen(accept_line)) == 0) { + s->accept_range = true; + } + + return realsize; +} + +static size_t curl_read_cb(void *ptr, size_t size, size_t nmemb, void *opaque) +{ + CURLState *s = ((CURLState*)opaque); + size_t realsize = size * nmemb; + int i; + + DPRINTF("CURL: Just reading %zd bytes\n", realsize); + + if (!s || !s->orig_buf) + return 0; + + if (s->buf_off >= s->buf_len) { + /* buffer full, read nothing */ + return 0; + } + realsize = MIN(realsize, s->buf_len - s->buf_off); + memcpy(s->orig_buf + s->buf_off, ptr, realsize); + s->buf_off += realsize; + + for(i=0; i<CURL_NUM_ACB; i++) { + CURLAIOCB *acb = s->acb[i]; + + if (!acb) + continue; + + if ((s->buf_off >= acb->end)) { + qemu_iovec_from_buf(acb->qiov, 0, s->orig_buf + acb->start, + acb->end - acb->start); + acb->common.cb(acb->common.opaque, 0); + qemu_aio_unref(acb); + s->acb[i] = NULL; + } + } + + return realsize; +} + +static int curl_find_buf(BDRVCURLState *s, size_t start, size_t len, + CURLAIOCB *acb) +{ + int i; + size_t end = start + len; + + for (i=0; i<CURL_NUM_STATES; i++) { + CURLState *state = &s->states[i]; + size_t buf_end = (state->buf_start + state->buf_off); + size_t buf_fend = (state->buf_start + state->buf_len); + + if (!state->orig_buf) + continue; + if (!state->buf_off) + continue; + + // Does the existing buffer cover our section? + if ((start >= state->buf_start) && + (start <= buf_end) && + (end >= state->buf_start) && + (end <= buf_end)) + { + char *buf = state->orig_buf + (start - state->buf_start); + + qemu_iovec_from_buf(acb->qiov, 0, buf, len); + acb->common.cb(acb->common.opaque, 0); + + return FIND_RET_OK; + } + + // Wait for unfinished chunks + if (state->in_use && + (start >= state->buf_start) && + (start <= buf_fend) && + (end >= state->buf_start) && + (end <= buf_fend)) + { + int j; + + acb->start = start - state->buf_start; + acb->end = acb->start + len; + + for (j=0; j<CURL_NUM_ACB; j++) { + if (!state->acb[j]) { + state->acb[j] = acb; + return FIND_RET_WAIT; + } + } + } + } + + return FIND_RET_NONE; +} + +static void curl_multi_check_completion(BDRVCURLState *s) +{ + int msgs_in_queue; + + /* Try to find done transfers, so we can free the easy + * handle again. */ + for (;;) { + CURLMsg *msg; + msg = curl_multi_info_read(s->multi, &msgs_in_queue); + + /* Quit when there are no more completions */ + if (!msg) + break; + + if (msg->msg == CURLMSG_DONE) { + CURLState *state = NULL; + curl_easy_getinfo(msg->easy_handle, CURLINFO_PRIVATE, + (char **)&state); + + /* ACBs for successful messages get completed in curl_read_cb */ + if (msg->data.result != CURLE_OK) { + int i; + static int errcount = 100; + + /* Don't lose the original error message from curl, since + * it contains extra data. + */ + if (errcount > 0) { + error_report("curl: %s", state->errmsg); + if (--errcount == 0) { + error_report("curl: further errors suppressed"); + } + } + + for (i = 0; i < CURL_NUM_ACB; i++) { + CURLAIOCB *acb = state->acb[i]; + + if (acb == NULL) { + continue; + } + + acb->common.cb(acb->common.opaque, -EPROTO); + qemu_aio_unref(acb); + state->acb[i] = NULL; + } + } + + curl_clean_state(state); + break; + } + } +} + +static void curl_multi_do(void *arg) +{ + CURLState *s = (CURLState *)arg; + int running; + int r; + + if (!s->s->multi) { + return; + } + + do { + r = curl_multi_socket_action(s->s->multi, s->sock_fd, 0, &running); + } while(r == CURLM_CALL_MULTI_PERFORM); + +} + +static void curl_multi_read(void *arg) +{ + CURLState *s = (CURLState *)arg; + + curl_multi_do(arg); + curl_multi_check_completion(s->s); +} + +static void curl_multi_timeout_do(void *arg) +{ +#ifdef NEED_CURL_TIMER_CALLBACK + BDRVCURLState *s = (BDRVCURLState *)arg; + int running; + + if (!s->multi) { + return; + } + + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); + + curl_multi_check_completion(s); +#else + abort(); +#endif +} + +static CURLState *curl_init_state(BlockDriverState *bs, BDRVCURLState *s) +{ + CURLState *state = NULL; + int i, j; + + do { + for (i=0; i<CURL_NUM_STATES; i++) { + for (j=0; j<CURL_NUM_ACB; j++) + if (s->states[i].acb[j]) + continue; + if (s->states[i].in_use) + continue; + + state = &s->states[i]; + state->in_use = 1; + break; + } + if (!state) { + aio_poll(bdrv_get_aio_context(bs), true); + } + } while(!state); + + if (!state->curl) { + state->curl = curl_easy_init(); + if (!state->curl) { + return NULL; + } + curl_easy_setopt(state->curl, CURLOPT_URL, s->url); + curl_easy_setopt(state->curl, CURLOPT_SSL_VERIFYPEER, + (long) s->sslverify); + if (s->cookie) { + curl_easy_setopt(state->curl, CURLOPT_COOKIE, s->cookie); + } + curl_easy_setopt(state->curl, CURLOPT_TIMEOUT, (long)s->timeout); + curl_easy_setopt(state->curl, CURLOPT_WRITEFUNCTION, + (void *)curl_read_cb); + curl_easy_setopt(state->curl, CURLOPT_WRITEDATA, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_PRIVATE, (void *)state); + curl_easy_setopt(state->curl, CURLOPT_AUTOREFERER, 1); + curl_easy_setopt(state->curl, CURLOPT_FOLLOWLOCATION, 1); + curl_easy_setopt(state->curl, CURLOPT_NOSIGNAL, 1); + curl_easy_setopt(state->curl, CURLOPT_ERRORBUFFER, state->errmsg); + curl_easy_setopt(state->curl, CURLOPT_FAILONERROR, 1); + + /* Restrict supported protocols to avoid security issues in the more + * obscure protocols. For example, do not allow POP3/SMTP/IMAP see + * CVE-2013-0249. + * + * Restricting protocols is only supported from 7.19.4 upwards. + */ +#if LIBCURL_VERSION_NUM >= 0x071304 + curl_easy_setopt(state->curl, CURLOPT_PROTOCOLS, PROTOCOLS); + curl_easy_setopt(state->curl, CURLOPT_REDIR_PROTOCOLS, PROTOCOLS); +#endif + +#ifdef DEBUG_VERBOSE + curl_easy_setopt(state->curl, CURLOPT_VERBOSE, 1); +#endif + } + + state->s = s; + + return state; +} + +static void curl_clean_state(CURLState *s) +{ + if (s->s->multi) + curl_multi_remove_handle(s->s->multi, s->curl); + s->in_use = 0; +} + +static void curl_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + qdict_put(options, CURL_BLOCK_OPT_URL, qstring_from_str(filename)); +} + +static void curl_detach_aio_context(BlockDriverState *bs) +{ + BDRVCURLState *s = bs->opaque; + int i; + + for (i = 0; i < CURL_NUM_STATES; i++) { + if (s->states[i].in_use) { + curl_clean_state(&s->states[i]); + } + if (s->states[i].curl) { + curl_easy_cleanup(s->states[i].curl); + s->states[i].curl = NULL; + } + g_free(s->states[i].orig_buf); + s->states[i].orig_buf = NULL; + } + if (s->multi) { + curl_multi_cleanup(s->multi); + s->multi = NULL; + } + + timer_del(&s->timer); +} + +static void curl_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVCURLState *s = bs->opaque; + + aio_timer_init(new_context, &s->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + curl_multi_timeout_do, s); + + assert(!s->multi); + s->multi = curl_multi_init(); + s->aio_context = new_context; + curl_multi_setopt(s->multi, CURLMOPT_SOCKETFUNCTION, curl_sock_cb); +#ifdef NEED_CURL_TIMER_CALLBACK + curl_multi_setopt(s->multi, CURLMOPT_TIMERDATA, s); + curl_multi_setopt(s->multi, CURLMOPT_TIMERFUNCTION, curl_timer_cb); +#endif +} + +static QemuOptsList runtime_opts = { + .name = "curl", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = CURL_BLOCK_OPT_URL, + .type = QEMU_OPT_STRING, + .help = "URL to open", + }, + { + .name = CURL_BLOCK_OPT_READAHEAD, + .type = QEMU_OPT_SIZE, + .help = "Readahead size", + }, + { + .name = CURL_BLOCK_OPT_SSLVERIFY, + .type = QEMU_OPT_BOOL, + .help = "Verify SSL certificate" + }, + { + .name = CURL_BLOCK_OPT_TIMEOUT, + .type = QEMU_OPT_NUMBER, + .help = "Curl timeout" + }, + { + .name = CURL_BLOCK_OPT_COOKIE, + .type = QEMU_OPT_STRING, + .help = "Pass the cookie or list of cookies with each request" + }, + { /* end of list */ } + }, +}; + +static int curl_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVCURLState *s = bs->opaque; + CURLState *state = NULL; + QemuOpts *opts; + Error *local_err = NULL; + const char *file; + const char *cookie; + double d; + + static int inited = 0; + + if (flags & BDRV_O_RDWR) { + error_setg(errp, "curl block device does not support writes"); + return -EROFS; + } + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out_noclean; + } + + s->readahead_size = qemu_opt_get_size(opts, CURL_BLOCK_OPT_READAHEAD, + READ_AHEAD_DEFAULT); + if ((s->readahead_size & 0x1ff) != 0) { + error_setg(errp, "HTTP_READAHEAD_SIZE %zd is not a multiple of 512", + s->readahead_size); + goto out_noclean; + } + + s->timeout = qemu_opt_get_number(opts, CURL_BLOCK_OPT_TIMEOUT, + CURL_TIMEOUT_DEFAULT); + if (s->timeout > CURL_TIMEOUT_MAX) { + error_setg(errp, "timeout parameter is too large or negative"); + goto out_noclean; + } + + s->sslverify = qemu_opt_get_bool(opts, CURL_BLOCK_OPT_SSLVERIFY, true); + + cookie = qemu_opt_get(opts, CURL_BLOCK_OPT_COOKIE); + s->cookie = g_strdup(cookie); + + file = qemu_opt_get(opts, CURL_BLOCK_OPT_URL); + if (file == NULL) { + error_setg(errp, "curl block driver requires an 'url' option"); + goto out_noclean; + } + + if (!inited) { + curl_global_init(CURL_GLOBAL_ALL); + inited = 1; + } + + DPRINTF("CURL: Opening %s\n", file); + s->aio_context = bdrv_get_aio_context(bs); + s->url = g_strdup(file); + state = curl_init_state(bs, s); + if (!state) + goto out_noclean; + + // Get file size + + s->accept_range = false; + curl_easy_setopt(state->curl, CURLOPT_NOBODY, 1); + curl_easy_setopt(state->curl, CURLOPT_HEADERFUNCTION, + curl_header_cb); + curl_easy_setopt(state->curl, CURLOPT_HEADERDATA, s); + if (curl_easy_perform(state->curl)) + goto out; + curl_easy_getinfo(state->curl, CURLINFO_CONTENT_LENGTH_DOWNLOAD, &d); + if (d) + s->len = (size_t)d; + else if(!s->len) + goto out; + if ((!strncasecmp(s->url, "http://", strlen("http://")) + || !strncasecmp(s->url, "https://", strlen("https://"))) + && !s->accept_range) { + pstrcpy(state->errmsg, CURL_ERROR_SIZE, + "Server does not support 'range' (byte ranges)."); + goto out; + } + DPRINTF("CURL: Size = %zd\n", s->len); + + curl_clean_state(state); + curl_easy_cleanup(state->curl); + state->curl = NULL; + + curl_attach_aio_context(bs, bdrv_get_aio_context(bs)); + + qemu_opts_del(opts); + return 0; + +out: + error_setg(errp, "CURL: Error opening file: %s", state->errmsg); + curl_easy_cleanup(state->curl); + state->curl = NULL; +out_noclean: + g_free(s->cookie); + g_free(s->url); + qemu_opts_del(opts); + return -EINVAL; +} + +static const AIOCBInfo curl_aiocb_info = { + .aiocb_size = sizeof(CURLAIOCB), +}; + + +static void curl_readv_bh_cb(void *p) +{ + CURLState *state; + int running; + + CURLAIOCB *acb = p; + BDRVCURLState *s = acb->common.bs->opaque; + + qemu_bh_delete(acb->bh); + acb->bh = NULL; + + size_t start = acb->sector_num * SECTOR_SIZE; + size_t end; + + // In case we have the requested data already (e.g. read-ahead), + // we can just call the callback and be done. + switch (curl_find_buf(s, start, acb->nb_sectors * SECTOR_SIZE, acb)) { + case FIND_RET_OK: + qemu_aio_unref(acb); + // fall through + case FIND_RET_WAIT: + return; + default: + break; + } + + // No cache found, so let's start a new request + state = curl_init_state(acb->common.bs, s); + if (!state) { + acb->common.cb(acb->common.opaque, -EIO); + qemu_aio_unref(acb); + return; + } + + acb->start = 0; + acb->end = (acb->nb_sectors * SECTOR_SIZE); + + state->buf_off = 0; + g_free(state->orig_buf); + state->buf_start = start; + state->buf_len = acb->end + s->readahead_size; + end = MIN(start + state->buf_len, s->len) - 1; + state->orig_buf = g_try_malloc(state->buf_len); + if (state->buf_len && state->orig_buf == NULL) { + curl_clean_state(state); + acb->common.cb(acb->common.opaque, -ENOMEM); + qemu_aio_unref(acb); + return; + } + state->acb[0] = acb; + + snprintf(state->range, 127, "%zd-%zd", start, end); + DPRINTF("CURL (AIO): Reading %d at %zd (%s)\n", + (acb->nb_sectors * SECTOR_SIZE), start, state->range); + curl_easy_setopt(state->curl, CURLOPT_RANGE, state->range); + + curl_multi_add_handle(s->multi, state->curl); + + /* Tell curl it needs to kick things off */ + curl_multi_socket_action(s->multi, CURL_SOCKET_TIMEOUT, 0, &running); +} + +static BlockAIOCB *curl_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + CURLAIOCB *acb; + + acb = qemu_aio_get(&curl_aiocb_info, bs, cb, opaque); + + acb->qiov = qiov; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), curl_readv_bh_cb, acb); + qemu_bh_schedule(acb->bh); + return &acb->common; +} + +static void curl_close(BlockDriverState *bs) +{ + BDRVCURLState *s = bs->opaque; + + DPRINTF("CURL: Close\n"); + curl_detach_aio_context(bs); + + g_free(s->cookie); + g_free(s->url); +} + +static int64_t curl_getlength(BlockDriverState *bs) +{ + BDRVCURLState *s = bs->opaque; + return s->len; +} + +static BlockDriver bdrv_http = { + .format_name = "http", + .protocol_name = "http", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, +}; + +static BlockDriver bdrv_https = { + .format_name = "https", + .protocol_name = "https", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, +}; + +static BlockDriver bdrv_ftp = { + .format_name = "ftp", + .protocol_name = "ftp", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, +}; + +static BlockDriver bdrv_ftps = { + .format_name = "ftps", + .protocol_name = "ftps", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, +}; + +static BlockDriver bdrv_tftp = { + .format_name = "tftp", + .protocol_name = "tftp", + + .instance_size = sizeof(BDRVCURLState), + .bdrv_parse_filename = curl_parse_filename, + .bdrv_file_open = curl_open, + .bdrv_close = curl_close, + .bdrv_getlength = curl_getlength, + + .bdrv_aio_readv = curl_aio_readv, + + .bdrv_detach_aio_context = curl_detach_aio_context, + .bdrv_attach_aio_context = curl_attach_aio_context, +}; + +static void curl_block_init(void) +{ + bdrv_register(&bdrv_http); + bdrv_register(&bdrv_https); + bdrv_register(&bdrv_ftp); + bdrv_register(&bdrv_ftps); + bdrv_register(&bdrv_tftp); +} + +block_init(curl_block_init); diff --git a/src/block/dmg.c b/src/block/dmg.c new file mode 100644 index 0000000..546a6f5 --- /dev/null +++ b/src/block/dmg.c @@ -0,0 +1,725 @@ +/* + * QEMU Block driver for DMG images + * + * Copyright (c) 2004 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/bswap.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include <zlib.h> +#ifdef CONFIG_BZIP2 +#include <bzlib.h> +#endif +#include <glib.h> + +enum { + /* Limit chunk sizes to prevent unreasonable amounts of memory being used + * or truncating when converting to 32-bit types + */ + DMG_LENGTHS_MAX = 64 * 1024 * 1024, /* 64 MB */ + DMG_SECTORCOUNTS_MAX = DMG_LENGTHS_MAX / 512, +}; + +typedef struct BDRVDMGState { + CoMutex lock; + /* each chunk contains a certain number of sectors, + * offsets[i] is the offset in the .dmg file, + * lengths[i] is the length of the compressed chunk, + * sectors[i] is the sector beginning at offsets[i], + * sectorcounts[i] is the number of sectors in that chunk, + * the sectors array is ordered + * 0<=i<n_chunks */ + + uint32_t n_chunks; + uint32_t* types; + uint64_t* offsets; + uint64_t* lengths; + uint64_t* sectors; + uint64_t* sectorcounts; + uint32_t current_chunk; + uint8_t *compressed_chunk; + uint8_t *uncompressed_chunk; + z_stream zstream; +#ifdef CONFIG_BZIP2 + bz_stream bzstream; +#endif +} BDRVDMGState; + +static int dmg_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + int len; + + if (!filename) { + return 0; + } + + len = strlen(filename); + if (len > 4 && !strcmp(filename + len - 4, ".dmg")) { + return 2; + } + return 0; +} + +static int read_uint64(BlockDriverState *bs, int64_t offset, uint64_t *result) +{ + uint64_t buffer; + int ret; + + ret = bdrv_pread(bs->file->bs, offset, &buffer, 8); + if (ret < 0) { + return ret; + } + + *result = be64_to_cpu(buffer); + return 0; +} + +static int read_uint32(BlockDriverState *bs, int64_t offset, uint32_t *result) +{ + uint32_t buffer; + int ret; + + ret = bdrv_pread(bs->file->bs, offset, &buffer, 4); + if (ret < 0) { + return ret; + } + + *result = be32_to_cpu(buffer); + return 0; +} + +static inline uint64_t buff_read_uint64(const uint8_t *buffer, int64_t offset) +{ + return be64_to_cpu(*(uint64_t *)&buffer[offset]); +} + +static inline uint32_t buff_read_uint32(const uint8_t *buffer, int64_t offset) +{ + return be32_to_cpu(*(uint32_t *)&buffer[offset]); +} + +/* Increase max chunk sizes, if necessary. This function is used to calculate + * the buffer sizes needed for compressed/uncompressed chunk I/O. + */ +static void update_max_chunk_size(BDRVDMGState *s, uint32_t chunk, + uint32_t *max_compressed_size, + uint32_t *max_sectors_per_chunk) +{ + uint32_t compressed_size = 0; + uint32_t uncompressed_sectors = 0; + + switch (s->types[chunk]) { + case 0x80000005: /* zlib compressed */ + case 0x80000006: /* bzip2 compressed */ + compressed_size = s->lengths[chunk]; + uncompressed_sectors = s->sectorcounts[chunk]; + break; + case 1: /* copy */ + uncompressed_sectors = (s->lengths[chunk] + 511) / 512; + break; + case 2: /* zero */ + /* as the all-zeroes block may be large, it is treated specially: the + * sector is not copied from a large buffer, a simple memset is used + * instead. Therefore uncompressed_sectors does not need to be set. */ + break; + } + + if (compressed_size > *max_compressed_size) { + *max_compressed_size = compressed_size; + } + if (uncompressed_sectors > *max_sectors_per_chunk) { + *max_sectors_per_chunk = uncompressed_sectors; + } +} + +static int64_t dmg_find_koly_offset(BlockDriverState *file_bs, Error **errp) +{ + int64_t length; + int64_t offset = 0; + uint8_t buffer[515]; + int i, ret; + + /* bdrv_getlength returns a multiple of block size (512), rounded up. Since + * dmg images can have odd sizes, try to look for the "koly" magic which + * marks the begin of the UDIF trailer (512 bytes). This magic can be found + * in the last 511 bytes of the second-last sector or the first 4 bytes of + * the last sector (search space: 515 bytes) */ + length = bdrv_getlength(file_bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Failed to get file size while reading UDIF trailer"); + return length; + } else if (length < 512) { + error_setg(errp, "dmg file must be at least 512 bytes long"); + return -EINVAL; + } + if (length > 511 + 512) { + offset = length - 511 - 512; + } + length = length < 515 ? length : 515; + ret = bdrv_pread(file_bs, offset, buffer, length); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed while reading UDIF trailer"); + return ret; + } + for (i = 0; i < length - 3; i++) { + if (buffer[i] == 'k' && buffer[i+1] == 'o' && + buffer[i+2] == 'l' && buffer[i+3] == 'y') { + return offset + i; + } + } + error_setg(errp, "Could not locate UDIF trailer in dmg file"); + return -EINVAL; +} + +/* used when building the sector table */ +typedef struct DmgHeaderState { + /* used internally by dmg_read_mish_block to remember offsets of blocks + * across calls */ + uint64_t data_fork_offset; + /* exported for dmg_open */ + uint32_t max_compressed_size; + uint32_t max_sectors_per_chunk; +} DmgHeaderState; + +static bool dmg_is_known_block_type(uint32_t entry_type) +{ + switch (entry_type) { + case 0x00000001: /* uncompressed */ + case 0x00000002: /* zeroes */ + case 0x80000005: /* zlib */ +#ifdef CONFIG_BZIP2 + case 0x80000006: /* bzip2 */ +#endif + return true; + default: + return false; + } +} + +static int dmg_read_mish_block(BDRVDMGState *s, DmgHeaderState *ds, + uint8_t *buffer, uint32_t count) +{ + uint32_t type, i; + int ret; + size_t new_size; + uint32_t chunk_count; + int64_t offset = 0; + uint64_t data_offset; + uint64_t in_offset = ds->data_fork_offset; + uint64_t out_offset; + + type = buff_read_uint32(buffer, offset); + /* skip data that is not a valid MISH block (invalid magic or too small) */ + if (type != 0x6d697368 || count < 244) { + /* assume success for now */ + return 0; + } + + /* chunk offsets are relative to this sector number */ + out_offset = buff_read_uint64(buffer, offset + 8); + + /* location in data fork for (compressed) blob (in bytes) */ + data_offset = buff_read_uint64(buffer, offset + 0x18); + in_offset += data_offset; + + /* move to begin of chunk entries */ + offset += 204; + + chunk_count = (count - 204) / 40; + new_size = sizeof(uint64_t) * (s->n_chunks + chunk_count); + s->types = g_realloc(s->types, new_size / 2); + s->offsets = g_realloc(s->offsets, new_size); + s->lengths = g_realloc(s->lengths, new_size); + s->sectors = g_realloc(s->sectors, new_size); + s->sectorcounts = g_realloc(s->sectorcounts, new_size); + + for (i = s->n_chunks; i < s->n_chunks + chunk_count; i++) { + s->types[i] = buff_read_uint32(buffer, offset); + if (!dmg_is_known_block_type(s->types[i])) { + chunk_count--; + i--; + offset += 40; + continue; + } + + /* sector number */ + s->sectors[i] = buff_read_uint64(buffer, offset + 8); + s->sectors[i] += out_offset; + + /* sector count */ + s->sectorcounts[i] = buff_read_uint64(buffer, offset + 0x10); + + /* all-zeroes sector (type 2) does not need to be "uncompressed" and can + * therefore be unbounded. */ + if (s->types[i] != 2 && s->sectorcounts[i] > DMG_SECTORCOUNTS_MAX) { + error_report("sector count %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->sectorcounts[i], i, DMG_SECTORCOUNTS_MAX); + ret = -EINVAL; + goto fail; + } + + /* offset in (compressed) data fork */ + s->offsets[i] = buff_read_uint64(buffer, offset + 0x18); + s->offsets[i] += in_offset; + + /* length in (compressed) data fork */ + s->lengths[i] = buff_read_uint64(buffer, offset + 0x20); + + if (s->lengths[i] > DMG_LENGTHS_MAX) { + error_report("length %" PRIu64 " for chunk %" PRIu32 + " is larger than max (%u)", + s->lengths[i], i, DMG_LENGTHS_MAX); + ret = -EINVAL; + goto fail; + } + + update_max_chunk_size(s, i, &ds->max_compressed_size, + &ds->max_sectors_per_chunk); + offset += 40; + } + s->n_chunks += chunk_count; + return 0; + +fail: + return ret; +} + +static int dmg_read_resource_fork(BlockDriverState *bs, DmgHeaderState *ds, + uint64_t info_begin, uint64_t info_length) +{ + BDRVDMGState *s = bs->opaque; + int ret; + uint32_t count, rsrc_data_offset; + uint8_t *buffer = NULL; + uint64_t info_end; + uint64_t offset; + + /* read offset from begin of resource fork (info_begin) to resource data */ + ret = read_uint32(bs, info_begin, &rsrc_data_offset); + if (ret < 0) { + goto fail; + } else if (rsrc_data_offset > info_length) { + ret = -EINVAL; + goto fail; + } + + /* read length of resource data */ + ret = read_uint32(bs, info_begin + 8, &count); + if (ret < 0) { + goto fail; + } else if (count == 0 || rsrc_data_offset + count > info_length) { + ret = -EINVAL; + goto fail; + } + + /* begin of resource data (consisting of one or more resources) */ + offset = info_begin + rsrc_data_offset; + + /* end of resource data (there is possibly a following resource map + * which will be ignored). */ + info_end = offset + count; + + /* read offsets (mish blocks) from one or more resources in resource data */ + while (offset < info_end) { + /* size of following resource */ + ret = read_uint32(bs, offset, &count); + if (ret < 0) { + goto fail; + } else if (count == 0 || count > info_end - offset) { + ret = -EINVAL; + goto fail; + } + offset += 4; + + buffer = g_realloc(buffer, count); + ret = bdrv_pread(bs->file->bs, offset, buffer, count); + if (ret < 0) { + goto fail; + } + + ret = dmg_read_mish_block(s, ds, buffer, count); + if (ret < 0) { + goto fail; + } + /* advance offset by size of resource */ + offset += count; + } + ret = 0; + +fail: + g_free(buffer); + return ret; +} + +static int dmg_read_plist_xml(BlockDriverState *bs, DmgHeaderState *ds, + uint64_t info_begin, uint64_t info_length) +{ + BDRVDMGState *s = bs->opaque; + int ret; + uint8_t *buffer = NULL; + char *data_begin, *data_end; + + /* Have at least some length to avoid NULL for g_malloc. Attempt to set a + * safe upper cap on the data length. A test sample had a XML length of + * about 1 MiB. */ + if (info_length == 0 || info_length > 16 * 1024 * 1024) { + ret = -EINVAL; + goto fail; + } + + buffer = g_malloc(info_length + 1); + buffer[info_length] = '\0'; + ret = bdrv_pread(bs->file->bs, info_begin, buffer, info_length); + if (ret != info_length) { + ret = -EINVAL; + goto fail; + } + + /* look for <data>...</data>. The data is 284 (0x11c) bytes after base64 + * decode. The actual data element has 431 (0x1af) bytes which includes tabs + * and line feeds. */ + data_end = (char *)buffer; + while ((data_begin = strstr(data_end, "<data>")) != NULL) { + guchar *mish; + gsize out_len = 0; + + data_begin += 6; + data_end = strstr(data_begin, "</data>"); + /* malformed XML? */ + if (data_end == NULL) { + ret = -EINVAL; + goto fail; + } + *data_end++ = '\0'; + mish = g_base64_decode(data_begin, &out_len); + ret = dmg_read_mish_block(s, ds, mish, (uint32_t)out_len); + g_free(mish); + if (ret < 0) { + goto fail; + } + } + ret = 0; + +fail: + g_free(buffer); + return ret; +} + +static int dmg_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVDMGState *s = bs->opaque; + DmgHeaderState ds; + uint64_t rsrc_fork_offset, rsrc_fork_length; + uint64_t plist_xml_offset, plist_xml_length; + int64_t offset; + int ret; + + bs->read_only = 1; + s->n_chunks = 0; + s->offsets = s->lengths = s->sectors = s->sectorcounts = NULL; + /* used by dmg_read_mish_block to keep track of the current I/O position */ + ds.data_fork_offset = 0; + ds.max_compressed_size = 1; + ds.max_sectors_per_chunk = 1; + + /* locate the UDIF trailer */ + offset = dmg_find_koly_offset(bs->file->bs, errp); + if (offset < 0) { + ret = offset; + goto fail; + } + + /* offset of data fork (DataForkOffset) */ + ret = read_uint64(bs, offset + 0x18, &ds.data_fork_offset); + if (ret < 0) { + goto fail; + } else if (ds.data_fork_offset > offset) { + ret = -EINVAL; + goto fail; + } + + /* offset of resource fork (RsrcForkOffset) */ + ret = read_uint64(bs, offset + 0x28, &rsrc_fork_offset); + if (ret < 0) { + goto fail; + } + ret = read_uint64(bs, offset + 0x30, &rsrc_fork_length); + if (ret < 0) { + goto fail; + } + if (rsrc_fork_offset >= offset || + rsrc_fork_length > offset - rsrc_fork_offset) { + ret = -EINVAL; + goto fail; + } + /* offset of property list (XMLOffset) */ + ret = read_uint64(bs, offset + 0xd8, &plist_xml_offset); + if (ret < 0) { + goto fail; + } + ret = read_uint64(bs, offset + 0xe0, &plist_xml_length); + if (ret < 0) { + goto fail; + } + if (plist_xml_offset >= offset || + plist_xml_length > offset - plist_xml_offset) { + ret = -EINVAL; + goto fail; + } + ret = read_uint64(bs, offset + 0x1ec, (uint64_t *)&bs->total_sectors); + if (ret < 0) { + goto fail; + } + if (bs->total_sectors < 0) { + ret = -EINVAL; + goto fail; + } + if (rsrc_fork_length != 0) { + ret = dmg_read_resource_fork(bs, &ds, + rsrc_fork_offset, rsrc_fork_length); + if (ret < 0) { + goto fail; + } + } else if (plist_xml_length != 0) { + ret = dmg_read_plist_xml(bs, &ds, plist_xml_offset, plist_xml_length); + if (ret < 0) { + goto fail; + } + } else { + ret = -EINVAL; + goto fail; + } + + /* initialize zlib engine */ + s->compressed_chunk = qemu_try_blockalign(bs->file->bs, + ds.max_compressed_size + 1); + s->uncompressed_chunk = qemu_try_blockalign(bs->file->bs, + 512 * ds.max_sectors_per_chunk); + if (s->compressed_chunk == NULL || s->uncompressed_chunk == NULL) { + ret = -ENOMEM; + goto fail; + } + + if (inflateInit(&s->zstream) != Z_OK) { + ret = -EINVAL; + goto fail; + } + + s->current_chunk = s->n_chunks; + + qemu_co_mutex_init(&s->lock); + return 0; + +fail: + g_free(s->types); + g_free(s->offsets); + g_free(s->lengths); + g_free(s->sectors); + g_free(s->sectorcounts); + qemu_vfree(s->compressed_chunk); + qemu_vfree(s->uncompressed_chunk); + return ret; +} + +static inline int is_sector_in_chunk(BDRVDMGState* s, + uint32_t chunk_num, uint64_t sector_num) +{ + if (chunk_num >= s->n_chunks || s->sectors[chunk_num] > sector_num || + s->sectors[chunk_num] + s->sectorcounts[chunk_num] <= sector_num) { + return 0; + } else { + return -1; + } +} + +static inline uint32_t search_chunk(BDRVDMGState *s, uint64_t sector_num) +{ + /* binary search */ + uint32_t chunk1 = 0, chunk2 = s->n_chunks, chunk3; + while (chunk1 != chunk2) { + chunk3 = (chunk1 + chunk2) / 2; + if (s->sectors[chunk3] > sector_num) { + chunk2 = chunk3; + } else if (s->sectors[chunk3] + s->sectorcounts[chunk3] > sector_num) { + return chunk3; + } else { + chunk1 = chunk3; + } + } + return s->n_chunks; /* error */ +} + +static inline int dmg_read_chunk(BlockDriverState *bs, uint64_t sector_num) +{ + BDRVDMGState *s = bs->opaque; + + if (!is_sector_in_chunk(s, s->current_chunk, sector_num)) { + int ret; + uint32_t chunk = search_chunk(s, sector_num); +#ifdef CONFIG_BZIP2 + uint64_t total_out; +#endif + + if (chunk >= s->n_chunks) { + return -1; + } + + s->current_chunk = s->n_chunks; + switch (s->types[chunk]) { /* block entry type */ + case 0x80000005: { /* zlib compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + s->zstream.next_in = s->compressed_chunk; + s->zstream.avail_in = s->lengths[chunk]; + s->zstream.next_out = s->uncompressed_chunk; + s->zstream.avail_out = 512 * s->sectorcounts[chunk]; + ret = inflateReset(&s->zstream); + if (ret != Z_OK) { + return -1; + } + ret = inflate(&s->zstream, Z_FINISH); + if (ret != Z_STREAM_END || + s->zstream.total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; } +#ifdef CONFIG_BZIP2 + case 0x80000006: /* bzip2 compressed */ + /* we need to buffer, because only the chunk as whole can be + * inflated. */ + ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + s->compressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + + ret = BZ2_bzDecompressInit(&s->bzstream, 0, 0); + if (ret != BZ_OK) { + return -1; + } + s->bzstream.next_in = (char *)s->compressed_chunk; + s->bzstream.avail_in = (unsigned int) s->lengths[chunk]; + s->bzstream.next_out = (char *)s->uncompressed_chunk; + s->bzstream.avail_out = (unsigned int) 512 * s->sectorcounts[chunk]; + ret = BZ2_bzDecompress(&s->bzstream); + total_out = ((uint64_t)s->bzstream.total_out_hi32 << 32) + + s->bzstream.total_out_lo32; + BZ2_bzDecompressEnd(&s->bzstream); + if (ret != BZ_STREAM_END || + total_out != 512 * s->sectorcounts[chunk]) { + return -1; + } + break; +#endif /* CONFIG_BZIP2 */ + case 1: /* copy */ + ret = bdrv_pread(bs->file->bs, s->offsets[chunk], + s->uncompressed_chunk, s->lengths[chunk]); + if (ret != s->lengths[chunk]) { + return -1; + } + break; + case 2: /* zero */ + /* see dmg_read, it is treated specially. No buffer needs to be + * pre-filled, the zeroes can be set directly. */ + break; + } + s->current_chunk = chunk; + } + return 0; +} + +static int dmg_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVDMGState *s = bs->opaque; + int i; + + for (i = 0; i < nb_sectors; i++) { + uint32_t sector_offset_in_chunk; + if (dmg_read_chunk(bs, sector_num + i) != 0) { + return -1; + } + /* Special case: current chunk is all zeroes. Do not perform a memcpy as + * s->uncompressed_chunk may be too small to cover the large all-zeroes + * section. dmg_read_chunk is called to find s->current_chunk */ + if (s->types[s->current_chunk] == 2) { /* all zeroes block entry */ + memset(buf + i * 512, 0, 512); + continue; + } + sector_offset_in_chunk = sector_num + i - s->sectors[s->current_chunk]; + memcpy(buf + i * 512, + s->uncompressed_chunk + sector_offset_in_chunk * 512, 512); + } + return 0; +} + +static coroutine_fn int dmg_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVDMGState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = dmg_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static void dmg_close(BlockDriverState *bs) +{ + BDRVDMGState *s = bs->opaque; + + g_free(s->types); + g_free(s->offsets); + g_free(s->lengths); + g_free(s->sectors); + g_free(s->sectorcounts); + qemu_vfree(s->compressed_chunk); + qemu_vfree(s->uncompressed_chunk); + + inflateEnd(&s->zstream); +} + +static BlockDriver bdrv_dmg = { + .format_name = "dmg", + .instance_size = sizeof(BDRVDMGState), + .bdrv_probe = dmg_probe, + .bdrv_open = dmg_open, + .bdrv_read = dmg_co_read, + .bdrv_close = dmg_close, +}; + +static void bdrv_dmg_init(void) +{ + bdrv_register(&bdrv_dmg); +} + +block_init(bdrv_dmg_init); diff --git a/src/block/gluster.c b/src/block/gluster.c new file mode 100644 index 0000000..0857c14 --- /dev/null +++ b/src/block/gluster.c @@ -0,0 +1,813 @@ +/* + * GlusterFS backend for QEMU + * + * Copyright (C) 2012 Bharata B Rao <bharata@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + * + */ +#include <glusterfs/api/glfs.h> +#include "block/block_int.h" +#include "qemu/uri.h" + +typedef struct GlusterAIOCB { + int64_t size; + int ret; + QEMUBH *bh; + Coroutine *coroutine; + AioContext *aio_context; +} GlusterAIOCB; + +typedef struct BDRVGlusterState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterState; + +typedef struct GlusterConf { + char *server; + int port; + char *volname; + char *image; + char *transport; +} GlusterConf; + +static void qemu_gluster_gconf_free(GlusterConf *gconf) +{ + if (gconf) { + g_free(gconf->server); + g_free(gconf->volname); + g_free(gconf->image); + g_free(gconf->transport); + g_free(gconf); + } +} + +static int parse_volume_options(GlusterConf *gconf, char *path) +{ + char *p, *q; + + if (!path) { + return -EINVAL; + } + + /* volume */ + p = q = path + strspn(path, "/"); + p += strcspn(p, "/"); + if (*p == '\0') { + return -EINVAL; + } + gconf->volname = g_strndup(q, p - q); + + /* image */ + p += strspn(p, "/"); + if (*p == '\0') { + return -EINVAL; + } + gconf->image = g_strdup(p); + return 0; +} + +/* + * file=gluster[+transport]://[server[:port]]/volname/image[?socket=...] + * + * 'gluster' is the protocol. + * + * 'transport' specifies the transport type used to connect to gluster + * management daemon (glusterd). Valid transport types are + * tcp, unix and rdma. If a transport type isn't specified, then tcp + * type is assumed. + * + * 'server' specifies the server where the volume file specification for + * the given volume resides. This can be either hostname, ipv4 address + * or ipv6 address. ipv6 address needs to be within square brackets [ ]. + * If transport type is 'unix', then 'server' field should not be specified. + * The 'socket' field needs to be populated with the path to unix domain + * socket. + * + * 'port' is the port number on which glusterd is listening. This is optional + * and if not specified, QEMU will send 0 which will make gluster to use the + * default port. If the transport type is unix, then 'port' should not be + * specified. + * + * 'volname' is the name of the gluster volume which contains the VM image. + * + * 'image' is the path to the actual VM image that resides on gluster volume. + * + * Examples: + * + * file=gluster://1.2.3.4/testvol/a.img + * file=gluster+tcp://1.2.3.4/testvol/a.img + * file=gluster+tcp://1.2.3.4:24007/testvol/dir/a.img + * file=gluster+tcp://[1:2:3:4:5:6:7:8]/testvol/dir/a.img + * file=gluster+tcp://[1:2:3:4:5:6:7:8]:24007/testvol/dir/a.img + * file=gluster+tcp://server.domain.com:24007/testvol/dir/a.img + * file=gluster+unix:///testvol/dir/a.img?socket=/tmp/glusterd.socket + * file=gluster+rdma://1.2.3.4:24007/testvol/a.img + */ +static int qemu_gluster_parseuri(GlusterConf *gconf, const char *filename) +{ + URI *uri; + QueryParams *qp = NULL; + bool is_unix = false; + int ret = 0; + + uri = uri_parse(filename); + if (!uri) { + return -EINVAL; + } + + /* transport */ + if (!uri->scheme || !strcmp(uri->scheme, "gluster")) { + gconf->transport = g_strdup("tcp"); + } else if (!strcmp(uri->scheme, "gluster+tcp")) { + gconf->transport = g_strdup("tcp"); + } else if (!strcmp(uri->scheme, "gluster+unix")) { + gconf->transport = g_strdup("unix"); + is_unix = true; + } else if (!strcmp(uri->scheme, "gluster+rdma")) { + gconf->transport = g_strdup("rdma"); + } else { + ret = -EINVAL; + goto out; + } + + ret = parse_volume_options(gconf, uri->path); + if (ret < 0) { + goto out; + } + + qp = query_params_parse(uri->query); + if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { + ret = -EINVAL; + goto out; + } + + if (is_unix) { + if (uri->server || uri->port) { + ret = -EINVAL; + goto out; + } + if (strcmp(qp->p[0].name, "socket")) { + ret = -EINVAL; + goto out; + } + gconf->server = g_strdup(qp->p[0].value); + } else { + gconf->server = g_strdup(uri->server ? uri->server : "localhost"); + gconf->port = uri->port; + } + +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + return ret; +} + +static struct glfs *qemu_gluster_init(GlusterConf *gconf, const char *filename, + Error **errp) +{ + struct glfs *glfs = NULL; + int ret; + int old_errno; + + ret = qemu_gluster_parseuri(gconf, filename); + if (ret < 0) { + error_setg(errp, "Usage: file=gluster[+transport]://[server[:port]]/" + "volname/image[?socket=...]"); + errno = -ret; + goto out; + } + + glfs = glfs_new(gconf->volname); + if (!glfs) { + goto out; + } + + ret = glfs_set_volfile_server(glfs, gconf->transport, gconf->server, + gconf->port); + if (ret < 0) { + goto out; + } + + /* + * TODO: Use GF_LOG_ERROR instead of hard code value of 4 here when + * GlusterFS makes GF_LOG_* macros available to libgfapi users. + */ + ret = glfs_set_logging(glfs, "-", 4); + if (ret < 0) { + goto out; + } + + ret = glfs_init(glfs); + if (ret) { + error_setg_errno(errp, errno, + "Gluster connection failed for server=%s port=%d " + "volume=%s image=%s transport=%s", gconf->server, + gconf->port, gconf->volname, gconf->image, + gconf->transport); + + /* glfs_init sometimes doesn't set errno although docs suggest that */ + if (errno == 0) + errno = EINVAL; + + goto out; + } + return glfs; + +out: + if (glfs) { + old_errno = errno; + glfs_fini(glfs); + errno = old_errno; + } + return NULL; +} + +static void qemu_gluster_complete_aio(void *opaque) +{ + GlusterAIOCB *acb = (GlusterAIOCB *)opaque; + + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_coroutine_enter(acb->coroutine, NULL); +} + +/* + * AIO callback routine called from GlusterFS thread. + */ +static void gluster_finish_aiocb(struct glfs_fd *fd, ssize_t ret, void *arg) +{ + GlusterAIOCB *acb = (GlusterAIOCB *)arg; + + if (!ret || ret == acb->size) { + acb->ret = 0; /* Success */ + } else if (ret < 0) { + acb->ret = ret; /* Read/Write failed */ + } else { + acb->ret = -EIO; /* Partial read/write - fail it */ + } + + acb->bh = aio_bh_new(acb->aio_context, qemu_gluster_complete_aio, acb); + qemu_bh_schedule(acb->bh); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "gluster", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the gluster image", + }, + { /* end of list */ } + }, +}; + +static void qemu_gluster_parse_flags(int bdrv_flags, int *open_flags) +{ + assert(open_flags != NULL); + + *open_flags |= O_BINARY; + + if (bdrv_flags & BDRV_O_RDWR) { + *open_flags |= O_RDWR; + } else { + *open_flags |= O_RDONLY; + } + + if ((bdrv_flags & BDRV_O_NOCACHE)) { + *open_flags |= O_DIRECT; + } +} + +static int qemu_gluster_open(BlockDriverState *bs, QDict *options, + int bdrv_flags, Error **errp) +{ + BDRVGlusterState *s = bs->opaque; + int open_flags = 0; + int ret = 0; + GlusterConf *gconf = g_new0(GlusterConf, 1); + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + filename = qemu_opt_get(opts, "filename"); + + s->glfs = qemu_gluster_init(gconf, filename, errp); + if (!s->glfs) { + ret = -errno; + goto out; + } + + qemu_gluster_parse_flags(bdrv_flags, &open_flags); + + s->fd = glfs_open(s->glfs, gconf->image, open_flags); + if (!s->fd) { + ret = -errno; + } + +out: + qemu_opts_del(opts); + qemu_gluster_gconf_free(gconf); + if (!ret) { + return ret; + } + if (s->fd) { + glfs_close(s->fd); + } + if (s->glfs) { + glfs_fini(s->glfs); + } + return ret; +} + +typedef struct BDRVGlusterReopenState { + struct glfs *glfs; + struct glfs_fd *fd; +} BDRVGlusterReopenState; + + +static int qemu_gluster_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + int ret = 0; + BDRVGlusterReopenState *reop_s; + GlusterConf *gconf = NULL; + int open_flags = 0; + + assert(state != NULL); + assert(state->bs != NULL); + + state->opaque = g_new0(BDRVGlusterReopenState, 1); + reop_s = state->opaque; + + qemu_gluster_parse_flags(state->flags, &open_flags); + + gconf = g_new0(GlusterConf, 1); + + reop_s->glfs = qemu_gluster_init(gconf, state->bs->filename, errp); + if (reop_s->glfs == NULL) { + ret = -errno; + goto exit; + } + + reop_s->fd = glfs_open(reop_s->glfs, gconf->image, open_flags); + if (reop_s->fd == NULL) { + /* reops->glfs will be cleaned up in _abort */ + ret = -errno; + goto exit; + } + +exit: + /* state->opaque will be freed in either the _abort or _commit */ + qemu_gluster_gconf_free(gconf); + return ret; +} + +static void qemu_gluster_reopen_commit(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + BDRVGlusterState *s = state->bs->opaque; + + + /* close the old */ + if (s->fd) { + glfs_close(s->fd); + } + if (s->glfs) { + glfs_fini(s->glfs); + } + + /* use the newly opened image / connection */ + s->fd = reop_s->fd; + s->glfs = reop_s->glfs; + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + + +static void qemu_gluster_reopen_abort(BDRVReopenState *state) +{ + BDRVGlusterReopenState *reop_s = state->opaque; + + if (reop_s == NULL) { + return; + } + + if (reop_s->fd) { + glfs_close(reop_s->fd); + } + + if (reop_s->glfs) { + glfs_fini(reop_s->glfs); + } + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +#ifdef CONFIG_GLUSTERFS_ZEROFILL +static coroutine_fn int qemu_gluster_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + int ret; + GlusterAIOCB acb; + BDRVGlusterState *s = bs->opaque; + off_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb.size = size; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); + + ret = glfs_zerofill_async(s->fd, offset, size, gluster_finish_aiocb, &acb); + if (ret < 0) { + return -errno; + } + + qemu_coroutine_yield(); + return acb.ret; +} + +static inline bool gluster_supports_zerofill(void) +{ + return 1; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return glfs_zerofill(fd, offset, size); +} + +#else +static inline bool gluster_supports_zerofill(void) +{ + return 0; +} + +static inline int qemu_gluster_zerofill(struct glfs_fd *fd, int64_t offset, + int64_t size) +{ + return 0; +} +#endif + +static int qemu_gluster_create(const char *filename, + QemuOpts *opts, Error **errp) +{ + struct glfs *glfs; + struct glfs_fd *fd; + int ret = 0; + int prealloc = 0; + int64_t total_size = 0; + char *tmp = NULL; + GlusterConf *gconf = g_new0(GlusterConf, 1); + + glfs = qemu_gluster_init(gconf, filename, errp); + if (!glfs) { + ret = -errno; + goto out; + } + + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + + tmp = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + if (!tmp || !strcmp(tmp, "off")) { + prealloc = 0; + } else if (!strcmp(tmp, "full") && + gluster_supports_zerofill()) { + prealloc = 1; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'" + " or GlusterFS doesn't support zerofill API", + tmp); + ret = -EINVAL; + goto out; + } + + fd = glfs_creat(glfs, gconf->image, + O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, S_IRUSR | S_IWUSR); + if (!fd) { + ret = -errno; + } else { + if (!glfs_ftruncate(fd, total_size)) { + if (prealloc && qemu_gluster_zerofill(fd, 0, total_size)) { + ret = -errno; + } + } else { + ret = -errno; + } + + if (glfs_close(fd) != 0) { + ret = -errno; + } + } +out: + g_free(tmp); + qemu_gluster_gconf_free(gconf); + if (glfs) { + glfs_fini(glfs); + } + return ret; +} + +static coroutine_fn int qemu_gluster_co_rw(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, int write) +{ + int ret; + GlusterAIOCB acb; + BDRVGlusterState *s = bs->opaque; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb.size = size; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); + + if (write) { + ret = glfs_pwritev_async(s->fd, qiov->iov, qiov->niov, offset, 0, + gluster_finish_aiocb, &acb); + } else { + ret = glfs_preadv_async(s->fd, qiov->iov, qiov->niov, offset, 0, + gluster_finish_aiocb, &acb); + } + + if (ret < 0) { + return -errno; + } + + qemu_coroutine_yield(); + return acb.ret; +} + +static int qemu_gluster_truncate(BlockDriverState *bs, int64_t offset) +{ + int ret; + BDRVGlusterState *s = bs->opaque; + + ret = glfs_ftruncate(s->fd, offset); + if (ret < 0) { + return -errno; + } + + return 0; +} + +static coroutine_fn int qemu_gluster_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 0); +} + +static coroutine_fn int qemu_gluster_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + return qemu_gluster_co_rw(bs, sector_num, nb_sectors, qiov, 1); +} + +static coroutine_fn int qemu_gluster_co_flush_to_disk(BlockDriverState *bs) +{ + int ret; + GlusterAIOCB acb; + BDRVGlusterState *s = bs->opaque; + + acb.size = 0; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); + + ret = glfs_fsync_async(s->fd, gluster_finish_aiocb, &acb); + if (ret < 0) { + return -errno; + } + + qemu_coroutine_yield(); + return acb.ret; +} + +#ifdef CONFIG_GLUSTERFS_DISCARD +static coroutine_fn int qemu_gluster_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + GlusterAIOCB acb; + BDRVGlusterState *s = bs->opaque; + size_t size = nb_sectors * BDRV_SECTOR_SIZE; + off_t offset = sector_num * BDRV_SECTOR_SIZE; + + acb.size = 0; + acb.ret = 0; + acb.coroutine = qemu_coroutine_self(); + acb.aio_context = bdrv_get_aio_context(bs); + + ret = glfs_discard_async(s->fd, offset, size, gluster_finish_aiocb, &acb); + if (ret < 0) { + return -errno; + } + + qemu_coroutine_yield(); + return acb.ret; +} +#endif + +static int64_t qemu_gluster_getlength(BlockDriverState *bs) +{ + BDRVGlusterState *s = bs->opaque; + int64_t ret; + + ret = glfs_lseek(s->fd, 0, SEEK_END); + if (ret < 0) { + return -errno; + } else { + return ret; + } +} + +static int64_t qemu_gluster_allocated_file_size(BlockDriverState *bs) +{ + BDRVGlusterState *s = bs->opaque; + struct stat st; + int ret; + + ret = glfs_fstat(s->fd, &st); + if (ret < 0) { + return -errno; + } else { + return st.st_blocks * 512; + } +} + +static void qemu_gluster_close(BlockDriverState *bs) +{ + BDRVGlusterState *s = bs->opaque; + + if (s->fd) { + glfs_close(s->fd); + s->fd = NULL; + } + glfs_fini(s->glfs); +} + +static int qemu_gluster_has_zero_init(BlockDriverState *bs) +{ + /* GlusterFS volume could be backed by a block device */ + return 0; +} + +static QemuOptsList qemu_gluster_create_opts = { + .name = "qemu-gluster-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_gluster_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_gluster = { + .format_name = "gluster", + .protocol_name = "gluster", + .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, + .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, + .bdrv_close = qemu_gluster_close, + .bdrv_create = qemu_gluster_create, + .bdrv_getlength = qemu_gluster_getlength, + .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, +#endif + .create_opts = &qemu_gluster_create_opts, +}; + +static BlockDriver bdrv_gluster_tcp = { + .format_name = "gluster", + .protocol_name = "gluster+tcp", + .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, + .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, + .bdrv_close = qemu_gluster_close, + .bdrv_create = qemu_gluster_create, + .bdrv_getlength = qemu_gluster_getlength, + .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, +#endif + .create_opts = &qemu_gluster_create_opts, +}; + +static BlockDriver bdrv_gluster_unix = { + .format_name = "gluster", + .protocol_name = "gluster+unix", + .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, + .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, + .bdrv_close = qemu_gluster_close, + .bdrv_create = qemu_gluster_create, + .bdrv_getlength = qemu_gluster_getlength, + .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, +#endif + .create_opts = &qemu_gluster_create_opts, +}; + +static BlockDriver bdrv_gluster_rdma = { + .format_name = "gluster", + .protocol_name = "gluster+rdma", + .instance_size = sizeof(BDRVGlusterState), + .bdrv_needs_filename = true, + .bdrv_file_open = qemu_gluster_open, + .bdrv_reopen_prepare = qemu_gluster_reopen_prepare, + .bdrv_reopen_commit = qemu_gluster_reopen_commit, + .bdrv_reopen_abort = qemu_gluster_reopen_abort, + .bdrv_close = qemu_gluster_close, + .bdrv_create = qemu_gluster_create, + .bdrv_getlength = qemu_gluster_getlength, + .bdrv_get_allocated_file_size = qemu_gluster_allocated_file_size, + .bdrv_truncate = qemu_gluster_truncate, + .bdrv_co_readv = qemu_gluster_co_readv, + .bdrv_co_writev = qemu_gluster_co_writev, + .bdrv_co_flush_to_disk = qemu_gluster_co_flush_to_disk, + .bdrv_has_zero_init = qemu_gluster_has_zero_init, +#ifdef CONFIG_GLUSTERFS_DISCARD + .bdrv_co_discard = qemu_gluster_co_discard, +#endif +#ifdef CONFIG_GLUSTERFS_ZEROFILL + .bdrv_co_write_zeroes = qemu_gluster_co_write_zeroes, +#endif + .create_opts = &qemu_gluster_create_opts, +}; + +static void bdrv_gluster_init(void) +{ + bdrv_register(&bdrv_gluster_rdma); + bdrv_register(&bdrv_gluster_unix); + bdrv_register(&bdrv_gluster_tcp); + bdrv_register(&bdrv_gluster); +} + +block_init(bdrv_gluster_init); diff --git a/src/block/io.c b/src/block/io.c new file mode 100644 index 0000000..e00fb5d --- /dev/null +++ b/src/block/io.c @@ -0,0 +1,2758 @@ +/* + * Block layer I/O functions + * + * Copyright (c) 2003 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "trace.h" +#include "sysemu/block-backend.h" +#include "block/blockjob.h" +#include "block/block_int.h" +#include "block/throttle-groups.h" +#include "qemu/error-report.h" + +#define NOT_DONE 0x7fffffff /* used while emulated sync operation in progress */ + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque); +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov); +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags); +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write); +static void coroutine_fn bdrv_co_do_rw(void *opaque); +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags); + +/* throttling disk I/O limits */ +void bdrv_set_io_limits(BlockDriverState *bs, + ThrottleConfig *cfg) +{ + int i; + + throttle_group_config(bs, cfg); + + for (i = 0; i < 2; i++) { + qemu_co_enter_next(&bs->throttled_reqs[i]); + } +} + +/* this function drain all the throttled IOs */ +static bool bdrv_start_throttled_reqs(BlockDriverState *bs) +{ + bool drained = false; + bool enabled = bs->io_limits_enabled; + int i; + + bs->io_limits_enabled = false; + + for (i = 0; i < 2; i++) { + while (qemu_co_enter_next(&bs->throttled_reqs[i])) { + drained = true; + } + } + + bs->io_limits_enabled = enabled; + + return drained; +} + +void bdrv_io_limits_disable(BlockDriverState *bs) +{ + bs->io_limits_enabled = false; + bdrv_start_throttled_reqs(bs); + throttle_group_unregister_bs(bs); +} + +/* should be called before bdrv_set_io_limits if a limit is set */ +void bdrv_io_limits_enable(BlockDriverState *bs, const char *group) +{ + assert(!bs->io_limits_enabled); + throttle_group_register_bs(bs, group); + bs->io_limits_enabled = true; +} + +void bdrv_io_limits_update_group(BlockDriverState *bs, const char *group) +{ + /* this bs is not part of any group */ + if (!bs->throttle_state) { + return; + } + + /* this bs is a part of the same group than the one we want */ + if (!g_strcmp0(throttle_group_get_name(bs), group)) { + return; + } + + /* need to change the group this bs belong to */ + bdrv_io_limits_disable(bs); + bdrv_io_limits_enable(bs, group); +} + +void bdrv_setup_io_funcs(BlockDriver *bdrv) +{ + /* Block drivers without coroutine functions need emulation */ + if (!bdrv->bdrv_co_readv) { + bdrv->bdrv_co_readv = bdrv_co_readv_em; + bdrv->bdrv_co_writev = bdrv_co_writev_em; + + /* bdrv_co_readv_em()/brdv_co_writev_em() work in terms of aio, so if + * the block driver lacks aio we need to emulate that too. + */ + if (!bdrv->bdrv_aio_readv) { + /* add AIO emulation layer */ + bdrv->bdrv_aio_readv = bdrv_aio_readv_em; + bdrv->bdrv_aio_writev = bdrv_aio_writev_em; + } + } +} + +void bdrv_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BlockDriver *drv = bs->drv; + Error *local_err = NULL; + + memset(&bs->bl, 0, sizeof(bs->bl)); + + if (!drv) { + return; + } + + /* Take some limits from the children as a default */ + if (bs->file) { + bdrv_refresh_limits(bs->file->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = bs->file->bs->bl.opt_transfer_length; + bs->bl.max_transfer_length = bs->file->bs->bl.max_transfer_length; + bs->bl.min_mem_alignment = bs->file->bs->bl.min_mem_alignment; + bs->bl.opt_mem_alignment = bs->file->bs->bl.opt_mem_alignment; + } else { + bs->bl.min_mem_alignment = 512; + bs->bl.opt_mem_alignment = getpagesize(); + } + + if (bs->backing) { + bdrv_refresh_limits(bs->backing->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + bs->bl.opt_transfer_length = + MAX(bs->bl.opt_transfer_length, + bs->backing->bs->bl.opt_transfer_length); + bs->bl.max_transfer_length = + MIN_NON_ZERO(bs->bl.max_transfer_length, + bs->backing->bs->bl.max_transfer_length); + bs->bl.opt_mem_alignment = + MAX(bs->bl.opt_mem_alignment, + bs->backing->bs->bl.opt_mem_alignment); + bs->bl.min_mem_alignment = + MAX(bs->bl.min_mem_alignment, + bs->backing->bs->bl.min_mem_alignment); + } + + /* Then let the driver override it */ + if (drv->bdrv_refresh_limits) { + drv->bdrv_refresh_limits(bs, errp); + } +} + +/** + * The copy-on-read flag is actually a reference count so multiple users may + * use the feature without worrying about clobbering its previous state. + * Copy-on-read stays enabled until all users have called to disable it. + */ +void bdrv_enable_copy_on_read(BlockDriverState *bs) +{ + bs->copy_on_read++; +} + +void bdrv_disable_copy_on_read(BlockDriverState *bs) +{ + assert(bs->copy_on_read > 0); + bs->copy_on_read--; +} + +/* Check if any requests are in-flight (including throttled requests) */ +bool bdrv_requests_pending(BlockDriverState *bs) +{ + BdrvChild *child; + + if (!QLIST_EMPTY(&bs->tracked_requests)) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[0])) { + return true; + } + if (!qemu_co_queue_empty(&bs->throttled_reqs[1])) { + return true; + } + + QLIST_FOREACH(child, &bs->children, next) { + if (bdrv_requests_pending(child->bs)) { + return true; + } + } + + return false; +} + +static void bdrv_drain_recurse(BlockDriverState *bs) +{ + BdrvChild *child; + + if (bs->drv && bs->drv->bdrv_drain) { + bs->drv->bdrv_drain(bs); + } + QLIST_FOREACH(child, &bs->children, next) { + bdrv_drain_recurse(child->bs); + } +} + +/* + * Wait for pending requests to complete on a single BlockDriverState subtree, + * and suspend block driver's internal I/O until next request arrives. + * + * Note that unlike bdrv_drain_all(), the caller must hold the BlockDriverState + * AioContext. + * + * Only this BlockDriverState's AioContext is run, so in-flight requests must + * not depend on events in other AioContexts. In that case, use + * bdrv_drain_all() instead. + */ +void bdrv_drain(BlockDriverState *bs) +{ + bool busy = true; + + bdrv_drain_recurse(bs); + while (busy) { + /* Keep iterating */ + bdrv_flush_io_queue(bs); + busy = bdrv_requests_pending(bs); + busy |= aio_poll(bdrv_get_aio_context(bs), busy); + } +} + +/* + * Wait for pending requests to complete across all BlockDriverStates + * + * This function does not flush data to disk, use bdrv_flush_all() for that + * after calling this function. + */ +void bdrv_drain_all(void) +{ + /* Always run first iteration so any pending completion BHs run */ + bool busy = true; + BlockDriverState *bs = NULL; + GSList *aio_ctxs = NULL, *ctx; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_pause(bs->job); + } + aio_context_release(aio_context); + + if (!g_slist_find(aio_ctxs, aio_context)) { + aio_ctxs = g_slist_prepend(aio_ctxs, aio_context); + } + } + + /* Note that completion of an asynchronous I/O operation can trigger any + * number of other I/O operations on other devices---for example a + * coroutine can submit an I/O request to another device in response to + * request completion. Therefore we must keep looping until there was no + * more activity rather than simply draining each device independently. + */ + while (busy) { + busy = false; + + for (ctx = aio_ctxs; ctx != NULL; ctx = ctx->next) { + AioContext *aio_context = ctx->data; + bs = NULL; + + aio_context_acquire(aio_context); + while ((bs = bdrv_next(bs))) { + if (aio_context == bdrv_get_aio_context(bs)) { + bdrv_flush_io_queue(bs); + if (bdrv_requests_pending(bs)) { + busy = true; + aio_poll(aio_context, busy); + } + } + } + busy |= aio_poll(aio_context, false); + aio_context_release(aio_context); + } + } + + bs = NULL; + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + + aio_context_acquire(aio_context); + if (bs->job) { + block_job_resume(bs->job); + } + aio_context_release(aio_context); + } + g_slist_free(aio_ctxs); +} + +/** + * Remove an active request from the tracked requests list + * + * This function should be called when a tracked request is completing. + */ +static void tracked_request_end(BdrvTrackedRequest *req) +{ + if (req->serialising) { + req->bs->serialising_in_flight--; + } + + QLIST_REMOVE(req, list); + qemu_co_queue_restart_all(&req->wait_queue); +} + +/** + * Add an active request to the tracked requests list + */ +static void tracked_request_begin(BdrvTrackedRequest *req, + BlockDriverState *bs, + int64_t offset, + unsigned int bytes, + enum BdrvTrackedRequestType type) +{ + *req = (BdrvTrackedRequest){ + .bs = bs, + .offset = offset, + .bytes = bytes, + .type = type, + .co = qemu_coroutine_self(), + .serialising = false, + .overlap_offset = offset, + .overlap_bytes = bytes, + }; + + qemu_co_queue_init(&req->wait_queue); + + QLIST_INSERT_HEAD(&bs->tracked_requests, req, list); +} + +static void mark_request_serialising(BdrvTrackedRequest *req, uint64_t align) +{ + int64_t overlap_offset = req->offset & ~(align - 1); + unsigned int overlap_bytes = ROUND_UP(req->offset + req->bytes, align) + - overlap_offset; + + if (!req->serialising) { + req->bs->serialising_in_flight++; + req->serialising = true; + } + + req->overlap_offset = MIN(req->overlap_offset, overlap_offset); + req->overlap_bytes = MAX(req->overlap_bytes, overlap_bytes); +} + +/** + * Round a region to cluster boundaries + */ +void bdrv_round_to_clusters(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + int64_t *cluster_sector_num, + int *cluster_nb_sectors) +{ + BlockDriverInfo bdi; + + if (bdrv_get_info(bs, &bdi) < 0 || bdi.cluster_size == 0) { + *cluster_sector_num = sector_num; + *cluster_nb_sectors = nb_sectors; + } else { + int64_t c = bdi.cluster_size / BDRV_SECTOR_SIZE; + *cluster_sector_num = QEMU_ALIGN_DOWN(sector_num, c); + *cluster_nb_sectors = QEMU_ALIGN_UP(sector_num - *cluster_sector_num + + nb_sectors, c); + } +} + +static int bdrv_get_cluster_size(BlockDriverState *bs) +{ + BlockDriverInfo bdi; + int ret; + + ret = bdrv_get_info(bs, &bdi); + if (ret < 0 || bdi.cluster_size == 0) { + return bs->request_alignment; + } else { + return bdi.cluster_size; + } +} + +static bool tracked_request_overlaps(BdrvTrackedRequest *req, + int64_t offset, unsigned int bytes) +{ + /* aaaa bbbb */ + if (offset >= req->overlap_offset + req->overlap_bytes) { + return false; + } + /* bbbb aaaa */ + if (req->overlap_offset >= offset + bytes) { + return false; + } + return true; +} + +static bool coroutine_fn wait_serialising_requests(BdrvTrackedRequest *self) +{ + BlockDriverState *bs = self->bs; + BdrvTrackedRequest *req; + bool retry; + bool waited = false; + + if (!bs->serialising_in_flight) { + return false; + } + + do { + retry = false; + QLIST_FOREACH(req, &bs->tracked_requests, list) { + if (req == self || (!req->serialising && !self->serialising)) { + continue; + } + if (tracked_request_overlaps(req, self->overlap_offset, + self->overlap_bytes)) + { + /* Hitting this means there was a reentrant request, for + * example, a block driver issuing nested requests. This must + * never happen since it means deadlock. + */ + assert(qemu_coroutine_self() != req->co); + + /* If the request is already (indirectly) waiting for us, or + * will wait for us as soon as it wakes up, then just go on + * (instead of producing a deadlock in the former case). */ + if (!req->waiting_for) { + self->waiting_for = req; + qemu_co_queue_wait(&req->wait_queue); + self->waiting_for = NULL; + retry = true; + waited = true; + break; + } + } + } + } while (retry); + + return waited; +} + +static int bdrv_check_byte_request(BlockDriverState *bs, int64_t offset, + size_t size) +{ + if (size > BDRV_REQUEST_MAX_SECTORS << BDRV_SECTOR_BITS) { + return -EIO; + } + + if (!bdrv_is_inserted(bs)) { + return -ENOMEDIUM; + } + + if (offset < 0) { + return -EIO; + } + + return 0; +} + +static int bdrv_check_request(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EIO; + } + + return bdrv_check_byte_request(bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE); +} + +typedef struct RwCo { + BlockDriverState *bs; + int64_t offset; + QEMUIOVector *qiov; + bool is_write; + int ret; + BdrvRequestFlags flags; +} RwCo; + +static void coroutine_fn bdrv_rw_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + if (!rwco->is_write) { + rwco->ret = bdrv_co_do_preadv(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } else { + rwco->ret = bdrv_co_do_pwritev(rwco->bs, rwco->offset, + rwco->qiov->size, rwco->qiov, + rwco->flags); + } +} + +/* + * Process a vectored synchronous request using coroutines + */ +static int bdrv_prwv_co(BlockDriverState *bs, int64_t offset, + QEMUIOVector *qiov, bool is_write, + BdrvRequestFlags flags) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .offset = offset, + .qiov = qiov, + .is_write = is_write, + .ret = NOT_DONE, + .flags = flags, + }; + + /** + * In sync call context, when the vcpu is blocked, this throttling timer + * will not fire; so the I/O throttling function has to be disabled here + * if it has been enabled. + */ + if (bs->io_limits_enabled) { + fprintf(stderr, "Disabling I/O throttling on '%s' due " + "to synchronous I/O.\n", bdrv_get_device_name(bs)); + bdrv_io_limits_disable(bs); + } + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_rw_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_rw_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + return rwco.ret; +} + +/* + * Process a synchronous request using coroutines + */ +static int bdrv_rw_co(BlockDriverState *bs, int64_t sector_num, uint8_t *buf, + int nb_sectors, bool is_write, BdrvRequestFlags flags) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_prwv_co(bs, sector_num << BDRV_SECTOR_BITS, + &qiov, is_write, flags); +} + +/* return < 0 if error. See bdrv_write() for the return codes */ +int bdrv_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, buf, nb_sectors, false, 0); +} + +/* Just like bdrv_read(), but with I/O throttling temporarily disabled */ +int bdrv_read_unthrottled(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + bool enabled; + int ret; + + enabled = bs->io_limits_enabled; + bs->io_limits_enabled = false; + ret = bdrv_read(bs, sector_num, buf, nb_sectors); + bs->io_limits_enabled = enabled; + return ret; +} + +/* Return < 0 if error. Important errors are: + -EIO generic I/O error (may happen for all errors) + -ENOMEDIUM No media inserted. + -EINVAL Invalid sector number or nb_sectors + -EACCES Trying to write a read-only device +*/ +int bdrv_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + return bdrv_rw_co(bs, sector_num, (uint8_t *)buf, nb_sectors, true, 0); +} + +int bdrv_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + return bdrv_rw_co(bs, sector_num, NULL, nb_sectors, true, + BDRV_REQ_ZERO_WRITE | flags); +} + +/* + * Completely zero out a block device with the help of bdrv_write_zeroes. + * The operation is sped up by checking the block status and only writing + * zeroes to the device if they currently do not return zeroes. Optional + * flags are passed through to bdrv_write_zeroes (e.g. BDRV_REQ_MAY_UNMAP). + * + * Returns < 0 on error, 0 on success. For error codes see bdrv_write(). + */ +int bdrv_make_zero(BlockDriverState *bs, BdrvRequestFlags flags) +{ + int64_t target_sectors, ret, nb_sectors, sector_num = 0; + int n; + + target_sectors = bdrv_nb_sectors(bs); + if (target_sectors < 0) { + return target_sectors; + } + + for (;;) { + nb_sectors = MIN(target_sectors - sector_num, BDRV_REQUEST_MAX_SECTORS); + if (nb_sectors <= 0) { + return 0; + } + ret = bdrv_get_block_status(bs, sector_num, nb_sectors, &n); + if (ret < 0) { + error_report("error getting block status at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + if (ret & BDRV_BLOCK_ZERO) { + sector_num += n; + continue; + } + ret = bdrv_write_zeroes(bs, sector_num, n, flags); + if (ret < 0) { + error_report("error writing zeroes at sector %" PRId64 ": %s", + sector_num, strerror(-ret)); + return ret; + } + sector_num += n; + } +} + +int bdrv_pread(BlockDriverState *bs, int64_t offset, void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *)buf, + .iov_len = bytes, + }; + int ret; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + ret = bdrv_prwv_co(bs, offset, &qiov, false, 0); + if (ret < 0) { + return ret; + } + + return bytes; +} + +int bdrv_pwritev(BlockDriverState *bs, int64_t offset, QEMUIOVector *qiov) +{ + int ret; + + ret = bdrv_prwv_co(bs, offset, qiov, true, 0); + if (ret < 0) { + return ret; + } + + return qiov->size; +} + +int bdrv_pwrite(BlockDriverState *bs, int64_t offset, + const void *buf, int bytes) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = bytes, + }; + + if (bytes < 0) { + return -EINVAL; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_pwritev(bs, offset, &qiov); +} + +/* + * Writes to the file and ensures that no writes are reordered across this + * request (acts as a barrier) + * + * Returns 0 on success, -errno in error cases. + */ +int bdrv_pwrite_sync(BlockDriverState *bs, int64_t offset, + const void *buf, int count) +{ + int ret; + + ret = bdrv_pwrite(bs, offset, buf, count); + if (ret < 0) { + return ret; + } + + /* No flush needed for cache modes that already do it */ + if (bs->enable_write_cache) { + bdrv_flush(bs); + } + + return 0; +} + +static int coroutine_fn bdrv_co_do_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + /* Perform I/O through a temporary buffer so that users who scribble over + * their read buffer while the operation is in progress do not end up + * modifying the image file. This is critical for zero-copy guest I/O + * where anything might happen inside guest memory. + */ + void *bounce_buffer; + + BlockDriver *drv = bs->drv; + struct iovec iov; + QEMUIOVector bounce_qiov; + int64_t cluster_sector_num; + int cluster_nb_sectors; + size_t skip_bytes; + int ret; + + /* Cover entire cluster so no additional backing file I/O is required when + * allocating cluster in the image file. + */ + bdrv_round_to_clusters(bs, sector_num, nb_sectors, + &cluster_sector_num, &cluster_nb_sectors); + + trace_bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, + cluster_sector_num, cluster_nb_sectors); + + iov.iov_len = cluster_nb_sectors * BDRV_SECTOR_SIZE; + iov.iov_base = bounce_buffer = qemu_try_blockalign(bs, iov.iov_len); + if (bounce_buffer == NULL) { + ret = -ENOMEM; + goto err; + } + + qemu_iovec_init_external(&bounce_qiov, &iov, 1); + + ret = drv->bdrv_co_readv(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + if (ret < 0) { + goto err; + } + + if (drv->bdrv_co_write_zeroes && + buffer_is_zero(bounce_buffer, iov.iov_len)) { + ret = bdrv_co_do_write_zeroes(bs, cluster_sector_num, + cluster_nb_sectors, 0); + } else { + /* This does not change the data on the disk, it is not necessary + * to flush even in cache=writethrough mode. + */ + ret = drv->bdrv_co_writev(bs, cluster_sector_num, cluster_nb_sectors, + &bounce_qiov); + } + + if (ret < 0) { + /* It might be okay to ignore write errors for guest requests. If this + * is a deliberate copy-on-read then we don't want to ignore the error. + * Simply report it in all cases. + */ + goto err; + } + + skip_bytes = (sector_num - cluster_sector_num) * BDRV_SECTOR_SIZE; + qemu_iovec_from_buf(qiov, 0, bounce_buffer + skip_bytes, + nb_sectors * BDRV_SECTOR_SIZE); + +err: + qemu_vfree(bounce_buffer); + return ret; +} + +/* + * Forwards an already correctly aligned request to the BlockDriver. This + * handles copy on read and zeroing after EOF; any other features must be + * implemented by the caller. + */ +static int coroutine_fn bdrv_aligned_preadv(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + int64_t align, QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + /* Handle Copy on Read and associated serialisation */ + if (flags & BDRV_REQ_COPY_ON_READ) { + /* If we touch the same cluster it counts as an overlap. This + * guarantees that allocating writes will be serialized and not race + * with each other for the same cluster. For example, in copy-on-read + * it ensures that the CoR read and write operations are atomic and + * guest writes cannot interleave between them. */ + mark_request_serialising(req, bdrv_get_cluster_size(bs)); + } + + if (!(flags & BDRV_REQ_NO_SERIALISING)) { + wait_serialising_requests(req); + } + + if (flags & BDRV_REQ_COPY_ON_READ) { + int pnum; + + ret = bdrv_is_allocated(bs, sector_num, nb_sectors, &pnum); + if (ret < 0) { + goto out; + } + + if (!ret || pnum != nb_sectors) { + ret = bdrv_co_do_copy_on_readv(bs, sector_num, nb_sectors, qiov); + goto out; + } + } + + /* Forward the request to the BlockDriver */ + if (!bs->zero_beyond_eof) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else { + /* Read zeros after EOF */ + int64_t total_sectors, max_nb_sectors; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + ret = total_sectors; + goto out; + } + + max_nb_sectors = ROUND_UP(MAX(0, total_sectors - sector_num), + align >> BDRV_SECTOR_BITS); + if (nb_sectors < max_nb_sectors) { + ret = drv->bdrv_co_readv(bs, sector_num, nb_sectors, qiov); + } else if (max_nb_sectors > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, qiov->niov); + qemu_iovec_concat(&local_qiov, qiov, 0, + max_nb_sectors * BDRV_SECTOR_SIZE); + + ret = drv->bdrv_co_readv(bs, sector_num, max_nb_sectors, + &local_qiov); + + qemu_iovec_destroy(&local_qiov); + } else { + ret = 0; + } + + /* Reading beyond end of file is supposed to produce zeroes */ + if (ret == 0 && total_sectors < sector_num + nb_sectors) { + uint64_t offset = MAX(0, total_sectors - sector_num); + uint64_t bytes = (sector_num + nb_sectors - offset) * + BDRV_SECTOR_SIZE; + qemu_iovec_memset(qiov, offset * BDRV_SECTOR_SIZE, 0, bytes); + } + } + +out: + return ret; +} + +/* + * Handle a read request in coroutine context + */ +static int coroutine_fn bdrv_co_do_preadv(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest req; + + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* Don't do copy-on-read if we read data before write operation */ + if (bs->copy_on_read && !(flags & BDRV_REQ_NO_SERIALISING)) { + flags |= BDRV_REQ_COPY_ON_READ; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + throttle_group_co_io_limits_intercept(bs, bytes, false); + } + + /* Align read if necessary by padding qiov */ + if (offset & (align - 1)) { + head_buf = qemu_blockalign(bs, align); + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + tail_buf = qemu_blockalign(bs, align); + qemu_iovec_add(&local_qiov, tail_buf, + align - ((offset + bytes) & (align - 1))); + + bytes = ROUND_UP(bytes, align); + } + + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_READ); + ret = bdrv_aligned_preadv(bs, &req, offset, bytes, align, + use_local_qiov ? &local_qiov : qiov, + flags); + tracked_request_end(&req); + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + qemu_vfree(head_buf); + qemu_vfree(tail_buf); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_preadv(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_readv_no_serialising(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_readv_no_serialising(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_NO_SERIALISING); +} + +int coroutine_fn bdrv_co_copy_on_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_copy_on_readv(bs, sector_num, nb_sectors); + + return bdrv_co_do_readv(bs, sector_num, nb_sectors, qiov, + BDRV_REQ_COPY_ON_READ); +} + +#define MAX_WRITE_ZEROES_BOUNCE_BUFFER 32768 + +static int coroutine_fn bdrv_co_do_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BlockDriver *drv = bs->drv; + QEMUIOVector qiov; + struct iovec iov = {0}; + int ret = 0; + + int max_write_zeroes = MIN_NON_ZERO(bs->bl.max_write_zeroes, + BDRV_REQUEST_MAX_SECTORS); + + while (nb_sectors > 0 && !ret) { + int num = nb_sectors; + + /* Align request. Block drivers can expect the "bulk" of the request + * to be aligned. + */ + if (bs->bl.write_zeroes_alignment + && num > bs->bl.write_zeroes_alignment) { + if (sector_num % bs->bl.write_zeroes_alignment != 0) { + /* Make a small request up to the first aligned sector. */ + num = bs->bl.write_zeroes_alignment; + num -= sector_num % bs->bl.write_zeroes_alignment; + } else if ((sector_num + num) % bs->bl.write_zeroes_alignment != 0) { + /* Shorten the request to the last aligned sector. num cannot + * underflow because num > bs->bl.write_zeroes_alignment. + */ + num -= (sector_num + num) % bs->bl.write_zeroes_alignment; + } + } + + /* limit request size */ + if (num > max_write_zeroes) { + num = max_write_zeroes; + } + + ret = -ENOTSUP; + /* First try the efficient write zeroes operation */ + if (drv->bdrv_co_write_zeroes) { + ret = drv->bdrv_co_write_zeroes(bs, sector_num, num, flags); + } + + if (ret == -ENOTSUP) { + /* Fall back to bounce buffer if write zeroes is unsupported */ + int max_xfer_len = MIN_NON_ZERO(bs->bl.max_transfer_length, + MAX_WRITE_ZEROES_BOUNCE_BUFFER); + num = MIN(num, max_xfer_len); + iov.iov_len = num * BDRV_SECTOR_SIZE; + if (iov.iov_base == NULL) { + iov.iov_base = qemu_try_blockalign(bs, num * BDRV_SECTOR_SIZE); + if (iov.iov_base == NULL) { + ret = -ENOMEM; + goto fail; + } + memset(iov.iov_base, 0, num * BDRV_SECTOR_SIZE); + } + qemu_iovec_init_external(&qiov, &iov, 1); + + ret = drv->bdrv_co_writev(bs, sector_num, num, &qiov); + + /* Keep bounce buffer around if it is big enough for all + * all future requests. + */ + if (num < max_xfer_len) { + qemu_vfree(iov.iov_base); + iov.iov_base = NULL; + } + } + + sector_num += num; + nb_sectors -= num; + } + +fail: + qemu_vfree(iov.iov_base); + return ret; +} + +/* + * Forwards an already correctly aligned write request to the BlockDriver. + */ +static int coroutine_fn bdrv_aligned_pwritev(BlockDriverState *bs, + BdrvTrackedRequest *req, int64_t offset, unsigned int bytes, + QEMUIOVector *qiov, int flags) +{ + BlockDriver *drv = bs->drv; + bool waited; + int ret; + + int64_t sector_num = offset >> BDRV_SECTOR_BITS; + unsigned int nb_sectors = bytes >> BDRV_SECTOR_BITS; + + assert((offset & (BDRV_SECTOR_SIZE - 1)) == 0); + assert((bytes & (BDRV_SECTOR_SIZE - 1)) == 0); + assert(!qiov || bytes == qiov->size); + + waited = wait_serialising_requests(req); + assert(!waited || !req->serialising); + assert(req->overlap_offset <= offset); + assert(offset + bytes <= req->overlap_offset + req->overlap_bytes); + + ret = notifier_with_return_list_notify(&bs->before_write_notifiers, req); + + if (!ret && bs->detect_zeroes != BLOCKDEV_DETECT_ZEROES_OPTIONS_OFF && + !(flags & BDRV_REQ_ZERO_WRITE) && drv->bdrv_co_write_zeroes && + qemu_iovec_is_zero(qiov)) { + flags |= BDRV_REQ_ZERO_WRITE; + if (bs->detect_zeroes == BLOCKDEV_DETECT_ZEROES_OPTIONS_UNMAP) { + flags |= BDRV_REQ_MAY_UNMAP; + } + } + + if (ret < 0) { + /* Do nothing, write notifier decided to fail this request */ + } else if (flags & BDRV_REQ_ZERO_WRITE) { + bdrv_debug_event(bs, BLKDBG_PWRITEV_ZERO); + ret = bdrv_co_do_write_zeroes(bs, sector_num, nb_sectors, flags); + } else { + bdrv_debug_event(bs, BLKDBG_PWRITEV); + ret = drv->bdrv_co_writev(bs, sector_num, nb_sectors, qiov); + } + bdrv_debug_event(bs, BLKDBG_PWRITEV_DONE); + + if (ret == 0 && !bs->enable_write_cache) { + ret = bdrv_co_flush(bs); + } + + bdrv_set_dirty(bs, sector_num, nb_sectors); + + if (bs->wr_highest_offset < offset + bytes) { + bs->wr_highest_offset = offset + bytes; + } + + if (ret >= 0) { + bs->total_sectors = MAX(bs->total_sectors, sector_num + nb_sectors); + } + + return ret; +} + +static int coroutine_fn bdrv_co_do_zero_pwritev(BlockDriverState *bs, + int64_t offset, + unsigned int bytes, + BdrvRequestFlags flags, + BdrvTrackedRequest *req) +{ + uint8_t *buf = NULL; + QEMUIOVector local_qiov; + struct iovec iov; + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + unsigned int head_padding_bytes, tail_padding_bytes; + int ret = 0; + + head_padding_bytes = offset & (align - 1); + tail_padding_bytes = align - ((offset + bytes) & (align - 1)); + + + assert(flags & BDRV_REQ_ZERO_WRITE); + if (head_padding_bytes || tail_padding_bytes) { + buf = qemu_blockalign(bs, align); + iov = (struct iovec) { + .iov_base = buf, + .iov_len = align, + }; + qemu_iovec_init_external(&local_qiov, &iov, 1); + } + if (head_padding_bytes) { + uint64_t zero_bytes = MIN(bytes, align - head_padding_bytes); + + /* RMW the unaligned part before head. */ + mark_request_serialising(req, align); + wait_serialising_requests(req); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, req, offset & ~(align - 1), align, + align, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + memset(buf + head_padding_bytes, 0, zero_bytes); + ret = bdrv_aligned_pwritev(bs, req, offset & ~(align - 1), align, + &local_qiov, + flags & ~BDRV_REQ_ZERO_WRITE); + if (ret < 0) { + goto fail; + } + offset += zero_bytes; + bytes -= zero_bytes; + } + + assert(!bytes || (offset & (align - 1)) == 0); + if (bytes >= align) { + /* Write the aligned part in the middle. */ + uint64_t aligned_bytes = bytes & ~(align - 1); + ret = bdrv_aligned_pwritev(bs, req, offset, aligned_bytes, + NULL, flags); + if (ret < 0) { + goto fail; + } + bytes -= aligned_bytes; + offset += aligned_bytes; + } + + assert(!bytes || (offset & (align - 1)) == 0); + if (bytes) { + assert(align == tail_padding_bytes + bytes); + /* RMW the unaligned part after tail. */ + mark_request_serialising(req, align); + wait_serialising_requests(req); + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, req, offset, align, + align, &local_qiov, 0); + if (ret < 0) { + goto fail; + } + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + memset(buf, 0, bytes); + ret = bdrv_aligned_pwritev(bs, req, offset, align, + &local_qiov, flags & ~BDRV_REQ_ZERO_WRITE); + } +fail: + qemu_vfree(buf); + return ret; + +} + +/* + * Handle a write request in coroutine context + */ +static int coroutine_fn bdrv_co_do_pwritev(BlockDriverState *bs, + int64_t offset, unsigned int bytes, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + BdrvTrackedRequest req; + /* TODO Lift BDRV_SECTOR_SIZE restriction in BlockDriver interface */ + uint64_t align = MAX(BDRV_SECTOR_SIZE, bs->request_alignment); + uint8_t *head_buf = NULL; + uint8_t *tail_buf = NULL; + QEMUIOVector local_qiov; + bool use_local_qiov = false; + int ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + if (bs->read_only) { + return -EPERM; + } + + ret = bdrv_check_byte_request(bs, offset, bytes); + if (ret < 0) { + return ret; + } + + /* throttling disk I/O */ + if (bs->io_limits_enabled) { + throttle_group_co_io_limits_intercept(bs, bytes, true); + } + + /* + * Align write if necessary by performing a read-modify-write cycle. + * Pad qiov with the read parts and be sure to have a tracked request not + * only for bdrv_aligned_pwritev, but also for the reads of the RMW cycle. + */ + tracked_request_begin(&req, bs, offset, bytes, BDRV_TRACKED_WRITE); + + if (!qiov) { + ret = bdrv_co_do_zero_pwritev(bs, offset, bytes, flags, &req); + goto out; + } + + if (offset & (align - 1)) { + QEMUIOVector head_qiov; + struct iovec head_iov; + + mark_request_serialising(&req, align); + wait_serialising_requests(&req); + + head_buf = qemu_blockalign(bs, align); + head_iov = (struct iovec) { + .iov_base = head_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&head_qiov, &head_iov, 1); + + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_HEAD); + ret = bdrv_aligned_preadv(bs, &req, offset & ~(align - 1), align, + align, &head_qiov, 0); + if (ret < 0) { + goto fail; + } + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_HEAD); + + qemu_iovec_init(&local_qiov, qiov->niov + 2); + qemu_iovec_add(&local_qiov, head_buf, offset & (align - 1)); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + + bytes += offset & (align - 1); + offset = offset & ~(align - 1); + } + + if ((offset + bytes) & (align - 1)) { + QEMUIOVector tail_qiov; + struct iovec tail_iov; + size_t tail_bytes; + bool waited; + + mark_request_serialising(&req, align); + waited = wait_serialising_requests(&req); + assert(!waited || !use_local_qiov); + + tail_buf = qemu_blockalign(bs, align); + tail_iov = (struct iovec) { + .iov_base = tail_buf, + .iov_len = align, + }; + qemu_iovec_init_external(&tail_qiov, &tail_iov, 1); + + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_TAIL); + ret = bdrv_aligned_preadv(bs, &req, (offset + bytes) & ~(align - 1), align, + align, &tail_qiov, 0); + if (ret < 0) { + goto fail; + } + bdrv_debug_event(bs, BLKDBG_PWRITEV_RMW_AFTER_TAIL); + + if (!use_local_qiov) { + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_concat(&local_qiov, qiov, 0, qiov->size); + use_local_qiov = true; + } + + tail_bytes = (offset + bytes) & (align - 1); + qemu_iovec_add(&local_qiov, tail_buf + tail_bytes, align - tail_bytes); + + bytes = ROUND_UP(bytes, align); + } + + ret = bdrv_aligned_pwritev(bs, &req, offset, bytes, + use_local_qiov ? &local_qiov : qiov, + flags); + +fail: + + if (use_local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(head_buf); + qemu_vfree(tail_buf); +out: + tracked_request_end(&req); + return ret; +} + +static int coroutine_fn bdrv_co_do_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov, + BdrvRequestFlags flags) +{ + if (nb_sectors < 0 || nb_sectors > BDRV_REQUEST_MAX_SECTORS) { + return -EINVAL; + } + + return bdrv_co_do_pwritev(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors << BDRV_SECTOR_BITS, qiov, flags); +} + +int coroutine_fn bdrv_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + trace_bdrv_co_writev(bs, sector_num, nb_sectors); + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, qiov, 0); +} + +int coroutine_fn bdrv_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + trace_bdrv_co_write_zeroes(bs, sector_num, nb_sectors, flags); + + if (!(bs->open_flags & BDRV_O_UNMAP)) { + flags &= ~BDRV_REQ_MAY_UNMAP; + } + + return bdrv_co_do_writev(bs, sector_num, nb_sectors, NULL, + BDRV_REQ_ZERO_WRITE | flags); +} + +int bdrv_flush_all(void) +{ + BlockDriverState *bs = NULL; + int result = 0; + + while ((bs = bdrv_next(bs))) { + AioContext *aio_context = bdrv_get_aio_context(bs); + int ret; + + aio_context_acquire(aio_context); + ret = bdrv_flush(bs); + if (ret < 0 && !result) { + result = ret; + } + aio_context_release(aio_context); + } + + return result; +} + +typedef struct BdrvCoGetBlockStatusData { + BlockDriverState *bs; + BlockDriverState *base; + int64_t sector_num; + int nb_sectors; + int *pnum; + int64_t ret; + bool done; +} BdrvCoGetBlockStatusData; + +/* + * Returns the allocation status of the specified sectors. + * Drivers not implementing the functionality are assumed to not support + * backing files, hence all their sectors are reported as allocated. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int64_t coroutine_fn bdrv_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t total_sectors; + int64_t n; + int64_t ret, ret2; + + total_sectors = bdrv_nb_sectors(bs); + if (total_sectors < 0) { + return total_sectors; + } + + if (sector_num >= total_sectors) { + *pnum = 0; + return 0; + } + + n = total_sectors - sector_num; + if (n < nb_sectors) { + nb_sectors = n; + } + + if (!bs->drv->bdrv_co_get_block_status) { + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA | BDRV_BLOCK_ALLOCATED; + if (bs->drv->protocol_name) { + ret |= BDRV_BLOCK_OFFSET_VALID | (sector_num * BDRV_SECTOR_SIZE); + } + return ret; + } + + ret = bs->drv->bdrv_co_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + *pnum = 0; + return ret; + } + + if (ret & BDRV_BLOCK_RAW) { + assert(ret & BDRV_BLOCK_OFFSET_VALID); + return bdrv_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, + *pnum, pnum); + } + + if (ret & (BDRV_BLOCK_DATA | BDRV_BLOCK_ZERO)) { + ret |= BDRV_BLOCK_ALLOCATED; + } else { + if (bdrv_unallocated_blocks_are_zero(bs)) { + ret |= BDRV_BLOCK_ZERO; + } else if (bs->backing) { + BlockDriverState *bs2 = bs->backing->bs; + int64_t nb_sectors2 = bdrv_nb_sectors(bs2); + if (nb_sectors2 >= 0 && sector_num >= nb_sectors2) { + ret |= BDRV_BLOCK_ZERO; + } + } + } + + if (bs->file && + (ret & BDRV_BLOCK_DATA) && !(ret & BDRV_BLOCK_ZERO) && + (ret & BDRV_BLOCK_OFFSET_VALID)) { + int file_pnum; + + ret2 = bdrv_co_get_block_status(bs->file->bs, ret >> BDRV_SECTOR_BITS, + *pnum, &file_pnum); + if (ret2 >= 0) { + /* Ignore errors. This is just providing extra information, it + * is useful but not necessary. + */ + if (!file_pnum) { + /* !file_pnum indicates an offset at or beyond the EOF; it is + * perfectly valid for the format block driver to point to such + * offsets, so catch it and mark everything as zero */ + ret |= BDRV_BLOCK_ZERO; + } else { + /* Limit request to the range reported by the protocol driver */ + *pnum = file_pnum; + ret |= (ret2 & BDRV_BLOCK_ZERO); + } + } + } + + return ret; +} + +static int64_t coroutine_fn bdrv_co_get_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, + int *pnum) +{ + BlockDriverState *p; + int64_t ret = 0; + + assert(bs != base); + for (p = bs; p != base; p = backing_bs(p)) { + ret = bdrv_co_get_block_status(p, sector_num, nb_sectors, pnum); + if (ret < 0 || ret & BDRV_BLOCK_ALLOCATED) { + break; + } + /* [sector_num, pnum] unallocated on this layer, which could be only + * the first part of [sector_num, nb_sectors]. */ + nb_sectors = MIN(nb_sectors, *pnum); + } + return ret; +} + +/* Coroutine wrapper for bdrv_get_block_status_above() */ +static void coroutine_fn bdrv_get_block_status_above_co_entry(void *opaque) +{ + BdrvCoGetBlockStatusData *data = opaque; + + data->ret = bdrv_co_get_block_status_above(data->bs, data->base, + data->sector_num, + data->nb_sectors, + data->pnum); + data->done = true; +} + +/* + * Synchronous wrapper around bdrv_co_get_block_status_above(). + * + * See bdrv_co_get_block_status_above() for details. + */ +int64_t bdrv_get_block_status_above(BlockDriverState *bs, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + Coroutine *co; + BdrvCoGetBlockStatusData data = { + .bs = bs, + .base = base, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .pnum = pnum, + .done = false, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_get_block_status_above_co_entry(&data); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_get_block_status_above_co_entry); + qemu_coroutine_enter(co, &data); + while (!data.done) { + aio_poll(aio_context, true); + } + } + return data.ret; +} + +int64_t bdrv_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + return bdrv_get_block_status_above(bs, backing_bs(bs), + sector_num, nb_sectors, pnum); +} + +int coroutine_fn bdrv_is_allocated(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t ret = bdrv_get_block_status(bs, sector_num, nb_sectors, pnum); + if (ret < 0) { + return ret; + } + return !!(ret & BDRV_BLOCK_ALLOCATED); +} + +/* + * Given an image chain: ... -> [BASE] -> [INTER1] -> [INTER2] -> [TOP] + * + * Return true if the given sector is allocated in any image between + * BASE and TOP (inclusive). BASE can be NULL to check if the given + * sector is allocated in any image of the chain. Return false otherwise. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + */ +int bdrv_is_allocated_above(BlockDriverState *top, + BlockDriverState *base, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BlockDriverState *intermediate; + int ret, n = nb_sectors; + + intermediate = top; + while (intermediate && intermediate != base) { + int pnum_inter; + ret = bdrv_is_allocated(intermediate, sector_num, nb_sectors, + &pnum_inter); + if (ret < 0) { + return ret; + } else if (ret) { + *pnum = pnum_inter; + return 1; + } + + /* + * [sector_num, nb_sectors] is unallocated on top but intermediate + * might have + * + * [sector_num+x, nr_sectors] allocated. + */ + if (n > pnum_inter && + (intermediate == top || + sector_num + pnum_inter < intermediate->total_sectors)) { + n = pnum_inter; + } + + intermediate = backing_bs(intermediate); + } + + *pnum = n; + return 0; +} + +int bdrv_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BlockDriver *drv = bs->drv; + int ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (!drv->bdrv_write_compressed) { + return -ENOTSUP; + } + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } + + assert(QLIST_EMPTY(&bs->dirty_bitmaps)); + + return drv->bdrv_write_compressed(bs, sector_num, buf, nb_sectors); +} + +int bdrv_save_vmstate(BlockDriverState *bs, const uint8_t *buf, + int64_t pos, int size) +{ + QEMUIOVector qiov; + struct iovec iov = { + .iov_base = (void *) buf, + .iov_len = size, + }; + + qemu_iovec_init_external(&qiov, &iov, 1); + return bdrv_writev_vmstate(bs, &qiov, pos); +} + +int bdrv_writev_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, int64_t pos) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + return -ENOMEDIUM; + } else if (drv->bdrv_save_vmstate) { + return drv->bdrv_save_vmstate(bs, qiov, pos); + } else if (bs->file) { + return bdrv_writev_vmstate(bs->file->bs, qiov, pos); + } + + return -ENOTSUP; +} + +int bdrv_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BlockDriver *drv = bs->drv; + if (!drv) + return -ENOMEDIUM; + if (drv->bdrv_load_vmstate) + return drv->bdrv_load_vmstate(bs, buf, pos, size); + if (bs->file) + return bdrv_load_vmstate(bs->file->bs, buf, pos, size); + return -ENOTSUP; +} + +/**************************************************************/ +/* async I/Os */ + +BlockAIOCB *bdrv_aio_readv(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_readv(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, false); +} + +BlockAIOCB *bdrv_aio_writev(BlockDriverState *bs, int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_writev(bs, sector_num, nb_sectors, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, qiov, nb_sectors, 0, + cb, opaque, true); +} + +BlockAIOCB *bdrv_aio_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_write_zeroes(bs, sector_num, nb_sectors, flags, opaque); + + return bdrv_co_aio_rw_vector(bs, sector_num, NULL, nb_sectors, + BDRV_REQ_ZERO_WRITE | flags, + cb, opaque, true); +} + + +typedef struct MultiwriteCB { + int error; + int num_requests; + int num_callbacks; + struct { + BlockCompletionFunc *cb; + void *opaque; + QEMUIOVector *free_qiov; + } callbacks[]; +} MultiwriteCB; + +static void multiwrite_user_cb(MultiwriteCB *mcb) +{ + int i; + + for (i = 0; i < mcb->num_callbacks; i++) { + mcb->callbacks[i].cb(mcb->callbacks[i].opaque, mcb->error); + if (mcb->callbacks[i].free_qiov) { + qemu_iovec_destroy(mcb->callbacks[i].free_qiov); + } + g_free(mcb->callbacks[i].free_qiov); + } +} + +static void multiwrite_cb(void *opaque, int ret) +{ + MultiwriteCB *mcb = opaque; + + trace_multiwrite_cb(mcb, ret); + + if (ret < 0 && !mcb->error) { + mcb->error = ret; + } + + mcb->num_requests--; + if (mcb->num_requests == 0) { + multiwrite_user_cb(mcb); + g_free(mcb); + } +} + +static int multiwrite_req_compare(const void *a, const void *b) +{ + const BlockRequest *req1 = a, *req2 = b; + + /* + * Note that we can't simply subtract req2->sector from req1->sector + * here as that could overflow the return value. + */ + if (req1->sector > req2->sector) { + return 1; + } else if (req1->sector < req2->sector) { + return -1; + } else { + return 0; + } +} + +/* + * Takes a bunch of requests and tries to merge them. Returns the number of + * requests that remain after merging. + */ +static int multiwrite_merge(BlockDriverState *bs, BlockRequest *reqs, + int num_reqs, MultiwriteCB *mcb) +{ + int i, outidx; + + // Sort requests by start sector + qsort(reqs, num_reqs, sizeof(*reqs), &multiwrite_req_compare); + + // Check if adjacent requests touch the same clusters. If so, combine them, + // filling up gaps with zero sectors. + outidx = 0; + for (i = 1; i < num_reqs; i++) { + int merge = 0; + int64_t oldreq_last = reqs[outidx].sector + reqs[outidx].nb_sectors; + + // Handle exactly sequential writes and overlapping writes. + if (reqs[i].sector <= oldreq_last) { + merge = 1; + } + + if (reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1 > IOV_MAX) { + merge = 0; + } + + if (bs->bl.max_transfer_length && reqs[outidx].nb_sectors + + reqs[i].nb_sectors > bs->bl.max_transfer_length) { + merge = 0; + } + + if (merge) { + size_t size; + QEMUIOVector *qiov = g_malloc0(sizeof(*qiov)); + qemu_iovec_init(qiov, + reqs[outidx].qiov->niov + reqs[i].qiov->niov + 1); + + // Add the first request to the merged one. If the requests are + // overlapping, drop the last sectors of the first request. + size = (reqs[i].sector - reqs[outidx].sector) << 9; + qemu_iovec_concat(qiov, reqs[outidx].qiov, 0, size); + + // We should need to add any zeros between the two requests + assert (reqs[i].sector <= oldreq_last); + + // Add the second request + qemu_iovec_concat(qiov, reqs[i].qiov, 0, reqs[i].qiov->size); + + // Add tail of first request, if necessary + if (qiov->size < reqs[outidx].qiov->size) { + qemu_iovec_concat(qiov, reqs[outidx].qiov, qiov->size, + reqs[outidx].qiov->size - qiov->size); + } + + reqs[outidx].nb_sectors = qiov->size >> 9; + reqs[outidx].qiov = qiov; + + mcb->callbacks[i].free_qiov = reqs[outidx].qiov; + } else { + outidx++; + reqs[outidx].sector = reqs[i].sector; + reqs[outidx].nb_sectors = reqs[i].nb_sectors; + reqs[outidx].qiov = reqs[i].qiov; + } + } + + if (bs->blk) { + block_acct_merge_done(blk_get_stats(bs->blk), BLOCK_ACCT_WRITE, + num_reqs - outidx - 1); + } + + return outidx + 1; +} + +/* + * Submit multiple AIO write requests at once. + * + * On success, the function returns 0 and all requests in the reqs array have + * been submitted. In error case this function returns -1, and any of the + * requests may or may not be submitted yet. In particular, this means that the + * callback will be called for some of the requests, for others it won't. The + * caller must check the error field of the BlockRequest to wait for the right + * callbacks (if error != 0, no callback will be called). + * + * The implementation may modify the contents of the reqs array, e.g. to merge + * requests. However, the fields opaque and error are left unmodified as they + * are used to signal failure for a single request to the caller. + */ +int bdrv_aio_multiwrite(BlockDriverState *bs, BlockRequest *reqs, int num_reqs) +{ + MultiwriteCB *mcb; + int i; + + /* don't submit writes if we don't have a medium */ + if (bs->drv == NULL) { + for (i = 0; i < num_reqs; i++) { + reqs[i].error = -ENOMEDIUM; + } + return -1; + } + + if (num_reqs == 0) { + return 0; + } + + // Create MultiwriteCB structure + mcb = g_malloc0(sizeof(*mcb) + num_reqs * sizeof(*mcb->callbacks)); + mcb->num_requests = 0; + mcb->num_callbacks = num_reqs; + + for (i = 0; i < num_reqs; i++) { + mcb->callbacks[i].cb = reqs[i].cb; + mcb->callbacks[i].opaque = reqs[i].opaque; + } + + // Check for mergable requests + num_reqs = multiwrite_merge(bs, reqs, num_reqs, mcb); + + trace_bdrv_aio_multiwrite(mcb, mcb->num_callbacks, num_reqs); + + /* Run the aio requests. */ + mcb->num_requests = num_reqs; + for (i = 0; i < num_reqs; i++) { + bdrv_co_aio_rw_vector(bs, reqs[i].sector, reqs[i].qiov, + reqs[i].nb_sectors, reqs[i].flags, + multiwrite_cb, mcb, + true); + } + + return 0; +} + +void bdrv_aio_cancel(BlockAIOCB *acb) +{ + qemu_aio_ref(acb); + bdrv_aio_cancel_async(acb); + while (acb->refcnt > 1) { + if (acb->aiocb_info->get_aio_context) { + aio_poll(acb->aiocb_info->get_aio_context(acb), true); + } else if (acb->bs) { + aio_poll(bdrv_get_aio_context(acb->bs), true); + } else { + abort(); + } + } + qemu_aio_unref(acb); +} + +/* Async version of aio cancel. The caller is not blocked if the acb implements + * cancel_async, otherwise we do nothing and let the request normally complete. + * In either case the completion callback must be called. */ +void bdrv_aio_cancel_async(BlockAIOCB *acb) +{ + if (acb->aiocb_info->cancel_async) { + acb->aiocb_info->cancel_async(acb); + } +} + +/**************************************************************/ +/* async block device emulation */ + +typedef struct BlockAIOCBSync { + BlockAIOCB common; + QEMUBH *bh; + int ret; + /* vector translation state */ + QEMUIOVector *qiov; + uint8_t *bounce; + int is_write; +} BlockAIOCBSync; + +static const AIOCBInfo bdrv_em_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBSync), +}; + +static void bdrv_aio_bh_cb(void *opaque) +{ + BlockAIOCBSync *acb = opaque; + + if (!acb->is_write && acb->ret >= 0) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_bh_delete(acb->bh); + acb->bh = NULL; + qemu_aio_unref(acb); +} + +static BlockAIOCB *bdrv_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + int is_write) + +{ + BlockAIOCBSync *acb; + + acb = qemu_aio_get(&bdrv_em_aiocb_info, bs, cb, opaque); + acb->is_write = is_write; + acb->qiov = qiov; + acb->bounce = qemu_try_blockalign(bs, qiov->size); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_aio_bh_cb, acb); + + if (acb->bounce == NULL) { + acb->ret = -ENOMEM; + } else if (is_write) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + acb->ret = bs->drv->bdrv_write(bs, sector_num, acb->bounce, nb_sectors); + } else { + acb->ret = bs->drv->bdrv_read(bs, sector_num, acb->bounce, nb_sectors); + } + + qemu_bh_schedule(acb->bh); + + return &acb->common; +} + +static BlockAIOCB *bdrv_aio_readv_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_aio_writev_em(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return bdrv_aio_rw_vector(bs, sector_num, qiov, nb_sectors, cb, opaque, 1); +} + + +typedef struct BlockAIOCBCoroutine { + BlockAIOCB common; + BlockRequest req; + bool is_write; + bool need_bh; + bool *done; + QEMUBH* bh; +} BlockAIOCBCoroutine; + +static const AIOCBInfo bdrv_em_co_aiocb_info = { + .aiocb_size = sizeof(BlockAIOCBCoroutine), +}; + +static void bdrv_co_complete(BlockAIOCBCoroutine *acb) +{ + if (!acb->need_bh) { + acb->common.cb(acb->common.opaque, acb->req.error); + qemu_aio_unref(acb); + } +} + +static void bdrv_co_em_bh(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + + assert(!acb->need_bh); + qemu_bh_delete(acb->bh); + bdrv_co_complete(acb); +} + +static void bdrv_co_maybe_schedule_bh(BlockAIOCBCoroutine *acb) +{ + acb->need_bh = false; + if (acb->req.error != -EINPROGRESS) { + BlockDriverState *bs = acb->common.bs; + + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), bdrv_co_em_bh, acb); + qemu_bh_schedule(acb->bh); + } +} + +/* Invoke bdrv_co_do_readv/bdrv_co_do_writev */ +static void coroutine_fn bdrv_co_do_rw(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + if (!acb->is_write) { + acb->req.error = bdrv_co_do_readv(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } else { + acb->req.error = bdrv_co_do_writev(bs, acb->req.sector, + acb->req.nb_sectors, acb->req.qiov, acb->req.flags); + } + + bdrv_co_complete(acb); +} + +static BlockAIOCB *bdrv_co_aio_rw_vector(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BdrvRequestFlags flags, + BlockCompletionFunc *cb, + void *opaque, + bool is_write) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + acb->req.qiov = qiov; + acb->req.flags = flags; + acb->is_write = is_write; + + co = qemu_coroutine_create(bdrv_co_do_rw); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_flush_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_flush(bs); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + trace_bdrv_aio_flush(bs, opaque); + + Coroutine *co; + BlockAIOCBCoroutine *acb; + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + + co = qemu_coroutine_create(bdrv_aio_flush_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +static void coroutine_fn bdrv_aio_discard_co_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + BlockDriverState *bs = acb->common.bs; + + acb->req.error = bdrv_co_discard(bs, acb->req.sector, acb->req.nb_sectors); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + Coroutine *co; + BlockAIOCBCoroutine *acb; + + trace_bdrv_aio_discard(bs, sector_num, nb_sectors, opaque); + + acb = qemu_aio_get(&bdrv_em_co_aiocb_info, bs, cb, opaque); + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.sector = sector_num; + acb->req.nb_sectors = nb_sectors; + co = qemu_coroutine_create(bdrv_aio_discard_co_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_aio_get(const AIOCBInfo *aiocb_info, BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCB *acb; + + acb = g_malloc(aiocb_info->aiocb_size); + acb->aiocb_info = aiocb_info; + acb->bs = bs; + acb->cb = cb; + acb->opaque = opaque; + acb->refcnt = 1; + return acb; +} + +void qemu_aio_ref(void *p) +{ + BlockAIOCB *acb = p; + acb->refcnt++; +} + +void qemu_aio_unref(void *p) +{ + BlockAIOCB *acb = p; + assert(acb->refcnt > 0); + if (--acb->refcnt == 0) { + g_free(acb); + } +} + +/**************************************************************/ +/* Coroutine block device emulation */ + +typedef struct CoroutineIOCompletion { + Coroutine *coroutine; + int ret; +} CoroutineIOCompletion; + +static void bdrv_co_io_em_complete(void *opaque, int ret) +{ + CoroutineIOCompletion *co = opaque; + + co->ret = ret; + qemu_coroutine_enter(co->coroutine, NULL); +} + +static int coroutine_fn bdrv_co_io_em(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *iov, + bool is_write) +{ + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + if (is_write) { + acb = bs->drv->bdrv_aio_writev(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } else { + acb = bs->drv->bdrv_aio_readv(bs, sector_num, iov, nb_sectors, + bdrv_co_io_em_complete, &co); + } + + trace_bdrv_co_io_em(bs, sector_num, nb_sectors, is_write, acb); + if (!acb) { + return -EIO; + } + qemu_coroutine_yield(); + + return co.ret; +} + +static int coroutine_fn bdrv_co_readv_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, false); +} + +static int coroutine_fn bdrv_co_writev_em(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + return bdrv_co_io_em(bs, sector_num, nb_sectors, iov, true); +} + +static void coroutine_fn bdrv_flush_co_entry(void *opaque) +{ + RwCo *rwco = opaque; + + rwco->ret = bdrv_co_flush(rwco->bs); +} + +int coroutine_fn bdrv_co_flush(BlockDriverState *bs) +{ + int ret; + BdrvTrackedRequest req; + + if (!bs || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs) || + bdrv_is_sg(bs)) { + return 0; + } + + tracked_request_begin(&req, bs, 0, 0, BDRV_TRACKED_FLUSH); + /* Write back cached data to the OS even with cache=unsafe */ + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_OS); + if (bs->drv->bdrv_co_flush_to_os) { + ret = bs->drv->bdrv_co_flush_to_os(bs); + if (ret < 0) { + goto out; + } + } + + /* But don't actually force it to the disk with cache=unsafe */ + if (bs->open_flags & BDRV_O_NO_FLUSH) { + goto flush_parent; + } + + BLKDBG_EVENT(bs->file, BLKDBG_FLUSH_TO_DISK); + if (bs->drv->bdrv_co_flush_to_disk) { + ret = bs->drv->bdrv_co_flush_to_disk(bs); + } else if (bs->drv->bdrv_aio_flush) { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_flush(bs, bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } else { + /* + * Some block drivers always operate in either writethrough or unsafe + * mode and don't support bdrv_flush therefore. Usually qemu doesn't + * know how the server works (because the behaviour is hardcoded or + * depends on server-side configuration), so we can't ensure that + * everything is safe on disk. Returning an error doesn't work because + * that would break guests even if the server operates in writethrough + * mode. + * + * Let's hope the user knows what he's doing. + */ + ret = 0; + } + if (ret < 0) { + goto out; + } + + /* Now flush the underlying protocol. It will also have BDRV_O_NO_FLUSH + * in the case of cache=unsafe, so there are no useless flushes. + */ +flush_parent: + ret = bs->file ? bdrv_co_flush(bs->file->bs) : 0; +out: + tracked_request_end(&req); + return ret; +} + +int bdrv_flush(BlockDriverState *bs) +{ + Coroutine *co; + RwCo rwco = { + .bs = bs, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_flush_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_flush_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct DiscardCo { + BlockDriverState *bs; + int64_t sector_num; + int nb_sectors; + int ret; +} DiscardCo; +static void coroutine_fn bdrv_discard_co_entry(void *opaque) +{ + DiscardCo *rwco = opaque; + + rwco->ret = bdrv_co_discard(rwco->bs, rwco->sector_num, rwco->nb_sectors); +} + +int coroutine_fn bdrv_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + BdrvTrackedRequest req; + int max_discard, ret; + + if (!bs->drv) { + return -ENOMEDIUM; + } + + ret = bdrv_check_request(bs, sector_num, nb_sectors); + if (ret < 0) { + return ret; + } else if (bs->read_only) { + return -EPERM; + } + + /* Do nothing if disabled. */ + if (!(bs->open_flags & BDRV_O_UNMAP)) { + return 0; + } + + if (!bs->drv->bdrv_co_discard && !bs->drv->bdrv_aio_discard) { + return 0; + } + + tracked_request_begin(&req, bs, sector_num, nb_sectors, + BDRV_TRACKED_DISCARD); + bdrv_set_dirty(bs, sector_num, nb_sectors); + + max_discard = MIN_NON_ZERO(bs->bl.max_discard, BDRV_REQUEST_MAX_SECTORS); + while (nb_sectors > 0) { + int ret; + int num = nb_sectors; + + /* align request */ + if (bs->bl.discard_alignment && + num >= bs->bl.discard_alignment && + sector_num % bs->bl.discard_alignment) { + if (num > bs->bl.discard_alignment) { + num = bs->bl.discard_alignment; + } + num -= sector_num % bs->bl.discard_alignment; + } + + /* limit request size */ + if (num > max_discard) { + num = max_discard; + } + + if (bs->drv->bdrv_co_discard) { + ret = bs->drv->bdrv_co_discard(bs, sector_num, num); + } else { + BlockAIOCB *acb; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + + acb = bs->drv->bdrv_aio_discard(bs, sector_num, nb_sectors, + bdrv_co_io_em_complete, &co); + if (acb == NULL) { + ret = -EIO; + goto out; + } else { + qemu_coroutine_yield(); + ret = co.ret; + } + } + if (ret && ret != -ENOTSUP) { + goto out; + } + + sector_num += num; + nb_sectors -= num; + } + ret = 0; +out: + tracked_request_end(&req); + return ret; +} + +int bdrv_discard(BlockDriverState *bs, int64_t sector_num, int nb_sectors) +{ + Coroutine *co; + DiscardCo rwco = { + .bs = bs, + .sector_num = sector_num, + .nb_sectors = nb_sectors, + .ret = NOT_DONE, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_discard_co_entry(&rwco); + } else { + AioContext *aio_context = bdrv_get_aio_context(bs); + + co = qemu_coroutine_create(bdrv_discard_co_entry); + qemu_coroutine_enter(co, &rwco); + while (rwco.ret == NOT_DONE) { + aio_poll(aio_context, true); + } + } + + return rwco.ret; +} + +typedef struct { + CoroutineIOCompletion *co; + QEMUBH *bh; +} BdrvIoctlCompletionData; + +static void bdrv_ioctl_bh_cb(void *opaque) +{ + BdrvIoctlCompletionData *data = opaque; + + bdrv_co_io_em_complete(data->co, -ENOTSUP); + qemu_bh_delete(data->bh); +} + +static int bdrv_co_do_ioctl(BlockDriverState *bs, int req, void *buf) +{ + BlockDriver *drv = bs->drv; + BdrvTrackedRequest tracked_req; + CoroutineIOCompletion co = { + .coroutine = qemu_coroutine_self(), + }; + BlockAIOCB *acb; + + tracked_request_begin(&tracked_req, bs, 0, 0, BDRV_TRACKED_IOCTL); + if (!drv || !drv->bdrv_aio_ioctl) { + co.ret = -ENOTSUP; + goto out; + } + + acb = drv->bdrv_aio_ioctl(bs, req, buf, bdrv_co_io_em_complete, &co); + if (!acb) { + BdrvIoctlCompletionData *data = g_new(BdrvIoctlCompletionData, 1); + data->bh = aio_bh_new(bdrv_get_aio_context(bs), + bdrv_ioctl_bh_cb, data); + data->co = &co; + qemu_bh_schedule(data->bh); + } + qemu_coroutine_yield(); +out: + tracked_request_end(&tracked_req); + return co.ret; +} + +typedef struct { + BlockDriverState *bs; + int req; + void *buf; + int ret; +} BdrvIoctlCoData; + +static void coroutine_fn bdrv_co_ioctl_entry(void *opaque) +{ + BdrvIoctlCoData *data = opaque; + data->ret = bdrv_co_do_ioctl(data->bs, data->req, data->buf); +} + +/* needed for generic scsi interface */ +int bdrv_ioctl(BlockDriverState *bs, unsigned long int req, void *buf) +{ + BdrvIoctlCoData data = { + .bs = bs, + .req = req, + .buf = buf, + .ret = -EINPROGRESS, + }; + + if (qemu_in_coroutine()) { + /* Fast-path if already in coroutine context */ + bdrv_co_ioctl_entry(&data); + } else { + Coroutine *co = qemu_coroutine_create(bdrv_co_ioctl_entry); + qemu_coroutine_enter(co, &data); + } + while (data.ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(bs), true); + } + return data.ret; +} + +static void coroutine_fn bdrv_co_aio_ioctl_entry(void *opaque) +{ + BlockAIOCBCoroutine *acb = opaque; + acb->req.error = bdrv_co_do_ioctl(acb->common.bs, + acb->req.req, acb->req.buf); + bdrv_co_complete(acb); +} + +BlockAIOCB *bdrv_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BlockAIOCBCoroutine *acb = qemu_aio_get(&bdrv_em_co_aiocb_info, + bs, cb, opaque); + Coroutine *co; + + acb->need_bh = true; + acb->req.error = -EINPROGRESS; + acb->req.req = req; + acb->req.buf = buf; + co = qemu_coroutine_create(bdrv_co_aio_ioctl_entry); + qemu_coroutine_enter(co, acb); + + bdrv_co_maybe_schedule_bh(acb); + return &acb->common; +} + +void *qemu_blockalign(BlockDriverState *bs, size_t size) +{ + return qemu_memalign(bdrv_opt_mem_align(bs), size); +} + +void *qemu_blockalign0(BlockDriverState *bs, size_t size) +{ + return memset(qemu_blockalign(bs, size), 0, size); +} + +void *qemu_try_blockalign(BlockDriverState *bs, size_t size) +{ + size_t align = bdrv_opt_mem_align(bs); + + /* Ensure that NULL is never returned on success */ + assert(align > 0); + if (size == 0) { + size = align; + } + + return qemu_try_memalign(align, size); +} + +void *qemu_try_blockalign0(BlockDriverState *bs, size_t size) +{ + void *mem = qemu_try_blockalign(bs, size); + + if (mem) { + memset(mem, 0, size); + } + + return mem; +} + +/* + * Check if all memory in this vector is sector aligned. + */ +bool bdrv_qiov_is_aligned(BlockDriverState *bs, QEMUIOVector *qiov) +{ + int i; + size_t alignment = bdrv_min_mem_align(bs); + + for (i = 0; i < qiov->niov; i++) { + if ((uintptr_t) qiov->iov[i].iov_base % alignment) { + return false; + } + if (qiov->iov[i].iov_len % alignment) { + return false; + } + } + + return true; +} + +void bdrv_add_before_write_notifier(BlockDriverState *bs, + NotifierWithReturn *notifier) +{ + notifier_with_return_list_add(&bs->before_write_notifiers, notifier); +} + +void bdrv_io_plug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_plug) { + drv->bdrv_io_plug(bs); + } else if (bs->file) { + bdrv_io_plug(bs->file->bs); + } +} + +void bdrv_io_unplug(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_io_unplug) { + drv->bdrv_io_unplug(bs); + } else if (bs->file) { + bdrv_io_unplug(bs->file->bs); + } +} + +void bdrv_flush_io_queue(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (drv && drv->bdrv_flush_io_queue) { + drv->bdrv_flush_io_queue(bs); + } else if (bs->file) { + bdrv_flush_io_queue(bs->file->bs); + } + bdrv_start_throttled_reqs(bs); +} + +void bdrv_drained_begin(BlockDriverState *bs) +{ + if (!bs->quiesce_counter++) { + aio_disable_external(bdrv_get_aio_context(bs)); + } + bdrv_drain(bs); +} + +void bdrv_drained_end(BlockDriverState *bs) +{ + assert(bs->quiesce_counter > 0); + if (--bs->quiesce_counter > 0) { + return; + } + aio_enable_external(bdrv_get_aio_context(bs)); +} diff --git a/src/block/iscsi.c b/src/block/iscsi.c new file mode 100644 index 0000000..bd1f1bf --- /dev/null +++ b/src/block/iscsi.c @@ -0,0 +1,1874 @@ +/* + * QEMU Block driver for iSCSI images + * + * Copyright (c) 2010-2011 Ronnie Sahlberg <ronniesahlberg@gmail.com> + * Copyright (c) 2012-2015 Peter Lieven <pl@kamp.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include <poll.h> +#include <math.h> +#include <arpa/inet.h> +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "qemu/bitops.h" +#include "qemu/bitmap.h" +#include "block/block_int.h" +#include "block/scsi.h" +#include "qemu/iov.h" +#include "sysemu/sysemu.h" +#include "qmp-commands.h" +#include "qapi/qmp/qstring.h" + +#include <iscsi/iscsi.h> +#include <iscsi/scsi-lowlevel.h> + +#ifdef __linux__ +#include <scsi/sg.h> +#include <block/scsi.h> +#endif + +typedef struct IscsiLun { + struct iscsi_context *iscsi; + AioContext *aio_context; + int lun; + enum scsi_inquiry_peripheral_device_type type; + int block_size; + uint64_t num_blocks; + int events; + QEMUTimer *nop_timer; + QEMUTimer *event_timer; + struct scsi_inquiry_logical_block_provisioning lbp; + struct scsi_inquiry_block_limits bl; + unsigned char *zeroblock; + unsigned long *allocationmap; + int cluster_sectors; + bool use_16_for_rw; + bool write_protected; + bool lbpme; + bool lbprz; + bool dpofua; + bool has_write_same; + bool force_next_flush; + bool request_timed_out; +} IscsiLun; + +typedef struct IscsiTask { + int status; + int complete; + int retries; + int do_retry; + struct scsi_task *task; + Coroutine *co; + QEMUBH *bh; + IscsiLun *iscsilun; + QEMUTimer retry_timer; + bool force_next_flush; + int err_code; +} IscsiTask; + +typedef struct IscsiAIOCB { + BlockAIOCB common; + QEMUIOVector *qiov; + QEMUBH *bh; + IscsiLun *iscsilun; + struct scsi_task *task; + uint8_t *buf; + int status; + int64_t sector_num; + int nb_sectors; + int ret; +#ifdef __linux__ + sg_io_hdr_t *ioh; +#endif +} IscsiAIOCB; + +/* libiscsi uses time_t so its enough to process events every second */ +#define EVENT_INTERVAL 1000 +#define NOP_INTERVAL 5000 +#define MAX_NOP_FAILURES 3 +#define ISCSI_CMD_RETRIES ARRAY_SIZE(iscsi_retry_times) +static const unsigned iscsi_retry_times[] = {8, 32, 128, 512, 2048, 8192, 32768}; + +/* this threshold is a trade-off knob to choose between + * the potential additional overhead of an extra GET_LBA_STATUS request + * vs. unnecessarily reading a lot of zero sectors over the wire. + * If a read request is greater or equal than ISCSI_CHECKALLOC_THRES + * sectors we check the allocation status of the area covered by the + * request first if the allocationmap indicates that the area might be + * unallocated. */ +#define ISCSI_CHECKALLOC_THRES 64 + +static void +iscsi_bh_cb(void *p) +{ + IscsiAIOCB *acb = p; + + qemu_bh_delete(acb->bh); + + g_free(acb->buf); + acb->buf = NULL; + + acb->common.cb(acb->common.opaque, acb->status); + + if (acb->task != NULL) { + scsi_free_scsi_task(acb->task); + acb->task = NULL; + } + + qemu_aio_unref(acb); +} + +static void +iscsi_schedule_bh(IscsiAIOCB *acb) +{ + if (acb->bh) { + return; + } + acb->bh = aio_bh_new(acb->iscsilun->aio_context, iscsi_bh_cb, acb); + qemu_bh_schedule(acb->bh); +} + +static void iscsi_co_generic_bh_cb(void *opaque) +{ + struct IscsiTask *iTask = opaque; + iTask->complete = 1; + qemu_bh_delete(iTask->bh); + qemu_coroutine_enter(iTask->co, NULL); +} + +static void iscsi_retry_timer_expired(void *opaque) +{ + struct IscsiTask *iTask = opaque; + iTask->complete = 1; + if (iTask->co) { + qemu_coroutine_enter(iTask->co, NULL); + } +} + +static inline unsigned exp_random(double mean) +{ + return -mean * log((double)rand() / RAND_MAX); +} + +/* SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST was introduced in + * libiscsi 1.10.0, together with other constants we need. Use it as + * a hint that we have to define them ourselves if needed, to keep the + * minimum required libiscsi version at 1.9.0. We use an ASCQ macro for + * the test because SCSI_STATUS_* is an enum. + * + * To guard against future changes where SCSI_SENSE_ASCQ_* also becomes + * an enum, check against the LIBISCSI_API_VERSION macro, which was + * introduced in 1.11.0. If it is present, there is no need to define + * anything. + */ +#if !defined(SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST) && \ + !defined(LIBISCSI_API_VERSION) +#define SCSI_STATUS_TASK_SET_FULL 0x28 +#define SCSI_STATUS_TIMEOUT 0x0f000002 +#define SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST 0x2600 +#define SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR 0x1a00 +#endif + +static int iscsi_translate_sense(struct scsi_sense *sense) +{ + int ret; + + switch (sense->key) { + case SCSI_SENSE_NOT_READY: + return -EBUSY; + case SCSI_SENSE_DATA_PROTECTION: + return -EACCES; + case SCSI_SENSE_COMMAND_ABORTED: + return -ECANCELED; + case SCSI_SENSE_ILLEGAL_REQUEST: + /* Parse ASCQ */ + break; + default: + return -EIO; + } + switch (sense->ascq) { + case SCSI_SENSE_ASCQ_PARAMETER_LIST_LENGTH_ERROR: + case SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE: + case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB: + case SCSI_SENSE_ASCQ_INVALID_FIELD_IN_PARAMETER_LIST: + ret = -EINVAL; + break; + case SCSI_SENSE_ASCQ_LBA_OUT_OF_RANGE: + ret = -ENOSPC; + break; + case SCSI_SENSE_ASCQ_LOGICAL_UNIT_NOT_SUPPORTED: + ret = -ENOTSUP; + break; + case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT: + case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_CLOSED: + case SCSI_SENSE_ASCQ_MEDIUM_NOT_PRESENT_TRAY_OPEN: + ret = -ENOMEDIUM; + break; + case SCSI_SENSE_ASCQ_WRITE_PROTECTED: + ret = -EACCES; + break; + default: + ret = -EIO; + break; + } + return ret; +} + +static void +iscsi_co_generic_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + struct IscsiTask *iTask = opaque; + struct scsi_task *task = command_data; + + iTask->status = status; + iTask->do_retry = 0; + iTask->task = task; + + if (status != SCSI_STATUS_GOOD) { + if (iTask->retries++ < ISCSI_CMD_RETRIES) { + if (status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION) { + error_report("iSCSI CheckCondition: %s", + iscsi_get_error(iscsi)); + iTask->do_retry = 1; + goto out; + } + if (status == SCSI_STATUS_BUSY || + status == SCSI_STATUS_TIMEOUT || + status == SCSI_STATUS_TASK_SET_FULL) { + unsigned retry_time = + exp_random(iscsi_retry_times[iTask->retries - 1]); + if (status == SCSI_STATUS_TIMEOUT) { + /* make sure the request is rescheduled AFTER the + * reconnect is initiated */ + retry_time = EVENT_INTERVAL * 2; + iTask->iscsilun->request_timed_out = true; + } + error_report("iSCSI Busy/TaskSetFull/TimeOut" + " (retry #%u in %u ms): %s", + iTask->retries, retry_time, + iscsi_get_error(iscsi)); + aio_timer_init(iTask->iscsilun->aio_context, + &iTask->retry_timer, QEMU_CLOCK_REALTIME, + SCALE_MS, iscsi_retry_timer_expired, iTask); + timer_mod(&iTask->retry_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + retry_time); + iTask->do_retry = 1; + return; + } + } + iTask->err_code = iscsi_translate_sense(&task->sense); + error_report("iSCSI Failure: %s", iscsi_get_error(iscsi)); + } else { + iTask->iscsilun->force_next_flush |= iTask->force_next_flush; + } + +out: + if (iTask->co) { + iTask->bh = aio_bh_new(iTask->iscsilun->aio_context, + iscsi_co_generic_bh_cb, iTask); + qemu_bh_schedule(iTask->bh); + } else { + iTask->complete = 1; + } +} + +static void iscsi_co_init_iscsitask(IscsiLun *iscsilun, struct IscsiTask *iTask) +{ + *iTask = (struct IscsiTask) { + .co = qemu_coroutine_self(), + .iscsilun = iscsilun, + }; +} + +static void +iscsi_abort_task_cb(struct iscsi_context *iscsi, int status, void *command_data, + void *private_data) +{ + IscsiAIOCB *acb = private_data; + + acb->status = -ECANCELED; + iscsi_schedule_bh(acb); +} + +static void +iscsi_aio_cancel(BlockAIOCB *blockacb) +{ + IscsiAIOCB *acb = (IscsiAIOCB *)blockacb; + IscsiLun *iscsilun = acb->iscsilun; + + if (acb->status != -EINPROGRESS) { + return; + } + + /* send a task mgmt call to the target to cancel the task on the target */ + iscsi_task_mgmt_abort_task_async(iscsilun->iscsi, acb->task, + iscsi_abort_task_cb, acb); + +} + +static const AIOCBInfo iscsi_aiocb_info = { + .aiocb_size = sizeof(IscsiAIOCB), + .cancel_async = iscsi_aio_cancel, +}; + + +static void iscsi_process_read(void *arg); +static void iscsi_process_write(void *arg); + +static void +iscsi_set_events(IscsiLun *iscsilun) +{ + struct iscsi_context *iscsi = iscsilun->iscsi; + int ev = iscsi_which_events(iscsi); + + if (ev != iscsilun->events) { + aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsi), + false, + (ev & POLLIN) ? iscsi_process_read : NULL, + (ev & POLLOUT) ? iscsi_process_write : NULL, + iscsilun); + iscsilun->events = ev; + } +} + +static void iscsi_timed_check_events(void *opaque) +{ + IscsiLun *iscsilun = opaque; + + /* check for timed out requests */ + iscsi_service(iscsilun->iscsi, 0); + + if (iscsilun->request_timed_out) { + iscsilun->request_timed_out = false; + iscsi_reconnect(iscsilun->iscsi); + } + + /* newer versions of libiscsi may return zero events. Ensure we are able + * to return to service once this situation changes. */ + iscsi_set_events(iscsilun); + + timer_mod(iscsilun->event_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); +} + +static void +iscsi_process_read(void *arg) +{ + IscsiLun *iscsilun = arg; + struct iscsi_context *iscsi = iscsilun->iscsi; + + iscsi_service(iscsi, POLLIN); + iscsi_set_events(iscsilun); +} + +static void +iscsi_process_write(void *arg) +{ + IscsiLun *iscsilun = arg; + struct iscsi_context *iscsi = iscsilun->iscsi; + + iscsi_service(iscsi, POLLOUT); + iscsi_set_events(iscsilun); +} + +static int64_t sector_lun2qemu(int64_t sector, IscsiLun *iscsilun) +{ + return sector * iscsilun->block_size / BDRV_SECTOR_SIZE; +} + +static int64_t sector_qemu2lun(int64_t sector, IscsiLun *iscsilun) +{ + return sector * BDRV_SECTOR_SIZE / iscsilun->block_size; +} + +static bool is_request_lun_aligned(int64_t sector_num, int nb_sectors, + IscsiLun *iscsilun) +{ + if ((sector_num * BDRV_SECTOR_SIZE) % iscsilun->block_size || + (nb_sectors * BDRV_SECTOR_SIZE) % iscsilun->block_size) { + error_report("iSCSI misaligned request: " + "iscsilun->block_size %u, sector_num %" PRIi64 + ", nb_sectors %d", + iscsilun->block_size, sector_num, nb_sectors); + return 0; + } + return 1; +} + +static unsigned long *iscsi_allocationmap_init(IscsiLun *iscsilun) +{ + return bitmap_try_new(DIV_ROUND_UP(sector_lun2qemu(iscsilun->num_blocks, + iscsilun), + iscsilun->cluster_sectors)); +} + +static void iscsi_allocationmap_set(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + if (iscsilun->allocationmap == NULL) { + return; + } + bitmap_set(iscsilun->allocationmap, + sector_num / iscsilun->cluster_sectors, + DIV_ROUND_UP(nb_sectors, iscsilun->cluster_sectors)); +} + +static void iscsi_allocationmap_clear(IscsiLun *iscsilun, int64_t sector_num, + int nb_sectors) +{ + int64_t cluster_num, nb_clusters; + if (iscsilun->allocationmap == NULL) { + return; + } + cluster_num = DIV_ROUND_UP(sector_num, iscsilun->cluster_sectors); + nb_clusters = (sector_num + nb_sectors) / iscsilun->cluster_sectors + - cluster_num; + if (nb_clusters > 0) { + bitmap_clear(iscsilun->allocationmap, cluster_num, nb_clusters); + } +} + +static int coroutine_fn iscsi_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + uint64_t lba; + uint32_t num_sectors; + int fua; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { + error_report("iSCSI Error: Write of %d sectors exceeds max_xfer_len " + "of %d sectors", nb_sectors, bs->bl.max_transfer_length); + return -EINVAL; + } + + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + fua = iscsilun->dpofua && !bs->enable_write_cache; + iTask.force_next_flush = !fua; + if (iscsilun->use_16_for_rw) { + iTask.task = iscsi_write16_task(iscsilun->iscsi, iscsilun->lun, lba, + NULL, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, fua, 0, 0, + iscsi_co_generic_cb, &iTask); + } else { + iTask.task = iscsi_write10_task(iscsilun->iscsi, iscsilun->lun, lba, + NULL, num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, fua, 0, 0, + iscsi_co_generic_cb, &iTask); + } + if (iTask.task == NULL) { + return -ENOMEM; + } + scsi_task_set_iov_out(iTask.task, (struct scsi_iovec *) iov->iov, + iov->niov); + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return iTask.err_code; + } + + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + + return 0; +} + + +static bool iscsi_allocationmap_is_allocated(IscsiLun *iscsilun, + int64_t sector_num, int nb_sectors) +{ + unsigned long size; + if (iscsilun->allocationmap == NULL) { + return true; + } + size = DIV_ROUND_UP(sector_num + nb_sectors, iscsilun->cluster_sectors); + return !(find_next_bit(iscsilun->allocationmap, size, + sector_num / iscsilun->cluster_sectors) == size); +} + +static int64_t coroutine_fn iscsi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + IscsiLun *iscsilun = bs->opaque; + struct scsi_get_lba_status *lbas = NULL; + struct scsi_lba_status_descriptor *lbasd = NULL; + struct IscsiTask iTask; + int64_t ret; + + iscsi_co_init_iscsitask(iscsilun, &iTask); + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + ret = -EINVAL; + goto out; + } + + /* default to all sectors allocated */ + ret = BDRV_BLOCK_DATA; + ret |= (sector_num << BDRV_SECTOR_BITS) | BDRV_BLOCK_OFFSET_VALID; + *pnum = nb_sectors; + + /* LUN does not support logical block provisioning */ + if (!iscsilun->lbpme) { + goto out; + } + +retry: + if (iscsi_get_lba_status_task(iscsilun->iscsi, iscsilun->lun, + sector_qemu2lun(sector_num, iscsilun), + 8 + 16, iscsi_co_generic_cb, + &iTask) == NULL) { + ret = -ENOMEM; + goto out; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.do_retry) { + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + /* in case the get_lba_status_callout fails (i.e. + * because the device is busy or the cmd is not + * supported) we pretend all blocks are allocated + * for backwards compatibility */ + goto out; + } + + lbas = scsi_datain_unmarshall(iTask.task); + if (lbas == NULL) { + ret = -EIO; + goto out; + } + + lbasd = &lbas->descriptors[0]; + + if (sector_qemu2lun(sector_num, iscsilun) != lbasd->lba) { + ret = -EIO; + goto out; + } + + *pnum = sector_lun2qemu(lbasd->num_blocks, iscsilun); + + if (lbasd->provisioning == SCSI_PROVISIONING_TYPE_DEALLOCATED || + lbasd->provisioning == SCSI_PROVISIONING_TYPE_ANCHORED) { + ret &= ~BDRV_BLOCK_DATA; + if (iscsilun->lbprz) { + ret |= BDRV_BLOCK_ZERO; + } + } + + if (ret & BDRV_BLOCK_ZERO) { + iscsi_allocationmap_clear(iscsilun, sector_num, *pnum); + } else { + iscsi_allocationmap_set(iscsilun, sector_num, *pnum); + } + + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } +out: + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + } + return ret; +} + +static int coroutine_fn iscsi_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + uint64_t lba; + uint32_t num_sectors; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (bs->bl.max_transfer_length && nb_sectors > bs->bl.max_transfer_length) { + error_report("iSCSI Error: Read of %d sectors exceeds max_xfer_len " + "of %d sectors", nb_sectors, bs->bl.max_transfer_length); + return -EINVAL; + } + + if (iscsilun->lbprz && nb_sectors >= ISCSI_CHECKALLOC_THRES && + !iscsi_allocationmap_is_allocated(iscsilun, sector_num, nb_sectors)) { + int64_t ret; + int pnum; + ret = iscsi_co_get_block_status(bs, sector_num, INT_MAX, &pnum); + if (ret < 0) { + return ret; + } + if (ret & BDRV_BLOCK_ZERO && pnum >= nb_sectors) { + qemu_iovec_memset(iov, 0, 0x00, iov->size); + return 0; + } + } + + lba = sector_qemu2lun(sector_num, iscsilun); + num_sectors = sector_qemu2lun(nb_sectors, iscsilun); + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsilun->use_16_for_rw) { + iTask.task = iscsi_read16_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + } else { + iTask.task = iscsi_read10_task(iscsilun->iscsi, iscsilun->lun, lba, + num_sectors * iscsilun->block_size, + iscsilun->block_size, + 0, 0, 0, 0, 0, + iscsi_co_generic_cb, &iTask); + } + if (iTask.task == NULL) { + return -ENOMEM; + } + scsi_task_set_iov_in(iTask.task, (struct scsi_iovec *) iov->iov, iov->niov); + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return iTask.err_code; + } + + return 0; +} + +static int coroutine_fn iscsi_co_flush(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + + if (!iscsilun->force_next_flush) { + return 0; + } + iscsilun->force_next_flush = false; + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_synchronizecache10_task(iscsilun->iscsi, iscsilun->lun, 0, 0, 0, + 0, iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return iTask.err_code; + } + + return 0; +} + +#ifdef __linux__ +static void +iscsi_aio_ioctl_cb(struct iscsi_context *iscsi, int status, + void *command_data, void *opaque) +{ + IscsiAIOCB *acb = opaque; + + g_free(acb->buf); + acb->buf = NULL; + + acb->status = 0; + if (status < 0) { + error_report("Failed to ioctl(SG_IO) to iSCSI lun. %s", + iscsi_get_error(iscsi)); + acb->status = iscsi_translate_sense(&acb->task->sense); + } + + acb->ioh->driver_status = 0; + acb->ioh->host_status = 0; + acb->ioh->resid = 0; + +#define SG_ERR_DRIVER_SENSE 0x08 + + if (status == SCSI_STATUS_CHECK_CONDITION && acb->task->datain.size >= 2) { + int ss; + + acb->ioh->driver_status |= SG_ERR_DRIVER_SENSE; + + acb->ioh->sb_len_wr = acb->task->datain.size - 2; + ss = (acb->ioh->mx_sb_len >= acb->ioh->sb_len_wr) ? + acb->ioh->mx_sb_len : acb->ioh->sb_len_wr; + memcpy(acb->ioh->sbp, &acb->task->datain.data[2], ss); + } + + iscsi_schedule_bh(acb); +} + +static void iscsi_ioctl_bh_completion(void *opaque) +{ + IscsiAIOCB *acb = opaque; + + qemu_bh_delete(acb->bh); + acb->common.cb(acb->common.opaque, acb->ret); + qemu_aio_unref(acb); +} + +static void iscsi_ioctl_handle_emulated(IscsiAIOCB *acb, int req, void *buf) +{ + BlockDriverState *bs = acb->common.bs; + IscsiLun *iscsilun = bs->opaque; + int ret = 0; + + switch (req) { + case SG_GET_VERSION_NUM: + *(int *)buf = 30000; + break; + case SG_GET_SCSI_ID: + ((struct sg_scsi_id *)buf)->scsi_type = iscsilun->type; + break; + default: + ret = -EINVAL; + } + assert(!acb->bh); + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), + iscsi_ioctl_bh_completion, acb); + acb->ret = ret; + qemu_bh_schedule(acb->bh); +} + +static BlockAIOCB *iscsi_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + struct iscsi_data data; + IscsiAIOCB *acb; + + acb = qemu_aio_get(&iscsi_aiocb_info, bs, cb, opaque); + + acb->iscsilun = iscsilun; + acb->bh = NULL; + acb->status = -EINPROGRESS; + acb->buf = NULL; + acb->ioh = buf; + + if (req != SG_IO) { + iscsi_ioctl_handle_emulated(acb, req, buf); + return &acb->common; + } + + acb->task = malloc(sizeof(struct scsi_task)); + if (acb->task == NULL) { + error_report("iSCSI: Failed to allocate task for scsi command. %s", + iscsi_get_error(iscsi)); + qemu_aio_unref(acb); + return NULL; + } + memset(acb->task, 0, sizeof(struct scsi_task)); + + switch (acb->ioh->dxfer_direction) { + case SG_DXFER_TO_DEV: + acb->task->xfer_dir = SCSI_XFER_WRITE; + break; + case SG_DXFER_FROM_DEV: + acb->task->xfer_dir = SCSI_XFER_READ; + break; + default: + acb->task->xfer_dir = SCSI_XFER_NONE; + break; + } + + acb->task->cdb_size = acb->ioh->cmd_len; + memcpy(&acb->task->cdb[0], acb->ioh->cmdp, acb->ioh->cmd_len); + acb->task->expxferlen = acb->ioh->dxfer_len; + + data.size = 0; + if (acb->task->xfer_dir == SCSI_XFER_WRITE) { + if (acb->ioh->iovec_count == 0) { + data.data = acb->ioh->dxferp; + data.size = acb->ioh->dxfer_len; + } else { + scsi_task_set_iov_out(acb->task, + (struct scsi_iovec *) acb->ioh->dxferp, + acb->ioh->iovec_count); + } + } + + if (iscsi_scsi_command_async(iscsi, iscsilun->lun, acb->task, + iscsi_aio_ioctl_cb, + (data.size > 0) ? &data : NULL, + acb) != 0) { + scsi_free_scsi_task(acb->task); + qemu_aio_unref(acb); + return NULL; + } + + /* tell libiscsi to read straight into the buffer we got from ioctl */ + if (acb->task->xfer_dir == SCSI_XFER_READ) { + if (acb->ioh->iovec_count == 0) { + scsi_task_add_data_in_buffer(acb->task, + acb->ioh->dxfer_len, + acb->ioh->dxferp); + } else { + scsi_task_set_iov_in(acb->task, + (struct scsi_iovec *) acb->ioh->dxferp, + acb->ioh->iovec_count); + } + } + + iscsi_set_events(iscsilun); + + return &acb->common; +} + +#endif + +static int64_t +iscsi_getlength(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + int64_t len; + + len = iscsilun->num_blocks; + len *= iscsilun->block_size; + + return len; +} + +static int +coroutine_fn iscsi_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + struct unmap_list list; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (!iscsilun->lbp.lbpu) { + /* UNMAP is not supported by the target */ + return 0; + } + + list.lba = sector_qemu2lun(sector_num, iscsilun); + list.num = sector_qemu2lun(nb_sectors, iscsilun); + + iscsi_co_init_iscsitask(iscsilun, &iTask); +retry: + if (iscsi_unmap_task(iscsilun->iscsi, iscsilun->lun, 0, 0, &list, 1, + iscsi_co_generic_cb, &iTask) == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION) { + /* the target might fail with a check condition if it + is not happy with the alignment of the UNMAP request + we silently fail in this case */ + return 0; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return iTask.err_code; + } + + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + + return 0; +} + +static int +coroutine_fn iscsi_co_write_zeroes(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + IscsiLun *iscsilun = bs->opaque; + struct IscsiTask iTask; + uint64_t lba; + uint32_t nb_blocks; + bool use_16_for_ws = iscsilun->use_16_for_rw; + + if (!is_request_lun_aligned(sector_num, nb_sectors, iscsilun)) { + return -EINVAL; + } + + if (flags & BDRV_REQ_MAY_UNMAP) { + if (!use_16_for_ws && !iscsilun->lbp.lbpws10) { + /* WRITESAME10 with UNMAP is unsupported try WRITESAME16 */ + use_16_for_ws = true; + } + if (use_16_for_ws && !iscsilun->lbp.lbpws) { + /* WRITESAME16 with UNMAP is not supported by the target, + * fall back and try WRITESAME10/16 without UNMAP */ + flags &= ~BDRV_REQ_MAY_UNMAP; + use_16_for_ws = iscsilun->use_16_for_rw; + } + } + + if (!(flags & BDRV_REQ_MAY_UNMAP) && !iscsilun->has_write_same) { + /* WRITESAME without UNMAP is not supported by the target */ + return -ENOTSUP; + } + + lba = sector_qemu2lun(sector_num, iscsilun); + nb_blocks = sector_qemu2lun(nb_sectors, iscsilun); + + if (iscsilun->zeroblock == NULL) { + iscsilun->zeroblock = g_try_malloc0(iscsilun->block_size); + if (iscsilun->zeroblock == NULL) { + return -ENOMEM; + } + } + + iscsi_co_init_iscsitask(iscsilun, &iTask); + iTask.force_next_flush = true; +retry: + if (use_16_for_ws) { + iTask.task = iscsi_writesame16_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask); + } else { + iTask.task = iscsi_writesame10_task(iscsilun->iscsi, iscsilun->lun, lba, + iscsilun->zeroblock, iscsilun->block_size, + nb_blocks, 0, !!(flags & BDRV_REQ_MAY_UNMAP), + 0, 0, iscsi_co_generic_cb, &iTask); + } + if (iTask.task == NULL) { + return -ENOMEM; + } + + while (!iTask.complete) { + iscsi_set_events(iscsilun); + qemu_coroutine_yield(); + } + + if (iTask.status == SCSI_STATUS_CHECK_CONDITION && + iTask.task->sense.key == SCSI_SENSE_ILLEGAL_REQUEST && + (iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_OPERATION_CODE || + iTask.task->sense.ascq == SCSI_SENSE_ASCQ_INVALID_FIELD_IN_CDB)) { + /* WRITE SAME is not supported by the target */ + iscsilun->has_write_same = false; + scsi_free_scsi_task(iTask.task); + return -ENOTSUP; + } + + if (iTask.task != NULL) { + scsi_free_scsi_task(iTask.task); + iTask.task = NULL; + } + + if (iTask.do_retry) { + iTask.complete = 0; + goto retry; + } + + if (iTask.status != SCSI_STATUS_GOOD) { + return iTask.err_code; + } + + if (flags & BDRV_REQ_MAY_UNMAP) { + iscsi_allocationmap_clear(iscsilun, sector_num, nb_sectors); + } else { + iscsi_allocationmap_set(iscsilun, sector_num, nb_sectors); + } + + return 0; +} + +static void parse_chap(struct iscsi_context *iscsi, const char *target, + Error **errp) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *user = NULL; + const char *password = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return; + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return; + } + } + + user = qemu_opt_get(opts, "user"); + if (!user) { + return; + } + + password = qemu_opt_get(opts, "password"); + if (!password) { + error_setg(errp, "CHAP username specified but no password was given"); + return; + } + + if (iscsi_set_initiator_username_pwd(iscsi, user, password)) { + error_setg(errp, "Failed to set initiator username and password"); + } +} + +static void parse_header_digest(struct iscsi_context *iscsi, const char *target, + Error **errp) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *digest = NULL; + + list = qemu_find_opts("iscsi"); + if (!list) { + return; + } + + opts = qemu_opts_find(list, target); + if (opts == NULL) { + opts = QTAILQ_FIRST(&list->head); + if (!opts) { + return; + } + } + + digest = qemu_opt_get(opts, "header-digest"); + if (!digest) { + return; + } + + if (!strcmp(digest, "CRC32C")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C); + } else if (!strcmp(digest, "NONE")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE); + } else if (!strcmp(digest, "CRC32C-NONE")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_CRC32C_NONE); + } else if (!strcmp(digest, "NONE-CRC32C")) { + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + } else { + error_setg(errp, "Invalid header-digest setting : %s", digest); + } +} + +static char *parse_initiator_name(const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *name; + char *iscsi_name; + UuidInfo *uuid_info; + + list = qemu_find_opts("iscsi"); + if (list) { + opts = qemu_opts_find(list, target); + if (!opts) { + opts = QTAILQ_FIRST(&list->head); + } + if (opts) { + name = qemu_opt_get(opts, "initiator-name"); + if (name) { + return g_strdup(name); + } + } + } + + uuid_info = qmp_query_uuid(NULL); + if (strcmp(uuid_info->UUID, UUID_NONE) == 0) { + name = qemu_get_vm_name(); + } else { + name = uuid_info->UUID; + } + iscsi_name = g_strdup_printf("iqn.2008-11.org.linux-kvm%s%s", + name ? ":" : "", name ? name : ""); + qapi_free_UuidInfo(uuid_info); + return iscsi_name; +} + +static int parse_timeout(const char *target) +{ + QemuOptsList *list; + QemuOpts *opts; + const char *timeout; + + list = qemu_find_opts("iscsi"); + if (list) { + opts = qemu_opts_find(list, target); + if (!opts) { + opts = QTAILQ_FIRST(&list->head); + } + if (opts) { + timeout = qemu_opt_get(opts, "timeout"); + if (timeout) { + return atoi(timeout); + } + } + } + + return 0; +} + +static void iscsi_nop_timed_event(void *opaque) +{ + IscsiLun *iscsilun = opaque; + + if (iscsi_get_nops_in_flight(iscsilun->iscsi) >= MAX_NOP_FAILURES) { + error_report("iSCSI: NOP timeout. Reconnecting..."); + iscsilun->request_timed_out = true; + } else if (iscsi_nop_out_async(iscsilun->iscsi, NULL, NULL, 0, NULL) != 0) { + error_report("iSCSI: failed to sent NOP-Out. Disabling NOP messages."); + return; + } + + timer_mod(iscsilun->nop_timer, qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); + iscsi_set_events(iscsilun); +} + +static void iscsi_readcapacity_sync(IscsiLun *iscsilun, Error **errp) +{ + struct scsi_task *task = NULL; + struct scsi_readcapacity10 *rc10 = NULL; + struct scsi_readcapacity16 *rc16 = NULL; + int retries = ISCSI_CMD_RETRIES; + + do { + if (task != NULL) { + scsi_free_scsi_task(task); + task = NULL; + } + + switch (iscsilun->type) { + case TYPE_DISK: + task = iscsi_readcapacity16_sync(iscsilun->iscsi, iscsilun->lun); + if (task != NULL && task->status == SCSI_STATUS_GOOD) { + rc16 = scsi_datain_unmarshall(task); + if (rc16 == NULL) { + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity16 data."); + } else { + iscsilun->block_size = rc16->block_length; + iscsilun->num_blocks = rc16->returned_lba + 1; + iscsilun->lbpme = !!rc16->lbpme; + iscsilun->lbprz = !!rc16->lbprz; + iscsilun->use_16_for_rw = (rc16->returned_lba > 0xffffffff); + } + } + break; + case TYPE_ROM: + task = iscsi_readcapacity10_sync(iscsilun->iscsi, iscsilun->lun, 0, 0); + if (task != NULL && task->status == SCSI_STATUS_GOOD) { + rc10 = scsi_datain_unmarshall(task); + if (rc10 == NULL) { + error_setg(errp, "iSCSI: Failed to unmarshall readcapacity10 data."); + } else { + iscsilun->block_size = rc10->block_size; + if (rc10->lba == 0) { + /* blank disk loaded */ + iscsilun->num_blocks = 0; + } else { + iscsilun->num_blocks = rc10->lba + 1; + } + } + } + break; + default: + return; + } + } while (task != NULL && task->status == SCSI_STATUS_CHECK_CONDITION + && task->sense.key == SCSI_SENSE_UNIT_ATTENTION + && retries-- > 0); + + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + error_setg(errp, "iSCSI: failed to send readcapacity10 command."); + } else if (!iscsilun->block_size || + iscsilun->block_size % BDRV_SECTOR_SIZE) { + error_setg(errp, "iSCSI: the target returned an invalid " + "block size of %d.", iscsilun->block_size); + } + if (task) { + scsi_free_scsi_task(task); + } +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "iscsi", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the iscsi image", + }, + { /* end of list */ } + }, +}; + +static struct scsi_task *iscsi_do_inquiry(struct iscsi_context *iscsi, int lun, + int evpd, int pc, void **inq, Error **errp) +{ + int full_size; + struct scsi_task *task = NULL; + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, 64); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + full_size = scsi_datain_getfullsize(task); + if (full_size > task->datain.size) { + scsi_free_scsi_task(task); + + /* we need more data for the full list */ + task = iscsi_inquiry_sync(iscsi, lun, evpd, pc, full_size); + if (task == NULL || task->status != SCSI_STATUS_GOOD) { + goto fail; + } + } + + *inq = scsi_datain_unmarshall(task); + if (*inq == NULL) { + error_setg(errp, "iSCSI: failed to unmarshall inquiry datain blob"); + goto fail_with_err; + } + + return task; + +fail: + error_setg(errp, "iSCSI: Inquiry command failed : %s", + iscsi_get_error(iscsi)); +fail_with_err: + if (task != NULL) { + scsi_free_scsi_task(task); + } + return NULL; +} + +static void iscsi_detach_aio_context(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + + aio_set_fd_handler(iscsilun->aio_context, iscsi_get_fd(iscsilun->iscsi), + false, NULL, NULL, NULL); + iscsilun->events = 0; + + if (iscsilun->nop_timer) { + timer_del(iscsilun->nop_timer); + timer_free(iscsilun->nop_timer); + iscsilun->nop_timer = NULL; + } + if (iscsilun->event_timer) { + timer_del(iscsilun->event_timer); + timer_free(iscsilun->event_timer); + iscsilun->event_timer = NULL; + } +} + +static void iscsi_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + IscsiLun *iscsilun = bs->opaque; + + iscsilun->aio_context = new_context; + iscsi_set_events(iscsilun); + + /* Set up a timer for sending out iSCSI NOPs */ + iscsilun->nop_timer = aio_timer_new(iscsilun->aio_context, + QEMU_CLOCK_REALTIME, SCALE_MS, + iscsi_nop_timed_event, iscsilun); + timer_mod(iscsilun->nop_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + NOP_INTERVAL); + + /* Set up a timer for periodic calls to iscsi_set_events and to + * scan for command timeout */ + iscsilun->event_timer = aio_timer_new(iscsilun->aio_context, + QEMU_CLOCK_REALTIME, SCALE_MS, + iscsi_timed_check_events, iscsilun); + timer_mod(iscsilun->event_timer, + qemu_clock_get_ms(QEMU_CLOCK_REALTIME) + EVENT_INTERVAL); +} + +static void iscsi_modesense_sync(IscsiLun *iscsilun) +{ + struct scsi_task *task; + struct scsi_mode_sense *ms = NULL; + iscsilun->write_protected = false; + iscsilun->dpofua = false; + + task = iscsi_modesense6_sync(iscsilun->iscsi, iscsilun->lun, + 1, SCSI_MODESENSE_PC_CURRENT, + 0x3F, 0, 255); + if (task == NULL) { + error_report("iSCSI: Failed to send MODE_SENSE(6) command: %s", + iscsi_get_error(iscsilun->iscsi)); + goto out; + } + + if (task->status != SCSI_STATUS_GOOD) { + error_report("iSCSI: Failed MODE_SENSE(6), LUN assumed writable"); + goto out; + } + ms = scsi_datain_unmarshall(task); + if (!ms) { + error_report("iSCSI: Failed to unmarshall MODE_SENSE(6) data: %s", + iscsi_get_error(iscsilun->iscsi)); + goto out; + } + iscsilun->write_protected = ms->device_specific_parameter & 0x80; + iscsilun->dpofua = ms->device_specific_parameter & 0x10; + +out: + if (task) { + scsi_free_scsi_task(task); + } +} + +/* + * We support iscsi url's on the form + * iscsi://[<username>%<password>@]<host>[:<port>]/<targetname>/<lun> + */ +static int iscsi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = NULL; + struct iscsi_url *iscsi_url = NULL; + struct scsi_task *task = NULL; + struct scsi_inquiry_standard *inq = NULL; + struct scsi_inquiry_supported_pages *inq_vpd; + char *initiator_name = NULL; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + int i, ret = 0, timeout = 0; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + filename = qemu_opt_get(opts, "filename"); + + iscsi_url = iscsi_parse_full_url(iscsi, filename); + if (iscsi_url == NULL) { + error_setg(errp, "Failed to parse URL : %s", filename); + ret = -EINVAL; + goto out; + } + + memset(iscsilun, 0, sizeof(IscsiLun)); + + initiator_name = parse_initiator_name(iscsi_url->target); + + iscsi = iscsi_create_context(initiator_name); + if (iscsi == NULL) { + error_setg(errp, "iSCSI: Failed to create iSCSI context."); + ret = -ENOMEM; + goto out; + } + + if (iscsi_set_targetname(iscsi, iscsi_url->target)) { + error_setg(errp, "iSCSI: Failed to set target name."); + ret = -EINVAL; + goto out; + } + + if (iscsi_url->user[0] != '\0') { + ret = iscsi_set_initiator_username_pwd(iscsi, iscsi_url->user, + iscsi_url->passwd); + if (ret != 0) { + error_setg(errp, "Failed to set initiator username and password"); + ret = -EINVAL; + goto out; + } + } + + /* check if we got CHAP username/password via the options */ + parse_chap(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + if (iscsi_set_session_type(iscsi, ISCSI_SESSION_NORMAL) != 0) { + error_setg(errp, "iSCSI: Failed to set session type to normal."); + ret = -EINVAL; + goto out; + } + + iscsi_set_header_digest(iscsi, ISCSI_HEADER_DIGEST_NONE_CRC32C); + + /* check if we got HEADER_DIGEST via the options */ + parse_header_digest(iscsi, iscsi_url->target, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + /* timeout handling is broken in libiscsi before 1.15.0 */ + timeout = parse_timeout(iscsi_url->target); +#if defined(LIBISCSI_API_VERSION) && LIBISCSI_API_VERSION >= 20150621 + iscsi_set_timeout(iscsi, timeout); +#else + if (timeout) { + error_report("iSCSI: ignoring timeout value for libiscsi <1.15.0"); + } +#endif + + if (iscsi_full_connect_sync(iscsi, iscsi_url->portal, iscsi_url->lun) != 0) { + error_setg(errp, "iSCSI: Failed to connect to LUN : %s", + iscsi_get_error(iscsi)); + ret = -EINVAL; + goto out; + } + + iscsilun->iscsi = iscsi; + iscsilun->aio_context = bdrv_get_aio_context(bs); + iscsilun->lun = iscsi_url->lun; + iscsilun->has_write_same = true; + + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 0, 0, + (void **) &inq, errp); + if (task == NULL) { + ret = -EINVAL; + goto out; + } + iscsilun->type = inq->periperal_device_type; + scsi_free_scsi_task(task); + task = NULL; + + iscsi_modesense_sync(iscsilun); + + /* Check the write protect flag of the LUN if we want to write */ + if (iscsilun->type == TYPE_DISK && (flags & BDRV_O_RDWR) && + iscsilun->write_protected) { + error_setg(errp, "Cannot open a write protected LUN as read-write"); + ret = -EACCES; + goto out; + } + + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + bs->total_sectors = sector_lun2qemu(iscsilun->num_blocks, iscsilun); + bs->request_alignment = iscsilun->block_size; + + /* We don't have any emulation for devices other than disks and CD-ROMs, so + * this must be sg ioctl compatible. We force it to be sg, otherwise qemu + * will try to read from the device to guess the image format. + */ + if (iscsilun->type != TYPE_DISK && iscsilun->type != TYPE_ROM) { + bs->sg = 1; + } + + task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_SUPPORTED_VPD_PAGES, + (void **) &inq_vpd, errp); + if (task == NULL) { + ret = -EINVAL; + goto out; + } + for (i = 0; i < inq_vpd->num_pages; i++) { + struct scsi_task *inq_task; + struct scsi_inquiry_logical_block_provisioning *inq_lbp; + struct scsi_inquiry_block_limits *inq_bl; + switch (inq_vpd->pages[i]) { + case SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_LOGICAL_BLOCK_PROVISIONING, + (void **) &inq_lbp, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->lbp, inq_lbp, + sizeof(struct scsi_inquiry_logical_block_provisioning)); + scsi_free_scsi_task(inq_task); + break; + case SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS: + inq_task = iscsi_do_inquiry(iscsilun->iscsi, iscsilun->lun, 1, + SCSI_INQUIRY_PAGECODE_BLOCK_LIMITS, + (void **) &inq_bl, errp); + if (inq_task == NULL) { + ret = -EINVAL; + goto out; + } + memcpy(&iscsilun->bl, inq_bl, + sizeof(struct scsi_inquiry_block_limits)); + scsi_free_scsi_task(inq_task); + break; + default: + break; + } + } + scsi_free_scsi_task(task); + task = NULL; + + iscsi_attach_aio_context(bs, iscsilun->aio_context); + + /* Guess the internal cluster (page) size of the iscsi target by the means + * of opt_unmap_gran. Transfer the unmap granularity only if it has a + * reasonable size */ + if (iscsilun->bl.opt_unmap_gran * iscsilun->block_size >= 4 * 1024 && + iscsilun->bl.opt_unmap_gran * iscsilun->block_size <= 16 * 1024 * 1024) { + iscsilun->cluster_sectors = (iscsilun->bl.opt_unmap_gran * + iscsilun->block_size) >> BDRV_SECTOR_BITS; + if (iscsilun->lbprz) { + iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); + if (iscsilun->allocationmap == NULL) { + ret = -ENOMEM; + } + } + } + +out: + qemu_opts_del(opts); + g_free(initiator_name); + if (iscsi_url != NULL) { + iscsi_destroy_url(iscsi_url); + } + if (task != NULL) { + scsi_free_scsi_task(task); + } + + if (ret) { + if (iscsi != NULL) { + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } + iscsi_destroy_context(iscsi); + } + memset(iscsilun, 0, sizeof(IscsiLun)); + } + return ret; +} + +static void iscsi_close(BlockDriverState *bs) +{ + IscsiLun *iscsilun = bs->opaque; + struct iscsi_context *iscsi = iscsilun->iscsi; + + iscsi_detach_aio_context(bs); + if (iscsi_is_logged_in(iscsi)) { + iscsi_logout_sync(iscsi); + } + iscsi_destroy_context(iscsi); + g_free(iscsilun->zeroblock); + g_free(iscsilun->allocationmap); + memset(iscsilun, 0, sizeof(IscsiLun)); +} + +static int sector_limits_lun2qemu(int64_t sector, IscsiLun *iscsilun) +{ + return MIN(sector_lun2qemu(sector, iscsilun), INT_MAX / 2 + 1); +} + +static void iscsi_refresh_limits(BlockDriverState *bs, Error **errp) +{ + /* We don't actually refresh here, but just return data queried in + * iscsi_open(): iscsi targets don't change their limits. */ + + IscsiLun *iscsilun = bs->opaque; + uint32_t max_xfer_len = iscsilun->use_16_for_rw ? 0xffffffff : 0xffff; + + if (iscsilun->bl.max_xfer_len) { + max_xfer_len = MIN(max_xfer_len, iscsilun->bl.max_xfer_len); + } + + bs->bl.max_transfer_length = sector_limits_lun2qemu(max_xfer_len, iscsilun); + + if (iscsilun->lbp.lbpu) { + if (iscsilun->bl.max_unmap < 0xffffffff) { + bs->bl.max_discard = + sector_limits_lun2qemu(iscsilun->bl.max_unmap, iscsilun); + } + bs->bl.discard_alignment = + sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + } + + if (iscsilun->bl.max_ws_len < 0xffffffff) { + bs->bl.max_write_zeroes = + sector_limits_lun2qemu(iscsilun->bl.max_ws_len, iscsilun); + } + if (iscsilun->lbp.lbpws) { + bs->bl.write_zeroes_alignment = + sector_limits_lun2qemu(iscsilun->bl.opt_unmap_gran, iscsilun); + } + bs->bl.opt_transfer_length = + sector_limits_lun2qemu(iscsilun->bl.opt_xfer_len, iscsilun); +} + +/* Note that this will not re-establish a connection with an iSCSI target - it + * is effectively a NOP. */ +static int iscsi_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + IscsiLun *iscsilun = state->bs->opaque; + + if (state->flags & BDRV_O_RDWR && iscsilun->write_protected) { + error_setg(errp, "Cannot open a write protected LUN as read-write"); + return -EACCES; + } + return 0; +} + +static int iscsi_truncate(BlockDriverState *bs, int64_t offset) +{ + IscsiLun *iscsilun = bs->opaque; + Error *local_err = NULL; + + if (iscsilun->type != TYPE_DISK) { + return -ENOTSUP; + } + + iscsi_readcapacity_sync(iscsilun, &local_err); + if (local_err != NULL) { + error_free(local_err); + return -EIO; + } + + if (offset > iscsi_getlength(bs)) { + return -EINVAL; + } + + if (iscsilun->allocationmap != NULL) { + g_free(iscsilun->allocationmap); + iscsilun->allocationmap = iscsi_allocationmap_init(iscsilun); + } + + return 0; +} + +static int iscsi_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int ret = 0; + int64_t total_size = 0; + BlockDriverState *bs; + IscsiLun *iscsilun = NULL; + QDict *bs_options; + + bs = bdrv_new(); + + /* Read out options */ + total_size = DIV_ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + bs->opaque = g_new0(struct IscsiLun, 1); + iscsilun = bs->opaque; + + bs_options = qdict_new(); + qdict_put(bs_options, "filename", qstring_from_str(filename)); + ret = iscsi_open(bs, bs_options, 0, NULL); + QDECREF(bs_options); + + if (ret != 0) { + goto out; + } + iscsi_detach_aio_context(bs); + if (iscsilun->type != TYPE_DISK) { + ret = -ENODEV; + goto out; + } + if (bs->total_sectors < total_size) { + ret = -ENOSPC; + goto out; + } + + ret = 0; +out: + if (iscsilun->iscsi != NULL) { + iscsi_destroy_context(iscsilun->iscsi); + } + g_free(bs->opaque); + bs->opaque = NULL; + bdrv_unref(bs); + return ret; +} + +static int iscsi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + IscsiLun *iscsilun = bs->opaque; + bdi->unallocated_blocks_are_zero = iscsilun->lbprz; + bdi->can_write_zeroes_with_unmap = iscsilun->lbprz && iscsilun->lbp.lbpws; + bdi->cluster_size = iscsilun->cluster_sectors * BDRV_SECTOR_SIZE; + return 0; +} + +static QemuOptsList iscsi_create_opts = { + .name = "iscsi-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(iscsi_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_iscsi = { + .format_name = "iscsi", + .protocol_name = "iscsi", + + .instance_size = sizeof(IscsiLun), + .bdrv_needs_filename = true, + .bdrv_file_open = iscsi_open, + .bdrv_close = iscsi_close, + .bdrv_create = iscsi_create, + .create_opts = &iscsi_create_opts, + .bdrv_reopen_prepare = iscsi_reopen_prepare, + + .bdrv_getlength = iscsi_getlength, + .bdrv_get_info = iscsi_get_info, + .bdrv_truncate = iscsi_truncate, + .bdrv_refresh_limits = iscsi_refresh_limits, + + .bdrv_co_get_block_status = iscsi_co_get_block_status, + .bdrv_co_discard = iscsi_co_discard, + .bdrv_co_write_zeroes = iscsi_co_write_zeroes, + .bdrv_co_readv = iscsi_co_readv, + .bdrv_co_writev = iscsi_co_writev, + .bdrv_co_flush_to_disk = iscsi_co_flush, + +#ifdef __linux__ + .bdrv_aio_ioctl = iscsi_aio_ioctl, +#endif + + .bdrv_detach_aio_context = iscsi_detach_aio_context, + .bdrv_attach_aio_context = iscsi_attach_aio_context, +}; + +static QemuOptsList qemu_iscsi_opts = { + .name = "iscsi", + .head = QTAILQ_HEAD_INITIALIZER(qemu_iscsi_opts.head), + .desc = { + { + .name = "user", + .type = QEMU_OPT_STRING, + .help = "username for CHAP authentication to target", + },{ + .name = "password", + .type = QEMU_OPT_STRING, + .help = "password for CHAP authentication to target", + },{ + .name = "header-digest", + .type = QEMU_OPT_STRING, + .help = "HeaderDigest setting. " + "{CRC32C|CRC32C-NONE|NONE-CRC32C|NONE}", + },{ + .name = "initiator-name", + .type = QEMU_OPT_STRING, + .help = "Initiator iqn name to use when connecting", + },{ + .name = "timeout", + .type = QEMU_OPT_NUMBER, + .help = "Request timeout in seconds (default 0 = no timeout)", + }, + { /* end of list */ } + }, +}; + +static void iscsi_block_init(void) +{ + bdrv_register(&bdrv_iscsi); + qemu_add_opts(&qemu_iscsi_opts); +} + +block_init(iscsi_block_init); diff --git a/src/block/linux-aio.c b/src/block/linux-aio.c new file mode 100644 index 0000000..88b0520 --- /dev/null +++ b/src/block/linux-aio.c @@ -0,0 +1,338 @@ +/* + * Linux native AIO support. + * + * Copyright (C) 2009 IBM, Corp. + * Copyright (C) 2009 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ +#include "qemu-common.h" +#include "block/aio.h" +#include "qemu/queue.h" +#include "block/raw-aio.h" +#include "qemu/event_notifier.h" + +#include <libaio.h> + +/* + * Queue size (per-device). + * + * XXX: eventually we need to communicate this to the guest and/or make it + * tunable by the guest. If we get more outstanding requests at a time + * than this we will get EAGAIN from io_submit which is communicated to + * the guest as an I/O error. + */ +#define MAX_EVENTS 128 + +#define MAX_QUEUED_IO 128 + +struct qemu_laiocb { + BlockAIOCB common; + struct qemu_laio_state *ctx; + struct iocb iocb; + ssize_t ret; + size_t nbytes; + QEMUIOVector *qiov; + bool is_read; + QSIMPLEQ_ENTRY(qemu_laiocb) next; +}; + +typedef struct { + int plugged; + unsigned int n; + bool blocked; + QSIMPLEQ_HEAD(, qemu_laiocb) pending; +} LaioQueue; + +struct qemu_laio_state { + io_context_t ctx; + EventNotifier e; + + /* io queue for submit at batch */ + LaioQueue io_q; + + /* I/O completion processing */ + QEMUBH *completion_bh; + struct io_event events[MAX_EVENTS]; + int event_idx; + int event_max; +}; + +static void ioq_submit(struct qemu_laio_state *s); + +static inline ssize_t io_event_ret(struct io_event *ev) +{ + return (ssize_t)(((uint64_t)ev->res2 << 32) | ev->res); +} + +/* + * Completes an AIO request (calls the callback and frees the ACB). + */ +static void qemu_laio_process_completion(struct qemu_laio_state *s, + struct qemu_laiocb *laiocb) +{ + int ret; + + ret = laiocb->ret; + if (ret != -ECANCELED) { + if (ret == laiocb->nbytes) { + ret = 0; + } else if (ret >= 0) { + /* Short reads mean EOF, pad with zeros. */ + if (laiocb->is_read) { + qemu_iovec_memset(laiocb->qiov, ret, 0, + laiocb->qiov->size - ret); + } else { + ret = -EINVAL; + } + } + } + laiocb->common.cb(laiocb->common.opaque, ret); + + qemu_aio_unref(laiocb); +} + +/* The completion BH fetches completed I/O requests and invokes their + * callbacks. + * + * The function is somewhat tricky because it supports nested event loops, for + * example when a request callback invokes aio_poll(). In order to do this, + * the completion events array and index are kept in qemu_laio_state. The BH + * reschedules itself as long as there are completions pending so it will + * either be called again in a nested event loop or will be called after all + * events have been completed. When there are no events left to complete, the + * BH returns without rescheduling. + */ +static void qemu_laio_completion_bh(void *opaque) +{ + struct qemu_laio_state *s = opaque; + + /* Fetch more completion events when empty */ + if (s->event_idx == s->event_max) { + do { + struct timespec ts = { 0 }; + s->event_max = io_getevents(s->ctx, MAX_EVENTS, MAX_EVENTS, + s->events, &ts); + } while (s->event_max == -EINTR); + + s->event_idx = 0; + if (s->event_max <= 0) { + s->event_max = 0; + return; /* no more events */ + } + } + + /* Reschedule so nested event loops see currently pending completions */ + qemu_bh_schedule(s->completion_bh); + + /* Process completion events */ + while (s->event_idx < s->event_max) { + struct iocb *iocb = s->events[s->event_idx].obj; + struct qemu_laiocb *laiocb = + container_of(iocb, struct qemu_laiocb, iocb); + + laiocb->ret = io_event_ret(&s->events[s->event_idx]); + s->event_idx++; + + qemu_laio_process_completion(s, laiocb); + } + + if (!s->io_q.plugged && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + ioq_submit(s); + } +} + +static void qemu_laio_completion_cb(EventNotifier *e) +{ + struct qemu_laio_state *s = container_of(e, struct qemu_laio_state, e); + + if (event_notifier_test_and_clear(&s->e)) { + qemu_bh_schedule(s->completion_bh); + } +} + +static void laio_cancel(BlockAIOCB *blockacb) +{ + struct qemu_laiocb *laiocb = (struct qemu_laiocb *)blockacb; + struct io_event event; + int ret; + + if (laiocb->ret != -EINPROGRESS) { + return; + } + ret = io_cancel(laiocb->ctx->ctx, &laiocb->iocb, &event); + laiocb->ret = -ECANCELED; + if (ret != 0) { + /* iocb is not cancelled, cb will be called by the event loop later */ + return; + } + + laiocb->common.cb(laiocb->common.opaque, laiocb->ret); +} + +static const AIOCBInfo laio_aiocb_info = { + .aiocb_size = sizeof(struct qemu_laiocb), + .cancel_async = laio_cancel, +}; + +static void ioq_init(LaioQueue *io_q) +{ + QSIMPLEQ_INIT(&io_q->pending); + io_q->plugged = 0; + io_q->n = 0; + io_q->blocked = false; +} + +static void ioq_submit(struct qemu_laio_state *s) +{ + int ret, len; + struct qemu_laiocb *aiocb; + struct iocb *iocbs[MAX_QUEUED_IO]; + QSIMPLEQ_HEAD(, qemu_laiocb) completed; + + do { + len = 0; + QSIMPLEQ_FOREACH(aiocb, &s->io_q.pending, next) { + iocbs[len++] = &aiocb->iocb; + if (len == MAX_QUEUED_IO) { + break; + } + } + + ret = io_submit(s->ctx, len, iocbs); + if (ret == -EAGAIN) { + break; + } + if (ret < 0) { + abort(); + } + + s->io_q.n -= ret; + aiocb = container_of(iocbs[ret - 1], struct qemu_laiocb, iocb); + QSIMPLEQ_SPLIT_AFTER(&s->io_q.pending, aiocb, next, &completed); + } while (ret == len && !QSIMPLEQ_EMPTY(&s->io_q.pending)); + s->io_q.blocked = (s->io_q.n > 0); +} + +void laio_io_plug(BlockDriverState *bs, void *aio_ctx) +{ + struct qemu_laio_state *s = aio_ctx; + + s->io_q.plugged++; +} + +void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug) +{ + struct qemu_laio_state *s = aio_ctx; + + assert(s->io_q.plugged > 0 || !unplug); + + if (unplug && --s->io_q.plugged > 0) { + return; + } + + if (!s->io_q.blocked && !QSIMPLEQ_EMPTY(&s->io_q.pending)) { + ioq_submit(s); + } +} + +BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + struct qemu_laio_state *s = aio_ctx; + struct qemu_laiocb *laiocb; + struct iocb *iocbs; + off_t offset = sector_num * 512; + + laiocb = qemu_aio_get(&laio_aiocb_info, bs, cb, opaque); + laiocb->nbytes = nb_sectors * 512; + laiocb->ctx = s; + laiocb->ret = -EINPROGRESS; + laiocb->is_read = (type == QEMU_AIO_READ); + laiocb->qiov = qiov; + + iocbs = &laiocb->iocb; + + switch (type) { + case QEMU_AIO_WRITE: + io_prep_pwritev(iocbs, fd, qiov->iov, qiov->niov, offset); + break; + case QEMU_AIO_READ: + io_prep_preadv(iocbs, fd, qiov->iov, qiov->niov, offset); + break; + /* Currently Linux kernel does not support other operations */ + default: + fprintf(stderr, "%s: invalid AIO request type 0x%x.\n", + __func__, type); + goto out_free_aiocb; + } + io_set_eventfd(&laiocb->iocb, event_notifier_get_fd(&s->e)); + + QSIMPLEQ_INSERT_TAIL(&s->io_q.pending, laiocb, next); + s->io_q.n++; + if (!s->io_q.blocked && + (!s->io_q.plugged || s->io_q.n >= MAX_QUEUED_IO)) { + ioq_submit(s); + } + return &laiocb->common; + +out_free_aiocb: + qemu_aio_unref(laiocb); + return NULL; +} + +void laio_detach_aio_context(void *s_, AioContext *old_context) +{ + struct qemu_laio_state *s = s_; + + aio_set_event_notifier(old_context, &s->e, false, NULL); + qemu_bh_delete(s->completion_bh); +} + +void laio_attach_aio_context(void *s_, AioContext *new_context) +{ + struct qemu_laio_state *s = s_; + + s->completion_bh = aio_bh_new(new_context, qemu_laio_completion_bh, s); + aio_set_event_notifier(new_context, &s->e, false, + qemu_laio_completion_cb); +} + +void *laio_init(void) +{ + struct qemu_laio_state *s; + + s = g_malloc0(sizeof(*s)); + if (event_notifier_init(&s->e, false) < 0) { + goto out_free_state; + } + + if (io_setup(MAX_EVENTS, &s->ctx) != 0) { + goto out_close_efd; + } + + ioq_init(&s->io_q); + + return s; + +out_close_efd: + event_notifier_cleanup(&s->e); +out_free_state: + g_free(s); + return NULL; +} + +void laio_cleanup(void *s_) +{ + struct qemu_laio_state *s = s_; + + event_notifier_cleanup(&s->e); + + if (io_destroy(s->ctx) != 0) { + fprintf(stderr, "%s: destroy AIO context %p failed\n", + __func__, &s->ctx); + } + g_free(s); +} diff --git a/src/block/mirror.c b/src/block/mirror.c new file mode 100644 index 0000000..0e8f556 --- /dev/null +++ b/src/block/mirror.c @@ -0,0 +1,847 @@ +/* + * Image mirroring + * + * Copyright Red Hat, Inc. 2012 + * + * Authors: + * Paolo Bonzini <pbonzini@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block/blockjob.h" +#include "block/block_int.h" +#include "sysemu/block-backend.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" +#include "qemu/bitmap.h" + +#define SLICE_TIME 100000000ULL /* ns */ +#define MAX_IN_FLIGHT 16 +#define DEFAULT_MIRROR_BUF_SIZE (10 << 20) + +/* The mirroring buffer is a list of granularity-sized chunks. + * Free chunks are organized in a list. + */ +typedef struct MirrorBuffer { + QSIMPLEQ_ENTRY(MirrorBuffer) next; +} MirrorBuffer; + +typedef struct MirrorBlockJob { + BlockJob common; + RateLimit limit; + BlockDriverState *target; + BlockDriverState *base; + /* The name of the graph node to replace */ + char *replaces; + /* The BDS to replace */ + BlockDriverState *to_replace; + /* Used to block operations on the drive-mirror-replace target */ + Error *replace_blocker; + bool is_none_mode; + BlockdevOnError on_source_error, on_target_error; + bool synced; + bool should_complete; + int64_t sector_num; + int64_t granularity; + size_t buf_size; + int64_t bdev_length; + unsigned long *cow_bitmap; + BdrvDirtyBitmap *dirty_bitmap; + HBitmapIter hbi; + uint8_t *buf; + QSIMPLEQ_HEAD(, MirrorBuffer) buf_free; + int buf_free_count; + + unsigned long *in_flight_bitmap; + int in_flight; + int sectors_in_flight; + int ret; + bool unmap; + bool waiting_for_io; +} MirrorBlockJob; + +typedef struct MirrorOp { + MirrorBlockJob *s; + QEMUIOVector qiov; + int64_t sector_num; + int nb_sectors; +} MirrorOp; + +static BlockErrorAction mirror_error_action(MirrorBlockJob *s, bool read, + int error) +{ + s->synced = false; + if (read) { + return block_job_error_action(&s->common, s->common.bs, + s->on_source_error, true, error); + } else { + return block_job_error_action(&s->common, s->target, + s->on_target_error, false, error); + } +} + +static void mirror_iteration_done(MirrorOp *op, int ret) +{ + MirrorBlockJob *s = op->s; + struct iovec *iov; + int64_t chunk_num; + int i, nb_chunks, sectors_per_chunk; + + trace_mirror_iteration_done(s, op->sector_num, op->nb_sectors, ret); + + s->in_flight--; + s->sectors_in_flight -= op->nb_sectors; + iov = op->qiov.iov; + for (i = 0; i < op->qiov.niov; i++) { + MirrorBuffer *buf = (MirrorBuffer *) iov[i].iov_base; + QSIMPLEQ_INSERT_TAIL(&s->buf_free, buf, next); + s->buf_free_count++; + } + + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + chunk_num = op->sector_num / sectors_per_chunk; + nb_chunks = op->nb_sectors / sectors_per_chunk; + bitmap_clear(s->in_flight_bitmap, chunk_num, nb_chunks); + if (ret >= 0) { + if (s->cow_bitmap) { + bitmap_set(s->cow_bitmap, chunk_num, nb_chunks); + } + s->common.offset += (uint64_t)op->nb_sectors * BDRV_SECTOR_SIZE; + } + + qemu_iovec_destroy(&op->qiov); + g_free(op); + + if (s->waiting_for_io) { + qemu_coroutine_enter(s->common.co, NULL); + } +} + +static void mirror_write_complete(void *opaque, int ret) +{ + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; + if (ret < 0) { + BlockErrorAction action; + + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); + action = mirror_error_action(s, false, -ret); + if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { + s->ret = ret; + } + } + mirror_iteration_done(op, ret); +} + +static void mirror_read_complete(void *opaque, int ret) +{ + MirrorOp *op = opaque; + MirrorBlockJob *s = op->s; + if (ret < 0) { + BlockErrorAction action; + + bdrv_set_dirty_bitmap(s->dirty_bitmap, op->sector_num, op->nb_sectors); + action = mirror_error_action(s, true, -ret); + if (action == BLOCK_ERROR_ACTION_REPORT && s->ret >= 0) { + s->ret = ret; + } + + mirror_iteration_done(op, ret); + return; + } + bdrv_aio_writev(s->target, op->sector_num, &op->qiov, op->nb_sectors, + mirror_write_complete, op); +} + +static uint64_t coroutine_fn mirror_iteration(MirrorBlockJob *s) +{ + BlockDriverState *source = s->common.bs; + int nb_sectors, sectors_per_chunk, nb_chunks; + int64_t end, sector_num, next_chunk, next_sector, hbitmap_next_sector; + uint64_t delay_ns = 0; + MirrorOp *op; + int pnum; + int64_t ret; + + s->sector_num = hbitmap_iter_next(&s->hbi); + if (s->sector_num < 0) { + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); + s->sector_num = hbitmap_iter_next(&s->hbi); + trace_mirror_restart_iter(s, bdrv_get_dirty_count(s->dirty_bitmap)); + assert(s->sector_num >= 0); + } + + hbitmap_next_sector = s->sector_num; + sector_num = s->sector_num; + sectors_per_chunk = s->granularity >> BDRV_SECTOR_BITS; + end = s->bdev_length / BDRV_SECTOR_SIZE; + + /* Extend the QEMUIOVector to include all adjacent blocks that will + * be copied in this operation. + * + * We have to do this if we have no backing file yet in the destination, + * and the cluster size is very large. Then we need to do COW ourselves. + * The first time a cluster is copied, copy it entirely. Note that, + * because both the granularity and the cluster size are powers of two, + * the number of sectors to copy cannot exceed one cluster. + * + * We also want to extend the QEMUIOVector to include more adjacent + * dirty blocks if possible, to limit the number of I/O operations and + * run efficiently even with a small granularity. + */ + nb_chunks = 0; + nb_sectors = 0; + next_sector = sector_num; + next_chunk = sector_num / sectors_per_chunk; + + /* Wait for I/O to this cluster (from a previous iteration) to be done. */ + while (test_bit(next_chunk, s->in_flight_bitmap)) { + trace_mirror_yield_in_flight(s, sector_num, s->in_flight); + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; + } + + do { + int added_sectors, added_chunks; + + if (!bdrv_get_dirty(source, s->dirty_bitmap, next_sector) || + test_bit(next_chunk, s->in_flight_bitmap)) { + assert(nb_sectors > 0); + break; + } + + added_sectors = sectors_per_chunk; + if (s->cow_bitmap && !test_bit(next_chunk, s->cow_bitmap)) { + bdrv_round_to_clusters(s->target, + next_sector, added_sectors, + &next_sector, &added_sectors); + + /* On the first iteration, the rounding may make us copy + * sectors before the first dirty one. + */ + if (next_sector < sector_num) { + assert(nb_sectors == 0); + sector_num = next_sector; + next_chunk = next_sector / sectors_per_chunk; + } + } + + added_sectors = MIN(added_sectors, end - (sector_num + nb_sectors)); + added_chunks = (added_sectors + sectors_per_chunk - 1) / sectors_per_chunk; + + /* When doing COW, it may happen that there is not enough space for + * a full cluster. Wait if that is the case. + */ + while (nb_chunks == 0 && s->buf_free_count < added_chunks) { + trace_mirror_yield_buf_busy(s, nb_chunks, s->in_flight); + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; + } + if (s->buf_free_count < nb_chunks + added_chunks) { + trace_mirror_break_buf_busy(s, nb_chunks, s->in_flight); + break; + } + if (IOV_MAX < nb_chunks + added_chunks) { + trace_mirror_break_iov_max(s, nb_chunks, added_chunks); + break; + } + + /* We have enough free space to copy these sectors. */ + bitmap_set(s->in_flight_bitmap, next_chunk, added_chunks); + + nb_sectors += added_sectors; + nb_chunks += added_chunks; + next_sector += added_sectors; + next_chunk += added_chunks; + if (!s->synced && s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, added_sectors); + } + } while (delay_ns == 0 && next_sector < end); + + /* Allocate a MirrorOp that is used as an AIO callback. */ + op = g_new(MirrorOp, 1); + op->s = s; + op->sector_num = sector_num; + op->nb_sectors = nb_sectors; + + /* Now make a QEMUIOVector taking enough granularity-sized chunks + * from s->buf_free. + */ + qemu_iovec_init(&op->qiov, nb_chunks); + next_sector = sector_num; + while (nb_chunks-- > 0) { + MirrorBuffer *buf = QSIMPLEQ_FIRST(&s->buf_free); + size_t remaining = (nb_sectors * BDRV_SECTOR_SIZE) - op->qiov.size; + + QSIMPLEQ_REMOVE_HEAD(&s->buf_free, next); + s->buf_free_count--; + qemu_iovec_add(&op->qiov, buf, MIN(s->granularity, remaining)); + + /* Advance the HBitmapIter in parallel, so that we do not examine + * the same sector twice. + */ + if (next_sector > hbitmap_next_sector + && bdrv_get_dirty(source, s->dirty_bitmap, next_sector)) { + hbitmap_next_sector = hbitmap_iter_next(&s->hbi); + } + + next_sector += sectors_per_chunk; + } + + bdrv_reset_dirty_bitmap(s->dirty_bitmap, sector_num, nb_sectors); + + /* Copy the dirty cluster. */ + s->in_flight++; + s->sectors_in_flight += nb_sectors; + trace_mirror_one_iteration(s, sector_num, nb_sectors); + + ret = bdrv_get_block_status_above(source, NULL, sector_num, + nb_sectors, &pnum); + if (ret < 0 || pnum < nb_sectors || + (ret & BDRV_BLOCK_DATA && !(ret & BDRV_BLOCK_ZERO))) { + bdrv_aio_readv(source, sector_num, &op->qiov, nb_sectors, + mirror_read_complete, op); + } else if (ret & BDRV_BLOCK_ZERO) { + bdrv_aio_write_zeroes(s->target, sector_num, op->nb_sectors, + s->unmap ? BDRV_REQ_MAY_UNMAP : 0, + mirror_write_complete, op); + } else { + assert(!(ret & BDRV_BLOCK_DATA)); + bdrv_aio_discard(s->target, sector_num, op->nb_sectors, + mirror_write_complete, op); + } + return delay_ns; +} + +static void mirror_free_init(MirrorBlockJob *s) +{ + int granularity = s->granularity; + size_t buf_size = s->buf_size; + uint8_t *buf = s->buf; + + assert(s->buf_free_count == 0); + QSIMPLEQ_INIT(&s->buf_free); + while (buf_size != 0) { + MirrorBuffer *cur = (MirrorBuffer *)buf; + QSIMPLEQ_INSERT_TAIL(&s->buf_free, cur, next); + s->buf_free_count++; + buf_size -= granularity; + buf += granularity; + } +} + +static void mirror_drain(MirrorBlockJob *s) +{ + while (s->in_flight > 0) { + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; + } +} + +typedef struct { + int ret; +} MirrorExitData; + +static void mirror_exit(BlockJob *job, void *opaque) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + MirrorExitData *data = opaque; + AioContext *replace_aio_context = NULL; + BlockDriverState *src = s->common.bs; + + /* Make sure that the source BDS doesn't go away before we called + * block_job_completed(). */ + bdrv_ref(src); + + if (s->to_replace) { + replace_aio_context = bdrv_get_aio_context(s->to_replace); + aio_context_acquire(replace_aio_context); + } + + if (s->should_complete && data->ret == 0) { + BlockDriverState *to_replace = s->common.bs; + if (s->to_replace) { + to_replace = s->to_replace; + } + if (bdrv_get_flags(s->target) != bdrv_get_flags(to_replace)) { + bdrv_reopen(s->target, bdrv_get_flags(to_replace), NULL); + } + bdrv_replace_in_backing_chain(to_replace, s->target); + } + if (s->to_replace) { + bdrv_op_unblock_all(s->to_replace, s->replace_blocker); + error_free(s->replace_blocker); + bdrv_unref(s->to_replace); + } + if (replace_aio_context) { + aio_context_release(replace_aio_context); + } + g_free(s->replaces); + bdrv_op_unblock_all(s->target, s->common.blocker); + bdrv_unref(s->target); + block_job_completed(&s->common, data->ret); + g_free(data); + bdrv_drained_end(src); + bdrv_unref(src); +} + +static void coroutine_fn mirror_run(void *opaque) +{ + MirrorBlockJob *s = opaque; + MirrorExitData *data; + BlockDriverState *bs = s->common.bs; + int64_t sector_num, end, length; + uint64_t last_pause_ns; + BlockDriverInfo bdi; + char backing_filename[2]; /* we only need 2 characters because we are only + checking for a NULL string */ + int ret = 0; + int n; + + if (block_job_is_cancelled(&s->common)) { + goto immediate_exit; + } + + s->bdev_length = bdrv_getlength(bs); + if (s->bdev_length < 0) { + ret = s->bdev_length; + goto immediate_exit; + } else if (s->bdev_length == 0) { + /* Report BLOCK_JOB_READY and wait for complete. */ + block_job_event_ready(&s->common); + s->synced = true; + while (!block_job_is_cancelled(&s->common) && !s->should_complete) { + block_job_yield(&s->common); + } + s->common.cancelled = false; + goto immediate_exit; + } + + length = DIV_ROUND_UP(s->bdev_length, s->granularity); + s->in_flight_bitmap = bitmap_new(length); + + /* If we have no backing file yet in the destination, we cannot let + * the destination do COW. Instead, we copy sectors around the + * dirty data if needed. We need a bitmap to do that. + */ + bdrv_get_backing_filename(s->target, backing_filename, + sizeof(backing_filename)); + if (backing_filename[0] && !s->target->backing) { + ret = bdrv_get_info(s->target, &bdi); + if (ret < 0) { + goto immediate_exit; + } + if (s->granularity < bdi.cluster_size) { + s->buf_size = MAX(s->buf_size, bdi.cluster_size); + s->cow_bitmap = bitmap_new(length); + } + } + + end = s->bdev_length / BDRV_SECTOR_SIZE; + s->buf = qemu_try_blockalign(bs, s->buf_size); + if (s->buf == NULL) { + ret = -ENOMEM; + goto immediate_exit; + } + + mirror_free_init(s); + + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + if (!s->is_none_mode) { + /* First part, loop on the sectors and initialize the dirty bitmap. */ + BlockDriverState *base = s->base; + bool mark_all_dirty = s->base == NULL && !bdrv_has_zero_init(s->target); + + for (sector_num = 0; sector_num < end; ) { + /* Just to make sure we are not exceeding int limit. */ + int nb_sectors = MIN(INT_MAX >> BDRV_SECTOR_BITS, + end - sector_num); + int64_t now = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + + if (now - last_pause_ns > SLICE_TIME) { + last_pause_ns = now; + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, 0); + } + + if (block_job_is_cancelled(&s->common)) { + goto immediate_exit; + } + + ret = bdrv_is_allocated_above(bs, base, sector_num, nb_sectors, &n); + + if (ret < 0) { + goto immediate_exit; + } + + assert(n > 0); + if (ret == 1 || mark_all_dirty) { + bdrv_set_dirty_bitmap(s->dirty_bitmap, sector_num, n); + } + sector_num += n; + } + } + + bdrv_dirty_iter_init(s->dirty_bitmap, &s->hbi); + for (;;) { + uint64_t delay_ns = 0; + int64_t cnt; + bool should_complete; + + if (s->ret < 0) { + ret = s->ret; + goto immediate_exit; + } + + cnt = bdrv_get_dirty_count(s->dirty_bitmap); + /* s->common.offset contains the number of bytes already processed so + * far, cnt is the number of dirty sectors remaining and + * s->sectors_in_flight is the number of sectors currently being + * processed; together those are the current total operation length */ + s->common.len = s->common.offset + + (cnt + s->sectors_in_flight) * BDRV_SECTOR_SIZE; + + /* Note that even when no rate limit is applied we need to yield + * periodically with no pending I/O so that bdrv_drain_all() returns. + * We do so every SLICE_TIME nanoseconds, or when there is an error, + * or when the source is clean, whichever comes first. + */ + if (qemu_clock_get_ns(QEMU_CLOCK_REALTIME) - last_pause_ns < SLICE_TIME && + s->common.iostatus == BLOCK_DEVICE_IO_STATUS_OK) { + if (s->in_flight == MAX_IN_FLIGHT || s->buf_free_count == 0 || + (cnt == 0 && s->in_flight > 0)) { + trace_mirror_yield(s, s->in_flight, s->buf_free_count, cnt); + s->waiting_for_io = true; + qemu_coroutine_yield(); + s->waiting_for_io = false; + continue; + } else if (cnt != 0) { + delay_ns = mirror_iteration(s); + } + } + + should_complete = false; + if (s->in_flight == 0 && cnt == 0) { + trace_mirror_before_flush(s); + ret = bdrv_flush(s->target); + if (ret < 0) { + if (mirror_error_action(s, false, -ret) == + BLOCK_ERROR_ACTION_REPORT) { + goto immediate_exit; + } + } else { + /* We're out of the streaming phase. From now on, if the job + * is cancelled we will actually complete all pending I/O and + * report completion. This way, block-job-cancel will leave + * the target in a consistent state. + */ + if (!s->synced) { + block_job_event_ready(&s->common); + s->synced = true; + } + + should_complete = s->should_complete || + block_job_is_cancelled(&s->common); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); + } + } + + if (cnt == 0 && should_complete) { + /* The dirty bitmap is not updated while operations are pending. + * If we're about to exit, wait for pending operations before + * calling bdrv_get_dirty_count(bs), or we may exit while the + * source has dirty data to copy! + * + * Note that I/O can be submitted by the guest while + * mirror_populate runs. + */ + trace_mirror_before_drain(s, cnt); + bdrv_drain(bs); + cnt = bdrv_get_dirty_count(s->dirty_bitmap); + } + + ret = 0; + trace_mirror_before_sleep(s, cnt, s->synced, delay_ns); + if (!s->synced) { + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + if (block_job_is_cancelled(&s->common)) { + break; + } + } else if (!should_complete) { + delay_ns = (s->in_flight == 0 && cnt == 0 ? SLICE_TIME : 0); + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + } else if (cnt == 0) { + /* The two disks are in sync. Exit and report successful + * completion. + */ + assert(QLIST_EMPTY(&bs->tracked_requests)); + s->common.cancelled = false; + break; + } + last_pause_ns = qemu_clock_get_ns(QEMU_CLOCK_REALTIME); + } + +immediate_exit: + if (s->in_flight > 0) { + /* We get here only if something went wrong. Either the job failed, + * or it was cancelled prematurely so that we do not guarantee that + * the target is a copy of the source. + */ + assert(ret < 0 || (!s->synced && block_job_is_cancelled(&s->common))); + mirror_drain(s); + } + + assert(s->in_flight == 0); + qemu_vfree(s->buf); + g_free(s->cow_bitmap); + g_free(s->in_flight_bitmap); + bdrv_release_dirty_bitmap(bs, s->dirty_bitmap); + if (s->target->blk) { + blk_iostatus_disable(s->target->blk); + } + + data = g_malloc(sizeof(*data)); + data->ret = ret; + /* Before we switch to target in mirror_exit, make sure data doesn't + * change. */ + bdrv_drained_begin(s->common.bs); + block_job_defer_to_main_loop(&s->common, mirror_exit, data); +} + +static void mirror_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + if (speed < 0) { + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); + return; + } + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static void mirror_iostatus_reset(BlockJob *job) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + + if (s->target->blk) { + blk_iostatus_reset(s->target->blk); + } +} + +static void mirror_complete(BlockJob *job, Error **errp) +{ + MirrorBlockJob *s = container_of(job, MirrorBlockJob, common); + Error *local_err = NULL; + int ret; + + ret = bdrv_open_backing_file(s->target, NULL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return; + } + if (!s->synced) { + error_setg(errp, QERR_BLOCK_JOB_NOT_READY, job->id); + return; + } + + /* check the target bs is not blocked and block all operations on it */ + if (s->replaces) { + AioContext *replace_aio_context; + + s->to_replace = bdrv_find_node(s->replaces); + if (!s->to_replace) { + error_setg(errp, "Node name '%s' not found", s->replaces); + return; + } + + replace_aio_context = bdrv_get_aio_context(s->to_replace); + aio_context_acquire(replace_aio_context); + + error_setg(&s->replace_blocker, + "block device is in use by block-job-complete"); + bdrv_op_block_all(s->to_replace, s->replace_blocker); + bdrv_ref(s->to_replace); + + aio_context_release(replace_aio_context); + } + + s->should_complete = true; + block_job_enter(&s->common); +} + +static const BlockJobDriver mirror_job_driver = { + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_MIRROR, + .set_speed = mirror_set_speed, + .iostatus_reset= mirror_iostatus_reset, + .complete = mirror_complete, +}; + +static const BlockJobDriver commit_active_job_driver = { + .instance_size = sizeof(MirrorBlockJob), + .job_type = BLOCK_JOB_TYPE_COMMIT, + .set_speed = mirror_set_speed, + .iostatus_reset + = mirror_iostatus_reset, + .complete = mirror_complete, +}; + +static void mirror_start_job(BlockDriverState *bs, BlockDriverState *target, + const char *replaces, + int64_t speed, uint32_t granularity, + int64_t buf_size, + BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, + BlockCompletionFunc *cb, + void *opaque, Error **errp, + const BlockJobDriver *driver, + bool is_none_mode, BlockDriverState *base) +{ + MirrorBlockJob *s; + + if (granularity == 0) { + granularity = bdrv_get_default_bitmap_granularity(target); + } + + assert ((granularity & (granularity - 1)) == 0); + + if ((on_source_error == BLOCKDEV_ON_ERROR_STOP || + on_source_error == BLOCKDEV_ON_ERROR_ENOSPC) && + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { + error_setg(errp, QERR_INVALID_PARAMETER, "on-source-error"); + return; + } + + if (buf_size < 0) { + error_setg(errp, "Invalid parameter 'buf-size'"); + return; + } + + if (buf_size == 0) { + buf_size = DEFAULT_MIRROR_BUF_SIZE; + } + + s = block_job_create(driver, bs, speed, cb, opaque, errp); + if (!s) { + return; + } + + s->replaces = g_strdup(replaces); + s->on_source_error = on_source_error; + s->on_target_error = on_target_error; + s->target = target; + s->is_none_mode = is_none_mode; + s->base = base; + s->granularity = granularity; + s->buf_size = ROUND_UP(buf_size, granularity); + s->unmap = unmap; + + s->dirty_bitmap = bdrv_create_dirty_bitmap(bs, granularity, NULL, errp); + if (!s->dirty_bitmap) { + g_free(s->replaces); + block_job_unref(&s->common); + return; + } + + bdrv_op_block_all(s->target, s->common.blocker); + + bdrv_set_enable_write_cache(s->target, true); + if (s->target->blk) { + blk_set_on_error(s->target->blk, on_target_error, on_target_error); + blk_iostatus_enable(s->target->blk); + } + s->common.co = qemu_coroutine_create(mirror_run); + trace_mirror_start(bs, s, s->common.co, opaque); + qemu_coroutine_enter(s->common.co, s); +} + +void mirror_start(BlockDriverState *bs, BlockDriverState *target, + const char *replaces, + int64_t speed, uint32_t granularity, int64_t buf_size, + MirrorSyncMode mode, BlockdevOnError on_source_error, + BlockdevOnError on_target_error, + bool unmap, + BlockCompletionFunc *cb, + void *opaque, Error **errp) +{ + bool is_none_mode; + BlockDriverState *base; + + if (mode == MIRROR_SYNC_MODE_INCREMENTAL) { + error_setg(errp, "Sync mode 'incremental' not supported"); + return; + } + is_none_mode = mode == MIRROR_SYNC_MODE_NONE; + base = mode == MIRROR_SYNC_MODE_TOP ? backing_bs(bs) : NULL; + mirror_start_job(bs, target, replaces, + speed, granularity, buf_size, + on_source_error, on_target_error, unmap, cb, opaque, errp, + &mirror_job_driver, is_none_mode, base); +} + +void commit_active_start(BlockDriverState *bs, BlockDriverState *base, + int64_t speed, + BlockdevOnError on_error, + BlockCompletionFunc *cb, + void *opaque, Error **errp) +{ + int64_t length, base_length; + int orig_base_flags; + int ret; + Error *local_err = NULL; + + orig_base_flags = bdrv_get_flags(base); + + if (bdrv_reopen(base, bs->open_flags, errp)) { + return; + } + + length = bdrv_getlength(bs); + if (length < 0) { + error_setg_errno(errp, -length, + "Unable to determine length of %s", bs->filename); + goto error_restore_flags; + } + + base_length = bdrv_getlength(base); + if (base_length < 0) { + error_setg_errno(errp, -base_length, + "Unable to determine length of %s", base->filename); + goto error_restore_flags; + } + + if (length > base_length) { + ret = bdrv_truncate(base, length); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Top image %s is larger than base image %s, and " + "resize of base image failed", + bs->filename, base->filename); + goto error_restore_flags; + } + } + + bdrv_ref(base); + mirror_start_job(bs, base, NULL, speed, 0, 0, + on_error, on_error, false, cb, opaque, &local_err, + &commit_active_job_driver, false, base); + if (local_err) { + error_propagate(errp, local_err); + goto error_restore_flags; + } + + return; + +error_restore_flags: + /* ignore error and errp for bdrv_reopen, because we want to propagate + * the original error */ + bdrv_reopen(base, orig_base_flags, NULL); + return; +} diff --git a/src/block/nbd-client.c b/src/block/nbd-client.c new file mode 100644 index 0000000..b7fd17a --- /dev/null +++ b/src/block/nbd-client.c @@ -0,0 +1,409 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "nbd-client.h" +#include "qemu/sockets.h" + +#define HANDLE_TO_INDEX(bs, handle) ((handle) ^ ((uint64_t)(intptr_t)bs)) +#define INDEX_TO_HANDLE(bs, index) ((index) ^ ((uint64_t)(intptr_t)bs)) + +static void nbd_recv_coroutines_enter_all(NbdClientSession *s) +{ + int i; + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + } + } +} + +static void nbd_teardown_connection(BlockDriverState *bs) +{ + NbdClientSession *client = nbd_get_client_session(bs); + + /* finish any pending coroutines */ + shutdown(client->sock, 2); + nbd_recv_coroutines_enter_all(client); + + nbd_client_detach_aio_context(bs); + closesocket(client->sock); + client->sock = -1; +} + +static void nbd_reply_ready(void *opaque) +{ + BlockDriverState *bs = opaque; + NbdClientSession *s = nbd_get_client_session(bs); + uint64_t i; + int ret; + + if (s->reply.handle == 0) { + /* No reply already in flight. Fetch a header. It is possible + * that another thread has done the same thing in parallel, so + * the socket is not readable anymore. + */ + ret = nbd_receive_reply(s->sock, &s->reply); + if (ret == -EAGAIN) { + return; + } + if (ret < 0) { + s->reply.handle = 0; + goto fail; + } + } + + /* There's no need for a mutex on the receive side, because the + * handler acts as a synchronization point and ensures that only + * one coroutine is called until the reply finishes. */ + i = HANDLE_TO_INDEX(s, s->reply.handle); + if (i >= MAX_NBD_REQUESTS) { + goto fail; + } + + if (s->recv_coroutine[i]) { + qemu_coroutine_enter(s->recv_coroutine[i], NULL); + return; + } + +fail: + nbd_teardown_connection(bs); +} + +static void nbd_restart_write(void *opaque) +{ + BlockDriverState *bs = opaque; + + qemu_coroutine_enter(nbd_get_client_session(bs)->send_coroutine, NULL); +} + +static int nbd_co_send_request(BlockDriverState *bs, + struct nbd_request *request, + QEMUIOVector *qiov, int offset) +{ + NbdClientSession *s = nbd_get_client_session(bs); + AioContext *aio_context; + int rc, ret, i; + + qemu_co_mutex_lock(&s->send_mutex); + + for (i = 0; i < MAX_NBD_REQUESTS; i++) { + if (s->recv_coroutine[i] == NULL) { + s->recv_coroutine[i] = qemu_coroutine_self(); + break; + } + } + + assert(i < MAX_NBD_REQUESTS); + request->handle = INDEX_TO_HANDLE(s, i); + s->send_coroutine = qemu_coroutine_self(); + aio_context = bdrv_get_aio_context(bs); + + aio_set_fd_handler(aio_context, s->sock, false, + nbd_reply_ready, nbd_restart_write, bs); + if (qiov) { + if (!s->is_unix) { + socket_set_cork(s->sock, 1); + } + rc = nbd_send_request(s->sock, request); + if (rc >= 0) { + ret = qemu_co_sendv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + rc = -EIO; + } + } + if (!s->is_unix) { + socket_set_cork(s->sock, 0); + } + } else { + rc = nbd_send_request(s->sock, request); + } + aio_set_fd_handler(aio_context, s->sock, false, + nbd_reply_ready, NULL, bs); + s->send_coroutine = NULL; + qemu_co_mutex_unlock(&s->send_mutex); + return rc; +} + +static void nbd_co_receive_reply(NbdClientSession *s, + struct nbd_request *request, struct nbd_reply *reply, + QEMUIOVector *qiov, int offset) +{ + int ret; + + /* Wait until we're woken up by the read handler. TODO: perhaps + * peek at the next reply and avoid yielding if it's ours? */ + qemu_coroutine_yield(); + *reply = s->reply; + if (reply->handle != request->handle) { + reply->error = EIO; + } else { + if (qiov && reply->error == 0) { + ret = qemu_co_recvv(s->sock, qiov->iov, qiov->niov, + offset, request->len); + if (ret != request->len) { + reply->error = EIO; + } + } + + /* Tell the read handler to read another header. */ + s->reply.handle = 0; + } +} + +static void nbd_coroutine_start(NbdClientSession *s, + struct nbd_request *request) +{ + /* Poor man semaphore. The free_sema is locked when no other request + * can be accepted, and unlocked after receiving one reply. */ + if (s->in_flight >= MAX_NBD_REQUESTS - 1) { + qemu_co_mutex_lock(&s->free_sema); + assert(s->in_flight < MAX_NBD_REQUESTS); + } + s->in_flight++; + + /* s->recv_coroutine[i] is set as soon as we get the send_lock. */ +} + +static void nbd_coroutine_end(NbdClientSession *s, + struct nbd_request *request) +{ + int i = HANDLE_TO_INDEX(s, request->handle); + s->recv_coroutine[i] = NULL; + if (s->in_flight-- == MAX_NBD_REQUESTS) { + qemu_co_mutex_unlock(&s->free_sema); + } +} + +static int nbd_co_readv_1(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + NbdClientSession *client = nbd_get_client_session(bs); + struct nbd_request request = { .type = NBD_CMD_READ }; + struct nbd_reply reply; + ssize_t ret; + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(bs, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, qiov, offset); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +static int nbd_co_writev_1(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov, + int offset) +{ + NbdClientSession *client = nbd_get_client_session(bs); + struct nbd_request request = { .type = NBD_CMD_WRITE }; + struct nbd_reply reply; + ssize_t ret; + + if (!bdrv_enable_write_cache(bs) && + (client->nbdflags & NBD_FLAG_SEND_FUA)) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(bs, &request, qiov, offset); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +/* qemu-nbd has a limit of slightly less than 1M per request. Try to + * remain aligned to 4K. */ +#define NBD_MAX_SECTORS 2040 + +int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_readv_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_readv_1(bs, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int offset = 0; + int ret; + while (nb_sectors > NBD_MAX_SECTORS) { + ret = nbd_co_writev_1(bs, sector_num, NBD_MAX_SECTORS, qiov, offset); + if (ret < 0) { + return ret; + } + offset += NBD_MAX_SECTORS * 512; + sector_num += NBD_MAX_SECTORS; + nb_sectors -= NBD_MAX_SECTORS; + } + return nbd_co_writev_1(bs, sector_num, nb_sectors, qiov, offset); +} + +int nbd_client_co_flush(BlockDriverState *bs) +{ + NbdClientSession *client = nbd_get_client_session(bs); + struct nbd_request request = { .type = NBD_CMD_FLUSH }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_FLUSH)) { + return 0; + } + + if (client->nbdflags & NBD_FLAG_SEND_FUA) { + request.type |= NBD_CMD_FLAG_FUA; + } + + request.from = 0; + request.len = 0; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(bs, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; +} + +int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + NbdClientSession *client = nbd_get_client_session(bs); + struct nbd_request request = { .type = NBD_CMD_TRIM }; + struct nbd_reply reply; + ssize_t ret; + + if (!(client->nbdflags & NBD_FLAG_SEND_TRIM)) { + return 0; + } + request.from = sector_num * 512; + request.len = nb_sectors * 512; + + nbd_coroutine_start(client, &request); + ret = nbd_co_send_request(bs, &request, NULL, 0); + if (ret < 0) { + reply.error = -ret; + } else { + nbd_co_receive_reply(client, &request, &reply, NULL, 0); + } + nbd_coroutine_end(client, &request); + return -reply.error; + +} + +void nbd_client_detach_aio_context(BlockDriverState *bs) +{ + aio_set_fd_handler(bdrv_get_aio_context(bs), + nbd_get_client_session(bs)->sock, + false, NULL, NULL, NULL); +} + +void nbd_client_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + aio_set_fd_handler(new_context, nbd_get_client_session(bs)->sock, + false, nbd_reply_ready, NULL, bs); +} + +void nbd_client_close(BlockDriverState *bs) +{ + NbdClientSession *client = nbd_get_client_session(bs); + struct nbd_request request = { + .type = NBD_CMD_DISC, + .from = 0, + .len = 0 + }; + + if (client->sock == -1) { + return; + } + + nbd_send_request(client->sock, &request); + + nbd_teardown_connection(bs); +} + +int nbd_client_init(BlockDriverState *bs, int sock, const char *export, + Error **errp) +{ + NbdClientSession *client = nbd_get_client_session(bs); + int ret; + + /* NBD handshake */ + logout("session init %s\n", export); + qemu_set_block(sock); + ret = nbd_receive_negotiate(sock, export, + &client->nbdflags, &client->size, errp); + if (ret < 0) { + logout("Failed to negotiate with the NBD server\n"); + closesocket(sock); + return ret; + } + + qemu_co_mutex_init(&client->send_mutex); + qemu_co_mutex_init(&client->free_sema); + client->sock = sock; + + /* Now that we're connected, set the socket to be non-blocking and + * kick the reply mechanism. */ + qemu_set_nonblock(sock); + nbd_client_attach_aio_context(bs, bdrv_get_aio_context(bs)); + + logout("Established connection with NBD server\n"); + return 0; +} diff --git a/src/block/nbd-client.h b/src/block/nbd-client.h new file mode 100644 index 0000000..e841340 --- /dev/null +++ b/src/block/nbd-client.h @@ -0,0 +1,53 @@ +#ifndef NBD_CLIENT_H +#define NBD_CLIENT_H + +#include "qemu-common.h" +#include "block/nbd.h" +#include "block/block_int.h" + +/* #define DEBUG_NBD */ + +#if defined(DEBUG_NBD) +#define logout(fmt, ...) \ + fprintf(stderr, "nbd\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +#define MAX_NBD_REQUESTS 16 + +typedef struct NbdClientSession { + int sock; + uint32_t nbdflags; + off_t size; + + CoMutex send_mutex; + CoMutex free_sema; + Coroutine *send_coroutine; + int in_flight; + + Coroutine *recv_coroutine[MAX_NBD_REQUESTS]; + struct nbd_reply reply; + + bool is_unix; +} NbdClientSession; + +NbdClientSession *nbd_get_client_session(BlockDriverState *bs); + +int nbd_client_init(BlockDriverState *bs, int sock, const char *export_name, + Error **errp); +void nbd_client_close(BlockDriverState *bs); + +int nbd_client_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors); +int nbd_client_co_flush(BlockDriverState *bs); +int nbd_client_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); +int nbd_client_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov); + +void nbd_client_detach_aio_context(BlockDriverState *bs); +void nbd_client_attach_aio_context(BlockDriverState *bs, + AioContext *new_context); + +#endif /* NBD_CLIENT_H */ diff --git a/src/block/nbd.c b/src/block/nbd.c new file mode 100644 index 0000000..cd6a587 --- /dev/null +++ b/src/block/nbd.c @@ -0,0 +1,451 @@ +/* + * QEMU Block driver for NBD + * + * Copyright (C) 2008 Bull S.A.S. + * Author: Laurent Vivier <Laurent.Vivier@bull.net> + * + * Some parts: + * Copyright (C) 2007 Anthony Liguori <anthony@codemonkey.ws> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/nbd-client.h" +#include "qemu/uri.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/sockets.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qjson.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" + +#include <sys/types.h> +#include <unistd.h> + +#define EN_OPTSTR ":exportname=" + +typedef struct BDRVNBDState { + NbdClientSession client; +} BDRVNBDState; + +static int nbd_parse_uri(const char *filename, QDict *options) +{ + URI *uri; + const char *p; + QueryParams *qp = NULL; + int ret = 0; + bool is_unix; + + uri = uri_parse(filename); + if (!uri) { + return -EINVAL; + } + + /* transport */ + if (!strcmp(uri->scheme, "nbd")) { + is_unix = false; + } else if (!strcmp(uri->scheme, "nbd+tcp")) { + is_unix = false; + } else if (!strcmp(uri->scheme, "nbd+unix")) { + is_unix = true; + } else { + ret = -EINVAL; + goto out; + } + + p = uri->path ? uri->path : "/"; + p += strspn(p, "/"); + if (p[0]) { + qdict_put(options, "export", qstring_from_str(p)); + } + + qp = query_params_parse(uri->query); + if (qp->n > 1 || (is_unix && !qp->n) || (!is_unix && qp->n)) { + ret = -EINVAL; + goto out; + } + + if (is_unix) { + /* nbd+unix:///export?socket=path */ + if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { + ret = -EINVAL; + goto out; + } + qdict_put(options, "path", qstring_from_str(qp->p[0].value)); + } else { + QString *host; + /* nbd[+tcp]://host[:port]/export */ + if (!uri->server) { + ret = -EINVAL; + goto out; + } + + /* strip braces from literal IPv6 address */ + if (uri->server[0] == '[') { + host = qstring_from_substr(uri->server, 1, + strlen(uri->server) - 2); + } else { + host = qstring_from_str(uri->server); + } + + qdict_put(options, "host", host); + if (uri->port) { + char* port_str = g_strdup_printf("%d", uri->port); + qdict_put(options, "port", qstring_from_str(port_str)); + g_free(port_str); + } + } + +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + return ret; +} + +static void nbd_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + char *file; + char *export_name; + const char *host_spec; + const char *unixpath; + + if (qdict_haskey(options, "host") + || qdict_haskey(options, "port") + || qdict_haskey(options, "path")) + { + error_setg(errp, "host/port/path and a file name may not be specified " + "at the same time"); + return; + } + + if (strstr(filename, "://")) { + int ret = nbd_parse_uri(filename, options); + if (ret < 0) { + error_setg(errp, "No valid URL specified"); + } + return; + } + + file = g_strdup(filename); + + export_name = strstr(file, EN_OPTSTR); + if (export_name) { + if (export_name[strlen(EN_OPTSTR)] == 0) { + goto out; + } + export_name[0] = 0; /* truncate 'file' */ + export_name += strlen(EN_OPTSTR); + + qdict_put(options, "export", qstring_from_str(export_name)); + } + + /* extract the host_spec - fail if it's not nbd:... */ + if (!strstart(file, "nbd:", &host_spec)) { + error_setg(errp, "File name string for NBD must start with 'nbd:'"); + goto out; + } + + if (!*host_spec) { + goto out; + } + + /* are we a UNIX or TCP socket? */ + if (strstart(host_spec, "unix:", &unixpath)) { + qdict_put(options, "path", qstring_from_str(unixpath)); + } else { + InetSocketAddress *addr = NULL; + + addr = inet_parse(host_spec, errp); + if (!addr) { + goto out; + } + + qdict_put(options, "host", qstring_from_str(addr->host)); + qdict_put(options, "port", qstring_from_str(addr->port)); + qapi_free_InetSocketAddress(addr); + } + +out: + g_free(file); +} + +static SocketAddress *nbd_config(BDRVNBDState *s, QDict *options, char **export, + Error **errp) +{ + SocketAddress *saddr; + + if (qdict_haskey(options, "path") == qdict_haskey(options, "host")) { + if (qdict_haskey(options, "path")) { + error_setg(errp, "path and host may not be used at the same time."); + } else { + error_setg(errp, "one of path and host must be specified."); + } + return NULL; + } + + saddr = g_new0(SocketAddress, 1); + + if (qdict_haskey(options, "path")) { + saddr->type = SOCKET_ADDRESS_KIND_UNIX; + saddr->u.q_unix = g_new0(UnixSocketAddress, 1); + saddr->u.q_unix->path = g_strdup(qdict_get_str(options, "path")); + qdict_del(options, "path"); + } else { + saddr->type = SOCKET_ADDRESS_KIND_INET; + saddr->u.inet = g_new0(InetSocketAddress, 1); + saddr->u.inet->host = g_strdup(qdict_get_str(options, "host")); + if (!qdict_get_try_str(options, "port")) { + saddr->u.inet->port = g_strdup_printf("%d", NBD_DEFAULT_PORT); + } else { + saddr->u.inet->port = g_strdup(qdict_get_str(options, "port")); + } + qdict_del(options, "host"); + qdict_del(options, "port"); + } + + s->client.is_unix = saddr->type == SOCKET_ADDRESS_KIND_UNIX; + + *export = g_strdup(qdict_get_try_str(options, "export")); + if (*export) { + qdict_del(options, "export"); + } + + return saddr; +} + +NbdClientSession *nbd_get_client_session(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + return &s->client; +} + +static int nbd_establish_connection(BlockDriverState *bs, + SocketAddress *saddr, + Error **errp) +{ + BDRVNBDState *s = bs->opaque; + int sock; + + sock = socket_connect(saddr, errp, NULL, NULL); + + if (sock < 0) { + logout("Failed to establish connection to NBD server\n"); + return -EIO; + } + + if (!s->client.is_unix) { + socket_set_nodelay(sock); + } + + return sock; +} + +static int nbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVNBDState *s = bs->opaque; + char *export = NULL; + int result, sock; + SocketAddress *saddr; + + /* Pop the config into our state object. Exit if invalid. */ + saddr = nbd_config(s, options, &export, errp); + if (!saddr) { + return -EINVAL; + } + + /* establish TCP connection, return error if it fails + * TODO: Configurable retry-until-timeout behaviour. + */ + sock = nbd_establish_connection(bs, saddr, errp); + qapi_free_SocketAddress(saddr); + if (sock < 0) { + g_free(export); + return sock; + } + + /* NBD handshake */ + result = nbd_client_init(bs, sock, export, errp); + g_free(export); + return result; +} + +static int nbd_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + return nbd_client_co_readv(bs, sector_num, nb_sectors, qiov); +} + +static int nbd_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + return nbd_client_co_writev(bs, sector_num, nb_sectors, qiov); +} + +static int nbd_co_flush(BlockDriverState *bs) +{ + return nbd_client_co_flush(bs); +} + +static void nbd_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl.max_discard = UINT32_MAX >> BDRV_SECTOR_BITS; + bs->bl.max_transfer_length = UINT32_MAX >> BDRV_SECTOR_BITS; +} + +static int nbd_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + return nbd_client_co_discard(bs, sector_num, nb_sectors); +} + +static void nbd_close(BlockDriverState *bs) +{ + nbd_client_close(bs); +} + +static int64_t nbd_getlength(BlockDriverState *bs) +{ + BDRVNBDState *s = bs->opaque; + + return s->client.size; +} + +static void nbd_detach_aio_context(BlockDriverState *bs) +{ + nbd_client_detach_aio_context(bs); +} + +static void nbd_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + nbd_client_attach_aio_context(bs, new_context); +} + +static void nbd_refresh_filename(BlockDriverState *bs) +{ + QDict *opts = qdict_new(); + const char *path = qdict_get_try_str(bs->options, "path"); + const char *host = qdict_get_try_str(bs->options, "host"); + const char *port = qdict_get_try_str(bs->options, "port"); + const char *export = qdict_get_try_str(bs->options, "export"); + + qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("nbd"))); + + if (path && export) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd+unix:///%s?socket=%s", export, path); + } else if (path && !export) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd+unix://?socket=%s", path); + } else if (!path && export && port) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd://%s:%s/%s", host, port, export); + } else if (!path && export && !port) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd://%s/%s", host, export); + } else if (!path && !export && port) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd://%s:%s", host, port); + } else if (!path && !export && !port) { + snprintf(bs->exact_filename, sizeof(bs->exact_filename), + "nbd://%s", host); + } + + if (path) { + qdict_put_obj(opts, "path", QOBJECT(qstring_from_str(path))); + } else if (port) { + qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); + qdict_put_obj(opts, "port", QOBJECT(qstring_from_str(port))); + } else { + qdict_put_obj(opts, "host", QOBJECT(qstring_from_str(host))); + } + if (export) { + qdict_put_obj(opts, "export", QOBJECT(qstring_from_str(export))); + } + + bs->full_open_options = opts; +} + +static BlockDriver bdrv_nbd = { + .format_name = "nbd", + .protocol_name = "nbd", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, + .bdrv_getlength = nbd_getlength, + .bdrv_detach_aio_context = nbd_detach_aio_context, + .bdrv_attach_aio_context = nbd_attach_aio_context, + .bdrv_refresh_filename = nbd_refresh_filename, +}; + +static BlockDriver bdrv_nbd_tcp = { + .format_name = "nbd", + .protocol_name = "nbd+tcp", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, + .bdrv_getlength = nbd_getlength, + .bdrv_detach_aio_context = nbd_detach_aio_context, + .bdrv_attach_aio_context = nbd_attach_aio_context, + .bdrv_refresh_filename = nbd_refresh_filename, +}; + +static BlockDriver bdrv_nbd_unix = { + .format_name = "nbd", + .protocol_name = "nbd+unix", + .instance_size = sizeof(BDRVNBDState), + .bdrv_parse_filename = nbd_parse_filename, + .bdrv_file_open = nbd_open, + .bdrv_co_readv = nbd_co_readv, + .bdrv_co_writev = nbd_co_writev, + .bdrv_close = nbd_close, + .bdrv_co_flush_to_os = nbd_co_flush, + .bdrv_co_discard = nbd_co_discard, + .bdrv_refresh_limits = nbd_refresh_limits, + .bdrv_getlength = nbd_getlength, + .bdrv_detach_aio_context = nbd_detach_aio_context, + .bdrv_attach_aio_context = nbd_attach_aio_context, + .bdrv_refresh_filename = nbd_refresh_filename, +}; + +static void bdrv_nbd_init(void) +{ + bdrv_register(&bdrv_nbd); + bdrv_register(&bdrv_nbd_tcp); + bdrv_register(&bdrv_nbd_unix); +} + +block_init(bdrv_nbd_init); diff --git a/src/block/nfs.c b/src/block/nfs.c new file mode 100644 index 0000000..fd79f89 --- /dev/null +++ b/src/block/nfs.c @@ -0,0 +1,549 @@ +/* + * QEMU Block driver for native access to files on NFS shares + * + * Copyright (c) 2014 Peter Lieven <pl@kamp.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "config-host.h" + +#include <poll.h> +#include "qemu-common.h" +#include "qemu/config-file.h" +#include "qemu/error-report.h" +#include "block/block_int.h" +#include "trace.h" +#include "qemu/iov.h" +#include "qemu/uri.h" +#include "sysemu/sysemu.h" +#include <nfsc/libnfs.h> + +#define QEMU_NFS_MAX_READAHEAD_SIZE 1048576 + +typedef struct NFSClient { + struct nfs_context *context; + struct nfsfh *fh; + int events; + bool has_zero_init; + AioContext *aio_context; + blkcnt_t st_blocks; +} NFSClient; + +typedef struct NFSRPC { + int ret; + int complete; + QEMUIOVector *iov; + struct stat *st; + Coroutine *co; + QEMUBH *bh; + NFSClient *client; +} NFSRPC; + +static void nfs_process_read(void *arg); +static void nfs_process_write(void *arg); + +static void nfs_set_events(NFSClient *client) +{ + int ev = nfs_which_events(client->context); + if (ev != client->events) { + aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), + false, + (ev & POLLIN) ? nfs_process_read : NULL, + (ev & POLLOUT) ? nfs_process_write : NULL, client); + + } + client->events = ev; +} + +static void nfs_process_read(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLIN); + nfs_set_events(client); +} + +static void nfs_process_write(void *arg) +{ + NFSClient *client = arg; + nfs_service(client->context, POLLOUT); + nfs_set_events(client); +} + +static void nfs_co_init_task(NFSClient *client, NFSRPC *task) +{ + *task = (NFSRPC) { + .co = qemu_coroutine_self(), + .client = client, + }; +} + +static void nfs_co_generic_bh_cb(void *opaque) +{ + NFSRPC *task = opaque; + task->complete = 1; + qemu_bh_delete(task->bh); + qemu_coroutine_enter(task->co, NULL); +} + +static void +nfs_co_generic_cb(int ret, struct nfs_context *nfs, void *data, + void *private_data) +{ + NFSRPC *task = private_data; + task->ret = ret; + if (task->ret > 0 && task->iov) { + if (task->ret <= task->iov->size) { + qemu_iovec_from_buf(task->iov, 0, data, task->ret); + } else { + task->ret = -EIO; + } + } + if (task->ret == 0 && task->st) { + memcpy(task->st, data, sizeof(struct stat)); + } + if (task->ret < 0) { + error_report("NFS Error: %s", nfs_get_error(nfs)); + } + if (task->co) { + task->bh = aio_bh_new(task->client->aio_context, + nfs_co_generic_bh_cb, task); + qemu_bh_schedule(task->bh); + } else { + task->complete = 1; + } +} + +static int coroutine_fn nfs_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + task.iov = iov; + + if (nfs_pread_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + nfs_co_generic_cb, &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + if (task.ret < 0) { + return task.ret; + } + + /* zero pad short reads */ + if (task.ret < iov->size) { + qemu_iovec_memset(iov, task.ret, 0, iov->size - task.ret); + } + + return 0; +} + +static int coroutine_fn nfs_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *iov) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + char *buf = NULL; + + nfs_co_init_task(client, &task); + + buf = g_try_malloc(nb_sectors * BDRV_SECTOR_SIZE); + if (nb_sectors && buf == NULL) { + return -ENOMEM; + } + + qemu_iovec_to_buf(iov, 0, buf, nb_sectors * BDRV_SECTOR_SIZE); + + if (nfs_pwrite_async(client->context, client->fh, + sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, + buf, nfs_co_generic_cb, &task) != 0) { + g_free(buf); + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + g_free(buf); + + if (task.ret != nb_sectors * BDRV_SECTOR_SIZE) { + return task.ret < 0 ? task.ret : -EIO; + } + + return 0; +} + +static int coroutine_fn nfs_co_flush(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task; + + nfs_co_init_task(client, &task); + + if (nfs_fsync_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + qemu_coroutine_yield(); + } + + return task.ret; +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "nfs", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the NFS file", + }, + { /* end of list */ } + }, +}; + +static void nfs_detach_aio_context(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + + aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), + false, NULL, NULL, NULL); + client->events = 0; +} + +static void nfs_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + NFSClient *client = bs->opaque; + + client->aio_context = new_context; + nfs_set_events(client); +} + +static void nfs_client_close(NFSClient *client) +{ + if (client->context) { + if (client->fh) { + nfs_close(client->context, client->fh); + } + aio_set_fd_handler(client->aio_context, nfs_get_fd(client->context), + false, NULL, NULL, NULL); + nfs_destroy_context(client->context); + } + memset(client, 0, sizeof(NFSClient)); +} + +static void nfs_file_close(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + nfs_client_close(client); +} + +static int64_t nfs_client_open(NFSClient *client, const char *filename, + int flags, Error **errp) +{ + int ret = -EINVAL, i; + struct stat st; + URI *uri; + QueryParams *qp = NULL; + char *file = NULL, *strp = NULL; + + uri = uri_parse(filename); + if (!uri) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + if (!uri->server) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + strp = strrchr(uri->path, '/'); + if (strp == NULL) { + error_setg(errp, "Invalid URL specified"); + goto fail; + } + file = g_strdup(strp); + *strp = 0; + + client->context = nfs_init_context(); + if (client->context == NULL) { + error_setg(errp, "Failed to init NFS context"); + goto fail; + } + + qp = query_params_parse(uri->query); + for (i = 0; i < qp->n; i++) { + unsigned long long val; + if (!qp->p[i].value) { + error_setg(errp, "Value for NFS parameter expected: %s", + qp->p[i].name); + goto fail; + } + if (parse_uint_full(qp->p[i].value, &val, 0)) { + error_setg(errp, "Illegal value for NFS parameter: %s", + qp->p[i].name); + goto fail; + } + if (!strcmp(qp->p[i].name, "uid")) { + nfs_set_uid(client->context, val); + } else if (!strcmp(qp->p[i].name, "gid")) { + nfs_set_gid(client->context, val); + } else if (!strcmp(qp->p[i].name, "tcp-syncnt")) { + nfs_set_tcp_syncnt(client->context, val); +#ifdef LIBNFS_FEATURE_READAHEAD + } else if (!strcmp(qp->p[i].name, "readahead")) { + if (val > QEMU_NFS_MAX_READAHEAD_SIZE) { + error_report("NFS Warning: Truncating NFS readahead" + " size to %d", QEMU_NFS_MAX_READAHEAD_SIZE); + val = QEMU_NFS_MAX_READAHEAD_SIZE; + } + nfs_set_readahead(client->context, val); +#endif + } else { + error_setg(errp, "Unknown NFS parameter name: %s", + qp->p[i].name); + goto fail; + } + } + + ret = nfs_mount(client->context, uri->server, uri->path); + if (ret < 0) { + error_setg(errp, "Failed to mount nfs share: %s", + nfs_get_error(client->context)); + goto fail; + } + + if (flags & O_CREAT) { + ret = nfs_creat(client->context, file, 0600, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to create file: %s", + nfs_get_error(client->context)); + goto fail; + } + } else { + ret = nfs_open(client->context, file, flags, &client->fh); + if (ret < 0) { + error_setg(errp, "Failed to open file : %s", + nfs_get_error(client->context)); + goto fail; + } + } + + ret = nfs_fstat(client->context, client->fh, &st); + if (ret < 0) { + error_setg(errp, "Failed to fstat file: %s", + nfs_get_error(client->context)); + goto fail; + } + + ret = DIV_ROUND_UP(st.st_size, BDRV_SECTOR_SIZE); + client->st_blocks = st.st_blocks; + client->has_zero_init = S_ISREG(st.st_mode); + goto out; +fail: + nfs_client_close(client); +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + g_free(file); + return ret; +} + +static int nfs_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) { + NFSClient *client = bs->opaque; + int64_t ret; + QemuOpts *opts; + Error *local_err = NULL; + + client->aio_context = bdrv_get_aio_context(bs); + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + ret = nfs_client_open(client, qemu_opt_get(opts, "filename"), + (flags & BDRV_O_RDWR) ? O_RDWR : O_RDONLY, + errp); + if (ret < 0) { + goto out; + } + bs->total_sectors = ret; + ret = 0; +out: + qemu_opts_del(opts); + return ret; +} + +static QemuOptsList nfs_create_opts = { + .name = "nfs-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(nfs_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +static int nfs_file_create(const char *url, QemuOpts *opts, Error **errp) +{ + int ret = 0; + int64_t total_size = 0; + NFSClient *client = g_new0(NFSClient, 1); + + client->aio_context = qemu_get_aio_context(); + + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + + ret = nfs_client_open(client, url, O_CREAT, errp); + if (ret < 0) { + goto out; + } + ret = nfs_ftruncate(client->context, client->fh, total_size); + nfs_client_close(client); +out: + g_free(client); + return ret; +} + +static int nfs_has_zero_init(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + return client->has_zero_init; +} + +static int64_t nfs_get_allocated_file_size(BlockDriverState *bs) +{ + NFSClient *client = bs->opaque; + NFSRPC task = {0}; + struct stat st; + + if (bdrv_is_read_only(bs) && + !(bs->open_flags & BDRV_O_NOCACHE)) { + return client->st_blocks * 512; + } + + task.st = &st; + if (nfs_fstat_async(client->context, client->fh, nfs_co_generic_cb, + &task) != 0) { + return -ENOMEM; + } + + while (!task.complete) { + nfs_set_events(client); + aio_poll(client->aio_context, true); + } + + return (task.ret < 0 ? task.ret : st.st_blocks * 512); +} + +static int nfs_file_truncate(BlockDriverState *bs, int64_t offset) +{ + NFSClient *client = bs->opaque; + return nfs_ftruncate(client->context, client->fh, offset); +} + +/* Note that this will not re-establish a connection with the NFS server + * - it is effectively a NOP. */ +static int nfs_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + NFSClient *client = state->bs->opaque; + struct stat st; + int ret = 0; + + if (state->flags & BDRV_O_RDWR && bdrv_is_read_only(state->bs)) { + error_setg(errp, "Cannot open a read-only mount as read-write"); + return -EACCES; + } + + /* Update cache for read-only reopens */ + if (!(state->flags & BDRV_O_RDWR)) { + ret = nfs_fstat(client->context, client->fh, &st); + if (ret < 0) { + error_setg(errp, "Failed to fstat file: %s", + nfs_get_error(client->context)); + return ret; + } + client->st_blocks = st.st_blocks; + } + + return 0; +} + +static BlockDriver bdrv_nfs = { + .format_name = "nfs", + .protocol_name = "nfs", + + .instance_size = sizeof(NFSClient), + .bdrv_needs_filename = true, + .create_opts = &nfs_create_opts, + + .bdrv_has_zero_init = nfs_has_zero_init, + .bdrv_get_allocated_file_size = nfs_get_allocated_file_size, + .bdrv_truncate = nfs_file_truncate, + + .bdrv_file_open = nfs_file_open, + .bdrv_close = nfs_file_close, + .bdrv_create = nfs_file_create, + .bdrv_reopen_prepare = nfs_reopen_prepare, + + .bdrv_co_readv = nfs_co_readv, + .bdrv_co_writev = nfs_co_writev, + .bdrv_co_flush_to_disk = nfs_co_flush, + + .bdrv_detach_aio_context = nfs_detach_aio_context, + .bdrv_attach_aio_context = nfs_attach_aio_context, +}; + +static void nfs_block_init(void) +{ + bdrv_register(&bdrv_nfs); +} + +block_init(nfs_block_init); diff --git a/src/block/null.c b/src/block/null.c new file mode 100644 index 0000000..7d08323 --- /dev/null +++ b/src/block/null.c @@ -0,0 +1,222 @@ +/* + * Null block driver + * + * Authors: + * Fam Zheng <famz@redhat.com> + * + * Copyright (C) 2014 Red Hat, Inc. + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "block/block_int.h" + +#define NULL_OPT_LATENCY "latency-ns" + +typedef struct { + int64_t length; + int64_t latency_ns; +} BDRVNullState; + +static QemuOptsList runtime_opts = { + .name = "null", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "", + }, + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "size of the null block", + }, + { + .name = NULL_OPT_LATENCY, + .type = QEMU_OPT_NUMBER, + .help = "nanoseconds (approximated) to wait " + "before completing request", + }, + { /* end of list */ } + }, +}; + +static int null_file_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + QemuOpts *opts; + BDRVNullState *s = bs->opaque; + int ret = 0; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &error_abort); + s->length = + qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 1 << 30); + s->latency_ns = + qemu_opt_get_number(opts, NULL_OPT_LATENCY, 0); + if (s->latency_ns < 0) { + error_setg(errp, "latency-ns is invalid"); + ret = -EINVAL; + } + qemu_opts_del(opts); + return ret; +} + +static void null_close(BlockDriverState *bs) +{ +} + +static int64_t null_getlength(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + return s->length; +} + +static coroutine_fn int null_co_common(BlockDriverState *bs) +{ + BDRVNullState *s = bs->opaque; + + if (s->latency_ns) { + co_aio_sleep_ns(bdrv_get_aio_context(bs), QEMU_CLOCK_REALTIME, + s->latency_ns); + } + return 0; +} + +static coroutine_fn int null_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return null_co_common(bs); +} + +static coroutine_fn int null_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + QEMUIOVector *qiov) +{ + return null_co_common(bs); +} + +static coroutine_fn int null_co_flush(BlockDriverState *bs) +{ + return null_co_common(bs); +} + +typedef struct { + BlockAIOCB common; + QEMUBH *bh; + QEMUTimer timer; +} NullAIOCB; + +static const AIOCBInfo null_aiocb_info = { + .aiocb_size = sizeof(NullAIOCB), +}; + +static void null_bh_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + qemu_bh_delete(acb->bh); + qemu_aio_unref(acb); +} + +static void null_timer_cb(void *opaque) +{ + NullAIOCB *acb = opaque; + acb->common.cb(acb->common.opaque, 0); + timer_deinit(&acb->timer); + qemu_aio_unref(acb); +} + +static inline BlockAIOCB *null_aio_common(BlockDriverState *bs, + BlockCompletionFunc *cb, + void *opaque) +{ + NullAIOCB *acb; + BDRVNullState *s = bs->opaque; + + acb = qemu_aio_get(&null_aiocb_info, bs, cb, opaque); + /* Only emulate latency after vcpu is running. */ + if (s->latency_ns) { + aio_timer_init(bdrv_get_aio_context(bs), &acb->timer, + QEMU_CLOCK_REALTIME, SCALE_NS, + null_timer_cb, acb); + timer_mod_ns(&acb->timer, + qemu_clock_get_ns(QEMU_CLOCK_REALTIME) + s->latency_ns); + } else { + acb->bh = aio_bh_new(bdrv_get_aio_context(bs), null_bh_cb, acb); + qemu_bh_schedule(acb->bh); + } + return &acb->common; +} + +static BlockAIOCB *null_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return null_aio_common(bs, cb, opaque); +} + +static BlockAIOCB *null_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return null_aio_common(bs, cb, opaque); +} + +static BlockAIOCB *null_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, + void *opaque) +{ + return null_aio_common(bs, cb, opaque); +} + +static int null_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static BlockDriver bdrv_null_co = { + .format_name = "null-co", + .protocol_name = "null-co", + .instance_size = sizeof(BDRVNullState), + + .bdrv_file_open = null_file_open, + .bdrv_close = null_close, + .bdrv_getlength = null_getlength, + + .bdrv_co_readv = null_co_readv, + .bdrv_co_writev = null_co_writev, + .bdrv_co_flush_to_disk = null_co_flush, + .bdrv_reopen_prepare = null_reopen_prepare, +}; + +static BlockDriver bdrv_null_aio = { + .format_name = "null-aio", + .protocol_name = "null-aio", + .instance_size = sizeof(BDRVNullState), + + .bdrv_file_open = null_file_open, + .bdrv_close = null_close, + .bdrv_getlength = null_getlength, + + .bdrv_aio_readv = null_aio_readv, + .bdrv_aio_writev = null_aio_writev, + .bdrv_aio_flush = null_aio_flush, + .bdrv_reopen_prepare = null_reopen_prepare, +}; + +static void bdrv_null_init(void) +{ + bdrv_register(&bdrv_null_co); + bdrv_register(&bdrv_null_aio); +} + +block_init(bdrv_null_init); diff --git a/src/block/parallels.c b/src/block/parallels.c new file mode 100644 index 0000000..f689fde --- /dev/null +++ b/src/block/parallels.c @@ -0,0 +1,760 @@ +/* + * Block driver for Parallels disk image format + * + * Copyright (c) 2007 Alex Beregszaszi + * Copyright (c) 2015 Denis V. Lunev <den@openvz.org> + * + * This code was originally based on comparing different disk images created + * by Parallels. Currently it is based on opened OpenVZ sources + * available at + * http://git.openvz.org/?p=ploop;a=summary + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/bitmap.h" +#include "qapi/util.h" + +/**************************************************************/ + +#define HEADER_MAGIC "WithoutFreeSpace" +#define HEADER_MAGIC2 "WithouFreSpacExt" +#define HEADER_VERSION 2 +#define HEADER_INUSE_MAGIC (0x746F6E59) + +#define DEFAULT_CLUSTER_SIZE 1048576 /* 1 MiB */ + + +// always little-endian +typedef struct ParallelsHeader { + char magic[16]; // "WithoutFreeSpace" + uint32_t version; + uint32_t heads; + uint32_t cylinders; + uint32_t tracks; + uint32_t bat_entries; + uint64_t nb_sectors; + uint32_t inuse; + uint32_t data_off; + char padding[12]; +} QEMU_PACKED ParallelsHeader; + + +typedef enum ParallelsPreallocMode { + PRL_PREALLOC_MODE_FALLOCATE = 0, + PRL_PREALLOC_MODE_TRUNCATE = 1, + PRL_PREALLOC_MODE_MAX = 2, +} ParallelsPreallocMode; + +static const char *prealloc_mode_lookup[] = { + "falloc", + "truncate", + NULL, +}; + + +typedef struct BDRVParallelsState { + /** Locking is conservative, the lock protects + * - image file extending (truncate, fallocate) + * - any access to block allocation table + */ + CoMutex lock; + + ParallelsHeader *header; + uint32_t header_size; + bool header_unclean; + + unsigned long *bat_dirty_bmap; + unsigned int bat_dirty_block; + + uint32_t *bat_bitmap; + unsigned int bat_size; + + int64_t data_end; + uint64_t prealloc_size; + ParallelsPreallocMode prealloc_mode; + + unsigned int tracks; + + unsigned int off_multiplier; +} BDRVParallelsState; + + +#define PARALLELS_OPT_PREALLOC_MODE "prealloc-mode" +#define PARALLELS_OPT_PREALLOC_SIZE "prealloc-size" + +static QemuOptsList parallels_runtime_opts = { + .name = "parallels", + .head = QTAILQ_HEAD_INITIALIZER(parallels_runtime_opts.head), + .desc = { + { + .name = PARALLELS_OPT_PREALLOC_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Preallocation size on image expansion", + .def_value_str = "128MiB", + }, + { + .name = PARALLELS_OPT_PREALLOC_MODE, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode on image expansion " + "(allowed values: falloc, truncate)", + .def_value_str = "falloc", + }, + { /* end of list */ }, + }, +}; + + +static int64_t bat2sect(BDRVParallelsState *s, uint32_t idx) +{ + return (uint64_t)le32_to_cpu(s->bat_bitmap[idx]) * s->off_multiplier; +} + +static uint32_t bat_entry_off(uint32_t idx) +{ + return sizeof(ParallelsHeader) + sizeof(uint32_t) * idx; +} + +static int64_t seek_to_sector(BDRVParallelsState *s, int64_t sector_num) +{ + uint32_t index, offset; + + index = sector_num / s->tracks; + offset = sector_num % s->tracks; + + /* not allocated */ + if ((index >= s->bat_size) || (s->bat_bitmap[index] == 0)) { + return -1; + } + return bat2sect(s, index) + offset; +} + +static int cluster_remainder(BDRVParallelsState *s, int64_t sector_num, + int nb_sectors) +{ + int ret = s->tracks - sector_num % s->tracks; + return MIN(nb_sectors, ret); +} + +static int64_t block_status(BDRVParallelsState *s, int64_t sector_num, + int nb_sectors, int *pnum) +{ + int64_t start_off = -2, prev_end_off = -2; + + *pnum = 0; + while (nb_sectors > 0 || start_off == -2) { + int64_t offset = seek_to_sector(s, sector_num); + int to_end; + + if (start_off == -2) { + start_off = offset; + prev_end_off = offset; + } else if (offset != prev_end_off) { + break; + } + + to_end = cluster_remainder(s, sector_num, nb_sectors); + nb_sectors -= to_end; + sector_num += to_end; + *pnum += to_end; + + if (offset > 0) { + prev_end_off += to_end; + } + } + return start_off; +} + +static int64_t allocate_clusters(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVParallelsState *s = bs->opaque; + uint32_t idx, to_allocate, i; + int64_t pos, space; + + pos = block_status(s, sector_num, nb_sectors, pnum); + if (pos > 0) { + return pos; + } + + idx = sector_num / s->tracks; + if (idx >= s->bat_size) { + return -EINVAL; + } + + to_allocate = (sector_num + *pnum + s->tracks - 1) / s->tracks - idx; + space = to_allocate * s->tracks; + if (s->data_end + space > bdrv_getlength(bs->file->bs) >> BDRV_SECTOR_BITS) { + int ret; + space += s->prealloc_size; + if (s->prealloc_mode == PRL_PREALLOC_MODE_FALLOCATE) { + ret = bdrv_write_zeroes(bs->file->bs, s->data_end, space, 0); + } else { + ret = bdrv_truncate(bs->file->bs, + (s->data_end + space) << BDRV_SECTOR_BITS); + } + if (ret < 0) { + return ret; + } + } + + for (i = 0; i < to_allocate; i++) { + s->bat_bitmap[idx + i] = cpu_to_le32(s->data_end / s->off_multiplier); + s->data_end += s->tracks; + bitmap_set(s->bat_dirty_bmap, + bat_entry_off(idx + i) / s->bat_dirty_block, 1); + } + + return bat2sect(s, idx) + sector_num % s->tracks; +} + + +static coroutine_fn int parallels_co_flush_to_os(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + unsigned long size = DIV_ROUND_UP(s->header_size, s->bat_dirty_block); + unsigned long bit; + + qemu_co_mutex_lock(&s->lock); + + bit = find_first_bit(s->bat_dirty_bmap, size); + while (bit < size) { + uint32_t off = bit * s->bat_dirty_block; + uint32_t to_write = s->bat_dirty_block; + int ret; + + if (off + to_write > s->header_size) { + to_write = s->header_size - off; + } + ret = bdrv_pwrite(bs->file->bs, off, (uint8_t *)s->header + off, + to_write); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + bit = find_next_bit(s->bat_dirty_bmap, size, bit + 1); + } + bitmap_zero(s->bat_dirty_bmap, size); + + qemu_co_mutex_unlock(&s->lock); + return 0; +} + + +static int64_t coroutine_fn parallels_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVParallelsState *s = bs->opaque; + int64_t offset; + + qemu_co_mutex_lock(&s->lock); + offset = block_status(s, sector_num, nb_sectors, pnum); + qemu_co_mutex_unlock(&s->lock); + + if (offset < 0) { + return 0; + } + + return (offset << BDRV_SECTOR_BITS) | + BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID; +} + +static coroutine_fn int parallels_co_writev(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + BDRVParallelsState *s = bs->opaque; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + int ret = 0; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + while (nb_sectors > 0) { + int64_t position; + int n, nbytes; + + qemu_co_mutex_lock(&s->lock); + position = allocate_clusters(bs, sector_num, nb_sectors, &n); + qemu_co_mutex_unlock(&s->lock); + if (position < 0) { + ret = (int)position; + break; + } + + nbytes = n << BDRV_SECTOR_BITS; + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + + ret = bdrv_co_writev(bs->file->bs, position, n, &hd_qiov); + if (ret < 0) { + break; + } + + nb_sectors -= n; + sector_num += n; + bytes_done += nbytes; + } + + qemu_iovec_destroy(&hd_qiov); + return ret; +} + +static coroutine_fn int parallels_co_readv(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, QEMUIOVector *qiov) +{ + BDRVParallelsState *s = bs->opaque; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + int ret = 0; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + while (nb_sectors > 0) { + int64_t position; + int n, nbytes; + + qemu_co_mutex_lock(&s->lock); + position = block_status(s, sector_num, nb_sectors, &n); + qemu_co_mutex_unlock(&s->lock); + + nbytes = n << BDRV_SECTOR_BITS; + + if (position < 0) { + qemu_iovec_memset(qiov, bytes_done, 0, nbytes); + } else { + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, nbytes); + + ret = bdrv_co_readv(bs->file->bs, position, n, &hd_qiov); + if (ret < 0) { + break; + } + } + + nb_sectors -= n; + sector_num += n; + bytes_done += nbytes; + } + + qemu_iovec_destroy(&hd_qiov); + return ret; +} + + +static int parallels_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVParallelsState *s = bs->opaque; + int64_t size, prev_off, high_off; + int ret; + uint32_t i; + bool flush_bat = false; + int cluster_size = s->tracks << BDRV_SECTOR_BITS; + + size = bdrv_getlength(bs->file->bs); + if (size < 0) { + res->check_errors++; + return size; + } + + if (s->header_unclean) { + fprintf(stderr, "%s image was not closed correctly\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR"); + res->corruptions++; + if (fix & BDRV_FIX_ERRORS) { + /* parallels_close will do the job right */ + res->corruptions_fixed++; + s->header_unclean = false; + } + } + + res->bfi.total_clusters = s->bat_size; + res->bfi.compressed_clusters = 0; /* compression is not supported */ + + high_off = 0; + prev_off = 0; + for (i = 0; i < s->bat_size; i++) { + int64_t off = bat2sect(s, i) << BDRV_SECTOR_BITS; + if (off == 0) { + prev_off = 0; + continue; + } + + /* cluster outside the image */ + if (off > size) { + fprintf(stderr, "%s cluster %u is outside image\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); + res->corruptions++; + if (fix & BDRV_FIX_ERRORS) { + prev_off = 0; + s->bat_bitmap[i] = 0; + res->corruptions_fixed++; + flush_bat = true; + continue; + } + } + + res->bfi.allocated_clusters++; + if (off > high_off) { + high_off = off; + } + + if (prev_off != 0 && (prev_off + cluster_size) != off) { + res->bfi.fragmented_clusters++; + } + prev_off = off; + } + + if (flush_bat) { + ret = bdrv_pwrite_sync(bs->file->bs, 0, s->header, s->header_size); + if (ret < 0) { + res->check_errors++; + return ret; + } + } + + res->image_end_offset = high_off + cluster_size; + if (size > res->image_end_offset) { + int64_t count; + count = DIV_ROUND_UP(size - res->image_end_offset, cluster_size); + fprintf(stderr, "%s space leaked at the end of the image %" PRId64 "\n", + fix & BDRV_FIX_LEAKS ? "Repairing" : "ERROR", + size - res->image_end_offset); + res->leaks += count; + if (fix & BDRV_FIX_LEAKS) { + ret = bdrv_truncate(bs->file->bs, res->image_end_offset); + if (ret < 0) { + res->check_errors++; + return ret; + } + res->leaks_fixed += count; + } + } + + return 0; +} + + +static int parallels_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int64_t total_size, cl_size; + uint8_t tmp[BDRV_SECTOR_SIZE]; + Error *local_err = NULL; + BlockDriverState *file; + uint32_t bat_entries, bat_sectors; + ParallelsHeader header; + int ret; + + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + cl_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE), BDRV_SECTOR_SIZE); + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + file = NULL; + ret = bdrv_open(&file, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + ret = bdrv_truncate(file, 0); + if (ret < 0) { + goto exit; + } + + bat_entries = DIV_ROUND_UP(total_size, cl_size); + bat_sectors = DIV_ROUND_UP(bat_entry_off(bat_entries), cl_size); + bat_sectors = (bat_sectors * cl_size) >> BDRV_SECTOR_BITS; + + memset(&header, 0, sizeof(header)); + memcpy(header.magic, HEADER_MAGIC2, sizeof(header.magic)); + header.version = cpu_to_le32(HEADER_VERSION); + /* don't care much about geometry, it is not used on image level */ + header.heads = cpu_to_le32(16); + header.cylinders = cpu_to_le32(total_size / BDRV_SECTOR_SIZE / 16 / 32); + header.tracks = cpu_to_le32(cl_size >> BDRV_SECTOR_BITS); + header.bat_entries = cpu_to_le32(bat_entries); + header.nb_sectors = cpu_to_le64(DIV_ROUND_UP(total_size, BDRV_SECTOR_SIZE)); + header.data_off = cpu_to_le32(bat_sectors); + + /* write all the data */ + memset(tmp, 0, sizeof(tmp)); + memcpy(tmp, &header, sizeof(header)); + + ret = bdrv_pwrite(file, 0, tmp, BDRV_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + ret = bdrv_write_zeroes(file, 1, bat_sectors - 1, 0); + if (ret < 0) { + goto exit; + } + ret = 0; + +done: + bdrv_unref(file); + return ret; + +exit: + error_setg_errno(errp, -ret, "Failed to create Parallels image"); + goto done; +} + + +static int parallels_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const ParallelsHeader *ph = (const void *)buf; + + if (buf_size < sizeof(ParallelsHeader)) { + return 0; + } + + if ((!memcmp(ph->magic, HEADER_MAGIC, 16) || + !memcmp(ph->magic, HEADER_MAGIC2, 16)) && + (le32_to_cpu(ph->version) == HEADER_VERSION)) { + return 100; + } + + return 0; +} + +static int parallels_update_header(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + unsigned size = MAX(bdrv_opt_mem_align(bs->file->bs), + sizeof(ParallelsHeader)); + + if (size > s->header_size) { + size = s->header_size; + } + return bdrv_pwrite_sync(bs->file->bs, 0, s->header, size); +} + +static int parallels_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVParallelsState *s = bs->opaque; + ParallelsHeader ph; + int ret, size, i; + QemuOpts *opts = NULL; + Error *local_err = NULL; + char *buf; + + ret = bdrv_pread(bs->file->bs, 0, &ph, sizeof(ph)); + if (ret < 0) { + goto fail; + } + + bs->total_sectors = le64_to_cpu(ph.nb_sectors); + + if (le32_to_cpu(ph.version) != HEADER_VERSION) { + goto fail_format; + } + if (!memcmp(ph.magic, HEADER_MAGIC, 16)) { + s->off_multiplier = 1; + bs->total_sectors = 0xffffffff & bs->total_sectors; + } else if (!memcmp(ph.magic, HEADER_MAGIC2, 16)) { + s->off_multiplier = le32_to_cpu(ph.tracks); + } else { + goto fail_format; + } + + s->tracks = le32_to_cpu(ph.tracks); + if (s->tracks == 0) { + error_setg(errp, "Invalid image: Zero sectors per track"); + ret = -EINVAL; + goto fail; + } + if (s->tracks > INT32_MAX/513) { + error_setg(errp, "Invalid image: Too big cluster"); + ret = -EFBIG; + goto fail; + } + + s->bat_size = le32_to_cpu(ph.bat_entries); + if (s->bat_size > INT_MAX / sizeof(uint32_t)) { + error_setg(errp, "Catalog too large"); + ret = -EFBIG; + goto fail; + } + + size = bat_entry_off(s->bat_size); + s->header_size = ROUND_UP(size, bdrv_opt_mem_align(bs->file->bs)); + s->header = qemu_try_blockalign(bs->file->bs, s->header_size); + if (s->header == NULL) { + ret = -ENOMEM; + goto fail; + } + s->data_end = le32_to_cpu(ph.data_off); + if (s->data_end == 0) { + s->data_end = ROUND_UP(bat_entry_off(s->bat_size), BDRV_SECTOR_SIZE); + } + if (s->data_end < s->header_size) { + /* there is not enough unused space to fit to block align between BAT + and actual data. We can't avoid read-modify-write... */ + s->header_size = size; + } + + ret = bdrv_pread(bs->file->bs, 0, s->header, s->header_size); + if (ret < 0) { + goto fail; + } + s->bat_bitmap = (uint32_t *)(s->header + 1); + + for (i = 0; i < s->bat_size; i++) { + int64_t off = bat2sect(s, i); + if (off >= s->data_end) { + s->data_end = off + s->tracks; + } + } + + if (le32_to_cpu(ph.inuse) == HEADER_INUSE_MAGIC) { + /* Image was not closed correctly. The check is mandatory */ + s->header_unclean = true; + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "parallels: Image was not closed correctly; " + "cannot be opened read/write"); + ret = -EACCES; + goto fail; + } + } + + opts = qemu_opts_create(¶llels_runtime_opts, NULL, 0, &local_err); + if (local_err != NULL) { + goto fail_options; + } + + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err != NULL) { + goto fail_options; + } + + s->prealloc_size = + qemu_opt_get_size_del(opts, PARALLELS_OPT_PREALLOC_SIZE, 0); + s->prealloc_size = MAX(s->tracks, s->prealloc_size >> BDRV_SECTOR_BITS); + buf = qemu_opt_get_del(opts, PARALLELS_OPT_PREALLOC_MODE); + s->prealloc_mode = qapi_enum_parse(prealloc_mode_lookup, buf, + PRL_PREALLOC_MODE_MAX, PRL_PREALLOC_MODE_FALLOCATE, &local_err); + g_free(buf); + if (local_err != NULL) { + goto fail_options; + } + if (!bdrv_has_zero_init(bs->file->bs) || + bdrv_truncate(bs->file->bs, bdrv_getlength(bs->file->bs)) != 0) { + s->prealloc_mode = PRL_PREALLOC_MODE_FALLOCATE; + } + + if (flags & BDRV_O_RDWR) { + s->header->inuse = cpu_to_le32(HEADER_INUSE_MAGIC); + ret = parallels_update_header(bs); + if (ret < 0) { + goto fail; + } + } + + s->bat_dirty_block = 4 * getpagesize(); + s->bat_dirty_bmap = + bitmap_new(DIV_ROUND_UP(s->header_size, s->bat_dirty_block)); + + qemu_co_mutex_init(&s->lock); + return 0; + +fail_format: + error_setg(errp, "Image not in Parallels format"); + ret = -EINVAL; +fail: + qemu_vfree(s->header); + return ret; + +fail_options: + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; +} + + +static void parallels_close(BlockDriverState *bs) +{ + BDRVParallelsState *s = bs->opaque; + + if (bs->open_flags & BDRV_O_RDWR) { + s->header->inuse = 0; + parallels_update_header(bs); + } + + if (bs->open_flags & BDRV_O_RDWR) { + bdrv_truncate(bs->file->bs, s->data_end << BDRV_SECTOR_BITS); + } + + g_free(s->bat_dirty_bmap); + qemu_vfree(s->header); +} + +static QemuOptsList parallels_create_opts = { + .name = "parallels-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(parallels_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size", + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Parallels image cluster size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE), + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_parallels = { + .format_name = "parallels", + .instance_size = sizeof(BDRVParallelsState), + .bdrv_probe = parallels_probe, + .bdrv_open = parallels_open, + .bdrv_close = parallels_close, + .bdrv_co_get_block_status = parallels_co_get_block_status, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_flush_to_os = parallels_co_flush_to_os, + .bdrv_co_readv = parallels_co_readv, + .bdrv_co_writev = parallels_co_writev, + + .bdrv_create = parallels_create, + .bdrv_check = parallels_check, + .create_opts = ¶llels_create_opts, +}; + +static void bdrv_parallels_init(void) +{ + bdrv_register(&bdrv_parallels); +} + +block_init(bdrv_parallels_init); diff --git a/src/block/qapi.c b/src/block/qapi.c new file mode 100644 index 0000000..267f147 --- /dev/null +++ b/src/block/qapi.c @@ -0,0 +1,719 @@ +/* + * Block layer qmp and info dump related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/qapi.h" +#include "block/block_int.h" +#include "block/throttle-groups.h" +#include "block/write-threshold.h" +#include "qmp-commands.h" +#include "qapi-visit.h" +#include "qapi/qmp-output-visitor.h" +#include "qapi/qmp/types.h" +#include "sysemu/block-backend.h" + +BlockDeviceInfo *bdrv_block_device_info(BlockDriverState *bs, Error **errp) +{ + ImageInfo **p_image_info; + BlockDriverState *bs0; + BlockDeviceInfo *info = g_malloc0(sizeof(*info)); + + info->file = g_strdup(bs->filename); + info->ro = bs->read_only; + info->drv = g_strdup(bs->drv->format_name); + info->encrypted = bs->encrypted; + info->encryption_key_missing = bdrv_key_required(bs); + + info->cache = g_new(BlockdevCacheInfo, 1); + *info->cache = (BlockdevCacheInfo) { + .writeback = bdrv_enable_write_cache(bs), + .direct = !!(bs->open_flags & BDRV_O_NOCACHE), + .no_flush = !!(bs->open_flags & BDRV_O_NO_FLUSH), + }; + + if (bs->node_name[0]) { + info->has_node_name = true; + info->node_name = g_strdup(bs->node_name); + } + + if (bs->backing_file[0]) { + info->has_backing_file = true; + info->backing_file = g_strdup(bs->backing_file); + } + + info->backing_file_depth = bdrv_get_backing_file_depth(bs); + info->detect_zeroes = bs->detect_zeroes; + + if (bs->throttle_state) { + ThrottleConfig cfg; + + throttle_group_get_config(bs, &cfg); + + info->bps = cfg.buckets[THROTTLE_BPS_TOTAL].avg; + info->bps_rd = cfg.buckets[THROTTLE_BPS_READ].avg; + info->bps_wr = cfg.buckets[THROTTLE_BPS_WRITE].avg; + + info->iops = cfg.buckets[THROTTLE_OPS_TOTAL].avg; + info->iops_rd = cfg.buckets[THROTTLE_OPS_READ].avg; + info->iops_wr = cfg.buckets[THROTTLE_OPS_WRITE].avg; + + info->has_bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->bps_max = cfg.buckets[THROTTLE_BPS_TOTAL].max; + info->has_bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->bps_rd_max = cfg.buckets[THROTTLE_BPS_READ].max; + info->has_bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + info->bps_wr_max = cfg.buckets[THROTTLE_BPS_WRITE].max; + + info->has_iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->iops_max = cfg.buckets[THROTTLE_OPS_TOTAL].max; + info->has_iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->iops_rd_max = cfg.buckets[THROTTLE_OPS_READ].max; + info->has_iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + info->iops_wr_max = cfg.buckets[THROTTLE_OPS_WRITE].max; + + info->has_iops_size = cfg.op_size; + info->iops_size = cfg.op_size; + + info->has_group = true; + info->group = g_strdup(throttle_group_get_name(bs)); + } + + info->write_threshold = bdrv_write_threshold_get(bs); + + bs0 = bs; + p_image_info = &info->image; + while (1) { + Error *local_err = NULL; + bdrv_query_image_info(bs0, p_image_info, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qapi_free_BlockDeviceInfo(info); + return NULL; + } + if (bs0->drv && bs0->backing) { + bs0 = bs0->backing->bs; + (*p_image_info)->has_backing_image = true; + p_image_info = &((*p_image_info)->backing_image); + } else { + break; + } + } + + return info; +} + +/* + * Returns 0 on success, with *p_list either set to describe snapshot + * information, or NULL because there are no snapshots. Returns -errno on + * error, with *p_list untouched. + */ +int bdrv_query_snapshot_info_list(BlockDriverState *bs, + SnapshotInfoList **p_list, + Error **errp) +{ + int i, sn_count; + QEMUSnapshotInfo *sn_tab = NULL; + SnapshotInfoList *info_list, *cur_item = NULL, *head = NULL; + SnapshotInfo *info; + + sn_count = bdrv_snapshot_list(bs, &sn_tab); + if (sn_count < 0) { + const char *dev = bdrv_get_device_name(bs); + switch (sn_count) { + case -ENOMEDIUM: + error_setg(errp, "Device '%s' is not inserted", dev); + break; + case -ENOTSUP: + error_setg(errp, + "Device '%s' does not support internal snapshots", + dev); + break; + default: + error_setg_errno(errp, -sn_count, + "Can't list snapshots of device '%s'", dev); + break; + } + return sn_count; + } + + for (i = 0; i < sn_count; i++) { + info = g_new0(SnapshotInfo, 1); + info->id = g_strdup(sn_tab[i].id_str); + info->name = g_strdup(sn_tab[i].name); + info->vm_state_size = sn_tab[i].vm_state_size; + info->date_sec = sn_tab[i].date_sec; + info->date_nsec = sn_tab[i].date_nsec; + info->vm_clock_sec = sn_tab[i].vm_clock_nsec / 1000000000; + info->vm_clock_nsec = sn_tab[i].vm_clock_nsec % 1000000000; + + info_list = g_new0(SnapshotInfoList, 1); + info_list->value = info; + + /* XXX: waiting for the qapi to support qemu-queue.h types */ + if (!cur_item) { + head = cur_item = info_list; + } else { + cur_item->next = info_list; + cur_item = info_list; + } + + } + + g_free(sn_tab); + *p_list = head; + return 0; +} + +/** + * bdrv_query_image_info: + * @bs: block device to examine + * @p_info: location to store image information + * @errp: location to store error information + * + * Store "flat" image information in @p_info. + * + * "Flat" means it does *not* query backing image information, + * i.e. (*pinfo)->has_backing_image will be set to false and + * (*pinfo)->backing_image to NULL even when the image does in fact have + * a backing image. + * + * @p_info will be set only on success. On error, store error in @errp. + */ +void bdrv_query_image_info(BlockDriverState *bs, + ImageInfo **p_info, + Error **errp) +{ + int64_t size; + const char *backing_filename; + BlockDriverInfo bdi; + int ret; + Error *err = NULL; + ImageInfo *info; + + size = bdrv_getlength(bs); + if (size < 0) { + error_setg_errno(errp, -size, "Can't get size of device '%s'", + bdrv_get_device_name(bs)); + return; + } + + info = g_new0(ImageInfo, 1); + info->filename = g_strdup(bs->filename); + info->format = g_strdup(bdrv_get_format_name(bs)); + info->virtual_size = size; + info->actual_size = bdrv_get_allocated_file_size(bs); + info->has_actual_size = info->actual_size >= 0; + if (bdrv_is_encrypted(bs)) { + info->encrypted = true; + info->has_encrypted = true; + } + if (bdrv_get_info(bs, &bdi) >= 0) { + if (bdi.cluster_size != 0) { + info->cluster_size = bdi.cluster_size; + info->has_cluster_size = true; + } + info->dirty_flag = bdi.is_dirty; + info->has_dirty_flag = true; + } + info->format_specific = bdrv_get_specific_info(bs); + info->has_format_specific = info->format_specific != NULL; + + backing_filename = bs->backing_file; + if (backing_filename[0] != '\0') { + char *backing_filename2 = g_malloc0(PATH_MAX); + info->backing_filename = g_strdup(backing_filename); + info->has_backing_filename = true; + bdrv_get_full_backing_filename(bs, backing_filename2, PATH_MAX, &err); + if (err) { + error_propagate(errp, err); + qapi_free_ImageInfo(info); + g_free(backing_filename2); + return; + } + + if (strcmp(backing_filename, backing_filename2) != 0) { + info->full_backing_filename = + g_strdup(backing_filename2); + info->has_full_backing_filename = true; + } + + if (bs->backing_format[0]) { + info->backing_filename_format = g_strdup(bs->backing_format); + info->has_backing_filename_format = true; + } + g_free(backing_filename2); + } + + ret = bdrv_query_snapshot_info_list(bs, &info->snapshots, &err); + switch (ret) { + case 0: + if (info->snapshots) { + info->has_snapshots = true; + } + break; + /* recoverable error */ + case -ENOMEDIUM: + case -ENOTSUP: + error_free(err); + break; + default: + error_propagate(errp, err); + qapi_free_ImageInfo(info); + return; + } + + *p_info = info; +} + +/* @p_info will be set only on success. */ +static void bdrv_query_info(BlockBackend *blk, BlockInfo **p_info, + Error **errp) +{ + BlockInfo *info = g_malloc0(sizeof(*info)); + BlockDriverState *bs = blk_bs(blk); + info->device = g_strdup(blk_name(blk)); + info->type = g_strdup("unknown"); + info->locked = blk_dev_is_medium_locked(blk); + info->removable = blk_dev_has_removable_media(blk); + + if (blk_dev_has_removable_media(blk)) { + info->has_tray_open = true; + info->tray_open = blk_dev_is_tray_open(blk); + } + + if (blk_iostatus_is_enabled(blk)) { + info->has_io_status = true; + info->io_status = blk_iostatus(blk); + } + + if (bs && !QLIST_EMPTY(&bs->dirty_bitmaps)) { + info->has_dirty_bitmaps = true; + info->dirty_bitmaps = bdrv_query_dirty_bitmaps(bs); + } + + if (bs && bs->drv) { + info->has_inserted = true; + info->inserted = bdrv_block_device_info(bs, errp); + if (info->inserted == NULL) { + goto err; + } + } + + *p_info = info; + return; + + err: + qapi_free_BlockInfo(info); +} + +static BlockStats *bdrv_query_stats(const BlockDriverState *bs, + bool query_backing) +{ + BlockStats *s; + + s = g_malloc0(sizeof(*s)); + + if (bdrv_get_device_name(bs)[0]) { + s->has_device = true; + s->device = g_strdup(bdrv_get_device_name(bs)); + } + + if (bdrv_get_node_name(bs)[0]) { + s->has_node_name = true; + s->node_name = g_strdup(bdrv_get_node_name(bs)); + } + + s->stats = g_malloc0(sizeof(*s->stats)); + if (bs->blk) { + BlockAcctStats *stats = blk_get_stats(bs->blk); + BlockAcctTimedStats *ts = NULL; + + s->stats->rd_bytes = stats->nr_bytes[BLOCK_ACCT_READ]; + s->stats->wr_bytes = stats->nr_bytes[BLOCK_ACCT_WRITE]; + s->stats->rd_operations = stats->nr_ops[BLOCK_ACCT_READ]; + s->stats->wr_operations = stats->nr_ops[BLOCK_ACCT_WRITE]; + + s->stats->failed_rd_operations = stats->failed_ops[BLOCK_ACCT_READ]; + s->stats->failed_wr_operations = stats->failed_ops[BLOCK_ACCT_WRITE]; + s->stats->failed_flush_operations = stats->failed_ops[BLOCK_ACCT_FLUSH]; + + s->stats->invalid_rd_operations = stats->invalid_ops[BLOCK_ACCT_READ]; + s->stats->invalid_wr_operations = stats->invalid_ops[BLOCK_ACCT_WRITE]; + s->stats->invalid_flush_operations = + stats->invalid_ops[BLOCK_ACCT_FLUSH]; + + s->stats->rd_merged = stats->merged[BLOCK_ACCT_READ]; + s->stats->wr_merged = stats->merged[BLOCK_ACCT_WRITE]; + s->stats->flush_operations = stats->nr_ops[BLOCK_ACCT_FLUSH]; + s->stats->wr_total_time_ns = stats->total_time_ns[BLOCK_ACCT_WRITE]; + s->stats->rd_total_time_ns = stats->total_time_ns[BLOCK_ACCT_READ]; + s->stats->flush_total_time_ns = stats->total_time_ns[BLOCK_ACCT_FLUSH]; + + s->stats->has_idle_time_ns = stats->last_access_time_ns > 0; + if (s->stats->has_idle_time_ns) { + s->stats->idle_time_ns = block_acct_idle_time_ns(stats); + } + + s->stats->account_invalid = stats->account_invalid; + s->stats->account_failed = stats->account_failed; + + while ((ts = block_acct_interval_next(stats, ts))) { + BlockDeviceTimedStatsList *timed_stats = + g_malloc0(sizeof(*timed_stats)); + BlockDeviceTimedStats *dev_stats = g_malloc0(sizeof(*dev_stats)); + timed_stats->next = s->stats->timed_stats; + timed_stats->value = dev_stats; + s->stats->timed_stats = timed_stats; + + TimedAverage *rd = &ts->latency[BLOCK_ACCT_READ]; + TimedAverage *wr = &ts->latency[BLOCK_ACCT_WRITE]; + TimedAverage *fl = &ts->latency[BLOCK_ACCT_FLUSH]; + + dev_stats->interval_length = ts->interval_length; + + dev_stats->min_rd_latency_ns = timed_average_min(rd); + dev_stats->max_rd_latency_ns = timed_average_max(rd); + dev_stats->avg_rd_latency_ns = timed_average_avg(rd); + + dev_stats->min_wr_latency_ns = timed_average_min(wr); + dev_stats->max_wr_latency_ns = timed_average_max(wr); + dev_stats->avg_wr_latency_ns = timed_average_avg(wr); + + dev_stats->min_flush_latency_ns = timed_average_min(fl); + dev_stats->max_flush_latency_ns = timed_average_max(fl); + dev_stats->avg_flush_latency_ns = timed_average_avg(fl); + + dev_stats->avg_rd_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_READ); + dev_stats->avg_wr_queue_depth = + block_acct_queue_depth(ts, BLOCK_ACCT_WRITE); + } + } + + s->stats->wr_highest_offset = bs->wr_highest_offset; + + if (bs->file) { + s->has_parent = true; + s->parent = bdrv_query_stats(bs->file->bs, query_backing); + } + + if (query_backing && bs->backing) { + s->has_backing = true; + s->backing = bdrv_query_stats(bs->backing->bs, query_backing); + } + + return s; +} + +BlockInfoList *qmp_query_block(Error **errp) +{ + BlockInfoList *head = NULL, **p_next = &head; + BlockBackend *blk; + Error *local_err = NULL; + + for (blk = blk_next(NULL); blk; blk = blk_next(blk)) { + BlockInfoList *info = g_malloc0(sizeof(*info)); + bdrv_query_info(blk, &info->value, &local_err); + if (local_err) { + error_propagate(errp, local_err); + g_free(info); + qapi_free_BlockInfoList(head); + return NULL; + } + + *p_next = info; + p_next = &info->next; + } + + return head; +} + +BlockStatsList *qmp_query_blockstats(bool has_query_nodes, + bool query_nodes, + Error **errp) +{ + BlockStatsList *head = NULL, **p_next = &head; + BlockDriverState *bs = NULL; + + /* Just to be safe if query_nodes is not always initialized */ + query_nodes = has_query_nodes && query_nodes; + + while ((bs = query_nodes ? bdrv_next_node(bs) : bdrv_next(bs))) { + BlockStatsList *info = g_malloc0(sizeof(*info)); + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + info->value = bdrv_query_stats(bs, !query_nodes); + aio_context_release(ctx); + + *p_next = info; + p_next = &info->next; + } + + return head; +} + +#define NB_SUFFIXES 4 + +static char *get_human_readable_size(char *buf, int buf_size, int64_t size) +{ + static const char suffixes[NB_SUFFIXES] = {'K', 'M', 'G', 'T'}; + int64_t base; + int i; + + if (size <= 999) { + snprintf(buf, buf_size, "%" PRId64, size); + } else { + base = 1024; + for (i = 0; i < NB_SUFFIXES; i++) { + if (size < (10 * base)) { + snprintf(buf, buf_size, "%0.1f%c", + (double)size / base, + suffixes[i]); + break; + } else if (size < (1000 * base) || i == (NB_SUFFIXES - 1)) { + snprintf(buf, buf_size, "%" PRId64 "%c", + ((size + (base >> 1)) / base), + suffixes[i]); + break; + } + base = base * 1024; + } + } + return buf; +} + +void bdrv_snapshot_dump(fprintf_function func_fprintf, void *f, + QEMUSnapshotInfo *sn) +{ + char buf1[128], date_buf[128], clock_buf[128]; + struct tm tm; + time_t ti; + int64_t secs; + + if (!sn) { + func_fprintf(f, + "%-10s%-20s%7s%20s%15s", + "ID", "TAG", "VM SIZE", "DATE", "VM CLOCK"); + } else { + ti = sn->date_sec; + localtime_r(&ti, &tm); + strftime(date_buf, sizeof(date_buf), + "%Y-%m-%d %H:%M:%S", &tm); + secs = sn->vm_clock_nsec / 1000000000; + snprintf(clock_buf, sizeof(clock_buf), + "%02d:%02d:%02d.%03d", + (int)(secs / 3600), + (int)((secs / 60) % 60), + (int)(secs % 60), + (int)((sn->vm_clock_nsec / 1000000) % 1000)); + func_fprintf(f, + "%-10s%-20s%7s%20s%15s", + sn->id_str, sn->name, + get_human_readable_size(buf1, sizeof(buf1), + sn->vm_state_size), + date_buf, + clock_buf); + } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict); +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list); + +static void dump_qobject(fprintf_function func_fprintf, void *f, + int comp_indent, QObject *obj) +{ + switch (qobject_type(obj)) { + case QTYPE_QINT: { + QInt *value = qobject_to_qint(obj); + func_fprintf(f, "%" PRId64, qint_get_int(value)); + break; + } + case QTYPE_QSTRING: { + QString *value = qobject_to_qstring(obj); + func_fprintf(f, "%s", qstring_get_str(value)); + break; + } + case QTYPE_QDICT: { + QDict *value = qobject_to_qdict(obj); + dump_qdict(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QLIST: { + QList *value = qobject_to_qlist(obj); + dump_qlist(func_fprintf, f, comp_indent, value); + break; + } + case QTYPE_QFLOAT: { + QFloat *value = qobject_to_qfloat(obj); + func_fprintf(f, "%g", qfloat_get_double(value)); + break; + } + case QTYPE_QBOOL: { + QBool *value = qobject_to_qbool(obj); + func_fprintf(f, "%s", qbool_get_bool(value) ? "true" : "false"); + break; + } + default: + abort(); + } +} + +static void dump_qlist(fprintf_function func_fprintf, void *f, int indentation, + QList *list) +{ + const QListEntry *entry; + int i = 0; + + for (entry = qlist_first(list); entry; entry = qlist_next(entry), i++) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? "%*s[%i]:\n" : "%*s[%i]: "; + + func_fprintf(f, format, indentation * 4, "", i); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +static void dump_qdict(fprintf_function func_fprintf, void *f, int indentation, + QDict *dict) +{ + const QDictEntry *entry; + + for (entry = qdict_first(dict); entry; entry = qdict_next(dict, entry)) { + qtype_code type = qobject_type(entry->value); + bool composite = (type == QTYPE_QDICT || type == QTYPE_QLIST); + const char *format = composite ? "%*s%s:\n" : "%*s%s: "; + char key[strlen(entry->key) + 1]; + int i; + + /* replace dashes with spaces in key (variable) names */ + for (i = 0; entry->key[i]; i++) { + key[i] = entry->key[i] == '-' ? ' ' : entry->key[i]; + } + key[i] = 0; + + func_fprintf(f, format, indentation * 4, "", key); + dump_qobject(func_fprintf, f, indentation + 1, entry->value); + if (!composite) { + func_fprintf(f, "\n"); + } + } +} + +void bdrv_image_info_specific_dump(fprintf_function func_fprintf, void *f, + ImageInfoSpecific *info_spec) +{ + QmpOutputVisitor *ov = qmp_output_visitor_new(); + QObject *obj, *data; + + visit_type_ImageInfoSpecific(qmp_output_get_visitor(ov), &info_spec, NULL, + &error_abort); + obj = qmp_output_get_qobject(ov); + assert(qobject_type(obj) == QTYPE_QDICT); + data = qdict_get(qobject_to_qdict(obj), "data"); + dump_qobject(func_fprintf, f, 1, data); + qmp_output_visitor_cleanup(ov); +} + +void bdrv_image_info_dump(fprintf_function func_fprintf, void *f, + ImageInfo *info) +{ + char size_buf[128], dsize_buf[128]; + if (!info->has_actual_size) { + snprintf(dsize_buf, sizeof(dsize_buf), "unavailable"); + } else { + get_human_readable_size(dsize_buf, sizeof(dsize_buf), + info->actual_size); + } + get_human_readable_size(size_buf, sizeof(size_buf), info->virtual_size); + func_fprintf(f, + "image: %s\n" + "file format: %s\n" + "virtual size: %s (%" PRId64 " bytes)\n" + "disk size: %s\n", + info->filename, info->format, size_buf, + info->virtual_size, + dsize_buf); + + if (info->has_encrypted && info->encrypted) { + func_fprintf(f, "encrypted: yes\n"); + } + + if (info->has_cluster_size) { + func_fprintf(f, "cluster_size: %" PRId64 "\n", + info->cluster_size); + } + + if (info->has_dirty_flag && info->dirty_flag) { + func_fprintf(f, "cleanly shut down: no\n"); + } + + if (info->has_backing_filename) { + func_fprintf(f, "backing file: %s", info->backing_filename); + if (info->has_full_backing_filename) { + func_fprintf(f, " (actual path: %s)", info->full_backing_filename); + } + func_fprintf(f, "\n"); + if (info->has_backing_filename_format) { + func_fprintf(f, "backing file format: %s\n", + info->backing_filename_format); + } + } + + if (info->has_snapshots) { + SnapshotInfoList *elem; + + func_fprintf(f, "Snapshot list:\n"); + bdrv_snapshot_dump(func_fprintf, f, NULL); + func_fprintf(f, "\n"); + + /* Ideally bdrv_snapshot_dump() would operate on SnapshotInfoList but + * we convert to the block layer's native QEMUSnapshotInfo for now. + */ + for (elem = info->snapshots; elem; elem = elem->next) { + QEMUSnapshotInfo sn = { + .vm_state_size = elem->value->vm_state_size, + .date_sec = elem->value->date_sec, + .date_nsec = elem->value->date_nsec, + .vm_clock_nsec = elem->value->vm_clock_sec * 1000000000ULL + + elem->value->vm_clock_nsec, + }; + + pstrcpy(sn.id_str, sizeof(sn.id_str), elem->value->id); + pstrcpy(sn.name, sizeof(sn.name), elem->value->name); + bdrv_snapshot_dump(func_fprintf, f, &sn); + func_fprintf(f, "\n"); + } + } + + if (info->has_format_specific) { + func_fprintf(f, "Format specific information:\n"); + bdrv_image_info_specific_dump(func_fprintf, f, info->format_specific); + } +} diff --git a/src/block/qcow.c b/src/block/qcow.c new file mode 100644 index 0000000..635085e --- /dev/null +++ b/src/block/qcow.c @@ -0,0 +1,1039 @@ +/* + * Block driver for the QCOW format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "qapi/qmp/qerror.h" +#include "crypto/cipher.h" +#include "migration/migration.h" + +/**************************************************************/ +/* QEMU COW block driver with compression and encryption support */ + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) +#define QCOW_VERSION 1 + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_OFLAG_COMPRESSED (1LL << 63) + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t mtime; + uint64_t size; /* in bytes */ + uint8_t cluster_bits; + uint8_t l2_bits; + uint16_t padding; + uint32_t crypt_method; + uint64_t l1_table_offset; +} QEMU_PACKED QCowHeader; + +#define L2_CACHE_SIZE 16 + +typedef struct BDRVQcowState { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + unsigned int l1_size; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + uint64_t *l2_cache; + uint64_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + QCryptoCipher *cipher; /* NULL if no key yet */ + uint32_t crypt_method_header; + CoMutex lock; + Error *migration_blocker; +} BDRVQcowState; + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); + +static int qcow_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) == QCOW_VERSION) + return 100; + else + return 0; +} + +static int qcow_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQcowState *s = bs->opaque; + unsigned int len, i, shift; + int ret; + QCowHeader header; + + ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); + if (ret < 0) { + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be32_to_cpus(&header.mtime); + be64_to_cpus(&header.size); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + + if (header.magic != QCOW_MAGIC) { + error_setg(errp, "Image not in qcow format"); + ret = -EINVAL; + goto fail; + } + if (header.version != QCOW_VERSION) { + char version[64]; + snprintf(version, sizeof(version), "QCOW version %" PRIu32, + header.version); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "qcow", version); + ret = -ENOTSUP; + goto fail; + } + + if (header.size <= 1) { + error_setg(errp, "Image size is too small (must be at least 2 bytes)"); + ret = -EINVAL; + goto fail; + } + if (header.cluster_bits < 9 || header.cluster_bits > 16) { + error_setg(errp, "Cluster size must be between 512 and 64k"); + ret = -EINVAL; + goto fail; + } + + /* l2_bits specifies number of entries; storing a uint64_t in each entry, + * so bytes = num_entries << 3. */ + if (header.l2_bits < 9 - 3 || header.l2_bits > 16 - 3) { + error_setg(errp, "L2 table size must be between 512 and 64k"); + ret = -EINVAL; + goto fail; + } + + if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "invalid encryption method in qcow header"); + ret = -EINVAL; + goto fail; + } + if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { + error_setg(errp, "AES cipher not available"); + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + s->l2_bits = header.l2_bits; + s->l2_size = 1 << s->l2_bits; + bs->total_sectors = header.size / 512; + s->cluster_offset_mask = (1LL << (63 - s->cluster_bits)) - 1; + + /* read the level 1 table */ + shift = s->cluster_bits + s->l2_bits; + if (header.size > UINT64_MAX - (1LL << shift)) { + error_setg(errp, "Image too large"); + ret = -EINVAL; + goto fail; + } else { + uint64_t l1_size = (header.size + (1LL << shift) - 1) >> shift; + if (l1_size > INT_MAX / sizeof(uint64_t)) { + error_setg(errp, "Image too large"); + ret = -EINVAL; + goto fail; + } + s->l1_size = l1_size; + } + + s->l1_table_offset = header.l1_table_offset; + s->l1_table = g_try_new(uint64_t, s->l1_size); + if (s->l1_table == NULL) { + error_setg(errp, "Could not allocate memory for L1 table"); + ret = -ENOMEM; + goto fail; + } + + ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + + /* alloc L2 cache (max. 64k * 16 * 8 = 8 MB) */ + s->l2_cache = + qemu_try_blockalign(bs->file->bs, + s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + if (s->l2_cache == NULL) { + error_setg(errp, "Could not allocate L2 table cache"); + ret = -ENOMEM; + goto fail; + } + s->cluster_cache = g_malloc(s->cluster_size); + s->cluster_data = g_malloc(s->cluster_size); + s->cluster_cache_offset = -1; + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > 1023 || len >= sizeof(bs->backing_file)) { + error_setg(errp, "Backing file name too long"); + ret = -EINVAL; + goto fail; + } + ret = bdrv_pread(bs->file->bs, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + goto fail; + } + bs->backing_file[len] = '\0'; + } + + /* Disable migration when qcow images are used */ + error_setg(&s->migration_blocker, "The qcow format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); + migrate_add_blocker(s->migration_blocker); + + qemu_co_mutex_init(&s->lock); + return 0; + + fail: + g_free(s->l1_table); + qemu_vfree(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + return ret; +} + + +/* We have nothing to do for QCOW reopen, stubs just return + * success */ +static int qcow_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int qcow_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcowState *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + Error *err; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + assert(bs->encrypted); + + qcrypto_cipher_free(s->cipher); + s->cipher = qcrypto_cipher_new( + QCRYPTO_CIPHER_ALG_AES_128, + QCRYPTO_CIPHER_MODE_CBC, + keybuf, G_N_ELEMENTS(keybuf), + &err); + + if (!s->cipher) { + /* XXX would be nice if errors in this method could + * be properly propagate to the caller. Would need + * the bdrv_set_key() API signature to be fixed. */ + error_free(err); + return -1; + } + return 0; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +static int encrypt_sectors(BDRVQcowState *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, Error **errp) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + int ret; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + if (qcrypto_cipher_setiv(s->cipher, + ivec.b, G_N_ELEMENTS(ivec.b), + errp) < 0) { + return -1; + } + if (enc) { + ret = qcrypto_cipher_encrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } else { + ret = qcrypto_cipher_decrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } + if (ret < 0) { + return -1; + } + sector_num++; + in_buf += 512; + out_buf += 512; + } + return 0; +} + +/* 'allocate' is: + * + * 0 to not allocate. + * + * 1 to allocate a normal cluster (for sector indexes 'n_start' to + * 'n_end') + * + * 2 to allocate a compressed cluster of size + * 'compressed_size'. 'compressed_size' must be > 0 and < + * cluster_size + * + * return 0 if not allocated. + */ +static uint64_t get_cluster_offset(BlockDriverState *bs, + uint64_t offset, int allocate, + int compressed_size, + int n_start, int n_end) +{ + BDRVQcowState *s = bs->opaque; + int min_index, i, j, l1_index, l2_index; + uint64_t l2_offset, *l2_table, cluster_offset, tmp; + uint32_t min_count; + int new_l2_table; + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + l2_offset = s->l1_table[l1_index]; + new_l2_table = 0; + if (!l2_offset) { + if (!allocate) + return 0; + /* allocate a new l2 entry */ + l2_offset = bdrv_getlength(bs->file->bs); + /* round to cluster size */ + l2_offset = (l2_offset + s->cluster_size - 1) & ~(s->cluster_size - 1); + /* update the L1 entry */ + s->l1_table[l1_index] = l2_offset; + tmp = cpu_to_be64(l2_offset); + if (bdrv_pwrite_sync(bs->file->bs, + s->l1_table_offset + l1_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + new_l2_table = 1; + } + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == s->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++s->l2_cache_counts[i] == 0xffffffff) { + for(j = 0; j < L2_CACHE_SIZE; j++) { + s->l2_cache_counts[j] >>= 1; + } + } + l2_table = s->l2_cache + (i << s->l2_bits); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for(i = 0; i < L2_CACHE_SIZE; i++) { + if (s->l2_cache_counts[i] < min_count) { + min_count = s->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = s->l2_cache + (min_index << s->l2_bits); + if (new_l2_table) { + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + if (bdrv_pwrite_sync(bs->file->bs, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) < 0) + return 0; + } else { + if (bdrv_pread(bs->file->bs, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)) != + s->l2_size * sizeof(uint64_t)) + return 0; + } + s->l2_cache_offsets[min_index] = l2_offset; + s->l2_cache_counts[min_index] = 1; + found: + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (!cluster_offset || + ((cluster_offset & QCOW_OFLAG_COMPRESSED) && allocate == 1)) { + if (!allocate) + return 0; + /* allocate a new cluster */ + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) && + (n_end - n_start) < s->cluster_sectors) { + /* if the cluster is already compressed, we must + decompress it in the case it is not completely + overwritten */ + if (decompress_cluster(bs, cluster_offset) < 0) + return 0; + cluster_offset = bdrv_getlength(bs->file->bs); + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + /* write the cluster content */ + if (bdrv_pwrite(bs->file->bs, cluster_offset, s->cluster_cache, + s->cluster_size) != + s->cluster_size) + return -1; + } else { + cluster_offset = bdrv_getlength(bs->file->bs); + if (allocate == 1) { + /* round to cluster size */ + cluster_offset = (cluster_offset + s->cluster_size - 1) & + ~(s->cluster_size - 1); + bdrv_truncate(bs->file->bs, cluster_offset + s->cluster_size); + /* if encrypted, we must initialize the cluster + content which won't be written */ + if (bs->encrypted && + (n_end - n_start) < s->cluster_sectors) { + uint64_t start_sect; + assert(s->cipher); + start_sect = (offset & ~(s->cluster_size - 1)) >> 9; + memset(s->cluster_data + 512, 0x00, 512); + for(i = 0; i < s->cluster_sectors; i++) { + if (i < n_start || i >= n_end) { + Error *err = NULL; + if (encrypt_sectors(s, start_sect + i, + s->cluster_data, + s->cluster_data + 512, 1, + true, &err) < 0) { + error_free(err); + errno = EIO; + return -1; + } + if (bdrv_pwrite(bs->file->bs, + cluster_offset + i * 512, + s->cluster_data, 512) != 512) + return -1; + } + } + } + } else if (allocate == 2) { + cluster_offset |= QCOW_OFLAG_COMPRESSED | + (uint64_t)compressed_size << (63 - s->cluster_bits); + } + } + /* update L2 table */ + tmp = cpu_to_be64(cluster_offset); + l2_table[l2_index] = tmp; + if (bdrv_pwrite_sync(bs->file->bs, l2_offset + l2_index * sizeof(tmp), + &tmp, sizeof(tmp)) < 0) + return 0; + } + return cluster_offset; +} + +static int64_t coroutine_fn qcow_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster, n; + uint64_t cluster_offset; + + qemu_co_mutex_lock(&s->lock); + cluster_offset = get_cluster_offset(bs, sector_num << 9, 0, 0, 0, 0); + qemu_co_mutex_unlock(&s->lock); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) + n = nb_sectors; + *pnum = n; + if (!cluster_offset) { + return 0; + } + if ((cluster_offset & QCOW_OFLAG_COMPRESSED) || s->cipher) { + return BDRV_BLOCK_DATA; + } + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | cluster_offset; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +static int decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcowState *s = bs->opaque; + int ret, csize; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + csize = cluster_offset >> (63 - s->cluster_bits); + csize &= (s->cluster_size - 1); + ret = bdrv_pread(bs->file->bs, coffset, s->cluster_data, csize); + if (ret != csize) + return -1; + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data, csize) < 0) { + return -1; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +static coroutine_fn int qcow_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + int ret = 0, n; + uint64_t cluster_offset; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + Error *err = NULL; + + if (qiov->niov > 1) { + buf = orig_buf = qemu_try_blockalign(bs, qiov->size); + if (buf == NULL) { + return -ENOMEM; + } + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + /* prepare next request */ + cluster_offset = get_cluster_offset(bs, sector_num << 9, + 0, 0, 0, 0); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + + if (!cluster_offset) { + if (bs->backing) { + /* read from the base image */ + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing->bs, sector_num, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + } else { + /* Note: in this case, no need to wait */ + memset(buf, 0, 512 * n); + } + } else if (cluster_offset & QCOW_OFLAG_COMPRESSED) { + /* add AIO support for compressed blocks ? */ + if (decompress_cluster(bs, cluster_offset) < 0) { + goto fail; + } + memcpy(buf, + s->cluster_cache + index_in_cluster * 512, 512 * n); + } else { + if ((cluster_offset & 511) != 0) { + goto fail; + } + hd_iov.iov_base = (void *)buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file->bs, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + if (bs->encrypted) { + assert(s->cipher); + if (encrypt_sectors(s, sector_num, buf, buf, + n, false, &err) < 0) { + goto fail; + } + } + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + +done: + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_iovec_from_buf(qiov, 0, orig_buf, qiov->size); + qemu_vfree(orig_buf); + } + + return ret; + +fail: + error_free(err); + ret = -EIO; + goto done; +} + +static coroutine_fn int qcow_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVQcowState *s = bs->opaque; + int index_in_cluster; + uint64_t cluster_offset; + const uint8_t *src_buf; + int ret = 0, n; + uint8_t *cluster_data = NULL; + struct iovec hd_iov; + QEMUIOVector hd_qiov; + uint8_t *buf; + void *orig_buf; + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + if (qiov->niov > 1) { + buf = orig_buf = qemu_try_blockalign(bs, qiov->size); + if (buf == NULL) { + return -ENOMEM; + } + qemu_iovec_to_buf(qiov, 0, buf, qiov->size); + } else { + orig_buf = NULL; + buf = (uint8_t *)qiov->iov->iov_base; + } + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors != 0) { + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + n = s->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + cluster_offset = get_cluster_offset(bs, sector_num << 9, 1, 0, + index_in_cluster, + index_in_cluster + n); + if (!cluster_offset || (cluster_offset & 511) != 0) { + ret = -EIO; + break; + } + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); + if (!cluster_data) { + cluster_data = g_malloc0(s->cluster_size); + } + if (encrypt_sectors(s, sector_num, cluster_data, buf, + n, true, &err) < 0) { + error_free(err); + ret = -EIO; + break; + } + src_buf = cluster_data; + } else { + src_buf = buf; + } + + hd_iov.iov_base = (void *)src_buf; + hd_iov.iov_len = n * 512; + qemu_iovec_init_external(&hd_qiov, &hd_iov, 1); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file->bs, + (cluster_offset >> 9) + index_in_cluster, + n, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + break; + } + ret = 0; + + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + qemu_co_mutex_unlock(&s->lock); + + if (qiov->niov > 1) { + qemu_vfree(orig_buf); + } + g_free(cluster_data); + + return ret; +} + +static void qcow_close(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + + qcrypto_cipher_free(s->cipher); + s->cipher = NULL; + g_free(s->l1_table); + qemu_vfree(s->l2_cache); + g_free(s->cluster_cache); + g_free(s->cluster_data); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static int qcow_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int header_size, backing_filename_len, l1_size, shift, i; + QCowHeader header; + uint8_t *tmp; + int64_t total_size = 0; + char *backing_file = NULL; + int flags = 0; + Error *local_err = NULL; + int ret; + BlockDriverState *qcow_bs; + + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { + flags |= BLOCK_FLAG_ENCRYPT; + } + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto cleanup; + } + + qcow_bs = NULL; + ret = bdrv_open(&qcow_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto cleanup; + } + + ret = bdrv_truncate(qcow_bs, 0); + if (ret < 0) { + goto exit; + } + + memset(&header, 0, sizeof(header)); + header.magic = cpu_to_be32(QCOW_MAGIC); + header.version = cpu_to_be32(QCOW_VERSION); + header.size = cpu_to_be64(total_size); + header_size = sizeof(header); + backing_filename_len = 0; + if (backing_file) { + if (strcmp(backing_file, "fat:")) { + header.backing_file_offset = cpu_to_be64(header_size); + backing_filename_len = strlen(backing_file); + header.backing_file_size = cpu_to_be32(backing_filename_len); + header_size += backing_filename_len; + } else { + /* special backing file for vvfat */ + backing_file = NULL; + } + header.cluster_bits = 9; /* 512 byte cluster to avoid copying + unmodified sectors */ + header.l2_bits = 12; /* 32 KB L2 tables */ + } else { + header.cluster_bits = 12; /* 4 KB clusters */ + header.l2_bits = 9; /* 4 KB L2 tables */ + } + header_size = (header_size + 7) & ~7; + shift = header.cluster_bits + header.l2_bits; + l1_size = (total_size + (1LL << shift) - 1) >> shift; + + header.l1_table_offset = cpu_to_be64(header_size); + if (flags & BLOCK_FLAG_ENCRYPT) { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header.crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + /* write all the data */ + ret = bdrv_pwrite(qcow_bs, 0, &header, sizeof(header)); + if (ret != sizeof(header)) { + goto exit; + } + + if (backing_file) { + ret = bdrv_pwrite(qcow_bs, sizeof(header), + backing_file, backing_filename_len); + if (ret != backing_filename_len) { + goto exit; + } + } + + tmp = g_malloc0(BDRV_SECTOR_SIZE); + for (i = 0; i < ((sizeof(uint64_t)*l1_size + BDRV_SECTOR_SIZE - 1)/ + BDRV_SECTOR_SIZE); i++) { + ret = bdrv_pwrite(qcow_bs, header_size + + BDRV_SECTOR_SIZE*i, tmp, BDRV_SECTOR_SIZE); + if (ret != BDRV_SECTOR_SIZE) { + g_free(tmp); + goto exit; + } + } + + g_free(tmp); + ret = 0; +exit: + bdrv_unref(qcow_bs); +cleanup: + g_free(backing_file); + return ret; +} + +static int qcow_make_empty(BlockDriverState *bs) +{ + BDRVQcowState *s = bs->opaque; + uint32_t l1_length = s->l1_size * sizeof(uint64_t); + int ret; + + memset(s->l1_table, 0, l1_length); + if (bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, s->l1_table, + l1_length) < 0) + return -1; + ret = bdrv_truncate(bs->file->bs, s->l1_table_offset + l1_length); + if (ret < 0) + return ret; + + memset(s->l2_cache, 0, s->l2_size * L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_offsets, 0, L2_CACHE_SIZE * sizeof(uint64_t)); + memset(s->l2_cache_counts, 0, L2_CACHE_SIZE * sizeof(uint32_t)); + + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcowState *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = get_cluster_offset(bs, sector_num << 9, 2, + out_len, 0, 0); + if (cluster_offset == 0) { + ret = -EIO; + goto fail; + } + + cluster_offset &= s->cluster_offset_mask; + ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static int qcow_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcowState *s = bs->opaque; + bdi->cluster_size = s->cluster_size; + return 0; +} + +static QemuOptsList qcow_create_opts = { + .name = "qcow-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qcow_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = QEMU_OPT_BOOL, + .help = "Encrypt the image", + .def_value_str = "off" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_qcow = { + .format_name = "qcow", + .instance_size = sizeof(BDRVQcowState), + .bdrv_probe = qcow_probe, + .bdrv_open = qcow_open, + .bdrv_close = qcow_close, + .bdrv_reopen_prepare = qcow_reopen_prepare, + .bdrv_create = qcow_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .supports_backing = true, + + .bdrv_co_readv = qcow_co_readv, + .bdrv_co_writev = qcow_co_writev, + .bdrv_co_get_block_status = qcow_co_get_block_status, + + .bdrv_set_key = qcow_set_key, + .bdrv_make_empty = qcow_make_empty, + .bdrv_write_compressed = qcow_write_compressed, + .bdrv_get_info = qcow_get_info, + + .create_opts = &qcow_create_opts, +}; + +static void bdrv_qcow_init(void) +{ + bdrv_register(&bdrv_qcow); +} + +block_init(bdrv_qcow_init); diff --git a/src/block/qcow2-cache.c b/src/block/qcow2-cache.c new file mode 100644 index 0000000..86dd7f2 --- /dev/null +++ b/src/block/qcow2-cache.c @@ -0,0 +1,412 @@ +/* + * L2/refcount table cache for the QCOW2 format + * + * Copyright (c) 2010 Kevin Wolf <kwolf@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +/* Needed for CONFIG_MADVISE */ +#include "config-host.h" + +#if defined(CONFIG_MADVISE) || defined(CONFIG_POSIX_MADVISE) +#include <sys/mman.h> +#endif + +#include "block/block_int.h" +#include "qemu-common.h" +#include "qemu/osdep.h" +#include "qcow2.h" +#include "trace.h" + +typedef struct Qcow2CachedTable { + int64_t offset; + uint64_t lru_counter; + int ref; + bool dirty; +} Qcow2CachedTable; + +struct Qcow2Cache { + Qcow2CachedTable *entries; + struct Qcow2Cache *depends; + int size; + bool depends_on_flush; + void *table_array; + uint64_t lru_counter; + uint64_t cache_clean_lru_counter; +}; + +static inline void *qcow2_cache_get_table_addr(BlockDriverState *bs, + Qcow2Cache *c, int table) +{ + BDRVQcow2State *s = bs->opaque; + return (uint8_t *) c->table_array + (size_t) table * s->cluster_size; +} + +static inline int qcow2_cache_get_table_idx(BlockDriverState *bs, + Qcow2Cache *c, void *table) +{ + BDRVQcow2State *s = bs->opaque; + ptrdiff_t table_offset = (uint8_t *) table - (uint8_t *) c->table_array; + int idx = table_offset / s->cluster_size; + assert(idx >= 0 && idx < c->size && table_offset % s->cluster_size == 0); + return idx; +} + +static void qcow2_cache_table_release(BlockDriverState *bs, Qcow2Cache *c, + int i, int num_tables) +{ +#if QEMU_MADV_DONTNEED != QEMU_MADV_INVALID + BDRVQcow2State *s = bs->opaque; + void *t = qcow2_cache_get_table_addr(bs, c, i); + int align = getpagesize(); + size_t mem_size = (size_t) s->cluster_size * num_tables; + size_t offset = QEMU_ALIGN_UP((uintptr_t) t, align) - (uintptr_t) t; + size_t length = QEMU_ALIGN_DOWN(mem_size - offset, align); + if (length > 0) { + qemu_madvise((uint8_t *) t + offset, length, QEMU_MADV_DONTNEED); + } +#endif +} + +static inline bool can_clean_entry(Qcow2Cache *c, int i) +{ + Qcow2CachedTable *t = &c->entries[i]; + return t->ref == 0 && !t->dirty && t->offset != 0 && + t->lru_counter <= c->cache_clean_lru_counter; +} + +void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c) +{ + int i = 0; + while (i < c->size) { + int to_clean = 0; + + /* Skip the entries that we don't need to clean */ + while (i < c->size && !can_clean_entry(c, i)) { + i++; + } + + /* And count how many we can clean in a row */ + while (i < c->size && can_clean_entry(c, i)) { + c->entries[i].offset = 0; + c->entries[i].lru_counter = 0; + i++; + to_clean++; + } + + if (to_clean > 0) { + qcow2_cache_table_release(bs, c, i - to_clean, to_clean); + } + } + + c->cache_clean_lru_counter = c->lru_counter; +} + +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2Cache *c; + + c = g_new0(Qcow2Cache, 1); + c->size = num_tables; + c->entries = g_try_new0(Qcow2CachedTable, num_tables); + c->table_array = qemu_try_blockalign(bs->file->bs, + (size_t) num_tables * s->cluster_size); + + if (!c->entries || !c->table_array) { + qemu_vfree(c->table_array); + g_free(c->entries); + g_free(c); + c = NULL; + } + + return c; +} + +int qcow2_cache_destroy(BlockDriverState *bs, Qcow2Cache *c) +{ + int i; + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + } + + qemu_vfree(c->table_array); + g_free(c->entries); + g_free(c); + + return 0; +} + +static int qcow2_cache_flush_dependency(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret; + + ret = qcow2_cache_flush(bs, c->depends); + if (ret < 0) { + return ret; + } + + c->depends = NULL; + c->depends_on_flush = false; + + return 0; +} + +static int qcow2_cache_entry_flush(BlockDriverState *bs, Qcow2Cache *c, int i) +{ + BDRVQcow2State *s = bs->opaque; + int ret = 0; + + if (!c->entries[i].dirty || !c->entries[i].offset) { + return 0; + } + + trace_qcow2_cache_entry_flush(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + if (c->depends) { + ret = qcow2_cache_flush_dependency(bs, c); + } else if (c->depends_on_flush) { + ret = bdrv_flush(bs->file->bs); + if (ret >= 0) { + c->depends_on_flush = false; + } + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_REFCOUNT_BLOCK, + c->entries[i].offset, s->cluster_size); + } else if (c == s->l2_table_cache) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + c->entries[i].offset, s->cluster_size); + } else { + ret = qcow2_pre_write_overlap_check(bs, 0, + c->entries[i].offset, s->cluster_size); + } + + if (ret < 0) { + return ret; + } + + if (c == s->refcount_block_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_UPDATE_PART); + } else if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE); + } + + ret = bdrv_pwrite(bs->file->bs, c->entries[i].offset, + qcow2_cache_get_table_addr(bs, c, i), s->cluster_size); + if (ret < 0) { + return ret; + } + + c->entries[i].dirty = false; + + return 0; +} + +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c) +{ + BDRVQcow2State *s = bs->opaque; + int result = 0; + int ret; + int i; + + trace_qcow2_cache_flush(qemu_coroutine_self(), c == s->l2_table_cache); + + for (i = 0; i < c->size; i++) { + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0 && result != -ENOSPC) { + result = ret; + } + } + + if (result == 0) { + ret = bdrv_flush(bs->file->bs); + if (ret < 0) { + result = ret; + } + } + + return result; +} + +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency) +{ + int ret; + + if (dependency->depends) { + ret = qcow2_cache_flush_dependency(bs, dependency); + if (ret < 0) { + return ret; + } + } + + if (c->depends && (c->depends != dependency)) { + ret = qcow2_cache_flush_dependency(bs, c); + if (ret < 0) { + return ret; + } + } + + c->depends = dependency; + return 0; +} + +void qcow2_cache_depends_on_flush(Qcow2Cache *c) +{ + c->depends_on_flush = true; +} + +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c) +{ + int ret, i; + + ret = qcow2_cache_flush(bs, c); + if (ret < 0) { + return ret; + } + + for (i = 0; i < c->size; i++) { + assert(c->entries[i].ref == 0); + c->entries[i].offset = 0; + c->entries[i].lru_counter = 0; + } + + qcow2_cache_table_release(bs, c, 0, c->size); + + c->lru_counter = 0; + + return 0; +} + +static int qcow2_cache_do_get(BlockDriverState *bs, Qcow2Cache *c, + uint64_t offset, void **table, bool read_from_disk) +{ + BDRVQcow2State *s = bs->opaque; + int i; + int ret; + int lookup_index; + uint64_t min_lru_counter = UINT64_MAX; + int min_lru_index = -1; + + trace_qcow2_cache_get(qemu_coroutine_self(), c == s->l2_table_cache, + offset, read_from_disk); + + /* Check if the table is already cached */ + i = lookup_index = (offset / s->cluster_size * 4) % c->size; + do { + const Qcow2CachedTable *t = &c->entries[i]; + if (t->offset == offset) { + goto found; + } + if (t->ref == 0 && t->lru_counter < min_lru_counter) { + min_lru_counter = t->lru_counter; + min_lru_index = i; + } + if (++i == c->size) { + i = 0; + } + } while (i != lookup_index); + + if (min_lru_index == -1) { + /* This can't happen in current synchronous code, but leave the check + * here as a reminder for whoever starts using AIO with the cache */ + abort(); + } + + /* Cache miss: write a table back and replace it */ + i = min_lru_index; + trace_qcow2_cache_get_replace_entry(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + ret = qcow2_cache_entry_flush(bs, c, i); + if (ret < 0) { + return ret; + } + + trace_qcow2_cache_get_read(qemu_coroutine_self(), + c == s->l2_table_cache, i); + c->entries[i].offset = 0; + if (read_from_disk) { + if (c == s->l2_table_cache) { + BLKDBG_EVENT(bs->file, BLKDBG_L2_LOAD); + } + + ret = bdrv_pread(bs->file->bs, offset, + qcow2_cache_get_table_addr(bs, c, i), + s->cluster_size); + if (ret < 0) { + return ret; + } + } + + c->entries[i].offset = offset; + + /* And return the right table */ +found: + c->entries[i].ref++; + *table = qcow2_cache_get_table_addr(bs, c, i); + + trace_qcow2_cache_get_done(qemu_coroutine_self(), + c == s->l2_table_cache, i); + + return 0; +} + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, true); +} + +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table) +{ + return qcow2_cache_do_get(bs, c, offset, table, false); +} + +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table) +{ + int i = qcow2_cache_get_table_idx(bs, c, *table); + + c->entries[i].ref--; + *table = NULL; + + if (c->entries[i].ref == 0) { + c->entries[i].lru_counter = ++c->lru_counter; + } + + assert(c->entries[i].ref >= 0); +} + +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, + void *table) +{ + int i = qcow2_cache_get_table_idx(bs, c, table); + assert(c->entries[i].offset != 0); + c->entries[i].dirty = true; +} diff --git a/src/block/qcow2-cluster.c b/src/block/qcow2-cluster.c new file mode 100644 index 0000000..24a60e2 --- /dev/null +++ b/src/block/qcow2-cluster.c @@ -0,0 +1,1895 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <zlib.h> + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "trace.h" + +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size) +{ + BDRVQcow2State *s = bs->opaque; + int new_l1_size2, ret, i; + uint64_t *new_l1_table; + int64_t old_l1_table_offset, old_l1_size; + int64_t new_l1_table_offset, new_l1_size; + uint8_t data[12]; + + if (min_size <= s->l1_size) + return 0; + + /* Do a sanity check on min_size before trying to calculate new_l1_size + * (this prevents overflows during the while loop for the calculation of + * new_l1_size) */ + if (min_size > INT_MAX / sizeof(uint64_t)) { + return -EFBIG; + } + + if (exact_size) { + new_l1_size = min_size; + } else { + /* Bump size up to reduce the number of times we have to grow */ + new_l1_size = s->l1_size; + if (new_l1_size == 0) { + new_l1_size = 1; + } + while (min_size > new_l1_size) { + new_l1_size = (new_l1_size * 3 + 1) / 2; + } + } + + if (new_l1_size > INT_MAX / sizeof(uint64_t)) { + return -EFBIG; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "grow l1_table from %d to %" PRId64 "\n", + s->l1_size, new_l1_size); +#endif + + new_l1_size2 = sizeof(uint64_t) * new_l1_size; + new_l1_table = qemu_try_blockalign(bs->file->bs, + align_offset(new_l1_size2, 512)); + if (new_l1_table == NULL) { + return -ENOMEM; + } + memset(new_l1_table, 0, align_offset(new_l1_size2, 512)); + + memcpy(new_l1_table, s->l1_table, s->l1_size * sizeof(uint64_t)); + + /* write new table (align to cluster) */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ALLOC_TABLE); + new_l1_table_offset = qcow2_alloc_clusters(bs, new_l1_size2); + if (new_l1_table_offset < 0) { + qemu_vfree(new_l1_table); + return new_l1_table_offset; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* the L1 position has not yet been updated, so these clusters must + * indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, new_l1_table_offset, + new_l1_size2); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_WRITE_TABLE); + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = cpu_to_be64(new_l1_table[i]); + ret = bdrv_pwrite_sync(bs->file->bs, new_l1_table_offset, + new_l1_table, new_l1_size2); + if (ret < 0) + goto fail; + for(i = 0; i < s->l1_size; i++) + new_l1_table[i] = be64_to_cpu(new_l1_table[i]); + + /* set new table */ + BLKDBG_EVENT(bs->file, BLKDBG_L1_GROW_ACTIVATE_TABLE); + cpu_to_be32w((uint32_t*)data, new_l1_size); + stq_be_p(data + 4, new_l1_table_offset); + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_size), + data, sizeof(data)); + if (ret < 0) { + goto fail; + } + qemu_vfree(s->l1_table); + old_l1_table_offset = s->l1_table_offset; + s->l1_table_offset = new_l1_table_offset; + s->l1_table = new_l1_table; + old_l1_size = s->l1_size; + s->l1_size = new_l1_size; + qcow2_free_clusters(bs, old_l1_table_offset, old_l1_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + return 0; + fail: + qemu_vfree(new_l1_table); + qcow2_free_clusters(bs, new_l1_table_offset, new_l1_size2, + QCOW2_DISCARD_OTHER); + return ret; +} + +/* + * l2_load + * + * Loads a L2 table into memory. If the table is in the cache, the cache + * is used; otherwise the L2 table is loaded from the image file. + * + * Returns a pointer to the L2 table on success, or NULL if the read from + * the image file failed. + */ + +static int l2_load(BlockDriverState *bs, uint64_t l2_offset, + uint64_t **l2_table) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, (void**) l2_table); + + return ret; +} + +/* + * Writes one sector of the L1 table to the disk (can't update single entries + * and we really don't want bdrv_pread to perform a read-modify-write) + */ +#define L1_ENTRIES_PER_SECTOR (512 / 8) +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t buf[L1_ENTRIES_PER_SECTOR] = { 0 }; + int l1_start_index; + int i, ret; + + l1_start_index = l1_index & ~(L1_ENTRIES_PER_SECTOR - 1); + for (i = 0; i < L1_ENTRIES_PER_SECTOR && l1_start_index + i < s->l1_size; + i++) + { + buf[i] = cpu_to_be64(s->l1_table[l1_start_index + i]); + } + + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset + 8 * l1_start_index, sizeof(buf)); + if (ret < 0) { + return ret; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + ret = bdrv_pwrite_sync(bs->file->bs, + s->l1_table_offset + 8 * l1_start_index, + buf, sizeof(buf)); + if (ret < 0) { + return ret; + } + + return 0; +} + +/* + * l2_allocate + * + * Allocate a new l2 entry in the file. If l1_index points to an already + * used entry in the L2 table (i.e. we are doing a copy on write for the L2 + * table) copy the contents of the old L2 table into the newly allocated one. + * Otherwise the new table is initialized with zeros. + * + */ + +static int l2_allocate(BlockDriverState *bs, int l1_index, uint64_t **table) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t old_l2_offset; + uint64_t *l2_table = NULL; + int64_t l2_offset; + int ret; + + old_l2_offset = s->l1_table[l1_index]; + + trace_qcow2_l2_allocate(bs, l1_index); + + /* allocate a new l2 entry */ + + l2_offset = qcow2_alloc_clusters(bs, s->l2_size * sizeof(uint64_t)); + if (l2_offset < 0) { + ret = l2_offset; + goto fail; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* allocate a new entry in the l2 cache */ + + trace_qcow2_l2_allocate_get_empty(bs, l1_index); + ret = qcow2_cache_get_empty(bs, s->l2_table_cache, l2_offset, (void**) table); + if (ret < 0) { + goto fail; + } + + l2_table = *table; + + if ((old_l2_offset & L1E_OFFSET_MASK) == 0) { + /* if there was no old l2 table, clear the new table */ + memset(l2_table, 0, s->l2_size * sizeof(uint64_t)); + } else { + uint64_t* old_table; + + /* if there was an old l2 table, read it from the disk */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_COW_READ); + ret = qcow2_cache_get(bs, s->l2_table_cache, + old_l2_offset & L1E_OFFSET_MASK, + (void**) &old_table); + if (ret < 0) { + goto fail; + } + + memcpy(l2_table, old_table, s->cluster_size); + + qcow2_cache_put(bs, s->l2_table_cache, (void **) &old_table); + } + + /* write the l2 table to the file */ + BLKDBG_EVENT(bs->file, BLKDBG_L2_ALLOC_WRITE); + + trace_qcow2_l2_allocate_write_l2(bs, l1_index); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + /* update the L1 entry */ + trace_qcow2_l2_allocate_write_l1(bs, l1_index); + s->l1_table[l1_index] = l2_offset | QCOW_OFLAG_COPIED; + ret = qcow2_write_l1_entry(bs, l1_index); + if (ret < 0) { + goto fail; + } + + *table = l2_table; + trace_qcow2_l2_allocate_done(bs, l1_index, 0); + return 0; + +fail: + trace_qcow2_l2_allocate_done(bs, l1_index, ret); + if (l2_table != NULL) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) table); + } + s->l1_table[l1_index] = old_l2_offset; + if (l2_offset > 0) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_ALWAYS); + } + return ret; +} + +/* + * Checks how many clusters in a given L2 table are contiguous in the image + * file. As soon as one of the flags in the bitmask stop_flags changes compared + * to the first cluster, the search is stopped and the cluster is not counted + * as contiguous. (This allows it, for example, to stop at the first compressed + * cluster which may require a different handling) + */ +static int count_contiguous_clusters(int nb_clusters, int cluster_size, + uint64_t *l2_table, uint64_t stop_flags) +{ + int i; + uint64_t mask = stop_flags | L2E_OFFSET_MASK | QCOW_OFLAG_COMPRESSED; + uint64_t first_entry = be64_to_cpu(l2_table[0]); + uint64_t offset = first_entry & mask; + + if (!offset) + return 0; + + assert(qcow2_get_cluster_type(first_entry) == QCOW2_CLUSTER_NORMAL); + + for (i = 0; i < nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[i]) & mask; + if (offset + (uint64_t) i * cluster_size != l2_entry) { + break; + } + } + + return i; +} + +static int count_contiguous_clusters_by_type(int nb_clusters, + uint64_t *l2_table, + int wanted_type) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + int type = qcow2_get_cluster_type(be64_to_cpu(l2_table[i])); + + if (type != wanted_type) { + break; + } + } + + return i; +} + +/* The crypt function is compatible with the linux cryptoloop + algorithm for < 4 GB images. NOTE: out_buf == in_buf is + supported */ +int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, + Error **errp) +{ + union { + uint64_t ll[2]; + uint8_t b[16]; + } ivec; + int i; + int ret; + + for(i = 0; i < nb_sectors; i++) { + ivec.ll[0] = cpu_to_le64(sector_num); + ivec.ll[1] = 0; + if (qcrypto_cipher_setiv(s->cipher, + ivec.b, G_N_ELEMENTS(ivec.b), + errp) < 0) { + return -1; + } + if (enc) { + ret = qcrypto_cipher_encrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } else { + ret = qcrypto_cipher_decrypt(s->cipher, + in_buf, + out_buf, + 512, + errp); + } + if (ret < 0) { + return -1; + } + sector_num++; + in_buf += 512; + out_buf += 512; + } + return 0; +} + +static int coroutine_fn copy_sectors(BlockDriverState *bs, + uint64_t start_sect, + uint64_t cluster_offset, + int n_start, int n_end) +{ + BDRVQcow2State *s = bs->opaque; + QEMUIOVector qiov; + struct iovec iov; + int n, ret; + + n = n_end - n_start; + if (n <= 0) { + return 0; + } + + iov.iov_len = n * BDRV_SECTOR_SIZE; + iov.iov_base = qemu_try_blockalign(bs, iov.iov_len); + if (iov.iov_base == NULL) { + return -ENOMEM; + } + + qemu_iovec_init_external(&qiov, &iov, 1); + + BLKDBG_EVENT(bs->file, BLKDBG_COW_READ); + + if (!bs->drv) { + ret = -ENOMEDIUM; + goto out; + } + + /* Call .bdrv_co_readv() directly instead of using the public block-layer + * interface. This avoids double I/O throttling and request tracking, + * which can lead to deadlock when block layer copy-on-read is enabled. + */ + ret = bs->drv->bdrv_co_readv(bs, start_sect + n_start, n, &qiov); + if (ret < 0) { + goto out; + } + + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); + if (qcow2_encrypt_sectors(s, start_sect + n_start, + iov.iov_base, iov.iov_base, n, + true, &err) < 0) { + ret = -EIO; + error_free(err); + goto out; + } + } + + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + n_start * BDRV_SECTOR_SIZE, n * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto out; + } + + BLKDBG_EVENT(bs->file, BLKDBG_COW_WRITE); + ret = bdrv_co_writev(bs->file->bs, (cluster_offset >> 9) + n_start, n, + &qiov); + if (ret < 0) { + goto out; + } + + ret = 0; +out: + qemu_vfree(iov.iov_base); + return ret; +} + + +/* + * get_cluster_offset + * + * For a given offset of the disk image, find the cluster offset in + * qcow2 file. The offset is stored in *cluster_offset. + * + * on entry, *num is the number of contiguous sectors we'd like to + * access following offset. + * + * on exit, *num is the number of contiguous sectors we can read. + * + * Returns the cluster type (QCOW2_CLUSTER_*) on success, -errno in error + * cases. + */ +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset) +{ + BDRVQcow2State *s = bs->opaque; + unsigned int l2_index; + uint64_t l1_index, l2_offset, *l2_table; + int l1_bits, c; + unsigned int index_in_cluster, nb_clusters; + uint64_t nb_available, nb_needed; + int ret; + + index_in_cluster = (offset >> 9) & (s->cluster_sectors - 1); + nb_needed = *num + index_in_cluster; + + l1_bits = s->l2_bits + s->cluster_bits; + + /* compute how many bytes there are between the offset and + * the end of the l1 entry + */ + + nb_available = (1ULL << l1_bits) - (offset & ((1ULL << l1_bits) - 1)); + + /* compute the number of available sectors */ + + nb_available = (nb_available >> 9) + index_in_cluster; + + if (nb_needed > nb_available) { + nb_needed = nb_available; + } + assert(nb_needed <= INT_MAX); + + *cluster_offset = 0; + + /* seek to the l2 offset in the l1 table */ + + l1_index = offset >> l1_bits; + if (l1_index >= s->l1_size) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + if (!l2_offset) { + ret = QCOW2_CLUSTER_UNALLOCATED; + goto out; + } + + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 + " unaligned (L1 index: %#" PRIx64 ")", + l2_offset, l1_index); + return -EIO; + } + + /* load the l2 table in memory */ + + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + *cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* nb_needed <= INT_MAX, thus nb_clusters <= INT_MAX, too */ + nb_clusters = size_to_clusters(s, nb_needed << 9); + + ret = qcow2_get_cluster_type(*cluster_offset); + switch (ret) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters can only be processed one by one */ + c = 1; + *cluster_offset &= L2E_COMPRESSED_OFFSET_SIZE_MASK; + break; + case QCOW2_CLUSTER_ZERO: + if (s->qcow_version < 3) { + qcow2_signal_corruption(bs, true, -1, -1, "Zero cluster entry found" + " in pre-v3 image (L2 offset: %#" PRIx64 + ", L2 index: %#x)", l2_offset, l2_index); + ret = -EIO; + goto fail; + } + c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index], + QCOW2_CLUSTER_ZERO); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_UNALLOCATED: + /* how many empty clusters ? */ + c = count_contiguous_clusters_by_type(nb_clusters, &l2_table[l2_index], + QCOW2_CLUSTER_UNALLOCATED); + *cluster_offset = 0; + break; + case QCOW2_CLUSTER_NORMAL: + /* how many allocated clusters ? */ + c = count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], QCOW_OFLAG_ZERO); + *cluster_offset &= L2E_OFFSET_MASK; + if (offset_into_cluster(s, *cluster_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset %#" + PRIx64 " unaligned (L2 offset: %#" PRIx64 + ", L2 index: %#x)", *cluster_offset, + l2_offset, l2_index); + ret = -EIO; + goto fail; + } + break; + default: + abort(); + } + + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + + nb_available = (c * s->cluster_sectors); + +out: + if (nb_available > nb_needed) + nb_available = nb_needed; + + *num = nb_available - index_in_cluster; + + return ret; + +fail: + qcow2_cache_put(bs, s->l2_table_cache, (void **)&l2_table); + return ret; +} + +/* + * get_cluster_table + * + * for a given disk offset, load (and allocate if needed) + * the l2 table. + * + * the l2 table offset in the qcow2 file and the cluster index + * in the l2 table are given to the caller. + * + * Returns 0 on success, -errno in failure case + */ +static int get_cluster_table(BlockDriverState *bs, uint64_t offset, + uint64_t **new_l2_table, + int *new_l2_index) +{ + BDRVQcow2State *s = bs->opaque; + unsigned int l2_index; + uint64_t l1_index, l2_offset; + uint64_t *l2_table = NULL; + int ret; + + /* seek to the l2 offset in the l1 table */ + + l1_index = offset >> (s->l2_bits + s->cluster_bits); + if (l1_index >= s->l1_size) { + ret = qcow2_grow_l1_table(bs, l1_index + 1, false); + if (ret < 0) { + return ret; + } + } + + assert(l1_index < s->l1_size); + l2_offset = s->l1_table[l1_index] & L1E_OFFSET_MASK; + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" PRIx64 + " unaligned (L1 index: %#" PRIx64 ")", + l2_offset, l1_index); + return -EIO; + } + + /* seek the l2 table of the given l2 offset */ + + if (s->l1_table[l1_index] & QCOW_OFLAG_COPIED) { + /* load the l2 table in memory */ + ret = l2_load(bs, l2_offset, &l2_table); + if (ret < 0) { + return ret; + } + } else { + /* First allocate a new L2 table (and do COW if needed) */ + ret = l2_allocate(bs, l1_index, &l2_table); + if (ret < 0) { + return ret; + } + + /* Then decrease the refcount of the old table */ + if (l2_offset) { + qcow2_free_clusters(bs, l2_offset, s->l2_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + } + } + + /* find the cluster offset for the given disk offset */ + + l2_index = (offset >> s->cluster_bits) & (s->l2_size - 1); + + *new_l2_table = l2_table; + *new_l2_index = l2_index; + + return 0; +} + +/* + * alloc_compressed_cluster_offset + * + * For a given offset of the disk image, return cluster offset in + * qcow2 file. + * + * If the offset is not found, allocate a new compressed cluster. + * + * Return the cluster offset if successful, + * Return 0, otherwise. + * + */ + +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size) +{ + BDRVQcow2State *s = bs->opaque; + int l2_index, ret; + uint64_t *l2_table; + int64_t cluster_offset; + int nb_csectors; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return 0; + } + + /* Compression can't overwrite anything. Fail if the cluster was already + * allocated. */ + cluster_offset = be64_to_cpu(l2_table[l2_index]); + if (cluster_offset & L2E_OFFSET_MASK) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + cluster_offset = qcow2_alloc_bytes(bs, compressed_size); + if (cluster_offset < 0) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + return 0; + } + + nb_csectors = ((cluster_offset + compressed_size - 1) >> 9) - + (cluster_offset >> 9); + + cluster_offset |= QCOW_OFLAG_COMPRESSED | + ((uint64_t)nb_csectors << s->csize_shift); + + /* update L2 table */ + + /* compressed clusters never have the copied flag */ + + BLKDBG_EVENT(bs->file, BLKDBG_L2_UPDATE_COMPRESSED); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + l2_table[l2_index] = cpu_to_be64(cluster_offset); + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + return cluster_offset; +} + +static int perform_cow(BlockDriverState *bs, QCowL2Meta *m, Qcow2COWRegion *r) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + + if (r->nb_sectors == 0) { + return 0; + } + + qemu_co_mutex_unlock(&s->lock); + ret = copy_sectors(bs, m->offset / BDRV_SECTOR_SIZE, m->alloc_offset, + r->offset / BDRV_SECTOR_SIZE, + r->offset / BDRV_SECTOR_SIZE + r->nb_sectors); + qemu_co_mutex_lock(&s->lock); + + if (ret < 0) { + return ret; + } + + /* + * Before we update the L2 table to actually point to the new cluster, we + * need to be sure that the refcounts have been increased and COW was + * handled. + */ + qcow2_cache_depends_on_flush(s->l2_table_cache); + + return 0; +} + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m) +{ + BDRVQcow2State *s = bs->opaque; + int i, j = 0, l2_index, ret; + uint64_t *old_cluster, *l2_table; + uint64_t cluster_offset = m->alloc_offset; + + trace_qcow2_cluster_link_l2(qemu_coroutine_self(), m->nb_clusters); + assert(m->nb_clusters > 0); + + old_cluster = g_try_new(uint64_t, m->nb_clusters); + if (old_cluster == NULL) { + ret = -ENOMEM; + goto err; + } + + /* copy content of unmodified sectors */ + ret = perform_cow(bs, m, &m->cow_start); + if (ret < 0) { + goto err; + } + + ret = perform_cow(bs, m, &m->cow_end); + if (ret < 0) { + goto err; + } + + /* Update L2 table. */ + if (s->use_lazy_refcounts) { + qcow2_mark_dirty(bs); + } + if (qcow2_need_accurate_refcounts(s)) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + + ret = get_cluster_table(bs, m->offset, &l2_table, &l2_index); + if (ret < 0) { + goto err; + } + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + + assert(l2_index + m->nb_clusters <= s->l2_size); + for (i = 0; i < m->nb_clusters; i++) { + /* if two concurrent writes happen to the same unallocated cluster + * each write allocates separate cluster and writes data concurrently. + * The first one to complete updates l2 table with pointer to its + * cluster the second one has to do RMW (which is done above by + * copy_sectors()), update l2 table with its cluster pointer and free + * old cluster. This is what this loop does */ + if(l2_table[l2_index + i] != 0) + old_cluster[j++] = l2_table[l2_index + i]; + + l2_table[l2_index + i] = cpu_to_be64((cluster_offset + + (i << s->cluster_bits)) | QCOW_OFLAG_COPIED); + } + + + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + /* + * If this was a COW, we need to decrease the refcount of the old cluster. + * + * Don't discard clusters that reach a refcount of 0 (e.g. compressed + * clusters), the next write will reuse them anyway. + */ + if (j != 0) { + for (i = 0; i < j; i++) { + qcow2_free_any_clusters(bs, be64_to_cpu(old_cluster[i]), 1, + QCOW2_DISCARD_NEVER); + } + } + + ret = 0; +err: + g_free(old_cluster); + return ret; + } + +/* + * Returns the number of contiguous clusters that can be used for an allocating + * write, but require COW to be performed (this includes yet unallocated space, + * which must copy from the backing file) + */ +static int count_cow_clusters(BDRVQcow2State *s, int nb_clusters, + uint64_t *l2_table, int l2_index) +{ + int i; + + for (i = 0; i < nb_clusters; i++) { + uint64_t l2_entry = be64_to_cpu(l2_table[l2_index + i]); + int cluster_type = qcow2_get_cluster_type(l2_entry); + + switch(cluster_type) { + case QCOW2_CLUSTER_NORMAL: + if (l2_entry & QCOW_OFLAG_COPIED) { + goto out; + } + break; + case QCOW2_CLUSTER_UNALLOCATED: + case QCOW2_CLUSTER_COMPRESSED: + case QCOW2_CLUSTER_ZERO: + break; + default: + abort(); + } + } + +out: + assert(i <= nb_clusters); + return i; +} + +/* + * Check if there already is an AIO write request in flight which allocates + * the same cluster. In this case we need to wait until the previous + * request has completed and updated the L2 table accordingly. + * + * Returns: + * 0 if there was no dependency. *cur_bytes indicates the number of + * bytes from guest_offset that can be read before the next + * dependency must be processed (or the request is complete) + * + * -EAGAIN if we had to wait for another request, previously gathered + * information on cluster allocation may be invalid now. The caller + * must start over anyway, so consider *cur_bytes undefined. + */ +static int handle_dependencies(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *cur_bytes, QCowL2Meta **m) +{ + BDRVQcow2State *s = bs->opaque; + QCowL2Meta *old_alloc; + uint64_t bytes = *cur_bytes; + + QLIST_FOREACH(old_alloc, &s->cluster_allocs, next_in_flight) { + + uint64_t start = guest_offset; + uint64_t end = start + bytes; + uint64_t old_start = l2meta_cow_start(old_alloc); + uint64_t old_end = l2meta_cow_end(old_alloc); + + if (end <= old_start || start >= old_end) { + /* No intersection */ + } else { + if (start < old_start) { + /* Stop at the start of a running allocation */ + bytes = old_start - start; + } else { + bytes = 0; + } + + /* Stop if already an l2meta exists. After yielding, it wouldn't + * be valid any more, so we'd have to clean up the old L2Metas + * and deal with requests depending on them before starting to + * gather new ones. Not worth the trouble. */ + if (bytes == 0 && *m) { + *cur_bytes = 0; + return 0; + } + + if (bytes == 0) { + /* Wait for the dependency to complete. We need to recheck + * the free/allocated clusters when we continue. */ + qemu_co_mutex_unlock(&s->lock); + qemu_co_queue_wait(&old_alloc->dependent_requests); + qemu_co_mutex_lock(&s->lock); + return -EAGAIN; + } + } + } + + /* Make sure that existing clusters and new allocations are only used up to + * the next dependency if we shortened the request above */ + *cur_bytes = bytes; + + return 0; +} + +/* + * Checks how many already allocated clusters that don't require a copy on + * write there are at the given guest_offset (up to *bytes). If + * *host_offset is not zero, only physically contiguous clusters beginning at + * this host offset are counted. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no allocated clusters are available at the given offset. + * *bytes is normally unchanged. It is set to 0 if the cluster + * is allocated and doesn't need COW, but doesn't have the right + * physical offset. + * + * 1: if allocated clusters that don't require a COW are available at + * the requested offset. *bytes may have decreased and describes + * the length of the area that can be written to. + * + * -errno: in error cases + */ +static int handle_copied(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcow2State *s = bs->opaque; + int l2_index; + uint64_t cluster_offset; + uint64_t *l2_table; + uint64_t nb_clusters; + unsigned int keep_clusters; + int ret; + + trace_qcow2_handle_copied(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + + assert(*host_offset == 0 || offset_into_cluster(s, guest_offset) + == offset_into_cluster(s, *host_offset)); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + cluster_offset = be64_to_cpu(l2_table[l2_index]); + + /* Check how many clusters are already allocated and don't need COW */ + if (qcow2_get_cluster_type(cluster_offset) == QCOW2_CLUSTER_NORMAL + && (cluster_offset & QCOW_OFLAG_COPIED)) + { + /* If a specific host_offset is required, check it */ + bool offset_matches = + (cluster_offset & L2E_OFFSET_MASK) == *host_offset; + + if (offset_into_cluster(s, cluster_offset & L2E_OFFSET_MASK)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " + "%#llx unaligned (guest offset: %#" PRIx64 + ")", cluster_offset & L2E_OFFSET_MASK, + guest_offset); + ret = -EIO; + goto out; + } + + if (*host_offset != 0 && !offset_matches) { + *bytes = 0; + ret = 0; + goto out; + } + + /* We keep all QCOW_OFLAG_COPIED clusters */ + keep_clusters = + count_contiguous_clusters(nb_clusters, s->cluster_size, + &l2_table[l2_index], + QCOW_OFLAG_COPIED | QCOW_OFLAG_ZERO); + assert(keep_clusters <= nb_clusters); + + *bytes = MIN(*bytes, + keep_clusters * s->cluster_size + - offset_into_cluster(s, guest_offset)); + + ret = 1; + } else { + ret = 0; + } + + /* Cleanup */ +out: + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + /* Only return a host offset if we actually made progress. Otherwise we + * would make requirements for handle_alloc() that it can't fulfill */ + if (ret > 0) { + *host_offset = (cluster_offset & L2E_OFFSET_MASK) + + offset_into_cluster(s, guest_offset); + } + + return ret; +} + +/* + * Allocates new clusters for the given guest_offset. + * + * At most *nb_clusters are allocated, and on return *nb_clusters is updated to + * contain the number of clusters that have been allocated and are contiguous + * in the image file. + * + * If *host_offset is non-zero, it specifies the offset in the image file at + * which the new clusters must start. *nb_clusters can be 0 on return in this + * case if the cluster at host_offset is already in use. If *host_offset is + * zero, the clusters can be allocated anywhere in the image file. + * + * *host_offset is updated to contain the offset into the image file at which + * the first allocated cluster starts. + * + * Return 0 on success and -errno in error cases. -EAGAIN means that the + * function has been waiting for another request and the allocation must be + * restarted, but the whole request should not be failed. + */ +static int do_alloc_cluster_offset(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + + trace_qcow2_do_alloc_clusters_offset(qemu_coroutine_self(), guest_offset, + *host_offset, *nb_clusters); + + /* Allocate new clusters */ + trace_qcow2_cluster_alloc_phys(qemu_coroutine_self()); + if (*host_offset == 0) { + int64_t cluster_offset = + qcow2_alloc_clusters(bs, *nb_clusters * s->cluster_size); + if (cluster_offset < 0) { + return cluster_offset; + } + *host_offset = cluster_offset; + return 0; + } else { + int64_t ret = qcow2_alloc_clusters_at(bs, *host_offset, *nb_clusters); + if (ret < 0) { + return ret; + } + *nb_clusters = ret; + return 0; + } +} + +/* + * Allocates new clusters for an area that either is yet unallocated or needs a + * copy on write. If *host_offset is non-zero, clusters are only allocated if + * the new allocation can match the specified host offset. + * + * Note that guest_offset may not be cluster aligned. In this case, the + * returned *host_offset points to exact byte referenced by guest_offset and + * therefore isn't cluster aligned as well. + * + * Returns: + * 0: if no clusters could be allocated. *bytes is set to 0, + * *host_offset is left unchanged. + * + * 1: if new clusters were allocated. *bytes may be decreased if the + * new allocation doesn't cover all of the requested area. + * *host_offset is updated to contain the host offset of the first + * newly allocated cluster. + * + * -errno: in error cases + */ +static int handle_alloc(BlockDriverState *bs, uint64_t guest_offset, + uint64_t *host_offset, uint64_t *bytes, QCowL2Meta **m) +{ + BDRVQcow2State *s = bs->opaque; + int l2_index; + uint64_t *l2_table; + uint64_t entry; + uint64_t nb_clusters; + int ret; + + uint64_t alloc_cluster_offset; + + trace_qcow2_handle_alloc(qemu_coroutine_self(), guest_offset, *host_offset, + *bytes); + assert(*bytes > 0); + + /* + * Calculate the number of clusters to look for. We stop at L2 table + * boundaries to keep things simple. + */ + nb_clusters = + size_to_clusters(s, offset_into_cluster(s, guest_offset) + *bytes); + + l2_index = offset_to_l2_index(s, guest_offset); + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); + + /* Find L2 entry for the first involved cluster */ + ret = get_cluster_table(bs, guest_offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + entry = be64_to_cpu(l2_table[l2_index]); + + /* For the moment, overwrite compressed clusters one by one */ + if (entry & QCOW_OFLAG_COMPRESSED) { + nb_clusters = 1; + } else { + nb_clusters = count_cow_clusters(s, nb_clusters, l2_table, l2_index); + } + + /* This function is only called when there were no non-COW clusters, so if + * we can't find any unallocated or COW clusters either, something is + * wrong with our code. */ + assert(nb_clusters > 0); + + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + /* Allocate, if necessary at a given offset in the image file */ + alloc_cluster_offset = start_of_cluster(s, *host_offset); + ret = do_alloc_cluster_offset(bs, guest_offset, &alloc_cluster_offset, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* Can't extend contiguous allocation */ + if (nb_clusters == 0) { + *bytes = 0; + return 0; + } + + /* !*host_offset would overwrite the image header and is reserved for "no + * host offset preferred". If 0 was a valid host offset, it'd trigger the + * following overlap check; do that now to avoid having an invalid value in + * *host_offset. */ + if (!alloc_cluster_offset) { + ret = qcow2_pre_write_overlap_check(bs, 0, alloc_cluster_offset, + nb_clusters * s->cluster_size); + assert(ret < 0); + goto fail; + } + + /* + * Save info needed for meta data update. + * + * requested_sectors: Number of sectors from the start of the first + * newly allocated cluster to the end of the (possibly shortened + * before) write request. + * + * avail_sectors: Number of sectors from the start of the first + * newly allocated to the end of the last newly allocated cluster. + * + * nb_sectors: The number of sectors from the start of the first + * newly allocated cluster to the end of the area that the write + * request actually writes to (excluding COW at the end) + */ + int requested_sectors = + (*bytes + offset_into_cluster(s, guest_offset)) + >> BDRV_SECTOR_BITS; + int avail_sectors = nb_clusters + << (s->cluster_bits - BDRV_SECTOR_BITS); + int alloc_n_start = offset_into_cluster(s, guest_offset) + >> BDRV_SECTOR_BITS; + int nb_sectors = MIN(requested_sectors, avail_sectors); + QCowL2Meta *old_m = *m; + + *m = g_malloc0(sizeof(**m)); + + **m = (QCowL2Meta) { + .next = old_m, + + .alloc_offset = alloc_cluster_offset, + .offset = start_of_cluster(s, guest_offset), + .nb_clusters = nb_clusters, + .nb_available = nb_sectors, + + .cow_start = { + .offset = 0, + .nb_sectors = alloc_n_start, + }, + .cow_end = { + .offset = nb_sectors * BDRV_SECTOR_SIZE, + .nb_sectors = avail_sectors - nb_sectors, + }, + }; + qemu_co_queue_init(&(*m)->dependent_requests); + QLIST_INSERT_HEAD(&s->cluster_allocs, *m, next_in_flight); + + *host_offset = alloc_cluster_offset + offset_into_cluster(s, guest_offset); + *bytes = MIN(*bytes, (nb_sectors * BDRV_SECTOR_SIZE) + - offset_into_cluster(s, guest_offset)); + assert(*bytes != 0); + + return 1; + +fail: + if (*m && (*m)->nb_clusters > 0) { + QLIST_REMOVE(*m, next_in_flight); + } + return ret; +} + +/* + * alloc_cluster_offset + * + * For a given offset on the virtual disk, find the cluster offset in qcow2 + * file. If the offset is not found, allocate a new cluster. + * + * If the cluster was already allocated, m->nb_clusters is set to 0 and + * other fields in m are meaningless. + * + * If the cluster is newly allocated, m->nb_clusters is set to the number of + * contiguous clusters that have been allocated. In this case, the other + * fields of m are valid and contain information about the first allocated + * cluster. + * + * If the request conflicts with another write request in flight, the coroutine + * is queued and will be reentered when the dependency has completed. + * + * Return 0 on success and -errno in error cases + */ +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *host_offset, QCowL2Meta **m) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t start, remaining; + uint64_t cluster_offset; + uint64_t cur_bytes; + int ret; + + trace_qcow2_alloc_clusters_offset(qemu_coroutine_self(), offset, *num); + + assert((offset & ~BDRV_SECTOR_MASK) == 0); + +again: + start = offset; + remaining = (uint64_t)*num << BDRV_SECTOR_BITS; + cluster_offset = 0; + *host_offset = 0; + cur_bytes = 0; + *m = NULL; + + while (true) { + + if (!*host_offset) { + *host_offset = start_of_cluster(s, cluster_offset); + } + + assert(remaining >= cur_bytes); + + start += cur_bytes; + remaining -= cur_bytes; + cluster_offset += cur_bytes; + + if (remaining == 0) { + break; + } + + cur_bytes = remaining; + + /* + * Now start gathering as many contiguous clusters as possible: + * + * 1. Check for overlaps with in-flight allocations + * + * a) Overlap not in the first cluster -> shorten this request and + * let the caller handle the rest in its next loop iteration. + * + * b) Real overlaps of two requests. Yield and restart the search + * for contiguous clusters (the situation could have changed + * while we were sleeping) + * + * c) TODO: Request starts in the same cluster as the in-flight + * allocation ends. Shorten the COW of the in-fight allocation, + * set cluster_offset to write to the same cluster and set up + * the right synchronisation between the in-flight request and + * the new one. + */ + ret = handle_dependencies(bs, start, &cur_bytes, m); + if (ret == -EAGAIN) { + /* Currently handle_dependencies() doesn't yield if we already had + * an allocation. If it did, we would have to clean up the L2Meta + * structs before starting over. */ + assert(*m == NULL); + goto again; + } else if (ret < 0) { + return ret; + } else if (cur_bytes == 0) { + break; + } else { + /* handle_dependencies() may have decreased cur_bytes (shortened + * the allocations below) so that the next dependency is processed + * correctly during the next loop iteration. */ + } + + /* + * 2. Count contiguous COPIED clusters. + */ + ret = handle_copied(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else if (cur_bytes == 0) { + break; + } + + /* + * 3. If the request still hasn't completed, allocate new clusters, + * considering any cluster_offset of steps 1c or 2. + */ + ret = handle_alloc(bs, start, &cluster_offset, &cur_bytes, m); + if (ret < 0) { + return ret; + } else if (ret) { + continue; + } else { + assert(cur_bytes == 0); + break; + } + } + + *num -= remaining >> BDRV_SECTOR_BITS; + assert(*num > 0); + assert(*host_offset != 0); + + return 0; +} + +static int decompress_buffer(uint8_t *out_buf, int out_buf_size, + const uint8_t *buf, int buf_size) +{ + z_stream strm1, *strm = &strm1; + int ret, out_len; + + memset(strm, 0, sizeof(*strm)); + + strm->next_in = (uint8_t *)buf; + strm->avail_in = buf_size; + strm->next_out = out_buf; + strm->avail_out = out_buf_size; + + ret = inflateInit2(strm, -12); + if (ret != Z_OK) + return -1; + ret = inflate(strm, Z_FINISH); + out_len = strm->next_out - out_buf; + if ((ret != Z_STREAM_END && ret != Z_BUF_ERROR) || + out_len != out_buf_size) { + inflateEnd(strm); + return -1; + } + inflateEnd(strm); + return 0; +} + +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset) +{ + BDRVQcow2State *s = bs->opaque; + int ret, csize, nb_csectors, sector_offset; + uint64_t coffset; + + coffset = cluster_offset & s->cluster_offset_mask; + if (s->cluster_cache_offset != coffset) { + nb_csectors = ((cluster_offset >> s->csize_shift) & s->csize_mask) + 1; + sector_offset = coffset & 511; + csize = nb_csectors * 512 - sector_offset; + BLKDBG_EVENT(bs->file, BLKDBG_READ_COMPRESSED); + ret = bdrv_read(bs->file->bs, coffset >> 9, s->cluster_data, + nb_csectors); + if (ret < 0) { + return ret; + } + if (decompress_buffer(s->cluster_cache, s->cluster_size, + s->cluster_data + sector_offset, csize) < 0) { + return -EIO; + } + s->cluster_cache_offset = coffset; + } + return 0; +} + +/* + * This discards as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of discarded + * clusters. + */ +static int discard_single_l2(BlockDriverState *bs, uint64_t offset, + uint64_t nb_clusters, enum qcow2_discard_type type, + bool full_discard) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_l2_entry; + + old_l2_entry = be64_to_cpu(l2_table[l2_index + i]); + + /* + * If full_discard is false, make sure that a discarded area reads back + * as zeroes for v3 images (we cannot do it for v2 without actually + * writing a zero-filled buffer). We can skip the operation if the + * cluster is already marked as zero, or if it's unallocated and we + * don't have a backing file. + * + * TODO We might want to use bdrv_get_block_status(bs) here, but we're + * holding s->lock, so that doesn't work today. + * + * If full_discard is true, the sector should not read back as zeroes, + * but rather fall through to the backing file. + */ + switch (qcow2_get_cluster_type(old_l2_entry)) { + case QCOW2_CLUSTER_UNALLOCATED: + if (full_discard || !bs->backing) { + continue; + } + break; + + case QCOW2_CLUSTER_ZERO: + if (!full_discard) { + continue; + } + break; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_COMPRESSED: + break; + + default: + abort(); + } + + /* First remove L2 entries */ + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + if (!full_discard && s->qcow_version >= 3) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + } else { + l2_table[l2_index + i] = cpu_to_be64(0); + } + + /* Then decrease the refcount */ + qcow2_free_any_clusters(bs, old_l2_entry, 1, type); + } + + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + return nb_clusters; +} + +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors, enum qcow2_discard_type type, bool full_discard) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t end_offset; + uint64_t nb_clusters; + int ret; + + end_offset = offset + (nb_sectors << BDRV_SECTOR_BITS); + + /* Round start up and end down */ + offset = align_offset(offset, s->cluster_size); + end_offset = start_of_cluster(s, end_offset); + + if (offset > end_offset) { + return 0; + } + + nb_clusters = size_to_clusters(s, end_offset - offset); + + s->cache_discards = true; + + /* Each L2 table is handled by its own loop iteration */ + while (nb_clusters > 0) { + ret = discard_single_l2(bs, offset, nb_clusters, type, full_discard); + if (ret < 0) { + goto fail; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; +} + +/* + * This zeroes as many clusters of nb_clusters as possible at once (i.e. + * all clusters in the same L2 table) and returns the number of zeroed + * clusters. + */ +static int zero_single_l2(BlockDriverState *bs, uint64_t offset, + uint64_t nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l2_table; + int l2_index; + int ret; + int i; + + ret = get_cluster_table(bs, offset, &l2_table, &l2_index); + if (ret < 0) { + return ret; + } + + /* Limit nb_clusters to one L2 table */ + nb_clusters = MIN(nb_clusters, s->l2_size - l2_index); + assert(nb_clusters <= INT_MAX); + + for (i = 0; i < nb_clusters; i++) { + uint64_t old_offset; + + old_offset = be64_to_cpu(l2_table[l2_index + i]); + + /* Update L2 entries */ + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + if (old_offset & QCOW_OFLAG_COMPRESSED) { + l2_table[l2_index + i] = cpu_to_be64(QCOW_OFLAG_ZERO); + qcow2_free_any_clusters(bs, old_offset, 1, QCOW2_DISCARD_REQUEST); + } else { + l2_table[l2_index + i] |= cpu_to_be64(QCOW_OFLAG_ZERO); + } + } + + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + return nb_clusters; +} + +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t nb_clusters; + int ret; + + /* The zero flag is only supported by version 3 and newer */ + if (s->qcow_version < 3) { + return -ENOTSUP; + } + + /* Each L2 table is handled by its own loop iteration */ + nb_clusters = size_to_clusters(s, nb_sectors << BDRV_SECTOR_BITS); + + s->cache_discards = true; + + while (nb_clusters > 0) { + ret = zero_single_l2(bs, offset, nb_clusters); + if (ret < 0) { + goto fail; + } + + nb_clusters -= ret; + offset += (ret * s->cluster_size); + } + + ret = 0; +fail: + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + return ret; +} + +/* + * Expands all zero clusters in a specific L1 table (or deallocates them, for + * non-backed non-pre-allocated zero clusters). + * + * l1_entries and *visited_l1_entries are used to keep track of progress for + * status_cb(). l1_entries contains the total number of L1 entries and + * *visited_l1_entries counts all visited L1 entries. + */ +static int expand_zero_clusters_in_l1(BlockDriverState *bs, uint64_t *l1_table, + int l1_size, int64_t *visited_l1_entries, + int64_t l1_entries, + BlockDriverAmendStatusCB *status_cb) +{ + BDRVQcow2State *s = bs->opaque; + bool is_active_l1 = (l1_table == s->l1_table); + uint64_t *l2_table = NULL; + int ret; + int i, j; + + if (!is_active_l1) { + /* inactive L2 tables require a buffer to be stored in when loading + * them from disk */ + l2_table = qemu_try_blockalign(bs->file->bs, s->cluster_size); + if (l2_table == NULL) { + return -ENOMEM; + } + } + + for (i = 0; i < l1_size; i++) { + uint64_t l2_offset = l1_table[i] & L1E_OFFSET_MASK; + bool l2_dirty = false; + uint64_t l2_refcount; + + if (!l2_offset) { + /* unallocated */ + (*visited_l1_entries)++; + if (status_cb) { + status_cb(bs, *visited_l1_entries, l1_entries); + } + continue; + } + + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" + PRIx64 " unaligned (L1 index: %#x)", + l2_offset, i); + ret = -EIO; + goto fail; + } + + if (is_active_l1) { + /* get active L2 tables from cache */ + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void **)&l2_table); + } else { + /* load inactive L2 tables from disk */ + ret = bdrv_read(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + } + if (ret < 0) { + goto fail; + } + + ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, + &l2_refcount); + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + int64_t offset = l2_entry & L2E_OFFSET_MASK; + int cluster_type = qcow2_get_cluster_type(l2_entry); + bool preallocated = offset != 0; + + if (cluster_type != QCOW2_CLUSTER_ZERO) { + continue; + } + + if (!preallocated) { + if (!bs->backing) { + /* not backed; therefore we can simply deallocate the + * cluster */ + l2_table[j] = 0; + l2_dirty = true; + continue; + } + + offset = qcow2_alloc_clusters(bs, s->cluster_size); + if (offset < 0) { + ret = offset; + goto fail; + } + + if (l2_refcount > 1) { + /* For shared L2 tables, set the refcount accordingly (it is + * already 1 and needs to be l2_refcount) */ + ret = qcow2_update_cluster_refcount(bs, + offset >> s->cluster_bits, + refcount_diff(1, l2_refcount), false, + QCOW2_DISCARD_OTHER); + if (ret < 0) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_OTHER); + goto fail; + } + } + } + + if (offset_into_cluster(s, offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data cluster offset " + "%#" PRIx64 " unaligned (L2 offset: %#" + PRIx64 ", L2 index: %#x)", offset, + l2_offset, j); + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + ret = -EIO; + goto fail; + } + + ret = qcow2_pre_write_overlap_check(bs, 0, offset, s->cluster_size); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + ret = bdrv_write_zeroes(bs->file->bs, offset / BDRV_SECTOR_SIZE, + s->cluster_sectors, 0); + if (ret < 0) { + if (!preallocated) { + qcow2_free_clusters(bs, offset, s->cluster_size, + QCOW2_DISCARD_ALWAYS); + } + goto fail; + } + + if (l2_refcount == 1) { + l2_table[j] = cpu_to_be64(offset | QCOW_OFLAG_COPIED); + } else { + l2_table[j] = cpu_to_be64(offset); + } + l2_dirty = true; + } + + if (is_active_l1) { + if (l2_dirty) { + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, l2_table); + qcow2_cache_depends_on_flush(s->l2_table_cache); + } + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + } else { + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, + QCOW2_OL_INACTIVE_L2 | QCOW2_OL_ACTIVE_L2, l2_offset, + s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = bdrv_write(bs->file->bs, l2_offset / BDRV_SECTOR_SIZE, + (void *)l2_table, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } + } + + (*visited_l1_entries)++; + if (status_cb) { + status_cb(bs, *visited_l1_entries, l1_entries); + } + } + + ret = 0; + +fail: + if (l2_table) { + if (!is_active_l1) { + qemu_vfree(l2_table); + } else { + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + } + } + return ret; +} + +/* + * For backed images, expands all zero clusters on the image. For non-backed + * images, deallocates all non-pre-allocated zero clusters (and claims the + * allocation for pre-allocated ones). This is important for downgrading to a + * qcow2 version which doesn't yet support metadata zero clusters. + */ +int qcow2_expand_zero_clusters(BlockDriverState *bs, + BlockDriverAmendStatusCB *status_cb) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l1_table = NULL; + int64_t l1_entries = 0, visited_l1_entries = 0; + int ret; + int i, j; + + if (status_cb) { + l1_entries = s->l1_size; + for (i = 0; i < s->nb_snapshots; i++) { + l1_entries += s->snapshots[i].l1_size; + } + } + + ret = expand_zero_clusters_in_l1(bs, s->l1_table, s->l1_size, + &visited_l1_entries, l1_entries, + status_cb); + if (ret < 0) { + goto fail; + } + + /* Inactive L1 tables may point to active L2 tables - therefore it is + * necessary to flush the L2 table cache before trying to access the L2 + * tables pointed to by inactive L1 entries (else we might try to expand + * zero clusters that have already been expanded); furthermore, it is also + * necessary to empty the L2 table cache, since it may contain tables which + * are now going to be modified directly on disk, bypassing the cache. + * qcow2_cache_empty() does both for us. */ + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + for (i = 0; i < s->nb_snapshots; i++) { + int l1_sectors = (s->snapshots[i].l1_size * sizeof(uint64_t) + + BDRV_SECTOR_SIZE - 1) / BDRV_SECTOR_SIZE; + + l1_table = g_realloc(l1_table, l1_sectors * BDRV_SECTOR_SIZE); + + ret = bdrv_read(bs->file->bs, + s->snapshots[i].l1_table_offset / BDRV_SECTOR_SIZE, + (void *)l1_table, l1_sectors); + if (ret < 0) { + goto fail; + } + + for (j = 0; j < s->snapshots[i].l1_size; j++) { + be64_to_cpus(&l1_table[j]); + } + + ret = expand_zero_clusters_in_l1(bs, l1_table, s->snapshots[i].l1_size, + &visited_l1_entries, l1_entries, + status_cb); + if (ret < 0) { + goto fail; + } + } + + ret = 0; + +fail: + g_free(l1_table); + return ret; +} diff --git a/src/block/qcow2-refcount.c b/src/block/qcow2-refcount.c new file mode 100644 index 0000000..820f412 --- /dev/null +++ b/src/block/qcow2-refcount.c @@ -0,0 +1,2469 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "qemu/range.h" + +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size); +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, int64_t length, uint64_t addend, + bool decrease, enum qcow2_discard_type type); + +static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index); +static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index); + +static void set_refcount_ro0(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro1(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro2(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro3(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro4(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro5(void *refcount_array, uint64_t index, + uint64_t value); +static void set_refcount_ro6(void *refcount_array, uint64_t index, + uint64_t value); + + +static Qcow2GetRefcountFunc *const get_refcount_funcs[] = { + &get_refcount_ro0, + &get_refcount_ro1, + &get_refcount_ro2, + &get_refcount_ro3, + &get_refcount_ro4, + &get_refcount_ro5, + &get_refcount_ro6 +}; + +static Qcow2SetRefcountFunc *const set_refcount_funcs[] = { + &set_refcount_ro0, + &set_refcount_ro1, + &set_refcount_ro2, + &set_refcount_ro3, + &set_refcount_ro4, + &set_refcount_ro5, + &set_refcount_ro6 +}; + + +/*********************************************************/ +/* refcount handling */ + +int qcow2_refcount_init(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + unsigned int refcount_table_size2, i; + int ret; + + assert(s->refcount_order >= 0 && s->refcount_order <= 6); + + s->get_refcount = get_refcount_funcs[s->refcount_order]; + s->set_refcount = set_refcount_funcs[s->refcount_order]; + + assert(s->refcount_table_size <= INT_MAX / sizeof(uint64_t)); + refcount_table_size2 = s->refcount_table_size * sizeof(uint64_t); + s->refcount_table = g_try_malloc(refcount_table_size2); + + if (s->refcount_table_size > 0) { + if (s->refcount_table == NULL) { + ret = -ENOMEM; + goto fail; + } + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_LOAD); + ret = bdrv_pread(bs->file->bs, s->refcount_table_offset, + s->refcount_table, refcount_table_size2); + if (ret < 0) { + goto fail; + } + for(i = 0; i < s->refcount_table_size; i++) + be64_to_cpus(&s->refcount_table[i]); + } + return 0; + fail: + return ret; +} + +void qcow2_refcount_close(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + g_free(s->refcount_table); +} + + +static uint64_t get_refcount_ro0(const void *refcount_array, uint64_t index) +{ + return (((const uint8_t *)refcount_array)[index / 8] >> (index % 8)) & 0x1; +} + +static void set_refcount_ro0(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 1)); + ((uint8_t *)refcount_array)[index / 8] &= ~(0x1 << (index % 8)); + ((uint8_t *)refcount_array)[index / 8] |= value << (index % 8); +} + +static uint64_t get_refcount_ro1(const void *refcount_array, uint64_t index) +{ + return (((const uint8_t *)refcount_array)[index / 4] >> (2 * (index % 4))) + & 0x3; +} + +static void set_refcount_ro1(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 2)); + ((uint8_t *)refcount_array)[index / 4] &= ~(0x3 << (2 * (index % 4))); + ((uint8_t *)refcount_array)[index / 4] |= value << (2 * (index % 4)); +} + +static uint64_t get_refcount_ro2(const void *refcount_array, uint64_t index) +{ + return (((const uint8_t *)refcount_array)[index / 2] >> (4 * (index % 2))) + & 0xf; +} + +static void set_refcount_ro2(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 4)); + ((uint8_t *)refcount_array)[index / 2] &= ~(0xf << (4 * (index % 2))); + ((uint8_t *)refcount_array)[index / 2] |= value << (4 * (index % 2)); +} + +static uint64_t get_refcount_ro3(const void *refcount_array, uint64_t index) +{ + return ((const uint8_t *)refcount_array)[index]; +} + +static void set_refcount_ro3(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 8)); + ((uint8_t *)refcount_array)[index] = value; +} + +static uint64_t get_refcount_ro4(const void *refcount_array, uint64_t index) +{ + return be16_to_cpu(((const uint16_t *)refcount_array)[index]); +} + +static void set_refcount_ro4(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 16)); + ((uint16_t *)refcount_array)[index] = cpu_to_be16(value); +} + +static uint64_t get_refcount_ro5(const void *refcount_array, uint64_t index) +{ + return be32_to_cpu(((const uint32_t *)refcount_array)[index]); +} + +static void set_refcount_ro5(void *refcount_array, uint64_t index, + uint64_t value) +{ + assert(!(value >> 32)); + ((uint32_t *)refcount_array)[index] = cpu_to_be32(value); +} + +static uint64_t get_refcount_ro6(const void *refcount_array, uint64_t index) +{ + return be64_to_cpu(((const uint64_t *)refcount_array)[index]); +} + +static void set_refcount_ro6(void *refcount_array, uint64_t index, + uint64_t value) +{ + ((uint64_t *)refcount_array)[index] = cpu_to_be64(value); +} + + +static int load_refcount_block(BlockDriverState *bs, + int64_t refcount_block_offset, + void **refcount_block) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_LOAD); + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + refcount_block); + + return ret; +} + +/* + * Retrieves the refcount of the cluster given by its index and stores it in + * *refcount. Returns 0 on success and -errno on failure. + */ +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, + uint64_t *refcount) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t refcount_table_index, block_index; + int64_t refcount_block_offset; + int ret; + void *refcount_block; + + refcount_table_index = cluster_index >> s->refcount_block_bits; + if (refcount_table_index >= s->refcount_table_size) { + *refcount = 0; + return 0; + } + refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + if (!refcount_block_offset) { + *refcount = 0; + return 0; + } + + if (offset_into_cluster(s, refcount_block_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" PRIx64 + " unaligned (reftable index: %#" PRIx64 ")", + refcount_block_offset, refcount_table_index); + return -EIO; + } + + ret = qcow2_cache_get(bs, s->refcount_block_cache, refcount_block_offset, + &refcount_block); + if (ret < 0) { + return ret; + } + + block_index = cluster_index & (s->refcount_block_size - 1); + *refcount = s->get_refcount(refcount_block, block_index); + + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + + return 0; +} + +/* + * Rounds the refcount table size up to avoid growing the table for each single + * refcount block that is allocated. + */ +static unsigned int next_refcount_table_size(BDRVQcow2State *s, + unsigned int min_size) +{ + unsigned int min_clusters = (min_size >> (s->cluster_bits - 3)) + 1; + unsigned int refcount_table_clusters = + MAX(1, s->refcount_table_size >> (s->cluster_bits - 3)); + + while (min_clusters > refcount_table_clusters) { + refcount_table_clusters = (refcount_table_clusters * 3 + 1) / 2; + } + + return refcount_table_clusters << (s->cluster_bits - 3); +} + + +/* Checks if two offsets are described by the same refcount block */ +static int in_same_refcount_block(BDRVQcow2State *s, uint64_t offset_a, + uint64_t offset_b) +{ + uint64_t block_a = offset_a >> (s->cluster_bits + s->refcount_block_bits); + uint64_t block_b = offset_b >> (s->cluster_bits + s->refcount_block_bits); + + return (block_a == block_b); +} + +/* + * Loads a refcount block. If it doesn't exist yet, it is allocated first + * (including growing the refcount table if needed). + * + * Returns 0 on success or -errno in error case + */ +static int alloc_refcount_block(BlockDriverState *bs, + int64_t cluster_index, void **refcount_block) +{ + BDRVQcow2State *s = bs->opaque; + unsigned int refcount_table_index; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Find the refcount block for the given cluster */ + refcount_table_index = cluster_index >> s->refcount_block_bits; + + if (refcount_table_index < s->refcount_table_size) { + + uint64_t refcount_block_offset = + s->refcount_table[refcount_table_index] & REFT_OFFSET_MASK; + + /* If it's already there, we're done */ + if (refcount_block_offset) { + if (offset_into_cluster(s, refcount_block_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "Refblock offset %#" + PRIx64 " unaligned (reftable index: " + "%#x)", refcount_block_offset, + refcount_table_index); + return -EIO; + } + + return load_refcount_block(bs, refcount_block_offset, + refcount_block); + } + } + + /* + * If we came here, we need to allocate something. Something is at least + * a cluster for the new refcount block. It may also include a new refcount + * table if the old refcount table is too small. + * + * Note that allocating clusters here needs some special care: + * + * - We can't use the normal qcow2_alloc_clusters(), it would try to + * increase the refcount and very likely we would end up with an endless + * recursion. Instead we must place the refcount blocks in a way that + * they can describe them themselves. + * + * - We need to consider that at this point we are inside update_refcounts + * and potentially doing an initial refcount increase. This means that + * some clusters have already been allocated by the caller, but their + * refcount isn't accurate yet. If we allocate clusters for metadata, we + * need to return -EAGAIN to signal the caller that it needs to restart + * the search for free clusters. + * + * - alloc_clusters_noref and qcow2_free_clusters may load a different + * refcount block into the cache + */ + + *refcount_block = NULL; + + /* We write to the refcount table, so we might depend on L2 tables */ + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + return ret; + } + + /* Allocate the refcount block itself and mark it as used */ + int64_t new_block = alloc_clusters_noref(bs, s->cluster_size); + if (new_block < 0) { + return new_block; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Allocate refcount block %d for %" PRIx64 + " at %" PRIx64 "\n", + refcount_table_index, cluster_index << s->cluster_bits, new_block); +#endif + + if (in_same_refcount_block(s, new_block, cluster_index << s->cluster_bits)) { + /* Zero the new refcount block before updating it */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + + /* The block describes itself, need to update the cache */ + int block_index = (new_block >> s->cluster_bits) & + (s->refcount_block_size - 1); + s->set_refcount(*refcount_block, block_index, 1); + } else { + /* Described somewhere else. This can recurse at most twice before we + * arrive at a block that describes itself. */ + ret = update_refcount(bs, new_block, s->cluster_size, 1, false, + QCOW2_DISCARD_NEVER); + if (ret < 0) { + goto fail_block; + } + + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* Initialize the new refcount block only after updating its refcount, + * update_refcount uses the refcount cache itself */ + ret = qcow2_cache_get_empty(bs, s->refcount_block_cache, new_block, + refcount_block); + if (ret < 0) { + goto fail_block; + } + + memset(*refcount_block, 0, s->cluster_size); + } + + /* Now the new refcount block needs to be written to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE); + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, *refcount_block); + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail_block; + } + + /* If the refcount table is big enough, just hook the block up there */ + if (refcount_table_index < s->refcount_table_size) { + uint64_t data64 = cpu_to_be64(new_block); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_HOOKUP); + ret = bdrv_pwrite_sync(bs->file->bs, + s->refcount_table_offset + refcount_table_index * sizeof(uint64_t), + &data64, sizeof(data64)); + if (ret < 0) { + goto fail_block; + } + + s->refcount_table[refcount_table_index] = new_block; + + /* The new refcount block may be where the caller intended to put its + * data, so let it restart the search. */ + return -EAGAIN; + } + + qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); + + /* + * If we come here, we need to grow the refcount table. Again, a new + * refcount table needs some space and we can't simply allocate to avoid + * endless recursion. + * + * Therefore let's grab new refcount blocks at the end of the image, which + * will describe themselves and the new refcount table. This way we can + * reference them only in the new table and do the switch to the new + * refcount table at once without producing an inconsistent state in + * between. + */ + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_GROW); + + /* Calculate the number of refcount blocks needed so far; this will be the + * basis for calculating the index of the first cluster used for the + * self-describing refcount structures which we are about to create. + * + * Because we reached this point, there cannot be any refcount entries for + * cluster_index or higher indices yet. However, because new_block has been + * allocated to describe that cluster (and it will assume this role later + * on), we cannot use that index; also, new_block may actually have a higher + * cluster index than cluster_index, so it needs to be taken into account + * here (and 1 needs to be added to its value because that cluster is used). + */ + uint64_t blocks_used = DIV_ROUND_UP(MAX(cluster_index + 1, + (new_block >> s->cluster_bits) + 1), + s->refcount_block_size); + + if (blocks_used > QCOW_MAX_REFTABLE_SIZE / sizeof(uint64_t)) { + return -EFBIG; + } + + /* And now we need at least one block more for the new metadata */ + uint64_t table_size = next_refcount_table_size(s, blocks_used + 1); + uint64_t last_table_size; + uint64_t blocks_clusters; + do { + uint64_t table_clusters = + size_to_clusters(s, table_size * sizeof(uint64_t)); + blocks_clusters = 1 + + ((table_clusters + s->refcount_block_size - 1) + / s->refcount_block_size); + uint64_t meta_clusters = table_clusters + blocks_clusters; + + last_table_size = table_size; + table_size = next_refcount_table_size(s, blocks_used + + ((meta_clusters + s->refcount_block_size - 1) + / s->refcount_block_size)); + + } while (last_table_size != table_size); + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "qcow2: Grow refcount table %" PRId32 " => %" PRId64 "\n", + s->refcount_table_size, table_size); +#endif + + /* Create the new refcount table and blocks */ + uint64_t meta_offset = (blocks_used * s->refcount_block_size) * + s->cluster_size; + uint64_t table_offset = meta_offset + blocks_clusters * s->cluster_size; + uint64_t *new_table = g_try_new0(uint64_t, table_size); + void *new_blocks = g_try_malloc0(blocks_clusters * s->cluster_size); + + assert(table_size > 0 && blocks_clusters > 0); + if (new_table == NULL || new_blocks == NULL) { + ret = -ENOMEM; + goto fail_table; + } + + /* Fill the new refcount table */ + memcpy(new_table, s->refcount_table, + s->refcount_table_size * sizeof(uint64_t)); + new_table[refcount_table_index] = new_block; + + int i; + for (i = 0; i < blocks_clusters; i++) { + new_table[blocks_used + i] = meta_offset + (i * s->cluster_size); + } + + /* Fill the refcount blocks */ + uint64_t table_clusters = size_to_clusters(s, table_size * sizeof(uint64_t)); + int block = 0; + for (i = 0; i < table_clusters + blocks_clusters; i++) { + s->set_refcount(new_blocks, block++, 1); + } + + /* Write refcount blocks to disk */ + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_BLOCKS); + ret = bdrv_pwrite_sync(bs->file->bs, meta_offset, new_blocks, + blocks_clusters * s->cluster_size); + g_free(new_blocks); + new_blocks = NULL; + if (ret < 0) { + goto fail_table; + } + + /* Write refcount table to disk */ + for(i = 0; i < table_size; i++) { + cpu_to_be64s(&new_table[i]); + } + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_WRITE_TABLE); + ret = bdrv_pwrite_sync(bs->file->bs, table_offset, new_table, + table_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail_table; + } + + for(i = 0; i < table_size; i++) { + be64_to_cpus(&new_table[i]); + } + + /* Hook up the new refcount table in the qcow2 header */ + struct QEMU_PACKED { + uint64_t d64; + uint32_t d32; + } data; + cpu_to_be64w(&data.d64, table_offset); + cpu_to_be32w(&data.d32, table_clusters); + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC_SWITCH_TABLE); + ret = bdrv_pwrite_sync(bs->file->bs, + offsetof(QCowHeader, refcount_table_offset), + &data, sizeof(data)); + if (ret < 0) { + goto fail_table; + } + + /* And switch it in memory */ + uint64_t old_table_offset = s->refcount_table_offset; + uint64_t old_table_size = s->refcount_table_size; + + g_free(s->refcount_table); + s->refcount_table = new_table; + s->refcount_table_size = table_size; + s->refcount_table_offset = table_offset; + + /* Free old table. */ + qcow2_free_clusters(bs, old_table_offset, old_table_size * sizeof(uint64_t), + QCOW2_DISCARD_OTHER); + + ret = load_refcount_block(bs, new_block, refcount_block); + if (ret < 0) { + return ret; + } + + /* If we were trying to do the initial refcount update for some cluster + * allocation, we might have used the same clusters to store newly + * allocated metadata. Make the caller search some new space. */ + return -EAGAIN; + +fail_table: + g_free(new_blocks); + g_free(new_table); +fail_block: + if (*refcount_block != NULL) { + qcow2_cache_put(bs, s->refcount_block_cache, refcount_block); + } + return ret; +} + +void qcow2_process_discards(BlockDriverState *bs, int ret) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2DiscardRegion *d, *next; + + QTAILQ_FOREACH_SAFE(d, &s->discards, next, next) { + QTAILQ_REMOVE(&s->discards, d, next); + + /* Discard is optional, ignore the return value */ + if (ret >= 0) { + bdrv_discard(bs->file->bs, + d->offset >> BDRV_SECTOR_BITS, + d->bytes >> BDRV_SECTOR_BITS); + } + + g_free(d); + } +} + +static void update_refcount_discard(BlockDriverState *bs, + uint64_t offset, uint64_t length) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2DiscardRegion *d, *p, *next; + + QTAILQ_FOREACH(d, &s->discards, next) { + uint64_t new_start = MIN(offset, d->offset); + uint64_t new_end = MAX(offset + length, d->offset + d->bytes); + + if (new_end - new_start <= length + d->bytes) { + /* There can't be any overlap, areas ending up here have no + * references any more and therefore shouldn't get freed another + * time. */ + assert(d->bytes + length == new_end - new_start); + d->offset = new_start; + d->bytes = new_end - new_start; + goto found; + } + } + + d = g_malloc(sizeof(*d)); + *d = (Qcow2DiscardRegion) { + .bs = bs, + .offset = offset, + .bytes = length, + }; + QTAILQ_INSERT_TAIL(&s->discards, d, next); + +found: + /* Merge discard requests if they are adjacent now */ + QTAILQ_FOREACH_SAFE(p, &s->discards, next, next) { + if (p == d + || p->offset > d->offset + d->bytes + || d->offset > p->offset + p->bytes) + { + continue; + } + + /* Still no overlap possible */ + assert(p->offset == d->offset + d->bytes + || d->offset == p->offset + p->bytes); + + QTAILQ_REMOVE(&s->discards, p, next); + d->offset = MIN(d->offset, p->offset); + d->bytes += p->bytes; + g_free(p); + } +} + +/* XXX: cache several refcount block clusters ? */ +/* @addend is the absolute value of the addend; if @decrease is set, @addend + * will be subtracted from the current refcount, otherwise it will be added */ +static int QEMU_WARN_UNUSED_RESULT update_refcount(BlockDriverState *bs, + int64_t offset, + int64_t length, + uint64_t addend, + bool decrease, + enum qcow2_discard_type type) +{ + BDRVQcow2State *s = bs->opaque; + int64_t start, last, cluster_offset; + void *refcount_block = NULL; + int64_t old_table_index = -1; + int ret; + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "update_refcount: offset=%" PRId64 " size=%" PRId64 + " addend=%s%" PRIu64 "\n", offset, length, decrease ? "-" : "", + addend); +#endif + if (length < 0) { + return -EINVAL; + } else if (length == 0) { + return 0; + } + + if (decrease) { + qcow2_cache_set_dependency(bs, s->refcount_block_cache, + s->l2_table_cache); + } + + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + length - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) + { + int block_index; + uint64_t refcount; + int64_t cluster_index = cluster_offset >> s->cluster_bits; + int64_t table_index = cluster_index >> s->refcount_block_bits; + + /* Load the refcount block and allocate it if needed */ + if (table_index != old_table_index) { + if (refcount_block) { + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + } + ret = alloc_refcount_block(bs, cluster_index, &refcount_block); + if (ret < 0) { + goto fail; + } + } + old_table_index = table_index; + + qcow2_cache_entry_mark_dirty(bs, s->refcount_block_cache, + refcount_block); + + /* we can update the count and save it */ + block_index = cluster_index & (s->refcount_block_size - 1); + + refcount = s->get_refcount(refcount_block, block_index); + if (decrease ? (refcount - addend > refcount) + : (refcount + addend < refcount || + refcount + addend > s->refcount_max)) + { + ret = -EINVAL; + goto fail; + } + if (decrease) { + refcount -= addend; + } else { + refcount += addend; + } + if (refcount == 0 && cluster_index < s->free_cluster_index) { + s->free_cluster_index = cluster_index; + } + s->set_refcount(refcount_block, block_index, refcount); + + if (refcount == 0 && s->discard_passthrough[type]) { + update_refcount_discard(bs, cluster_offset, s->cluster_size); + } + } + + ret = 0; +fail: + if (!s->cache_discards) { + qcow2_process_discards(bs, ret); + } + + /* Write last changed block to disk */ + if (refcount_block) { + qcow2_cache_put(bs, s->refcount_block_cache, &refcount_block); + } + + /* + * Try do undo any updates if an error is returned (This may succeed in + * some cases like ENOSPC for allocating a new refcount block) + */ + if (ret < 0) { + int dummy; + dummy = update_refcount(bs, offset, cluster_offset - offset, addend, + !decrease, QCOW2_DISCARD_NEVER); + (void)dummy; + } + + return ret; +} + +/* + * Increases or decreases the refcount of a given cluster. + * + * @addend is the absolute value of the addend; if @decrease is set, @addend + * will be subtracted from the current refcount, otherwise it will be added. + * + * On success 0 is returned; on failure -errno is returned. + */ +int qcow2_update_cluster_refcount(BlockDriverState *bs, + int64_t cluster_index, + uint64_t addend, bool decrease, + enum qcow2_discard_type type) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + + ret = update_refcount(bs, cluster_index << s->cluster_bits, 1, addend, + decrease, type); + if (ret < 0) { + return ret; + } + + return 0; +} + + + +/*********************************************************/ +/* cluster allocation functions */ + + + +/* return < 0 if error */ +static int64_t alloc_clusters_noref(BlockDriverState *bs, uint64_t size) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t i, nb_clusters, refcount; + int ret; + + /* We can't allocate clusters if they may still be queued for discard. */ + if (s->cache_discards) { + qcow2_process_discards(bs, 0); + } + + nb_clusters = size_to_clusters(s, size); +retry: + for(i = 0; i < nb_clusters; i++) { + uint64_t next_cluster_index = s->free_cluster_index++; + ret = qcow2_get_refcount(bs, next_cluster_index, &refcount); + + if (ret < 0) { + return ret; + } else if (refcount != 0) { + goto retry; + } + } + + /* Make sure that all offsets in the "allocated" range are representable + * in an int64_t */ + if (s->free_cluster_index > 0 && + s->free_cluster_index - 1 > (INT64_MAX >> s->cluster_bits)) + { + return -EFBIG; + } + +#ifdef DEBUG_ALLOC2 + fprintf(stderr, "alloc_clusters: size=%" PRId64 " -> %" PRId64 "\n", + size, + (s->free_cluster_index - nb_clusters) << s->cluster_bits); +#endif + return (s->free_cluster_index - nb_clusters) << s->cluster_bits; +} + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size) +{ + int64_t offset; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC); + do { + offset = alloc_clusters_noref(bs, size); + if (offset < 0) { + return offset; + } + + ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); + + if (ret < 0) { + return ret; + } + + return offset; +} + +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t cluster_index, refcount; + uint64_t i; + int ret; + + assert(nb_clusters >= 0); + if (nb_clusters == 0) { + return 0; + } + + do { + /* Check how many clusters there are free */ + cluster_index = offset >> s->cluster_bits; + for(i = 0; i < nb_clusters; i++) { + ret = qcow2_get_refcount(bs, cluster_index++, &refcount); + if (ret < 0) { + return ret; + } else if (refcount != 0) { + break; + } + } + + /* And then allocate them */ + ret = update_refcount(bs, offset, i << s->cluster_bits, 1, false, + QCOW2_DISCARD_NEVER); + } while (ret == -EAGAIN); + + if (ret < 0) { + return ret; + } + + return i; +} + +/* only used to allocate compressed sectors. We try to allocate + contiguous sectors. size must be <= cluster_size */ +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size) +{ + BDRVQcow2State *s = bs->opaque; + int64_t offset; + size_t free_in_cluster; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_ALLOC_BYTES); + assert(size > 0 && size <= s->cluster_size); + assert(!s->free_byte_offset || offset_into_cluster(s, s->free_byte_offset)); + + offset = s->free_byte_offset; + + if (offset) { + uint64_t refcount; + ret = qcow2_get_refcount(bs, offset >> s->cluster_bits, &refcount); + if (ret < 0) { + return ret; + } + + if (refcount == s->refcount_max) { + offset = 0; + } + } + + free_in_cluster = s->cluster_size - offset_into_cluster(s, offset); + do { + if (!offset || free_in_cluster < size) { + int64_t new_cluster = alloc_clusters_noref(bs, s->cluster_size); + if (new_cluster < 0) { + return new_cluster; + } + + if (!offset || ROUND_UP(offset, s->cluster_size) != new_cluster) { + offset = new_cluster; + free_in_cluster = s->cluster_size; + } else { + free_in_cluster += s->cluster_size; + } + } + + assert(offset); + ret = update_refcount(bs, offset, size, 1, false, QCOW2_DISCARD_NEVER); + if (ret < 0) { + offset = 0; + } + } while (ret == -EAGAIN); + if (ret < 0) { + return ret; + } + + /* The cluster refcount was incremented; refcount blocks must be flushed + * before the caller's L2 table updates. */ + qcow2_cache_set_dependency(bs, s->l2_table_cache, s->refcount_block_cache); + + s->free_byte_offset = offset + size; + if (!offset_into_cluster(s, s->free_byte_offset)) { + s->free_byte_offset = 0; + } + + return offset; +} + +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size, + enum qcow2_discard_type type) +{ + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_CLUSTER_FREE); + ret = update_refcount(bs, offset, size, 1, true, type); + if (ret < 0) { + fprintf(stderr, "qcow2_free_clusters failed: %s\n", strerror(-ret)); + /* TODO Remember the clusters to free them later and avoid leaking */ + } +} + +/* + * Free a cluster using its L2 entry (handles clusters of all types, e.g. + * normal cluster, compressed cluster, etc.) + */ +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type) +{ + BDRVQcow2State *s = bs->opaque; + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + { + int nb_csectors; + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + qcow2_free_clusters(bs, + (l2_entry & s->cluster_offset_mask) & ~511, + nb_csectors * 512, type); + } + break; + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_ZERO: + if (l2_entry & L2E_OFFSET_MASK) { + if (offset_into_cluster(s, l2_entry & L2E_OFFSET_MASK)) { + qcow2_signal_corruption(bs, false, -1, -1, + "Cannot free unaligned cluster %#llx", + l2_entry & L2E_OFFSET_MASK); + } else { + qcow2_free_clusters(bs, l2_entry & L2E_OFFSET_MASK, + nb_clusters << s->cluster_bits, type); + } + } + break; + case QCOW2_CLUSTER_UNALLOCATED: + break; + default: + abort(); + } +} + + + +/*********************************************************/ +/* snapshots and image creation */ + + + +/* update the refcounts of snapshots and the copied flag */ +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l1_table, *l2_table, l2_offset, offset, l1_size2, refcount; + bool l1_allocated = false; + int64_t old_offset, old_l2_offset; + int i, j, l1_modified = 0, nb_csectors; + int ret; + + assert(addend >= -1 && addend <= 1); + + l2_table = NULL; + l1_table = NULL; + l1_size2 = l1_size * sizeof(uint64_t); + + s->cache_discards = true; + + /* WARNING: qcow2_snapshot_goto relies on this function not using the + * l1_table_offset when it is the current s->l1_table_offset! Be careful + * when changing this! */ + if (l1_table_offset != s->l1_table_offset) { + l1_table = g_try_malloc0(align_offset(l1_size2, 512)); + if (l1_size2 && l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + l1_allocated = true; + + ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); + if (ret < 0) { + goto fail; + } + + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } else { + assert(l1_size == s->l1_size); + l1_table = s->l1_table; + l1_allocated = false; + } + + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + old_l2_offset = l2_offset; + l2_offset &= L1E_OFFSET_MASK; + + if (offset_into_cluster(s, l2_offset)) { + qcow2_signal_corruption(bs, true, -1, -1, "L2 table offset %#" + PRIx64 " unaligned (L1 index: %#x)", + l2_offset, i); + ret = -EIO; + goto fail; + } + + ret = qcow2_cache_get(bs, s->l2_table_cache, l2_offset, + (void**) &l2_table); + if (ret < 0) { + goto fail; + } + + for(j = 0; j < s->l2_size; j++) { + uint64_t cluster_index; + + offset = be64_to_cpu(l2_table[j]); + old_offset = offset; + offset &= ~QCOW_OFLAG_COPIED; + + switch (qcow2_get_cluster_type(offset)) { + case QCOW2_CLUSTER_COMPRESSED: + nb_csectors = ((offset >> s->csize_shift) & + s->csize_mask) + 1; + if (addend != 0) { + ret = update_refcount(bs, + (offset & s->cluster_offset_mask) & ~511, + nb_csectors * 512, abs(addend), addend < 0, + QCOW2_DISCARD_SNAPSHOT); + if (ret < 0) { + goto fail; + } + } + /* compressed clusters are never modified */ + refcount = 2; + break; + + case QCOW2_CLUSTER_NORMAL: + case QCOW2_CLUSTER_ZERO: + if (offset_into_cluster(s, offset & L2E_OFFSET_MASK)) { + qcow2_signal_corruption(bs, true, -1, -1, "Data " + "cluster offset %#llx " + "unaligned (L2 offset: %#" + PRIx64 ", L2 index: %#x)", + offset & L2E_OFFSET_MASK, + l2_offset, j); + ret = -EIO; + goto fail; + } + + cluster_index = (offset & L2E_OFFSET_MASK) >> s->cluster_bits; + if (!cluster_index) { + /* unallocated */ + refcount = 0; + break; + } + if (addend != 0) { + ret = qcow2_update_cluster_refcount(bs, + cluster_index, abs(addend), addend < 0, + QCOW2_DISCARD_SNAPSHOT); + if (ret < 0) { + goto fail; + } + } + + ret = qcow2_get_refcount(bs, cluster_index, &refcount); + if (ret < 0) { + goto fail; + } + break; + + case QCOW2_CLUSTER_UNALLOCATED: + refcount = 0; + break; + + default: + abort(); + } + + if (refcount == 1) { + offset |= QCOW_OFLAG_COPIED; + } + if (offset != old_offset) { + if (addend > 0) { + qcow2_cache_set_dependency(bs, s->l2_table_cache, + s->refcount_block_cache); + } + l2_table[j] = cpu_to_be64(offset); + qcow2_cache_entry_mark_dirty(bs, s->l2_table_cache, + l2_table); + } + } + + qcow2_cache_put(bs, s->l2_table_cache, (void **) &l2_table); + + if (addend != 0) { + ret = qcow2_update_cluster_refcount(bs, l2_offset >> + s->cluster_bits, + abs(addend), addend < 0, + QCOW2_DISCARD_SNAPSHOT); + if (ret < 0) { + goto fail; + } + } + ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, + &refcount); + if (ret < 0) { + goto fail; + } else if (refcount == 1) { + l2_offset |= QCOW_OFLAG_COPIED; + } + if (l2_offset != old_l2_offset) { + l1_table[i] = l2_offset; + l1_modified = 1; + } + } + } + + ret = bdrv_flush(bs); +fail: + if (l2_table) { + qcow2_cache_put(bs, s->l2_table_cache, (void**) &l2_table); + } + + s->cache_discards = false; + qcow2_process_discards(bs, ret); + + /* Update L1 only if it isn't deleted anyway (addend = -1) */ + if (ret == 0 && addend >= 0 && l1_modified) { + for (i = 0; i < l1_size; i++) { + cpu_to_be64s(&l1_table[i]); + } + + ret = bdrv_pwrite_sync(bs->file->bs, l1_table_offset, + l1_table, l1_size2); + + for (i = 0; i < l1_size; i++) { + be64_to_cpus(&l1_table[i]); + } + } + if (l1_allocated) + g_free(l1_table); + return ret; +} + + + + +/*********************************************************/ +/* refcount checking functions */ + + +static uint64_t refcount_array_byte_size(BDRVQcow2State *s, uint64_t entries) +{ + /* This assertion holds because there is no way we can address more than + * 2^(64 - 9) clusters at once (with cluster size 512 = 2^9, and because + * offsets have to be representable in bytes); due to every cluster + * corresponding to one refcount entry, we are well below that limit */ + assert(entries < (UINT64_C(1) << (64 - 9))); + + /* Thanks to the assertion this will not overflow, because + * s->refcount_order < 7. + * (note: x << s->refcount_order == x * s->refcount_bits) */ + return DIV_ROUND_UP(entries << s->refcount_order, 8); +} + +/** + * Reallocates *array so that it can hold new_size entries. *size must contain + * the current number of entries in *array. If the reallocation fails, *array + * and *size will not be modified and -errno will be returned. If the + * reallocation is successful, *array will be set to the new buffer, *size + * will be set to new_size and 0 will be returned. The size of the reallocated + * refcount array buffer will be aligned to a cluster boundary, and the newly + * allocated area will be zeroed. + */ +static int realloc_refcount_array(BDRVQcow2State *s, void **array, + int64_t *size, int64_t new_size) +{ + int64_t old_byte_size, new_byte_size; + void *new_ptr; + + /* Round to clusters so the array can be directly written to disk */ + old_byte_size = size_to_clusters(s, refcount_array_byte_size(s, *size)) + * s->cluster_size; + new_byte_size = size_to_clusters(s, refcount_array_byte_size(s, new_size)) + * s->cluster_size; + + if (new_byte_size == old_byte_size) { + *size = new_size; + return 0; + } + + assert(new_byte_size > 0); + + if (new_byte_size > SIZE_MAX) { + return -ENOMEM; + } + + new_ptr = g_try_realloc(*array, new_byte_size); + if (!new_ptr) { + return -ENOMEM; + } + + if (new_byte_size > old_byte_size) { + memset((char *)new_ptr + old_byte_size, 0, + new_byte_size - old_byte_size); + } + + *array = new_ptr; + *size = new_size; + + return 0; +} + +/* + * Increases the refcount for a range of clusters in a given refcount table. + * This is used to construct a temporary refcount table out of L1 and L2 tables + * which can be compared to the refcount table saved in the image. + * + * Modifies the number of errors in res. + */ +static int inc_refcounts(BlockDriverState *bs, + BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size, + int64_t offset, int64_t size) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t start, last, cluster_offset, k, refcount; + int ret; + + if (size <= 0) { + return 0; + } + + start = start_of_cluster(s, offset); + last = start_of_cluster(s, offset + size - 1); + for(cluster_offset = start; cluster_offset <= last; + cluster_offset += s->cluster_size) { + k = cluster_offset >> s->cluster_bits; + if (k >= *refcount_table_size) { + ret = realloc_refcount_array(s, refcount_table, + refcount_table_size, k + 1); + if (ret < 0) { + res->check_errors++; + return ret; + } + } + + refcount = s->get_refcount(*refcount_table, k); + if (refcount == s->refcount_max) { + fprintf(stderr, "ERROR: overflow cluster offset=0x%" PRIx64 + "\n", cluster_offset); + res->corruptions++; + continue; + } + s->set_refcount(*refcount_table, k, refcount + 1); + } + + return 0; +} + +/* Flags for check_refcounts_l1() and check_refcounts_l2() */ +enum { + CHECK_FRAG_INFO = 0x2, /* update BlockFragInfo counters */ +}; + +/* + * Increases the refcount in the given refcount table for the all clusters + * referenced in the L2 table. While doing so, performs some checks on L2 + * entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l2(BlockDriverState *bs, BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size, int64_t l2_offset, + int flags) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l2_table, l2_entry; + uint64_t next_contiguous_offset = 0; + int i, l2_size, nb_csectors, ret; + + /* Read L2 table from disk */ + l2_size = s->l2_size * sizeof(uint64_t); + l2_table = g_malloc(l2_size); + + ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, l2_size); + if (ret < 0) { + fprintf(stderr, "ERROR: I/O error in check_refcounts_l2\n"); + res->check_errors++; + goto fail; + } + + /* Do the actual checks */ + for(i = 0; i < s->l2_size; i++) { + l2_entry = be64_to_cpu(l2_table[i]); + + switch (qcow2_get_cluster_type(l2_entry)) { + case QCOW2_CLUSTER_COMPRESSED: + /* Compressed clusters don't have QCOW_OFLAG_COPIED */ + if (l2_entry & QCOW_OFLAG_COPIED) { + fprintf(stderr, "ERROR: cluster %" PRId64 ": " + "copied flag must never be set for compressed " + "clusters\n", l2_entry >> s->cluster_bits); + l2_entry &= ~QCOW_OFLAG_COPIED; + res->corruptions++; + } + + /* Mark cluster as used */ + nb_csectors = ((l2_entry >> s->csize_shift) & + s->csize_mask) + 1; + l2_entry &= s->cluster_offset_mask; + ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_entry & ~511, nb_csectors * 512); + if (ret < 0) { + goto fail; + } + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + res->bfi.compressed_clusters++; + + /* Compressed clusters are fragmented by nature. Since they + * take up sub-sector space but we only have sector granularity + * I/O we need to re-read the same sectors even for adjacent + * compressed clusters. + */ + res->bfi.fragmented_clusters++; + } + break; + + case QCOW2_CLUSTER_ZERO: + if ((l2_entry & L2E_OFFSET_MASK) == 0) { + break; + } + /* fall through */ + + case QCOW2_CLUSTER_NORMAL: + { + uint64_t offset = l2_entry & L2E_OFFSET_MASK; + + if (flags & CHECK_FRAG_INFO) { + res->bfi.allocated_clusters++; + if (next_contiguous_offset && + offset != next_contiguous_offset) { + res->bfi.fragmented_clusters++; + } + next_contiguous_offset = offset + s->cluster_size; + } + + /* Mark cluster as used */ + ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, + offset, s->cluster_size); + if (ret < 0) { + goto fail; + } + + /* Correct offsets are cluster aligned */ + if (offset_into_cluster(s, offset)) { + fprintf(stderr, "ERROR offset=%" PRIx64 ": Cluster is not " + "properly aligned; L2 entry corrupted.\n", offset); + res->corruptions++; + } + break; + } + + case QCOW2_CLUSTER_UNALLOCATED: + break; + + default: + abort(); + } + } + + g_free(l2_table); + return 0; + +fail: + g_free(l2_table); + return ret; +} + +/* + * Increases the refcount for the L1 table, its L2 tables and all referenced + * clusters in the given refcount table. While doing so, performs some checks + * on L1 and L2 entries. + * + * Returns the number of errors found by the checks or -errno if an internal + * error occurred. + */ +static int check_refcounts_l1(BlockDriverState *bs, + BdrvCheckResult *res, + void **refcount_table, + int64_t *refcount_table_size, + int64_t l1_table_offset, int l1_size, + int flags) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l1_table = NULL, l2_offset, l1_size2; + int i, ret; + + l1_size2 = l1_size * sizeof(uint64_t); + + /* Mark L1 table as used */ + ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, + l1_table_offset, l1_size2); + if (ret < 0) { + goto fail; + } + + /* Read L1 table entries from disk */ + if (l1_size2 > 0) { + l1_table = g_try_malloc(l1_size2); + if (l1_table == NULL) { + ret = -ENOMEM; + res->check_errors++; + goto fail; + } + ret = bdrv_pread(bs->file->bs, l1_table_offset, l1_table, l1_size2); + if (ret < 0) { + fprintf(stderr, "ERROR: I/O error in check_refcounts_l1\n"); + res->check_errors++; + goto fail; + } + for(i = 0;i < l1_size; i++) + be64_to_cpus(&l1_table[i]); + } + + /* Do the actual checks */ + for(i = 0; i < l1_size; i++) { + l2_offset = l1_table[i]; + if (l2_offset) { + /* Mark L2 table as used */ + l2_offset &= L1E_OFFSET_MASK; + ret = inc_refcounts(bs, res, refcount_table, refcount_table_size, + l2_offset, s->cluster_size); + if (ret < 0) { + goto fail; + } + + /* L2 tables are cluster aligned */ + if (offset_into_cluster(s, l2_offset)) { + fprintf(stderr, "ERROR l2_offset=%" PRIx64 ": Table is not " + "cluster aligned; L1 entry corrupted\n", l2_offset); + res->corruptions++; + } + + /* Process and check L2 entries */ + ret = check_refcounts_l2(bs, res, refcount_table, + refcount_table_size, l2_offset, flags); + if (ret < 0) { + goto fail; + } + } + } + g_free(l1_table); + return 0; + +fail: + g_free(l1_table); + return ret; +} + +/* + * Checks the OFLAG_COPIED flag for all L1 and L2 entries. + * + * This function does not print an error message nor does it increment + * check_errors if qcow2_get_refcount fails (this is because such an error will + * have been already detected and sufficiently signaled by the calling function + * (qcow2_check_refcounts) by the time this function is called). + */ +static int check_oflag_copied(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t *l2_table = qemu_blockalign(bs, s->cluster_size); + int ret; + uint64_t refcount; + int i, j; + + for (i = 0; i < s->l1_size; i++) { + uint64_t l1_entry = s->l1_table[i]; + uint64_t l2_offset = l1_entry & L1E_OFFSET_MASK; + bool l2_dirty = false; + + if (!l2_offset) { + continue; + } + + ret = qcow2_get_refcount(bs, l2_offset >> s->cluster_bits, + &refcount); + if (ret < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l1_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED L2 cluster: l1_index=%d " + "l1_entry=%" PRIx64 " refcount=%" PRIu64 "\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + i, l1_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + s->l1_table[i] = refcount == 1 + ? l1_entry | QCOW_OFLAG_COPIED + : l1_entry & ~QCOW_OFLAG_COPIED; + ret = qcow2_write_l1_entry(bs, i); + if (ret < 0) { + res->check_errors++; + goto fail; + } + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + + ret = bdrv_pread(bs->file->bs, l2_offset, l2_table, + s->l2_size * sizeof(uint64_t)); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not read L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + + for (j = 0; j < s->l2_size; j++) { + uint64_t l2_entry = be64_to_cpu(l2_table[j]); + uint64_t data_offset = l2_entry & L2E_OFFSET_MASK; + int cluster_type = qcow2_get_cluster_type(l2_entry); + + if ((cluster_type == QCOW2_CLUSTER_NORMAL) || + ((cluster_type == QCOW2_CLUSTER_ZERO) && (data_offset != 0))) { + ret = qcow2_get_refcount(bs, + data_offset >> s->cluster_bits, + &refcount); + if (ret < 0) { + /* don't print message nor increment check_errors */ + continue; + } + if ((refcount == 1) != ((l2_entry & QCOW_OFLAG_COPIED) != 0)) { + fprintf(stderr, "%s OFLAG_COPIED data cluster: " + "l2_entry=%" PRIx64 " refcount=%" PRIu64 "\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : + "ERROR", + l2_entry, refcount); + if (fix & BDRV_FIX_ERRORS) { + l2_table[j] = cpu_to_be64(refcount == 1 + ? l2_entry | QCOW_OFLAG_COPIED + : l2_entry & ~QCOW_OFLAG_COPIED); + l2_dirty = true; + res->corruptions_fixed++; + } else { + res->corruptions++; + } + } + } + } + + if (l2_dirty) { + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L2, + l2_offset, s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table; metadata " + "overlap check failed: %s\n", strerror(-ret)); + res->check_errors++; + goto fail; + } + + ret = bdrv_pwrite(bs->file->bs, l2_offset, l2_table, + s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR: Could not write L2 table: %s\n", + strerror(-ret)); + res->check_errors++; + goto fail; + } + } + } + + ret = 0; + +fail: + qemu_vfree(l2_table); + return ret; +} + +/* + * Checks consistency of refblocks and accounts for each refblock in + * *refcount_table. + */ +static int check_refblocks(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix, bool *rebuild, + void **refcount_table, int64_t *nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + int64_t i, size; + int ret; + + for(i = 0; i < s->refcount_table_size; i++) { + uint64_t offset, cluster; + offset = s->refcount_table[i]; + cluster = offset >> s->cluster_bits; + + /* Refcount blocks are cluster aligned */ + if (offset_into_cluster(s, offset)) { + fprintf(stderr, "ERROR refcount block %" PRId64 " is not " + "cluster aligned; refcount table entry corrupted\n", i); + res->corruptions++; + *rebuild = true; + continue; + } + + if (cluster >= *nb_clusters) { + fprintf(stderr, "%s refcount block %" PRId64 " is outside image\n", + fix & BDRV_FIX_ERRORS ? "Repairing" : "ERROR", i); + + if (fix & BDRV_FIX_ERRORS) { + int64_t new_nb_clusters; + + if (offset > INT64_MAX - s->cluster_size) { + ret = -EINVAL; + goto resize_fail; + } + + ret = bdrv_truncate(bs->file->bs, offset + s->cluster_size); + if (ret < 0) { + goto resize_fail; + } + size = bdrv_getlength(bs->file->bs); + if (size < 0) { + ret = size; + goto resize_fail; + } + + new_nb_clusters = size_to_clusters(s, size); + assert(new_nb_clusters >= *nb_clusters); + + ret = realloc_refcount_array(s, refcount_table, + nb_clusters, new_nb_clusters); + if (ret < 0) { + res->check_errors++; + return ret; + } + + if (cluster >= *nb_clusters) { + ret = -EINVAL; + goto resize_fail; + } + + res->corruptions_fixed++; + ret = inc_refcounts(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); + if (ret < 0) { + return ret; + } + /* No need to check whether the refcount is now greater than 1: + * This area was just allocated and zeroed, so it can only be + * exactly 1 after inc_refcounts() */ + continue; + +resize_fail: + res->corruptions++; + *rebuild = true; + fprintf(stderr, "ERROR could not resize image: %s\n", + strerror(-ret)); + } else { + res->corruptions++; + } + continue; + } + + if (offset != 0) { + ret = inc_refcounts(bs, res, refcount_table, nb_clusters, + offset, s->cluster_size); + if (ret < 0) { + return ret; + } + if (s->get_refcount(*refcount_table, cluster) != 1) { + fprintf(stderr, "ERROR refcount block %" PRId64 + " refcount=%" PRIu64 "\n", i, + s->get_refcount(*refcount_table, cluster)); + res->corruptions++; + *rebuild = true; + } + } + } + + return 0; +} + +/* + * Calculates an in-memory refcount table. + */ +static int calculate_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix, bool *rebuild, + void **refcount_table, int64_t *nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + int64_t i; + QCowSnapshot *sn; + int ret; + + if (!*refcount_table) { + int64_t old_size = 0; + ret = realloc_refcount_array(s, refcount_table, + &old_size, *nb_clusters); + if (ret < 0) { + res->check_errors++; + return ret; + } + } + + /* header */ + ret = inc_refcounts(bs, res, refcount_table, nb_clusters, + 0, s->cluster_size); + if (ret < 0) { + return ret; + } + + /* current L1 table */ + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + s->l1_table_offset, s->l1_size, CHECK_FRAG_INFO); + if (ret < 0) { + return ret; + } + + /* snapshots */ + for (i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + ret = check_refcounts_l1(bs, res, refcount_table, nb_clusters, + sn->l1_table_offset, sn->l1_size, 0); + if (ret < 0) { + return ret; + } + } + ret = inc_refcounts(bs, res, refcount_table, nb_clusters, + s->snapshots_offset, s->snapshots_size); + if (ret < 0) { + return ret; + } + + /* refcount data */ + ret = inc_refcounts(bs, res, refcount_table, nb_clusters, + s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + return check_refblocks(bs, res, fix, rebuild, refcount_table, nb_clusters); +} + +/* + * Compares the actual reference count for each cluster in the image against the + * refcount as reported by the refcount structures on-disk. + */ +static void compare_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix, bool *rebuild, + int64_t *highest_cluster, + void *refcount_table, int64_t nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + int64_t i; + uint64_t refcount1, refcount2; + int ret; + + for (i = 0, *highest_cluster = 0; i < nb_clusters; i++) { + ret = qcow2_get_refcount(bs, i, &refcount1); + if (ret < 0) { + fprintf(stderr, "Can't get refcount for cluster %" PRId64 ": %s\n", + i, strerror(-ret)); + res->check_errors++; + continue; + } + + refcount2 = s->get_refcount(refcount_table, i); + + if (refcount1 > 0 || refcount2 > 0) { + *highest_cluster = i; + } + + if (refcount1 != refcount2) { + /* Check if we're allowed to fix the mismatch */ + int *num_fixed = NULL; + if (refcount1 == 0) { + *rebuild = true; + } else if (refcount1 > refcount2 && (fix & BDRV_FIX_LEAKS)) { + num_fixed = &res->leaks_fixed; + } else if (refcount1 < refcount2 && (fix & BDRV_FIX_ERRORS)) { + num_fixed = &res->corruptions_fixed; + } + + fprintf(stderr, "%s cluster %" PRId64 " refcount=%" PRIu64 + " reference=%" PRIu64 "\n", + num_fixed != NULL ? "Repairing" : + refcount1 < refcount2 ? "ERROR" : + "Leaked", + i, refcount1, refcount2); + + if (num_fixed) { + ret = update_refcount(bs, i << s->cluster_bits, 1, + refcount_diff(refcount1, refcount2), + refcount1 > refcount2, + QCOW2_DISCARD_ALWAYS); + if (ret >= 0) { + (*num_fixed)++; + continue; + } + } + + /* And if we couldn't, print an error */ + if (refcount1 < refcount2) { + res->corruptions++; + } else { + res->leaks++; + } + } + } +} + +/* + * Allocates clusters using an in-memory refcount table (IMRT) in contrast to + * the on-disk refcount structures. + * + * On input, *first_free_cluster tells where to start looking, and need not + * actually be a free cluster; the returned offset will not be before that + * cluster. On output, *first_free_cluster points to the first gap found, even + * if that gap was too small to be used as the returned offset. + * + * Note that *first_free_cluster is a cluster index whereas the return value is + * an offset. + */ +static int64_t alloc_clusters_imrt(BlockDriverState *bs, + int cluster_count, + void **refcount_table, + int64_t *imrt_nb_clusters, + int64_t *first_free_cluster) +{ + BDRVQcow2State *s = bs->opaque; + int64_t cluster = *first_free_cluster, i; + bool first_gap = true; + int contiguous_free_clusters; + int ret; + + /* Starting at *first_free_cluster, find a range of at least cluster_count + * continuously free clusters */ + for (contiguous_free_clusters = 0; + cluster < *imrt_nb_clusters && + contiguous_free_clusters < cluster_count; + cluster++) + { + if (!s->get_refcount(*refcount_table, cluster)) { + contiguous_free_clusters++; + if (first_gap) { + /* If this is the first free cluster found, update + * *first_free_cluster accordingly */ + *first_free_cluster = cluster; + first_gap = false; + } + } else if (contiguous_free_clusters) { + contiguous_free_clusters = 0; + } + } + + /* If contiguous_free_clusters is greater than zero, it contains the number + * of continuously free clusters until the current cluster; the first free + * cluster in the current "gap" is therefore + * cluster - contiguous_free_clusters */ + + /* If no such range could be found, grow the in-memory refcount table + * accordingly to append free clusters at the end of the image */ + if (contiguous_free_clusters < cluster_count) { + /* contiguous_free_clusters clusters are already empty at the image end; + * we need cluster_count clusters; therefore, we have to allocate + * cluster_count - contiguous_free_clusters new clusters at the end of + * the image (which is the current value of cluster; note that cluster + * may exceed old_imrt_nb_clusters if *first_free_cluster pointed beyond + * the image end) */ + ret = realloc_refcount_array(s, refcount_table, imrt_nb_clusters, + cluster + cluster_count + - contiguous_free_clusters); + if (ret < 0) { + return ret; + } + } + + /* Go back to the first free cluster */ + cluster -= contiguous_free_clusters; + for (i = 0; i < cluster_count; i++) { + s->set_refcount(*refcount_table, cluster + i, 1); + } + + return cluster << s->cluster_bits; +} + +/* + * Creates a new refcount structure based solely on the in-memory information + * given through *refcount_table. All necessary allocations will be reflected + * in that array. + * + * On success, the old refcount structure is leaked (it will be covered by the + * new refcount structure). + */ +static int rebuild_refcount_structure(BlockDriverState *bs, + BdrvCheckResult *res, + void **refcount_table, + int64_t *nb_clusters) +{ + BDRVQcow2State *s = bs->opaque; + int64_t first_free_cluster = 0, reftable_offset = -1, cluster = 0; + int64_t refblock_offset, refblock_start, refblock_index; + uint32_t reftable_size = 0; + uint64_t *on_disk_reftable = NULL; + void *on_disk_refblock; + int ret = 0; + struct { + uint64_t reftable_offset; + uint32_t reftable_clusters; + } QEMU_PACKED reftable_offset_and_clusters; + + qcow2_cache_empty(bs, s->refcount_block_cache); + +write_refblocks: + for (; cluster < *nb_clusters; cluster++) { + if (!s->get_refcount(*refcount_table, cluster)) { + continue; + } + + refblock_index = cluster >> s->refcount_block_bits; + refblock_start = refblock_index << s->refcount_block_bits; + + /* Don't allocate a cluster in a refblock already written to disk */ + if (first_free_cluster < refblock_start) { + first_free_cluster = refblock_start; + } + refblock_offset = alloc_clusters_imrt(bs, 1, refcount_table, + nb_clusters, &first_free_cluster); + if (refblock_offset < 0) { + fprintf(stderr, "ERROR allocating refblock: %s\n", + strerror(-refblock_offset)); + res->check_errors++; + ret = refblock_offset; + goto fail; + } + + if (reftable_size <= refblock_index) { + uint32_t old_reftable_size = reftable_size; + uint64_t *new_on_disk_reftable; + + reftable_size = ROUND_UP((refblock_index + 1) * sizeof(uint64_t), + s->cluster_size) / sizeof(uint64_t); + new_on_disk_reftable = g_try_realloc(on_disk_reftable, + reftable_size * + sizeof(uint64_t)); + if (!new_on_disk_reftable) { + res->check_errors++; + ret = -ENOMEM; + goto fail; + } + on_disk_reftable = new_on_disk_reftable; + + memset(on_disk_reftable + old_reftable_size, 0, + (reftable_size - old_reftable_size) * sizeof(uint64_t)); + + /* The offset we have for the reftable is now no longer valid; + * this will leak that range, but we can easily fix that by running + * a leak-fixing check after this rebuild operation */ + reftable_offset = -1; + } + on_disk_reftable[refblock_index] = refblock_offset; + + /* If this is apparently the last refblock (for now), try to squeeze the + * reftable in */ + if (refblock_index == (*nb_clusters - 1) >> s->refcount_block_bits && + reftable_offset < 0) + { + uint64_t reftable_clusters = size_to_clusters(s, reftable_size * + sizeof(uint64_t)); + reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, + refcount_table, nb_clusters, + &first_free_cluster); + if (reftable_offset < 0) { + fprintf(stderr, "ERROR allocating reftable: %s\n", + strerror(-reftable_offset)); + res->check_errors++; + ret = reftable_offset; + goto fail; + } + } + + ret = qcow2_pre_write_overlap_check(bs, 0, refblock_offset, + s->cluster_size); + if (ret < 0) { + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); + goto fail; + } + + /* The size of *refcount_table is always cluster-aligned, therefore the + * write operation will not overflow */ + on_disk_refblock = (void *)((char *) *refcount_table + + refblock_index * s->cluster_size); + + ret = bdrv_write(bs->file->bs, refblock_offset / BDRV_SECTOR_SIZE, + on_disk_refblock, s->cluster_sectors); + if (ret < 0) { + fprintf(stderr, "ERROR writing refblock: %s\n", strerror(-ret)); + goto fail; + } + + /* Go to the end of this refblock */ + cluster = refblock_start + s->refcount_block_size - 1; + } + + if (reftable_offset < 0) { + uint64_t post_refblock_start, reftable_clusters; + + post_refblock_start = ROUND_UP(*nb_clusters, s->refcount_block_size); + reftable_clusters = size_to_clusters(s, + reftable_size * sizeof(uint64_t)); + /* Not pretty but simple */ + if (first_free_cluster < post_refblock_start) { + first_free_cluster = post_refblock_start; + } + reftable_offset = alloc_clusters_imrt(bs, reftable_clusters, + refcount_table, nb_clusters, + &first_free_cluster); + if (reftable_offset < 0) { + fprintf(stderr, "ERROR allocating reftable: %s\n", + strerror(-reftable_offset)); + res->check_errors++; + ret = reftable_offset; + goto fail; + } + + goto write_refblocks; + } + + assert(on_disk_reftable); + + for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { + cpu_to_be64s(&on_disk_reftable[refblock_index]); + } + + ret = qcow2_pre_write_overlap_check(bs, 0, reftable_offset, + reftable_size * sizeof(uint64_t)); + if (ret < 0) { + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); + goto fail; + } + + assert(reftable_size < INT_MAX / sizeof(uint64_t)); + ret = bdrv_pwrite(bs->file->bs, reftable_offset, on_disk_reftable, + reftable_size * sizeof(uint64_t)); + if (ret < 0) { + fprintf(stderr, "ERROR writing reftable: %s\n", strerror(-ret)); + goto fail; + } + + /* Enter new reftable into the image header */ + cpu_to_be64w(&reftable_offset_and_clusters.reftable_offset, + reftable_offset); + cpu_to_be32w(&reftable_offset_and_clusters.reftable_clusters, + size_to_clusters(s, reftable_size * sizeof(uint64_t))); + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, + refcount_table_offset), + &reftable_offset_and_clusters, + sizeof(reftable_offset_and_clusters)); + if (ret < 0) { + fprintf(stderr, "ERROR setting reftable: %s\n", strerror(-ret)); + goto fail; + } + + for (refblock_index = 0; refblock_index < reftable_size; refblock_index++) { + be64_to_cpus(&on_disk_reftable[refblock_index]); + } + s->refcount_table = on_disk_reftable; + s->refcount_table_offset = reftable_offset; + s->refcount_table_size = reftable_size; + + return 0; + +fail: + g_free(on_disk_reftable); + return ret; +} + +/* + * Checks an image for refcount consistency. + * + * Returns 0 if no errors are found, the number of errors in case the image is + * detected as corrupted, and -errno when an internal error occurred. + */ +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + BDRVQcow2State *s = bs->opaque; + BdrvCheckResult pre_compare_res; + int64_t size, highest_cluster, nb_clusters; + void *refcount_table = NULL; + bool rebuild = false; + int ret; + + size = bdrv_getlength(bs->file->bs); + if (size < 0) { + res->check_errors++; + return size; + } + + nb_clusters = size_to_clusters(s, size); + if (nb_clusters > INT_MAX) { + res->check_errors++; + return -EFBIG; + } + + res->bfi.total_clusters = + size_to_clusters(s, bs->total_sectors * BDRV_SECTOR_SIZE); + + ret = calculate_refcounts(bs, res, fix, &rebuild, &refcount_table, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + /* In case we don't need to rebuild the refcount structure (but want to fix + * something), this function is immediately called again, in which case the + * result should be ignored */ + pre_compare_res = *res; + compare_refcounts(bs, res, 0, &rebuild, &highest_cluster, refcount_table, + nb_clusters); + + if (rebuild && (fix & BDRV_FIX_ERRORS)) { + BdrvCheckResult old_res = *res; + int fresh_leaks = 0; + + fprintf(stderr, "Rebuilding refcount structure\n"); + ret = rebuild_refcount_structure(bs, res, &refcount_table, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + res->corruptions = 0; + res->leaks = 0; + + /* Because the old reftable has been exchanged for a new one the + * references have to be recalculated */ + rebuild = false; + memset(refcount_table, 0, refcount_array_byte_size(s, nb_clusters)); + ret = calculate_refcounts(bs, res, 0, &rebuild, &refcount_table, + &nb_clusters); + if (ret < 0) { + goto fail; + } + + if (fix & BDRV_FIX_LEAKS) { + /* The old refcount structures are now leaked, fix it; the result + * can be ignored, aside from leaks which were introduced by + * rebuild_refcount_structure() that could not be fixed */ + BdrvCheckResult saved_res = *res; + *res = (BdrvCheckResult){ 0 }; + + compare_refcounts(bs, res, BDRV_FIX_LEAKS, &rebuild, + &highest_cluster, refcount_table, nb_clusters); + if (rebuild) { + fprintf(stderr, "ERROR rebuilt refcount structure is still " + "broken\n"); + } + + /* Any leaks accounted for here were introduced by + * rebuild_refcount_structure() because that function has created a + * new refcount structure from scratch */ + fresh_leaks = res->leaks; + *res = saved_res; + } + + if (res->corruptions < old_res.corruptions) { + res->corruptions_fixed += old_res.corruptions - res->corruptions; + } + if (res->leaks < old_res.leaks) { + res->leaks_fixed += old_res.leaks - res->leaks; + } + res->leaks += fresh_leaks; + } else if (fix) { + if (rebuild) { + fprintf(stderr, "ERROR need to rebuild refcount structures\n"); + res->check_errors++; + ret = -EIO; + goto fail; + } + + if (res->leaks || res->corruptions) { + *res = pre_compare_res; + compare_refcounts(bs, res, fix, &rebuild, &highest_cluster, + refcount_table, nb_clusters); + } + } + + /* check OFLAG_COPIED */ + ret = check_oflag_copied(bs, res, fix); + if (ret < 0) { + goto fail; + } + + res->image_end_offset = (highest_cluster + 1) * s->cluster_size; + ret = 0; + +fail: + g_free(refcount_table); + + return ret; +} + +#define overlaps_with(ofs, sz) \ + ranges_overlap(offset, size, ofs, sz) + +/* + * Checks if the given offset into the image file is actually free to use by + * looking for overlaps with important metadata sections (L1/L2 tables etc.), + * i.e. a sanity check without relying on the refcount tables. + * + * The ign parameter specifies what checks not to perform (being a bitmask of + * QCow2MetadataOverlap values), i.e., what sections to ignore. + * + * Returns: + * - 0 if writing to this offset will not affect the mentioned metadata + * - a positive QCow2MetadataOverlap value indicating one overlapping section + * - a negative value (-errno) indicating an error while performing a check, + * e.g. when bdrv_read failed on QCOW2_OL_INACTIVE_L2 + */ +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + BDRVQcow2State *s = bs->opaque; + int chk = s->overlap_check & ~ign; + int i, j; + + if (!size) { + return 0; + } + + if (chk & QCOW2_OL_MAIN_HEADER) { + if (offset < s->cluster_size) { + return QCOW2_OL_MAIN_HEADER; + } + } + + /* align range to test to cluster boundaries */ + size = align_offset(offset_into_cluster(s, offset) + size, s->cluster_size); + offset = start_of_cluster(s, offset); + + if ((chk & QCOW2_OL_ACTIVE_L1) && s->l1_size) { + if (overlaps_with(s->l1_table_offset, s->l1_size * sizeof(uint64_t))) { + return QCOW2_OL_ACTIVE_L1; + } + } + + if ((chk & QCOW2_OL_REFCOUNT_TABLE) && s->refcount_table_size) { + if (overlaps_with(s->refcount_table_offset, + s->refcount_table_size * sizeof(uint64_t))) { + return QCOW2_OL_REFCOUNT_TABLE; + } + } + + if ((chk & QCOW2_OL_SNAPSHOT_TABLE) && s->snapshots_size) { + if (overlaps_with(s->snapshots_offset, s->snapshots_size)) { + return QCOW2_OL_SNAPSHOT_TABLE; + } + } + + if ((chk & QCOW2_OL_INACTIVE_L1) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + if (s->snapshots[i].l1_size && + overlaps_with(s->snapshots[i].l1_table_offset, + s->snapshots[i].l1_size * sizeof(uint64_t))) { + return QCOW2_OL_INACTIVE_L1; + } + } + } + + if ((chk & QCOW2_OL_ACTIVE_L2) && s->l1_table) { + for (i = 0; i < s->l1_size; i++) { + if ((s->l1_table[i] & L1E_OFFSET_MASK) && + overlaps_with(s->l1_table[i] & L1E_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_ACTIVE_L2; + } + } + } + + if ((chk & QCOW2_OL_REFCOUNT_BLOCK) && s->refcount_table) { + for (i = 0; i < s->refcount_table_size; i++) { + if ((s->refcount_table[i] & REFT_OFFSET_MASK) && + overlaps_with(s->refcount_table[i] & REFT_OFFSET_MASK, + s->cluster_size)) { + return QCOW2_OL_REFCOUNT_BLOCK; + } + } + } + + if ((chk & QCOW2_OL_INACTIVE_L2) && s->snapshots) { + for (i = 0; i < s->nb_snapshots; i++) { + uint64_t l1_ofs = s->snapshots[i].l1_table_offset; + uint32_t l1_sz = s->snapshots[i].l1_size; + uint64_t l1_sz2 = l1_sz * sizeof(uint64_t); + uint64_t *l1 = g_try_malloc(l1_sz2); + int ret; + + if (l1_sz2 && l1 == NULL) { + return -ENOMEM; + } + + ret = bdrv_pread(bs->file->bs, l1_ofs, l1, l1_sz2); + if (ret < 0) { + g_free(l1); + return ret; + } + + for (j = 0; j < l1_sz; j++) { + uint64_t l2_ofs = be64_to_cpu(l1[j]) & L1E_OFFSET_MASK; + if (l2_ofs && overlaps_with(l2_ofs, s->cluster_size)) { + g_free(l1); + return QCOW2_OL_INACTIVE_L2; + } + } + + g_free(l1); + } + } + + return 0; +} + +static const char *metadata_ol_names[] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = "qcow2_header", + [QCOW2_OL_ACTIVE_L1_BITNR] = "active L1 table", + [QCOW2_OL_ACTIVE_L2_BITNR] = "active L2 table", + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = "refcount table", + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = "refcount block", + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = "snapshot table", + [QCOW2_OL_INACTIVE_L1_BITNR] = "inactive L1 table", + [QCOW2_OL_INACTIVE_L2_BITNR] = "inactive L2 table", +}; + +/* + * First performs a check for metadata overlaps (through + * qcow2_check_metadata_overlap); if that fails with a negative value (error + * while performing a check), that value is returned. If an impending overlap + * is detected, the BDS will be made unusable, the qcow2 file marked corrupt + * and -EIO returned. + * + * Returns 0 if there were neither overlaps nor errors while checking for + * overlaps; or a negative value (-errno) on error. + */ +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size) +{ + int ret = qcow2_check_metadata_overlap(bs, ign, offset, size); + + if (ret < 0) { + return ret; + } else if (ret > 0) { + int metadata_ol_bitnr = ctz32(ret); + assert(metadata_ol_bitnr < QCOW2_OL_MAX_BITNR); + + qcow2_signal_corruption(bs, true, offset, size, "Preventing invalid " + "write on metadata (overlaps with %s)", + metadata_ol_names[metadata_ol_bitnr]); + return -EIO; + } + + return 0; +} diff --git a/src/block/qcow2-snapshot.c b/src/block/qcow2-snapshot.c new file mode 100644 index 0000000..def7201 --- /dev/null +++ b/src/block/qcow2-snapshot.c @@ -0,0 +1,736 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/qcow2.h" +#include "qemu/error-report.h" + +void qcow2_free_snapshots(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + int i; + + for(i = 0; i < s->nb_snapshots; i++) { + g_free(s->snapshots[i].name); + g_free(s->snapshots[i].id_str); + } + g_free(s->snapshots); + s->snapshots = NULL; + s->nb_snapshots = 0; +} + +int qcow2_read_snapshots(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + QCowSnapshot *sn; + int i, id_str_size, name_size; + int64_t offset; + uint32_t extra_data_size; + int ret; + + if (!s->nb_snapshots) { + s->snapshots = NULL; + s->snapshots_size = 0; + return 0; + } + + offset = s->snapshots_offset; + s->snapshots = g_new0(QCowSnapshot, s->nb_snapshots); + + for(i = 0; i < s->nb_snapshots; i++) { + /* Read statically sized part of the snapshot header */ + offset = align_offset(offset, 8); + ret = bdrv_pread(bs->file->bs, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + + offset += sizeof(h); + sn = s->snapshots + i; + sn->l1_table_offset = be64_to_cpu(h.l1_table_offset); + sn->l1_size = be32_to_cpu(h.l1_size); + sn->vm_state_size = be32_to_cpu(h.vm_state_size); + sn->date_sec = be32_to_cpu(h.date_sec); + sn->date_nsec = be32_to_cpu(h.date_nsec); + sn->vm_clock_nsec = be64_to_cpu(h.vm_clock_nsec); + extra_data_size = be32_to_cpu(h.extra_data_size); + + id_str_size = be16_to_cpu(h.id_str_size); + name_size = be16_to_cpu(h.name_size); + + /* Read extra data */ + ret = bdrv_pread(bs->file->bs, offset, &extra, + MIN(sizeof(extra), extra_data_size)); + if (ret < 0) { + goto fail; + } + offset += extra_data_size; + + if (extra_data_size >= 8) { + sn->vm_state_size = be64_to_cpu(extra.vm_state_size_large); + } + + if (extra_data_size >= 16) { + sn->disk_size = be64_to_cpu(extra.disk_size); + } else { + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + } + + /* Read snapshot ID */ + sn->id_str = g_malloc(id_str_size + 1); + ret = bdrv_pread(bs->file->bs, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + sn->id_str[id_str_size] = '\0'; + + /* Read snapshot name */ + sn->name = g_malloc(name_size + 1); + ret = bdrv_pread(bs->file->bs, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + sn->name[name_size] = '\0'; + + if (offset - s->snapshots_offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } + } + + assert(offset - s->snapshots_offset <= INT_MAX); + s->snapshots_size = offset - s->snapshots_offset; + return 0; + +fail: + qcow2_free_snapshots(bs); + return ret; +} + +/* add at the end of the file a new list of snapshots */ +static int qcow2_write_snapshots(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + QCowSnapshot *sn; + QCowSnapshotHeader h; + QCowSnapshotExtraData extra; + int i, name_size, id_str_size, snapshots_size; + struct { + uint32_t nb_snapshots; + uint64_t snapshots_offset; + } QEMU_PACKED header_data; + int64_t offset, snapshots_offset = 0; + int ret; + + /* compute the size of the snapshots */ + offset = 0; + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + offset = align_offset(offset, 8); + offset += sizeof(h); + offset += sizeof(extra); + offset += strlen(sn->id_str); + offset += strlen(sn->name); + + if (offset > QCOW_MAX_SNAPSHOTS_SIZE) { + ret = -EFBIG; + goto fail; + } + } + + assert(offset <= INT_MAX); + snapshots_size = offset; + + /* Allocate space for the new snapshot list */ + snapshots_offset = qcow2_alloc_clusters(bs, snapshots_size); + offset = snapshots_offset; + if (offset < 0) { + ret = offset; + goto fail; + } + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + /* The snapshot list position has not yet been updated, so these clusters + * must indeed be completely free */ + ret = qcow2_pre_write_overlap_check(bs, 0, offset, snapshots_size); + if (ret < 0) { + goto fail; + } + + + /* Write all snapshots to the new list */ + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + memset(&h, 0, sizeof(h)); + h.l1_table_offset = cpu_to_be64(sn->l1_table_offset); + h.l1_size = cpu_to_be32(sn->l1_size); + /* If it doesn't fit in 32 bit, older implementations should treat it + * as a disk-only snapshot rather than truncate the VM state */ + if (sn->vm_state_size <= 0xffffffff) { + h.vm_state_size = cpu_to_be32(sn->vm_state_size); + } + h.date_sec = cpu_to_be32(sn->date_sec); + h.date_nsec = cpu_to_be32(sn->date_nsec); + h.vm_clock_nsec = cpu_to_be64(sn->vm_clock_nsec); + h.extra_data_size = cpu_to_be32(sizeof(extra)); + + memset(&extra, 0, sizeof(extra)); + extra.vm_state_size_large = cpu_to_be64(sn->vm_state_size); + extra.disk_size = cpu_to_be64(sn->disk_size); + + id_str_size = strlen(sn->id_str); + name_size = strlen(sn->name); + assert(id_str_size <= UINT16_MAX && name_size <= UINT16_MAX); + h.id_str_size = cpu_to_be16(id_str_size); + h.name_size = cpu_to_be16(name_size); + offset = align_offset(offset, 8); + + ret = bdrv_pwrite(bs->file->bs, offset, &h, sizeof(h)); + if (ret < 0) { + goto fail; + } + offset += sizeof(h); + + ret = bdrv_pwrite(bs->file->bs, offset, &extra, sizeof(extra)); + if (ret < 0) { + goto fail; + } + offset += sizeof(extra); + + ret = bdrv_pwrite(bs->file->bs, offset, sn->id_str, id_str_size); + if (ret < 0) { + goto fail; + } + offset += id_str_size; + + ret = bdrv_pwrite(bs->file->bs, offset, sn->name, name_size); + if (ret < 0) { + goto fail; + } + offset += name_size; + } + + /* + * Update the header to point to the new snapshot table. This requires the + * new table and its refcounts to be stable on disk. + */ + ret = bdrv_flush(bs); + if (ret < 0) { + goto fail; + } + + QEMU_BUILD_BUG_ON(offsetof(QCowHeader, snapshots_offset) != + offsetof(QCowHeader, nb_snapshots) + sizeof(header_data.nb_snapshots)); + + header_data.nb_snapshots = cpu_to_be32(s->nb_snapshots); + header_data.snapshots_offset = cpu_to_be64(snapshots_offset); + + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, nb_snapshots), + &header_data, sizeof(header_data)); + if (ret < 0) { + goto fail; + } + + /* free the old snapshot table */ + qcow2_free_clusters(bs, s->snapshots_offset, s->snapshots_size, + QCOW2_DISCARD_SNAPSHOT); + s->snapshots_offset = snapshots_offset; + s->snapshots_size = snapshots_size; + return 0; + +fail: + if (snapshots_offset > 0) { + qcow2_free_clusters(bs, snapshots_offset, snapshots_size, + QCOW2_DISCARD_ALWAYS); + } + return ret; +} + +static void find_new_snapshot_id(BlockDriverState *bs, + char *id_str, int id_str_size) +{ + BDRVQcow2State *s = bs->opaque; + QCowSnapshot *sn; + int i; + unsigned long id, id_max = 0; + + for(i = 0; i < s->nb_snapshots; i++) { + sn = s->snapshots + i; + id = strtoul(sn->id_str, NULL, 10); + if (id > id_max) + id_max = id; + } + snprintf(id_str, id_str_size, "%lu", id_max + 1); +} + +static int find_snapshot_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name) +{ + BDRVQcow2State *s = bs->opaque; + int i; + + if (id && name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id) && + !strcmp(s->snapshots[i].name, name)) { + return i; + } + } + } else if (id) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].id_str, id)) { + return i; + } + } + } else if (name) { + for (i = 0; i < s->nb_snapshots; i++) { + if (!strcmp(s->snapshots[i].name, name)) { + return i; + } + } + } + + return -1; +} + +static int find_snapshot_by_id_or_name(BlockDriverState *bs, + const char *id_or_name) +{ + int ret; + + ret = find_snapshot_by_id_and_name(bs, id_or_name, NULL); + if (ret >= 0) { + return ret; + } + return find_snapshot_by_id_and_name(bs, NULL, id_or_name); +} + +/* if no id is provided, a new one is constructed */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ + BDRVQcow2State *s = bs->opaque; + QCowSnapshot *new_snapshot_list = NULL; + QCowSnapshot *old_snapshot_list = NULL; + QCowSnapshot sn1, *sn = &sn1; + int i, ret; + uint64_t *l1_table = NULL; + int64_t l1_table_offset; + + if (s->nb_snapshots >= QCOW_MAX_SNAPSHOTS) { + return -EFBIG; + } + + memset(sn, 0, sizeof(*sn)); + + /* Generate an ID */ + find_new_snapshot_id(bs, sn_info->id_str, sizeof(sn_info->id_str)); + + /* Check that the ID is unique */ + if (find_snapshot_by_id_and_name(bs, sn_info->id_str, NULL) >= 0) { + return -EEXIST; + } + + /* Populate sn with passed data */ + sn->id_str = g_strdup(sn_info->id_str); + sn->name = g_strdup(sn_info->name); + + sn->disk_size = bs->total_sectors * BDRV_SECTOR_SIZE; + sn->vm_state_size = sn_info->vm_state_size; + sn->date_sec = sn_info->date_sec; + sn->date_nsec = sn_info->date_nsec; + sn->vm_clock_nsec = sn_info->vm_clock_nsec; + + /* Allocate the L1 table of the snapshot and copy the current one there. */ + l1_table_offset = qcow2_alloc_clusters(bs, s->l1_size * sizeof(uint64_t)); + if (l1_table_offset < 0) { + ret = l1_table_offset; + goto fail; + } + + sn->l1_table_offset = l1_table_offset; + sn->l1_size = s->l1_size; + + l1_table = g_try_new(uint64_t, s->l1_size); + if (s->l1_size && l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + + for(i = 0; i < s->l1_size; i++) { + l1_table[i] = cpu_to_be64(s->l1_table[i]); + } + + ret = qcow2_pre_write_overlap_check(bs, 0, sn->l1_table_offset, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite(bs->file->bs, sn->l1_table_offset, l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + + g_free(l1_table); + l1_table = NULL; + + /* + * Increase the refcounts of all clusters and make sure everything is + * stable on disk before updating the snapshot table to contain a pointer + * to the new L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 1); + if (ret < 0) { + goto fail; + } + + /* Append the new snapshot to the snapshot list */ + new_snapshot_list = g_new(QCowSnapshot, s->nb_snapshots + 1); + if (s->snapshots) { + memcpy(new_snapshot_list, s->snapshots, + s->nb_snapshots * sizeof(QCowSnapshot)); + old_snapshot_list = s->snapshots; + } + s->snapshots = new_snapshot_list; + s->snapshots[s->nb_snapshots++] = *sn; + + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + g_free(s->snapshots); + s->snapshots = old_snapshot_list; + s->nb_snapshots--; + goto fail; + } + + g_free(old_snapshot_list); + + /* The VM state isn't needed any more in the active L1 table; in fact, it + * hurts by causing expensive COW for the next snapshot. */ + qcow2_discard_clusters(bs, qcow2_vm_state_offset(s), + align_offset(sn->vm_state_size, s->cluster_size) + >> BDRV_SECTOR_BITS, + QCOW2_DISCARD_NEVER, false); + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn->id_str); + g_free(sn->name); + g_free(l1_table); + + return ret; +} + +/* copy the snapshot 'snapshot_name' into the current disk image */ +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVQcow2State *s = bs->opaque; + QCowSnapshot *sn; + int i, snapshot_index; + int cur_l1_bytes, sn_l1_bytes; + int ret; + uint64_t *sn_l1_table = NULL; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_or_name(bs, snapshot_id); + if (snapshot_index < 0) { + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + if (sn->disk_size != bs->total_sectors * BDRV_SECTOR_SIZE) { + error_report("qcow2: Loading snapshots with different disk " + "size is not implemented"); + ret = -ENOTSUP; + goto fail; + } + + /* + * Make sure that the current L1 table is big enough to contain the whole + * L1 table of the snapshot. If the snapshot L1 table is smaller, the + * current one must be padded with zeros. + */ + ret = qcow2_grow_l1_table(bs, sn->l1_size, true); + if (ret < 0) { + goto fail; + } + + cur_l1_bytes = s->l1_size * sizeof(uint64_t); + sn_l1_bytes = sn->l1_size * sizeof(uint64_t); + + /* + * Copy the snapshot L1 table to the current L1 table. + * + * Before overwriting the old current L1 table on disk, make sure to + * increase all refcounts for the clusters referenced by the new one. + * Decrease the refcount referenced by the old one only when the L1 + * table is overwritten. + */ + sn_l1_table = g_try_malloc0(cur_l1_bytes); + if (cur_l1_bytes && sn_l1_table == NULL) { + ret = -ENOMEM; + goto fail; + } + + ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + sn_l1_table, sn_l1_bytes); + if (ret < 0) { + goto fail; + } + + ret = qcow2_update_snapshot_refcount(bs, sn->l1_table_offset, + sn->l1_size, 1); + if (ret < 0) { + goto fail; + } + + ret = qcow2_pre_write_overlap_check(bs, QCOW2_OL_ACTIVE_L1, + s->l1_table_offset, cur_l1_bytes); + if (ret < 0) { + goto fail; + } + + ret = bdrv_pwrite_sync(bs->file->bs, s->l1_table_offset, sn_l1_table, + cur_l1_bytes); + if (ret < 0) { + goto fail; + } + + /* + * Decrease refcount of clusters of current L1 table. + * + * At this point, the in-memory s->l1_table points to the old L1 table, + * whereas on disk we already have the new one. + * + * qcow2_update_snapshot_refcount special cases the current L1 table to use + * the in-memory data instead of really using the offset to load a new one, + * which is why this works. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, + s->l1_size, -1); + + /* + * Now update the in-memory L1 table to be in sync with the on-disk one. We + * need to do this even if updating refcounts failed. + */ + for(i = 0;i < s->l1_size; i++) { + s->l1_table[i] = be64_to_cpu(sn_l1_table[i]); + } + + if (ret < 0) { + goto fail; + } + + g_free(sn_l1_table); + sn_l1_table = NULL; + + /* + * Update QCOW_OFLAG_COPIED in the active L1 table (it may have changed + * when we decreased the refcount of the old snapshot. + */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + goto fail; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; + +fail: + g_free(sn_l1_table); + return ret; +} + +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + QCowSnapshot sn; + int snapshot_index, ret; + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); + if (snapshot_index < 0) { + error_setg(errp, "Can't find the snapshot"); + return -ENOENT; + } + sn = s->snapshots[snapshot_index]; + + /* Remove it from the snapshot list */ + memmove(s->snapshots + snapshot_index, + s->snapshots + snapshot_index + 1, + (s->nb_snapshots - snapshot_index - 1) * sizeof(sn)); + s->nb_snapshots--; + ret = qcow2_write_snapshots(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to remove snapshot from snapshot list"); + return ret; + } + + /* + * The snapshot is now unused, clean up. If we fail after this point, we + * won't recover but just leak clusters. + */ + g_free(sn.id_str); + g_free(sn.name); + + /* + * Now decrease the refcounts of clusters referenced by the snapshot and + * free the L1 table. + */ + ret = qcow2_update_snapshot_refcount(bs, sn.l1_table_offset, + sn.l1_size, -1); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to free the cluster and L1 table"); + return ret; + } + qcow2_free_clusters(bs, sn.l1_table_offset, sn.l1_size * sizeof(uint64_t), + QCOW2_DISCARD_SNAPSHOT); + + /* must update the copied flag on the current cluster offsets */ + ret = qcow2_update_snapshot_refcount(bs, s->l1_table_offset, s->l1_size, 0); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Failed to update snapshot status in disk"); + return ret; + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return 0; +} + +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ + BDRVQcow2State *s = bs->opaque; + QEMUSnapshotInfo *sn_tab, *sn_info; + QCowSnapshot *sn; + int i; + + if (!s->nb_snapshots) { + *psn_tab = NULL; + return s->nb_snapshots; + } + + sn_tab = g_new0(QEMUSnapshotInfo, s->nb_snapshots); + for(i = 0; i < s->nb_snapshots; i++) { + sn_info = sn_tab + i; + sn = s->snapshots + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), + sn->id_str); + pstrcpy(sn_info->name, sizeof(sn_info->name), + sn->name); + sn_info->vm_state_size = sn->vm_state_size; + sn_info->date_sec = sn->date_sec; + sn_info->date_nsec = sn->date_nsec; + sn_info->vm_clock_nsec = sn->vm_clock_nsec; + } + *psn_tab = sn_tab; + return s->nb_snapshots; +} + +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) +{ + int i, snapshot_index; + BDRVQcow2State *s = bs->opaque; + QCowSnapshot *sn; + uint64_t *new_l1_table; + int new_l1_bytes; + int ret; + + assert(bs->read_only); + + /* Search the snapshot */ + snapshot_index = find_snapshot_by_id_and_name(bs, snapshot_id, name); + if (snapshot_index < 0) { + error_setg(errp, + "Can't find snapshot"); + return -ENOENT; + } + sn = &s->snapshots[snapshot_index]; + + /* Allocate and read in the snapshot's L1 table */ + if (sn->l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { + error_setg(errp, "Snapshot L1 table too large"); + return -EFBIG; + } + new_l1_bytes = sn->l1_size * sizeof(uint64_t); + new_l1_table = qemu_try_blockalign(bs->file->bs, + align_offset(new_l1_bytes, 512)); + if (new_l1_table == NULL) { + return -ENOMEM; + } + + ret = bdrv_pread(bs->file->bs, sn->l1_table_offset, + new_l1_table, new_l1_bytes); + if (ret < 0) { + error_setg(errp, "Failed to read l1 table for snapshot"); + qemu_vfree(new_l1_table); + return ret; + } + + /* Switch the L1 table */ + qemu_vfree(s->l1_table); + + s->l1_size = sn->l1_size; + s->l1_table_offset = sn->l1_table_offset; + s->l1_table = new_l1_table; + + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + + return 0; +} diff --git a/src/block/qcow2.c b/src/block/qcow2.c new file mode 100644 index 0000000..88f56c8 --- /dev/null +++ b/src/block/qcow2.c @@ -0,0 +1,3193 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include <zlib.h> +#include "block/qcow2.h" +#include "qemu/error-report.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qbool.h" +#include "qapi/util.h" +#include "qapi/qmp/types.h" +#include "qapi-event.h" +#include "trace.h" +#include "qemu/option_int.h" + +/* + Differences with QCOW: + + - Support for multiple incremental snapshots. + - Memory management by reference counts. + - Clusters which have a reference count of one have the bit + QCOW_OFLAG_COPIED to optimize write performance. + - Size of compressed clusters is stored in sectors to reduce bit usage + in the cluster offsets. + - Support for storing additional data (such as the VM state) in the + snapshots. + - If a backing store is used, the cluster size is not constrained + (could be backported to QCOW). + - L2 tables have always a size of one cluster. +*/ + + +typedef struct { + uint32_t magic; + uint32_t len; +} QEMU_PACKED QCowExtension; + +#define QCOW2_EXT_MAGIC_END 0 +#define QCOW2_EXT_MAGIC_BACKING_FORMAT 0xE2792ACA +#define QCOW2_EXT_MAGIC_FEATURE_TABLE 0x6803f857 + +static int qcow2_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const QCowHeader *cow_header = (const void *)buf; + + if (buf_size >= sizeof(QCowHeader) && + be32_to_cpu(cow_header->magic) == QCOW_MAGIC && + be32_to_cpu(cow_header->version) >= 2) + return 100; + else + return 0; +} + + +/* + * read qcow2 extension and fill bs + * start reading from start_offset + * finish reading upon magic of value 0 or when end_offset reached + * unknown magic is skipped (future extension this version knows nothing about) + * return 0 upon success, non-0 otherwise + */ +static int qcow2_read_extensions(BlockDriverState *bs, uint64_t start_offset, + uint64_t end_offset, void **p_feature_table, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + QCowExtension ext; + uint64_t offset; + int ret; + +#ifdef DEBUG_EXT + printf("qcow2_read_extensions: start=%ld end=%ld\n", start_offset, end_offset); +#endif + offset = start_offset; + while (offset < end_offset) { + +#ifdef DEBUG_EXT + /* Sanity check */ + if (offset > s->cluster_size) + printf("qcow2_read_extension: suspicious offset %lu\n", offset); + + printf("attempting to read extended header in offset %lu\n", offset); +#endif + + ret = bdrv_pread(bs->file->bs, offset, &ext, sizeof(ext)); + if (ret < 0) { + error_setg_errno(errp, -ret, "qcow2_read_extension: ERROR: " + "pread fail from offset %" PRIu64, offset); + return 1; + } + be32_to_cpus(&ext.magic); + be32_to_cpus(&ext.len); + offset += sizeof(ext); +#ifdef DEBUG_EXT + printf("ext.magic = 0x%x\n", ext.magic); +#endif + if (offset > end_offset || ext.len > end_offset - offset) { + error_setg(errp, "Header extension too large"); + return -EINVAL; + } + + switch (ext.magic) { + case QCOW2_EXT_MAGIC_END: + return 0; + + case QCOW2_EXT_MAGIC_BACKING_FORMAT: + if (ext.len >= sizeof(bs->backing_format)) { + error_setg(errp, "ERROR: ext_backing_format: len=%" PRIu32 + " too large (>=%zu)", ext.len, + sizeof(bs->backing_format)); + return 2; + } + ret = bdrv_pread(bs->file->bs, offset, bs->backing_format, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_backing_format: " + "Could not read format name"); + return 3; + } + bs->backing_format[ext.len] = '\0'; + s->image_backing_format = g_strdup(bs->backing_format); +#ifdef DEBUG_EXT + printf("Qcow2: Got format extension %s\n", bs->backing_format); +#endif + break; + + case QCOW2_EXT_MAGIC_FEATURE_TABLE: + if (p_feature_table != NULL) { + void* feature_table = g_malloc0(ext.len + 2 * sizeof(Qcow2Feature)); + ret = bdrv_pread(bs->file->bs, offset , feature_table, ext.len); + if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: ext_feature_table: " + "Could not read table"); + return ret; + } + + *p_feature_table = feature_table; + } + break; + + default: + /* unknown magic - save it in case we need to rewrite the header */ + { + Qcow2UnknownHeaderExtension *uext; + + uext = g_malloc0(sizeof(*uext) + ext.len); + uext->magic = ext.magic; + uext->len = ext.len; + QLIST_INSERT_HEAD(&s->unknown_header_ext, uext, next); + + ret = bdrv_pread(bs->file->bs, offset , uext->data, uext->len); + if (ret < 0) { + error_setg_errno(errp, -ret, "ERROR: unknown extension: " + "Could not read data"); + return ret; + } + } + break; + } + + offset += ((ext.len + 7) & ~7); + } + + return 0; +} + +static void cleanup_unknown_header_ext(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + Qcow2UnknownHeaderExtension *uext, *next; + + QLIST_FOREACH_SAFE(uext, &s->unknown_header_ext, next, next) { + QLIST_REMOVE(uext, next); + g_free(uext); + } +} + +static void GCC_FMT_ATTR(3, 4) report_unsupported(BlockDriverState *bs, + Error **errp, const char *fmt, ...) +{ + char msg[64]; + va_list ap; + + va_start(ap, fmt); + vsnprintf(msg, sizeof(msg), fmt, ap); + va_end(ap); + + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "qcow2", msg); +} + +static void report_unsupported_feature(BlockDriverState *bs, + Error **errp, Qcow2Feature *table, uint64_t mask) +{ + char *features = g_strdup(""); + char *old; + + while (table && table->name[0] != '\0') { + if (table->type == QCOW2_FEAT_TYPE_INCOMPATIBLE) { + if (mask & (1ULL << table->bit)) { + old = features; + features = g_strdup_printf("%s%s%.46s", old, *old ? ", " : "", + table->name); + g_free(old); + mask &= ~(1ULL << table->bit); + } + } + table++; + } + + if (mask) { + old = features; + features = g_strdup_printf("%s%sUnknown incompatible feature: %" PRIx64, + old, *old ? ", " : "", mask); + g_free(old); + } + + report_unsupported(bs, errp, "%s", features); + g_free(features); +} + +/* + * Sets the dirty bit and flushes afterwards if necessary. + * + * The incompatible_features bit is only set if the image file header was + * updated successfully. Therefore it is not required to check the return + * value of this function. + */ +int qcow2_mark_dirty(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t val; + int ret; + + assert(s->qcow_version >= 3); + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + return 0; /* already dirty */ + } + + val = cpu_to_be64(s->incompatible_features | QCOW2_INCOMPAT_DIRTY); + ret = bdrv_pwrite(bs->file->bs, offsetof(QCowHeader, incompatible_features), + &val, sizeof(val)); + if (ret < 0) { + return ret; + } + ret = bdrv_flush(bs->file->bs); + if (ret < 0) { + return ret; + } + + /* Only treat image as dirty if the header was updated successfully */ + s->incompatible_features |= QCOW2_INCOMPAT_DIRTY; + return 0; +} + +/* + * Clears the dirty bit and flushes before if necessary. Only call this + * function when there are no pending requests, it does not guard against + * concurrent requests dirtying the image. + */ +static int qcow2_mark_clean(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + int ret; + + s->incompatible_features &= ~QCOW2_INCOMPAT_DIRTY; + + ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + return qcow2_update_header(bs); + } + return 0; +} + +/* + * Marks the image as corrupt. + */ +int qcow2_mark_corrupt(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + + s->incompatible_features |= QCOW2_INCOMPAT_CORRUPT; + return qcow2_update_header(bs); +} + +/* + * Marks the image as consistent, i.e., unsets the corrupt bit, and flushes + * before if necessary. + */ +int qcow2_mark_consistent(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { + int ret = bdrv_flush(bs); + if (ret < 0) { + return ret; + } + + s->incompatible_features &= ~QCOW2_INCOMPAT_CORRUPT; + return qcow2_update_header(bs); + } + return 0; +} + +static int qcow2_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + int ret = qcow2_check_refcounts(bs, result, fix); + if (ret < 0) { + return ret; + } + + if (fix && result->check_errors == 0 && result->corruptions == 0) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + return qcow2_mark_consistent(bs); + } + return ret; +} + +static int validate_table_offset(BlockDriverState *bs, uint64_t offset, + uint64_t entries, size_t entry_len) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t size; + + /* Use signed INT64_MAX as the maximum even for uint64_t header fields, + * because values will be passed to qemu functions taking int64_t. */ + if (entries > INT64_MAX / entry_len) { + return -EINVAL; + } + + size = entries * entry_len; + + if (INT64_MAX - size < offset) { + return -EINVAL; + } + + /* Tables must be cluster aligned */ + if (offset & (s->cluster_size - 1)) { + return -EINVAL; + } + + return 0; +} + +static QemuOptsList qcow2_runtime_opts = { + .name = "qcow2", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_runtime_opts.head), + .desc = { + { + .name = QCOW2_OPT_LAZY_REFCOUNTS, + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + }, + { + .name = QCOW2_OPT_DISCARD_REQUEST, + .type = QEMU_OPT_BOOL, + .help = "Pass guest discard requests to the layer below", + }, + { + .name = QCOW2_OPT_DISCARD_SNAPSHOT, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when snapshot related space " + "is freed", + }, + { + .name = QCOW2_OPT_DISCARD_OTHER, + .type = QEMU_OPT_BOOL, + .help = "Generate discard requests when other clusters are freed", + }, + { + .name = QCOW2_OPT_OVERLAP, + .type = QEMU_OPT_STRING, + .help = "Selects which overlap checks to perform from a range of " + "templates (none, constant, cached, all)", + }, + { + .name = QCOW2_OPT_OVERLAP_TEMPLATE, + .type = QEMU_OPT_STRING, + .help = "Selects which overlap checks to perform from a range of " + "templates (none, constant, cached, all)", + }, + { + .name = QCOW2_OPT_OVERLAP_MAIN_HEADER, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the main qcow2 header", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the active L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_ACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an active L2 table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the refcount table", + }, + { + .name = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into a refcount block", + }, + { + .name = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into the snapshot table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L1, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L1 table", + }, + { + .name = QCOW2_OPT_OVERLAP_INACTIVE_L2, + .type = QEMU_OPT_BOOL, + .help = "Check for unintended writes into an inactive L2 table", + }, + { + .name = QCOW2_OPT_CACHE_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Maximum combined metadata (L2 tables and refcount blocks) " + "cache size", + }, + { + .name = QCOW2_OPT_L2_CACHE_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Maximum L2 table cache size", + }, + { + .name = QCOW2_OPT_REFCOUNT_CACHE_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Maximum refcount block cache size", + }, + { + .name = QCOW2_OPT_CACHE_CLEAN_INTERVAL, + .type = QEMU_OPT_NUMBER, + .help = "Clean unused cache entries after this time (in seconds)", + }, + { /* end of list */ } + }, +}; + +static const char *overlap_bool_option_names[QCOW2_OL_MAX_BITNR] = { + [QCOW2_OL_MAIN_HEADER_BITNR] = QCOW2_OPT_OVERLAP_MAIN_HEADER, + [QCOW2_OL_ACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L1, + [QCOW2_OL_ACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_ACTIVE_L2, + [QCOW2_OL_REFCOUNT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_TABLE, + [QCOW2_OL_REFCOUNT_BLOCK_BITNR] = QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK, + [QCOW2_OL_SNAPSHOT_TABLE_BITNR] = QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE, + [QCOW2_OL_INACTIVE_L1_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L1, + [QCOW2_OL_INACTIVE_L2_BITNR] = QCOW2_OPT_OVERLAP_INACTIVE_L2, +}; + +static void cache_clean_timer_cb(void *opaque) +{ + BlockDriverState *bs = opaque; + BDRVQcow2State *s = bs->opaque; + qcow2_cache_clean_unused(bs, s->l2_table_cache); + qcow2_cache_clean_unused(bs, s->refcount_block_cache); + timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + (int64_t) s->cache_clean_interval * 1000); +} + +static void cache_clean_timer_init(BlockDriverState *bs, AioContext *context) +{ + BDRVQcow2State *s = bs->opaque; + if (s->cache_clean_interval > 0) { + s->cache_clean_timer = aio_timer_new(context, QEMU_CLOCK_VIRTUAL, + SCALE_MS, cache_clean_timer_cb, + bs); + timer_mod(s->cache_clean_timer, qemu_clock_get_ms(QEMU_CLOCK_VIRTUAL) + + (int64_t) s->cache_clean_interval * 1000); + } +} + +static void cache_clean_timer_del(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + if (s->cache_clean_timer) { + timer_del(s->cache_clean_timer); + timer_free(s->cache_clean_timer); + s->cache_clean_timer = NULL; + } +} + +static void qcow2_detach_aio_context(BlockDriverState *bs) +{ + cache_clean_timer_del(bs); +} + +static void qcow2_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + cache_clean_timer_init(bs, new_context); +} + +static void read_cache_sizes(BlockDriverState *bs, QemuOpts *opts, + uint64_t *l2_cache_size, + uint64_t *refcount_cache_size, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t combined_cache_size; + bool l2_cache_size_set, refcount_cache_size_set, combined_cache_size_set; + + combined_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_CACHE_SIZE); + l2_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_L2_CACHE_SIZE); + refcount_cache_size_set = qemu_opt_get(opts, QCOW2_OPT_REFCOUNT_CACHE_SIZE); + + combined_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_CACHE_SIZE, 0); + *l2_cache_size = qemu_opt_get_size(opts, QCOW2_OPT_L2_CACHE_SIZE, 0); + *refcount_cache_size = qemu_opt_get_size(opts, + QCOW2_OPT_REFCOUNT_CACHE_SIZE, 0); + + if (combined_cache_size_set) { + if (l2_cache_size_set && refcount_cache_size_set) { + error_setg(errp, QCOW2_OPT_CACHE_SIZE ", " QCOW2_OPT_L2_CACHE_SIZE + " and " QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not be set " + "the same time"); + return; + } else if (*l2_cache_size > combined_cache_size) { + error_setg(errp, QCOW2_OPT_L2_CACHE_SIZE " may not exceed " + QCOW2_OPT_CACHE_SIZE); + return; + } else if (*refcount_cache_size > combined_cache_size) { + error_setg(errp, QCOW2_OPT_REFCOUNT_CACHE_SIZE " may not exceed " + QCOW2_OPT_CACHE_SIZE); + return; + } + + if (l2_cache_size_set) { + *refcount_cache_size = combined_cache_size - *l2_cache_size; + } else if (refcount_cache_size_set) { + *l2_cache_size = combined_cache_size - *refcount_cache_size; + } else { + *refcount_cache_size = combined_cache_size + / (DEFAULT_L2_REFCOUNT_SIZE_RATIO + 1); + *l2_cache_size = combined_cache_size - *refcount_cache_size; + } + } else { + if (!l2_cache_size_set && !refcount_cache_size_set) { + *l2_cache_size = MAX(DEFAULT_L2_CACHE_BYTE_SIZE, + (uint64_t)DEFAULT_L2_CACHE_CLUSTERS + * s->cluster_size); + *refcount_cache_size = *l2_cache_size + / DEFAULT_L2_REFCOUNT_SIZE_RATIO; + } else if (!l2_cache_size_set) { + *l2_cache_size = *refcount_cache_size + * DEFAULT_L2_REFCOUNT_SIZE_RATIO; + } else if (!refcount_cache_size_set) { + *refcount_cache_size = *l2_cache_size + / DEFAULT_L2_REFCOUNT_SIZE_RATIO; + } + } +} + +typedef struct Qcow2ReopenState { + Qcow2Cache *l2_table_cache; + Qcow2Cache *refcount_block_cache; + bool use_lazy_refcounts; + int overlap_check; + bool discard_passthrough[QCOW2_DISCARD_MAX]; + uint64_t cache_clean_interval; +} Qcow2ReopenState; + +static int qcow2_update_options_prepare(BlockDriverState *bs, + Qcow2ReopenState *r, + QDict *options, int flags, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + QemuOpts *opts = NULL; + const char *opt_overlap_check, *opt_overlap_check_template; + int overlap_check_template = 0; + uint64_t l2_cache_size, refcount_cache_size; + int i; + Error *local_err = NULL; + int ret; + + opts = qemu_opts_create(&qcow2_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + /* get L2 table/refcount block cache size from command line options */ + read_cache_sizes(bs, opts, &l2_cache_size, &refcount_cache_size, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + l2_cache_size /= s->cluster_size; + if (l2_cache_size < MIN_L2_CACHE_SIZE) { + l2_cache_size = MIN_L2_CACHE_SIZE; + } + if (l2_cache_size > INT_MAX) { + error_setg(errp, "L2 cache size too big"); + ret = -EINVAL; + goto fail; + } + + refcount_cache_size /= s->cluster_size; + if (refcount_cache_size < MIN_REFCOUNT_CACHE_SIZE) { + refcount_cache_size = MIN_REFCOUNT_CACHE_SIZE; + } + if (refcount_cache_size > INT_MAX) { + error_setg(errp, "Refcount cache size too big"); + ret = -EINVAL; + goto fail; + } + + /* alloc new L2 table/refcount block cache, flush old one */ + if (s->l2_table_cache) { + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret) { + error_setg_errno(errp, -ret, "Failed to flush the L2 table cache"); + goto fail; + } + } + + if (s->refcount_block_cache) { + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret) { + error_setg_errno(errp, -ret, + "Failed to flush the refcount block cache"); + goto fail; + } + } + + r->l2_table_cache = qcow2_cache_create(bs, l2_cache_size); + r->refcount_block_cache = qcow2_cache_create(bs, refcount_cache_size); + if (r->l2_table_cache == NULL || r->refcount_block_cache == NULL) { + error_setg(errp, "Could not allocate metadata caches"); + ret = -ENOMEM; + goto fail; + } + + /* New interval for cache cleanup timer */ + r->cache_clean_interval = + qemu_opt_get_number(opts, QCOW2_OPT_CACHE_CLEAN_INTERVAL, + s->cache_clean_interval); + if (r->cache_clean_interval > UINT_MAX) { + error_setg(errp, "Cache clean interval too big"); + ret = -EINVAL; + goto fail; + } + + /* lazy-refcounts; flush if going from enabled to disabled */ + r->use_lazy_refcounts = qemu_opt_get_bool(opts, QCOW2_OPT_LAZY_REFCOUNTS, + (s->compatible_features & QCOW2_COMPAT_LAZY_REFCOUNTS)); + if (r->use_lazy_refcounts && s->qcow_version < 3) { + error_setg(errp, "Lazy refcounts require a qcow2 image with at least " + "qemu 1.1 compatibility level"); + ret = -EINVAL; + goto fail; + } + + if (s->use_lazy_refcounts && !r->use_lazy_refcounts) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to disable lazy refcounts"); + goto fail; + } + } + + /* Overlap check options */ + opt_overlap_check = qemu_opt_get(opts, QCOW2_OPT_OVERLAP); + opt_overlap_check_template = qemu_opt_get(opts, QCOW2_OPT_OVERLAP_TEMPLATE); + if (opt_overlap_check_template && opt_overlap_check && + strcmp(opt_overlap_check_template, opt_overlap_check)) + { + error_setg(errp, "Conflicting values for qcow2 options '" + QCOW2_OPT_OVERLAP "' ('%s') and '" QCOW2_OPT_OVERLAP_TEMPLATE + "' ('%s')", opt_overlap_check, opt_overlap_check_template); + ret = -EINVAL; + goto fail; + } + if (!opt_overlap_check) { + opt_overlap_check = opt_overlap_check_template ?: "cached"; + } + + if (!strcmp(opt_overlap_check, "none")) { + overlap_check_template = 0; + } else if (!strcmp(opt_overlap_check, "constant")) { + overlap_check_template = QCOW2_OL_CONSTANT; + } else if (!strcmp(opt_overlap_check, "cached")) { + overlap_check_template = QCOW2_OL_CACHED; + } else if (!strcmp(opt_overlap_check, "all")) { + overlap_check_template = QCOW2_OL_ALL; + } else { + error_setg(errp, "Unsupported value '%s' for qcow2 option " + "'overlap-check'. Allowed are any of the following: " + "none, constant, cached, all", opt_overlap_check); + ret = -EINVAL; + goto fail; + } + + r->overlap_check = 0; + for (i = 0; i < QCOW2_OL_MAX_BITNR; i++) { + /* overlap-check defines a template bitmask, but every flag may be + * overwritten through the associated boolean option */ + r->overlap_check |= + qemu_opt_get_bool(opts, overlap_bool_option_names[i], + overlap_check_template & (1 << i)) << i; + } + + r->discard_passthrough[QCOW2_DISCARD_NEVER] = false; + r->discard_passthrough[QCOW2_DISCARD_ALWAYS] = true; + r->discard_passthrough[QCOW2_DISCARD_REQUEST] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_REQUEST, + flags & BDRV_O_UNMAP); + r->discard_passthrough[QCOW2_DISCARD_SNAPSHOT] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_SNAPSHOT, true); + r->discard_passthrough[QCOW2_DISCARD_OTHER] = + qemu_opt_get_bool(opts, QCOW2_OPT_DISCARD_OTHER, false); + + ret = 0; +fail: + qemu_opts_del(opts); + opts = NULL; + return ret; +} + +static void qcow2_update_options_commit(BlockDriverState *bs, + Qcow2ReopenState *r) +{ + BDRVQcow2State *s = bs->opaque; + int i; + + if (s->l2_table_cache) { + qcow2_cache_destroy(bs, s->l2_table_cache); + } + if (s->refcount_block_cache) { + qcow2_cache_destroy(bs, s->refcount_block_cache); + } + s->l2_table_cache = r->l2_table_cache; + s->refcount_block_cache = r->refcount_block_cache; + + s->overlap_check = r->overlap_check; + s->use_lazy_refcounts = r->use_lazy_refcounts; + + for (i = 0; i < QCOW2_DISCARD_MAX; i++) { + s->discard_passthrough[i] = r->discard_passthrough[i]; + } + + if (s->cache_clean_interval != r->cache_clean_interval) { + cache_clean_timer_del(bs); + s->cache_clean_interval = r->cache_clean_interval; + cache_clean_timer_init(bs, bdrv_get_aio_context(bs)); + } +} + +static void qcow2_update_options_abort(BlockDriverState *bs, + Qcow2ReopenState *r) +{ + if (r->l2_table_cache) { + qcow2_cache_destroy(bs, r->l2_table_cache); + } + if (r->refcount_block_cache) { + qcow2_cache_destroy(bs, r->refcount_block_cache); + } +} + +static int qcow2_update_options(BlockDriverState *bs, QDict *options, + int flags, Error **errp) +{ + Qcow2ReopenState r = {}; + int ret; + + ret = qcow2_update_options_prepare(bs, &r, options, flags, errp); + if (ret >= 0) { + qcow2_update_options_commit(bs, &r); + } else { + qcow2_update_options_abort(bs, &r); + } + + return ret; +} + +static int qcow2_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + unsigned int len, i; + int ret = 0; + QCowHeader header; + Error *local_err = NULL; + uint64_t ext_end; + uint64_t l1_vm_state_index; + + ret = bdrv_pread(bs->file->bs, 0, &header, sizeof(header)); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read qcow2 header"); + goto fail; + } + be32_to_cpus(&header.magic); + be32_to_cpus(&header.version); + be64_to_cpus(&header.backing_file_offset); + be32_to_cpus(&header.backing_file_size); + be64_to_cpus(&header.size); + be32_to_cpus(&header.cluster_bits); + be32_to_cpus(&header.crypt_method); + be64_to_cpus(&header.l1_table_offset); + be32_to_cpus(&header.l1_size); + be64_to_cpus(&header.refcount_table_offset); + be32_to_cpus(&header.refcount_table_clusters); + be64_to_cpus(&header.snapshots_offset); + be32_to_cpus(&header.nb_snapshots); + + if (header.magic != QCOW_MAGIC) { + error_setg(errp, "Image is not in qcow2 format"); + ret = -EINVAL; + goto fail; + } + if (header.version < 2 || header.version > 3) { + report_unsupported(bs, errp, "QCOW version %" PRIu32, header.version); + ret = -ENOTSUP; + goto fail; + } + + s->qcow_version = header.version; + + /* Initialise cluster size */ + if (header.cluster_bits < MIN_CLUSTER_BITS || + header.cluster_bits > MAX_CLUSTER_BITS) { + error_setg(errp, "Unsupported cluster size: 2^%" PRIu32, + header.cluster_bits); + ret = -EINVAL; + goto fail; + } + + s->cluster_bits = header.cluster_bits; + s->cluster_size = 1 << s->cluster_bits; + s->cluster_sectors = 1 << (s->cluster_bits - 9); + + /* Initialise version 3 header fields */ + if (header.version == 2) { + header.incompatible_features = 0; + header.compatible_features = 0; + header.autoclear_features = 0; + header.refcount_order = 4; + header.header_length = 72; + } else { + be64_to_cpus(&header.incompatible_features); + be64_to_cpus(&header.compatible_features); + be64_to_cpus(&header.autoclear_features); + be32_to_cpus(&header.refcount_order); + be32_to_cpus(&header.header_length); + + if (header.header_length < 104) { + error_setg(errp, "qcow2 header too short"); + ret = -EINVAL; + goto fail; + } + } + + if (header.header_length > s->cluster_size) { + error_setg(errp, "qcow2 header exceeds cluster size"); + ret = -EINVAL; + goto fail; + } + + if (header.header_length > sizeof(header)) { + s->unknown_header_fields_size = header.header_length - sizeof(header); + s->unknown_header_fields = g_malloc(s->unknown_header_fields_size); + ret = bdrv_pread(bs->file->bs, sizeof(header), s->unknown_header_fields, + s->unknown_header_fields_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read unknown qcow2 header " + "fields"); + goto fail; + } + } + + if (header.backing_file_offset > s->cluster_size) { + error_setg(errp, "Invalid backing file offset"); + ret = -EINVAL; + goto fail; + } + + if (header.backing_file_offset) { + ext_end = header.backing_file_offset; + } else { + ext_end = 1 << header.cluster_bits; + } + + /* Handle feature bits */ + s->incompatible_features = header.incompatible_features; + s->compatible_features = header.compatible_features; + s->autoclear_features = header.autoclear_features; + + if (s->incompatible_features & ~QCOW2_INCOMPAT_MASK) { + void *feature_table = NULL; + qcow2_read_extensions(bs, header.header_length, ext_end, + &feature_table, NULL); + report_unsupported_feature(bs, errp, feature_table, + s->incompatible_features & + ~QCOW2_INCOMPAT_MASK); + ret = -ENOTSUP; + g_free(feature_table); + goto fail; + } + + if (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT) { + /* Corrupt images may not be written to unless they are being repaired + */ + if ((flags & BDRV_O_RDWR) && !(flags & BDRV_O_CHECK)) { + error_setg(errp, "qcow2: Image is corrupt; cannot be opened " + "read/write"); + ret = -EACCES; + goto fail; + } + } + + /* Check support for various header values */ + if (header.refcount_order > 6) { + error_setg(errp, "Reference count entry width too large; may not " + "exceed 64 bits"); + ret = -EINVAL; + goto fail; + } + s->refcount_order = header.refcount_order; + s->refcount_bits = 1 << s->refcount_order; + s->refcount_max = UINT64_C(1) << (s->refcount_bits - 1); + s->refcount_max += s->refcount_max - 1; + + if (header.crypt_method > QCOW_CRYPT_AES) { + error_setg(errp, "Unsupported encryption method: %" PRIu32, + header.crypt_method); + ret = -EINVAL; + goto fail; + } + if (!qcrypto_cipher_supports(QCRYPTO_CIPHER_ALG_AES_128)) { + error_setg(errp, "AES cipher not available"); + ret = -EINVAL; + goto fail; + } + s->crypt_method_header = header.crypt_method; + if (s->crypt_method_header) { + bs->encrypted = 1; + } + + s->l2_bits = s->cluster_bits - 3; /* L2 is always one cluster */ + s->l2_size = 1 << s->l2_bits; + /* 2^(s->refcount_order - 3) is the refcount width in bytes */ + s->refcount_block_bits = s->cluster_bits - (s->refcount_order - 3); + s->refcount_block_size = 1 << s->refcount_block_bits; + bs->total_sectors = header.size / 512; + s->csize_shift = (62 - (s->cluster_bits - 8)); + s->csize_mask = (1 << (s->cluster_bits - 8)) - 1; + s->cluster_offset_mask = (1LL << s->csize_shift) - 1; + + s->refcount_table_offset = header.refcount_table_offset; + s->refcount_table_size = + header.refcount_table_clusters << (s->cluster_bits - 3); + + if (header.refcount_table_clusters > qcow2_max_refcount_clusters(s)) { + error_setg(errp, "Reference count table too large"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, s->refcount_table_offset, + s->refcount_table_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid reference count table offset"); + goto fail; + } + + /* Snapshot table offset/length */ + if (header.nb_snapshots > QCOW_MAX_SNAPSHOTS) { + error_setg(errp, "Too many snapshots"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, header.snapshots_offset, + header.nb_snapshots, + sizeof(QCowSnapshotHeader)); + if (ret < 0) { + error_setg(errp, "Invalid snapshot table offset"); + goto fail; + } + + /* read the level 1 table */ + if (header.l1_size > QCOW_MAX_L1_SIZE / sizeof(uint64_t)) { + error_setg(errp, "Active L1 table too large"); + ret = -EFBIG; + goto fail; + } + s->l1_size = header.l1_size; + + l1_vm_state_index = size_to_l1(s, header.size); + if (l1_vm_state_index > INT_MAX) { + error_setg(errp, "Image is too big"); + ret = -EFBIG; + goto fail; + } + s->l1_vm_state_index = l1_vm_state_index; + + /* the L1 table must contain at least enough entries to put + header.size bytes */ + if (s->l1_size < s->l1_vm_state_index) { + error_setg(errp, "L1 table is too small"); + ret = -EINVAL; + goto fail; + } + + ret = validate_table_offset(bs, header.l1_table_offset, + header.l1_size, sizeof(uint64_t)); + if (ret < 0) { + error_setg(errp, "Invalid L1 table offset"); + goto fail; + } + s->l1_table_offset = header.l1_table_offset; + + + if (s->l1_size > 0) { + s->l1_table = qemu_try_blockalign(bs->file->bs, + align_offset(s->l1_size * sizeof(uint64_t), 512)); + if (s->l1_table == NULL) { + error_setg(errp, "Could not allocate L1 table"); + ret = -ENOMEM; + goto fail; + } + ret = bdrv_pread(bs->file->bs, s->l1_table_offset, s->l1_table, + s->l1_size * sizeof(uint64_t)); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read L1 table"); + goto fail; + } + for(i = 0;i < s->l1_size; i++) { + be64_to_cpus(&s->l1_table[i]); + } + } + + /* Parse driver-specific options */ + ret = qcow2_update_options(bs, options, flags, errp); + if (ret < 0) { + goto fail; + } + + s->cluster_cache = g_malloc(s->cluster_size); + /* one more sector for decompressed data alignment */ + s->cluster_data = qemu_try_blockalign(bs->file->bs, QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size + 512); + if (s->cluster_data == NULL) { + error_setg(errp, "Could not allocate temporary cluster buffer"); + ret = -ENOMEM; + goto fail; + } + + s->cluster_cache_offset = -1; + s->flags = flags; + + ret = qcow2_refcount_init(bs); + if (ret != 0) { + error_setg_errno(errp, -ret, "Could not initialize refcount handling"); + goto fail; + } + + QLIST_INIT(&s->cluster_allocs); + QTAILQ_INIT(&s->discards); + + /* read qcow2 extensions */ + if (qcow2_read_extensions(bs, header.header_length, ext_end, NULL, + &local_err)) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + /* read the backing file name */ + if (header.backing_file_offset != 0) { + len = header.backing_file_size; + if (len > MIN(1023, s->cluster_size - header.backing_file_offset) || + len >= sizeof(bs->backing_file)) { + error_setg(errp, "Backing file name too long"); + ret = -EINVAL; + goto fail; + } + ret = bdrv_pread(bs->file->bs, header.backing_file_offset, + bs->backing_file, len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read backing file name"); + goto fail; + } + bs->backing_file[len] = '\0'; + s->image_backing_file = g_strdup(bs->backing_file); + } + + /* Internal snapshots */ + s->snapshots_offset = header.snapshots_offset; + s->nb_snapshots = header.nb_snapshots; + + ret = qcow2_read_snapshots(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read snapshots"); + goto fail; + } + + /* Clear unknown autoclear feature bits */ + if (!bs->read_only && !(flags & BDRV_O_INCOMING) && s->autoclear_features) { + s->autoclear_features = 0; + ret = qcow2_update_header(bs); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not update qcow2 header"); + goto fail; + } + } + + /* Initialise locks */ + qemu_co_mutex_init(&s->lock); + + /* Repair image if dirty */ + if (!(flags & (BDRV_O_CHECK | BDRV_O_INCOMING)) && !bs->read_only && + (s->incompatible_features & QCOW2_INCOMPAT_DIRTY)) { + BdrvCheckResult result = {0}; + + ret = qcow2_check(bs, &result, BDRV_FIX_ERRORS | BDRV_FIX_LEAKS); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not repair dirty image"); + goto fail; + } + } + +#ifdef DEBUG_ALLOC + { + BdrvCheckResult result = {0}; + qcow2_check_refcounts(bs, &result, 0); + } +#endif + return ret; + + fail: + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + qcow2_free_snapshots(bs); + qcow2_refcount_close(bs); + qemu_vfree(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; + cache_clean_timer_del(bs); + if (s->l2_table_cache) { + qcow2_cache_destroy(bs, s->l2_table_cache); + } + if (s->refcount_block_cache) { + qcow2_cache_destroy(bs, s->refcount_block_cache); + } + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + return ret; +} + +static void qcow2_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->cluster_sectors; +} + +static int qcow2_set_key(BlockDriverState *bs, const char *key) +{ + BDRVQcow2State *s = bs->opaque; + uint8_t keybuf[16]; + int len, i; + Error *err = NULL; + + memset(keybuf, 0, 16); + len = strlen(key); + if (len > 16) + len = 16; + /* XXX: we could compress the chars to 7 bits to increase + entropy */ + for(i = 0;i < len;i++) { + keybuf[i] = key[i]; + } + assert(bs->encrypted); + + qcrypto_cipher_free(s->cipher); + s->cipher = qcrypto_cipher_new( + QCRYPTO_CIPHER_ALG_AES_128, + QCRYPTO_CIPHER_MODE_CBC, + keybuf, G_N_ELEMENTS(keybuf), + &err); + + if (!s->cipher) { + /* XXX would be nice if errors in this method could + * be properly propagate to the caller. Would need + * the bdrv_set_key() API signature to be fixed. */ + error_free(err); + return -1; + } + return 0; +} + +static int qcow2_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + Qcow2ReopenState *r; + int ret; + + r = g_new0(Qcow2ReopenState, 1); + state->opaque = r; + + ret = qcow2_update_options_prepare(state->bs, r, state->options, + state->flags, errp); + if (ret < 0) { + goto fail; + } + + /* We need to write out any unwritten data if we reopen read-only. */ + if ((state->flags & BDRV_O_RDWR) == 0) { + ret = bdrv_flush(state->bs); + if (ret < 0) { + goto fail; + } + + ret = qcow2_mark_clean(state->bs); + if (ret < 0) { + goto fail; + } + } + + return 0; + +fail: + qcow2_update_options_abort(state->bs, r); + g_free(r); + return ret; +} + +static void qcow2_reopen_commit(BDRVReopenState *state) +{ + qcow2_update_options_commit(state->bs, state->opaque); + g_free(state->opaque); +} + +static void qcow2_reopen_abort(BDRVReopenState *state) +{ + qcow2_update_options_abort(state->bs, state->opaque); + g_free(state->opaque); +} + +static int64_t coroutine_fn qcow2_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t cluster_offset; + int index_in_cluster, ret; + int64_t status = 0; + + *pnum = nb_sectors; + qemu_co_mutex_lock(&s->lock); + ret = qcow2_get_cluster_offset(bs, sector_num << 9, pnum, &cluster_offset); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + return ret; + } + + if (cluster_offset != 0 && ret != QCOW2_CLUSTER_COMPRESSED && + !s->cipher) { + index_in_cluster = sector_num & (s->cluster_sectors - 1); + cluster_offset |= (index_in_cluster << BDRV_SECTOR_BITS); + status |= BDRV_BLOCK_OFFSET_VALID | cluster_offset; + } + if (ret == QCOW2_CLUSTER_ZERO) { + status |= BDRV_BLOCK_ZERO; + } else if (ret != QCOW2_CLUSTER_UNALLOCATED) { + status |= BDRV_BLOCK_DATA; + } + return status; +} + +/* handle reading after the end of the backing file */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors) +{ + int n1; + if ((sector_num + nb_sectors) <= bs->total_sectors) + return nb_sectors; + if (sector_num >= bs->total_sectors) + n1 = 0; + else + n1 = bs->total_sectors - sector_num; + + qemu_iovec_memset(qiov, 512 * n1, 0, 512 * (nb_sectors - n1)); + + return n1; +} + +static coroutine_fn int qcow2_co_readv(BlockDriverState *bs, int64_t sector_num, + int remaining_sectors, QEMUIOVector *qiov) +{ + BDRVQcow2State *s = bs->opaque; + int index_in_cluster, n1; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset = 0; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + uint8_t *cluster_data = NULL; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + /* prepare next request */ + cur_nr_sectors = remaining_sectors; + if (s->cipher) { + cur_nr_sectors = MIN(cur_nr_sectors, + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + } + + ret = qcow2_get_cluster_offset(bs, sector_num << 9, + &cur_nr_sectors, &cluster_offset); + if (ret < 0) { + goto fail; + } + + index_in_cluster = sector_num & (s->cluster_sectors - 1); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + switch (ret) { + case QCOW2_CLUSTER_UNALLOCATED: + + if (bs->backing) { + /* read from the base image */ + n1 = qcow2_backing_read1(bs->backing->bs, &hd_qiov, + sector_num, cur_nr_sectors); + if (n1 > 0) { + QEMUIOVector local_qiov; + + qemu_iovec_init(&local_qiov, hd_qiov.niov); + qemu_iovec_concat(&local_qiov, &hd_qiov, 0, + n1 * BDRV_SECTOR_SIZE); + + BLKDBG_EVENT(bs->file, BLKDBG_READ_BACKING_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->backing->bs, sector_num, + n1, &local_qiov); + qemu_co_mutex_lock(&s->lock); + + qemu_iovec_destroy(&local_qiov); + + if (ret < 0) { + goto fail; + } + } + } else { + /* Note: in this case, no need to wait */ + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + } + break; + + case QCOW2_CLUSTER_ZERO: + qemu_iovec_memset(&hd_qiov, 0, 0, 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_COMPRESSED: + /* add AIO support for compressed blocks ? */ + ret = qcow2_decompress_cluster(bs, cluster_offset); + if (ret < 0) { + goto fail; + } + + qemu_iovec_from_buf(&hd_qiov, 0, + s->cluster_cache + index_in_cluster * 512, + 512 * cur_nr_sectors); + break; + + case QCOW2_CLUSTER_NORMAL: + if ((cluster_offset & 511) != 0) { + ret = -EIO; + goto fail; + } + + if (bs->encrypted) { + assert(s->cipher); + + /* + * For encrypted images, read everything into a temporary + * contiguous buffer on which the AES functions can work. + */ + if (!cluster_data) { + cluster_data = + qemu_try_blockalign(bs->file->bs, + QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); + if (cluster_data == NULL) { + ret = -ENOMEM; + goto fail; + } + } + + assert(cur_nr_sectors <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors); + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + 512 * cur_nr_sectors); + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file->bs, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + if (bs->encrypted) { + assert(s->cipher); + Error *err = NULL; + if (qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, false, + &err) < 0) { + error_free(err); + ret = -EIO; + goto fail; + } + qemu_iovec_from_buf(qiov, bytes_done, + cluster_data, 512 * cur_nr_sectors); + } + break; + + default: + g_assert_not_reached(); + ret = -EIO; + goto fail; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + + return ret; +} + +static coroutine_fn int qcow2_co_writev(BlockDriverState *bs, + int64_t sector_num, + int remaining_sectors, + QEMUIOVector *qiov) +{ + BDRVQcow2State *s = bs->opaque; + int index_in_cluster; + int ret; + int cur_nr_sectors; /* number of sectors in current iteration */ + uint64_t cluster_offset; + QEMUIOVector hd_qiov; + uint64_t bytes_done = 0; + uint8_t *cluster_data = NULL; + QCowL2Meta *l2meta = NULL; + + trace_qcow2_writev_start_req(qemu_coroutine_self(), sector_num, + remaining_sectors); + + qemu_iovec_init(&hd_qiov, qiov->niov); + + s->cluster_cache_offset = -1; /* disable compressed cache */ + + qemu_co_mutex_lock(&s->lock); + + while (remaining_sectors != 0) { + + l2meta = NULL; + + trace_qcow2_writev_start_part(qemu_coroutine_self()); + index_in_cluster = sector_num & (s->cluster_sectors - 1); + cur_nr_sectors = remaining_sectors; + if (bs->encrypted && + cur_nr_sectors > + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster) { + cur_nr_sectors = + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_sectors - index_in_cluster; + } + + ret = qcow2_alloc_cluster_offset(bs, sector_num << 9, + &cur_nr_sectors, &cluster_offset, &l2meta); + if (ret < 0) { + goto fail; + } + + assert((cluster_offset & 511) == 0); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + cur_nr_sectors * 512); + + if (bs->encrypted) { + Error *err = NULL; + assert(s->cipher); + if (!cluster_data) { + cluster_data = qemu_try_blockalign(bs->file->bs, + QCOW_MAX_CRYPT_CLUSTERS + * s->cluster_size); + if (cluster_data == NULL) { + ret = -ENOMEM; + goto fail; + } + } + + assert(hd_qiov.size <= + QCOW_MAX_CRYPT_CLUSTERS * s->cluster_size); + qemu_iovec_to_buf(&hd_qiov, 0, cluster_data, hd_qiov.size); + + if (qcow2_encrypt_sectors(s, sector_num, cluster_data, + cluster_data, cur_nr_sectors, + true, &err) < 0) { + error_free(err); + ret = -EIO; + goto fail; + } + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_add(&hd_qiov, cluster_data, + cur_nr_sectors * 512); + } + + ret = qcow2_pre_write_overlap_check(bs, 0, + cluster_offset + index_in_cluster * BDRV_SECTOR_SIZE, + cur_nr_sectors * BDRV_SECTOR_SIZE); + if (ret < 0) { + goto fail; + } + + qemu_co_mutex_unlock(&s->lock); + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + trace_qcow2_writev_data(qemu_coroutine_self(), + (cluster_offset >> 9) + index_in_cluster); + ret = bdrv_co_writev(bs->file->bs, + (cluster_offset >> 9) + index_in_cluster, + cur_nr_sectors, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto fail; + } + + while (l2meta != NULL) { + QCowL2Meta *next; + + ret = qcow2_alloc_cluster_link_l2(bs, l2meta); + if (ret < 0) { + goto fail; + } + + /* Take the request off the list of running requests */ + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + + remaining_sectors -= cur_nr_sectors; + sector_num += cur_nr_sectors; + bytes_done += cur_nr_sectors * 512; + trace_qcow2_writev_done_part(qemu_coroutine_self(), cur_nr_sectors); + } + ret = 0; + +fail: + qemu_co_mutex_unlock(&s->lock); + + while (l2meta != NULL) { + QCowL2Meta *next; + + if (l2meta->nb_clusters != 0) { + QLIST_REMOVE(l2meta, next_in_flight); + } + qemu_co_queue_restart_all(&l2meta->dependent_requests); + + next = l2meta->next; + g_free(l2meta); + l2meta = next; + } + + qemu_iovec_destroy(&hd_qiov); + qemu_vfree(cluster_data); + trace_qcow2_writev_done_req(qemu_coroutine_self(), ret); + + return ret; +} + +static void qcow2_close(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + qemu_vfree(s->l1_table); + /* else pre-write overlap checks in cache_destroy may crash */ + s->l1_table = NULL; + + if (!(bs->open_flags & BDRV_O_INCOMING)) { + int ret1, ret2; + + ret1 = qcow2_cache_flush(bs, s->l2_table_cache); + ret2 = qcow2_cache_flush(bs, s->refcount_block_cache); + + if (ret1) { + error_report("Failed to flush the L2 table cache: %s", + strerror(-ret1)); + } + if (ret2) { + error_report("Failed to flush the refcount block cache: %s", + strerror(-ret2)); + } + + if (!ret1 && !ret2) { + qcow2_mark_clean(bs); + } + } + + cache_clean_timer_del(bs); + qcow2_cache_destroy(bs, s->l2_table_cache); + qcow2_cache_destroy(bs, s->refcount_block_cache); + + qcrypto_cipher_free(s->cipher); + s->cipher = NULL; + + g_free(s->unknown_header_fields); + cleanup_unknown_header_ext(bs); + + g_free(s->image_backing_file); + g_free(s->image_backing_format); + + g_free(s->cluster_cache); + qemu_vfree(s->cluster_data); + qcow2_refcount_close(bs); + qcow2_free_snapshots(bs); +} + +static void qcow2_invalidate_cache(BlockDriverState *bs, Error **errp) +{ + BDRVQcow2State *s = bs->opaque; + int flags = s->flags; + QCryptoCipher *cipher = NULL; + QDict *options; + Error *local_err = NULL; + int ret; + + /* + * Backing files are read-only which makes all of their metadata immutable, + * that means we don't have to worry about reopening them here. + */ + + cipher = s->cipher; + s->cipher = NULL; + + qcow2_close(bs); + + bdrv_invalidate_cache(bs->file->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + memset(s, 0, sizeof(BDRVQcow2State)); + options = qdict_clone_shallow(bs->options); + + ret = qcow2_open(bs, options, flags, &local_err); + QDECREF(options); + if (local_err) { + error_setg(errp, "Could not reopen qcow2 layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qcow2 layer"); + return; + } + + s->cipher = cipher; +} + +static size_t header_ext_add(char *buf, uint32_t magic, const void *s, + size_t len, size_t buflen) +{ + QCowExtension *ext_backing_fmt = (QCowExtension*) buf; + size_t ext_len = sizeof(QCowExtension) + ((len + 7) & ~7); + + if (buflen < ext_len) { + return -ENOSPC; + } + + *ext_backing_fmt = (QCowExtension) { + .magic = cpu_to_be32(magic), + .len = cpu_to_be32(len), + }; + memcpy(buf + sizeof(QCowExtension), s, len); + + return ext_len; +} + +/* + * Updates the qcow2 header, including the variable length parts of it, i.e. + * the backing file name and all extensions. qcow2 was not designed to allow + * such changes, so if we run out of space (we can only use the first cluster) + * this function may fail. + * + * Returns 0 on success, -errno in error cases. + */ +int qcow2_update_header(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + QCowHeader *header; + char *buf; + size_t buflen = s->cluster_size; + int ret; + uint64_t total_size; + uint32_t refcount_table_clusters; + size_t header_length; + Qcow2UnknownHeaderExtension *uext; + + buf = qemu_blockalign(bs, buflen); + + /* Header structure */ + header = (QCowHeader*) buf; + + if (buflen < sizeof(*header)) { + ret = -ENOSPC; + goto fail; + } + + header_length = sizeof(*header) + s->unknown_header_fields_size; + total_size = bs->total_sectors * BDRV_SECTOR_SIZE; + refcount_table_clusters = s->refcount_table_size >> (s->cluster_bits - 3); + + *header = (QCowHeader) { + /* Version 2 fields */ + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(s->qcow_version), + .backing_file_offset = 0, + .backing_file_size = 0, + .cluster_bits = cpu_to_be32(s->cluster_bits), + .size = cpu_to_be64(total_size), + .crypt_method = cpu_to_be32(s->crypt_method_header), + .l1_size = cpu_to_be32(s->l1_size), + .l1_table_offset = cpu_to_be64(s->l1_table_offset), + .refcount_table_offset = cpu_to_be64(s->refcount_table_offset), + .refcount_table_clusters = cpu_to_be32(refcount_table_clusters), + .nb_snapshots = cpu_to_be32(s->nb_snapshots), + .snapshots_offset = cpu_to_be64(s->snapshots_offset), + + /* Version 3 fields */ + .incompatible_features = cpu_to_be64(s->incompatible_features), + .compatible_features = cpu_to_be64(s->compatible_features), + .autoclear_features = cpu_to_be64(s->autoclear_features), + .refcount_order = cpu_to_be32(s->refcount_order), + .header_length = cpu_to_be32(header_length), + }; + + /* For older versions, write a shorter header */ + switch (s->qcow_version) { + case 2: + ret = offsetof(QCowHeader, incompatible_features); + break; + case 3: + ret = sizeof(*header); + break; + default: + ret = -EINVAL; + goto fail; + } + + buf += ret; + buflen -= ret; + memset(buf, 0, buflen); + + /* Preserve any unknown field in the header */ + if (s->unknown_header_fields_size) { + if (buflen < s->unknown_header_fields_size) { + ret = -ENOSPC; + goto fail; + } + + memcpy(buf, s->unknown_header_fields, s->unknown_header_fields_size); + buf += s->unknown_header_fields_size; + buflen -= s->unknown_header_fields_size; + } + + /* Backing file format header extension */ + if (s->image_backing_format) { + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_BACKING_FORMAT, + s->image_backing_format, + strlen(s->image_backing_format), + buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* Feature table */ + Qcow2Feature features[] = { + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_DIRTY_BITNR, + .name = "dirty bit", + }, + { + .type = QCOW2_FEAT_TYPE_INCOMPATIBLE, + .bit = QCOW2_INCOMPAT_CORRUPT_BITNR, + .name = "corrupt bit", + }, + { + .type = QCOW2_FEAT_TYPE_COMPATIBLE, + .bit = QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + .name = "lazy refcounts", + }, + }; + + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_FEATURE_TABLE, + features, sizeof(features), buflen); + if (ret < 0) { + goto fail; + } + buf += ret; + buflen -= ret; + + /* Keep unknown header extensions */ + QLIST_FOREACH(uext, &s->unknown_header_ext, next) { + ret = header_ext_add(buf, uext->magic, uext->data, uext->len, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + } + + /* End of header extensions */ + ret = header_ext_add(buf, QCOW2_EXT_MAGIC_END, NULL, 0, buflen); + if (ret < 0) { + goto fail; + } + + buf += ret; + buflen -= ret; + + /* Backing file name */ + if (s->image_backing_file) { + size_t backing_file_len = strlen(s->image_backing_file); + + if (buflen < backing_file_len) { + ret = -ENOSPC; + goto fail; + } + + /* Using strncpy is ok here, since buf is not NUL-terminated. */ + strncpy(buf, s->image_backing_file, buflen); + + header->backing_file_offset = cpu_to_be64(buf - ((char*) header)); + header->backing_file_size = cpu_to_be32(backing_file_len); + } + + /* Write the new header */ + ret = bdrv_pwrite(bs->file->bs, 0, header, s->cluster_size); + if (ret < 0) { + goto fail; + } + + ret = 0; +fail: + qemu_vfree(header); + return ret; +} + +static int qcow2_change_backing_file(BlockDriverState *bs, + const char *backing_file, const char *backing_fmt) +{ + BDRVQcow2State *s = bs->opaque; + + pstrcpy(bs->backing_file, sizeof(bs->backing_file), backing_file ?: ""); + pstrcpy(bs->backing_format, sizeof(bs->backing_format), backing_fmt ?: ""); + + g_free(s->image_backing_file); + g_free(s->image_backing_format); + + s->image_backing_file = backing_file ? g_strdup(bs->backing_file) : NULL; + s->image_backing_format = backing_fmt ? g_strdup(bs->backing_format) : NULL; + + return qcow2_update_header(bs); +} + +static int preallocate(BlockDriverState *bs) +{ + uint64_t nb_sectors; + uint64_t offset; + uint64_t host_offset = 0; + int num; + int ret; + QCowL2Meta *meta; + + nb_sectors = bdrv_nb_sectors(bs); + offset = 0; + + while (nb_sectors) { + num = MIN(nb_sectors, INT_MAX >> BDRV_SECTOR_BITS); + ret = qcow2_alloc_cluster_offset(bs, offset, &num, + &host_offset, &meta); + if (ret < 0) { + return ret; + } + + while (meta) { + QCowL2Meta *next = meta->next; + + ret = qcow2_alloc_cluster_link_l2(bs, meta); + if (ret < 0) { + qcow2_free_any_clusters(bs, meta->alloc_offset, + meta->nb_clusters, QCOW2_DISCARD_NEVER); + return ret; + } + + /* There are no dependent requests, but we need to remove our + * request from the list of in-flight requests */ + QLIST_REMOVE(meta, next_in_flight); + + g_free(meta); + meta = next; + } + + /* TODO Preallocate data if requested */ + + nb_sectors -= num; + offset += num << BDRV_SECTOR_BITS; + } + + /* + * It is expected that the image file is large enough to actually contain + * all of the allocated clusters (otherwise we get failing reads after + * EOF). Extend the image to the last allocated sector. + */ + if (host_offset != 0) { + uint8_t buf[BDRV_SECTOR_SIZE]; + memset(buf, 0, BDRV_SECTOR_SIZE); + ret = bdrv_write(bs->file->bs, + (host_offset >> BDRV_SECTOR_BITS) + num - 1, + buf, 1); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +static int qcow2_create2(const char *filename, int64_t total_size, + const char *backing_file, const char *backing_format, + int flags, size_t cluster_size, PreallocMode prealloc, + QemuOpts *opts, int version, int refcount_order, + Error **errp) +{ + int cluster_bits; + QDict *options; + + /* Calculate cluster_bits */ + cluster_bits = ctz32(cluster_size); + if (cluster_bits < MIN_CLUSTER_BITS || cluster_bits > MAX_CLUSTER_BITS || + (1 << cluster_bits) != cluster_size) + { + error_setg(errp, "Cluster size must be a power of two between %d and " + "%dk", 1 << MIN_CLUSTER_BITS, 1 << (MAX_CLUSTER_BITS - 10)); + return -EINVAL; + } + + /* + * Open the image file and write a minimal qcow2 header. + * + * We keep things simple and start with a zero-sized image. We also + * do without refcount blocks or a L1 table for now. We'll fix the + * inconsistency later. + * + * We do need a refcount table because growing the refcount table means + * allocating two new refcount blocks - the seconds of which would be at + * 2 GB for 64k clusters, and we don't want to have a 2 GB initial file + * size for any qcow2 image. + */ + BlockDriverState* bs; + QCowHeader *header; + uint64_t* refcount_table; + Error *local_err = NULL; + int ret; + + if (prealloc == PREALLOC_MODE_FULL || prealloc == PREALLOC_MODE_FALLOC) { + /* Note: The following calculation does not need to be exact; if it is a + * bit off, either some bytes will be "leaked" (which is fine) or we + * will need to increase the file size by some bytes (which is fine, + * too, as long as the bulk is allocated here). Therefore, using + * floating point arithmetic is fine. */ + int64_t meta_size = 0; + uint64_t nreftablee, nrefblocke, nl1e, nl2e; + int64_t aligned_total_size = align_offset(total_size, cluster_size); + int refblock_bits, refblock_size; + /* refcount entry size in bytes */ + double rces = (1 << refcount_order) / 8.; + + /* see qcow2_open() */ + refblock_bits = cluster_bits - (refcount_order - 3); + refblock_size = 1 << refblock_bits; + + /* header: 1 cluster */ + meta_size += cluster_size; + + /* total size of L2 tables */ + nl2e = aligned_total_size / cluster_size; + nl2e = align_offset(nl2e, cluster_size / sizeof(uint64_t)); + meta_size += nl2e * sizeof(uint64_t); + + /* total size of L1 tables */ + nl1e = nl2e * sizeof(uint64_t) / cluster_size; + nl1e = align_offset(nl1e, cluster_size / sizeof(uint64_t)); + meta_size += nl1e * sizeof(uint64_t); + + /* total size of refcount blocks + * + * note: every host cluster is reference-counted, including metadata + * (even refcount blocks are recursively included). + * Let: + * a = total_size (this is the guest disk size) + * m = meta size not including refcount blocks and refcount tables + * c = cluster size + * y1 = number of refcount blocks entries + * y2 = meta size including everything + * rces = refcount entry size in bytes + * then, + * y1 = (y2 + a)/c + * y2 = y1 * rces + y1 * rces * sizeof(u64) / c + m + * we can get y1: + * y1 = (a + m) / (c - rces - rces * sizeof(u64) / c) + */ + nrefblocke = (aligned_total_size + meta_size + cluster_size) + / (cluster_size - rces - rces * sizeof(uint64_t) + / cluster_size); + meta_size += DIV_ROUND_UP(nrefblocke, refblock_size) * cluster_size; + + /* total size of refcount tables */ + nreftablee = nrefblocke / refblock_size; + nreftablee = align_offset(nreftablee, cluster_size / sizeof(uint64_t)); + meta_size += nreftablee * sizeof(uint64_t); + + qemu_opt_set_number(opts, BLOCK_OPT_SIZE, + aligned_total_size + meta_size, &error_abort); + qemu_opt_set(opts, BLOCK_OPT_PREALLOC, PreallocMode_lookup[prealloc], + &error_abort); + } + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + /* Write the header */ + QEMU_BUILD_BUG_ON((1 << MIN_CLUSTER_BITS) < sizeof(*header)); + header = g_malloc0(cluster_size); + *header = (QCowHeader) { + .magic = cpu_to_be32(QCOW_MAGIC), + .version = cpu_to_be32(version), + .cluster_bits = cpu_to_be32(cluster_bits), + .size = cpu_to_be64(0), + .l1_table_offset = cpu_to_be64(0), + .l1_size = cpu_to_be32(0), + .refcount_table_offset = cpu_to_be64(cluster_size), + .refcount_table_clusters = cpu_to_be32(1), + .refcount_order = cpu_to_be32(refcount_order), + .header_length = cpu_to_be32(sizeof(*header)), + }; + + if (flags & BLOCK_FLAG_ENCRYPT) { + header->crypt_method = cpu_to_be32(QCOW_CRYPT_AES); + } else { + header->crypt_method = cpu_to_be32(QCOW_CRYPT_NONE); + } + + if (flags & BLOCK_FLAG_LAZY_REFCOUNTS) { + header->compatible_features |= + cpu_to_be64(QCOW2_COMPAT_LAZY_REFCOUNTS); + } + + ret = bdrv_pwrite(bs, 0, header, cluster_size); + g_free(header); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write qcow2 header"); + goto out; + } + + /* Write a refcount table with one refcount block */ + refcount_table = g_malloc0(2 * cluster_size); + refcount_table[0] = cpu_to_be64(2 * cluster_size); + ret = bdrv_pwrite(bs, cluster_size, refcount_table, 2 * cluster_size); + g_free(refcount_table); + + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write refcount table"); + goto out; + } + + bdrv_unref(bs); + bs = NULL; + + /* + * And now open the image and make it consistent first (i.e. increase the + * refcount of the cluster that is occupied by the header and the refcount + * table) + */ + options = qdict_new(); + qdict_put(options, "driver", qstring_from_str("qcow2")); + ret = bdrv_open(&bs, filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; + } + + ret = qcow2_alloc_clusters(bs, 3 * cluster_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not allocate clusters for qcow2 " + "header and refcount table"); + goto out; + + } else if (ret != 0) { + error_report("Huh, first cluster in empty image is already in use?"); + abort(); + } + + /* Okay, now that we have a valid image, let's give it the right size */ + ret = bdrv_truncate(bs, total_size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not resize image"); + goto out; + } + + /* Want a backing file? There you go.*/ + if (backing_file) { + ret = bdrv_change_backing_file(bs, backing_file, backing_format); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not assign backing file '%s' " + "with format '%s'", backing_file, backing_format); + goto out; + } + } + + /* And if we're supposed to preallocate metadata, do that now */ + if (prealloc != PREALLOC_MODE_OFF) { + BDRVQcow2State *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = preallocate(bs); + qemu_co_mutex_unlock(&s->lock); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not preallocate metadata"); + goto out; + } + } + + bdrv_unref(bs); + bs = NULL; + + /* Reopen the image without BDRV_O_NO_FLUSH to flush it before returning */ + options = qdict_new(); + qdict_put(options, "driver", qstring_from_str("qcow2")); + ret = bdrv_open(&bs, filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_BACKING, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + goto out; + } + + ret = 0; +out: + if (bs) { + bdrv_unref(bs); + } + return ret; +} + +static int qcow2_create(const char *filename, QemuOpts *opts, Error **errp) +{ + char *backing_file = NULL; + char *backing_fmt = NULL; + char *buf = NULL; + uint64_t size = 0; + int flags = 0; + size_t cluster_size = DEFAULT_CLUSTER_SIZE; + PreallocMode prealloc; + int version = 3; + uint64_t refcount_bits = 16; + int refcount_order; + Error *local_err = NULL; + int ret; + + /* Read out options */ + size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ENCRYPT, false)) { + flags |= BLOCK_FLAG_ENCRYPT; + } + cluster_size = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + prealloc = qapi_enum_parse(PreallocMode_lookup, buf, + PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto finish; + } + g_free(buf); + buf = qemu_opt_get_del(opts, BLOCK_OPT_COMPAT_LEVEL); + if (!buf) { + /* keep the default */ + } else if (!strcmp(buf, "0.10")) { + version = 2; + } else if (!strcmp(buf, "1.1")) { + version = 3; + } else { + error_setg(errp, "Invalid compatibility level: '%s'", buf); + ret = -EINVAL; + goto finish; + } + + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_LAZY_REFCOUNTS, false)) { + flags |= BLOCK_FLAG_LAZY_REFCOUNTS; + } + + if (backing_file && prealloc != PREALLOC_MODE_OFF) { + error_setg(errp, "Backing file and preallocation cannot be used at " + "the same time"); + ret = -EINVAL; + goto finish; + } + + if (version < 3 && (flags & BLOCK_FLAG_LAZY_REFCOUNTS)) { + error_setg(errp, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)"); + ret = -EINVAL; + goto finish; + } + + refcount_bits = qemu_opt_get_number_del(opts, BLOCK_OPT_REFCOUNT_BITS, + refcount_bits); + if (refcount_bits > 64 || !is_power_of_2(refcount_bits)) { + error_setg(errp, "Refcount width must be a power of two and may not " + "exceed 64 bits"); + ret = -EINVAL; + goto finish; + } + + if (version < 3 && refcount_bits != 16) { + error_setg(errp, "Different refcount widths than 16 bits require " + "compatibility level 1.1 or above (use compat=1.1 or " + "greater)"); + ret = -EINVAL; + goto finish; + } + + refcount_order = ctz32(refcount_bits); + + ret = qcow2_create2(filename, size, backing_file, backing_fmt, flags, + cluster_size, prealloc, opts, version, refcount_order, + &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + +finish: + g_free(backing_file); + g_free(backing_fmt); + g_free(buf); + return ret; +} + +static coroutine_fn int qcow2_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + + /* Emulate misaligned zero writes */ + if (sector_num % s->cluster_sectors || nb_sectors % s->cluster_sectors) { + return -ENOTSUP; + } + + /* Whatever is left can use real zero clusters */ + qemu_co_mutex_lock(&s->lock); + ret = qcow2_zero_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static coroutine_fn int qcow2_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + int ret; + BDRVQcow2State *s = bs->opaque; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_discard_clusters(bs, sector_num << BDRV_SECTOR_BITS, + nb_sectors, QCOW2_DISCARD_REQUEST, false); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int qcow2_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQcow2State *s = bs->opaque; + int64_t new_l1_size; + int ret; + + if (offset & 511) { + error_report("The new size must be a multiple of 512"); + return -EINVAL; + } + + /* cannot proceed if image has snapshots */ + if (s->nb_snapshots) { + error_report("Can't resize an image which has snapshots"); + return -ENOTSUP; + } + + /* shrinking is currently not supported */ + if (offset < bs->total_sectors * 512) { + error_report("qcow2 doesn't support shrinking images yet"); + return -ENOTSUP; + } + + new_l1_size = size_to_l1(s, offset); + ret = qcow2_grow_l1_table(bs, new_l1_size, true); + if (ret < 0) { + return ret; + } + + /* write updated header.size */ + offset = cpu_to_be64(offset); + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, size), + &offset, sizeof(uint64_t)); + if (ret < 0) { + return ret; + } + + s->l1_vm_state_index = new_l1_size; + return 0; +} + +/* XXX: put compressed sectors first, then all the cluster aligned + tables to avoid losing bytes in alignment */ +static int qcow2_write_compressed(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVQcow2State *s = bs->opaque; + z_stream strm; + int ret, out_len; + uint8_t *out_buf; + uint64_t cluster_offset; + + if (nb_sectors == 0) { + /* align end of file to a sector boundary to ease reading with + sector based I/Os */ + cluster_offset = bdrv_getlength(bs->file->bs); + return bdrv_truncate(bs->file->bs, cluster_offset); + } + + if (nb_sectors != s->cluster_sectors) { + ret = -EINVAL; + + /* Zero-pad last write if image size is not cluster aligned */ + if (sector_num + nb_sectors == bs->total_sectors && + nb_sectors < s->cluster_sectors) { + uint8_t *pad_buf = qemu_blockalign(bs, s->cluster_size); + memset(pad_buf, 0, s->cluster_size); + memcpy(pad_buf, buf, nb_sectors * BDRV_SECTOR_SIZE); + ret = qcow2_write_compressed(bs, sector_num, + pad_buf, s->cluster_sectors); + qemu_vfree(pad_buf); + } + return ret; + } + + out_buf = g_malloc(s->cluster_size + (s->cluster_size / 1000) + 128); + + /* best compression, small window, no zlib header */ + memset(&strm, 0, sizeof(strm)); + ret = deflateInit2(&strm, Z_DEFAULT_COMPRESSION, + Z_DEFLATED, -12, + 9, Z_DEFAULT_STRATEGY); + if (ret != 0) { + ret = -EINVAL; + goto fail; + } + + strm.avail_in = s->cluster_size; + strm.next_in = (uint8_t *)buf; + strm.avail_out = s->cluster_size; + strm.next_out = out_buf; + + ret = deflate(&strm, Z_FINISH); + if (ret != Z_STREAM_END && ret != Z_OK) { + deflateEnd(&strm); + ret = -EINVAL; + goto fail; + } + out_len = strm.next_out - out_buf; + + deflateEnd(&strm); + + if (ret != Z_STREAM_END || out_len >= s->cluster_size) { + /* could not compress: write normal cluster */ + ret = bdrv_write(bs, sector_num, buf, s->cluster_sectors); + if (ret < 0) { + goto fail; + } + } else { + cluster_offset = qcow2_alloc_compressed_cluster_offset(bs, + sector_num << 9, out_len); + if (!cluster_offset) { + ret = -EIO; + goto fail; + } + cluster_offset &= s->cluster_offset_mask; + + ret = qcow2_pre_write_overlap_check(bs, 0, cluster_offset, out_len); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_COMPRESSED); + ret = bdrv_pwrite(bs->file->bs, cluster_offset, out_buf, out_len); + if (ret < 0) { + goto fail; + } + } + + ret = 0; +fail: + g_free(out_buf); + return ret; +} + +static int make_completely_empty(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + int ret, l1_clusters; + int64_t offset; + uint64_t *new_reftable = NULL; + uint64_t rt_entry, l1_size2; + struct { + uint64_t l1_offset; + uint64_t reftable_offset; + uint32_t reftable_clusters; + } QEMU_PACKED l1_ofs_rt_ofs_cls; + + ret = qcow2_cache_empty(bs, s->l2_table_cache); + if (ret < 0) { + goto fail; + } + + ret = qcow2_cache_empty(bs, s->refcount_block_cache); + if (ret < 0) { + goto fail; + } + + /* Refcounts will be broken utterly */ + ret = qcow2_mark_dirty(bs); + if (ret < 0) { + goto fail; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + + l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + l1_size2 = (uint64_t)s->l1_size * sizeof(uint64_t); + + /* After this call, neither the in-memory nor the on-disk refcount + * information accurately describe the actual references */ + + ret = bdrv_write_zeroes(bs->file->bs, s->l1_table_offset / BDRV_SECTOR_SIZE, + l1_clusters * s->cluster_sectors, 0); + if (ret < 0) { + goto fail_broken_refcounts; + } + memset(s->l1_table, 0, l1_size2); + + BLKDBG_EVENT(bs->file, BLKDBG_EMPTY_IMAGE_PREPARE); + + /* Overwrite enough clusters at the beginning of the sectors to place + * the refcount table, a refcount block and the L1 table in; this may + * overwrite parts of the existing refcount and L1 table, which is not + * an issue because the dirty flag is set, complete data loss is in fact + * desired and partial data loss is consequently fine as well */ + ret = bdrv_write_zeroes(bs->file->bs, s->cluster_size / BDRV_SECTOR_SIZE, + (2 + l1_clusters) * s->cluster_size / + BDRV_SECTOR_SIZE, 0); + /* This call (even if it failed overall) may have overwritten on-disk + * refcount structures; in that case, the in-memory refcount information + * will probably differ from the on-disk information which makes the BDS + * unusable */ + if (ret < 0) { + goto fail_broken_refcounts; + } + + BLKDBG_EVENT(bs->file, BLKDBG_L1_UPDATE); + BLKDBG_EVENT(bs->file, BLKDBG_REFTABLE_UPDATE); + + /* "Create" an empty reftable (one cluster) directly after the image + * header and an empty L1 table three clusters after the image header; + * the cluster between those two will be used as the first refblock */ + cpu_to_be64w(&l1_ofs_rt_ofs_cls.l1_offset, 3 * s->cluster_size); + cpu_to_be64w(&l1_ofs_rt_ofs_cls.reftable_offset, s->cluster_size); + cpu_to_be32w(&l1_ofs_rt_ofs_cls.reftable_clusters, 1); + ret = bdrv_pwrite_sync(bs->file->bs, offsetof(QCowHeader, l1_table_offset), + &l1_ofs_rt_ofs_cls, sizeof(l1_ofs_rt_ofs_cls)); + if (ret < 0) { + goto fail_broken_refcounts; + } + + s->l1_table_offset = 3 * s->cluster_size; + + new_reftable = g_try_new0(uint64_t, s->cluster_size / sizeof(uint64_t)); + if (!new_reftable) { + ret = -ENOMEM; + goto fail_broken_refcounts; + } + + s->refcount_table_offset = s->cluster_size; + s->refcount_table_size = s->cluster_size / sizeof(uint64_t); + + g_free(s->refcount_table); + s->refcount_table = new_reftable; + new_reftable = NULL; + + /* Now the in-memory refcount information again corresponds to the on-disk + * information (reftable is empty and no refblocks (the refblock cache is + * empty)); however, this means some clusters (e.g. the image header) are + * referenced, but not refcounted, but the normal qcow2 code assumes that + * the in-memory information is always correct */ + + BLKDBG_EVENT(bs->file, BLKDBG_REFBLOCK_ALLOC); + + /* Enter the first refblock into the reftable */ + rt_entry = cpu_to_be64(2 * s->cluster_size); + ret = bdrv_pwrite_sync(bs->file->bs, s->cluster_size, + &rt_entry, sizeof(rt_entry)); + if (ret < 0) { + goto fail_broken_refcounts; + } + s->refcount_table[0] = 2 * s->cluster_size; + + s->free_cluster_index = 0; + assert(3 + l1_clusters <= s->refcount_block_size); + offset = qcow2_alloc_clusters(bs, 3 * s->cluster_size + l1_size2); + if (offset < 0) { + ret = offset; + goto fail_broken_refcounts; + } else if (offset > 0) { + error_report("First cluster in emptied image is in use"); + abort(); + } + + /* Now finally the in-memory information corresponds to the on-disk + * structures and is correct */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + goto fail; + } + + ret = bdrv_truncate(bs->file->bs, (3 + l1_clusters) * s->cluster_size); + if (ret < 0) { + goto fail; + } + + return 0; + +fail_broken_refcounts: + /* The BDS is unusable at this point. If we wanted to make it usable, we + * would have to call qcow2_refcount_close(), qcow2_refcount_init(), + * qcow2_check_refcounts(), qcow2_refcount_close() and qcow2_refcount_init() + * again. However, because the functions which could have caused this error + * path to be taken are used by those functions as well, it's very likely + * that that sequence will fail as well. Therefore, just eject the BDS. */ + bs->drv = NULL; + +fail: + g_free(new_reftable); + return ret; +} + +static int qcow2_make_empty(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + uint64_t start_sector; + int sector_step = INT_MAX / BDRV_SECTOR_SIZE; + int l1_clusters, ret = 0; + + l1_clusters = DIV_ROUND_UP(s->l1_size, s->cluster_size / sizeof(uint64_t)); + + if (s->qcow_version >= 3 && !s->snapshots && + 3 + l1_clusters <= s->refcount_block_size) { + /* The following function only works for qcow2 v3 images (it requires + * the dirty flag) and only as long as there are no snapshots (because + * it completely empties the image). Furthermore, the L1 table and three + * additional clusters (image header, refcount table, one refcount + * block) have to fit inside one refcount block. */ + return make_completely_empty(bs); + } + + /* This fallback code simply discards every active cluster; this is slow, + * but works in all cases */ + for (start_sector = 0; start_sector < bs->total_sectors; + start_sector += sector_step) + { + /* As this function is generally used after committing an external + * snapshot, QCOW2_DISCARD_SNAPSHOT seems appropriate. Also, the + * default action for this kind of discard is to pass the discard, + * which will ideally result in an actually smaller image file, as + * is probably desired. */ + ret = qcow2_discard_clusters(bs, start_sector * BDRV_SECTOR_SIZE, + MIN(sector_step, + bs->total_sectors - start_sector), + QCOW2_DISCARD_SNAPSHOT, true); + if (ret < 0) { + break; + } + } + + return ret; +} + +static coroutine_fn int qcow2_co_flush_to_os(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = qcow2_cache_flush(bs, s->l2_table_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + + if (qcow2_need_accurate_refcounts(s)) { + ret = qcow2_cache_flush(bs, s->refcount_block_cache); + if (ret < 0) { + qemu_co_mutex_unlock(&s->lock); + return ret; + } + } + qemu_co_mutex_unlock(&s->lock); + + return 0; +} + +static int qcow2_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQcow2State *s = bs->opaque; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = (s->qcow_version >= 3); + bdi->cluster_size = s->cluster_size; + bdi->vm_state_offset = qcow2_vm_state_offset(s); + return 0; +} + +static ImageInfoSpecific *qcow2_get_specific_info(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new(ImageInfoSpecific, 1); + + *spec_info = (ImageInfoSpecific){ + .type = IMAGE_INFO_SPECIFIC_KIND_QCOW2, + .u.qcow2 = g_new(ImageInfoSpecificQCow2, 1), + }; + if (s->qcow_version == 2) { + *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("0.10"), + .refcount_bits = s->refcount_bits, + }; + } else if (s->qcow_version == 3) { + *spec_info->u.qcow2 = (ImageInfoSpecificQCow2){ + .compat = g_strdup("1.1"), + .lazy_refcounts = s->compatible_features & + QCOW2_COMPAT_LAZY_REFCOUNTS, + .has_lazy_refcounts = true, + .corrupt = s->incompatible_features & + QCOW2_INCOMPAT_CORRUPT, + .has_corrupt = true, + .refcount_bits = s->refcount_bits, + }; + } + + return spec_info; +} + +#if 0 +static void dump_refcounts(BlockDriverState *bs) +{ + BDRVQcow2State *s = bs->opaque; + int64_t nb_clusters, k, k1, size; + int refcount; + + size = bdrv_getlength(bs->file->bs); + nb_clusters = size_to_clusters(s, size); + for(k = 0; k < nb_clusters;) { + k1 = k; + refcount = get_refcount(bs, k); + k++; + while (k < nb_clusters && get_refcount(bs, k) == refcount) + k++; + printf("%" PRId64 ": refcount=%d nb=%" PRId64 "\n", k, refcount, + k - k1); + } +} +#endif + +static int qcow2_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) +{ + BDRVQcow2State *s = bs->opaque; + int64_t total_sectors = bs->total_sectors; + bool zero_beyond_eof = bs->zero_beyond_eof; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_SAVE); + bs->zero_beyond_eof = false; + ret = bdrv_pwritev(bs, qcow2_vm_state_offset(s) + pos, qiov); + bs->zero_beyond_eof = zero_beyond_eof; + + /* bdrv_co_do_writev will have increased the total_sectors value to include + * the VM state - the VM state is however not an actual part of the block + * device, therefore, we need to restore the old value. */ + bs->total_sectors = total_sectors; + + return ret; +} + +static int qcow2_load_vmstate(BlockDriverState *bs, uint8_t *buf, + int64_t pos, int size) +{ + BDRVQcow2State *s = bs->opaque; + bool zero_beyond_eof = bs->zero_beyond_eof; + int ret; + + BLKDBG_EVENT(bs->file, BLKDBG_VMSTATE_LOAD); + bs->zero_beyond_eof = false; + ret = bdrv_pread(bs, qcow2_vm_state_offset(s) + pos, buf, size); + bs->zero_beyond_eof = zero_beyond_eof; + + return ret; +} + +/* + * Downgrades an image's version. To achieve this, any incompatible features + * have to be removed. + */ +static int qcow2_downgrade(BlockDriverState *bs, int target_version, + BlockDriverAmendStatusCB *status_cb) +{ + BDRVQcow2State *s = bs->opaque; + int current_version = s->qcow_version; + int ret; + + if (target_version == current_version) { + return 0; + } else if (target_version > current_version) { + return -EINVAL; + } else if (target_version != 2) { + return -EINVAL; + } + + if (s->refcount_order != 4) { + /* we would have to convert the image to a refcount_order == 4 image + * here; however, since qemu (at the time of writing this) does not + * support anything different than 4 anyway, there is no point in doing + * so right now; however, we should error out (if qemu supports this in + * the future and this code has not been adapted) */ + error_report("qcow2_downgrade: Image refcount orders other than 4 are " + "currently not supported."); + return -ENOTSUP; + } + + /* clear incompatible features */ + if (s->incompatible_features & QCOW2_INCOMPAT_DIRTY) { + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + } + + /* with QCOW2_INCOMPAT_CORRUPT, it is pretty much impossible to get here in + * the first place; if that happens nonetheless, returning -ENOTSUP is the + * best thing to do anyway */ + + if (s->incompatible_features) { + return -ENOTSUP; + } + + /* since we can ignore compatible features, we can set them to 0 as well */ + s->compatible_features = 0; + /* if lazy refcounts have been used, they have already been fixed through + * clearing the dirty flag */ + + /* clearing autoclear features is trivial */ + s->autoclear_features = 0; + + ret = qcow2_expand_zero_clusters(bs, status_cb); + if (ret < 0) { + return ret; + } + + s->qcow_version = target_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = current_version; + return ret; + } + return 0; +} + +static int qcow2_amend_options(BlockDriverState *bs, QemuOpts *opts, + BlockDriverAmendStatusCB *status_cb) +{ + BDRVQcow2State *s = bs->opaque; + int old_version = s->qcow_version, new_version = old_version; + uint64_t new_size = 0; + const char *backing_file = NULL, *backing_format = NULL; + bool lazy_refcounts = s->use_lazy_refcounts; + const char *compat = NULL; + uint64_t cluster_size = s->cluster_size; + bool encrypt; + int ret; + QemuOptDesc *desc = opts->list->desc; + + while (desc && desc->name) { + if (!qemu_opt_find(opts, desc->name)) { + /* only change explicitly defined options */ + desc++; + continue; + } + + if (!strcmp(desc->name, BLOCK_OPT_COMPAT_LEVEL)) { + compat = qemu_opt_get(opts, BLOCK_OPT_COMPAT_LEVEL); + if (!compat) { + /* preserve default */ + } else if (!strcmp(compat, "0.10")) { + new_version = 2; + } else if (!strcmp(compat, "1.1")) { + new_version = 3; + } else { + fprintf(stderr, "Unknown compatibility level %s.\n", compat); + return -EINVAL; + } + } else if (!strcmp(desc->name, BLOCK_OPT_PREALLOC)) { + fprintf(stderr, "Cannot change preallocation mode.\n"); + return -ENOTSUP; + } else if (!strcmp(desc->name, BLOCK_OPT_SIZE)) { + new_size = qemu_opt_get_size(opts, BLOCK_OPT_SIZE, 0); + } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FILE)) { + backing_file = qemu_opt_get(opts, BLOCK_OPT_BACKING_FILE); + } else if (!strcmp(desc->name, BLOCK_OPT_BACKING_FMT)) { + backing_format = qemu_opt_get(opts, BLOCK_OPT_BACKING_FMT); + } else if (!strcmp(desc->name, BLOCK_OPT_ENCRYPT)) { + encrypt = qemu_opt_get_bool(opts, BLOCK_OPT_ENCRYPT, + !!s->cipher); + + if (encrypt != !!s->cipher) { + fprintf(stderr, "Changing the encryption flag is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(desc->name, BLOCK_OPT_CLUSTER_SIZE)) { + cluster_size = qemu_opt_get_size(opts, BLOCK_OPT_CLUSTER_SIZE, + cluster_size); + if (cluster_size != s->cluster_size) { + fprintf(stderr, "Changing the cluster size is not " + "supported.\n"); + return -ENOTSUP; + } + } else if (!strcmp(desc->name, BLOCK_OPT_LAZY_REFCOUNTS)) { + lazy_refcounts = qemu_opt_get_bool(opts, BLOCK_OPT_LAZY_REFCOUNTS, + lazy_refcounts); + } else if (!strcmp(desc->name, BLOCK_OPT_REFCOUNT_BITS)) { + error_report("Cannot change refcount entry width"); + return -ENOTSUP; + } else { + /* if this assertion fails, this probably means a new option was + * added without having it covered here */ + assert(false); + } + + desc++; + } + + if (new_version != old_version) { + if (new_version > old_version) { + /* Upgrade */ + s->qcow_version = new_version; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->qcow_version = old_version; + return ret; + } + } else { + ret = qcow2_downgrade(bs, new_version, status_cb); + if (ret < 0) { + return ret; + } + } + } + + if (backing_file || backing_format) { + ret = qcow2_change_backing_file(bs, + backing_file ?: s->image_backing_file, + backing_format ?: s->image_backing_format); + if (ret < 0) { + return ret; + } + } + + if (s->use_lazy_refcounts != lazy_refcounts) { + if (lazy_refcounts) { + if (s->qcow_version < 3) { + fprintf(stderr, "Lazy refcounts only supported with compatibility " + "level 1.1 and above (use compat=1.1 or greater)\n"); + return -EINVAL; + } + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = true; + } else { + /* make image clean first */ + ret = qcow2_mark_clean(bs); + if (ret < 0) { + return ret; + } + /* now disallow lazy refcounts */ + s->compatible_features &= ~QCOW2_COMPAT_LAZY_REFCOUNTS; + ret = qcow2_update_header(bs); + if (ret < 0) { + s->compatible_features |= QCOW2_COMPAT_LAZY_REFCOUNTS; + return ret; + } + s->use_lazy_refcounts = false; + } + } + + if (new_size) { + ret = bdrv_truncate(bs, new_size); + if (ret < 0) { + return ret; + } + } + + return 0; +} + +/* + * If offset or size are negative, respectively, they will not be included in + * the BLOCK_IMAGE_CORRUPTED event emitted. + * fatal will be ignored for read-only BDS; corruptions found there will always + * be considered non-fatal. + */ +void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, + int64_t size, const char *message_format, ...) +{ + BDRVQcow2State *s = bs->opaque; + const char *node_name; + char *message; + va_list ap; + + fatal = fatal && !bs->read_only; + + if (s->signaled_corruption && + (!fatal || (s->incompatible_features & QCOW2_INCOMPAT_CORRUPT))) + { + return; + } + + va_start(ap, message_format); + message = g_strdup_vprintf(message_format, ap); + va_end(ap); + + if (fatal) { + fprintf(stderr, "qcow2: Marking image as corrupt: %s; further " + "corruption events will be suppressed\n", message); + } else { + fprintf(stderr, "qcow2: Image is corrupt: %s; further non-fatal " + "corruption events will be suppressed\n", message); + } + + node_name = bdrv_get_node_name(bs); + qapi_event_send_block_image_corrupted(bdrv_get_device_name(bs), + *node_name != '\0', node_name, + message, offset >= 0, offset, + size >= 0, size, + fatal, &error_abort); + g_free(message); + + if (fatal) { + qcow2_mark_corrupt(bs); + bs->drv = NULL; /* make BDS unusable */ + } + + s->signaled_corruption = true; +} + +static QemuOptsList qcow2_create_opts = { + .name = "qcow2-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qcow2_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_COMPAT_LEVEL, + .type = QEMU_OPT_STRING, + .help = "Compatibility level (0.10 or 1.1)" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = QEMU_OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_ENCRYPT, + .type = QEMU_OPT_BOOL, + .help = "Encrypt the image", + .def_value_str = "off" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "qcow2 cluster size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, metadata, " + "falloc, full)" + }, + { + .name = BLOCK_OPT_LAZY_REFCOUNTS, + .type = QEMU_OPT_BOOL, + .help = "Postpone refcount updates", + .def_value_str = "off" + }, + { + .name = BLOCK_OPT_REFCOUNT_BITS, + .type = QEMU_OPT_NUMBER, + .help = "Width of a reference count entry in bits", + .def_value_str = "16" + }, + { /* end of list */ } + } +}; + +BlockDriver bdrv_qcow2 = { + .format_name = "qcow2", + .instance_size = sizeof(BDRVQcow2State), + .bdrv_probe = qcow2_probe, + .bdrv_open = qcow2_open, + .bdrv_close = qcow2_close, + .bdrv_reopen_prepare = qcow2_reopen_prepare, + .bdrv_reopen_commit = qcow2_reopen_commit, + .bdrv_reopen_abort = qcow2_reopen_abort, + .bdrv_create = qcow2_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_get_block_status = qcow2_co_get_block_status, + .bdrv_set_key = qcow2_set_key, + + .bdrv_co_readv = qcow2_co_readv, + .bdrv_co_writev = qcow2_co_writev, + .bdrv_co_flush_to_os = qcow2_co_flush_to_os, + + .bdrv_co_write_zeroes = qcow2_co_write_zeroes, + .bdrv_co_discard = qcow2_co_discard, + .bdrv_truncate = qcow2_truncate, + .bdrv_write_compressed = qcow2_write_compressed, + .bdrv_make_empty = qcow2_make_empty, + + .bdrv_snapshot_create = qcow2_snapshot_create, + .bdrv_snapshot_goto = qcow2_snapshot_goto, + .bdrv_snapshot_delete = qcow2_snapshot_delete, + .bdrv_snapshot_list = qcow2_snapshot_list, + .bdrv_snapshot_load_tmp = qcow2_snapshot_load_tmp, + .bdrv_get_info = qcow2_get_info, + .bdrv_get_specific_info = qcow2_get_specific_info, + + .bdrv_save_vmstate = qcow2_save_vmstate, + .bdrv_load_vmstate = qcow2_load_vmstate, + + .supports_backing = true, + .bdrv_change_backing_file = qcow2_change_backing_file, + + .bdrv_refresh_limits = qcow2_refresh_limits, + .bdrv_invalidate_cache = qcow2_invalidate_cache, + + .create_opts = &qcow2_create_opts, + .bdrv_check = qcow2_check, + .bdrv_amend_options = qcow2_amend_options, + + .bdrv_detach_aio_context = qcow2_detach_aio_context, + .bdrv_attach_aio_context = qcow2_attach_aio_context, +}; + +static void bdrv_qcow2_init(void) +{ + bdrv_register(&bdrv_qcow2); +} + +block_init(bdrv_qcow2_init); diff --git a/src/block/qcow2.h b/src/block/qcow2.h new file mode 100644 index 0000000..b8c500b --- /dev/null +++ b/src/block/qcow2.h @@ -0,0 +1,594 @@ +/* + * Block driver for the QCOW version 2 format + * + * Copyright (c) 2004-2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#ifndef BLOCK_QCOW2_H +#define BLOCK_QCOW2_H + +#include "crypto/cipher.h" +#include "qemu/coroutine.h" + +//#define DEBUG_ALLOC +//#define DEBUG_ALLOC2 +//#define DEBUG_EXT + +#define QCOW_MAGIC (('Q' << 24) | ('F' << 16) | ('I' << 8) | 0xfb) + +#define QCOW_CRYPT_NONE 0 +#define QCOW_CRYPT_AES 1 + +#define QCOW_MAX_CRYPT_CLUSTERS 32 +#define QCOW_MAX_SNAPSHOTS 65536 + +/* 8 MB refcount table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_REFTABLE_SIZE 0x800000 + +/* 32 MB L1 table is enough for 2 PB images at 64k cluster size + * (128 GB for 512 byte clusters, 2 EB for 2 MB clusters) */ +#define QCOW_MAX_L1_SIZE 0x2000000 + +/* Allow for an average of 1k per snapshot table entry, should be plenty of + * space for snapshot names and IDs */ +#define QCOW_MAX_SNAPSHOTS_SIZE (1024 * QCOW_MAX_SNAPSHOTS) + +/* indicate that the refcount of the referenced cluster is exactly one. */ +#define QCOW_OFLAG_COPIED (1ULL << 63) +/* indicate that the cluster is compressed (they never have the copied flag) */ +#define QCOW_OFLAG_COMPRESSED (1ULL << 62) +/* The cluster reads as all zeros */ +#define QCOW_OFLAG_ZERO (1ULL << 0) + +#define MIN_CLUSTER_BITS 9 +#define MAX_CLUSTER_BITS 21 + +/* Must be at least 2 to cover COW */ +#define MIN_L2_CACHE_SIZE 2 /* clusters */ + +/* Must be at least 4 to cover all cases of refcount table growth */ +#define MIN_REFCOUNT_CACHE_SIZE 4 /* clusters */ + +/* Whichever is more */ +#define DEFAULT_L2_CACHE_CLUSTERS 8 /* clusters */ +#define DEFAULT_L2_CACHE_BYTE_SIZE 1048576 /* bytes */ + +/* The refblock cache needs only a fourth of the L2 cache size to cover as many + * clusters */ +#define DEFAULT_L2_REFCOUNT_SIZE_RATIO 4 + +#define DEFAULT_CLUSTER_SIZE 65536 + + +#define QCOW2_OPT_LAZY_REFCOUNTS "lazy-refcounts" +#define QCOW2_OPT_DISCARD_REQUEST "pass-discard-request" +#define QCOW2_OPT_DISCARD_SNAPSHOT "pass-discard-snapshot" +#define QCOW2_OPT_DISCARD_OTHER "pass-discard-other" +#define QCOW2_OPT_OVERLAP "overlap-check" +#define QCOW2_OPT_OVERLAP_TEMPLATE "overlap-check.template" +#define QCOW2_OPT_OVERLAP_MAIN_HEADER "overlap-check.main-header" +#define QCOW2_OPT_OVERLAP_ACTIVE_L1 "overlap-check.active-l1" +#define QCOW2_OPT_OVERLAP_ACTIVE_L2 "overlap-check.active-l2" +#define QCOW2_OPT_OVERLAP_REFCOUNT_TABLE "overlap-check.refcount-table" +#define QCOW2_OPT_OVERLAP_REFCOUNT_BLOCK "overlap-check.refcount-block" +#define QCOW2_OPT_OVERLAP_SNAPSHOT_TABLE "overlap-check.snapshot-table" +#define QCOW2_OPT_OVERLAP_INACTIVE_L1 "overlap-check.inactive-l1" +#define QCOW2_OPT_OVERLAP_INACTIVE_L2 "overlap-check.inactive-l2" +#define QCOW2_OPT_CACHE_SIZE "cache-size" +#define QCOW2_OPT_L2_CACHE_SIZE "l2-cache-size" +#define QCOW2_OPT_REFCOUNT_CACHE_SIZE "refcount-cache-size" +#define QCOW2_OPT_CACHE_CLEAN_INTERVAL "cache-clean-interval" + +typedef struct QCowHeader { + uint32_t magic; + uint32_t version; + uint64_t backing_file_offset; + uint32_t backing_file_size; + uint32_t cluster_bits; + uint64_t size; /* in bytes */ + uint32_t crypt_method; + uint32_t l1_size; /* XXX: save number of clusters instead ? */ + uint64_t l1_table_offset; + uint64_t refcount_table_offset; + uint32_t refcount_table_clusters; + uint32_t nb_snapshots; + uint64_t snapshots_offset; + + /* The following fields are only valid for version >= 3 */ + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + uint32_t refcount_order; + uint32_t header_length; +} QEMU_PACKED QCowHeader; + +typedef struct QEMU_PACKED QCowSnapshotHeader { + /* header is 8 byte aligned */ + uint64_t l1_table_offset; + + uint32_t l1_size; + uint16_t id_str_size; + uint16_t name_size; + + uint32_t date_sec; + uint32_t date_nsec; + + uint64_t vm_clock_nsec; + + uint32_t vm_state_size; + uint32_t extra_data_size; /* for extension */ + /* extra data follows */ + /* id_str follows */ + /* name follows */ +} QCowSnapshotHeader; + +typedef struct QEMU_PACKED QCowSnapshotExtraData { + uint64_t vm_state_size_large; + uint64_t disk_size; +} QCowSnapshotExtraData; + + +typedef struct QCowSnapshot { + uint64_t l1_table_offset; + uint32_t l1_size; + char *id_str; + char *name; + uint64_t disk_size; + uint64_t vm_state_size; + uint32_t date_sec; + uint32_t date_nsec; + uint64_t vm_clock_nsec; +} QCowSnapshot; + +struct Qcow2Cache; +typedef struct Qcow2Cache Qcow2Cache; + +typedef struct Qcow2UnknownHeaderExtension { + uint32_t magic; + uint32_t len; + QLIST_ENTRY(Qcow2UnknownHeaderExtension) next; + uint8_t data[]; +} Qcow2UnknownHeaderExtension; + +enum { + QCOW2_FEAT_TYPE_INCOMPATIBLE = 0, + QCOW2_FEAT_TYPE_COMPATIBLE = 1, + QCOW2_FEAT_TYPE_AUTOCLEAR = 2, +}; + +/* Incompatible feature bits */ +enum { + QCOW2_INCOMPAT_DIRTY_BITNR = 0, + QCOW2_INCOMPAT_CORRUPT_BITNR = 1, + QCOW2_INCOMPAT_DIRTY = 1 << QCOW2_INCOMPAT_DIRTY_BITNR, + QCOW2_INCOMPAT_CORRUPT = 1 << QCOW2_INCOMPAT_CORRUPT_BITNR, + + QCOW2_INCOMPAT_MASK = QCOW2_INCOMPAT_DIRTY + | QCOW2_INCOMPAT_CORRUPT, +}; + +/* Compatible feature bits */ +enum { + QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR = 0, + QCOW2_COMPAT_LAZY_REFCOUNTS = 1 << QCOW2_COMPAT_LAZY_REFCOUNTS_BITNR, + + QCOW2_COMPAT_FEAT_MASK = QCOW2_COMPAT_LAZY_REFCOUNTS, +}; + +enum qcow2_discard_type { + QCOW2_DISCARD_NEVER = 0, + QCOW2_DISCARD_ALWAYS, + QCOW2_DISCARD_REQUEST, + QCOW2_DISCARD_SNAPSHOT, + QCOW2_DISCARD_OTHER, + QCOW2_DISCARD_MAX +}; + +typedef struct Qcow2Feature { + uint8_t type; + uint8_t bit; + char name[46]; +} QEMU_PACKED Qcow2Feature; + +typedef struct Qcow2DiscardRegion { + BlockDriverState *bs; + uint64_t offset; + uint64_t bytes; + QTAILQ_ENTRY(Qcow2DiscardRegion) next; +} Qcow2DiscardRegion; + +typedef uint64_t Qcow2GetRefcountFunc(const void *refcount_array, + uint64_t index); +typedef void Qcow2SetRefcountFunc(void *refcount_array, + uint64_t index, uint64_t value); + +typedef struct BDRVQcow2State { + int cluster_bits; + int cluster_size; + int cluster_sectors; + int l2_bits; + int l2_size; + int l1_size; + int l1_vm_state_index; + int refcount_block_bits; + int refcount_block_size; + int csize_shift; + int csize_mask; + uint64_t cluster_offset_mask; + uint64_t l1_table_offset; + uint64_t *l1_table; + + Qcow2Cache* l2_table_cache; + Qcow2Cache* refcount_block_cache; + QEMUTimer *cache_clean_timer; + unsigned cache_clean_interval; + + uint8_t *cluster_cache; + uint8_t *cluster_data; + uint64_t cluster_cache_offset; + QLIST_HEAD(QCowClusterAlloc, QCowL2Meta) cluster_allocs; + + uint64_t *refcount_table; + uint64_t refcount_table_offset; + uint32_t refcount_table_size; + uint64_t free_cluster_index; + uint64_t free_byte_offset; + + CoMutex lock; + + QCryptoCipher *cipher; /* current cipher, NULL if no key yet */ + uint32_t crypt_method_header; + uint64_t snapshots_offset; + int snapshots_size; + unsigned int nb_snapshots; + QCowSnapshot *snapshots; + + int flags; + int qcow_version; + bool use_lazy_refcounts; + int refcount_order; + int refcount_bits; + uint64_t refcount_max; + + Qcow2GetRefcountFunc *get_refcount; + Qcow2SetRefcountFunc *set_refcount; + + bool discard_passthrough[QCOW2_DISCARD_MAX]; + + int overlap_check; /* bitmask of Qcow2MetadataOverlap values */ + bool signaled_corruption; + + uint64_t incompatible_features; + uint64_t compatible_features; + uint64_t autoclear_features; + + size_t unknown_header_fields_size; + void* unknown_header_fields; + QLIST_HEAD(, Qcow2UnknownHeaderExtension) unknown_header_ext; + QTAILQ_HEAD (, Qcow2DiscardRegion) discards; + bool cache_discards; + + /* Backing file path and format as stored in the image (this is not the + * effective path/format, which may be the result of a runtime option + * override) */ + char *image_backing_file; + char *image_backing_format; +} BDRVQcow2State; + +typedef struct Qcow2COWRegion { + /** + * Offset of the COW region in bytes from the start of the first cluster + * touched by the request. + */ + uint64_t offset; + + /** Number of sectors to copy */ + int nb_sectors; +} Qcow2COWRegion; + +/** + * Describes an in-flight (part of a) write request that writes to clusters + * that are not referenced in their L2 table yet. + */ +typedef struct QCowL2Meta +{ + /** Guest offset of the first newly allocated cluster */ + uint64_t offset; + + /** Host offset of the first newly allocated cluster */ + uint64_t alloc_offset; + + /** + * Number of sectors from the start of the first allocated cluster to + * the end of the (possibly shortened) request + */ + int nb_available; + + /** Number of newly allocated clusters */ + int nb_clusters; + + /** + * Requests that overlap with this allocation and wait to be restarted + * when the allocating request has completed. + */ + CoQueue dependent_requests; + + /** + * The COW Region between the start of the first allocated cluster and the + * area the guest actually writes to. + */ + Qcow2COWRegion cow_start; + + /** + * The COW Region between the area the guest actually writes to and the + * end of the last allocated cluster. + */ + Qcow2COWRegion cow_end; + + /** Pointer to next L2Meta of the same write request */ + struct QCowL2Meta *next; + + QLIST_ENTRY(QCowL2Meta) next_in_flight; +} QCowL2Meta; + +enum { + QCOW2_CLUSTER_UNALLOCATED, + QCOW2_CLUSTER_NORMAL, + QCOW2_CLUSTER_COMPRESSED, + QCOW2_CLUSTER_ZERO +}; + +typedef enum QCow2MetadataOverlap { + QCOW2_OL_MAIN_HEADER_BITNR = 0, + QCOW2_OL_ACTIVE_L1_BITNR = 1, + QCOW2_OL_ACTIVE_L2_BITNR = 2, + QCOW2_OL_REFCOUNT_TABLE_BITNR = 3, + QCOW2_OL_REFCOUNT_BLOCK_BITNR = 4, + QCOW2_OL_SNAPSHOT_TABLE_BITNR = 5, + QCOW2_OL_INACTIVE_L1_BITNR = 6, + QCOW2_OL_INACTIVE_L2_BITNR = 7, + + QCOW2_OL_MAX_BITNR = 8, + + QCOW2_OL_NONE = 0, + QCOW2_OL_MAIN_HEADER = (1 << QCOW2_OL_MAIN_HEADER_BITNR), + QCOW2_OL_ACTIVE_L1 = (1 << QCOW2_OL_ACTIVE_L1_BITNR), + QCOW2_OL_ACTIVE_L2 = (1 << QCOW2_OL_ACTIVE_L2_BITNR), + QCOW2_OL_REFCOUNT_TABLE = (1 << QCOW2_OL_REFCOUNT_TABLE_BITNR), + QCOW2_OL_REFCOUNT_BLOCK = (1 << QCOW2_OL_REFCOUNT_BLOCK_BITNR), + QCOW2_OL_SNAPSHOT_TABLE = (1 << QCOW2_OL_SNAPSHOT_TABLE_BITNR), + QCOW2_OL_INACTIVE_L1 = (1 << QCOW2_OL_INACTIVE_L1_BITNR), + /* NOTE: Checking overlaps with inactive L2 tables will result in bdrv + * reads. */ + QCOW2_OL_INACTIVE_L2 = (1 << QCOW2_OL_INACTIVE_L2_BITNR), +} QCow2MetadataOverlap; + +/* Perform all overlap checks which can be done in constant time */ +#define QCOW2_OL_CONSTANT \ + (QCOW2_OL_MAIN_HEADER | QCOW2_OL_ACTIVE_L1 | QCOW2_OL_REFCOUNT_TABLE | \ + QCOW2_OL_SNAPSHOT_TABLE) + +/* Perform all overlap checks which don't require disk access */ +#define QCOW2_OL_CACHED \ + (QCOW2_OL_CONSTANT | QCOW2_OL_ACTIVE_L2 | QCOW2_OL_REFCOUNT_BLOCK | \ + QCOW2_OL_INACTIVE_L1) + +/* Perform all overlap checks */ +#define QCOW2_OL_ALL \ + (QCOW2_OL_CACHED | QCOW2_OL_INACTIVE_L2) + +#define L1E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_OFFSET_MASK 0x00fffffffffffe00ULL +#define L2E_COMPRESSED_OFFSET_SIZE_MASK 0x3fffffffffffffffULL + +#define REFT_OFFSET_MASK 0xfffffffffffffe00ULL + +static inline int64_t start_of_cluster(BDRVQcow2State *s, int64_t offset) +{ + return offset & ~(s->cluster_size - 1); +} + +static inline int64_t offset_into_cluster(BDRVQcow2State *s, int64_t offset) +{ + return offset & (s->cluster_size - 1); +} + +static inline uint64_t size_to_clusters(BDRVQcow2State *s, uint64_t size) +{ + return (size + (s->cluster_size - 1)) >> s->cluster_bits; +} + +static inline int64_t size_to_l1(BDRVQcow2State *s, int64_t size) +{ + int shift = s->cluster_bits + s->l2_bits; + return (size + (1ULL << shift) - 1) >> shift; +} + +static inline int offset_to_l2_index(BDRVQcow2State *s, int64_t offset) +{ + return (offset >> s->cluster_bits) & (s->l2_size - 1); +} + +static inline int64_t align_offset(int64_t offset, int n) +{ + offset = (offset + n - 1) & ~(n - 1); + return offset; +} + +static inline int64_t qcow2_vm_state_offset(BDRVQcow2State *s) +{ + return (int64_t)s->l1_vm_state_index << (s->cluster_bits + s->l2_bits); +} + +static inline uint64_t qcow2_max_refcount_clusters(BDRVQcow2State *s) +{ + return QCOW_MAX_REFTABLE_SIZE >> s->cluster_bits; +} + +static inline int qcow2_get_cluster_type(uint64_t l2_entry) +{ + if (l2_entry & QCOW_OFLAG_COMPRESSED) { + return QCOW2_CLUSTER_COMPRESSED; + } else if (l2_entry & QCOW_OFLAG_ZERO) { + return QCOW2_CLUSTER_ZERO; + } else if (!(l2_entry & L2E_OFFSET_MASK)) { + return QCOW2_CLUSTER_UNALLOCATED; + } else { + return QCOW2_CLUSTER_NORMAL; + } +} + +/* Check whether refcounts are eager or lazy */ +static inline bool qcow2_need_accurate_refcounts(BDRVQcow2State *s) +{ + return !(s->incompatible_features & QCOW2_INCOMPAT_DIRTY); +} + +static inline uint64_t l2meta_cow_start(QCowL2Meta *m) +{ + return m->offset + m->cow_start.offset; +} + +static inline uint64_t l2meta_cow_end(QCowL2Meta *m) +{ + return m->offset + m->cow_end.offset + + (m->cow_end.nb_sectors << BDRV_SECTOR_BITS); +} + +static inline uint64_t refcount_diff(uint64_t r1, uint64_t r2) +{ + return r1 > r2 ? r1 - r2 : r2 - r1; +} + +// FIXME Need qcow2_ prefix to global functions + +/* qcow2.c functions */ +int qcow2_backing_read1(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors); + +int qcow2_mark_dirty(BlockDriverState *bs); +int qcow2_mark_corrupt(BlockDriverState *bs); +int qcow2_mark_consistent(BlockDriverState *bs); +int qcow2_update_header(BlockDriverState *bs); + +void qcow2_signal_corruption(BlockDriverState *bs, bool fatal, int64_t offset, + int64_t size, const char *message_format, ...) + GCC_FMT_ATTR(5, 6); + +/* qcow2-refcount.c functions */ +int qcow2_refcount_init(BlockDriverState *bs); +void qcow2_refcount_close(BlockDriverState *bs); + +int qcow2_get_refcount(BlockDriverState *bs, int64_t cluster_index, + uint64_t *refcount); + +int qcow2_update_cluster_refcount(BlockDriverState *bs, int64_t cluster_index, + uint64_t addend, bool decrease, + enum qcow2_discard_type type); + +int64_t qcow2_alloc_clusters(BlockDriverState *bs, uint64_t size); +int64_t qcow2_alloc_clusters_at(BlockDriverState *bs, uint64_t offset, + int64_t nb_clusters); +int64_t qcow2_alloc_bytes(BlockDriverState *bs, int size); +void qcow2_free_clusters(BlockDriverState *bs, + int64_t offset, int64_t size, + enum qcow2_discard_type type); +void qcow2_free_any_clusters(BlockDriverState *bs, uint64_t l2_entry, + int nb_clusters, enum qcow2_discard_type type); + +int qcow2_update_snapshot_refcount(BlockDriverState *bs, + int64_t l1_table_offset, int l1_size, int addend); + +int qcow2_check_refcounts(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix); + +void qcow2_process_discards(BlockDriverState *bs, int ret); + +int qcow2_check_metadata_overlap(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); +int qcow2_pre_write_overlap_check(BlockDriverState *bs, int ign, int64_t offset, + int64_t size); + +/* qcow2-cluster.c functions */ +int qcow2_grow_l1_table(BlockDriverState *bs, uint64_t min_size, + bool exact_size); +int qcow2_write_l1_entry(BlockDriverState *bs, int l1_index); +void qcow2_l2_cache_reset(BlockDriverState *bs); +int qcow2_decompress_cluster(BlockDriverState *bs, uint64_t cluster_offset); +int qcow2_encrypt_sectors(BDRVQcow2State *s, int64_t sector_num, + uint8_t *out_buf, const uint8_t *in_buf, + int nb_sectors, bool enc, Error **errp); + +int qcow2_get_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *cluster_offset); +int qcow2_alloc_cluster_offset(BlockDriverState *bs, uint64_t offset, + int *num, uint64_t *host_offset, QCowL2Meta **m); +uint64_t qcow2_alloc_compressed_cluster_offset(BlockDriverState *bs, + uint64_t offset, + int compressed_size); + +int qcow2_alloc_cluster_link_l2(BlockDriverState *bs, QCowL2Meta *m); +int qcow2_discard_clusters(BlockDriverState *bs, uint64_t offset, + int nb_sectors, enum qcow2_discard_type type, bool full_discard); +int qcow2_zero_clusters(BlockDriverState *bs, uint64_t offset, int nb_sectors); + +int qcow2_expand_zero_clusters(BlockDriverState *bs, + BlockDriverAmendStatusCB *status_cb); + +/* qcow2-snapshot.c functions */ +int qcow2_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info); +int qcow2_snapshot_goto(BlockDriverState *bs, const char *snapshot_id); +int qcow2_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); +int qcow2_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab); +int qcow2_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp); + +void qcow2_free_snapshots(BlockDriverState *bs); +int qcow2_read_snapshots(BlockDriverState *bs); + +/* qcow2-cache.c functions */ +Qcow2Cache *qcow2_cache_create(BlockDriverState *bs, int num_tables); +int qcow2_cache_destroy(BlockDriverState* bs, Qcow2Cache *c); + +void qcow2_cache_entry_mark_dirty(BlockDriverState *bs, Qcow2Cache *c, + void *table); +int qcow2_cache_flush(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_set_dependency(BlockDriverState *bs, Qcow2Cache *c, + Qcow2Cache *dependency); +void qcow2_cache_depends_on_flush(Qcow2Cache *c); + +void qcow2_cache_clean_unused(BlockDriverState *bs, Qcow2Cache *c); +int qcow2_cache_empty(BlockDriverState *bs, Qcow2Cache *c); + +int qcow2_cache_get(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +int qcow2_cache_get_empty(BlockDriverState *bs, Qcow2Cache *c, uint64_t offset, + void **table); +void qcow2_cache_put(BlockDriverState *bs, Qcow2Cache *c, void **table); + +#endif diff --git a/src/block/qed-check.c b/src/block/qed-check.c new file mode 100644 index 0000000..36ecd29 --- /dev/null +++ b/src/block/qed-check.c @@ -0,0 +1,250 @@ +/* + * QEMU Enhanced Disk Format Consistency Check + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +typedef struct { + BDRVQEDState *s; + BdrvCheckResult *result; + bool fix; /* whether to fix invalid offsets */ + + uint64_t nclusters; + uint32_t *used_clusters; /* referenced cluster bitmap */ + + QEDRequest request; +} QEDCheck; + +static bool qed_test_bit(uint32_t *bitmap, uint64_t n) { + return !!(bitmap[n / 32] & (1 << (n % 32))); +} + +static void qed_set_bit(uint32_t *bitmap, uint64_t n) { + bitmap[n / 32] |= 1 << (n % 32); +} + +/** + * Set bitmap bits for clusters + * + * @check: Check structure + * @offset: Starting offset in bytes + * @n: Number of clusters + */ +static bool qed_set_used_clusters(QEDCheck *check, uint64_t offset, + unsigned int n) +{ + uint64_t cluster = qed_bytes_to_clusters(check->s, offset); + unsigned int corruptions = 0; + + while (n-- != 0) { + /* Clusters should only be referenced once */ + if (qed_test_bit(check->used_clusters, cluster)) { + corruptions++; + } + + qed_set_bit(check->used_clusters, cluster); + cluster++; + } + + check->result->corruptions += corruptions; + return corruptions == 0; +} + +/** + * Check an L2 table + * + * @ret: Number of invalid cluster offsets + */ +static unsigned int qed_check_l2_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid = 0; + uint64_t last_offset = 0; + + for (i = 0; i < s->table_nelems; i++) { + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset) || + qed_offset_is_zero_cluster(offset)) { + continue; + } + check->result->bfi.allocated_clusters++; + if (last_offset && (last_offset + s->header.cluster_size != offset)) { + check->result->bfi.fragmented_clusters++; + } + last_offset = offset; + + /* Detect invalid cluster offset */ + if (!qed_check_cluster_offset(s, offset)) { + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid++; + continue; + } + + qed_set_used_clusters(check, offset, 1); + } + + return num_invalid; +} + +/** + * Descend tables and check each cluster is referenced once only + */ +static int qed_check_l1_table(QEDCheck *check, QEDTable *table) +{ + BDRVQEDState *s = check->s; + unsigned int i, num_invalid_l1 = 0; + int ret, last_error = 0; + + /* Mark L1 table clusters used */ + qed_set_used_clusters(check, s->header.l1_table_offset, + s->header.table_size); + + for (i = 0; i < s->table_nelems; i++) { + unsigned int num_invalid_l2; + uint64_t offset = table->offsets[i]; + + if (qed_offset_is_unalloc_cluster(offset)) { + continue; + } + + /* Detect invalid L2 offset */ + if (!qed_check_table_offset(s, offset)) { + /* Clear invalid offset */ + if (check->fix) { + table->offsets[i] = 0; + check->result->corruptions_fixed++; + } else { + check->result->corruptions++; + } + + num_invalid_l1++; + continue; + } + + if (!qed_set_used_clusters(check, offset, s->header.table_size)) { + continue; /* skip an invalid table */ + } + + ret = qed_read_l2_table_sync(s, &check->request, offset); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + + num_invalid_l2 = qed_check_l2_table(check, + check->request.l2_table->table); + + /* Write out fixed L2 table */ + if (num_invalid_l2 > 0 && check->fix) { + ret = qed_write_l2_table_sync(s, &check->request, 0, + s->table_nelems, false); + if (ret) { + check->result->check_errors++; + last_error = ret; + continue; + } + } + } + + /* Drop reference to final table */ + qed_unref_l2_cache_entry(check->request.l2_table); + check->request.l2_table = NULL; + + /* Write out fixed L1 table */ + if (num_invalid_l1 > 0 && check->fix) { + ret = qed_write_l1_table_sync(s, 0, s->table_nelems); + if (ret) { + check->result->check_errors++; + last_error = ret; + } + } + + return last_error; +} + +/** + * Check for unreferenced (leaked) clusters + */ +static void qed_check_for_leaks(QEDCheck *check) +{ + BDRVQEDState *s = check->s; + uint64_t i; + + for (i = s->header.header_size; i < check->nclusters; i++) { + if (!qed_test_bit(check->used_clusters, i)) { + check->result->leaks++; + } + } +} + +/** + * Mark an image clean once it passes check or has been repaired + */ +static void qed_check_mark_clean(BDRVQEDState *s, BdrvCheckResult *result) +{ + /* Skip if there were unfixable corruptions or I/O errors */ + if (result->corruptions > 0 || result->check_errors > 0) { + return; + } + + /* Skip if image is already marked clean */ + if (!(s->header.features & QED_F_NEED_CHECK)) { + return; + } + + /* Ensure fixes reach storage before clearing check bit */ + bdrv_flush(s->bs); + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); +} + +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix) +{ + QEDCheck check = { + .s = s, + .result = result, + .nclusters = qed_bytes_to_clusters(s, s->file_size), + .request = { .l2_table = NULL }, + .fix = fix, + }; + int ret; + + check.used_clusters = g_try_new0(uint32_t, (check.nclusters + 31) / 32); + if (check.nclusters && check.used_clusters == NULL) { + return -ENOMEM; + } + + check.result->bfi.total_clusters = + (s->header.image_size + s->header.cluster_size - 1) / + s->header.cluster_size; + ret = qed_check_l1_table(&check, s->l1_table); + if (ret == 0) { + /* Only check for leaks if entire image was scanned successfully */ + qed_check_for_leaks(&check); + + if (fix) { + qed_check_mark_clean(s, result); + } + } + + g_free(check.used_clusters); + return ret; +} diff --git a/src/block/qed-cluster.c b/src/block/qed-cluster.c new file mode 100644 index 0000000..f64b2af --- /dev/null +++ b/src/block/qed-cluster.c @@ -0,0 +1,165 @@ +/* + * QEMU Enhanced Disk Format Cluster functions + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +/** + * Count the number of contiguous data clusters + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Maximum number of clusters + * @offset: Set to first cluster offset + * + * This function scans tables for contiguous clusters. A contiguous run of + * clusters may be allocated, unallocated, or zero. + */ +static unsigned int qed_count_contiguous_clusters(BDRVQEDState *s, + QEDTable *table, + unsigned int index, + unsigned int n, + uint64_t *offset) +{ + unsigned int end = MIN(index + n, s->table_nelems); + uint64_t last = table->offsets[index]; + unsigned int i; + + *offset = last; + + for (i = index + 1; i < end; i++) { + if (qed_offset_is_unalloc_cluster(last)) { + /* Counting unallocated clusters */ + if (!qed_offset_is_unalloc_cluster(table->offsets[i])) { + break; + } + } else if (qed_offset_is_zero_cluster(last)) { + /* Counting zero clusters */ + if (!qed_offset_is_zero_cluster(table->offsets[i])) { + break; + } + } else { + /* Counting allocated clusters */ + if (table->offsets[i] != last + s->header.cluster_size) { + break; + } + last = table->offsets[i]; + } + } + return i - index; +} + +typedef struct { + BDRVQEDState *s; + uint64_t pos; + size_t len; + + QEDRequest *request; + + /* User callback */ + QEDFindClusterFunc *cb; + void *opaque; +} QEDFindClusterCB; + +static void qed_find_cluster_cb(void *opaque, int ret) +{ + QEDFindClusterCB *find_cluster_cb = opaque; + BDRVQEDState *s = find_cluster_cb->s; + QEDRequest *request = find_cluster_cb->request; + uint64_t offset = 0; + size_t len = 0; + unsigned int index; + unsigned int n; + + if (ret) { + goto out; + } + + index = qed_l2_index(s, find_cluster_cb->pos); + n = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, find_cluster_cb->pos) + + find_cluster_cb->len); + n = qed_count_contiguous_clusters(s, request->l2_table->table, + index, n, &offset); + + if (qed_offset_is_unalloc_cluster(offset)) { + ret = QED_CLUSTER_L2; + } else if (qed_offset_is_zero_cluster(offset)) { + ret = QED_CLUSTER_ZERO; + } else if (qed_check_cluster_offset(s, offset)) { + ret = QED_CLUSTER_FOUND; + } else { + ret = -EINVAL; + } + + len = MIN(find_cluster_cb->len, n * s->header.cluster_size - + qed_offset_into_cluster(s, find_cluster_cb->pos)); + +out: + find_cluster_cb->cb(find_cluster_cb->opaque, ret, offset, len); + g_free(find_cluster_cb); +} + +/** + * Find the offset of a data cluster + * + * @s: QED state + * @request: L2 cache entry + * @pos: Byte position in device + * @len: Number of bytes + * @cb: Completion function + * @opaque: User data for completion function + * + * This function translates a position in the block device to an offset in the + * image file. It invokes the cb completion callback to report back the + * translated offset or unallocated range in the image file. + * + * If the L2 table exists, request->l2_table points to the L2 table cache entry + * and the caller must free the reference when they are finished. The cache + * entry is exposed in this way to avoid callers having to read the L2 table + * again later during request processing. If request->l2_table is non-NULL it + * will be unreferenced before taking on the new cache entry. + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque) +{ + QEDFindClusterCB *find_cluster_cb; + uint64_t l2_offset; + + /* Limit length to L2 boundary. Requests are broken up at the L2 boundary + * so that a request acts on one L2 table at a time. + */ + len = MIN(len, (((pos >> s->l1_shift) + 1) << s->l1_shift) - pos); + + l2_offset = s->l1_table->offsets[qed_l1_index(s, pos)]; + if (qed_offset_is_unalloc_cluster(l2_offset)) { + cb(opaque, QED_CLUSTER_L1, 0, len); + return; + } + if (!qed_check_table_offset(s, l2_offset)) { + cb(opaque, -EINVAL, 0, 0); + return; + } + + find_cluster_cb = g_malloc(sizeof(*find_cluster_cb)); + find_cluster_cb->s = s; + find_cluster_cb->pos = pos; + find_cluster_cb->len = len; + find_cluster_cb->cb = cb; + find_cluster_cb->opaque = opaque; + find_cluster_cb->request = request; + + qed_read_l2_table(s, request, l2_offset, + qed_find_cluster_cb, find_cluster_cb); +} diff --git a/src/block/qed-gencb.c b/src/block/qed-gencb.c new file mode 100644 index 0000000..b817a8b --- /dev/null +++ b/src/block/qed-gencb.c @@ -0,0 +1,32 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qed.h" + +void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque) +{ + GenericCB *gencb = g_malloc(len); + gencb->cb = cb; + gencb->opaque = opaque; + return gencb; +} + +void gencb_complete(void *opaque, int ret) +{ + GenericCB *gencb = opaque; + BlockCompletionFunc *cb = gencb->cb; + void *user_opaque = gencb->opaque; + + g_free(gencb); + cb(user_opaque, ret); +} diff --git a/src/block/qed-l2-cache.c b/src/block/qed-l2-cache.c new file mode 100644 index 0000000..e9b2aae --- /dev/null +++ b/src/block/qed-l2-cache.c @@ -0,0 +1,187 @@ +/* + * QEMU Enhanced Disk Format L2 Cache + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +/* + * L2 table cache usage is as follows: + * + * An open image has one L2 table cache that is used to avoid accessing the + * image file for recently referenced L2 tables. + * + * Cluster offset lookup translates the logical offset within the block device + * to a cluster offset within the image file. This is done by indexing into + * the L1 and L2 tables which store cluster offsets. It is here where the L2 + * table cache serves up recently referenced L2 tables. + * + * If there is a cache miss, that L2 table is read from the image file and + * committed to the cache. Subsequent accesses to that L2 table will be served + * from the cache until the table is evicted from the cache. + * + * L2 tables are also committed to the cache when new L2 tables are allocated + * in the image file. Since the L2 table cache is write-through, the new L2 + * table is first written out to the image file and then committed to the + * cache. + * + * Multiple I/O requests may be using an L2 table cache entry at any given + * time. That means an entry may be in use across several requests and + * reference counting is needed to free the entry at the correct time. In + * particular, an entry evicted from the cache will only be freed once all + * references are dropped. + * + * An in-flight I/O request will hold a reference to a L2 table cache entry for + * the period during which it needs to access the L2 table. This includes + * cluster offset lookup, L2 table allocation, and L2 table update when a new + * data cluster has been allocated. + * + * An interesting case occurs when two requests need to access an L2 table that + * is not in the cache. Since the operation to read the table from the image + * file takes some time to complete, both requests may see a cache miss and + * start reading the L2 table from the image file. The first to finish will + * commit its L2 table into the cache. When the second tries to commit its + * table will be deleted in favor of the existing cache entry. + */ + +#include "trace.h" +#include "qed.h" + +/* Each L2 holds 2GB so this let's us fully cache a 100GB disk */ +#define MAX_L2_CACHE_SIZE 50 + +/** + * Initialize the L2 cache + */ +void qed_init_l2_cache(L2TableCache *l2_cache) +{ + QTAILQ_INIT(&l2_cache->entries); + l2_cache->n_entries = 0; +} + +/** + * Free the L2 cache + */ +void qed_free_l2_cache(L2TableCache *l2_cache) +{ + CachedL2Table *entry, *next_entry; + + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next_entry) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Allocate an uninitialized entry from the cache + * + * The returned entry has a reference count of 1 and is owned by the caller. + * The caller must allocate the actual table field for this entry and it must + * be freeable using qemu_vfree(). + */ +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache) +{ + CachedL2Table *entry; + + entry = g_malloc0(sizeof(*entry)); + entry->ref++; + + trace_qed_alloc_l2_cache_entry(l2_cache, entry); + + return entry; +} + +/** + * Decrease an entry's reference count and free if necessary when the reference + * count drops to zero. + */ +void qed_unref_l2_cache_entry(CachedL2Table *entry) +{ + if (!entry) { + return; + } + + entry->ref--; + trace_qed_unref_l2_cache_entry(entry, entry->ref); + if (entry->ref == 0) { + qemu_vfree(entry->table); + g_free(entry); + } +} + +/** + * Find an entry in the L2 cache. This may return NULL and it's up to the + * caller to satisfy the cache miss. + * + * For a cached entry, this function increases the reference count and returns + * the entry. + */ +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset) +{ + CachedL2Table *entry; + + QTAILQ_FOREACH(entry, &l2_cache->entries, node) { + if (entry->offset == offset) { + trace_qed_find_l2_cache_entry(l2_cache, entry, offset, entry->ref); + entry->ref++; + return entry; + } + } + return NULL; +} + +/** + * Commit an L2 cache entry into the cache. This is meant to be used as part of + * the process to satisfy a cache miss. A caller would allocate an entry which + * is not actually in the L2 cache and then once the entry was valid and + * present on disk, the entry can be committed into the cache. + * + * Since the cache is write-through, it's important that this function is not + * called until the entry is present on disk and the L1 has been updated to + * point to the entry. + * + * N.B. This function steals a reference to the l2_table from the caller so the + * caller must obtain a new reference by issuing a call to + * qed_find_l2_cache_entry(). + */ +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table) +{ + CachedL2Table *entry; + + entry = qed_find_l2_cache_entry(l2_cache, l2_table->offset); + if (entry) { + qed_unref_l2_cache_entry(entry); + qed_unref_l2_cache_entry(l2_table); + return; + } + + /* Evict an unused cache entry so we have space. If all entries are in use + * we can grow the cache temporarily and we try to shrink back down later. + */ + if (l2_cache->n_entries >= MAX_L2_CACHE_SIZE) { + CachedL2Table *next; + QTAILQ_FOREACH_SAFE(entry, &l2_cache->entries, node, next) { + if (entry->ref > 1) { + continue; + } + + QTAILQ_REMOVE(&l2_cache->entries, entry, node); + l2_cache->n_entries--; + qed_unref_l2_cache_entry(entry); + + /* Stop evicting when we've shrunk back to max size */ + if (l2_cache->n_entries < MAX_L2_CACHE_SIZE) { + break; + } + } + } + + l2_cache->n_entries++; + QTAILQ_INSERT_TAIL(&l2_cache->entries, l2_table, node); +} diff --git a/src/block/qed-table.c b/src/block/qed-table.c new file mode 100644 index 0000000..f4219b8 --- /dev/null +++ b/src/block/qed-table.c @@ -0,0 +1,296 @@ +/* + * QEMU Enhanced Disk Format Table I/O + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "qemu/sockets.h" /* for EINPROGRESS on Windows */ +#include "qed.h" + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *table; + + struct iovec iov; + QEMUIOVector qiov; +} QEDReadTableCB; + +static void qed_read_table_cb(void *opaque, int ret) +{ + QEDReadTableCB *read_table_cb = opaque; + QEDTable *table = read_table_cb->table; + int noffsets = read_table_cb->qiov.size / sizeof(uint64_t); + int i; + + /* Handle I/O error */ + if (ret) { + goto out; + } + + /* Byteswap offsets */ + for (i = 0; i < noffsets; i++) { + table->offsets[i] = le64_to_cpu(table->offsets[i]); + } + +out: + /* Completion */ + trace_qed_read_table_cb(read_table_cb->s, read_table_cb->table, ret); + gencb_complete(&read_table_cb->gencb, ret); +} + +static void qed_read_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + BlockCompletionFunc *cb, void *opaque) +{ + QEDReadTableCB *read_table_cb = gencb_alloc(sizeof(*read_table_cb), + cb, opaque); + QEMUIOVector *qiov = &read_table_cb->qiov; + + trace_qed_read_table(s, offset, table); + + read_table_cb->s = s; + read_table_cb->table = table; + read_table_cb->iov.iov_base = table->offsets, + read_table_cb->iov.iov_len = s->header.cluster_size * s->header.table_size, + + qemu_iovec_init_external(qiov, &read_table_cb->iov, 1); + bdrv_aio_readv(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, qiov, + qiov->size / BDRV_SECTOR_SIZE, + qed_read_table_cb, read_table_cb); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEDTable *orig_table; + QEDTable *table; + bool flush; /* flush after write? */ + + struct iovec iov; + QEMUIOVector qiov; +} QEDWriteTableCB; + +static void qed_write_table_cb(void *opaque, int ret) +{ + QEDWriteTableCB *write_table_cb = opaque; + + trace_qed_write_table_cb(write_table_cb->s, + write_table_cb->orig_table, + write_table_cb->flush, + ret); + + if (ret) { + goto out; + } + + if (write_table_cb->flush) { + /* We still need to flush first */ + write_table_cb->flush = false; + bdrv_aio_flush(write_table_cb->s->bs, qed_write_table_cb, + write_table_cb); + return; + } + +out: + qemu_vfree(write_table_cb->table); + gencb_complete(&write_table_cb->gencb, ret); +} + +/** + * Write out an updated part or all of a table + * + * @s: QED state + * @offset: Offset of table in image file, in bytes + * @table: Table + * @index: Index of first element + * @n: Number of elements + * @flush: Whether or not to sync to disk + * @cb: Completion function + * @opaque: Argument for completion function + */ +static void qed_write_table(BDRVQEDState *s, uint64_t offset, QEDTable *table, + unsigned int index, unsigned int n, bool flush, + BlockCompletionFunc *cb, void *opaque) +{ + QEDWriteTableCB *write_table_cb; + unsigned int sector_mask = BDRV_SECTOR_SIZE / sizeof(uint64_t) - 1; + unsigned int start, end, i; + size_t len_bytes; + + trace_qed_write_table(s, offset, table, index, n); + + /* Calculate indices of the first and one after last elements */ + start = index & ~sector_mask; + end = (index + n + sector_mask) & ~sector_mask; + + len_bytes = (end - start) * sizeof(uint64_t); + + write_table_cb = gencb_alloc(sizeof(*write_table_cb), cb, opaque); + write_table_cb->s = s; + write_table_cb->orig_table = table; + write_table_cb->flush = flush; + write_table_cb->table = qemu_blockalign(s->bs, len_bytes); + write_table_cb->iov.iov_base = write_table_cb->table->offsets; + write_table_cb->iov.iov_len = len_bytes; + qemu_iovec_init_external(&write_table_cb->qiov, &write_table_cb->iov, 1); + + /* Byteswap table */ + for (i = start; i < end; i++) { + uint64_t le_offset = cpu_to_le64(table->offsets[i]); + write_table_cb->table->offsets[i - start] = le_offset; + } + + /* Adjust for offset into table */ + offset += start * sizeof(uint64_t); + + bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, + &write_table_cb->qiov, + write_table_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_write_table_cb, write_table_cb); +} + +/** + * Propagate return value from async callback + */ +static void qed_sync_cb(void *opaque, int ret) +{ + *(int *)opaque = ret; +} + +int qed_read_l1_table_sync(BDRVQEDState *s) +{ + int ret = -EINPROGRESS; + + qed_read_table(s, s->header.l1_table_offset, + s->l1_table, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(s->bs), true); + } + + return ret; +} + +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L1_UPDATE); + qed_write_table(s, s->header.l1_table_offset, + s->l1_table, index, n, false, cb, opaque); +} + +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n) +{ + int ret = -EINPROGRESS; + + qed_write_l1_table(s, index, n, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(s->bs), true); + } + + return ret; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + uint64_t l2_offset; + QEDRequest *request; +} QEDReadL2TableCB; + +static void qed_read_l2_table_cb(void *opaque, int ret) +{ + QEDReadL2TableCB *read_l2_table_cb = opaque; + QEDRequest *request = read_l2_table_cb->request; + BDRVQEDState *s = read_l2_table_cb->s; + CachedL2Table *l2_table = request->l2_table; + uint64_t l2_offset = read_l2_table_cb->l2_offset; + + if (ret) { + /* can't trust loaded L2 table anymore */ + qed_unref_l2_cache_entry(l2_table); + request->l2_table = NULL; + } else { + l2_table->offset = l2_offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry + * to the cache. + */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(request->l2_table != NULL); + } + + gencb_complete(&read_l2_table_cb->gencb, ret); +} + +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockCompletionFunc *cb, void *opaque) +{ + QEDReadL2TableCB *read_l2_table_cb; + + qed_unref_l2_cache_entry(request->l2_table); + + /* Check for cached L2 entry */ + request->l2_table = qed_find_l2_cache_entry(&s->l2_cache, offset); + if (request->l2_table) { + cb(opaque, 0); + return; + } + + request->l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + request->l2_table->table = qed_alloc_table(s); + + read_l2_table_cb = gencb_alloc(sizeof(*read_l2_table_cb), cb, opaque); + read_l2_table_cb->s = s; + read_l2_table_cb->l2_offset = offset; + read_l2_table_cb->request = request; + + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_LOAD); + qed_read_table(s, offset, request->l2_table->table, + qed_read_l2_table_cb, read_l2_table_cb); +} + +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, uint64_t offset) +{ + int ret = -EINPROGRESS; + + qed_read_l2_table(s, request, offset, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(s->bs), true); + } + + return ret; +} + +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockCompletionFunc *cb, void *opaque) +{ + BLKDBG_EVENT(s->bs->file, BLKDBG_L2_UPDATE); + qed_write_table(s, request->l2_table->offset, + request->l2_table->table, index, n, flush, cb, opaque); +} + +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush) +{ + int ret = -EINPROGRESS; + + qed_write_l2_table(s, request, index, n, flush, qed_sync_cb, &ret); + while (ret == -EINPROGRESS) { + aio_poll(bdrv_get_aio_context(s->bs), true); + } + + return ret; +} diff --git a/src/block/qed.c b/src/block/qed.c new file mode 100644 index 0000000..9b88895 --- /dev/null +++ b/src/block/qed.c @@ -0,0 +1,1699 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu/timer.h" +#include "trace.h" +#include "qed.h" +#include "qapi/qmp/qerror.h" +#include "migration/migration.h" + +static const AIOCBInfo qed_aiocb_info = { + .aiocb_size = sizeof(QEDAIOCB), +}; + +static int bdrv_qed_probe(const uint8_t *buf, int buf_size, + const char *filename) +{ + const QEDHeader *header = (const QEDHeader *)buf; + + if (buf_size < sizeof(*header)) { + return 0; + } + if (le32_to_cpu(header->magic) != QED_MAGIC) { + return 0; + } + return 100; +} + +/** + * Check whether an image format is raw + * + * @fmt: Backing file format, may be NULL + */ +static bool qed_fmt_is_raw(const char *fmt) +{ + return fmt && strcmp(fmt, "raw") == 0; +} + +static void qed_header_le_to_cpu(const QEDHeader *le, QEDHeader *cpu) +{ + cpu->magic = le32_to_cpu(le->magic); + cpu->cluster_size = le32_to_cpu(le->cluster_size); + cpu->table_size = le32_to_cpu(le->table_size); + cpu->header_size = le32_to_cpu(le->header_size); + cpu->features = le64_to_cpu(le->features); + cpu->compat_features = le64_to_cpu(le->compat_features); + cpu->autoclear_features = le64_to_cpu(le->autoclear_features); + cpu->l1_table_offset = le64_to_cpu(le->l1_table_offset); + cpu->image_size = le64_to_cpu(le->image_size); + cpu->backing_filename_offset = le32_to_cpu(le->backing_filename_offset); + cpu->backing_filename_size = le32_to_cpu(le->backing_filename_size); +} + +static void qed_header_cpu_to_le(const QEDHeader *cpu, QEDHeader *le) +{ + le->magic = cpu_to_le32(cpu->magic); + le->cluster_size = cpu_to_le32(cpu->cluster_size); + le->table_size = cpu_to_le32(cpu->table_size); + le->header_size = cpu_to_le32(cpu->header_size); + le->features = cpu_to_le64(cpu->features); + le->compat_features = cpu_to_le64(cpu->compat_features); + le->autoclear_features = cpu_to_le64(cpu->autoclear_features); + le->l1_table_offset = cpu_to_le64(cpu->l1_table_offset); + le->image_size = cpu_to_le64(cpu->image_size); + le->backing_filename_offset = cpu_to_le32(cpu->backing_filename_offset); + le->backing_filename_size = cpu_to_le32(cpu->backing_filename_size); +} + +int qed_write_header_sync(BDRVQEDState *s) +{ + QEDHeader le; + int ret; + + qed_header_cpu_to_le(&s->header, &le); + ret = bdrv_pwrite(s->bs->file->bs, 0, &le, sizeof(le)); + if (ret != sizeof(le)) { + return ret; + } + return 0; +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + struct iovec iov; + QEMUIOVector qiov; + int nsectors; + uint8_t *buf; +} QEDWriteHeaderCB; + +static void qed_write_header_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + + qemu_vfree(write_header_cb->buf); + gencb_complete(write_header_cb, ret); +} + +static void qed_write_header_read_cb(void *opaque, int ret) +{ + QEDWriteHeaderCB *write_header_cb = opaque; + BDRVQEDState *s = write_header_cb->s; + + if (ret) { + qed_write_header_cb(write_header_cb, ret); + return; + } + + /* Update header */ + qed_header_cpu_to_le(&s->header, (QEDHeader *)write_header_cb->buf); + + bdrv_aio_writev(s->bs->file->bs, 0, &write_header_cb->qiov, + write_header_cb->nsectors, qed_write_header_cb, + write_header_cb); +} + +/** + * Update header in-place (does not rewrite backing filename or other strings) + * + * This function only updates known header fields in-place and does not affect + * extra data after the QED header. + */ +static void qed_write_header(BDRVQEDState *s, BlockCompletionFunc cb, + void *opaque) +{ + /* We must write full sectors for O_DIRECT but cannot necessarily generate + * the data following the header if an unrecognized compat feature is + * active. Therefore, first read the sectors containing the header, update + * them, and write back. + */ + + int nsectors = (sizeof(QEDHeader) + BDRV_SECTOR_SIZE - 1) / + BDRV_SECTOR_SIZE; + size_t len = nsectors * BDRV_SECTOR_SIZE; + QEDWriteHeaderCB *write_header_cb = gencb_alloc(sizeof(*write_header_cb), + cb, opaque); + + write_header_cb->s = s; + write_header_cb->nsectors = nsectors; + write_header_cb->buf = qemu_blockalign(s->bs, len); + write_header_cb->iov.iov_base = write_header_cb->buf; + write_header_cb->iov.iov_len = len; + qemu_iovec_init_external(&write_header_cb->qiov, &write_header_cb->iov, 1); + + bdrv_aio_readv(s->bs->file->bs, 0, &write_header_cb->qiov, nsectors, + qed_write_header_read_cb, write_header_cb); +} + +static uint64_t qed_max_image_size(uint32_t cluster_size, uint32_t table_size) +{ + uint64_t table_entries; + uint64_t l2_size; + + table_entries = (table_size * cluster_size) / sizeof(uint64_t); + l2_size = table_entries * cluster_size; + + return l2_size * table_entries; +} + +static bool qed_is_cluster_size_valid(uint32_t cluster_size) +{ + if (cluster_size < QED_MIN_CLUSTER_SIZE || + cluster_size > QED_MAX_CLUSTER_SIZE) { + return false; + } + if (cluster_size & (cluster_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_table_size_valid(uint32_t table_size) +{ + if (table_size < QED_MIN_TABLE_SIZE || + table_size > QED_MAX_TABLE_SIZE) { + return false; + } + if (table_size & (table_size - 1)) { + return false; /* not power of 2 */ + } + return true; +} + +static bool qed_is_image_size_valid(uint64_t image_size, uint32_t cluster_size, + uint32_t table_size) +{ + if (image_size % BDRV_SECTOR_SIZE != 0) { + return false; /* not multiple of sector size */ + } + if (image_size > qed_max_image_size(cluster_size, table_size)) { + return false; /* image is too large */ + } + return true; +} + +/** + * Read a string of known length from the image file + * + * @file: Image file + * @offset: File offset to start of string, in bytes + * @n: String length in bytes + * @buf: Destination buffer + * @buflen: Destination buffer length in bytes + * @ret: 0 on success, -errno on failure + * + * The string is NUL-terminated. + */ +static int qed_read_string(BlockDriverState *file, uint64_t offset, size_t n, + char *buf, size_t buflen) +{ + int ret; + if (n >= buflen) { + return -EINVAL; + } + ret = bdrv_pread(file, offset, buf, n); + if (ret < 0) { + return ret; + } + buf[n] = '\0'; + return 0; +} + +/** + * Allocate new clusters + * + * @s: QED state + * @n: Number of contiguous clusters to allocate + * @ret: Offset of first allocated cluster + * + * This function only produces the offset where the new clusters should be + * written. It updates BDRVQEDState but does not make any changes to the image + * file. + */ +static uint64_t qed_alloc_clusters(BDRVQEDState *s, unsigned int n) +{ + uint64_t offset = s->file_size; + s->file_size += n * s->header.cluster_size; + return offset; +} + +QEDTable *qed_alloc_table(BDRVQEDState *s) +{ + /* Honor O_DIRECT memory alignment requirements */ + return qemu_blockalign(s->bs, + s->header.cluster_size * s->header.table_size); +} + +/** + * Allocate a new zeroed L2 table + */ +static CachedL2Table *qed_new_l2_table(BDRVQEDState *s) +{ + CachedL2Table *l2_table = qed_alloc_l2_cache_entry(&s->l2_cache); + + l2_table->table = qed_alloc_table(s); + l2_table->offset = qed_alloc_clusters(s, s->header.table_size); + + memset(l2_table->table->offsets, 0, + s->header.cluster_size * s->header.table_size); + return l2_table; +} + +static void qed_aio_next_io(void *opaque, int ret); + +static void qed_plug_allocating_write_reqs(BDRVQEDState *s) +{ + assert(!s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = true; +} + +static void qed_unplug_allocating_write_reqs(BDRVQEDState *s) +{ + QEDAIOCB *acb; + + assert(s->allocating_write_reqs_plugged); + + s->allocating_write_reqs_plugged = false; + + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } +} + +static void qed_finish_clear_need_check(void *opaque, int ret) +{ + /* Do nothing */ +} + +static void qed_flush_after_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + bdrv_aio_flush(s->bs, qed_finish_clear_need_check, s); + + /* No need to wait until flush completes */ + qed_unplug_allocating_write_reqs(s); +} + +static void qed_clear_need_check(void *opaque, int ret) +{ + BDRVQEDState *s = opaque; + + if (ret) { + qed_unplug_allocating_write_reqs(s); + return; + } + + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header(s, qed_flush_after_clear_need_check, s); +} + +static void qed_need_check_timer_cb(void *opaque) +{ + BDRVQEDState *s = opaque; + + /* The timer should only fire when allocating writes have drained */ + assert(!QSIMPLEQ_FIRST(&s->allocating_write_reqs)); + + trace_qed_need_check_timer_cb(s); + + qed_plug_allocating_write_reqs(s); + + /* Ensure writes are on disk before clearing flag */ + bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static void qed_start_need_check_timer(BDRVQEDState *s) +{ + trace_qed_start_need_check_timer(s); + + /* Use QEMU_CLOCK_VIRTUAL so we don't alter the image file while suspended for + * migration. + */ + timer_mod(s->need_check_timer, qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + + get_ticks_per_sec() * QED_NEED_CHECK_TIMEOUT); +} + +/* It's okay to call this multiple times or when no timer is started */ +static void qed_cancel_need_check_timer(BDRVQEDState *s) +{ + trace_qed_cancel_need_check_timer(s); + timer_del(s->need_check_timer); +} + +static void bdrv_qed_detach_aio_context(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + qed_cancel_need_check_timer(s); + timer_free(s->need_check_timer); +} + +static void bdrv_qed_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVQEDState *s = bs->opaque; + + s->need_check_timer = aio_timer_new(new_context, + QEMU_CLOCK_VIRTUAL, SCALE_NS, + qed_need_check_timer_cb, s); + if (s->header.features & QED_F_NEED_CHECK) { + qed_start_need_check_timer(s); + } +} + +static void bdrv_qed_drain(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + /* Cancel timer and start doing I/O that were meant to happen as if it + * fired, that way we get bdrv_drain() taking care of the ongoing requests + * correctly. */ + qed_cancel_need_check_timer(s); + qed_plug_allocating_write_reqs(s); + bdrv_aio_flush(s->bs, qed_clear_need_check, s); +} + +static int bdrv_qed_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader le_header; + int64_t file_size; + int ret; + + s->bs = bs; + QSIMPLEQ_INIT(&s->allocating_write_reqs); + + ret = bdrv_pread(bs->file->bs, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + return ret; + } + qed_header_le_to_cpu(&le_header, &s->header); + + if (s->header.magic != QED_MAGIC) { + error_setg(errp, "Image not in QED format"); + return -EINVAL; + } + if (s->header.features & ~QED_FEATURE_MASK) { + /* image uses unsupported feature bits */ + char buf[64]; + snprintf(buf, sizeof(buf), "%" PRIx64, + s->header.features & ~QED_FEATURE_MASK); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "QED", buf); + return -ENOTSUP; + } + if (!qed_is_cluster_size_valid(s->header.cluster_size)) { + return -EINVAL; + } + + /* Round down file size to the last cluster */ + file_size = bdrv_getlength(bs->file->bs); + if (file_size < 0) { + return file_size; + } + s->file_size = qed_start_of_cluster(s, file_size); + + if (!qed_is_table_size_valid(s->header.table_size)) { + return -EINVAL; + } + if (!qed_is_image_size_valid(s->header.image_size, + s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + if (!qed_check_table_offset(s, s->header.l1_table_offset)) { + return -EINVAL; + } + + s->table_nelems = (s->header.cluster_size * s->header.table_size) / + sizeof(uint64_t); + s->l2_shift = ctz32(s->header.cluster_size); + s->l2_mask = s->table_nelems - 1; + s->l1_shift = s->l2_shift + ctz32(s->table_nelems); + + /* Header size calculation must not overflow uint32_t */ + if (s->header.header_size > UINT32_MAX / s->header.cluster_size) { + return -EINVAL; + } + + if ((s->header.features & QED_F_BACKING_FILE)) { + if ((uint64_t)s->header.backing_filename_offset + + s->header.backing_filename_size > + s->header.cluster_size * s->header.header_size) { + return -EINVAL; + } + + ret = qed_read_string(bs->file->bs, s->header.backing_filename_offset, + s->header.backing_filename_size, bs->backing_file, + sizeof(bs->backing_file)); + if (ret < 0) { + return ret; + } + + if (s->header.features & QED_F_BACKING_FORMAT_NO_PROBE) { + pstrcpy(bs->backing_format, sizeof(bs->backing_format), "raw"); + } + } + + /* Reset unknown autoclear feature bits. This is a backwards + * compatibility mechanism that allows images to be opened by older + * programs, which "knock out" unknown feature bits. When an image is + * opened by a newer program again it can detect that the autoclear + * feature is no longer valid. + */ + if ((s->header.autoclear_features & ~QED_AUTOCLEAR_FEATURE_MASK) != 0 && + !bdrv_is_read_only(bs->file->bs) && !(flags & BDRV_O_INCOMING)) { + s->header.autoclear_features &= QED_AUTOCLEAR_FEATURE_MASK; + + ret = qed_write_header_sync(s); + if (ret) { + return ret; + } + + /* From here on only known autoclear feature bits are valid */ + bdrv_flush(bs->file->bs); + } + + s->l1_table = qed_alloc_table(s); + qed_init_l2_cache(&s->l2_cache); + + ret = qed_read_l1_table_sync(s); + if (ret) { + goto out; + } + + /* If image was not closed cleanly, check consistency */ + if (!(flags & BDRV_O_CHECK) && (s->header.features & QED_F_NEED_CHECK)) { + /* Read-only images cannot be fixed. There is no risk of corruption + * since write operations are not possible. Therefore, allow + * potentially inconsistent images to be opened read-only. This can + * aid data recovery from an otherwise inconsistent image. + */ + if (!bdrv_is_read_only(bs->file->bs) && + !(flags & BDRV_O_INCOMING)) { + BdrvCheckResult result = {0}; + + ret = qed_check(s, &result, true); + if (ret) { + goto out; + } + } + } + + bdrv_qed_attach_aio_context(bs, bdrv_get_aio_context(bs)); + +out: + if (ret) { + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); + } + return ret; +} + +static void bdrv_qed_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVQEDState *s = bs->opaque; + + bs->bl.write_zeroes_alignment = s->header.cluster_size >> BDRV_SECTOR_BITS; +} + +/* We have nothing to do for QED reopen, stubs just return + * success */ +static int bdrv_qed_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static void bdrv_qed_close(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + + bdrv_qed_detach_aio_context(bs); + + /* Ensure writes reach stable storage */ + bdrv_flush(bs->file->bs); + + /* Clean shutdown, no check required on next open */ + if (s->header.features & QED_F_NEED_CHECK) { + s->header.features &= ~QED_F_NEED_CHECK; + qed_write_header_sync(s); + } + + qed_free_l2_cache(&s->l2_cache); + qemu_vfree(s->l1_table); +} + +static int qed_create(const char *filename, uint32_t cluster_size, + uint64_t image_size, uint32_t table_size, + const char *backing_file, const char *backing_fmt, + QemuOpts *opts, Error **errp) +{ + QEDHeader header = { + .magic = QED_MAGIC, + .cluster_size = cluster_size, + .table_size = table_size, + .header_size = 1, + .features = 0, + .compat_features = 0, + .l1_table_offset = cluster_size, + .image_size = image_size, + }; + QEDHeader le_header; + uint8_t *l1_table = NULL; + size_t l1_size = header.cluster_size * header.table_size; + Error *local_err = NULL; + int ret = 0; + BlockDriverState *bs; + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_PROTOCOL, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + return ret; + } + + /* File must start empty and grow, check truncate is supported */ + ret = bdrv_truncate(bs, 0); + if (ret < 0) { + goto out; + } + + if (backing_file) { + header.features |= QED_F_BACKING_FILE; + header.backing_filename_offset = sizeof(le_header); + header.backing_filename_size = strlen(backing_file); + + if (qed_fmt_is_raw(backing_fmt)) { + header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + qed_header_cpu_to_le(&header, &le_header); + ret = bdrv_pwrite(bs, 0, &le_header, sizeof(le_header)); + if (ret < 0) { + goto out; + } + ret = bdrv_pwrite(bs, sizeof(le_header), backing_file, + header.backing_filename_size); + if (ret < 0) { + goto out; + } + + l1_table = g_malloc0(l1_size); + ret = bdrv_pwrite(bs, header.l1_table_offset, l1_table, l1_size); + if (ret < 0) { + goto out; + } + + ret = 0; /* success */ +out: + g_free(l1_table); + bdrv_unref(bs); + return ret; +} + +static int bdrv_qed_create(const char *filename, QemuOpts *opts, Error **errp) +{ + uint64_t image_size = 0; + uint32_t cluster_size = QED_DEFAULT_CLUSTER_SIZE; + uint32_t table_size = QED_DEFAULT_TABLE_SIZE; + char *backing_file = NULL; + char *backing_fmt = NULL; + int ret; + + image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + backing_fmt = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FMT); + cluster_size = qemu_opt_get_size_del(opts, + BLOCK_OPT_CLUSTER_SIZE, + QED_DEFAULT_CLUSTER_SIZE); + table_size = qemu_opt_get_size_del(opts, BLOCK_OPT_TABLE_SIZE, + QED_DEFAULT_TABLE_SIZE); + + if (!qed_is_cluster_size_valid(cluster_size)) { + error_setg(errp, "QED cluster size must be within range [%u, %u] " + "and power of 2", + QED_MIN_CLUSTER_SIZE, QED_MAX_CLUSTER_SIZE); + ret = -EINVAL; + goto finish; + } + if (!qed_is_table_size_valid(table_size)) { + error_setg(errp, "QED table size must be within range [%u, %u] " + "and power of 2", + QED_MIN_TABLE_SIZE, QED_MAX_TABLE_SIZE); + ret = -EINVAL; + goto finish; + } + if (!qed_is_image_size_valid(image_size, cluster_size, table_size)) { + error_setg(errp, "QED image size must be a non-zero multiple of " + "cluster size and less than %" PRIu64 " bytes", + qed_max_image_size(cluster_size, table_size)); + ret = -EINVAL; + goto finish; + } + + ret = qed_create(filename, cluster_size, image_size, table_size, + backing_file, backing_fmt, opts, errp); + +finish: + g_free(backing_file); + g_free(backing_fmt); + return ret; +} + +typedef struct { + BlockDriverState *bs; + Coroutine *co; + uint64_t pos; + int64_t status; + int *pnum; +} QEDIsAllocatedCB; + +static void qed_is_allocated_cb(void *opaque, int ret, uint64_t offset, size_t len) +{ + QEDIsAllocatedCB *cb = opaque; + BDRVQEDState *s = cb->bs->opaque; + *cb->pnum = len / BDRV_SECTOR_SIZE; + switch (ret) { + case QED_CLUSTER_FOUND: + offset |= qed_offset_into_cluster(s, cb->pos); + cb->status = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + break; + case QED_CLUSTER_ZERO: + cb->status = BDRV_BLOCK_ZERO; + break; + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + cb->status = 0; + break; + default: + assert(ret < 0); + cb->status = ret; + break; + } + + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int64_t coroutine_fn bdrv_qed_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + BDRVQEDState *s = bs->opaque; + size_t len = (size_t)nb_sectors * BDRV_SECTOR_SIZE; + QEDIsAllocatedCB cb = { + .bs = bs, + .pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE, + .status = BDRV_BLOCK_OFFSET_MASK, + .pnum = pnum, + }; + QEDRequest request = { .l2_table = NULL }; + + qed_find_cluster(s, &request, cb.pos, len, qed_is_allocated_cb, &cb); + + /* Now sleep if the callback wasn't invoked immediately */ + while (cb.status == BDRV_BLOCK_OFFSET_MASK) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + + qed_unref_l2_cache_entry(request.l2_table); + + return cb.status; +} + +static BDRVQEDState *acb_to_s(QEDAIOCB *acb) +{ + return acb->common.bs->opaque; +} + +/** + * Read from the backing file or zero-fill if no backing file + * + * @s: QED state + * @pos: Byte position in device + * @qiov: Destination I/O vector + * @backing_qiov: Possibly shortened copy of qiov, to be allocated here + * @cb: Completion function + * @opaque: User data for completion function + * + * This function reads qiov->size bytes starting at pos from the backing file. + * If there is no backing file then zeroes are read. + */ +static void qed_read_backing_file(BDRVQEDState *s, uint64_t pos, + QEMUIOVector *qiov, + QEMUIOVector **backing_qiov, + BlockCompletionFunc *cb, void *opaque) +{ + uint64_t backing_length = 0; + size_t size; + + /* If there is a backing file, get its length. Treat the absence of a + * backing file like a zero length backing file. + */ + if (s->bs->backing) { + int64_t l = bdrv_getlength(s->bs->backing->bs); + if (l < 0) { + cb(opaque, l); + return; + } + backing_length = l; + } + + /* Zero all sectors if reading beyond the end of the backing file */ + if (pos >= backing_length || + pos + qiov->size > backing_length) { + qemu_iovec_memset(qiov, 0, 0, qiov->size); + } + + /* Complete now if there are no backing file sectors to read */ + if (pos >= backing_length) { + cb(opaque, 0); + return; + } + + /* If the read straddles the end of the backing file, shorten it */ + size = MIN((uint64_t)backing_length - pos, qiov->size); + + assert(*backing_qiov == NULL); + *backing_qiov = g_new(QEMUIOVector, 1); + qemu_iovec_init(*backing_qiov, qiov->niov); + qemu_iovec_concat(*backing_qiov, qiov, 0, size); + + BLKDBG_EVENT(s->bs->file, BLKDBG_READ_BACKING_AIO); + bdrv_aio_readv(s->bs->backing->bs, pos / BDRV_SECTOR_SIZE, + *backing_qiov, size / BDRV_SECTOR_SIZE, cb, opaque); +} + +typedef struct { + GenericCB gencb; + BDRVQEDState *s; + QEMUIOVector qiov; + QEMUIOVector *backing_qiov; + struct iovec iov; + uint64_t offset; +} CopyFromBackingFileCB; + +static void qed_copy_from_backing_file_cb(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + qemu_vfree(copy_cb->iov.iov_base); + gencb_complete(©_cb->gencb, ret); +} + +static void qed_copy_from_backing_file_write(void *opaque, int ret) +{ + CopyFromBackingFileCB *copy_cb = opaque; + BDRVQEDState *s = copy_cb->s; + + if (copy_cb->backing_qiov) { + qemu_iovec_destroy(copy_cb->backing_qiov); + g_free(copy_cb->backing_qiov); + copy_cb->backing_qiov = NULL; + } + + if (ret) { + qed_copy_from_backing_file_cb(copy_cb, ret); + return; + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_COW_WRITE); + bdrv_aio_writev(s->bs->file->bs, copy_cb->offset / BDRV_SECTOR_SIZE, + ©_cb->qiov, copy_cb->qiov.size / BDRV_SECTOR_SIZE, + qed_copy_from_backing_file_cb, copy_cb); +} + +/** + * Copy data from backing file into the image + * + * @s: QED state + * @pos: Byte position in device + * @len: Number of bytes + * @offset: Byte offset in image file + * @cb: Completion function + * @opaque: User data for completion function + */ +static void qed_copy_from_backing_file(BDRVQEDState *s, uint64_t pos, + uint64_t len, uint64_t offset, + BlockCompletionFunc *cb, + void *opaque) +{ + CopyFromBackingFileCB *copy_cb; + + /* Skip copy entirely if there is no work to do */ + if (len == 0) { + cb(opaque, 0); + return; + } + + copy_cb = gencb_alloc(sizeof(*copy_cb), cb, opaque); + copy_cb->s = s; + copy_cb->offset = offset; + copy_cb->backing_qiov = NULL; + copy_cb->iov.iov_base = qemu_blockalign(s->bs, len); + copy_cb->iov.iov_len = len; + qemu_iovec_init_external(©_cb->qiov, ©_cb->iov, 1); + + qed_read_backing_file(s, pos, ©_cb->qiov, ©_cb->backing_qiov, + qed_copy_from_backing_file_write, copy_cb); +} + +/** + * Link one or more contiguous clusters into a table + * + * @s: QED state + * @table: L2 table + * @index: First cluster index + * @n: Number of contiguous clusters + * @cluster: First cluster offset + * + * The cluster offset may be an allocated byte offset in the image file, the + * zero cluster marker, or the unallocated cluster marker. + */ +static void qed_update_l2_table(BDRVQEDState *s, QEDTable *table, int index, + unsigned int n, uint64_t cluster) +{ + int i; + for (i = index; i < index + n; i++) { + table->offsets[i] = cluster; + if (!qed_offset_is_unalloc_cluster(cluster) && + !qed_offset_is_zero_cluster(cluster)) { + cluster += s->header.cluster_size; + } + } +} + +static void qed_aio_complete_bh(void *opaque) +{ + QEDAIOCB *acb = opaque; + BlockCompletionFunc *cb = acb->common.cb; + void *user_opaque = acb->common.opaque; + int ret = acb->bh_ret; + + qemu_bh_delete(acb->bh); + qemu_aio_unref(acb); + + /* Invoke callback */ + cb(user_opaque, ret); +} + +static void qed_aio_complete(QEDAIOCB *acb, int ret) +{ + BDRVQEDState *s = acb_to_s(acb); + + trace_qed_aio_complete(s, acb, ret); + + /* Free resources */ + qemu_iovec_destroy(&acb->cur_qiov); + qed_unref_l2_cache_entry(acb->request.l2_table); + + /* Free the buffer we may have allocated for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + qemu_vfree(acb->qiov->iov[0].iov_base); + acb->qiov->iov[0].iov_base = NULL; + } + + /* Arrange for a bh to invoke the completion function */ + acb->bh_ret = ret; + acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), + qed_aio_complete_bh, acb); + qemu_bh_schedule(acb->bh); + + /* Start next allocating write request waiting behind this one. Note that + * requests enqueue themselves when they first hit an unallocated cluster + * but they wait until the entire request is finished before waking up the + * next request in the queue. This ensures that we don't cycle through + * requests multiple times but rather finish one at a time completely. + */ + if (acb == QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_REMOVE_HEAD(&s->allocating_write_reqs, next); + acb = QSIMPLEQ_FIRST(&s->allocating_write_reqs); + if (acb) { + qed_aio_next_io(acb, 0); + } else if (s->header.features & QED_F_NEED_CHECK) { + qed_start_need_check_timer(s); + } + } +} + +/** + * Commit the current L2 table to the cache + */ +static void qed_commit_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + CachedL2Table *l2_table = acb->request.l2_table; + uint64_t l2_offset = l2_table->offset; + + qed_commit_l2_cache_entry(&s->l2_cache, l2_table); + + /* This is guaranteed to succeed because we just committed the entry to the + * cache. + */ + acb->request.l2_table = qed_find_l2_cache_entry(&s->l2_cache, l2_offset); + assert(acb->request.l2_table != NULL); + + qed_aio_next_io(opaque, ret); +} + +/** + * Update L1 table with new L2 table offset and write it out + */ +static void qed_aio_write_l1_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + int index; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + index = qed_l1_index(s, acb->cur_pos); + s->l1_table->offsets[index] = acb->request.l2_table->offset; + + qed_write_l1_table(s, index, 1, qed_commit_l2_update, acb); +} + +/** + * Update L2 table with new cluster offsets and write them out + */ +static void qed_aio_write_l2_update(QEDAIOCB *acb, int ret, uint64_t offset) +{ + BDRVQEDState *s = acb_to_s(acb); + bool need_alloc = acb->find_cluster_ret == QED_CLUSTER_L1; + int index; + + if (ret) { + goto err; + } + + if (need_alloc) { + qed_unref_l2_cache_entry(acb->request.l2_table); + acb->request.l2_table = qed_new_l2_table(s); + } + + index = qed_l2_index(s, acb->cur_pos); + qed_update_l2_table(s, acb->request.l2_table->table, index, acb->cur_nclusters, + offset); + + if (need_alloc) { + /* Write out the whole new L2 table */ + qed_write_l2_table(s, &acb->request, 0, s->table_nelems, true, + qed_aio_write_l1_update, acb); + } else { + /* Write out only the updated part of the L2 table */ + qed_write_l2_table(s, &acb->request, index, acb->cur_nclusters, false, + qed_aio_next_io, acb); + } + return; + +err: + qed_aio_complete(acb, ret); +} + +static void qed_aio_write_l2_update_cb(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + qed_aio_write_l2_update(acb, ret, acb->cur_cluster); +} + +/** + * Flush new data clusters before updating the L2 table + * + * This flush is necessary when a backing file is in use. A crash during an + * allocating write could result in empty clusters in the image. If the write + * only touched a subregion of the cluster, then backing image sectors have + * been lost in the untouched region. The solution is to flush after writing a + * new data cluster and before updating the L2 table. + */ +static void qed_aio_write_flush_before_l2_update(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + + if (!bdrv_aio_flush(s->bs->file->bs, qed_aio_write_l2_update_cb, opaque)) { + qed_aio_complete(acb, -EIO); + } +} + +/** + * Write data to the image file + */ +static void qed_aio_write_main(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos); + BlockCompletionFunc *next_fn; + + trace_qed_aio_write_main(s, acb, ret, offset, acb->cur_qiov.size); + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + if (acb->find_cluster_ret == QED_CLUSTER_FOUND) { + next_fn = qed_aio_next_io; + } else { + if (s->bs->backing) { + next_fn = qed_aio_write_flush_before_l2_update; + } else { + next_fn = qed_aio_write_l2_update_cb; + } + } + + BLKDBG_EVENT(s->bs->file, BLKDBG_WRITE_AIO); + bdrv_aio_writev(s->bs->file->bs, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + next_fn, acb); +} + +/** + * Populate back untouched region of new data cluster + */ +static void qed_aio_write_postfill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = acb->cur_pos + acb->cur_qiov.size; + uint64_t len = + qed_start_of_cluster(s, start + s->header.cluster_size - 1) - start; + uint64_t offset = acb->cur_cluster + + qed_offset_into_cluster(s, acb->cur_pos) + + acb->cur_qiov.size; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + trace_qed_aio_write_postfill(s, acb, start, len, offset); + qed_copy_from_backing_file(s, start, len, offset, + qed_aio_write_main, acb); +} + +/** + * Populate front untouched region of new data cluster + */ +static void qed_aio_write_prefill(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + uint64_t start = qed_start_of_cluster(s, acb->cur_pos); + uint64_t len = qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_write_prefill(s, acb, start, len, acb->cur_cluster); + qed_copy_from_backing_file(s, start, len, acb->cur_cluster, + qed_aio_write_postfill, acb); +} + +/** + * Check if the QED_F_NEED_CHECK bit should be set during allocating write + */ +static bool qed_should_set_need_check(BDRVQEDState *s) +{ + /* The flush before L2 update path ensures consistency */ + if (s->bs->backing) { + return false; + } + + return !(s->header.features & QED_F_NEED_CHECK); +} + +static void qed_aio_write_zero_cluster(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + qed_aio_write_l2_update(acb, 0, 1); +} + +/** + * Write new data cluster + * + * @acb: Write request + * @len: Length in bytes + * + * This path is taken when writing to previously unallocated clusters. + */ +static void qed_aio_write_alloc(QEDAIOCB *acb, size_t len) +{ + BDRVQEDState *s = acb_to_s(acb); + BlockCompletionFunc *cb; + + /* Cancel timer when the first allocating request comes in */ + if (QSIMPLEQ_EMPTY(&s->allocating_write_reqs)) { + qed_cancel_need_check_timer(s); + } + + /* Freeze this request if another allocating write is in progress */ + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs)) { + QSIMPLEQ_INSERT_TAIL(&s->allocating_write_reqs, acb, next); + } + if (acb != QSIMPLEQ_FIRST(&s->allocating_write_reqs) || + s->allocating_write_reqs_plugged) { + return; /* wait for existing request to finish */ + } + + acb->cur_nclusters = qed_bytes_to_clusters(s, + qed_offset_into_cluster(s, acb->cur_pos) + len); + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + if (acb->flags & QED_AIOCB_ZERO) { + /* Skip ahead if the clusters are already zero */ + if (acb->find_cluster_ret == QED_CLUSTER_ZERO) { + qed_aio_next_io(acb, 0); + return; + } + + cb = qed_aio_write_zero_cluster; + } else { + cb = qed_aio_write_prefill; + acb->cur_cluster = qed_alloc_clusters(s, acb->cur_nclusters); + } + + if (qed_should_set_need_check(s)) { + s->header.features |= QED_F_NEED_CHECK; + qed_write_header(s, cb, acb); + } else { + cb(acb, 0); + } +} + +/** + * Write data cluster in place + * + * @acb: Write request + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * This path is taken when writing to already allocated clusters. + */ +static void qed_aio_write_inplace(QEDAIOCB *acb, uint64_t offset, size_t len) +{ + /* Allocate buffer for zero writes */ + if (acb->flags & QED_AIOCB_ZERO) { + struct iovec *iov = acb->qiov->iov; + + if (!iov->iov_base) { + iov->iov_base = qemu_try_blockalign(acb->common.bs, iov->iov_len); + if (iov->iov_base == NULL) { + qed_aio_complete(acb, -ENOMEM); + return; + } + memset(iov->iov_base, 0, iov->iov_len); + } + } + + /* Calculate the I/O vector */ + acb->cur_cluster = offset; + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Do the actual write */ + qed_aio_write_main(acb, 0); +} + +/** + * Write data cluster + * + * @opaque: Write request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_write_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + + trace_qed_aio_write_data(acb_to_s(acb), acb, ret, offset, len); + + acb->find_cluster_ret = ret; + + switch (ret) { + case QED_CLUSTER_FOUND: + qed_aio_write_inplace(acb, offset, len); + break; + + case QED_CLUSTER_L2: + case QED_CLUSTER_L1: + case QED_CLUSTER_ZERO: + qed_aio_write_alloc(acb, len); + break; + + default: + qed_aio_complete(acb, ret); + break; + } +} + +/** + * Read data cluster + * + * @opaque: Read request + * @ret: QED_CLUSTER_FOUND, QED_CLUSTER_L2, QED_CLUSTER_L1, + * or -errno + * @offset: Cluster offset in bytes + * @len: Length in bytes + * + * Callback from qed_find_cluster(). + */ +static void qed_aio_read_data(void *opaque, int ret, + uint64_t offset, size_t len) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + BlockDriverState *bs = acb->common.bs; + + /* Adjust offset into cluster */ + offset += qed_offset_into_cluster(s, acb->cur_pos); + + trace_qed_aio_read_data(s, acb, ret, offset, len); + + if (ret < 0) { + goto err; + } + + qemu_iovec_concat(&acb->cur_qiov, acb->qiov, acb->qiov_offset, len); + + /* Handle zero cluster and backing file reads */ + if (ret == QED_CLUSTER_ZERO) { + qemu_iovec_memset(&acb->cur_qiov, 0, 0, acb->cur_qiov.size); + qed_aio_next_io(acb, 0); + return; + } else if (ret != QED_CLUSTER_FOUND) { + qed_read_backing_file(s, acb->cur_pos, &acb->cur_qiov, + &acb->backing_qiov, qed_aio_next_io, acb); + return; + } + + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + bdrv_aio_readv(bs->file->bs, offset / BDRV_SECTOR_SIZE, + &acb->cur_qiov, acb->cur_qiov.size / BDRV_SECTOR_SIZE, + qed_aio_next_io, acb); + return; + +err: + qed_aio_complete(acb, ret); +} + +/** + * Begin next I/O or complete the request + */ +static void qed_aio_next_io(void *opaque, int ret) +{ + QEDAIOCB *acb = opaque; + BDRVQEDState *s = acb_to_s(acb); + QEDFindClusterFunc *io_fn = (acb->flags & QED_AIOCB_WRITE) ? + qed_aio_write_data : qed_aio_read_data; + + trace_qed_aio_next_io(s, acb, ret, acb->cur_pos + acb->cur_qiov.size); + + if (acb->backing_qiov) { + qemu_iovec_destroy(acb->backing_qiov); + g_free(acb->backing_qiov); + acb->backing_qiov = NULL; + } + + /* Handle I/O error */ + if (ret) { + qed_aio_complete(acb, ret); + return; + } + + acb->qiov_offset += acb->cur_qiov.size; + acb->cur_pos += acb->cur_qiov.size; + qemu_iovec_reset(&acb->cur_qiov); + + /* Complete request */ + if (acb->cur_pos >= acb->end_pos) { + qed_aio_complete(acb, 0); + return; + } + + /* Find next cluster and start I/O */ + qed_find_cluster(s, &acb->request, + acb->cur_pos, acb->end_pos - acb->cur_pos, + io_fn, acb); +} + +static BlockAIOCB *qed_aio_setup(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, int flags) +{ + QEDAIOCB *acb = qemu_aio_get(&qed_aiocb_info, bs, cb, opaque); + + trace_qed_aio_setup(bs->opaque, acb, sector_num, nb_sectors, + opaque, flags); + + acb->flags = flags; + acb->qiov = qiov; + acb->qiov_offset = 0; + acb->cur_pos = (uint64_t)sector_num * BDRV_SECTOR_SIZE; + acb->end_pos = acb->cur_pos + nb_sectors * BDRV_SECTOR_SIZE; + acb->backing_qiov = NULL; + acb->request.l2_table = NULL; + qemu_iovec_init(&acb->cur_qiov, qiov->niov); + + /* Start request */ + qed_aio_next_io(acb, 0); + return &acb->common; +} + +static BlockAIOCB *bdrv_qed_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, opaque, 0); +} + +static BlockAIOCB *bdrv_qed_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return qed_aio_setup(bs, sector_num, qiov, nb_sectors, cb, + opaque, QED_AIOCB_WRITE); +} + +typedef struct { + Coroutine *co; + int ret; + bool done; +} QEDWriteZeroesCB; + +static void coroutine_fn qed_co_write_zeroes_cb(void *opaque, int ret) +{ + QEDWriteZeroesCB *cb = opaque; + + cb->done = true; + cb->ret = ret; + if (cb->co) { + qemu_coroutine_enter(cb->co, NULL); + } +} + +static int coroutine_fn bdrv_qed_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, + BdrvRequestFlags flags) +{ + BlockAIOCB *blockacb; + BDRVQEDState *s = bs->opaque; + QEDWriteZeroesCB cb = { .done = false }; + QEMUIOVector qiov; + struct iovec iov; + + /* Refuse if there are untouched backing file sectors */ + if (bs->backing) { + if (qed_offset_into_cluster(s, sector_num * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + if (qed_offset_into_cluster(s, nb_sectors * BDRV_SECTOR_SIZE) != 0) { + return -ENOTSUP; + } + } + + /* Zero writes start without an I/O buffer. If a buffer becomes necessary + * then it will be allocated during request processing. + */ + iov.iov_base = NULL, + iov.iov_len = nb_sectors * BDRV_SECTOR_SIZE, + + qemu_iovec_init_external(&qiov, &iov, 1); + blockacb = qed_aio_setup(bs, sector_num, &qiov, nb_sectors, + qed_co_write_zeroes_cb, &cb, + QED_AIOCB_WRITE | QED_AIOCB_ZERO); + if (!blockacb) { + return -EIO; + } + if (!cb.done) { + cb.co = qemu_coroutine_self(); + qemu_coroutine_yield(); + } + assert(cb.done); + return cb.ret; +} + +static int bdrv_qed_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVQEDState *s = bs->opaque; + uint64_t old_image_size; + int ret; + + if (!qed_is_image_size_valid(offset, s->header.cluster_size, + s->header.table_size)) { + return -EINVAL; + } + + /* Shrinking is currently not supported */ + if ((uint64_t)offset < s->header.image_size) { + return -ENOTSUP; + } + + old_image_size = s->header.image_size; + s->header.image_size = offset; + ret = qed_write_header_sync(s); + if (ret < 0) { + s->header.image_size = old_image_size; + } + return ret; +} + +static int64_t bdrv_qed_getlength(BlockDriverState *bs) +{ + BDRVQEDState *s = bs->opaque; + return s->header.image_size; +} + +static int bdrv_qed_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVQEDState *s = bs->opaque; + + memset(bdi, 0, sizeof(*bdi)); + bdi->cluster_size = s->header.cluster_size; + bdi->is_dirty = s->header.features & QED_F_NEED_CHECK; + bdi->unallocated_blocks_are_zero = true; + bdi->can_write_zeroes_with_unmap = true; + return 0; +} + +static int bdrv_qed_change_backing_file(BlockDriverState *bs, + const char *backing_file, + const char *backing_fmt) +{ + BDRVQEDState *s = bs->opaque; + QEDHeader new_header, le_header; + void *buffer; + size_t buffer_len, backing_file_len; + int ret; + + /* Refuse to set backing filename if unknown compat feature bits are + * active. If the image uses an unknown compat feature then we may not + * know the layout of data following the header structure and cannot safely + * add a new string. + */ + if (backing_file && (s->header.compat_features & + ~QED_COMPAT_FEATURE_MASK)) { + return -ENOTSUP; + } + + memcpy(&new_header, &s->header, sizeof(new_header)); + + new_header.features &= ~(QED_F_BACKING_FILE | + QED_F_BACKING_FORMAT_NO_PROBE); + + /* Adjust feature flags */ + if (backing_file) { + new_header.features |= QED_F_BACKING_FILE; + + if (qed_fmt_is_raw(backing_fmt)) { + new_header.features |= QED_F_BACKING_FORMAT_NO_PROBE; + } + } + + /* Calculate new header size */ + backing_file_len = 0; + + if (backing_file) { + backing_file_len = strlen(backing_file); + } + + buffer_len = sizeof(new_header); + new_header.backing_filename_offset = buffer_len; + new_header.backing_filename_size = backing_file_len; + buffer_len += backing_file_len; + + /* Make sure we can rewrite header without failing */ + if (buffer_len > new_header.header_size * new_header.cluster_size) { + return -ENOSPC; + } + + /* Prepare new header */ + buffer = g_malloc(buffer_len); + + qed_header_cpu_to_le(&new_header, &le_header); + memcpy(buffer, &le_header, sizeof(le_header)); + buffer_len = sizeof(le_header); + + if (backing_file) { + memcpy(buffer + buffer_len, backing_file, backing_file_len); + buffer_len += backing_file_len; + } + + /* Write new header */ + ret = bdrv_pwrite_sync(bs->file->bs, 0, buffer, buffer_len); + g_free(buffer); + if (ret == 0) { + memcpy(&s->header, &new_header, sizeof(new_header)); + } + return ret; +} + +static void bdrv_qed_invalidate_cache(BlockDriverState *bs, Error **errp) +{ + BDRVQEDState *s = bs->opaque; + Error *local_err = NULL; + int ret; + + bdrv_qed_close(bs); + + bdrv_invalidate_cache(bs->file->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + + memset(s, 0, sizeof(BDRVQEDState)); + ret = bdrv_qed_open(bs, NULL, bs->open_flags, &local_err); + if (local_err) { + error_setg(errp, "Could not reopen qed layer: %s", + error_get_pretty(local_err)); + error_free(local_err); + return; + } else if (ret < 0) { + error_setg_errno(errp, -ret, "Could not reopen qed layer"); + return; + } +} + +static int bdrv_qed_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVQEDState *s = bs->opaque; + + return qed_check(s, result, !!fix); +} + +static QemuOptsList qed_create_opts = { + .name = "qed-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qed_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_BACKING_FMT, + .type = QEMU_OPT_STRING, + .help = "Image format of the base image" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Cluster size (in bytes)", + .def_value_str = stringify(QED_DEFAULT_CLUSTER_SIZE) + }, + { + .name = BLOCK_OPT_TABLE_SIZE, + .type = QEMU_OPT_SIZE, + .help = "L1/L2 table size (in clusters)" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_qed = { + .format_name = "qed", + .instance_size = sizeof(BDRVQEDState), + .create_opts = &qed_create_opts, + .supports_backing = true, + + .bdrv_probe = bdrv_qed_probe, + .bdrv_open = bdrv_qed_open, + .bdrv_close = bdrv_qed_close, + .bdrv_reopen_prepare = bdrv_qed_reopen_prepare, + .bdrv_create = bdrv_qed_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_get_block_status = bdrv_qed_co_get_block_status, + .bdrv_aio_readv = bdrv_qed_aio_readv, + .bdrv_aio_writev = bdrv_qed_aio_writev, + .bdrv_co_write_zeroes = bdrv_qed_co_write_zeroes, + .bdrv_truncate = bdrv_qed_truncate, + .bdrv_getlength = bdrv_qed_getlength, + .bdrv_get_info = bdrv_qed_get_info, + .bdrv_refresh_limits = bdrv_qed_refresh_limits, + .bdrv_change_backing_file = bdrv_qed_change_backing_file, + .bdrv_invalidate_cache = bdrv_qed_invalidate_cache, + .bdrv_check = bdrv_qed_check, + .bdrv_detach_aio_context = bdrv_qed_detach_aio_context, + .bdrv_attach_aio_context = bdrv_qed_attach_aio_context, + .bdrv_drain = bdrv_qed_drain, +}; + +static void bdrv_qed_init(void) +{ + bdrv_register(&bdrv_qed); +} + +block_init(bdrv_qed_init); diff --git a/src/block/qed.h b/src/block/qed.h new file mode 100644 index 0000000..615e676 --- /dev/null +++ b/src/block/qed.h @@ -0,0 +1,343 @@ +/* + * QEMU Enhanced Disk Format + * + * Copyright IBM, Corp. 2010 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_QED_H +#define BLOCK_QED_H + +#include "block/block_int.h" + +/* The layout of a QED file is as follows: + * + * +--------+----------+----------+----------+-----+ + * | header | L1 table | cluster0 | cluster1 | ... | + * +--------+----------+----------+----------+-----+ + * + * There is a 2-level pagetable for cluster allocation: + * + * +----------+ + * | L1 table | + * +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | L2 table | ... | L2 table | + * +----------+ +----------+ + * ,------' | '------. + * +----------+ | +----------+ + * | Data | ... | Data | + * +----------+ +----------+ + * + * The L1 table is fixed size and always present. L2 tables are allocated on + * demand. The L1 table size determines the maximum possible image size; it + * can be influenced using the cluster_size and table_size values. + * + * All fields are little-endian on disk. + */ +#define QED_DEFAULT_CLUSTER_SIZE 65536 +enum { + QED_MAGIC = 'Q' | 'E' << 8 | 'D' << 16 | '\0' << 24, + + /* The image supports a backing file */ + QED_F_BACKING_FILE = 0x01, + + /* The image needs a consistency check before use */ + QED_F_NEED_CHECK = 0x02, + + /* The backing file format must not be probed, treat as raw image */ + QED_F_BACKING_FORMAT_NO_PROBE = 0x04, + + /* Feature bits must be used when the on-disk format changes */ + QED_FEATURE_MASK = QED_F_BACKING_FILE | /* supported feature bits */ + QED_F_NEED_CHECK | + QED_F_BACKING_FORMAT_NO_PROBE, + QED_COMPAT_FEATURE_MASK = 0, /* supported compat feature bits */ + QED_AUTOCLEAR_FEATURE_MASK = 0, /* supported autoclear feature bits */ + + /* Data is stored in groups of sectors called clusters. Cluster size must + * be large to avoid keeping too much metadata. I/O requests that have + * sub-cluster size will require read-modify-write. + */ + QED_MIN_CLUSTER_SIZE = 4 * 1024, /* in bytes */ + QED_MAX_CLUSTER_SIZE = 64 * 1024 * 1024, + + /* Allocated clusters are tracked using a 2-level pagetable. Table size is + * a multiple of clusters so large maximum image sizes can be supported + * without jacking up the cluster size too much. + */ + QED_MIN_TABLE_SIZE = 1, /* in clusters */ + QED_MAX_TABLE_SIZE = 16, + QED_DEFAULT_TABLE_SIZE = 4, + + /* Delay to flush and clean image after last allocating write completes */ + QED_NEED_CHECK_TIMEOUT = 5, /* in seconds */ +}; + +typedef struct { + uint32_t magic; /* QED\0 */ + + uint32_t cluster_size; /* in bytes */ + uint32_t table_size; /* for L1 and L2 tables, in clusters */ + uint32_t header_size; /* in clusters */ + + uint64_t features; /* format feature bits */ + uint64_t compat_features; /* compatible feature bits */ + uint64_t autoclear_features; /* self-resetting feature bits */ + + uint64_t l1_table_offset; /* in bytes */ + uint64_t image_size; /* total logical image size, in bytes */ + + /* if (features & QED_F_BACKING_FILE) */ + uint32_t backing_filename_offset; /* in bytes from start of header */ + uint32_t backing_filename_size; /* in bytes */ +} QEMU_PACKED QEDHeader; + +typedef struct { + uint64_t offsets[0]; /* in bytes */ +} QEDTable; + +/* The L2 cache is a simple write-through cache for L2 structures */ +typedef struct CachedL2Table { + QEDTable *table; + uint64_t offset; /* offset=0 indicates an invalidate entry */ + QTAILQ_ENTRY(CachedL2Table) node; + int ref; +} CachedL2Table; + +typedef struct { + QTAILQ_HEAD(, CachedL2Table) entries; + unsigned int n_entries; +} L2TableCache; + +typedef struct QEDRequest { + CachedL2Table *l2_table; +} QEDRequest; + +enum { + QED_AIOCB_WRITE = 0x0001, /* read or write? */ + QED_AIOCB_ZERO = 0x0002, /* zero write, used with QED_AIOCB_WRITE */ +}; + +typedef struct QEDAIOCB { + BlockAIOCB common; + QEMUBH *bh; + int bh_ret; /* final return status for completion bh */ + QSIMPLEQ_ENTRY(QEDAIOCB) next; /* next request */ + int flags; /* QED_AIOCB_* bits ORed together */ + uint64_t end_pos; /* request end on block device, in bytes */ + + /* User scatter-gather list */ + QEMUIOVector *qiov; + size_t qiov_offset; /* byte count already processed */ + + /* Current cluster scatter-gather list */ + QEMUIOVector cur_qiov; + QEMUIOVector *backing_qiov; + uint64_t cur_pos; /* position on block device, in bytes */ + uint64_t cur_cluster; /* cluster offset in image file */ + unsigned int cur_nclusters; /* number of clusters being accessed */ + int find_cluster_ret; /* used for L1/L2 update */ + + QEDRequest request; +} QEDAIOCB; + +typedef struct { + BlockDriverState *bs; /* device */ + uint64_t file_size; /* length of image file, in bytes */ + + QEDHeader header; /* always cpu-endian */ + QEDTable *l1_table; + L2TableCache l2_cache; /* l2 table cache */ + uint32_t table_nelems; + uint32_t l1_shift; + uint32_t l2_shift; + uint32_t l2_mask; + + /* Allocating write request queue */ + QSIMPLEQ_HEAD(, QEDAIOCB) allocating_write_reqs; + bool allocating_write_reqs_plugged; + + /* Periodic flush and clear need check flag */ + QEMUTimer *need_check_timer; +} BDRVQEDState; + +enum { + QED_CLUSTER_FOUND, /* cluster found */ + QED_CLUSTER_ZERO, /* zero cluster found */ + QED_CLUSTER_L2, /* cluster missing in L2 */ + QED_CLUSTER_L1, /* cluster missing in L1 */ +}; + +/** + * qed_find_cluster() completion callback + * + * @opaque: User data for completion callback + * @ret: QED_CLUSTER_FOUND Success + * QED_CLUSTER_L2 Data cluster unallocated in L2 + * QED_CLUSTER_L1 L2 unallocated in L1 + * -errno POSIX error occurred + * @offset: Data cluster offset + * @len: Contiguous bytes starting from cluster offset + * + * This function is invoked when qed_find_cluster() completes. + * + * On success ret is QED_CLUSTER_FOUND and offset/len are a contiguous range + * in the image file. + * + * On failure ret is QED_CLUSTER_L2 or QED_CLUSTER_L1 for missing L2 or L1 + * table offset, respectively. len is number of contiguous unallocated bytes. + */ +typedef void QEDFindClusterFunc(void *opaque, int ret, uint64_t offset, size_t len); + +/** + * Generic callback for chaining async callbacks + */ +typedef struct { + BlockCompletionFunc *cb; + void *opaque; +} GenericCB; + +void *gencb_alloc(size_t len, BlockCompletionFunc *cb, void *opaque); +void gencb_complete(void *opaque, int ret); + +/** + * Header functions + */ +int qed_write_header_sync(BDRVQEDState *s); + +/** + * L2 cache functions + */ +void qed_init_l2_cache(L2TableCache *l2_cache); +void qed_free_l2_cache(L2TableCache *l2_cache); +CachedL2Table *qed_alloc_l2_cache_entry(L2TableCache *l2_cache); +void qed_unref_l2_cache_entry(CachedL2Table *entry); +CachedL2Table *qed_find_l2_cache_entry(L2TableCache *l2_cache, uint64_t offset); +void qed_commit_l2_cache_entry(L2TableCache *l2_cache, CachedL2Table *l2_table); + +/** + * Table I/O functions + */ +int qed_read_l1_table_sync(BDRVQEDState *s); +void qed_write_l1_table(BDRVQEDState *s, unsigned int index, unsigned int n, + BlockCompletionFunc *cb, void *opaque); +int qed_write_l1_table_sync(BDRVQEDState *s, unsigned int index, + unsigned int n); +int qed_read_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + uint64_t offset); +void qed_read_l2_table(BDRVQEDState *s, QEDRequest *request, uint64_t offset, + BlockCompletionFunc *cb, void *opaque); +void qed_write_l2_table(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush, + BlockCompletionFunc *cb, void *opaque); +int qed_write_l2_table_sync(BDRVQEDState *s, QEDRequest *request, + unsigned int index, unsigned int n, bool flush); + +/** + * Cluster functions + */ +void qed_find_cluster(BDRVQEDState *s, QEDRequest *request, uint64_t pos, + size_t len, QEDFindClusterFunc *cb, void *opaque); + +/** + * Consistency check + */ +int qed_check(BDRVQEDState *s, BdrvCheckResult *result, bool fix); + +QEDTable *qed_alloc_table(BDRVQEDState *s); + +/** + * Round down to the start of a cluster + */ +static inline uint64_t qed_start_of_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & ~(uint64_t)(s->header.cluster_size - 1); +} + +static inline uint64_t qed_offset_into_cluster(BDRVQEDState *s, uint64_t offset) +{ + return offset & (s->header.cluster_size - 1); +} + +static inline uint64_t qed_bytes_to_clusters(BDRVQEDState *s, uint64_t bytes) +{ + return qed_start_of_cluster(s, bytes + (s->header.cluster_size - 1)) / + (s->header.cluster_size - 1); +} + +static inline unsigned int qed_l1_index(BDRVQEDState *s, uint64_t pos) +{ + return pos >> s->l1_shift; +} + +static inline unsigned int qed_l2_index(BDRVQEDState *s, uint64_t pos) +{ + return (pos >> s->l2_shift) & s->l2_mask; +} + +/** + * Test if a cluster offset is valid + */ +static inline bool qed_check_cluster_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t header_size = (uint64_t)s->header.header_size * + s->header.cluster_size; + + if (offset & (s->header.cluster_size - 1)) { + return false; + } + return offset >= header_size && offset < s->file_size; +} + +/** + * Test if a table offset is valid + */ +static inline bool qed_check_table_offset(BDRVQEDState *s, uint64_t offset) +{ + uint64_t end_offset = offset + (s->header.table_size - 1) * + s->header.cluster_size; + + /* Overflow check */ + if (end_offset <= offset) { + return false; + } + + return qed_check_cluster_offset(s, offset) && + qed_check_cluster_offset(s, end_offset); +} + +static inline bool qed_offset_is_cluster_aligned(BDRVQEDState *s, + uint64_t offset) +{ + if (qed_offset_into_cluster(s, offset)) { + return false; + } + return true; +} + +static inline bool qed_offset_is_unalloc_cluster(uint64_t offset) +{ + if (offset == 0) { + return true; + } + return false; +} + +static inline bool qed_offset_is_zero_cluster(uint64_t offset) +{ + if (offset == 1) { + return true; + } + return false; +} + +#endif /* BLOCK_QED_H */ diff --git a/src/block/quorum.c b/src/block/quorum.c new file mode 100644 index 0000000..e640688 --- /dev/null +++ b/src/block/quorum.c @@ -0,0 +1,1070 @@ +/* + * Quorum Block filter + * + * Copyright (C) 2012-2014 Nodalink, EURL. + * + * Author: + * Benoît Canet <benoit.canet@irqsave.net> + * + * Based on the design and code of blkverify.c (Copyright (C) 2010 IBM, Corp) + * and blkmirror.c (Copyright (C) 2011 Red Hat, Inc). + * + * This work is licensed under the terms of the GNU GPL, version 2 or later. + * See the COPYING file in the top-level directory. + */ + +#include "block/block_int.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qdict.h" +#include "qapi/qmp/qerror.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qjson.h" +#include "qapi/qmp/qlist.h" +#include "qapi/qmp/qstring.h" +#include "qapi-event.h" +#include "crypto/hash.h" + +#define HASH_LENGTH 32 + +#define QUORUM_OPT_VOTE_THRESHOLD "vote-threshold" +#define QUORUM_OPT_BLKVERIFY "blkverify" +#define QUORUM_OPT_REWRITE "rewrite-corrupted" +#define QUORUM_OPT_READ_PATTERN "read-pattern" + +/* This union holds a vote hash value */ +typedef union QuorumVoteValue { + uint8_t h[HASH_LENGTH]; /* SHA-256 hash */ + int64_t l; /* simpler 64 bits hash */ +} QuorumVoteValue; + +/* A vote item */ +typedef struct QuorumVoteItem { + int index; + QLIST_ENTRY(QuorumVoteItem) next; +} QuorumVoteItem; + +/* this structure is a vote version. A version is the set of votes sharing the + * same vote value. + * The set of votes will be tracked with the items field and its cardinality is + * vote_count. + */ +typedef struct QuorumVoteVersion { + QuorumVoteValue value; + int index; + int vote_count; + QLIST_HEAD(, QuorumVoteItem) items; + QLIST_ENTRY(QuorumVoteVersion) next; +} QuorumVoteVersion; + +/* this structure holds a group of vote versions together */ +typedef struct QuorumVotes { + QLIST_HEAD(, QuorumVoteVersion) vote_list; + bool (*compare)(QuorumVoteValue *a, QuorumVoteValue *b); +} QuorumVotes; + +/* the following structure holds the state of one quorum instance */ +typedef struct BDRVQuorumState { + BdrvChild **children; /* children BlockDriverStates */ + int num_children; /* children count */ + int threshold; /* if less than threshold children reads gave the + * same result a quorum error occurs. + */ + bool is_blkverify; /* true if the driver is in blkverify mode + * Writes are mirrored on two children devices. + * On reads the two children devices' contents are + * compared and if a difference is spotted its + * location is printed and the code aborts. + * It is useful to debug other block drivers by + * comparing them with a reference one. + */ + bool rewrite_corrupted;/* true if the driver must rewrite-on-read corrupted + * block if Quorum is reached. + */ + + QuorumReadPattern read_pattern; +} BDRVQuorumState; + +typedef struct QuorumAIOCB QuorumAIOCB; + +/* Quorum will create one instance of the following structure per operation it + * performs on its children. + * So for each read/write operation coming from the upper layer there will be + * $children_count QuorumChildRequest. + */ +typedef struct QuorumChildRequest { + BlockAIOCB *aiocb; + QEMUIOVector qiov; + uint8_t *buf; + int ret; + QuorumAIOCB *parent; +} QuorumChildRequest; + +/* Quorum will use the following structure to track progress of each read/write + * operation received by the upper layer. + * This structure hold pointers to the QuorumChildRequest structures instances + * used to do operations on each children and track overall progress. + */ +struct QuorumAIOCB { + BlockAIOCB common; + + /* Request metadata */ + uint64_t sector_num; + int nb_sectors; + + QEMUIOVector *qiov; /* calling IOV */ + + QuorumChildRequest *qcrs; /* individual child requests */ + int count; /* number of completed AIOCB */ + int success_count; /* number of successfully completed AIOCB */ + + int rewrite_count; /* number of replica to rewrite: count down to + * zero once writes are fired + */ + + QuorumVotes votes; + + bool is_read; + int vote_ret; + int child_iter; /* which child to read in fifo pattern */ +}; + +static bool quorum_vote(QuorumAIOCB *acb); + +static void quorum_aio_cancel(BlockAIOCB *blockacb) +{ + QuorumAIOCB *acb = container_of(blockacb, QuorumAIOCB, common); + BDRVQuorumState *s = acb->common.bs->opaque; + int i; + + /* cancel all callbacks */ + for (i = 0; i < s->num_children; i++) { + if (acb->qcrs[i].aiocb) { + bdrv_aio_cancel_async(acb->qcrs[i].aiocb); + } + } +} + +static AIOCBInfo quorum_aiocb_info = { + .aiocb_size = sizeof(QuorumAIOCB), + .cancel_async = quorum_aio_cancel, +}; + +static void quorum_aio_finalize(QuorumAIOCB *acb) +{ + int i, ret = 0; + + if (acb->vote_ret) { + ret = acb->vote_ret; + } + + acb->common.cb(acb->common.opaque, ret); + + if (acb->is_read) { + /* on the quorum case acb->child_iter == s->num_children - 1 */ + for (i = 0; i <= acb->child_iter; i++) { + qemu_vfree(acb->qcrs[i].buf); + qemu_iovec_destroy(&acb->qcrs[i].qiov); + } + } + + g_free(acb->qcrs); + qemu_aio_unref(acb); +} + +static bool quorum_sha256_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return !memcmp(a->h, b->h, HASH_LENGTH); +} + +static bool quorum_64bits_compare(QuorumVoteValue *a, QuorumVoteValue *b) +{ + return a->l == b->l; +} + +static QuorumAIOCB *quorum_aio_get(BDRVQuorumState *s, + BlockDriverState *bs, + QEMUIOVector *qiov, + uint64_t sector_num, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + QuorumAIOCB *acb = qemu_aio_get(&quorum_aiocb_info, bs, cb, opaque); + int i; + + acb->common.bs->opaque = s; + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + acb->qiov = qiov; + acb->qcrs = g_new0(QuorumChildRequest, s->num_children); + acb->count = 0; + acb->success_count = 0; + acb->rewrite_count = 0; + acb->votes.compare = quorum_sha256_compare; + QLIST_INIT(&acb->votes.vote_list); + acb->is_read = false; + acb->vote_ret = 0; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = NULL; + acb->qcrs[i].ret = 0; + acb->qcrs[i].parent = acb; + } + + return acb; +} + +static void quorum_report_bad(QuorumAIOCB *acb, char *node_name, int ret) +{ + const char *msg = NULL; + if (ret < 0) { + msg = strerror(-ret); + } + qapi_event_send_quorum_report_bad(!!msg, msg, node_name, + acb->sector_num, acb->nb_sectors, &error_abort); +} + +static void quorum_report_failure(QuorumAIOCB *acb) +{ + const char *reference = bdrv_get_device_or_node_name(acb->common.bs); + qapi_event_send_quorum_failure(reference, acb->sector_num, + acb->nb_sectors, &error_abort); +} + +static int quorum_vote_error(QuorumAIOCB *acb); + +static bool quorum_has_too_much_io_failed(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + + if (acb->success_count < s->threshold) { + acb->vote_ret = quorum_vote_error(acb); + quorum_report_failure(acb); + return true; + } + + return false; +} + +static void quorum_rewrite_aio_cb(void *opaque, int ret) +{ + QuorumAIOCB *acb = opaque; + + /* one less rewrite to do */ + acb->rewrite_count--; + + /* wait until all rewrite callbacks have completed */ + if (acb->rewrite_count) { + return; + } + + quorum_aio_finalize(acb); +} + +static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb); + +static void quorum_copy_qiov(QEMUIOVector *dest, QEMUIOVector *source) +{ + int i; + assert(dest->niov == source->niov); + assert(dest->size == source->size); + for (i = 0; i < source->niov; i++) { + assert(dest->iov[i].iov_len == source->iov[i].iov_len); + memcpy(dest->iov[i].iov_base, + source->iov[i].iov_base, + source->iov[i].iov_len); + } +} + +static void quorum_aio_cb(void *opaque, int ret) +{ + QuorumChildRequest *sacb = opaque; + QuorumAIOCB *acb = sacb->parent; + BDRVQuorumState *s = acb->common.bs->opaque; + bool rewrite = false; + + if (acb->is_read && s->read_pattern == QUORUM_READ_PATTERN_FIFO) { + /* We try to read next child in FIFO order if we fail to read */ + if (ret < 0 && ++acb->child_iter < s->num_children) { + read_fifo_child(acb); + return; + } + + if (ret == 0) { + quorum_copy_qiov(acb->qiov, &acb->qcrs[acb->child_iter].qiov); + } + acb->vote_ret = ret; + quorum_aio_finalize(acb); + return; + } + + sacb->ret = ret; + acb->count++; + if (ret == 0) { + acb->success_count++; + } else { + quorum_report_bad(acb, sacb->aiocb->bs->node_name, ret); + } + assert(acb->count <= s->num_children); + assert(acb->success_count <= s->num_children); + if (acb->count < s->num_children) { + return; + } + + /* Do the vote on read */ + if (acb->is_read) { + rewrite = quorum_vote(acb); + } else { + quorum_has_too_much_io_failed(acb); + } + + /* if no rewrite is done the code will finish right away */ + if (!rewrite) { + quorum_aio_finalize(acb); + } +} + +static void quorum_report_bad_versions(BDRVQuorumState *s, + QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + quorum_report_bad(acb, s->children[item->index]->bs->node_name, 0); + } + } +} + +static bool quorum_rewrite_bad_versions(BDRVQuorumState *s, QuorumAIOCB *acb, + QuorumVoteValue *value) +{ + QuorumVoteVersion *version; + QuorumVoteItem *item; + int count = 0; + + /* first count the number of bad versions: done first to avoid concurrency + * issues. + */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + count++; + } + } + + /* quorum_rewrite_aio_cb will count down this to zero */ + acb->rewrite_count = count; + + /* now fire the correcting rewrites */ + QLIST_FOREACH(version, &acb->votes.vote_list, next) { + if (acb->votes.compare(&version->value, value)) { + continue; + } + QLIST_FOREACH(item, &version->items, next) { + bdrv_aio_writev(s->children[item->index]->bs, acb->sector_num, + acb->qiov, acb->nb_sectors, quorum_rewrite_aio_cb, + acb); + } + } + + /* return true if any rewrite is done else false */ + return count; +} + +static void quorum_count_vote(QuorumVotes *votes, + QuorumVoteValue *value, + int index) +{ + QuorumVoteVersion *v = NULL, *version = NULL; + QuorumVoteItem *item; + + /* look if we have something with this hash */ + QLIST_FOREACH(v, &votes->vote_list, next) { + if (votes->compare(&v->value, value)) { + version = v; + break; + } + } + + /* It's a version not yet in the list add it */ + if (!version) { + version = g_new0(QuorumVoteVersion, 1); + QLIST_INIT(&version->items); + memcpy(&version->value, value, sizeof(version->value)); + version->index = index; + version->vote_count = 0; + QLIST_INSERT_HEAD(&votes->vote_list, version, next); + } + + version->vote_count++; + + item = g_new0(QuorumVoteItem, 1); + item->index = index; + QLIST_INSERT_HEAD(&version->items, item, next); +} + +static void quorum_free_vote_list(QuorumVotes *votes) +{ + QuorumVoteVersion *version, *next_version; + QuorumVoteItem *item, *next_item; + + QLIST_FOREACH_SAFE(version, &votes->vote_list, next, next_version) { + QLIST_REMOVE(version, next); + QLIST_FOREACH_SAFE(item, &version->items, next, next_item) { + QLIST_REMOVE(item, next); + g_free(item); + } + g_free(version); + } +} + +static int quorum_compute_hash(QuorumAIOCB *acb, int i, QuorumVoteValue *hash) +{ + QEMUIOVector *qiov = &acb->qcrs[i].qiov; + size_t len = sizeof(hash->h); + uint8_t *data = hash->h; + + /* XXX - would be nice if we could pass in the Error ** + * and propagate that back, but this quorum code is + * restricted to just errno values currently */ + if (qcrypto_hash_bytesv(QCRYPTO_HASH_ALG_SHA256, + qiov->iov, qiov->niov, + &data, &len, + NULL) < 0) { + return -EINVAL; + } + + return 0; +} + +static QuorumVoteVersion *quorum_get_vote_winner(QuorumVotes *votes) +{ + int max = 0; + QuorumVoteVersion *candidate, *winner = NULL; + + QLIST_FOREACH(candidate, &votes->vote_list, next) { + if (candidate->vote_count > max) { + max = candidate->vote_count; + winner = candidate; + } + } + + return winner; +} + +/* qemu_iovec_compare is handy for blkverify mode because it returns the first + * differing byte location. Yet it is handcoded to compare vectors one byte + * after another so it does not benefit from the libc SIMD optimizations. + * quorum_iovec_compare is written for speed and should be used in the non + * blkverify mode of quorum. + */ +static bool quorum_iovec_compare(QEMUIOVector *a, QEMUIOVector *b) +{ + int i; + int result; + + assert(a->niov == b->niov); + for (i = 0; i < a->niov; i++) { + assert(a->iov[i].iov_len == b->iov[i].iov_len); + result = memcmp(a->iov[i].iov_base, + b->iov[i].iov_base, + a->iov[i].iov_len); + if (result) { + return false; + } + } + + return true; +} + +static void GCC_FMT_ATTR(2, 3) quorum_err(QuorumAIOCB *acb, + const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + fprintf(stderr, "quorum: sector_num=%" PRId64 " nb_sectors=%d ", + acb->sector_num, acb->nb_sectors); + vfprintf(stderr, fmt, ap); + fprintf(stderr, "\n"); + va_end(ap); + exit(1); +} + +static bool quorum_compare(QuorumAIOCB *acb, + QEMUIOVector *a, + QEMUIOVector *b) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + ssize_t offset; + + /* This driver will replace blkverify in this particular case */ + if (s->is_blkverify) { + offset = qemu_iovec_compare(a, b); + if (offset != -1) { + quorum_err(acb, "contents mismatch in sector %" PRId64, + acb->sector_num + + (uint64_t)(offset / BDRV_SECTOR_SIZE)); + } + return true; + } + + return quorum_iovec_compare(a, b); +} + +/* Do a vote to get the error code */ +static int quorum_vote_error(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i, ret = 0; + bool error = false; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + ret = acb->qcrs[i].ret; + if (ret) { + error = true; + result_value.l = ret; + quorum_count_vote(&error_votes, &result_value, i); + } + } + + if (error) { + winner = quorum_get_vote_winner(&error_votes); + ret = winner->value.l; + } + + quorum_free_vote_list(&error_votes); + + return ret; +} + +static bool quorum_vote(QuorumAIOCB *acb) +{ + bool quorum = true; + bool rewrite = false; + int i, j, ret; + QuorumVoteValue hash; + BDRVQuorumState *s = acb->common.bs->opaque; + QuorumVoteVersion *winner; + + if (quorum_has_too_much_io_failed(acb)) { + return false; + } + + /* get the index of the first successful read */ + for (i = 0; i < s->num_children; i++) { + if (!acb->qcrs[i].ret) { + break; + } + } + + assert(i < s->num_children); + + /* compare this read with all other successful reads stopping at quorum + * failure + */ + for (j = i + 1; j < s->num_children; j++) { + if (acb->qcrs[j].ret) { + continue; + } + quorum = quorum_compare(acb, &acb->qcrs[i].qiov, &acb->qcrs[j].qiov); + if (!quorum) { + break; + } + } + + /* Every successful read agrees */ + if (quorum) { + quorum_copy_qiov(acb->qiov, &acb->qcrs[i].qiov); + return false; + } + + /* compute hashes for each successful read, also store indexes */ + for (i = 0; i < s->num_children; i++) { + if (acb->qcrs[i].ret) { + continue; + } + ret = quorum_compute_hash(acb, i, &hash); + /* if ever the hash computation failed */ + if (ret < 0) { + acb->vote_ret = ret; + goto free_exit; + } + quorum_count_vote(&acb->votes, &hash, i); + } + + /* vote to select the most represented version */ + winner = quorum_get_vote_winner(&acb->votes); + + /* if the winner count is smaller than threshold the read fails */ + if (winner->vote_count < s->threshold) { + quorum_report_failure(acb); + acb->vote_ret = -EIO; + goto free_exit; + } + + /* we have a winner: copy it */ + quorum_copy_qiov(acb->qiov, &acb->qcrs[winner->index].qiov); + + /* some versions are bad print them */ + quorum_report_bad_versions(s, acb, &winner->value); + + /* corruption correction is enabled */ + if (s->rewrite_corrupted) { + rewrite = quorum_rewrite_bad_versions(s, acb, &winner->value); + } + +free_exit: + /* free lists */ + quorum_free_vote_list(&acb->votes); + return rewrite; +} + +static BlockAIOCB *read_quorum_children(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].buf = qemu_blockalign(s->children[i]->bs, acb->qiov->size); + qemu_iovec_init(&acb->qcrs[i].qiov, acb->qiov->niov); + qemu_iovec_clone(&acb->qcrs[i].qiov, acb->qiov, acb->qcrs[i].buf); + } + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].aiocb = bdrv_aio_readv(s->children[i]->bs, acb->sector_num, + &acb->qcrs[i].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[i]); + } + + return &acb->common; +} + +static BlockAIOCB *read_fifo_child(QuorumAIOCB *acb) +{ + BDRVQuorumState *s = acb->common.bs->opaque; + + acb->qcrs[acb->child_iter].buf = + qemu_blockalign(s->children[acb->child_iter]->bs, acb->qiov->size); + qemu_iovec_init(&acb->qcrs[acb->child_iter].qiov, acb->qiov->niov); + qemu_iovec_clone(&acb->qcrs[acb->child_iter].qiov, acb->qiov, + acb->qcrs[acb->child_iter].buf); + acb->qcrs[acb->child_iter].aiocb = + bdrv_aio_readv(s->children[acb->child_iter]->bs, acb->sector_num, + &acb->qcrs[acb->child_iter].qiov, acb->nb_sectors, + quorum_aio_cb, &acb->qcrs[acb->child_iter]); + + return &acb->common; +} + +static BlockAIOCB *quorum_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, + nb_sectors, cb, opaque); + acb->is_read = true; + + if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { + acb->child_iter = s->num_children - 1; + return read_quorum_children(acb); + } + + acb->child_iter = 0; + return read_fifo_child(acb); +} + +static BlockAIOCB *quorum_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + BDRVQuorumState *s = bs->opaque; + QuorumAIOCB *acb = quorum_aio_get(s, bs, qiov, sector_num, nb_sectors, + cb, opaque); + int i; + + for (i = 0; i < s->num_children; i++) { + acb->qcrs[i].aiocb = bdrv_aio_writev(s->children[i]->bs, sector_num, + qiov, nb_sectors, &quorum_aio_cb, + &acb->qcrs[i]); + } + + return &acb->common; +} + +static int64_t quorum_getlength(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int64_t result; + int i; + + /* check that all file have the same length */ + result = bdrv_getlength(s->children[0]->bs); + if (result < 0) { + return result; + } + for (i = 1; i < s->num_children; i++) { + int64_t value = bdrv_getlength(s->children[i]->bs); + if (value < 0) { + return value; + } + if (value != result) { + return -EIO; + } + } + + return result; +} + +static void quorum_invalidate_cache(BlockDriverState *bs, Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_invalidate_cache(s->children[i]->bs, &local_err); + if (local_err) { + error_propagate(errp, local_err); + return; + } + } +} + +static coroutine_fn int quorum_co_flush(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + QuorumVoteVersion *winner = NULL; + QuorumVotes error_votes; + QuorumVoteValue result_value; + int i; + int result = 0; + + QLIST_INIT(&error_votes.vote_list); + error_votes.compare = quorum_64bits_compare; + + for (i = 0; i < s->num_children; i++) { + result = bdrv_co_flush(s->children[i]->bs); + result_value.l = result; + quorum_count_vote(&error_votes, &result_value, i); + } + + winner = quorum_get_vote_winner(&error_votes); + result = winner->value.l; + + quorum_free_vote_list(&error_votes); + + return result; +} + +static bool quorum_recurse_is_first_non_filter(BlockDriverState *bs, + BlockDriverState *candidate) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bool perm = bdrv_recurse_is_first_non_filter(s->children[i]->bs, + candidate); + if (perm) { + return true; + } + } + + return false; +} + +static int quorum_valid_threshold(int threshold, int num_children, Error **errp) +{ + + if (threshold < 1) { + error_setg(errp, QERR_INVALID_PARAMETER_VALUE, + "vote-threshold", "value >= 1"); + return -ERANGE; + } + + if (threshold > num_children) { + error_setg(errp, "threshold may not exceed children count"); + return -ERANGE; + } + + return 0; +} + +static QemuOptsList quorum_runtime_opts = { + .name = "quorum", + .head = QTAILQ_HEAD_INITIALIZER(quorum_runtime_opts.head), + .desc = { + { + .name = QUORUM_OPT_VOTE_THRESHOLD, + .type = QEMU_OPT_NUMBER, + .help = "The number of vote needed for reaching quorum", + }, + { + .name = QUORUM_OPT_BLKVERIFY, + .type = QEMU_OPT_BOOL, + .help = "Trigger block verify mode if set", + }, + { + .name = QUORUM_OPT_REWRITE, + .type = QEMU_OPT_BOOL, + .help = "Rewrite corrupted block on read quorum", + }, + { + .name = QUORUM_OPT_READ_PATTERN, + .type = QEMU_OPT_STRING, + .help = "Allowed pattern: quorum, fifo. Quorum is default", + }, + { /* end of list */ } + }, +}; + +static int parse_read_pattern(const char *opt) +{ + int i; + + if (!opt) { + /* Set quorum as default */ + return QUORUM_READ_PATTERN_QUORUM; + } + + for (i = 0; i < QUORUM_READ_PATTERN_MAX; i++) { + if (!strcmp(opt, QuorumReadPattern_lookup[i])) { + return i; + } + } + + return -EINVAL; +} + +static int quorum_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVQuorumState *s = bs->opaque; + Error *local_err = NULL; + QemuOpts *opts = NULL; + bool *opened; + int i; + int ret = 0; + + qdict_flatten(options); + + /* count how many different children are present */ + s->num_children = qdict_array_entries(options, "children."); + if (s->num_children < 0) { + error_setg(&local_err, "Option children is not a valid array"); + ret = -EINVAL; + goto exit; + } + if (s->num_children < 2) { + error_setg(&local_err, + "Number of provided children must be greater than 1"); + ret = -EINVAL; + goto exit; + } + + opts = qemu_opts_create(&quorum_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + ret = -EINVAL; + goto exit; + } + + s->threshold = qemu_opt_get_number(opts, QUORUM_OPT_VOTE_THRESHOLD, 0); + /* and validate it against s->num_children */ + ret = quorum_valid_threshold(s->threshold, s->num_children, &local_err); + if (ret < 0) { + goto exit; + } + + ret = parse_read_pattern(qemu_opt_get(opts, QUORUM_OPT_READ_PATTERN)); + if (ret < 0) { + error_setg(&local_err, "Please set read-pattern as fifo or quorum"); + goto exit; + } + s->read_pattern = ret; + + if (s->read_pattern == QUORUM_READ_PATTERN_QUORUM) { + /* is the driver in blkverify mode */ + if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false) && + s->num_children == 2 && s->threshold == 2) { + s->is_blkverify = true; + } else if (qemu_opt_get_bool(opts, QUORUM_OPT_BLKVERIFY, false)) { + fprintf(stderr, "blkverify mode is set by setting blkverify=on " + "and using two files with vote_threshold=2\n"); + } + + s->rewrite_corrupted = qemu_opt_get_bool(opts, QUORUM_OPT_REWRITE, + false); + if (s->rewrite_corrupted && s->is_blkverify) { + error_setg(&local_err, + "rewrite-corrupted=on cannot be used with blkverify=on"); + ret = -EINVAL; + goto exit; + } + } + + /* allocate the children array */ + s->children = g_new0(BdrvChild *, s->num_children); + opened = g_new0(bool, s->num_children); + + for (i = 0; i < s->num_children; i++) { + char indexstr[32]; + ret = snprintf(indexstr, 32, "children.%d", i); + assert(ret < 32); + + s->children[i] = bdrv_open_child(NULL, options, indexstr, bs, + &child_format, false, &local_err); + if (local_err) { + ret = -EINVAL; + goto close_exit; + } + + opened[i] = true; + } + + g_free(opened); + goto exit; + +close_exit: + /* cleanup on error */ + for (i = 0; i < s->num_children; i++) { + if (!opened[i]) { + continue; + } + bdrv_unref_child(bs, s->children[i]); + } + g_free(s->children); + g_free(opened); +exit: + qemu_opts_del(opts); + /* propagate error */ + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} + +static void quorum_close(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_unref_child(bs, s->children[i]); + } + + g_free(s->children); +} + +static void quorum_detach_aio_context(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_detach_aio_context(s->children[i]->bs); + } +} + +static void quorum_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVQuorumState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_attach_aio_context(s->children[i]->bs, new_context); + } +} + +static void quorum_refresh_filename(BlockDriverState *bs) +{ + BDRVQuorumState *s = bs->opaque; + QDict *opts; + QList *children; + int i; + + for (i = 0; i < s->num_children; i++) { + bdrv_refresh_filename(s->children[i]->bs); + if (!s->children[i]->bs->full_open_options) { + return; + } + } + + children = qlist_new(); + for (i = 0; i < s->num_children; i++) { + QINCREF(s->children[i]->bs->full_open_options); + qlist_append_obj(children, + QOBJECT(s->children[i]->bs->full_open_options)); + } + + opts = qdict_new(); + qdict_put_obj(opts, "driver", QOBJECT(qstring_from_str("quorum"))); + qdict_put_obj(opts, QUORUM_OPT_VOTE_THRESHOLD, + QOBJECT(qint_from_int(s->threshold))); + qdict_put_obj(opts, QUORUM_OPT_BLKVERIFY, + QOBJECT(qbool_from_bool(s->is_blkverify))); + qdict_put_obj(opts, QUORUM_OPT_REWRITE, + QOBJECT(qbool_from_bool(s->rewrite_corrupted))); + qdict_put_obj(opts, "children", QOBJECT(children)); + + bs->full_open_options = opts; +} + +static BlockDriver bdrv_quorum = { + .format_name = "quorum", + .protocol_name = "quorum", + + .instance_size = sizeof(BDRVQuorumState), + + .bdrv_file_open = quorum_open, + .bdrv_close = quorum_close, + .bdrv_refresh_filename = quorum_refresh_filename, + + .bdrv_co_flush_to_disk = quorum_co_flush, + + .bdrv_getlength = quorum_getlength, + + .bdrv_aio_readv = quorum_aio_readv, + .bdrv_aio_writev = quorum_aio_writev, + .bdrv_invalidate_cache = quorum_invalidate_cache, + + .bdrv_detach_aio_context = quorum_detach_aio_context, + .bdrv_attach_aio_context = quorum_attach_aio_context, + + .is_filter = true, + .bdrv_recurse_is_first_non_filter = quorum_recurse_is_first_non_filter, +}; + +static void bdrv_quorum_init(void) +{ + if (!qcrypto_hash_supports(QCRYPTO_HASH_ALG_SHA256)) { + /* SHA256 hash support is required for quorum device */ + return; + } + bdrv_register(&bdrv_quorum); +} + +block_init(bdrv_quorum_init); diff --git a/src/block/raw-aio.h b/src/block/raw-aio.h new file mode 100644 index 0000000..31d791f --- /dev/null +++ b/src/block/raw-aio.h @@ -0,0 +1,62 @@ +/* + * Declarations for AIO in the raw protocol + * + * Copyright IBM, Corp. 2008 + * + * Authors: + * Anthony Liguori <aliguori@us.ibm.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ +#ifndef QEMU_RAW_AIO_H +#define QEMU_RAW_AIO_H + +/* AIO request types */ +#define QEMU_AIO_READ 0x0001 +#define QEMU_AIO_WRITE 0x0002 +#define QEMU_AIO_IOCTL 0x0004 +#define QEMU_AIO_FLUSH 0x0008 +#define QEMU_AIO_DISCARD 0x0010 +#define QEMU_AIO_WRITE_ZEROES 0x0020 +#define QEMU_AIO_TYPE_MASK \ + (QEMU_AIO_READ|QEMU_AIO_WRITE|QEMU_AIO_IOCTL|QEMU_AIO_FLUSH| \ + QEMU_AIO_DISCARD|QEMU_AIO_WRITE_ZEROES) + +/* AIO flags */ +#define QEMU_AIO_MISALIGNED 0x1000 +#define QEMU_AIO_BLKDEV 0x2000 + + +/* linux-aio.c - Linux native implementation */ +#ifdef CONFIG_LINUX_AIO +void *laio_init(void); +void laio_cleanup(void *s); +BlockAIOCB *laio_submit(BlockDriverState *bs, void *aio_ctx, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type); +void laio_detach_aio_context(void *s, AioContext *old_context); +void laio_attach_aio_context(void *s, AioContext *new_context); +void laio_io_plug(BlockDriverState *bs, void *aio_ctx); +void laio_io_unplug(BlockDriverState *bs, void *aio_ctx, bool unplug); +#endif + +#ifdef _WIN32 +typedef struct QEMUWin32AIOState QEMUWin32AIOState; +QEMUWin32AIOState *win32_aio_init(void); +void win32_aio_cleanup(QEMUWin32AIOState *aio); +int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile); +BlockAIOCB *win32_aio_submit(BlockDriverState *bs, + QEMUWin32AIOState *aio, HANDLE hfile, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type); +void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, + AioContext *old_context); +void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, + AioContext *new_context); +#endif + +#endif /* QEMU_RAW_AIO_H */ diff --git a/src/block/raw-posix.c b/src/block/raw-posix.c new file mode 100644 index 0000000..2fff184 --- /dev/null +++ b/src/block/raw-posix.c @@ -0,0 +1,2616 @@ +/* + * Block driver for RAW files (posix) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "qemu/timer.h" +#include "qemu/log.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "trace.h" +#include "block/thread-pool.h" +#include "qemu/iov.h" +#include "raw-aio.h" +#include "qapi/util.h" +#include "qapi/qmp/qstring.h" + +#if defined(__APPLE__) && (__MACH__) +#include <paths.h> +#include <sys/param.h> +#include <IOKit/IOKitLib.h> +#include <IOKit/IOBSD.h> +#include <IOKit/storage/IOMediaBSDClient.h> +#include <IOKit/storage/IOMedia.h> +#include <IOKit/storage/IOCDMedia.h> +//#include <IOKit/storage/IOCDTypes.h> +#include <CoreFoundation/CoreFoundation.h> +#endif + +#ifdef __sun__ +#define _POSIX_PTHREAD_SEMANTICS 1 +#include <sys/dkio.h> +#endif +#ifdef __linux__ +#include <sys/types.h> +#include <sys/stat.h> +#include <sys/ioctl.h> +#include <sys/param.h> +#include <linux/cdrom.h> +#include <linux/fd.h> +#include <linux/fs.h> +#include <linux/hdreg.h> +#include <scsi/sg.h> +#ifdef __s390__ +#include <asm/dasd.h> +#endif +#ifndef FS_NOCOW_FL +#define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#endif +#endif +#if defined(CONFIG_FALLOCATE_PUNCH_HOLE) || defined(CONFIG_FALLOCATE_ZERO_RANGE) +#include <linux/falloc.h> +#endif +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +#include <sys/disk.h> +#include <sys/cdio.h> +#endif + +#ifdef __OpenBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#endif + +#ifdef __NetBSD__ +#include <sys/ioctl.h> +#include <sys/disklabel.h> +#include <sys/dkio.h> +#include <sys/disk.h> +#endif + +#ifdef __DragonFly__ +#include <sys/ioctl.h> +#include <sys/diskslice.h> +#endif + +#ifdef CONFIG_XFS +#include <xfs/xfs.h> +#endif + +//#define DEBUG_BLOCK + +#ifdef DEBUG_BLOCK +# define DEBUG_BLOCK_PRINT 1 +#else +# define DEBUG_BLOCK_PRINT 0 +#endif +#define DPRINTF(fmt, ...) \ +do { \ + if (DEBUG_BLOCK_PRINT) { \ + printf(fmt, ## __VA_ARGS__); \ + } \ +} while (0) + +/* OS X does not have O_DSYNC */ +#ifndef O_DSYNC +#ifdef O_SYNC +#define O_DSYNC O_SYNC +#elif defined(O_FSYNC) +#define O_DSYNC O_FSYNC +#endif +#endif + +/* Approximate O_DIRECT with O_DSYNC if O_DIRECT isn't available */ +#ifndef O_DIRECT +#define O_DIRECT O_DSYNC +#endif + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 + +#define MAX_BLOCKSIZE 4096 + +typedef struct BDRVRawState { + int fd; + int type; + int open_flags; + size_t buf_align; + +#ifdef CONFIG_LINUX_AIO + int use_aio; + void *aio_ctx; +#endif +#ifdef CONFIG_XFS + bool is_xfs:1; +#endif + bool has_discard:1; + bool has_write_zeroes:1; + bool discard_zeroes:1; + bool has_fallocate; + bool needs_alignment; +} BDRVRawState; + +typedef struct BDRVRawReopenState { + int fd; + int open_flags; +#ifdef CONFIG_LINUX_AIO + int use_aio; +#endif +} BDRVRawReopenState; + +static int fd_open(BlockDriverState *bs); +static int64_t raw_getlength(BlockDriverState *bs); + +typedef struct RawPosixAIOData { + BlockDriverState *bs; + int aio_fildes; + union { + struct iovec *aio_iov; + void *aio_ioctl_buf; + }; + int aio_niov; + uint64_t aio_nbytes; +#define aio_ioctl_cmd aio_nbytes /* for QEMU_AIO_IOCTL */ + off_t aio_offset; + int aio_type; +} RawPosixAIOData; + +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static int cdrom_reopen(BlockDriverState *bs); +#endif + +#if defined(__NetBSD__) +static int raw_normalize_devicepath(const char **filename) +{ + static char namebuf[PATH_MAX]; + const char *dp, *fname; + struct stat sb; + + fname = *filename; + dp = strrchr(fname, '/'); + if (lstat(fname, &sb) < 0) { + fprintf(stderr, "%s: stat failed: %s\n", + fname, strerror(errno)); + return -errno; + } + + if (!S_ISBLK(sb.st_mode)) { + return 0; + } + + if (dp == NULL) { + snprintf(namebuf, PATH_MAX, "r%s", fname); + } else { + snprintf(namebuf, PATH_MAX, "%.*s/r%s", + (int)(dp - fname), fname, dp + 1); + } + fprintf(stderr, "%s is a block device", fname); + *filename = namebuf; + fprintf(stderr, ", using %s\n", *filename); + + return 0; +} +#else +static int raw_normalize_devicepath(const char **filename) +{ + return 0; +} +#endif + +/* + * Get logical block size via ioctl. On success store it in @sector_size_p. + */ +static int probe_logical_blocksize(int fd, unsigned int *sector_size_p) +{ + unsigned int sector_size; + bool success = false; + + errno = ENOTSUP; + + /* Try a few ioctls to get the right size */ +#ifdef BLKSSZGET + if (ioctl(fd, BLKSSZGET, §or_size) >= 0) { + *sector_size_p = sector_size; + success = true; + } +#endif +#ifdef DKIOCGETBLOCKSIZE + if (ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) >= 0) { + *sector_size_p = sector_size; + success = true; + } +#endif +#ifdef DIOCGSECTORSIZE + if (ioctl(fd, DIOCGSECTORSIZE, §or_size) >= 0) { + *sector_size_p = sector_size; + success = true; + } +#endif + + return success ? 0 : -errno; +} + +/** + * Get physical block size of @fd. + * On success, store it in @blk_size and return 0. + * On failure, return -errno. + */ +static int probe_physical_blocksize(int fd, unsigned int *blk_size) +{ +#ifdef BLKPBSZGET + if (ioctl(fd, BLKPBSZGET, blk_size) < 0) { + return -errno; + } + return 0; +#else + return -ENOTSUP; +#endif +} + +/* Check if read is allowed with given memory buffer and length. + * + * This function is used to check O_DIRECT memory buffer and request alignment. + */ +static bool raw_is_io_aligned(int fd, void *buf, size_t len) +{ + ssize_t ret = pread(fd, buf, len, 0); + + if (ret >= 0) { + return true; + } + +#ifdef __linux__ + /* The Linux kernel returns EINVAL for misaligned O_DIRECT reads. Ignore + * other errors (e.g. real I/O error), which could happen on a failed + * drive, since we only care about probing alignment. + */ + if (errno != EINVAL) { + return true; + } +#endif + + return false; +} + +static void raw_probe_alignment(BlockDriverState *bs, int fd, Error **errp) +{ + BDRVRawState *s = bs->opaque; + char *buf; + size_t max_align = MAX(MAX_BLOCKSIZE, getpagesize()); + + /* For SCSI generic devices the alignment is not really used. + With buffered I/O, we don't have any restrictions. */ + if (bdrv_is_sg(bs) || !s->needs_alignment) { + bs->request_alignment = 1; + s->buf_align = 1; + return; + } + + bs->request_alignment = 0; + s->buf_align = 0; + /* Let's try to use the logical blocksize for the alignment. */ + if (probe_logical_blocksize(fd, &bs->request_alignment) < 0) { + bs->request_alignment = 0; + } +#ifdef CONFIG_XFS + if (s->is_xfs) { + struct dioattr da; + if (xfsctl(NULL, fd, XFS_IOC_DIOINFO, &da) >= 0) { + bs->request_alignment = da.d_miniosz; + /* The kernel returns wrong information for d_mem */ + /* s->buf_align = da.d_mem; */ + } + } +#endif + + /* If we could not get the sizes so far, we can only guess them */ + if (!s->buf_align) { + size_t align; + buf = qemu_memalign(max_align, 2 * max_align); + for (align = 512; align <= max_align; align <<= 1) { + if (raw_is_io_aligned(fd, buf + align, max_align)) { + s->buf_align = align; + break; + } + } + qemu_vfree(buf); + } + + if (!bs->request_alignment) { + size_t align; + buf = qemu_memalign(s->buf_align, max_align); + for (align = 512; align <= max_align; align <<= 1) { + if (raw_is_io_aligned(fd, buf, align)) { + bs->request_alignment = align; + break; + } + } + qemu_vfree(buf); + } + + if (!s->buf_align || !bs->request_alignment) { + error_setg(errp, "Could not find working O_DIRECT alignment. " + "Try cache.direct=off."); + } +} + +static void raw_parse_flags(int bdrv_flags, int *open_flags) +{ + assert(open_flags != NULL); + + *open_flags |= O_BINARY; + *open_flags &= ~O_ACCMODE; + if (bdrv_flags & BDRV_O_RDWR) { + *open_flags |= O_RDWR; + } else { + *open_flags |= O_RDONLY; + } + + /* Use O_DSYNC for write-through caching, no flags for write-back caching, + * and O_DIRECT for no caching. */ + if ((bdrv_flags & BDRV_O_NOCACHE)) { + *open_flags |= O_DIRECT; + } +} + +static void raw_detach_aio_context(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + + if (s->use_aio) { + laio_detach_aio_context(s->aio_ctx, bdrv_get_aio_context(bs)); + } +#endif +} + +static void raw_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + + if (s->use_aio) { + laio_attach_aio_context(s->aio_ctx, new_context); + } +#endif +} + +#ifdef CONFIG_LINUX_AIO +static int raw_set_aio(void **aio_ctx, int *use_aio, int bdrv_flags) +{ + int ret = -1; + assert(aio_ctx != NULL); + assert(use_aio != NULL); + /* + * Currently Linux do AIO only for files opened with O_DIRECT + * specified so check NOCACHE flag too + */ + if ((bdrv_flags & (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) == + (BDRV_O_NOCACHE|BDRV_O_NATIVE_AIO)) { + + /* if non-NULL, laio_init() has already been run */ + if (*aio_ctx == NULL) { + *aio_ctx = laio_init(); + if (!*aio_ctx) { + goto error; + } + } + *use_aio = 1; + } else { + *use_aio = 0; + } + + ret = 0; + +error: + return ret; +} +#endif + +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored. */ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static QemuOptsList raw_runtime_opts = { + .name = "raw", + .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "File name of the image", + }, + { /* end of list */ } + }, +}; + +static int raw_open_common(BlockDriverState *bs, QDict *options, + int bdrv_flags, int open_flags, Error **errp) +{ + BDRVRawState *s = bs->opaque; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename = NULL; + int fd, ret; + struct stat st; + + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + filename = qemu_opt_get(opts, "filename"); + + ret = raw_normalize_devicepath(&filename); + if (ret != 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); + goto fail; + } + + s->open_flags = open_flags; + raw_parse_flags(bdrv_flags, &s->open_flags); + + s->fd = -1; + fd = qemu_open(filename, s->open_flags, 0644); + if (fd < 0) { + ret = -errno; + if (ret == -EROFS) { + ret = -EACCES; + } + goto fail; + } + s->fd = fd; + +#ifdef CONFIG_LINUX_AIO + if (raw_set_aio(&s->aio_ctx, &s->use_aio, bdrv_flags)) { + qemu_close(fd); + ret = -errno; + error_setg_errno(errp, -ret, "Could not set AIO state"); + goto fail; + } + if (!s->use_aio && (bdrv_flags & BDRV_O_NATIVE_AIO)) { + error_printf("WARNING: aio=native was specified for '%s', but " + "it requires cache.direct=on, which was not " + "specified. Falling back to aio=threads.\n" + " This will become an error condition in " + "future QEMU versions.\n", + bs->filename); + } +#else + if (bdrv_flags & BDRV_O_NATIVE_AIO) { + error_printf("WARNING: aio=native was specified for '%s', but " + "is not supported in this build. Falling back to " + "aio=threads.\n" + " This will become an error condition in " + "future QEMU versions.\n", + bs->filename); + } +#endif /* !defined(CONFIG_LINUX_AIO) */ + + s->has_discard = true; + s->has_write_zeroes = true; + if ((bs->open_flags & BDRV_O_NOCACHE) != 0) { + s->needs_alignment = true; + } + + if (fstat(s->fd, &st) < 0) { + ret = -errno; + error_setg_errno(errp, errno, "Could not stat file"); + goto fail; + } + if (S_ISREG(st.st_mode)) { + s->discard_zeroes = true; + s->has_fallocate = true; + } + if (S_ISBLK(st.st_mode)) { +#ifdef BLKDISCARDZEROES + unsigned int arg; + if (ioctl(s->fd, BLKDISCARDZEROES, &arg) == 0 && arg) { + s->discard_zeroes = true; + } +#endif +#ifdef __linux__ + /* On Linux 3.10, BLKDISCARD leaves stale data in the page cache. Do + * not rely on the contents of discarded blocks unless using O_DIRECT. + * Same for BLKZEROOUT. + */ + if (!(bs->open_flags & BDRV_O_NOCACHE)) { + s->discard_zeroes = false; + s->has_write_zeroes = false; + } +#endif + } +#ifdef __FreeBSD__ + if (S_ISCHR(st.st_mode)) { + /* + * The file is a char device (disk), which on FreeBSD isn't behind + * a pager, so force all requests to be aligned. This is needed + * so QEMU makes sure all IO operations on the device are aligned + * to sector size, or else FreeBSD will reject them with EINVAL. + */ + s->needs_alignment = true; + } +#endif + +#ifdef CONFIG_XFS + if (platform_test_xfs_fd(s->fd)) { + s->is_xfs = true; + } +#endif + + raw_attach_aio_context(bs, bdrv_get_aio_context(bs)); + + ret = 0; +fail: + if (filename && (bdrv_flags & BDRV_O_TEMPORARY)) { + unlink(filename); + } + qemu_opts_del(opts); + return ret; +} + +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; + + s->type = FTYPE_FILE; + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} + +static int raw_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + BDRVRawState *s; + BDRVRawReopenState *raw_s; + int ret = 0; + Error *local_err = NULL; + + assert(state != NULL); + assert(state->bs != NULL); + + s = state->bs->opaque; + + state->opaque = g_new0(BDRVRawReopenState, 1); + raw_s = state->opaque; + +#ifdef CONFIG_LINUX_AIO + raw_s->use_aio = s->use_aio; + + /* we can use s->aio_ctx instead of a copy, because the use_aio flag is + * valid in the 'false' condition even if aio_ctx is set, and raw_set_aio() + * won't override aio_ctx if aio_ctx is non-NULL */ + if (raw_set_aio(&s->aio_ctx, &raw_s->use_aio, state->flags)) { + error_setg(errp, "Could not set AIO state"); + return -1; + } +#endif + + if (s->type == FTYPE_CD) { + raw_s->open_flags |= O_NONBLOCK; + } + + raw_parse_flags(state->flags, &raw_s->open_flags); + + raw_s->fd = -1; + + int fcntl_flags = O_APPEND | O_NONBLOCK; +#ifdef O_NOATIME + fcntl_flags |= O_NOATIME; +#endif + +#ifdef O_ASYNC + /* Not all operating systems have O_ASYNC, and those that don't + * will not let us track the state into raw_s->open_flags (typically + * you achieve the same effect with an ioctl, for example I_SETSIG + * on Solaris). But we do not use O_ASYNC, so that's fine. + */ + assert((s->open_flags & O_ASYNC) == 0); +#endif + + if ((raw_s->open_flags & ~fcntl_flags) == (s->open_flags & ~fcntl_flags)) { + /* dup the original fd */ + /* TODO: use qemu fcntl wrapper */ +#ifdef F_DUPFD_CLOEXEC + raw_s->fd = fcntl(s->fd, F_DUPFD_CLOEXEC, 0); +#else + raw_s->fd = dup(s->fd); + if (raw_s->fd != -1) { + qemu_set_cloexec(raw_s->fd); + } +#endif + if (raw_s->fd >= 0) { + ret = fcntl_setfl(raw_s->fd, raw_s->open_flags); + if (ret) { + qemu_close(raw_s->fd); + raw_s->fd = -1; + } + } + } + + /* If we cannot use fcntl, or fcntl failed, fall back to qemu_open() */ + if (raw_s->fd == -1) { + const char *normalized_filename = state->bs->filename; + ret = raw_normalize_devicepath(&normalized_filename); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); + } else { + assert(!(raw_s->open_flags & O_CREAT)); + raw_s->fd = qemu_open(normalized_filename, raw_s->open_flags); + if (raw_s->fd == -1) { + error_setg_errno(errp, errno, "Could not reopen file"); + ret = -1; + } + } + } + + /* Fail already reopen_prepare() if we can't get a working O_DIRECT + * alignment with the new fd. */ + if (raw_s->fd != -1) { + raw_probe_alignment(state->bs, raw_s->fd, &local_err); + if (local_err) { + qemu_close(raw_s->fd); + raw_s->fd = -1; + error_propagate(errp, local_err); + ret = -EINVAL; + } + } + + return ret; +} + +static void raw_reopen_commit(BDRVReopenState *state) +{ + BDRVRawReopenState *raw_s = state->opaque; + BDRVRawState *s = state->bs->opaque; + + s->open_flags = raw_s->open_flags; + + qemu_close(s->fd); + s->fd = raw_s->fd; +#ifdef CONFIG_LINUX_AIO + s->use_aio = raw_s->use_aio; +#endif + + g_free(state->opaque); + state->opaque = NULL; +} + + +static void raw_reopen_abort(BDRVReopenState *state) +{ + BDRVRawReopenState *raw_s = state->opaque; + + /* nothing to do if NULL, we didn't get far enough */ + if (raw_s == NULL) { + return; + } + + if (raw_s->fd >= 0) { + qemu_close(raw_s->fd); + raw_s->fd = -1; + } + g_free(state->opaque); + state->opaque = NULL; +} + +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVRawState *s = bs->opaque; + + raw_probe_alignment(bs, s->fd, errp); + bs->bl.min_mem_alignment = s->buf_align; + bs->bl.opt_mem_alignment = MAX(s->buf_align, getpagesize()); +} + +static int check_for_dasd(int fd) +{ +#ifdef BIODASDINFO2 + struct dasd_information2_t info = {0}; + + return ioctl(fd, BIODASDINFO2, &info); +#else + return -1; +#endif +} + +/** + * Try to get @bs's logical and physical block size. + * On success, store them in @bsz and return zero. + * On failure, return negative errno. + */ +static int hdev_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + BDRVRawState *s = bs->opaque; + int ret; + + /* If DASD, get blocksizes */ + if (check_for_dasd(s->fd) < 0) { + return -ENOTSUP; + } + ret = probe_logical_blocksize(s->fd, &bsz->log); + if (ret < 0) { + return ret; + } + return probe_physical_blocksize(s->fd, &bsz->phys); +} + +/** + * Try to get @bs's geometry: cyls, heads, sectors. + * On success, store them in @geo and return 0. + * On failure return -errno. + * (Allows block driver to assign default geometry values that guest sees) + */ +#ifdef __linux__ +static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ + BDRVRawState *s = bs->opaque; + struct hd_geometry ioctl_geo = {0}; + + /* If DASD, get its geometry */ + if (check_for_dasd(s->fd) < 0) { + return -ENOTSUP; + } + if (ioctl(s->fd, HDIO_GETGEO, &ioctl_geo) < 0) { + return -errno; + } + /* HDIO_GETGEO may return success even though geo contains zeros + (e.g. certain multipath setups) */ + if (!ioctl_geo.heads || !ioctl_geo.sectors || !ioctl_geo.cylinders) { + return -ENOTSUP; + } + /* Do not return a geometry for partition */ + if (ioctl_geo.start != 0) { + return -ENOTSUP; + } + geo->heads = ioctl_geo.heads; + geo->sectors = ioctl_geo.sectors; + geo->cylinders = ioctl_geo.cylinders; + + return 0; +} +#else /* __linux__ */ +static int hdev_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ + return -ENOTSUP; +} +#endif + +static ssize_t handle_aiocb_ioctl(RawPosixAIOData *aiocb) +{ + int ret; + + ret = ioctl(aiocb->aio_fildes, aiocb->aio_ioctl_cmd, aiocb->aio_ioctl_buf); + if (ret == -1) { + return -errno; + } + + return 0; +} + +static ssize_t handle_aiocb_flush(RawPosixAIOData *aiocb) +{ + int ret; + + ret = qemu_fdatasync(aiocb->aio_fildes); + if (ret == -1) { + return -errno; + } + return 0; +} + +#ifdef CONFIG_PREADV + +static bool preadv_present = true; + +static ssize_t +qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return preadv(fd, iov, nr_iov, offset); +} + +static ssize_t +qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return pwritev(fd, iov, nr_iov, offset); +} + +#else + +static bool preadv_present = false; + +static ssize_t +qemu_preadv(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return -ENOSYS; +} + +static ssize_t +qemu_pwritev(int fd, const struct iovec *iov, int nr_iov, off_t offset) +{ + return -ENOSYS; +} + +#endif + +static ssize_t handle_aiocb_rw_vector(RawPosixAIOData *aiocb) +{ + ssize_t len; + + do { + if (aiocb->aio_type & QEMU_AIO_WRITE) + len = qemu_pwritev(aiocb->aio_fildes, + aiocb->aio_iov, + aiocb->aio_niov, + aiocb->aio_offset); + else + len = qemu_preadv(aiocb->aio_fildes, + aiocb->aio_iov, + aiocb->aio_niov, + aiocb->aio_offset); + } while (len == -1 && errno == EINTR); + + if (len == -1) { + return -errno; + } + return len; +} + +/* + * Read/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handles or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. + */ +static ssize_t handle_aiocb_rw_linear(RawPosixAIOData *aiocb, char *buf) +{ + ssize_t offset = 0; + ssize_t len; + + while (offset < aiocb->aio_nbytes) { + if (aiocb->aio_type & QEMU_AIO_WRITE) { + len = pwrite(aiocb->aio_fildes, + (const char *)buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + } else { + len = pread(aiocb->aio_fildes, + buf + offset, + aiocb->aio_nbytes - offset, + aiocb->aio_offset + offset); + } + if (len == -1 && errno == EINTR) { + continue; + } else if (len == -1 && errno == EINVAL && + (aiocb->bs->open_flags & BDRV_O_NOCACHE) && + !(aiocb->aio_type & QEMU_AIO_WRITE) && + offset > 0) { + /* O_DIRECT pread() may fail with EINVAL when offset is unaligned + * after a short read. Assume that O_DIRECT short reads only occur + * at EOF. Therefore this is a short read, not an I/O error. + */ + break; + } else if (len == -1) { + offset = -errno; + break; + } else if (len == 0) { + break; + } + offset += len; + } + + return offset; +} + +static ssize_t handle_aiocb_rw(RawPosixAIOData *aiocb) +{ + ssize_t nbytes; + char *buf; + + if (!(aiocb->aio_type & QEMU_AIO_MISALIGNED)) { + /* + * If there is just a single buffer, and it is properly aligned + * we can just use plain pread/pwrite without any problems. + */ + if (aiocb->aio_niov == 1) { + return handle_aiocb_rw_linear(aiocb, aiocb->aio_iov->iov_base); + } + /* + * We have more than one iovec, and all are properly aligned. + * + * Try preadv/pwritev first and fall back to linearizing the + * buffer if it's not supported. + */ + if (preadv_present) { + nbytes = handle_aiocb_rw_vector(aiocb); + if (nbytes == aiocb->aio_nbytes || + (nbytes < 0 && nbytes != -ENOSYS)) { + return nbytes; + } + preadv_present = false; + } + + /* + * XXX(hch): short read/write. no easy way to handle the reminder + * using these interfaces. For now retry using plain + * pread/pwrite? + */ + } + + /* + * Ok, we have to do it the hard way, copy all segments into + * a single aligned buffer. + */ + buf = qemu_try_blockalign(aiocb->bs, aiocb->aio_nbytes); + if (buf == NULL) { + return -ENOMEM; + } + + if (aiocb->aio_type & QEMU_AIO_WRITE) { + char *p = buf; + int i; + + for (i = 0; i < aiocb->aio_niov; ++i) { + memcpy(p, aiocb->aio_iov[i].iov_base, aiocb->aio_iov[i].iov_len); + p += aiocb->aio_iov[i].iov_len; + } + assert(p - buf == aiocb->aio_nbytes); + } + + nbytes = handle_aiocb_rw_linear(aiocb, buf); + if (!(aiocb->aio_type & QEMU_AIO_WRITE)) { + char *p = buf; + size_t count = aiocb->aio_nbytes, copy; + int i; + + for (i = 0; i < aiocb->aio_niov && count; ++i) { + copy = count; + if (copy > aiocb->aio_iov[i].iov_len) { + copy = aiocb->aio_iov[i].iov_len; + } + memcpy(aiocb->aio_iov[i].iov_base, p, copy); + assert(count >= copy); + p += copy; + count -= copy; + } + assert(count == 0); + } + qemu_vfree(buf); + + return nbytes; +} + +#ifdef CONFIG_XFS +static int xfs_write_zeroes(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + int err; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_ZERO_RANGE, &fl) < 0) { + err = errno; + DPRINTF("cannot write zero range (%s)\n", strerror(errno)); + return -err; + } + + return 0; +} + +static int xfs_discard(BDRVRawState *s, int64_t offset, uint64_t bytes) +{ + struct xfs_flock64 fl; + int err; + + memset(&fl, 0, sizeof(fl)); + fl.l_whence = SEEK_SET; + fl.l_start = offset; + fl.l_len = bytes; + + if (xfsctl(NULL, s->fd, XFS_IOC_UNRESVSP64, &fl) < 0) { + err = errno; + DPRINTF("cannot punch hole (%s)\n", strerror(errno)); + return -err; + } + + return 0; +} +#endif + +static int translate_err(int err) +{ + if (err == -ENODEV || err == -ENOSYS || err == -EOPNOTSUPP || + err == -ENOTTY) { + err = -ENOTSUP; + } + return err; +} + +#ifdef CONFIG_FALLOCATE +static int do_fallocate(int fd, int mode, off_t offset, off_t len) +{ + do { + if (fallocate(fd, mode, offset, len) == 0) { + return 0; + } + } while (errno == EINTR); + return translate_err(-errno); +} +#endif + +static ssize_t handle_aiocb_write_zeroes_block(RawPosixAIOData *aiocb) +{ + int ret = -ENOTSUP; + BDRVRawState *s = aiocb->bs->opaque; + + if (!s->has_write_zeroes) { + return -ENOTSUP; + } + +#ifdef BLKZEROOUT + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKZEROOUT, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = translate_err(-errno); +#endif + + if (ret == -ENOTSUP) { + s->has_write_zeroes = false; + } + return ret; +} + +static ssize_t handle_aiocb_write_zeroes(RawPosixAIOData *aiocb) +{ +#if defined(CONFIG_FALLOCATE) || defined(CONFIG_XFS) + BDRVRawState *s = aiocb->bs->opaque; +#endif + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { + return handle_aiocb_write_zeroes_block(aiocb); + } + +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_write_zeroes(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + +#ifdef CONFIG_FALLOCATE_ZERO_RANGE + if (s->has_write_zeroes) { + int ret = do_fallocate(s->fd, FALLOC_FL_ZERO_RANGE, + aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_write_zeroes = false; + } +#endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + if (s->has_discard && s->has_fallocate) { + int ret = do_fallocate(s->fd, + FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0) { + ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_fallocate = false; + } else if (ret != -ENOTSUP) { + return ret; + } else { + s->has_discard = false; + } + } +#endif + +#ifdef CONFIG_FALLOCATE + if (s->has_fallocate && aiocb->aio_offset >= bdrv_getlength(aiocb->bs)) { + int ret = do_fallocate(s->fd, 0, aiocb->aio_offset, aiocb->aio_nbytes); + if (ret == 0 || ret != -ENOTSUP) { + return ret; + } + s->has_fallocate = false; + } +#endif + + return -ENOTSUP; +} + +static ssize_t handle_aiocb_discard(RawPosixAIOData *aiocb) +{ + int ret = -EOPNOTSUPP; + BDRVRawState *s = aiocb->bs->opaque; + + if (!s->has_discard) { + return -ENOTSUP; + } + + if (aiocb->aio_type & QEMU_AIO_BLKDEV) { +#ifdef BLKDISCARD + do { + uint64_t range[2] = { aiocb->aio_offset, aiocb->aio_nbytes }; + if (ioctl(aiocb->aio_fildes, BLKDISCARD, range) == 0) { + return 0; + } + } while (errno == EINTR); + + ret = -errno; +#endif + } else { +#ifdef CONFIG_XFS + if (s->is_xfs) { + return xfs_discard(s, aiocb->aio_offset, aiocb->aio_nbytes); + } +#endif + +#ifdef CONFIG_FALLOCATE_PUNCH_HOLE + ret = do_fallocate(s->fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, + aiocb->aio_offset, aiocb->aio_nbytes); +#endif + } + + ret = translate_err(ret); + if (ret == -ENOTSUP) { + s->has_discard = false; + } + return ret; +} + +static int aio_worker(void *arg) +{ + RawPosixAIOData *aiocb = arg; + ssize_t ret = 0; + + switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { + case QEMU_AIO_READ: + ret = handle_aiocb_rw(aiocb); + if (ret >= 0 && ret < aiocb->aio_nbytes) { + iov_memset(aiocb->aio_iov, aiocb->aio_niov, ret, + 0, aiocb->aio_nbytes - ret); + + ret = aiocb->aio_nbytes; + } + if (ret == aiocb->aio_nbytes) { + ret = 0; + } else if (ret >= 0 && ret < aiocb->aio_nbytes) { + ret = -EINVAL; + } + break; + case QEMU_AIO_WRITE: + ret = handle_aiocb_rw(aiocb); + if (ret == aiocb->aio_nbytes) { + ret = 0; + } else if (ret >= 0 && ret < aiocb->aio_nbytes) { + ret = -EINVAL; + } + break; + case QEMU_AIO_FLUSH: + ret = handle_aiocb_flush(aiocb); + break; + case QEMU_AIO_IOCTL: + ret = handle_aiocb_ioctl(aiocb); + break; + case QEMU_AIO_DISCARD: + ret = handle_aiocb_discard(aiocb); + break; + case QEMU_AIO_WRITE_ZEROES: + ret = handle_aiocb_write_zeroes(aiocb); + break; + default: + fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); + ret = -EINVAL; + break; + } + + g_free(aiocb); + return ret; +} + +static int paio_submit_co(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + int type) +{ + RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); + ThreadPool *pool; + + acb->bs = bs; + acb->aio_type = type; + acb->aio_fildes = fd; + + acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; + acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + assert(qiov->size == acb->aio_nbytes); + } + + trace_paio_submit_co(sector_num, nb_sectors, type); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_co(pool, aio_worker, acb); +} + +static BlockAIOCB *paio_submit(BlockDriverState *bs, int fd, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + RawPosixAIOData *acb = g_new(RawPosixAIOData, 1); + ThreadPool *pool; + + acb->bs = bs; + acb->aio_type = type; + acb->aio_fildes = fd; + + acb->aio_nbytes = nb_sectors * BDRV_SECTOR_SIZE; + acb->aio_offset = sector_num * BDRV_SECTOR_SIZE; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + assert(qiov->size == acb->aio_nbytes); + } + + trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); +} + +static BlockAIOCB *raw_aio_submit(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + + /* + * Check if the underlying device requires requests to be aligned, + * and if the request we are trying to submit is aligned or not. + * If this is the case tell the low-level driver that it needs + * to copy the buffer. + */ + if (s->needs_alignment) { + if (!bdrv_qiov_is_aligned(bs, qiov)) { + type |= QEMU_AIO_MISALIGNED; +#ifdef CONFIG_LINUX_AIO + } else if (s->use_aio) { + return laio_submit(bs, s->aio_ctx, s->fd, sector_num, qiov, + nb_sectors, cb, opaque, type); +#endif + } + } + + return paio_submit(bs, s->fd, sector_num, qiov, nb_sectors, + cb, opaque, type); +} + +static void raw_aio_plug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_plug(bs, s->aio_ctx); + } +#endif +} + +static void raw_aio_unplug(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_unplug(bs, s->aio_ctx, true); + } +#endif +} + +static void raw_aio_flush_io_queue(BlockDriverState *bs) +{ +#ifdef CONFIG_LINUX_AIO + BDRVRawState *s = bs->opaque; + if (s->use_aio) { + laio_io_unplug(bs, s->aio_ctx, false); + } +#endif +} + +static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return raw_aio_submit(bs, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_READ); +} + +static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + return raw_aio_submit(bs, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_WRITE); +} + +static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) + return NULL; + + return paio_submit(bs, s->fd, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); +} + +static void raw_close(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + raw_detach_aio_context(bs); + +#ifdef CONFIG_LINUX_AIO + if (s->use_aio) { + laio_cleanup(s->aio_ctx); + } +#endif + if (s->fd >= 0) { + qemu_close(s->fd); + s->fd = -1; + } +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRawState *s = bs->opaque; + struct stat st; + + if (fstat(s->fd, &st)) { + return -errno; + } + + if (S_ISREG(st.st_mode)) { + if (ftruncate(s->fd, offset) < 0) { + return -errno; + } + } else if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + if (offset > raw_getlength(bs)) { + return -EINVAL; + } + } else { + return -ENOTSUP; + } + + return 0; +} + +#ifdef __OpenBSD__ +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + struct stat st; + + if (fstat(fd, &st)) + return -errno; + if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + struct disklabel dl; + + if (ioctl(fd, DIOCGDINFO, &dl)) + return -errno; + return (uint64_t)dl.d_secsize * + dl.d_partitions[DISKPART(st.st_rdev)].p_size; + } else + return st.st_size; +} +#elif defined(__NetBSD__) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + struct stat st; + + if (fstat(fd, &st)) + return -errno; + if (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode)) { + struct dkwedge_info dkw; + + if (ioctl(fd, DIOCGWEDGEINFO, &dkw) != -1) { + return dkw.dkw_size * 512; + } else { + struct disklabel dl; + + if (ioctl(fd, DIOCGDINFO, &dl)) + return -errno; + return (uint64_t)dl.d_secsize * + dl.d_partitions[DISKPART(st.st_rdev)].p_size; + } + } else + return st.st_size; +} +#elif defined(__sun__) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + struct dk_minfo minfo; + int ret; + int64_t size; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + /* + * Use the DKIOCGMEDIAINFO ioctl to read the size. + */ + ret = ioctl(s->fd, DKIOCGMEDIAINFO, &minfo); + if (ret != -1) { + return minfo.dki_lbsize * minfo.dki_capacity; + } + + /* + * There are reports that lseek on some devices fails, but + * irc discussion said that contingency on contingency was overkill. + */ + size = lseek(s->fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; +} +#elif defined(CONFIG_BSD) +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd = s->fd; + int64_t size; + struct stat sb; +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) + int reopened = 0; +#endif + int ret; + + ret = fd_open(bs); + if (ret < 0) + return ret; + +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +again: +#endif + if (!fstat(fd, &sb) && (S_IFCHR & sb.st_mode)) { +#ifdef DIOCGMEDIASIZE + if (ioctl(fd, DIOCGMEDIASIZE, (off_t *)&size)) +#elif defined(DIOCGPART) + { + struct partinfo pi; + if (ioctl(fd, DIOCGPART, &pi) == 0) + size = pi.media_size; + else + size = 0; + } + if (size == 0) +#endif +#if defined(__APPLE__) && defined(__MACH__) + { + uint64_t sectors = 0; + uint32_t sector_size = 0; + + if (ioctl(fd, DKIOCGETBLOCKCOUNT, §ors) == 0 + && ioctl(fd, DKIOCGETBLOCKSIZE, §or_size) == 0) { + size = sectors * sector_size; + } else { + size = lseek(fd, 0LL, SEEK_END); + if (size < 0) { + return -errno; + } + } + } +#else + size = lseek(fd, 0LL, SEEK_END); + if (size < 0) { + return -errno; + } +#endif +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + switch(s->type) { + case FTYPE_CD: + /* XXX FreeBSD acd returns UINT_MAX sectors for an empty drive */ + if (size == 2048LL * (unsigned)-1) + size = 0; + /* XXX no disc? maybe we need to reopen... */ + if (size <= 0 && !reopened && cdrom_reopen(bs) >= 0) { + reopened = 1; + goto again; + } + } +#endif + } else { + size = lseek(fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + } + return size; +} +#else +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + int64_t size; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + size = lseek(s->fd, 0, SEEK_END); + if (size < 0) { + return -errno; + } + return size; +} +#endif + +static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +{ + struct stat st; + BDRVRawState *s = bs->opaque; + + if (fstat(s->fd, &st) < 0) { + return -errno; + } + return (int64_t)st.st_blocks * 512; +} + +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int fd; + int result = 0; + int64_t total_size = 0; + bool nocow = false; + PreallocMode prealloc; + char *buf = NULL; + Error *local_err = NULL; + + strstart(filename, "file:", &filename); + + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + nocow = qemu_opt_get_bool(opts, BLOCK_OPT_NOCOW, false); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + prealloc = qapi_enum_parse(PreallocMode_lookup, buf, + PREALLOC_MODE_MAX, PREALLOC_MODE_OFF, + &local_err); + g_free(buf); + if (local_err) { + error_propagate(errp, local_err); + result = -EINVAL; + goto out; + } + + fd = qemu_open(filename, O_RDWR | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (fd < 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not create file"); + goto out; + } + + if (nocow) { +#ifdef __linux__ + /* Set NOCOW flag to solve performance issue on fs like btrfs. + * This is an optimisation. The FS_IOC_SETFLAGS ioctl return value + * will be ignored since any failure of this operation should not + * block the left work. + */ + int attr; + if (ioctl(fd, FS_IOC_GETFLAGS, &attr) == 0) { + attr |= FS_NOCOW_FL; + ioctl(fd, FS_IOC_SETFLAGS, &attr); + } +#endif + } + + if (ftruncate(fd, total_size) != 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not resize file"); + goto out_close; + } + + switch (prealloc) { +#ifdef CONFIG_POSIX_FALLOCATE + case PREALLOC_MODE_FALLOC: + /* posix_fallocate() doesn't set errno. */ + result = -posix_fallocate(fd, 0, total_size); + if (result != 0) { + error_setg_errno(errp, -result, + "Could not preallocate data for the new file"); + } + break; +#endif + case PREALLOC_MODE_FULL: + { + int64_t num = 0, left = total_size; + buf = g_malloc0(65536); + + while (left > 0) { + num = MIN(left, 65536); + result = write(fd, buf, num); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Could not write to the new file"); + break; + } + left -= result; + } + if (result >= 0) { + result = fsync(fd); + if (result < 0) { + result = -errno; + error_setg_errno(errp, -result, + "Could not flush new file to disk"); + } + } + g_free(buf); + break; + } + case PREALLOC_MODE_OFF: + break; + default: + result = -EINVAL; + error_setg(errp, "Unsupported preallocation mode: %s", + PreallocMode_lookup[prealloc]); + break; + } + +out_close: + if (qemu_close(fd) != 0 && result == 0) { + result = -errno; + error_setg_errno(errp, -result, "Could not close the new file"); + } +out: + return result; +} + +/* + * Find allocation range in @bs around offset @start. + * May change underlying file descriptor's file offset. + * If @start is not in a hole, store @start in @data, and the + * beginning of the next hole in @hole, and return 0. + * If @start is in a non-trailing hole, store @start in @hole and the + * beginning of the next non-hole in @data, and return 0. + * If @start is in a trailing hole or beyond EOF, return -ENXIO. + * If we can't find out, return a negative errno other than -ENXIO. + */ +static int find_allocation(BlockDriverState *bs, off_t start, + off_t *data, off_t *hole) +{ +#if defined SEEK_HOLE && defined SEEK_DATA + BDRVRawState *s = bs->opaque; + off_t offs; + + /* + * SEEK_DATA cases: + * D1. offs == start: start is in data + * D2. offs > start: start is in a hole, next data at offs + * D3. offs < 0, errno = ENXIO: either start is in a trailing hole + * or start is beyond EOF + * If the latter happens, the file has been truncated behind + * our back since we opened it. All bets are off then. + * Treating like a trailing hole is simplest. + * D4. offs < 0, errno != ENXIO: we learned nothing + */ + offs = lseek(s->fd, start, SEEK_DATA); + if (offs < 0) { + return -errno; /* D3 or D4 */ + } + assert(offs >= start); + + if (offs > start) { + /* D2: in hole, next data at offs */ + *hole = start; + *data = offs; + return 0; + } + + /* D1: in data, end not yet known */ + + /* + * SEEK_HOLE cases: + * H1. offs == start: start is in a hole + * If this happens here, a hole has been dug behind our back + * since the previous lseek(). + * H2. offs > start: either start is in data, next hole at offs, + * or start is in trailing hole, EOF at offs + * Linux treats trailing holes like any other hole: offs == + * start. Solaris seeks to EOF instead: offs > start (blech). + * If that happens here, a hole has been dug behind our back + * since the previous lseek(). + * H3. offs < 0, errno = ENXIO: start is beyond EOF + * If this happens, the file has been truncated behind our + * back since we opened it. Treat it like a trailing hole. + * H4. offs < 0, errno != ENXIO: we learned nothing + * Pretend we know nothing at all, i.e. "forget" about D1. + */ + offs = lseek(s->fd, start, SEEK_HOLE); + if (offs < 0) { + return -errno; /* D1 and (H3 or H4) */ + } + assert(offs >= start); + + if (offs > start) { + /* + * D1 and H2: either in data, next hole at offs, or it was in + * data but is now in a trailing hole. In the latter case, + * all bets are off. Treating it as if it there was data all + * the way to EOF is safe, so simply do that. + */ + *data = start; + *hole = offs; + return 0; + } + + /* D1 and H1 */ + return -EBUSY; +#else + return -ENOTSUP; +#endif +} + +/* + * Returns the allocation status of the specified sectors. + * + * If 'sector_num' is beyond the end of the disk image the return value is 0 + * and 'pnum' is set to 0. + * + * 'pnum' is set to the number of sectors (including and immediately following + * the specified sector) that are known to be in the same + * allocated/unallocated state. + * + * 'nb_sectors' is the max value 'pnum' should be set to. If nb_sectors goes + * beyond the end of the disk image it will be clamped. + */ +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + off_t start, data = 0, hole = 0; + int64_t total_size; + int ret; + + ret = fd_open(bs); + if (ret < 0) { + return ret; + } + + start = sector_num * BDRV_SECTOR_SIZE; + total_size = bdrv_getlength(bs); + if (total_size < 0) { + return total_size; + } else if (start >= total_size) { + *pnum = 0; + return 0; + } else if (start + nb_sectors * BDRV_SECTOR_SIZE > total_size) { + nb_sectors = DIV_ROUND_UP(total_size - start, BDRV_SECTOR_SIZE); + } + + ret = find_allocation(bs, start, &data, &hole); + if (ret == -ENXIO) { + /* Trailing hole */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_ZERO; + } else if (ret < 0) { + /* No info available, so pretend there are no holes */ + *pnum = nb_sectors; + ret = BDRV_BLOCK_DATA; + } else if (data == start) { + /* On a data extent, compute sectors to the end of the extent, + * possibly including a partial sector at EOF. */ + *pnum = MIN(nb_sectors, DIV_ROUND_UP(hole - start, BDRV_SECTOR_SIZE)); + ret = BDRV_BLOCK_DATA; + } else { + /* On a hole, compute sectors to the beginning of the next extent. */ + assert(hole == start); + *pnum = MIN(nb_sectors, (data - start) / BDRV_SECTOR_SIZE); + ret = BDRV_BLOCK_ZERO; + } + return ret | BDRV_BLOCK_OFFSET_VALID | start; +} + +static coroutine_fn BlockAIOCB *raw_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + cb, opaque, QEMU_AIO_DISCARD); +} + +static int coroutine_fn raw_co_write_zeroes( + BlockDriverState *bs, int64_t sector_num, + int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD); + } + return -ENOTSUP; +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVRawState *s = bs->opaque; + + bdi->unallocated_blocks_are_zero = s->discard_zeroes; + bdi->can_write_zeroes_with_unmap = s->discard_zeroes; + return 0; +} + +static QemuOptsList raw_create_opts = { + .name = "raw-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_NOCOW, + .type = QEMU_OPT_BOOL, + .help = "Turn off copy-on-write (valid only on btrfs)" + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, falloc, full)" + }, + { /* end of list */ } + } +}; + +BlockDriver bdrv_file = { + .format_name = "file", + .protocol_name = "file", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe = NULL, /* no probe for protocols */ + .bdrv_parse_filename = raw_parse_filename, + .bdrv_file_open = raw_open, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_close = raw_close, + .bdrv_create = raw_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_get_block_status = raw_co_get_block_status, + .bdrv_co_write_zeroes = raw_co_write_zeroes, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + .bdrv_aio_discard = raw_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + + .create_opts = &raw_create_opts, +}; + +/***********************************************/ +/* host device */ + +#if defined(__APPLE__) && defined(__MACH__) +static kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ); +static kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, + CFIndex maxPathSize, int flags); +kern_return_t FindEjectableCDMedia( io_iterator_t *mediaIterator ) +{ + kern_return_t kernResult; + mach_port_t masterPort; + CFMutableDictionaryRef classesToMatch; + + kernResult = IOMasterPort( MACH_PORT_NULL, &masterPort ); + if ( KERN_SUCCESS != kernResult ) { + printf( "IOMasterPort returned %d\n", kernResult ); + } + + classesToMatch = IOServiceMatching( kIOCDMediaClass ); + if ( classesToMatch == NULL ) { + printf( "IOServiceMatching returned a NULL dictionary.\n" ); + } else { + CFDictionarySetValue( classesToMatch, CFSTR( kIOMediaEjectableKey ), kCFBooleanTrue ); + } + kernResult = IOServiceGetMatchingServices( masterPort, classesToMatch, mediaIterator ); + if ( KERN_SUCCESS != kernResult ) + { + printf( "IOServiceGetMatchingServices returned %d\n", kernResult ); + } + + return kernResult; +} + +kern_return_t GetBSDPath(io_iterator_t mediaIterator, char *bsdPath, + CFIndex maxPathSize, int flags) +{ + io_object_t nextMedia; + kern_return_t kernResult = KERN_FAILURE; + *bsdPath = '\0'; + nextMedia = IOIteratorNext( mediaIterator ); + if ( nextMedia ) + { + CFTypeRef bsdPathAsCFString; + bsdPathAsCFString = IORegistryEntryCreateCFProperty( nextMedia, CFSTR( kIOBSDNameKey ), kCFAllocatorDefault, 0 ); + if ( bsdPathAsCFString ) { + size_t devPathLength; + strcpy( bsdPath, _PATH_DEV ); + if (flags & BDRV_O_NOCACHE) { + strcat(bsdPath, "r"); + } + devPathLength = strlen( bsdPath ); + if ( CFStringGetCString( bsdPathAsCFString, bsdPath + devPathLength, maxPathSize - devPathLength, kCFStringEncodingASCII ) ) { + kernResult = KERN_SUCCESS; + } + CFRelease( bsdPathAsCFString ); + } + IOObjectRelease( nextMedia ); + } + + return kernResult; +} + +#endif + +static int hdev_probe_device(const char *filename) +{ + struct stat st; + + /* allow a dedicated CD-ROM driver to match with a higher priority */ + if (strstart(filename, "/dev/cdrom", NULL)) + return 50; + + if (stat(filename, &st) >= 0 && + (S_ISCHR(st.st_mode) || S_ISBLK(st.st_mode))) { + return 100; + } + + return 0; +} + +static int check_hdev_writable(BDRVRawState *s) +{ +#if defined(BLKROGET) + /* Linux block devices can be configured "read-only" using blockdev(8). + * This is independent of device node permissions and therefore open(2) + * with O_RDWR succeeds. Actual writes fail with EPERM. + * + * bdrv_open() is supposed to fail if the disk is read-only. Explicitly + * check for read-only block devices so that Linux block devices behave + * properly. + */ + struct stat st; + int readonly = 0; + + if (fstat(s->fd, &st)) { + return -errno; + } + + if (!S_ISBLK(st.st_mode)) { + return 0; + } + + if (ioctl(s->fd, BLKROGET, &readonly) < 0) { + return -errno; + } + + if (readonly) { + return -EACCES; + } +#endif /* defined(BLKROGET) */ + return 0; +} + +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static bool hdev_is_sg(BlockDriverState *bs) +{ + +#if defined(__linux__) + + struct stat st; + struct sg_scsi_id scsiid; + int sg_version; + + if (stat(bs->filename, &st) >= 0 && S_ISCHR(st.st_mode) && + !bdrv_ioctl(bs, SG_GET_VERSION_NUM, &sg_version) && + !bdrv_ioctl(bs, SG_GET_SCSI_ID, &scsiid)) { + DPRINTF("SG device found: type=%d, version=%d\n", + scsiid.scsi_type, sg_version); + return true; + } + +#endif + + return false; +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; + +#if defined(__APPLE__) && defined(__MACH__) + const char *filename = qdict_get_str(options, "filename"); + + if (strstart(filename, "/dev/cdrom", NULL)) { + kern_return_t kernResult; + io_iterator_t mediaIterator; + char bsdPath[ MAXPATHLEN ]; + int fd; + + kernResult = FindEjectableCDMedia( &mediaIterator ); + kernResult = GetBSDPath(mediaIterator, bsdPath, sizeof(bsdPath), + flags); + if ( bsdPath[ 0 ] != '\0' ) { + strcat(bsdPath,"s0"); + /* some CDs don't have a partition 0 */ + fd = qemu_open(bsdPath, O_RDONLY | O_BINARY | O_LARGEFILE); + if (fd < 0) { + bsdPath[strlen(bsdPath)-1] = '1'; + } else { + qemu_close(fd); + } + filename = bsdPath; + qdict_put(options, "filename", qstring_from_str(filename)); + } + + if ( mediaIterator ) + IOObjectRelease( mediaIterator ); + } +#endif + + s->type = FTYPE_FILE; + + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (ret < 0) { + if (local_err) { + error_propagate(errp, local_err); + } + return ret; + } + + /* Since this does ioctl the device must be already opened */ + bs->sg = hdev_is_sg(bs); + + if (flags & BDRV_O_RDWR) { + ret = check_hdev_writable(s); + if (ret < 0) { + raw_close(bs); + error_setg_errno(errp, -ret, "The device is not writable"); + return ret; + } + } + + return ret; +} + +#if defined(__linux__) + +static BlockAIOCB *hdev_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + RawPosixAIOData *acb; + ThreadPool *pool; + + if (fd_open(bs) < 0) + return NULL; + + acb = g_new(RawPosixAIOData, 1); + acb->bs = bs; + acb->aio_type = QEMU_AIO_IOCTL; + acb->aio_fildes = s->fd; + acb->aio_offset = 0; + acb->aio_ioctl_buf = buf; + acb->aio_ioctl_cmd = req; + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); +} +#endif /* linux */ + +static int fd_open(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + /* this is just to ensure s->fd is sane (its called by io ops) */ + if (s->fd >= 0) + return 0; + return -EIO; +} + +static coroutine_fn BlockAIOCB *hdev_aio_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + + if (fd_open(bs) < 0) { + return NULL; + } + return paio_submit(bs, s->fd, sector_num, NULL, nb_sectors, + cb, opaque, QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); +} + +static coroutine_fn int hdev_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, BdrvRequestFlags flags) +{ + BDRVRawState *s = bs->opaque; + int rc; + + rc = fd_open(bs); + if (rc < 0) { + return rc; + } + if (!(flags & BDRV_REQ_MAY_UNMAP)) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_WRITE_ZEROES|QEMU_AIO_BLKDEV); + } else if (s->discard_zeroes) { + return paio_submit_co(bs, s->fd, sector_num, NULL, nb_sectors, + QEMU_AIO_DISCARD|QEMU_AIO_BLKDEV); + } + return -ENOTSUP; +} + +static int hdev_create(const char *filename, QemuOpts *opts, + Error **errp) +{ + int fd; + int ret = 0; + struct stat stat_buf; + int64_t total_size = 0; + bool has_prefix; + + /* This function is used by both protocol block drivers and therefore either + * of these prefixes may be given. + * The return value has to be stored somewhere, otherwise this is an error + * due to -Werror=unused-value. */ + has_prefix = + strstart(filename, "host_device:", &filename) || + strstart(filename, "host_cdrom:" , &filename); + + (void)has_prefix; + + ret = raw_normalize_devicepath(&filename); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not normalize device path"); + return ret; + } + + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + + fd = qemu_open(filename, O_WRONLY | O_BINARY); + if (fd < 0) { + ret = -errno; + error_setg_errno(errp, -ret, "Could not open device"); + return ret; + } + + if (fstat(fd, &stat_buf) < 0) { + ret = -errno; + error_setg_errno(errp, -ret, "Could not stat device"); + } else if (!S_ISBLK(stat_buf.st_mode) && !S_ISCHR(stat_buf.st_mode)) { + error_setg(errp, + "The given file is neither a block nor a character device"); + ret = -ENODEV; + } else if (lseek(fd, 0, SEEK_END) < total_size) { + error_setg(errp, "Device is too small"); + ret = -ENOSPC; + } + + qemu_close(fd); + return ret; +} + +static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .protocol_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = hdev_probe_device, + .bdrv_parse_filename = hdev_parse_filename, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_create = hdev_create, + .create_opts = &raw_create_opts, + .bdrv_co_write_zeroes = hdev_co_write_zeroes, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + .bdrv_aio_discard = hdev_aio_discard, + .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_info = raw_get_info, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + .bdrv_probe_blocksizes = hdev_probe_blocksizes, + .bdrv_probe_geometry = hdev_probe_geometry, + + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + + /* generic scsi device */ +#ifdef __linux__ + .bdrv_aio_ioctl = hdev_aio_ioctl, +#endif +}; + +#if defined(__linux__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__) +static void cdrom_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_cdrom:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} +#endif + +#ifdef __linux__ +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; + + s->type = FTYPE_CD; + + /* open will not fail even if no CD is inserted, so add O_NONBLOCK */ + ret = raw_open_common(bs, options, flags, O_NONBLOCK, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} + +static int cdrom_probe_device(const char *filename) +{ + int fd, ret; + int prio = 0; + struct stat st; + + fd = qemu_open(filename, O_RDONLY | O_NONBLOCK); + if (fd < 0) { + goto out; + } + ret = fstat(fd, &st); + if (ret == -1 || !S_ISBLK(st.st_mode)) { + goto outc; + } + + /* Attempt to detect via a CDROM specific ioctl */ + ret = ioctl(fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + if (ret >= 0) + prio = 100; + +outc: + qemu_close(fd); +out: + return prio; +} + +static bool cdrom_is_inserted(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int ret; + + ret = ioctl(s->fd, CDROM_DRIVE_STATUS, CDSL_CURRENT); + return ret == CDS_DISC_OK; +} + +static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +{ + BDRVRawState *s = bs->opaque; + + if (eject_flag) { + if (ioctl(s->fd, CDROMEJECT, NULL) < 0) + perror("CDROMEJECT"); + } else { + if (ioctl(s->fd, CDROMCLOSETRAY, NULL) < 0) + perror("CDROMEJECT"); + } +} + +static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +{ + BDRVRawState *s = bs->opaque; + + if (ioctl(s->fd, CDROM_LOCKDOOR, locked) < 0) { + /* + * Note: an error can happen if the distribution automatically + * mounts the CD-ROM + */ + /* perror("CDROM_LOCKDOOR"); */ + } +} + +static BlockDriver bdrv_host_cdrom = { + .format_name = "host_cdrom", + .protocol_name = "host_cdrom", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, + .bdrv_file_open = cdrom_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_create = hdev_create, + .create_opts = &raw_create_opts, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + + /* removable device support */ + .bdrv_is_inserted = cdrom_is_inserted, + .bdrv_eject = cdrom_eject, + .bdrv_lock_medium = cdrom_lock_medium, + + /* generic scsi device */ + .bdrv_aio_ioctl = hdev_aio_ioctl, +}; +#endif /* __linux__ */ + +#if defined (__FreeBSD__) || defined(__FreeBSD_kernel__) +static int cdrom_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + Error *local_err = NULL; + int ret; + + s->type = FTYPE_CD; + + ret = raw_open_common(bs, options, flags, 0, &local_err); + if (ret) { + if (local_err) { + error_propagate(errp, local_err); + } + return ret; + } + + /* make sure the door isn't locked at this time */ + ioctl(s->fd, CDIOCALLOW); + return 0; +} + +static int cdrom_probe_device(const char *filename) +{ + if (strstart(filename, "/dev/cd", NULL) || + strstart(filename, "/dev/acd", NULL)) + return 100; + return 0; +} + +static int cdrom_reopen(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + int fd; + + /* + * Force reread of possibly changed/newly loaded disc, + * FreeBSD seems to not notice sometimes... + */ + if (s->fd >= 0) + qemu_close(s->fd); + fd = qemu_open(bs->filename, s->open_flags, 0644); + if (fd < 0) { + s->fd = -1; + return -EIO; + } + s->fd = fd; + + /* make sure the door isn't locked at this time */ + ioctl(s->fd, CDIOCALLOW); + return 0; +} + +static bool cdrom_is_inserted(BlockDriverState *bs) +{ + return raw_getlength(bs) > 0; +} + +static void cdrom_eject(BlockDriverState *bs, bool eject_flag) +{ + BDRVRawState *s = bs->opaque; + + if (s->fd < 0) + return; + + (void) ioctl(s->fd, CDIOCALLOW); + + if (eject_flag) { + if (ioctl(s->fd, CDIOCEJECT) < 0) + perror("CDIOCEJECT"); + } else { + if (ioctl(s->fd, CDIOCCLOSE) < 0) + perror("CDIOCCLOSE"); + } + + cdrom_reopen(bs); +} + +static void cdrom_lock_medium(BlockDriverState *bs, bool locked) +{ + BDRVRawState *s = bs->opaque; + + if (s->fd < 0) + return; + if (ioctl(s->fd, (locked ? CDIOCPREVENT : CDIOCALLOW)) < 0) { + /* + * Note: an error can happen if the distribution automatically + * mounts the CD-ROM + */ + /* perror("CDROM_LOCKDOOR"); */ + } +} + +static BlockDriver bdrv_host_cdrom = { + .format_name = "host_cdrom", + .protocol_name = "host_cdrom", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_probe_device = cdrom_probe_device, + .bdrv_parse_filename = cdrom_parse_filename, + .bdrv_file_open = cdrom_open, + .bdrv_close = raw_close, + .bdrv_reopen_prepare = raw_reopen_prepare, + .bdrv_reopen_commit = raw_reopen_commit, + .bdrv_reopen_abort = raw_reopen_abort, + .bdrv_create = hdev_create, + .create_opts = &raw_create_opts, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + .bdrv_refresh_limits = raw_refresh_limits, + .bdrv_io_plug = raw_aio_plug, + .bdrv_io_unplug = raw_aio_unplug, + .bdrv_flush_io_queue = raw_aio_flush_io_queue, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .has_variable_length = true, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + + /* removable device support */ + .bdrv_is_inserted = cdrom_is_inserted, + .bdrv_eject = cdrom_eject, + .bdrv_lock_medium = cdrom_lock_medium, +}; +#endif /* __FreeBSD__ */ + +static void bdrv_file_init(void) +{ + /* + * Register all the drivers. Note that order is important, the driver + * registered last will get probed first. + */ + bdrv_register(&bdrv_file); + bdrv_register(&bdrv_host_device); +#ifdef __linux__ + bdrv_register(&bdrv_host_cdrom); +#endif +#if defined(__FreeBSD__) || defined(__FreeBSD_kernel__) + bdrv_register(&bdrv_host_cdrom); +#endif +} + +block_init(bdrv_file_init); diff --git a/src/block/raw-win32.c b/src/block/raw-win32.c new file mode 100644 index 0000000..2d0907a --- /dev/null +++ b/src/block/raw-win32.c @@ -0,0 +1,729 @@ +/* + * Block driver for RAW files (win32) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/timer.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "raw-aio.h" +#include "trace.h" +#include "block/thread-pool.h" +#include "qemu/iov.h" +#include "qapi/qmp/qstring.h" +#include <windows.h> +#include <winioctl.h> + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_HARDDISK 2 + +typedef struct RawWin32AIOData { + BlockDriverState *bs; + HANDLE hfile; + struct iovec *aio_iov; + int aio_niov; + size_t aio_nbytes; + off64_t aio_offset; + int aio_type; +} RawWin32AIOData; + +typedef struct BDRVRawState { + HANDLE hfile; + int type; + char drive_path[16]; /* format: "d:\" */ + QEMUWin32AIOState *aio; +} BDRVRawState; + +/* + * Read/writes the data to/from a given linear buffer. + * + * Returns the number of bytes handles or -errno in case of an error. Short + * reads are only returned if the end of the file is reached. + */ +static size_t handle_aiocb_rw(RawWin32AIOData *aiocb) +{ + size_t offset = 0; + int i; + + for (i = 0; i < aiocb->aio_niov; i++) { + OVERLAPPED ov; + DWORD ret, ret_count, len; + + memset(&ov, 0, sizeof(ov)); + ov.Offset = (aiocb->aio_offset + offset); + ov.OffsetHigh = (aiocb->aio_offset + offset) >> 32; + len = aiocb->aio_iov[i].iov_len; + if (aiocb->aio_type & QEMU_AIO_WRITE) { + ret = WriteFile(aiocb->hfile, aiocb->aio_iov[i].iov_base, + len, &ret_count, &ov); + } else { + ret = ReadFile(aiocb->hfile, aiocb->aio_iov[i].iov_base, + len, &ret_count, &ov); + } + if (!ret) { + ret_count = 0; + } + if (ret_count != len) { + offset += ret_count; + break; + } + offset += len; + } + + return offset; +} + +static int aio_worker(void *arg) +{ + RawWin32AIOData *aiocb = arg; + ssize_t ret = 0; + size_t count; + + switch (aiocb->aio_type & QEMU_AIO_TYPE_MASK) { + case QEMU_AIO_READ: + count = handle_aiocb_rw(aiocb); + if (count < aiocb->aio_nbytes) { + /* A short read means that we have reached EOF. Pad the buffer + * with zeros for bytes after EOF. */ + iov_memset(aiocb->aio_iov, aiocb->aio_niov, count, + 0, aiocb->aio_nbytes - count); + + count = aiocb->aio_nbytes; + } + if (count == aiocb->aio_nbytes) { + ret = 0; + } else { + ret = -EINVAL; + } + break; + case QEMU_AIO_WRITE: + count = handle_aiocb_rw(aiocb); + if (count == aiocb->aio_nbytes) { + ret = 0; + } else { + ret = -EINVAL; + } + break; + case QEMU_AIO_FLUSH: + if (!FlushFileBuffers(aiocb->hfile)) { + return -EIO; + } + break; + default: + fprintf(stderr, "invalid aio request (0x%x)\n", aiocb->aio_type); + ret = -EINVAL; + break; + } + + g_free(aiocb); + return ret; +} + +static BlockAIOCB *paio_submit(BlockDriverState *bs, HANDLE hfile, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + RawWin32AIOData *acb = g_new(RawWin32AIOData, 1); + ThreadPool *pool; + + acb->bs = bs; + acb->hfile = hfile; + acb->aio_type = type; + + if (qiov) { + acb->aio_iov = qiov->iov; + acb->aio_niov = qiov->niov; + } + acb->aio_nbytes = nb_sectors * 512; + acb->aio_offset = sector_num * 512; + + trace_paio_submit(acb, opaque, sector_num, nb_sectors, type); + pool = aio_get_thread_pool(bdrv_get_aio_context(bs)); + return thread_pool_submit_aio(pool, aio_worker, acb, cb, opaque); +} + +int qemu_ftruncate64(int fd, int64_t length) +{ + LARGE_INTEGER li; + DWORD dw; + LONG high; + HANDLE h; + BOOL res; + + if ((GetVersion() & 0x80000000UL) && (length >> 32) != 0) + return -1; + + h = (HANDLE)_get_osfhandle(fd); + + /* get current position, ftruncate do not change position */ + li.HighPart = 0; + li.LowPart = SetFilePointer (h, 0, &li.HighPart, FILE_CURRENT); + if (li.LowPart == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { + return -1; + } + + high = length >> 32; + dw = SetFilePointer(h, (DWORD) length, &high, FILE_BEGIN); + if (dw == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { + return -1; + } + res = SetEndOfFile(h); + + /* back to old position */ + SetFilePointer(h, li.LowPart, &li.HighPart, FILE_BEGIN); + return res ? 0 : -1; +} + +static int set_sparse(int fd) +{ + DWORD returned; + return (int) DeviceIoControl((HANDLE)_get_osfhandle(fd), FSCTL_SET_SPARSE, + NULL, 0, NULL, 0, &returned, NULL); +} + +static void raw_detach_aio_context(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + if (s->aio) { + win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); + } +} + +static void raw_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVRawState *s = bs->opaque; + + if (s->aio) { + win32_aio_attach_aio_context(s->aio, new_context); + } +} + +static void raw_probe_alignment(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + DWORD sectorsPerCluster, freeClusters, totalClusters, count; + DISK_GEOMETRY_EX dg; + BOOL status; + + if (s->type == FTYPE_CD) { + bs->request_alignment = 2048; + return; + } + if (s->type == FTYPE_HARDDISK) { + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + bs->request_alignment = dg.Geometry.BytesPerSector; + return; + } + /* try GetDiskFreeSpace too */ + } + + if (s->drive_path[0]) { + GetDiskFreeSpace(s->drive_path, §orsPerCluster, + &dg.Geometry.BytesPerSector, + &freeClusters, &totalClusters); + bs->request_alignment = dg.Geometry.BytesPerSector; + } +} + +static void raw_parse_flags(int flags, int *access_flags, DWORD *overlapped) +{ + assert(access_flags != NULL); + assert(overlapped != NULL); + + if (flags & BDRV_O_RDWR) { + *access_flags = GENERIC_READ | GENERIC_WRITE; + } else { + *access_flags = GENERIC_READ; + } + + *overlapped = FILE_ATTRIBUTE_NORMAL; + if (flags & BDRV_O_NATIVE_AIO) { + *overlapped |= FILE_FLAG_OVERLAPPED; + } + if (flags & BDRV_O_NOCACHE) { + *overlapped |= FILE_FLAG_NO_BUFFERING; + } +} + +static void raw_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The filename does not have to be prefixed by the protocol name, since + * "file" is the default protocol; therefore, the return value of this + * function call can be ignored. */ + strstart(filename, "file:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static QemuOptsList raw_runtime_opts = { + .name = "raw", + .head = QTAILQ_HEAD_INITIALIZER(raw_runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "File name of the image", + }, + { /* end of list */ } + }, +}; + +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + int access_flags; + DWORD overlapped; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + int ret; + + s->type = FTYPE_FILE; + + opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + filename = qemu_opt_get(opts, "filename"); + + raw_parse_flags(flags, &access_flags, &overlapped); + + if (filename[0] && filename[1] == ':') { + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", filename[0]); + } else if (filename[0] == '\\' && filename[1] == '\\') { + s->drive_path[0] = 0; + } else { + /* Relative path. */ + char buf[MAX_PATH]; + GetCurrentDirectory(MAX_PATH, buf); + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", buf[0]); + } + + s->hfile = CreateFile(filename, access_flags, + FILE_SHARE_READ, NULL, + OPEN_EXISTING, overlapped, NULL); + if (s->hfile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + + if (err == ERROR_ACCESS_DENIED) { + ret = -EACCES; + } else { + ret = -EINVAL; + } + goto fail; + } + + if (flags & BDRV_O_NATIVE_AIO) { + s->aio = win32_aio_init(); + if (s->aio == NULL) { + CloseHandle(s->hfile); + error_setg(errp, "Could not initialize AIO"); + ret = -EINVAL; + goto fail; + } + + ret = win32_aio_attach(s->aio, s->hfile); + if (ret < 0) { + win32_aio_cleanup(s->aio); + CloseHandle(s->hfile); + error_setg_errno(errp, -ret, "Could not enable AIO"); + goto fail; + } + + win32_aio_attach_aio_context(s->aio, bdrv_get_aio_context(bs)); + } + + raw_probe_alignment(bs); + ret = 0; +fail: + qemu_opts_del(opts); + return ret; +} + +static BlockAIOCB *raw_aio_readv(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + if (s->aio) { + return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, + nb_sectors, cb, opaque, QEMU_AIO_READ); + } else { + return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_READ); + } +} + +static BlockAIOCB *raw_aio_writev(BlockDriverState *bs, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + if (s->aio) { + return win32_aio_submit(bs, s->aio, s->hfile, sector_num, qiov, + nb_sectors, cb, opaque, QEMU_AIO_WRITE); + } else { + return paio_submit(bs, s->hfile, sector_num, qiov, nb_sectors, + cb, opaque, QEMU_AIO_WRITE); + } +} + +static BlockAIOCB *raw_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, void *opaque) +{ + BDRVRawState *s = bs->opaque; + return paio_submit(bs, s->hfile, 0, NULL, 0, cb, opaque, QEMU_AIO_FLUSH); +} + +static void raw_close(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + + if (s->aio) { + win32_aio_detach_aio_context(s->aio, bdrv_get_aio_context(bs)); + win32_aio_cleanup(s->aio); + s->aio = NULL; + } + + CloseHandle(s->hfile); + if (bs->open_flags & BDRV_O_TEMPORARY) { + unlink(bs->filename); + } +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRawState *s = bs->opaque; + LONG low, high; + DWORD dwPtrLow; + + low = offset; + high = offset >> 32; + + /* + * An error has occurred if the return value is INVALID_SET_FILE_POINTER + * and GetLastError doesn't return NO_ERROR. + */ + dwPtrLow = SetFilePointer(s->hfile, low, &high, FILE_BEGIN); + if (dwPtrLow == INVALID_SET_FILE_POINTER && GetLastError() != NO_ERROR) { + fprintf(stderr, "SetFilePointer error: %lu\n", GetLastError()); + return -EIO; + } + if (SetEndOfFile(s->hfile) == 0) { + fprintf(stderr, "SetEndOfFile error: %lu\n", GetLastError()); + return -EIO; + } + return 0; +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ + BDRVRawState *s = bs->opaque; + LARGE_INTEGER l; + ULARGE_INTEGER available, total, total_free; + DISK_GEOMETRY_EX dg; + DWORD count; + BOOL status; + + switch(s->type) { + case FTYPE_FILE: + l.LowPart = GetFileSize(s->hfile, (PDWORD)&l.HighPart); + if (l.LowPart == 0xffffffffUL && GetLastError() != NO_ERROR) + return -EIO; + break; + case FTYPE_CD: + if (!GetDiskFreeSpaceEx(s->drive_path, &available, &total, &total_free)) + return -EIO; + l.QuadPart = total.QuadPart; + break; + case FTYPE_HARDDISK: + status = DeviceIoControl(s->hfile, IOCTL_DISK_GET_DRIVE_GEOMETRY_EX, + NULL, 0, &dg, sizeof(dg), &count, NULL); + if (status != 0) { + l = dg.DiskSize; + } + break; + default: + return -EIO; + } + return l.QuadPart; +} + +static int64_t raw_get_allocated_file_size(BlockDriverState *bs) +{ + typedef DWORD (WINAPI * get_compressed_t)(const char *filename, + DWORD * high); + get_compressed_t get_compressed; + struct _stati64 st; + const char *filename = bs->filename; + /* WinNT support GetCompressedFileSize to determine allocate size */ + get_compressed = + (get_compressed_t) GetProcAddress(GetModuleHandle("kernel32"), + "GetCompressedFileSizeA"); + if (get_compressed) { + DWORD high, low; + low = get_compressed(filename, &high); + if (low != 0xFFFFFFFFlu || GetLastError() == NO_ERROR) { + return (((int64_t) high) << 32) + low; + } + } + + if (_stati64(filename, &st) < 0) { + return -1; + } + return st.st_size; +} + +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int fd; + int64_t total_size = 0; + + strstart(filename, "file:", &filename); + + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + + fd = qemu_open(filename, O_WRONLY | O_CREAT | O_TRUNC | O_BINARY, + 0644); + if (fd < 0) { + error_setg_errno(errp, errno, "Could not create file"); + return -EIO; + } + set_sparse(fd); + ftruncate(fd, total_size); + qemu_close(fd); + return 0; +} + + +static QemuOptsList raw_create_opts = { + .name = "raw-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +BlockDriver bdrv_file = { + .format_name = "file", + .protocol_name = "file", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = raw_parse_filename, + .bdrv_file_open = raw_open, + .bdrv_close = raw_close, + .bdrv_create = raw_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_truncate = raw_truncate, + .bdrv_getlength = raw_getlength, + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, + + .create_opts = &raw_create_opts, +}; + +/***********************************************/ +/* host device */ + +static int find_cdrom(char *cdrom_name, int cdrom_name_size) +{ + char drives[256], *pdrv = drives; + UINT type; + + memset(drives, 0, sizeof(drives)); + GetLogicalDriveStrings(sizeof(drives), drives); + while(pdrv[0] != '\0') { + type = GetDriveType(pdrv); + switch(type) { + case DRIVE_CDROM: + snprintf(cdrom_name, cdrom_name_size, "\\\\.\\%c:", pdrv[0]); + return 0; + break; + } + pdrv += lstrlen(pdrv) + 1; + } + return -1; +} + +static int find_device_type(BlockDriverState *bs, const char *filename) +{ + BDRVRawState *s = bs->opaque; + UINT type; + const char *p; + + if (strstart(filename, "\\\\.\\", &p) || + strstart(filename, "//./", &p)) { + if (stristart(p, "PhysicalDrive", NULL)) + return FTYPE_HARDDISK; + snprintf(s->drive_path, sizeof(s->drive_path), "%c:\\", p[0]); + type = GetDriveType(s->drive_path); + switch (type) { + case DRIVE_REMOVABLE: + case DRIVE_FIXED: + return FTYPE_HARDDISK; + case DRIVE_CDROM: + return FTYPE_CD; + default: + return FTYPE_FILE; + } + } else { + return FTYPE_FILE; + } +} + +static int hdev_probe_device(const char *filename) +{ + if (strstart(filename, "/dev/cdrom", NULL)) + return 100; + if (is_windows_drive(filename)) + return 100; + return 0; +} + +static void hdev_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + /* The prefix is optional, just as for "file". */ + strstart(filename, "host_device:", &filename); + + qdict_put_obj(options, "filename", QOBJECT(qstring_from_str(filename))); +} + +static int hdev_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRawState *s = bs->opaque; + int access_flags, create_flags; + int ret = 0; + DWORD overlapped; + char device_name[64]; + + Error *local_err = NULL; + const char *filename; + + QemuOpts *opts = qemu_opts_create(&raw_runtime_opts, NULL, 0, + &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto done; + } + + filename = qemu_opt_get(opts, "filename"); + + if (strstart(filename, "/dev/cdrom", NULL)) { + if (find_cdrom(device_name, sizeof(device_name)) < 0) { + error_setg(errp, "Could not open CD-ROM drive"); + ret = -ENOENT; + goto done; + } + filename = device_name; + } else { + /* transform drive letters into device name */ + if (((filename[0] >= 'a' && filename[0] <= 'z') || + (filename[0] >= 'A' && filename[0] <= 'Z')) && + filename[1] == ':' && filename[2] == '\0') { + snprintf(device_name, sizeof(device_name), "\\\\.\\%c:", filename[0]); + filename = device_name; + } + } + s->type = find_device_type(bs, filename); + + raw_parse_flags(flags, &access_flags, &overlapped); + + create_flags = OPEN_EXISTING; + + s->hfile = CreateFile(filename, access_flags, + FILE_SHARE_READ, NULL, + create_flags, overlapped, NULL); + if (s->hfile == INVALID_HANDLE_VALUE) { + int err = GetLastError(); + + if (err == ERROR_ACCESS_DENIED) { + ret = -EACCES; + } else { + ret = -EINVAL; + } + error_setg_errno(errp, -ret, "Could not open device"); + goto done; + } + +done: + qemu_opts_del(opts); + return ret; +} + +static BlockDriver bdrv_host_device = { + .format_name = "host_device", + .protocol_name = "host_device", + .instance_size = sizeof(BDRVRawState), + .bdrv_needs_filename = true, + .bdrv_parse_filename = hdev_parse_filename, + .bdrv_probe_device = hdev_probe_device, + .bdrv_file_open = hdev_open, + .bdrv_close = raw_close, + + .bdrv_aio_readv = raw_aio_readv, + .bdrv_aio_writev = raw_aio_writev, + .bdrv_aio_flush = raw_aio_flush, + + .bdrv_detach_aio_context = raw_detach_aio_context, + .bdrv_attach_aio_context = raw_attach_aio_context, + + .bdrv_getlength = raw_getlength, + .has_variable_length = true, + + .bdrv_get_allocated_file_size + = raw_get_allocated_file_size, +}; + +static void bdrv_file_init(void) +{ + bdrv_register(&bdrv_file); + bdrv_register(&bdrv_host_device); +} + +block_init(bdrv_file_init); diff --git a/src/block/raw_bsd.c b/src/block/raw_bsd.c new file mode 100644 index 0000000..915d6fd --- /dev/null +++ b/src/block/raw_bsd.c @@ -0,0 +1,270 @@ +/* BlockDriver implementation for "raw" + * + * Copyright (C) 2010, 2013, Red Hat, Inc. + * Copyright (C) 2010, Blue Swirl <blauwirbel@gmail.com> + * Copyright (C) 2009, Anthony Liguori <aliguori@us.ibm.com> + * + * Author: + * Laszlo Ersek <lersek@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS + * IN THE SOFTWARE. + */ + +#include "block/block_int.h" +#include "qemu/option.h" + +static QemuOptsList raw_create_opts = { + .name = "raw-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(raw_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +static int raw_reopen_prepare(BDRVReopenState *reopen_state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int coroutine_fn raw_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BLKDBG_EVENT(bs->file, BLKDBG_READ_AIO); + return bdrv_co_readv(bs->file->bs, sector_num, nb_sectors, qiov); +} + +static int coroutine_fn raw_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + void *buf = NULL; + BlockDriver *drv; + QEMUIOVector local_qiov; + int ret; + + if (bs->probed && sector_num == 0) { + /* As long as these conditions are true, we can't get partial writes to + * the probe buffer and can just directly check the request. */ + QEMU_BUILD_BUG_ON(BLOCK_PROBE_BUF_SIZE != 512); + QEMU_BUILD_BUG_ON(BDRV_SECTOR_SIZE != 512); + + if (nb_sectors == 0) { + /* qemu_iovec_to_buf() would fail, but we want to return success + * instead of -EINVAL in this case. */ + return 0; + } + + buf = qemu_try_blockalign(bs->file->bs, 512); + if (!buf) { + ret = -ENOMEM; + goto fail; + } + + ret = qemu_iovec_to_buf(qiov, 0, buf, 512); + if (ret != 512) { + ret = -EINVAL; + goto fail; + } + + drv = bdrv_probe_all(buf, 512, NULL); + if (drv != bs->drv) { + ret = -EPERM; + goto fail; + } + + /* Use the checked buffer, a malicious guest might be overwriting its + * original buffer in the background. */ + qemu_iovec_init(&local_qiov, qiov->niov + 1); + qemu_iovec_add(&local_qiov, buf, 512); + qemu_iovec_concat(&local_qiov, qiov, 512, qiov->size - 512); + qiov = &local_qiov; + } + + BLKDBG_EVENT(bs->file, BLKDBG_WRITE_AIO); + ret = bdrv_co_writev(bs->file->bs, sector_num, nb_sectors, qiov); + +fail: + if (qiov == &local_qiov) { + qemu_iovec_destroy(&local_qiov); + } + qemu_vfree(buf); + return ret; +} + +static int64_t coroutine_fn raw_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, int *pnum) +{ + *pnum = nb_sectors; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | + (sector_num << BDRV_SECTOR_BITS); +} + +static int coroutine_fn raw_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + BdrvRequestFlags flags) +{ + return bdrv_co_write_zeroes(bs->file->bs, sector_num, nb_sectors, flags); +} + +static int coroutine_fn raw_co_discard(BlockDriverState *bs, + int64_t sector_num, int nb_sectors) +{ + return bdrv_co_discard(bs->file->bs, sector_num, nb_sectors); +} + +static int64_t raw_getlength(BlockDriverState *bs) +{ + return bdrv_getlength(bs->file->bs); +} + +static int raw_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + return bdrv_get_info(bs->file->bs, bdi); +} + +static void raw_refresh_limits(BlockDriverState *bs, Error **errp) +{ + bs->bl = bs->file->bs->bl; +} + +static int raw_truncate(BlockDriverState *bs, int64_t offset) +{ + return bdrv_truncate(bs->file->bs, offset); +} + +static int raw_media_changed(BlockDriverState *bs) +{ + return bdrv_media_changed(bs->file->bs); +} + +static void raw_eject(BlockDriverState *bs, bool eject_flag) +{ + bdrv_eject(bs->file->bs, eject_flag); +} + +static void raw_lock_medium(BlockDriverState *bs, bool locked) +{ + bdrv_lock_medium(bs->file->bs, locked); +} + +static BlockAIOCB *raw_aio_ioctl(BlockDriverState *bs, + unsigned long int req, void *buf, + BlockCompletionFunc *cb, + void *opaque) +{ + return bdrv_aio_ioctl(bs->file->bs, req, buf, cb, opaque); +} + +static int raw_has_zero_init(BlockDriverState *bs) +{ + return bdrv_has_zero_init(bs->file->bs); +} + +static int raw_create(const char *filename, QemuOpts *opts, Error **errp) +{ + Error *local_err = NULL; + int ret; + + ret = bdrv_create_file(filename, opts, &local_err); + if (local_err) { + error_propagate(errp, local_err); + } + return ret; +} + +static int raw_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + bs->sg = bs->file->bs->sg; + + if (bs->probed && !bdrv_is_read_only(bs)) { + fprintf(stderr, + "WARNING: Image format was not specified for '%s' and probing " + "guessed raw.\n" + " Automatically detecting the format is dangerous for " + "raw images, write operations on block 0 will be restricted.\n" + " Specify the 'raw' format explicitly to remove the " + "restrictions.\n", + bs->file->bs->filename); + } + + return 0; +} + +static void raw_close(BlockDriverState *bs) +{ +} + +static int raw_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + /* smallest possible positive score so that raw is used if and only if no + * other block driver works + */ + return 1; +} + +static int raw_probe_blocksizes(BlockDriverState *bs, BlockSizes *bsz) +{ + return bdrv_probe_blocksizes(bs->file->bs, bsz); +} + +static int raw_probe_geometry(BlockDriverState *bs, HDGeometry *geo) +{ + return bdrv_probe_geometry(bs->file->bs, geo); +} + +BlockDriver bdrv_raw = { + .format_name = "raw", + .bdrv_probe = &raw_probe, + .bdrv_reopen_prepare = &raw_reopen_prepare, + .bdrv_open = &raw_open, + .bdrv_close = &raw_close, + .bdrv_create = &raw_create, + .bdrv_co_readv = &raw_co_readv, + .bdrv_co_writev = &raw_co_writev, + .bdrv_co_write_zeroes = &raw_co_write_zeroes, + .bdrv_co_discard = &raw_co_discard, + .bdrv_co_get_block_status = &raw_co_get_block_status, + .bdrv_truncate = &raw_truncate, + .bdrv_getlength = &raw_getlength, + .has_variable_length = true, + .bdrv_get_info = &raw_get_info, + .bdrv_refresh_limits = &raw_refresh_limits, + .bdrv_probe_blocksizes = &raw_probe_blocksizes, + .bdrv_probe_geometry = &raw_probe_geometry, + .bdrv_media_changed = &raw_media_changed, + .bdrv_eject = &raw_eject, + .bdrv_lock_medium = &raw_lock_medium, + .bdrv_aio_ioctl = &raw_aio_ioctl, + .create_opts = &raw_create_opts, + .bdrv_has_zero_init = &raw_has_zero_init +}; + +static void bdrv_raw_init(void) +{ + bdrv_register(&bdrv_raw); +} + +block_init(bdrv_raw_init); diff --git a/src/block/rbd.c b/src/block/rbd.c new file mode 100644 index 0000000..a60a19d --- /dev/null +++ b/src/block/rbd.c @@ -0,0 +1,967 @@ +/* + * QEMU Block driver for RADOS (Ceph) + * + * Copyright (C) 2010-2011 Christian Brunner <chb@muc.de>, + * Josh Durgin <josh.durgin@dreamhost.com> + * + * This work is licensed under the terms of the GNU GPL, version 2. See + * the COPYING file in the top-level directory. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include <inttypes.h> + +#include "qemu-common.h" +#include "qemu/error-report.h" +#include "block/block_int.h" + +#include <rbd/librbd.h> + +/* + * When specifying the image filename use: + * + * rbd:poolname/devicename[@snapshotname][:option1=value1[:option2=value2...]] + * + * poolname must be the name of an existing rados pool. + * + * devicename is the name of the rbd image. + * + * Each option given is used to configure rados, and may be any valid + * Ceph option, "id", or "conf". + * + * The "id" option indicates what user we should authenticate as to + * the Ceph cluster. If it is excluded we will use the Ceph default + * (normally 'admin'). + * + * The "conf" option specifies a Ceph configuration file to read. If + * it is not specified, we will read from the default Ceph locations + * (e.g., /etc/ceph/ceph.conf). To avoid reading _any_ configuration + * file, specify conf=/dev/null. + * + * Configuration values containing :, @, or = can be escaped with a + * leading "\". + */ + +/* rbd_aio_discard added in 0.1.2 */ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 2) +#define LIBRBD_SUPPORTS_DISCARD +#else +#undef LIBRBD_SUPPORTS_DISCARD +#endif + +#define OBJ_MAX_SIZE (1UL << OBJ_DEFAULT_OBJ_ORDER) + +#define RBD_MAX_CONF_NAME_SIZE 128 +#define RBD_MAX_CONF_VAL_SIZE 512 +#define RBD_MAX_CONF_SIZE 1024 +#define RBD_MAX_POOL_NAME_SIZE 128 +#define RBD_MAX_SNAP_NAME_SIZE 128 +#define RBD_MAX_SNAPS 100 + +typedef enum { + RBD_AIO_READ, + RBD_AIO_WRITE, + RBD_AIO_DISCARD, + RBD_AIO_FLUSH +} RBDAIOCmd; + +typedef struct RBDAIOCB { + BlockAIOCB common; + QEMUBH *bh; + int64_t ret; + QEMUIOVector *qiov; + char *bounce; + RBDAIOCmd cmd; + int error; + struct BDRVRBDState *s; +} RBDAIOCB; + +typedef struct RADOSCB { + RBDAIOCB *acb; + struct BDRVRBDState *s; + int64_t size; + char *buf; + int64_t ret; +} RADOSCB; + +typedef struct BDRVRBDState { + rados_t cluster; + rados_ioctx_t io_ctx; + rbd_image_t image; + char name[RBD_MAX_IMAGE_NAME_SIZE]; + char *snap; +} BDRVRBDState; + +static int qemu_rbd_next_tok(char *dst, int dst_len, + char *src, char delim, + const char *name, + char **p, Error **errp) +{ + int l; + char *end; + + *p = NULL; + + if (delim != '\0') { + for (end = src; *end; ++end) { + if (*end == delim) { + break; + } + if (*end == '\\' && end[1] != '\0') { + end++; + } + } + if (*end == delim) { + *p = end + 1; + *end = '\0'; + } + } + l = strlen(src); + if (l >= dst_len) { + error_setg(errp, "%s too long", name); + return -EINVAL; + } else if (l == 0) { + error_setg(errp, "%s too short", name); + return -EINVAL; + } + + pstrcpy(dst, dst_len, src); + + return 0; +} + +static void qemu_rbd_unescape(char *src) +{ + char *p; + + for (p = src; *src; ++src, ++p) { + if (*src == '\\' && src[1] != '\0') { + src++; + } + *p = *src; + } + *p = '\0'; +} + +static int qemu_rbd_parsename(const char *filename, + char *pool, int pool_len, + char *snap, int snap_len, + char *name, int name_len, + char *conf, int conf_len, + Error **errp) +{ + const char *start; + char *p, *buf; + int ret; + + if (!strstart(filename, "rbd:", &start)) { + error_setg(errp, "File name must start with 'rbd:'"); + return -EINVAL; + } + + buf = g_strdup(start); + p = buf; + *snap = '\0'; + *conf = '\0'; + + ret = qemu_rbd_next_tok(pool, pool_len, p, + '/', "pool name", &p, errp); + if (ret < 0 || !p) { + ret = -EINVAL; + goto done; + } + qemu_rbd_unescape(pool); + + if (strchr(p, '@')) { + ret = qemu_rbd_next_tok(name, name_len, p, + '@', "object name", &p, errp); + if (ret < 0) { + goto done; + } + ret = qemu_rbd_next_tok(snap, snap_len, p, + ':', "snap name", &p, errp); + qemu_rbd_unescape(snap); + } else { + ret = qemu_rbd_next_tok(name, name_len, p, + ':', "object name", &p, errp); + } + qemu_rbd_unescape(name); + if (ret < 0 || !p) { + goto done; + } + + ret = qemu_rbd_next_tok(conf, conf_len, p, + '\0', "configuration", &p, errp); + +done: + g_free(buf); + return ret; +} + +static char *qemu_rbd_parse_clientname(const char *conf, char *clientname) +{ + const char *p = conf; + + while (*p) { + int len; + const char *end = strchr(p, ':'); + + if (end) { + len = end - p; + } else { + len = strlen(p); + } + + if (strncmp(p, "id=", 3) == 0) { + len -= 3; + strncpy(clientname, p + 3, len); + clientname[len] = '\0'; + return clientname; + } + if (end == NULL) { + break; + } + p = end + 1; + } + return NULL; +} + +static int qemu_rbd_set_conf(rados_t cluster, const char *conf, + bool only_read_conf_file, + Error **errp) +{ + char *p, *buf; + char name[RBD_MAX_CONF_NAME_SIZE]; + char value[RBD_MAX_CONF_VAL_SIZE]; + int ret = 0; + + buf = g_strdup(conf); + p = buf; + + while (p) { + ret = qemu_rbd_next_tok(name, sizeof(name), p, + '=', "conf option name", &p, errp); + if (ret < 0) { + break; + } + qemu_rbd_unescape(name); + + if (!p) { + error_setg(errp, "conf option %s has no value", name); + ret = -EINVAL; + break; + } + + ret = qemu_rbd_next_tok(value, sizeof(value), p, + ':', "conf option value", &p, errp); + if (ret < 0) { + break; + } + qemu_rbd_unescape(value); + + if (strcmp(name, "conf") == 0) { + /* read the conf file alone, so it doesn't override more + specific settings for a particular device */ + if (only_read_conf_file) { + ret = rados_conf_read_file(cluster, value); + if (ret < 0) { + error_setg(errp, "error reading conf file %s", value); + break; + } + } + } else if (strcmp(name, "id") == 0) { + /* ignore, this is parsed by qemu_rbd_parse_clientname() */ + } else if (!only_read_conf_file) { + ret = rados_conf_set(cluster, name, value); + if (ret < 0) { + error_setg(errp, "invalid conf option %s", name); + ret = -EINVAL; + break; + } + } + } + + g_free(buf); + return ret; +} + +static int qemu_rbd_create(const char *filename, QemuOpts *opts, Error **errp) +{ + Error *local_err = NULL; + int64_t bytes = 0; + int64_t objsize; + int obj_order = 0; + char pool[RBD_MAX_POOL_NAME_SIZE]; + char name[RBD_MAX_IMAGE_NAME_SIZE]; + char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; + char conf[RBD_MAX_CONF_SIZE]; + char clientname_buf[RBD_MAX_CONF_SIZE]; + char *clientname; + rados_t cluster; + rados_ioctx_t io_ctx; + int ret; + + if (qemu_rbd_parsename(filename, pool, sizeof(pool), + snap_buf, sizeof(snap_buf), + name, sizeof(name), + conf, sizeof(conf), &local_err) < 0) { + error_propagate(errp, local_err); + return -EINVAL; + } + + /* Read out options */ + bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + objsize = qemu_opt_get_size_del(opts, BLOCK_OPT_CLUSTER_SIZE, 0); + if (objsize) { + if ((objsize - 1) & objsize) { /* not a power of 2? */ + error_setg(errp, "obj size needs to be power of 2"); + return -EINVAL; + } + if (objsize < 4096) { + error_setg(errp, "obj size too small"); + return -EINVAL; + } + obj_order = ctz32(objsize); + } + + clientname = qemu_rbd_parse_clientname(conf, clientname_buf); + if (rados_create(&cluster, clientname) < 0) { + error_setg(errp, "error initializing"); + return -EIO; + } + + if (strstr(conf, "conf=") == NULL) { + /* try default location, but ignore failure */ + rados_conf_read_file(cluster, NULL); + } else if (conf[0] != '\0' && + qemu_rbd_set_conf(cluster, conf, true, &local_err) < 0) { + rados_shutdown(cluster); + error_propagate(errp, local_err); + return -EIO; + } + + if (conf[0] != '\0' && + qemu_rbd_set_conf(cluster, conf, false, &local_err) < 0) { + rados_shutdown(cluster); + error_propagate(errp, local_err); + return -EIO; + } + + if (rados_connect(cluster) < 0) { + error_setg(errp, "error connecting"); + rados_shutdown(cluster); + return -EIO; + } + + if (rados_ioctx_create(cluster, pool, &io_ctx) < 0) { + error_setg(errp, "error opening pool %s", pool); + rados_shutdown(cluster); + return -EIO; + } + + ret = rbd_create(io_ctx, name, bytes, &obj_order); + rados_ioctx_destroy(io_ctx); + rados_shutdown(cluster); + + return ret; +} + +/* + * This aio completion is being called from rbd_finish_bh() and runs in qemu + * BH context. + */ +static void qemu_rbd_complete_aio(RADOSCB *rcb) +{ + RBDAIOCB *acb = rcb->acb; + int64_t r; + + r = rcb->ret; + + if (acb->cmd != RBD_AIO_READ) { + if (r < 0) { + acb->ret = r; + acb->error = 1; + } else if (!acb->error) { + acb->ret = rcb->size; + } + } else { + if (r < 0) { + memset(rcb->buf, 0, rcb->size); + acb->ret = r; + acb->error = 1; + } else if (r < rcb->size) { + memset(rcb->buf + r, 0, rcb->size - r); + if (!acb->error) { + acb->ret = rcb->size; + } + } else if (!acb->error) { + acb->ret = r; + } + } + + g_free(rcb); + + if (acb->cmd == RBD_AIO_READ) { + qemu_iovec_from_buf(acb->qiov, 0, acb->bounce, acb->qiov->size); + } + qemu_vfree(acb->bounce); + acb->common.cb(acb->common.opaque, (acb->ret > 0 ? 0 : acb->ret)); + + qemu_aio_unref(acb); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "rbd", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "Specification of the rbd image", + }, + { /* end of list */ } + }, +}; + +static int qemu_rbd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVRBDState *s = bs->opaque; + char pool[RBD_MAX_POOL_NAME_SIZE]; + char snap_buf[RBD_MAX_SNAP_NAME_SIZE]; + char conf[RBD_MAX_CONF_SIZE]; + char clientname_buf[RBD_MAX_CONF_SIZE]; + char *clientname; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + int r; + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + qemu_opts_del(opts); + return -EINVAL; + } + + filename = qemu_opt_get(opts, "filename"); + + if (qemu_rbd_parsename(filename, pool, sizeof(pool), + snap_buf, sizeof(snap_buf), + s->name, sizeof(s->name), + conf, sizeof(conf), errp) < 0) { + r = -EINVAL; + goto failed_opts; + } + + clientname = qemu_rbd_parse_clientname(conf, clientname_buf); + r = rados_create(&s->cluster, clientname); + if (r < 0) { + error_setg(errp, "error initializing"); + goto failed_opts; + } + + s->snap = NULL; + if (snap_buf[0] != '\0') { + s->snap = g_strdup(snap_buf); + } + + if (strstr(conf, "conf=") == NULL) { + /* try default location, but ignore failure */ + rados_conf_read_file(s->cluster, NULL); + } else if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf, true, errp); + if (r < 0) { + goto failed_shutdown; + } + } + + if (conf[0] != '\0') { + r = qemu_rbd_set_conf(s->cluster, conf, false, errp); + if (r < 0) { + goto failed_shutdown; + } + } + + /* + * Fallback to more conservative semantics if setting cache + * options fails. Ignore errors from setting rbd_cache because the + * only possible error is that the option does not exist, and + * librbd defaults to no caching. If write through caching cannot + * be set up, fall back to no caching. + */ + if (flags & BDRV_O_NOCACHE) { + rados_conf_set(s->cluster, "rbd_cache", "false"); + } else { + rados_conf_set(s->cluster, "rbd_cache", "true"); + } + + r = rados_connect(s->cluster); + if (r < 0) { + error_setg(errp, "error connecting"); + goto failed_shutdown; + } + + r = rados_ioctx_create(s->cluster, pool, &s->io_ctx); + if (r < 0) { + error_setg(errp, "error opening pool %s", pool); + goto failed_shutdown; + } + + r = rbd_open(s->io_ctx, s->name, &s->image, s->snap); + if (r < 0) { + error_setg(errp, "error reading header from %s", s->name); + goto failed_open; + } + + bs->read_only = (s->snap != NULL); + + qemu_opts_del(opts); + return 0; + +failed_open: + rados_ioctx_destroy(s->io_ctx); +failed_shutdown: + rados_shutdown(s->cluster); + g_free(s->snap); +failed_opts: + qemu_opts_del(opts); + return r; +} + +static void qemu_rbd_close(BlockDriverState *bs) +{ + BDRVRBDState *s = bs->opaque; + + rbd_close(s->image); + rados_ioctx_destroy(s->io_ctx); + g_free(s->snap); + rados_shutdown(s->cluster); +} + +static const AIOCBInfo rbd_aiocb_info = { + .aiocb_size = sizeof(RBDAIOCB), +}; + +static void rbd_finish_bh(void *opaque) +{ + RADOSCB *rcb = opaque; + qemu_bh_delete(rcb->acb->bh); + qemu_rbd_complete_aio(rcb); +} + +/* + * This is the callback function for rbd_aio_read and _write + * + * Note: this function is being called from a non qemu thread so + * we need to be careful about what we do here. Generally we only + * schedule a BH, and do the rest of the io completion handling + * from rbd_finish_bh() which runs in a qemu context. + */ +static void rbd_finish_aiocb(rbd_completion_t c, RADOSCB *rcb) +{ + RBDAIOCB *acb = rcb->acb; + + rcb->ret = rbd_aio_get_return_value(c); + rbd_aio_release(c); + + acb->bh = aio_bh_new(bdrv_get_aio_context(acb->common.bs), + rbd_finish_bh, rcb); + qemu_bh_schedule(acb->bh); +} + +static int rbd_aio_discard_wrapper(rbd_image_t image, + uint64_t off, + uint64_t len, + rbd_completion_t comp) +{ +#ifdef LIBRBD_SUPPORTS_DISCARD + return rbd_aio_discard(image, off, len, comp); +#else + return -ENOTSUP; +#endif +} + +static int rbd_aio_flush_wrapper(rbd_image_t image, + rbd_completion_t comp) +{ +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH + return rbd_aio_flush(image, comp); +#else + return -ENOTSUP; +#endif +} + +static BlockAIOCB *rbd_start_aio(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque, + RBDAIOCmd cmd) +{ + RBDAIOCB *acb; + RADOSCB *rcb = NULL; + rbd_completion_t c; + int64_t off, size; + char *buf; + int r; + + BDRVRBDState *s = bs->opaque; + + acb = qemu_aio_get(&rbd_aiocb_info, bs, cb, opaque); + acb->cmd = cmd; + acb->qiov = qiov; + if (cmd == RBD_AIO_DISCARD || cmd == RBD_AIO_FLUSH) { + acb->bounce = NULL; + } else { + acb->bounce = qemu_try_blockalign(bs, qiov->size); + if (acb->bounce == NULL) { + goto failed; + } + } + acb->ret = 0; + acb->error = 0; + acb->s = s; + acb->bh = NULL; + + if (cmd == RBD_AIO_WRITE) { + qemu_iovec_to_buf(acb->qiov, 0, acb->bounce, qiov->size); + } + + buf = acb->bounce; + + off = sector_num * BDRV_SECTOR_SIZE; + size = nb_sectors * BDRV_SECTOR_SIZE; + + rcb = g_new(RADOSCB, 1); + rcb->acb = acb; + rcb->buf = buf; + rcb->s = acb->s; + rcb->size = size; + r = rbd_aio_create_completion(rcb, (rbd_callback_t) rbd_finish_aiocb, &c); + if (r < 0) { + goto failed; + } + + switch (cmd) { + case RBD_AIO_WRITE: + r = rbd_aio_write(s->image, off, size, buf, c); + break; + case RBD_AIO_READ: + r = rbd_aio_read(s->image, off, size, buf, c); + break; + case RBD_AIO_DISCARD: + r = rbd_aio_discard_wrapper(s->image, off, size, c); + break; + case RBD_AIO_FLUSH: + r = rbd_aio_flush_wrapper(s->image, c); + break; + default: + r = -EINVAL; + } + + if (r < 0) { + goto failed_completion; + } + + return &acb->common; + +failed_completion: + rbd_aio_release(c); +failed: + g_free(rcb); + qemu_vfree(acb->bounce); + qemu_aio_unref(acb); + return NULL; +} + +static BlockAIOCB *qemu_rbd_aio_readv(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + RBD_AIO_READ); +} + +static BlockAIOCB *qemu_rbd_aio_writev(BlockDriverState *bs, + int64_t sector_num, + QEMUIOVector *qiov, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, sector_num, qiov, nb_sectors, cb, opaque, + RBD_AIO_WRITE); +} + +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH +static BlockAIOCB *qemu_rbd_aio_flush(BlockDriverState *bs, + BlockCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, 0, NULL, 0, cb, opaque, RBD_AIO_FLUSH); +} + +#else + +static int qemu_rbd_co_flush(BlockDriverState *bs) +{ +#if LIBRBD_VERSION_CODE >= LIBRBD_VERSION(0, 1, 1) + /* rbd_flush added in 0.1.1 */ + BDRVRBDState *s = bs->opaque; + return rbd_flush(s->image); +#else + return 0; +#endif +} +#endif + +static int qemu_rbd_getinfo(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVRBDState *s = bs->opaque; + rbd_image_info_t info; + int r; + + r = rbd_stat(s->image, &info, sizeof(info)); + if (r < 0) { + return r; + } + + bdi->cluster_size = info.obj_size; + return 0; +} + +static int64_t qemu_rbd_getlength(BlockDriverState *bs) +{ + BDRVRBDState *s = bs->opaque; + rbd_image_info_t info; + int r; + + r = rbd_stat(s->image, &info, sizeof(info)); + if (r < 0) { + return r; + } + + return info.size; +} + +static int qemu_rbd_truncate(BlockDriverState *bs, int64_t offset) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_resize(s->image, offset); + if (r < 0) { + return r; + } + + return 0; +} + +static int qemu_rbd_snap_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BDRVRBDState *s = bs->opaque; + int r; + + if (sn_info->name[0] == '\0') { + return -EINVAL; /* we need a name for rbd snapshots */ + } + + /* + * rbd snapshots are using the name as the user controlled unique identifier + * we can't use the rbd snapid for that purpose, as it can't be set + */ + if (sn_info->id_str[0] != '\0' && + strcmp(sn_info->id_str, sn_info->name) != 0) { + return -EINVAL; + } + + if (strlen(sn_info->name) >= sizeof(sn_info->id_str)) { + return -ERANGE; + } + + r = rbd_snap_create(s->image, sn_info->name); + if (r < 0) { + error_report("failed to create snap: %s", strerror(-r)); + return r; + } + + return 0; +} + +static int qemu_rbd_snap_remove(BlockDriverState *bs, + const char *snapshot_id, + const char *snapshot_name, + Error **errp) +{ + BDRVRBDState *s = bs->opaque; + int r; + + if (!snapshot_name) { + error_setg(errp, "rbd need a valid snapshot name"); + return -EINVAL; + } + + /* If snapshot_id is specified, it must be equal to name, see + qemu_rbd_snap_list() */ + if (snapshot_id && strcmp(snapshot_id, snapshot_name)) { + error_setg(errp, + "rbd do not support snapshot id, it should be NULL or " + "equal to snapshot name"); + return -EINVAL; + } + + r = rbd_snap_remove(s->image, snapshot_name); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to remove the snapshot"); + } + return r; +} + +static int qemu_rbd_snap_rollback(BlockDriverState *bs, + const char *snapshot_name) +{ + BDRVRBDState *s = bs->opaque; + int r; + + r = rbd_snap_rollback(s->image, snapshot_name); + return r; +} + +static int qemu_rbd_snap_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_tab) +{ + BDRVRBDState *s = bs->opaque; + QEMUSnapshotInfo *sn_info, *sn_tab = NULL; + int i, snap_count; + rbd_snap_info_t *snaps; + int max_snaps = RBD_MAX_SNAPS; + + do { + snaps = g_new(rbd_snap_info_t, max_snaps); + snap_count = rbd_snap_list(s->image, snaps, &max_snaps); + if (snap_count <= 0) { + g_free(snaps); + } + } while (snap_count == -ERANGE); + + if (snap_count <= 0) { + goto done; + } + + sn_tab = g_new0(QEMUSnapshotInfo, snap_count); + + for (i = 0; i < snap_count; i++) { + const char *snap_name = snaps[i].name; + + sn_info = sn_tab + i; + pstrcpy(sn_info->id_str, sizeof(sn_info->id_str), snap_name); + pstrcpy(sn_info->name, sizeof(sn_info->name), snap_name); + + sn_info->vm_state_size = snaps[i].size; + sn_info->date_sec = 0; + sn_info->date_nsec = 0; + sn_info->vm_clock_nsec = 0; + } + rbd_snap_list_end(snaps); + g_free(snaps); + + done: + *psn_tab = sn_tab; + return snap_count; +} + +#ifdef LIBRBD_SUPPORTS_DISCARD +static BlockAIOCB* qemu_rbd_aio_discard(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, + BlockCompletionFunc *cb, + void *opaque) +{ + return rbd_start_aio(bs, sector_num, NULL, nb_sectors, cb, opaque, + RBD_AIO_DISCARD); +} +#endif + +#ifdef LIBRBD_SUPPORTS_INVALIDATE +static void qemu_rbd_invalidate_cache(BlockDriverState *bs, + Error **errp) +{ + BDRVRBDState *s = bs->opaque; + int r = rbd_invalidate_cache(s->image); + if (r < 0) { + error_setg_errno(errp, -r, "Failed to invalidate the cache"); + } +} +#endif + +static QemuOptsList qemu_rbd_create_opts = { + .name = "rbd-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(qemu_rbd_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "RBD object size" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_rbd = { + .format_name = "rbd", + .instance_size = sizeof(BDRVRBDState), + .bdrv_needs_filename = true, + .bdrv_file_open = qemu_rbd_open, + .bdrv_close = qemu_rbd_close, + .bdrv_create = qemu_rbd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_get_info = qemu_rbd_getinfo, + .create_opts = &qemu_rbd_create_opts, + .bdrv_getlength = qemu_rbd_getlength, + .bdrv_truncate = qemu_rbd_truncate, + .protocol_name = "rbd", + + .bdrv_aio_readv = qemu_rbd_aio_readv, + .bdrv_aio_writev = qemu_rbd_aio_writev, + +#ifdef LIBRBD_SUPPORTS_AIO_FLUSH + .bdrv_aio_flush = qemu_rbd_aio_flush, +#else + .bdrv_co_flush_to_disk = qemu_rbd_co_flush, +#endif + +#ifdef LIBRBD_SUPPORTS_DISCARD + .bdrv_aio_discard = qemu_rbd_aio_discard, +#endif + + .bdrv_snapshot_create = qemu_rbd_snap_create, + .bdrv_snapshot_delete = qemu_rbd_snap_remove, + .bdrv_snapshot_list = qemu_rbd_snap_list, + .bdrv_snapshot_goto = qemu_rbd_snap_rollback, +#ifdef LIBRBD_SUPPORTS_INVALIDATE + .bdrv_invalidate_cache = qemu_rbd_invalidate_cache, +#endif +}; + +static void bdrv_rbd_init(void) +{ + bdrv_register(&bdrv_rbd); +} + +block_init(bdrv_rbd_init); diff --git a/src/block/sheepdog.c b/src/block/sheepdog.c new file mode 100644 index 0000000..d80e4ed --- /dev/null +++ b/src/block/sheepdog.c @@ -0,0 +1,2910 @@ +/* + * Copyright (C) 2009-2010 Nippon Telegraph and Telephone Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Contributions after 2012-01-13 are licensed under the terms of the + * GNU GPL, version 2 or (at your option) any later version. + */ + +#include "qemu-common.h" +#include "qemu/uri.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "block/block_int.h" +#include "qemu/bitops.h" + +#define SD_PROTO_VER 0x01 + +#define SD_DEFAULT_ADDR "localhost" +#define SD_DEFAULT_PORT 7000 + +#define SD_OP_CREATE_AND_WRITE_OBJ 0x01 +#define SD_OP_READ_OBJ 0x02 +#define SD_OP_WRITE_OBJ 0x03 +/* 0x04 is used internally by Sheepdog */ + +#define SD_OP_NEW_VDI 0x11 +#define SD_OP_LOCK_VDI 0x12 +#define SD_OP_RELEASE_VDI 0x13 +#define SD_OP_GET_VDI_INFO 0x14 +#define SD_OP_READ_VDIS 0x15 +#define SD_OP_FLUSH_VDI 0x16 +#define SD_OP_DEL_VDI 0x17 +#define SD_OP_GET_CLUSTER_DEFAULT 0x18 + +#define SD_FLAG_CMD_WRITE 0x01 +#define SD_FLAG_CMD_COW 0x02 +#define SD_FLAG_CMD_CACHE 0x04 /* Writeback mode for cache */ +#define SD_FLAG_CMD_DIRECT 0x08 /* Don't use cache */ + +#define SD_RES_SUCCESS 0x00 /* Success */ +#define SD_RES_UNKNOWN 0x01 /* Unknown error */ +#define SD_RES_NO_OBJ 0x02 /* No object found */ +#define SD_RES_EIO 0x03 /* I/O error */ +#define SD_RES_VDI_EXIST 0x04 /* Vdi exists already */ +#define SD_RES_INVALID_PARMS 0x05 /* Invalid parameters */ +#define SD_RES_SYSTEM_ERROR 0x06 /* System error */ +#define SD_RES_VDI_LOCKED 0x07 /* Vdi is locked */ +#define SD_RES_NO_VDI 0x08 /* No vdi found */ +#define SD_RES_NO_BASE_VDI 0x09 /* No base vdi found */ +#define SD_RES_VDI_READ 0x0A /* Cannot read requested vdi */ +#define SD_RES_VDI_WRITE 0x0B /* Cannot write requested vdi */ +#define SD_RES_BASE_VDI_READ 0x0C /* Cannot read base vdi */ +#define SD_RES_BASE_VDI_WRITE 0x0D /* Cannot write base vdi */ +#define SD_RES_NO_TAG 0x0E /* Requested tag is not found */ +#define SD_RES_STARTUP 0x0F /* Sheepdog is on starting up */ +#define SD_RES_VDI_NOT_LOCKED 0x10 /* Vdi is not locked */ +#define SD_RES_SHUTDOWN 0x11 /* Sheepdog is shutting down */ +#define SD_RES_NO_MEM 0x12 /* Cannot allocate memory */ +#define SD_RES_FULL_VDI 0x13 /* we already have the maximum vdis */ +#define SD_RES_VER_MISMATCH 0x14 /* Protocol version mismatch */ +#define SD_RES_NO_SPACE 0x15 /* Server has no room for new objects */ +#define SD_RES_WAIT_FOR_FORMAT 0x16 /* Waiting for a format operation */ +#define SD_RES_WAIT_FOR_JOIN 0x17 /* Waiting for other nodes joining */ +#define SD_RES_JOIN_FAILED 0x18 /* Target node had failed to join sheepdog */ +#define SD_RES_HALT 0x19 /* Sheepdog is stopped serving IO request */ +#define SD_RES_READONLY 0x1A /* Object is read-only */ + +/* + * Object ID rules + * + * 0 - 19 (20 bits): data object space + * 20 - 31 (12 bits): reserved data object space + * 32 - 55 (24 bits): vdi object space + * 56 - 59 ( 4 bits): reserved vdi object space + * 60 - 63 ( 4 bits): object type identifier space + */ + +#define VDI_SPACE_SHIFT 32 +#define VDI_BIT (UINT64_C(1) << 63) +#define VMSTATE_BIT (UINT64_C(1) << 62) +#define MAX_DATA_OBJS (UINT64_C(1) << 20) +#define MAX_CHILDREN 1024 +#define SD_MAX_VDI_LEN 256 +#define SD_MAX_VDI_TAG_LEN 256 +#define SD_NR_VDIS (1U << 24) +#define SD_DATA_OBJ_SIZE (UINT64_C(1) << 22) +#define SD_MAX_VDI_SIZE (SD_DATA_OBJ_SIZE * MAX_DATA_OBJS) +#define SD_DEFAULT_BLOCK_SIZE_SHIFT 22 +/* + * For erasure coding, we use at most SD_EC_MAX_STRIP for data strips and + * (SD_EC_MAX_STRIP - 1) for parity strips + * + * SD_MAX_COPIES is sum of number of data strips and parity strips. + */ +#define SD_EC_MAX_STRIP 16 +#define SD_MAX_COPIES (SD_EC_MAX_STRIP * 2 - 1) + +#define SD_INODE_SIZE (sizeof(SheepdogInode)) +#define CURRENT_VDI_ID 0 + +#define LOCK_TYPE_NORMAL 0 +#define LOCK_TYPE_SHARED 1 /* for iSCSI multipath */ + +typedef struct SheepdogReq { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t opcode_specific[8]; +} SheepdogReq; + +typedef struct SheepdogRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint32_t opcode_specific[7]; +} SheepdogRsp; + +typedef struct SheepdogObjReq { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint64_t oid; + uint64_t cow_oid; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[6]; + uint64_t offset; +} SheepdogObjReq; + +typedef struct SheepdogObjRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint8_t copies; + uint8_t copy_policy; + uint8_t reserved[2]; + uint32_t pad[6]; +} SheepdogObjRsp; + +typedef struct SheepdogVdiReq { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint64_t vdi_size; + uint32_t base_vdi_id; + uint8_t copies; + uint8_t copy_policy; + uint8_t store_policy; + uint8_t block_size_shift; + uint32_t snapid; + uint32_t type; + uint32_t pad[2]; +} SheepdogVdiReq; + +typedef struct SheepdogVdiRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint32_t rsvd; + uint32_t vdi_id; + uint32_t pad[5]; +} SheepdogVdiRsp; + +typedef struct SheepdogClusterRsp { + uint8_t proto_ver; + uint8_t opcode; + uint16_t flags; + uint32_t epoch; + uint32_t id; + uint32_t data_length; + uint32_t result; + uint8_t nr_copies; + uint8_t copy_policy; + uint8_t block_size_shift; + uint8_t __pad1; + uint32_t __pad2[6]; +} SheepdogClusterRsp; + +typedef struct SheepdogInode { + char name[SD_MAX_VDI_LEN]; + char tag[SD_MAX_VDI_TAG_LEN]; + uint64_t ctime; + uint64_t snap_ctime; + uint64_t vm_clock_nsec; + uint64_t vdi_size; + uint64_t vm_state_size; + uint16_t copy_policy; + uint8_t nr_copies; + uint8_t block_size_shift; + uint32_t snap_id; + uint32_t vdi_id; + uint32_t parent_vdi_id; + uint32_t child_vdi_id[MAX_CHILDREN]; + uint32_t data_vdi_id[MAX_DATA_OBJS]; +} SheepdogInode; + +#define SD_INODE_HEADER_SIZE offsetof(SheepdogInode, data_vdi_id) + +/* + * 64 bit FNV-1a non-zero initial basis + */ +#define FNV1A_64_INIT ((uint64_t)0xcbf29ce484222325ULL) + +/* + * 64 bit Fowler/Noll/Vo FNV-1a hash code + */ +static inline uint64_t fnv_64a_buf(void *buf, size_t len, uint64_t hval) +{ + unsigned char *bp = buf; + unsigned char *be = bp + len; + while (bp < be) { + hval ^= (uint64_t) *bp++; + hval += (hval << 1) + (hval << 4) + (hval << 5) + + (hval << 7) + (hval << 8) + (hval << 40); + } + return hval; +} + +static inline bool is_data_obj_writable(SheepdogInode *inode, unsigned int idx) +{ + return inode->vdi_id == inode->data_vdi_id[idx]; +} + +static inline bool is_data_obj(uint64_t oid) +{ + return !(VDI_BIT & oid); +} + +static inline uint64_t data_oid_to_idx(uint64_t oid) +{ + return oid & (MAX_DATA_OBJS - 1); +} + +static inline uint32_t oid_to_vid(uint64_t oid) +{ + return (oid & ~VDI_BIT) >> VDI_SPACE_SHIFT; +} + +static inline uint64_t vid_to_vdi_oid(uint32_t vid) +{ + return VDI_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT); +} + +static inline uint64_t vid_to_vmstate_oid(uint32_t vid, uint32_t idx) +{ + return VMSTATE_BIT | ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline uint64_t vid_to_data_oid(uint32_t vid, uint32_t idx) +{ + return ((uint64_t)vid << VDI_SPACE_SHIFT) | idx; +} + +static inline bool is_snapshot(struct SheepdogInode *inode) +{ + return !!inode->snap_ctime; +} + +#undef DPRINTF +#ifdef DEBUG_SDOG +#define DPRINTF(fmt, args...) \ + do { \ + fprintf(stdout, "%s %d: " fmt, __func__, __LINE__, ##args); \ + } while (0) +#else +#define DPRINTF(fmt, args...) +#endif + +typedef struct SheepdogAIOCB SheepdogAIOCB; + +typedef struct AIOReq { + SheepdogAIOCB *aiocb; + unsigned int iov_offset; + + uint64_t oid; + uint64_t base_oid; + uint64_t offset; + unsigned int data_len; + uint8_t flags; + uint32_t id; + bool create; + + QLIST_ENTRY(AIOReq) aio_siblings; +} AIOReq; + +enum AIOCBState { + AIOCB_WRITE_UDATA, + AIOCB_READ_UDATA, + AIOCB_FLUSH_CACHE, + AIOCB_DISCARD_OBJ, +}; + +#define AIOCBOverlapping(x, y) \ + (!(x->max_affect_data_idx < y->min_affect_data_idx \ + || y->max_affect_data_idx < x->min_affect_data_idx)) + +struct SheepdogAIOCB { + BlockAIOCB common; + + QEMUIOVector *qiov; + + int64_t sector_num; + int nb_sectors; + + int ret; + enum AIOCBState aiocb_type; + + Coroutine *coroutine; + void (*aio_done_func)(SheepdogAIOCB *); + + bool cancelable; + int nr_pending; + + uint32_t min_affect_data_idx; + uint32_t max_affect_data_idx; + + /* + * The difference between affect_data_idx and dirty_data_idx: + * affect_data_idx represents range of index of all request types. + * dirty_data_idx represents range of index updated by COW requests. + * dirty_data_idx is used for updating an inode object. + */ + uint32_t min_dirty_data_idx; + uint32_t max_dirty_data_idx; + + QLIST_ENTRY(SheepdogAIOCB) aiocb_siblings; +}; + +typedef struct BDRVSheepdogState { + BlockDriverState *bs; + AioContext *aio_context; + + SheepdogInode inode; + + char name[SD_MAX_VDI_LEN]; + bool is_snapshot; + uint32_t cache_flags; + bool discard_supported; + + char *host_spec; + bool is_unix; + int fd; + + CoMutex lock; + Coroutine *co_send; + Coroutine *co_recv; + + uint32_t aioreq_seq_num; + + /* Every aio request must be linked to either of these queues. */ + QLIST_HEAD(inflight_aio_head, AIOReq) inflight_aio_head; + QLIST_HEAD(failed_aio_head, AIOReq) failed_aio_head; + + CoQueue overlapping_queue; + QLIST_HEAD(inflight_aiocb_head, SheepdogAIOCB) inflight_aiocb_head; +} BDRVSheepdogState; + +typedef struct BDRVSheepdogReopenState { + int fd; + int cache_flags; +} BDRVSheepdogReopenState; + +static const char * sd_strerror(int err) +{ + int i; + + static const struct { + int err; + const char *desc; + } errors[] = { + {SD_RES_SUCCESS, "Success"}, + {SD_RES_UNKNOWN, "Unknown error"}, + {SD_RES_NO_OBJ, "No object found"}, + {SD_RES_EIO, "I/O error"}, + {SD_RES_VDI_EXIST, "VDI exists already"}, + {SD_RES_INVALID_PARMS, "Invalid parameters"}, + {SD_RES_SYSTEM_ERROR, "System error"}, + {SD_RES_VDI_LOCKED, "VDI is already locked"}, + {SD_RES_NO_VDI, "No vdi found"}, + {SD_RES_NO_BASE_VDI, "No base VDI found"}, + {SD_RES_VDI_READ, "Failed read the requested VDI"}, + {SD_RES_VDI_WRITE, "Failed to write the requested VDI"}, + {SD_RES_BASE_VDI_READ, "Failed to read the base VDI"}, + {SD_RES_BASE_VDI_WRITE, "Failed to write the base VDI"}, + {SD_RES_NO_TAG, "Failed to find the requested tag"}, + {SD_RES_STARTUP, "The system is still booting"}, + {SD_RES_VDI_NOT_LOCKED, "VDI isn't locked"}, + {SD_RES_SHUTDOWN, "The system is shutting down"}, + {SD_RES_NO_MEM, "Out of memory on the server"}, + {SD_RES_FULL_VDI, "We already have the maximum vdis"}, + {SD_RES_VER_MISMATCH, "Protocol version mismatch"}, + {SD_RES_NO_SPACE, "Server has no space for new objects"}, + {SD_RES_WAIT_FOR_FORMAT, "Sheepdog is waiting for a format operation"}, + {SD_RES_WAIT_FOR_JOIN, "Sheepdog is waiting for other nodes joining"}, + {SD_RES_JOIN_FAILED, "Target node had failed to join sheepdog"}, + {SD_RES_HALT, "Sheepdog is stopped serving IO request"}, + {SD_RES_READONLY, "Object is read-only"}, + }; + + for (i = 0; i < ARRAY_SIZE(errors); ++i) { + if (errors[i].err == err) { + return errors[i].desc; + } + } + + return "Invalid error code"; +} + +/* + * Sheepdog I/O handling: + * + * 1. In sd_co_rw_vector, we send the I/O requests to the server and + * link the requests to the inflight_list in the + * BDRVSheepdogState. The function exits without waiting for + * receiving the response. + * + * 2. We receive the response in aio_read_response, the fd handler to + * the sheepdog connection. If metadata update is needed, we send + * the write request to the vdi object in sd_write_done, the write + * completion function. We switch back to sd_co_readv/writev after + * all the requests belonging to the AIOCB are finished. + */ + +static inline AIOReq *alloc_aio_req(BDRVSheepdogState *s, SheepdogAIOCB *acb, + uint64_t oid, unsigned int data_len, + uint64_t offset, uint8_t flags, bool create, + uint64_t base_oid, unsigned int iov_offset) +{ + AIOReq *aio_req; + + aio_req = g_malloc(sizeof(*aio_req)); + aio_req->aiocb = acb; + aio_req->iov_offset = iov_offset; + aio_req->oid = oid; + aio_req->base_oid = base_oid; + aio_req->offset = offset; + aio_req->data_len = data_len; + aio_req->flags = flags; + aio_req->id = s->aioreq_seq_num++; + aio_req->create = create; + + acb->nr_pending++; + return aio_req; +} + +static inline void free_aio_req(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + + acb->cancelable = false; + QLIST_REMOVE(aio_req, aio_siblings); + g_free(aio_req); + + acb->nr_pending--; +} + +static void coroutine_fn sd_finish_aiocb(SheepdogAIOCB *acb) +{ + qemu_coroutine_enter(acb->coroutine, NULL); + qemu_aio_unref(acb); +} + +/* + * Check whether the specified acb can be canceled + * + * We can cancel aio when any request belonging to the acb is: + * - Not processed by the sheepdog server. + * - Not linked to the inflight queue. + */ +static bool sd_acb_cancelable(const SheepdogAIOCB *acb) +{ + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq; + + if (!acb->cancelable) { + return false; + } + + QLIST_FOREACH(aioreq, &s->inflight_aio_head, aio_siblings) { + if (aioreq->aiocb == acb) { + return false; + } + } + + return true; +} + +static void sd_aio_cancel(BlockAIOCB *blockacb) +{ + SheepdogAIOCB *acb = (SheepdogAIOCB *)blockacb; + BDRVSheepdogState *s = acb->common.bs->opaque; + AIOReq *aioreq, *next; + + if (sd_acb_cancelable(acb)) { + /* Remove outstanding requests from failed queue. */ + QLIST_FOREACH_SAFE(aioreq, &s->failed_aio_head, aio_siblings, + next) { + if (aioreq->aiocb == acb) { + free_aio_req(s, aioreq); + } + } + + assert(acb->nr_pending == 0); + if (acb->common.cb) { + acb->common.cb(acb->common.opaque, -ECANCELED); + } + sd_finish_aiocb(acb); + } +} + +static const AIOCBInfo sd_aiocb_info = { + .aiocb_size = sizeof(SheepdogAIOCB), + .cancel_async = sd_aio_cancel, +}; + +static SheepdogAIOCB *sd_aio_setup(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t sector_num, int nb_sectors) +{ + SheepdogAIOCB *acb; + uint32_t object_size; + BDRVSheepdogState *s = bs->opaque; + + object_size = (UINT32_C(1) << s->inode.block_size_shift); + + acb = qemu_aio_get(&sd_aiocb_info, bs, NULL, NULL); + + acb->qiov = qiov; + + acb->sector_num = sector_num; + acb->nb_sectors = nb_sectors; + + acb->aio_done_func = NULL; + acb->cancelable = true; + acb->coroutine = qemu_coroutine_self(); + acb->ret = 0; + acb->nr_pending = 0; + + acb->min_affect_data_idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; + acb->max_affect_data_idx = (acb->sector_num * BDRV_SECTOR_SIZE + + acb->nb_sectors * BDRV_SECTOR_SIZE) / object_size; + + acb->min_dirty_data_idx = UINT32_MAX; + acb->max_dirty_data_idx = 0; + + return acb; +} + +/* Return -EIO in case of error, file descriptor on success */ +static int connect_to_sdog(BDRVSheepdogState *s, Error **errp) +{ + int fd; + + if (s->is_unix) { + fd = unix_connect(s->host_spec, errp); + } else { + fd = inet_connect(s->host_spec, errp); + + if (fd >= 0) { + int ret = socket_set_nodelay(fd); + if (ret < 0) { + error_report("%s", strerror(errno)); + } + } + } + + if (fd >= 0) { + qemu_set_nonblock(fd); + } else { + fd = -EIO; + } + + return fd; +} + +/* Return 0 on success and -errno in case of error */ +static coroutine_fn int send_co_req(int sockfd, SheepdogReq *hdr, void *data, + unsigned int *wlen) +{ + int ret; + + ret = qemu_co_send(sockfd, hdr, sizeof(*hdr)); + if (ret != sizeof(*hdr)) { + error_report("failed to send a req, %s", strerror(errno)); + ret = -socket_error(); + return ret; + } + + ret = qemu_co_send(sockfd, data, *wlen); + if (ret != *wlen) { + ret = -socket_error(); + error_report("failed to send a req, %s", strerror(errno)); + } + + return ret; +} + +static void restart_co_req(void *opaque) +{ + Coroutine *co = opaque; + + qemu_coroutine_enter(co, NULL); +} + +typedef struct SheepdogReqCo { + int sockfd; + AioContext *aio_context; + SheepdogReq *hdr; + void *data; + unsigned int *wlen; + unsigned int *rlen; + int ret; + bool finished; +} SheepdogReqCo; + +static coroutine_fn void do_co_req(void *opaque) +{ + int ret; + Coroutine *co; + SheepdogReqCo *srco = opaque; + int sockfd = srco->sockfd; + SheepdogReq *hdr = srco->hdr; + void *data = srco->data; + unsigned int *wlen = srco->wlen; + unsigned int *rlen = srco->rlen; + + co = qemu_coroutine_self(); + aio_set_fd_handler(srco->aio_context, sockfd, false, + NULL, restart_co_req, co); + + ret = send_co_req(sockfd, hdr, data, wlen); + if (ret < 0) { + goto out; + } + + aio_set_fd_handler(srco->aio_context, sockfd, false, + restart_co_req, NULL, co); + + ret = qemu_co_recv(sockfd, hdr, sizeof(*hdr)); + if (ret != sizeof(*hdr)) { + error_report("failed to get a rsp, %s", strerror(errno)); + ret = -errno; + goto out; + } + + if (*rlen > hdr->data_length) { + *rlen = hdr->data_length; + } + + if (*rlen) { + ret = qemu_co_recv(sockfd, data, *rlen); + if (ret != *rlen) { + error_report("failed to get the data, %s", strerror(errno)); + ret = -errno; + goto out; + } + } + ret = 0; +out: + /* there is at most one request for this sockfd, so it is safe to + * set each handler to NULL. */ + aio_set_fd_handler(srco->aio_context, sockfd, false, + NULL, NULL, NULL); + + srco->ret = ret; + srco->finished = true; +} + +/* + * Send the request to the sheep in a synchronous manner. + * + * Return 0 on success, -errno in case of error. + */ +static int do_req(int sockfd, AioContext *aio_context, SheepdogReq *hdr, + void *data, unsigned int *wlen, unsigned int *rlen) +{ + Coroutine *co; + SheepdogReqCo srco = { + .sockfd = sockfd, + .aio_context = aio_context, + .hdr = hdr, + .data = data, + .wlen = wlen, + .rlen = rlen, + .ret = 0, + .finished = false, + }; + + if (qemu_in_coroutine()) { + do_co_req(&srco); + } else { + co = qemu_coroutine_create(do_co_req); + qemu_coroutine_enter(co, &srco); + while (!srco.finished) { + aio_poll(aio_context, true); + } + } + + return srco.ret; +} + +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, + struct iovec *iov, int niov, + enum AIOCBState aiocb_type); +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req); +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag); +static int get_sheep_fd(BDRVSheepdogState *s, Error **errp); +static void co_write_request(void *opaque); + +static coroutine_fn void reconnect_to_sdog(void *opaque) +{ + BDRVSheepdogState *s = opaque; + AIOReq *aio_req, *next; + + aio_set_fd_handler(s->aio_context, s->fd, false, NULL, + NULL, NULL); + close(s->fd); + s->fd = -1; + + /* Wait for outstanding write requests to be completed. */ + while (s->co_send != NULL) { + co_write_request(opaque); + } + + /* Try to reconnect the sheepdog server every one second. */ + while (s->fd < 0) { + Error *local_err = NULL; + s->fd = get_sheep_fd(s, &local_err); + if (s->fd < 0) { + DPRINTF("Wait for connection to be established\n"); + error_report_err(local_err); + co_aio_sleep_ns(bdrv_get_aio_context(s->bs), QEMU_CLOCK_REALTIME, + 1000000000ULL); + } + }; + + /* + * Now we have to resend all the request in the inflight queue. However, + * resend_aioreq() can yield and newly created requests can be added to the + * inflight queue before the coroutine is resumed. To avoid mixing them, we + * have to move all the inflight requests to the failed queue before + * resend_aioreq() is called. + */ + QLIST_FOREACH_SAFE(aio_req, &s->inflight_aio_head, aio_siblings, next) { + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->failed_aio_head, aio_req, aio_siblings); + } + + /* Resend all the failed aio requests. */ + while (!QLIST_EMPTY(&s->failed_aio_head)) { + aio_req = QLIST_FIRST(&s->failed_aio_head); + QLIST_REMOVE(aio_req, aio_siblings); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + resend_aioreq(s, aio_req); + } +} + +/* + * Receive responses of the I/O requests. + * + * This function is registered as a fd handler, and called from the + * main loop when s->fd is ready for reading responses. + */ +static void coroutine_fn aio_read_response(void *opaque) +{ + SheepdogObjRsp rsp; + BDRVSheepdogState *s = opaque; + int fd = s->fd; + int ret; + AIOReq *aio_req = NULL; + SheepdogAIOCB *acb; + uint64_t idx; + + /* read a header */ + ret = qemu_co_recv(fd, &rsp, sizeof(rsp)); + if (ret != sizeof(rsp)) { + error_report("failed to get the header, %s", strerror(errno)); + goto err; + } + + /* find the right aio_req from the inflight aio list */ + QLIST_FOREACH(aio_req, &s->inflight_aio_head, aio_siblings) { + if (aio_req->id == rsp.id) { + break; + } + } + if (!aio_req) { + error_report("cannot find aio_req %x", rsp.id); + goto err; + } + + acb = aio_req->aiocb; + + switch (acb->aiocb_type) { + case AIOCB_WRITE_UDATA: + /* this coroutine context is no longer suitable for co_recv + * because we may send data to update vdi objects */ + s->co_recv = NULL; + if (!is_data_obj(aio_req->oid)) { + break; + } + idx = data_oid_to_idx(aio_req->oid); + + if (aio_req->create) { + /* + * If the object is newly created one, we need to update + * the vdi object (metadata object). min_dirty_data_idx + * and max_dirty_data_idx are changed to include updated + * index between them. + */ + if (rsp.result == SD_RES_SUCCESS) { + s->inode.data_vdi_id[idx] = s->inode.vdi_id; + acb->max_dirty_data_idx = MAX(idx, acb->max_dirty_data_idx); + acb->min_dirty_data_idx = MIN(idx, acb->min_dirty_data_idx); + } + } + break; + case AIOCB_READ_UDATA: + ret = qemu_co_recvv(fd, acb->qiov->iov, acb->qiov->niov, + aio_req->iov_offset, rsp.data_length); + if (ret != rsp.data_length) { + error_report("failed to get the data, %s", strerror(errno)); + goto err; + } + break; + case AIOCB_FLUSH_CACHE: + if (rsp.result == SD_RES_INVALID_PARMS) { + DPRINTF("disable cache since the server doesn't support it\n"); + s->cache_flags = SD_FLAG_CMD_DIRECT; + rsp.result = SD_RES_SUCCESS; + } + break; + case AIOCB_DISCARD_OBJ: + switch (rsp.result) { + case SD_RES_INVALID_PARMS: + error_report("sheep(%s) doesn't support discard command", + s->host_spec); + rsp.result = SD_RES_SUCCESS; + s->discard_supported = false; + break; + default: + break; + } + } + + switch (rsp.result) { + case SD_RES_SUCCESS: + break; + case SD_RES_READONLY: + if (s->inode.vdi_id == oid_to_vid(aio_req->oid)) { + ret = reload_inode(s, 0, ""); + if (ret < 0) { + goto err; + } + } + if (is_data_obj(aio_req->oid)) { + aio_req->oid = vid_to_data_oid(s->inode.vdi_id, + data_oid_to_idx(aio_req->oid)); + } else { + aio_req->oid = vid_to_vdi_oid(s->inode.vdi_id); + } + resend_aioreq(s, aio_req); + goto out; + default: + acb->ret = -EIO; + error_report("%s", sd_strerror(rsp.result)); + break; + } + + free_aio_req(s, aio_req); + if (!acb->nr_pending) { + /* + * We've finished all requests which belong to the AIOCB, so + * we can switch back to sd_co_readv/writev now. + */ + acb->aio_done_func(acb); + } +out: + s->co_recv = NULL; + return; +err: + s->co_recv = NULL; + reconnect_to_sdog(opaque); +} + +static void co_read_response(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + if (!s->co_recv) { + s->co_recv = qemu_coroutine_create(aio_read_response); + } + + qemu_coroutine_enter(s->co_recv, opaque); +} + +static void co_write_request(void *opaque) +{ + BDRVSheepdogState *s = opaque; + + qemu_coroutine_enter(s->co_send, NULL); +} + +/* + * Return a socket descriptor to read/write objects. + * + * We cannot use this descriptor for other operations because + * the block driver may be on waiting response from the server. + */ +static int get_sheep_fd(BDRVSheepdogState *s, Error **errp) +{ + int fd; + + fd = connect_to_sdog(s, errp); + if (fd < 0) { + return fd; + } + + aio_set_fd_handler(s->aio_context, fd, false, + co_read_response, NULL, s); + return fd; +} + +static int sd_parse_uri(BDRVSheepdogState *s, const char *filename, + char *vdi, uint32_t *snapid, char *tag) +{ + URI *uri; + QueryParams *qp = NULL; + int ret = 0; + + uri = uri_parse(filename); + if (!uri) { + return -EINVAL; + } + + /* transport */ + if (!strcmp(uri->scheme, "sheepdog")) { + s->is_unix = false; + } else if (!strcmp(uri->scheme, "sheepdog+tcp")) { + s->is_unix = false; + } else if (!strcmp(uri->scheme, "sheepdog+unix")) { + s->is_unix = true; + } else { + ret = -EINVAL; + goto out; + } + + if (uri->path == NULL || !strcmp(uri->path, "/")) { + ret = -EINVAL; + goto out; + } + pstrcpy(vdi, SD_MAX_VDI_LEN, uri->path + 1); + + qp = query_params_parse(uri->query); + if (qp->n > 1 || (s->is_unix && !qp->n) || (!s->is_unix && qp->n)) { + ret = -EINVAL; + goto out; + } + + if (s->is_unix) { + /* sheepdog+unix:///vdiname?socket=path */ + if (uri->server || uri->port || strcmp(qp->p[0].name, "socket")) { + ret = -EINVAL; + goto out; + } + s->host_spec = g_strdup(qp->p[0].value); + } else { + /* sheepdog[+tcp]://[host:port]/vdiname */ + s->host_spec = g_strdup_printf("%s:%d", uri->server ?: SD_DEFAULT_ADDR, + uri->port ?: SD_DEFAULT_PORT); + } + + /* snapshot tag */ + if (uri->fragment) { + *snapid = strtoul(uri->fragment, NULL, 10); + if (*snapid == 0) { + pstrcpy(tag, SD_MAX_VDI_TAG_LEN, uri->fragment); + } + } else { + *snapid = CURRENT_VDI_ID; /* search current vdi */ + } + +out: + if (qp) { + query_params_free(qp); + } + uri_free(uri); + return ret; +} + +/* + * Parse a filename (old syntax) + * + * filename must be one of the following formats: + * 1. [vdiname] + * 2. [vdiname]:[snapid] + * 3. [vdiname]:[tag] + * 4. [hostname]:[port]:[vdiname] + * 5. [hostname]:[port]:[vdiname]:[snapid] + * 6. [hostname]:[port]:[vdiname]:[tag] + * + * You can boot from the snapshot images by specifying `snapid` or + * `tag'. + * + * You can run VMs outside the Sheepdog cluster by specifying + * `hostname' and `port' (experimental). + */ +static int parse_vdiname(BDRVSheepdogState *s, const char *filename, + char *vdi, uint32_t *snapid, char *tag) +{ + char *p, *q, *uri; + const char *host_spec, *vdi_spec; + int nr_sep, ret; + + strstart(filename, "sheepdog:", (const char **)&filename); + p = q = g_strdup(filename); + + /* count the number of separators */ + nr_sep = 0; + while (*p) { + if (*p == ':') { + nr_sep++; + } + p++; + } + p = q; + + /* use the first two tokens as host_spec. */ + if (nr_sep >= 2) { + host_spec = p; + p = strchr(p, ':'); + p++; + p = strchr(p, ':'); + *p++ = '\0'; + } else { + host_spec = ""; + } + + vdi_spec = p; + + p = strchr(vdi_spec, ':'); + if (p) { + *p++ = '#'; + } + + uri = g_strdup_printf("sheepdog://%s/%s", host_spec, vdi_spec); + + ret = sd_parse_uri(s, uri, vdi, snapid, tag); + + g_free(q); + g_free(uri); + + return ret; +} + +static int find_vdi_name(BDRVSheepdogState *s, const char *filename, + uint32_t snapid, const char *tag, uint32_t *vid, + bool lock, Error **errp) +{ + int ret, fd; + SheepdogVdiReq hdr; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + unsigned int wlen, rlen = 0; + char buf[SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN]; + + fd = connect_to_sdog(s, errp); + if (fd < 0) { + return fd; + } + + /* This pair of strncpy calls ensures that the buffer is zero-filled, + * which is desirable since we'll soon be sending those bytes, and + * don't want the send_req to read uninitialized data. + */ + strncpy(buf, filename, SD_MAX_VDI_LEN); + strncpy(buf + SD_MAX_VDI_LEN, tag, SD_MAX_VDI_TAG_LEN); + + memset(&hdr, 0, sizeof(hdr)); + if (lock) { + hdr.opcode = SD_OP_LOCK_VDI; + hdr.type = LOCK_TYPE_NORMAL; + } else { + hdr.opcode = SD_OP_GET_VDI_INFO; + } + wlen = SD_MAX_VDI_LEN + SD_MAX_VDI_TAG_LEN; + hdr.proto_ver = SD_PROTO_VER; + hdr.data_length = wlen; + hdr.snapid = snapid; + hdr.flags = SD_FLAG_CMD_WRITE; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + if (ret) { + error_setg_errno(errp, -ret, "cannot get vdi info"); + goto out; + } + + if (rsp->result != SD_RES_SUCCESS) { + error_setg(errp, "cannot get vdi info, %s, %s %" PRIu32 " %s", + sd_strerror(rsp->result), filename, snapid, tag); + if (rsp->result == SD_RES_NO_VDI) { + ret = -ENOENT; + } else if (rsp->result == SD_RES_VDI_LOCKED) { + ret = -EBUSY; + } else { + ret = -EIO; + } + goto out; + } + *vid = rsp->vdi_id; + + ret = 0; +out: + closesocket(fd); + return ret; +} + +static void coroutine_fn add_aio_request(BDRVSheepdogState *s, AIOReq *aio_req, + struct iovec *iov, int niov, + enum AIOCBState aiocb_type) +{ + int nr_copies = s->inode.nr_copies; + SheepdogObjReq hdr; + unsigned int wlen = 0; + int ret; + uint64_t oid = aio_req->oid; + unsigned int datalen = aio_req->data_len; + uint64_t offset = aio_req->offset; + uint8_t flags = aio_req->flags; + uint64_t old_oid = aio_req->base_oid; + bool create = aio_req->create; + + if (!nr_copies) { + error_report("bug"); + } + + memset(&hdr, 0, sizeof(hdr)); + + switch (aiocb_type) { + case AIOCB_FLUSH_CACHE: + hdr.opcode = SD_OP_FLUSH_VDI; + break; + case AIOCB_READ_UDATA: + hdr.opcode = SD_OP_READ_OBJ; + hdr.flags = flags; + break; + case AIOCB_WRITE_UDATA: + if (create) { + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; + } else { + hdr.opcode = SD_OP_WRITE_OBJ; + } + wlen = datalen; + hdr.flags = SD_FLAG_CMD_WRITE | flags; + break; + case AIOCB_DISCARD_OBJ: + hdr.opcode = SD_OP_WRITE_OBJ; + hdr.flags = SD_FLAG_CMD_WRITE | flags; + s->inode.data_vdi_id[data_oid_to_idx(oid)] = 0; + offset = offsetof(SheepdogInode, + data_vdi_id[data_oid_to_idx(oid)]); + oid = vid_to_vdi_oid(s->inode.vdi_id); + wlen = datalen = sizeof(uint32_t); + break; + } + + if (s->cache_flags) { + hdr.flags |= s->cache_flags; + } + + hdr.oid = oid; + hdr.cow_oid = old_oid; + hdr.copies = s->inode.nr_copies; + + hdr.data_length = datalen; + hdr.offset = offset; + + hdr.id = aio_req->id; + + qemu_co_mutex_lock(&s->lock); + s->co_send = qemu_coroutine_self(); + aio_set_fd_handler(s->aio_context, s->fd, false, + co_read_response, co_write_request, s); + socket_set_cork(s->fd, 1); + + /* send a header */ + ret = qemu_co_send(s->fd, &hdr, sizeof(hdr)); + if (ret != sizeof(hdr)) { + error_report("failed to send a req, %s", strerror(errno)); + goto out; + } + + if (wlen) { + ret = qemu_co_sendv(s->fd, iov, niov, aio_req->iov_offset, wlen); + if (ret != wlen) { + error_report("failed to send a data, %s", strerror(errno)); + } + } +out: + socket_set_cork(s->fd, 0); + aio_set_fd_handler(s->aio_context, s->fd, false, + co_read_response, NULL, s); + s->co_send = NULL; + qemu_co_mutex_unlock(&s->lock); +} + +static int read_write_object(int fd, AioContext *aio_context, char *buf, + uint64_t oid, uint8_t copies, + unsigned int datalen, uint64_t offset, + bool write, bool create, uint32_t cache_flags) +{ + SheepdogObjReq hdr; + SheepdogObjRsp *rsp = (SheepdogObjRsp *)&hdr; + unsigned int wlen, rlen; + int ret; + + memset(&hdr, 0, sizeof(hdr)); + + if (write) { + wlen = datalen; + rlen = 0; + hdr.flags = SD_FLAG_CMD_WRITE; + if (create) { + hdr.opcode = SD_OP_CREATE_AND_WRITE_OBJ; + } else { + hdr.opcode = SD_OP_WRITE_OBJ; + } + } else { + wlen = 0; + rlen = datalen; + hdr.opcode = SD_OP_READ_OBJ; + } + + hdr.flags |= cache_flags; + + hdr.oid = oid; + hdr.data_length = datalen; + hdr.offset = offset; + hdr.copies = copies; + + ret = do_req(fd, aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + if (ret) { + error_report("failed to send a request to the sheep"); + return ret; + } + + switch (rsp->result) { + case SD_RES_SUCCESS: + return 0; + default: + error_report("%s", sd_strerror(rsp->result)); + return -EIO; + } +} + +static int read_object(int fd, AioContext *aio_context, char *buf, + uint64_t oid, uint8_t copies, + unsigned int datalen, uint64_t offset, + uint32_t cache_flags) +{ + return read_write_object(fd, aio_context, buf, oid, copies, + datalen, offset, false, + false, cache_flags); +} + +static int write_object(int fd, AioContext *aio_context, char *buf, + uint64_t oid, uint8_t copies, + unsigned int datalen, uint64_t offset, bool create, + uint32_t cache_flags) +{ + return read_write_object(fd, aio_context, buf, oid, copies, + datalen, offset, true, + create, cache_flags); +} + +/* update inode with the latest state */ +static int reload_inode(BDRVSheepdogState *s, uint32_t snapid, const char *tag) +{ + Error *local_err = NULL; + SheepdogInode *inode; + int ret = 0, fd; + uint32_t vid = 0; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return -EIO; + } + + inode = g_malloc(SD_INODE_HEADER_SIZE); + + ret = find_vdi_name(s, s->name, snapid, tag, &vid, false, &local_err); + if (ret) { + error_report_err(local_err); + goto out; + } + + ret = read_object(fd, s->aio_context, (char *)inode, vid_to_vdi_oid(vid), + s->inode.nr_copies, SD_INODE_HEADER_SIZE, 0, + s->cache_flags); + if (ret < 0) { + goto out; + } + + if (inode->vdi_id != s->inode.vdi_id) { + memcpy(&s->inode, inode, SD_INODE_HEADER_SIZE); + } + +out: + g_free(inode); + closesocket(fd); + + return ret; +} + +static void coroutine_fn resend_aioreq(BDRVSheepdogState *s, AIOReq *aio_req) +{ + SheepdogAIOCB *acb = aio_req->aiocb; + + aio_req->create = false; + + /* check whether this request becomes a CoW one */ + if (acb->aiocb_type == AIOCB_WRITE_UDATA && is_data_obj(aio_req->oid)) { + int idx = data_oid_to_idx(aio_req->oid); + + if (is_data_obj_writable(&s->inode, idx)) { + goto out; + } + + if (s->inode.data_vdi_id[idx]) { + aio_req->base_oid = vid_to_data_oid(s->inode.data_vdi_id[idx], idx); + aio_req->flags |= SD_FLAG_CMD_COW; + } + aio_req->create = true; + } +out: + if (is_data_obj(aio_req->oid)) { + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, + acb->aiocb_type); + } else { + struct iovec iov; + iov.iov_base = &s->inode; + iov.iov_len = sizeof(s->inode); + add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); + } +} + +static void sd_detach_aio_context(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + + aio_set_fd_handler(s->aio_context, s->fd, false, NULL, + NULL, NULL); +} + +static void sd_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVSheepdogState *s = bs->opaque; + + s->aio_context = new_context; + aio_set_fd_handler(new_context, s->fd, false, + co_read_response, NULL, s); +} + +/* TODO Convert to fine grained options */ +static QemuOptsList runtime_opts = { + .name = "sheepdog", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "filename", + .type = QEMU_OPT_STRING, + .help = "URL to the sheepdog image", + }, + { /* end of list */ } + }, +}; + +static int sd_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + int ret, fd; + uint32_t vid = 0; + BDRVSheepdogState *s = bs->opaque; + char vdi[SD_MAX_VDI_LEN], tag[SD_MAX_VDI_TAG_LEN]; + uint32_t snapid; + char *buf = NULL; + QemuOpts *opts; + Error *local_err = NULL; + const char *filename; + + s->bs = bs; + s->aio_context = bdrv_get_aio_context(bs); + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto out; + } + + filename = qemu_opt_get(opts, "filename"); + + QLIST_INIT(&s->inflight_aio_head); + QLIST_INIT(&s->failed_aio_head); + QLIST_INIT(&s->inflight_aiocb_head); + s->fd = -1; + + memset(vdi, 0, sizeof(vdi)); + memset(tag, 0, sizeof(tag)); + + if (strstr(filename, "://")) { + ret = sd_parse_uri(s, filename, vdi, &snapid, tag); + } else { + ret = parse_vdiname(s, filename, vdi, &snapid, tag); + } + if (ret < 0) { + error_setg(errp, "Can't parse filename"); + goto out; + } + s->fd = get_sheep_fd(s, errp); + if (s->fd < 0) { + ret = s->fd; + goto out; + } + + ret = find_vdi_name(s, vdi, snapid, tag, &vid, true, errp); + if (ret) { + goto out; + } + + /* + * QEMU block layer emulates writethrough cache as 'writeback + flush', so + * we always set SD_FLAG_CMD_CACHE (writeback cache) as default. + */ + s->cache_flags = SD_FLAG_CMD_CACHE; + if (flags & BDRV_O_NOCACHE) { + s->cache_flags = SD_FLAG_CMD_DIRECT; + } + s->discard_supported = true; + + if (snapid || tag[0] != '\0') { + DPRINTF("%" PRIx32 " snapshot inode was open.\n", vid); + s->is_snapshot = true; + } + + fd = connect_to_sdog(s, errp); + if (fd < 0) { + ret = fd; + goto out; + } + + buf = g_malloc(SD_INODE_SIZE); + ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), + 0, SD_INODE_SIZE, 0, s->cache_flags); + + closesocket(fd); + + if (ret) { + error_setg(errp, "Can't read snapshot inode"); + goto out; + } + + memcpy(&s->inode, buf, sizeof(s->inode)); + + bs->total_sectors = s->inode.vdi_size / BDRV_SECTOR_SIZE; + pstrcpy(s->name, sizeof(s->name), vdi); + qemu_co_mutex_init(&s->lock); + qemu_co_queue_init(&s->overlapping_queue); + qemu_opts_del(opts); + g_free(buf); + return 0; +out: + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, + false, NULL, NULL, NULL); + if (s->fd >= 0) { + closesocket(s->fd); + } + qemu_opts_del(opts); + g_free(buf); + return ret; +} + +static int sd_reopen_prepare(BDRVReopenState *state, BlockReopenQueue *queue, + Error **errp) +{ + BDRVSheepdogState *s = state->bs->opaque; + BDRVSheepdogReopenState *re_s; + int ret = 0; + + re_s = state->opaque = g_new0(BDRVSheepdogReopenState, 1); + + re_s->cache_flags = SD_FLAG_CMD_CACHE; + if (state->flags & BDRV_O_NOCACHE) { + re_s->cache_flags = SD_FLAG_CMD_DIRECT; + } + + re_s->fd = get_sheep_fd(s, errp); + if (re_s->fd < 0) { + ret = re_s->fd; + return ret; + } + + return ret; +} + +static void sd_reopen_commit(BDRVReopenState *state) +{ + BDRVSheepdogReopenState *re_s = state->opaque; + BDRVSheepdogState *s = state->bs->opaque; + + if (s->fd) { + aio_set_fd_handler(s->aio_context, s->fd, false, + NULL, NULL, NULL); + closesocket(s->fd); + } + + s->fd = re_s->fd; + s->cache_flags = re_s->cache_flags; + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +static void sd_reopen_abort(BDRVReopenState *state) +{ + BDRVSheepdogReopenState *re_s = state->opaque; + BDRVSheepdogState *s = state->bs->opaque; + + if (re_s == NULL) { + return; + } + + if (re_s->fd) { + aio_set_fd_handler(s->aio_context, re_s->fd, false, + NULL, NULL, NULL); + closesocket(re_s->fd); + } + + g_free(state->opaque); + state->opaque = NULL; + + return; +} + +static int do_sd_create(BDRVSheepdogState *s, uint32_t *vdi_id, int snapshot, + Error **errp) +{ + SheepdogVdiReq hdr; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + int fd, ret; + unsigned int wlen, rlen = 0; + char buf[SD_MAX_VDI_LEN]; + + fd = connect_to_sdog(s, errp); + if (fd < 0) { + return fd; + } + + /* FIXME: would it be better to fail (e.g., return -EIO) when filename + * does not fit in buf? For now, just truncate and avoid buffer overrun. + */ + memset(buf, 0, sizeof(buf)); + pstrcpy(buf, sizeof(buf), s->name); + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_NEW_VDI; + hdr.base_vdi_id = s->inode.vdi_id; + + wlen = SD_MAX_VDI_LEN; + + hdr.flags = SD_FLAG_CMD_WRITE; + hdr.snapid = snapshot; + + hdr.data_length = wlen; + hdr.vdi_size = s->inode.vdi_size; + hdr.copy_policy = s->inode.copy_policy; + hdr.copies = s->inode.nr_copies; + hdr.block_size_shift = s->inode.block_size_shift; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, buf, &wlen, &rlen); + + closesocket(fd); + + if (ret) { + error_setg_errno(errp, -ret, "create failed"); + return ret; + } + + if (rsp->result != SD_RES_SUCCESS) { + error_setg(errp, "%s, %s", sd_strerror(rsp->result), s->inode.name); + return -EIO; + } + + if (vdi_id) { + *vdi_id = rsp->vdi_id; + } + + return 0; +} + +static int sd_prealloc(const char *filename, Error **errp) +{ + BlockDriverState *bs = NULL; + BDRVSheepdogState *base = NULL; + unsigned long buf_size; + uint32_t idx, max_idx; + uint32_t object_size; + int64_t vdi_size; + void *buf = NULL; + int ret; + + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + errp); + if (ret < 0) { + goto out_with_err_set; + } + + vdi_size = bdrv_getlength(bs); + if (vdi_size < 0) { + ret = vdi_size; + goto out; + } + + base = bs->opaque; + object_size = (UINT32_C(1) << base->inode.block_size_shift); + buf_size = MIN(object_size, SD_DATA_OBJ_SIZE); + buf = g_malloc0(buf_size); + + max_idx = DIV_ROUND_UP(vdi_size, buf_size); + + for (idx = 0; idx < max_idx; idx++) { + /* + * The created image can be a cloned image, so we need to read + * a data from the source image. + */ + ret = bdrv_pread(bs, idx * buf_size, buf, buf_size); + if (ret < 0) { + goto out; + } + ret = bdrv_pwrite(bs, idx * buf_size, buf, buf_size); + if (ret < 0) { + goto out; + } + } + +out: + if (ret < 0) { + error_setg_errno(errp, -ret, "Can't pre-allocate"); + } +out_with_err_set: + if (bs) { + bdrv_unref(bs); + } + g_free(buf); + + return ret; +} + +/* + * Sheepdog support two kinds of redundancy, full replication and erasure + * coding. + * + * # create a fully replicated vdi with x copies + * -o redundancy=x (1 <= x <= SD_MAX_COPIES) + * + * # create a erasure coded vdi with x data strips and y parity strips + * -o redundancy=x:y (x must be one of {2,4,8,16} and 1 <= y < SD_EC_MAX_STRIP) + */ +static int parse_redundancy(BDRVSheepdogState *s, const char *opt) +{ + struct SheepdogInode *inode = &s->inode; + const char *n1, *n2; + long copy, parity; + char p[10]; + + pstrcpy(p, sizeof(p), opt); + n1 = strtok(p, ":"); + n2 = strtok(NULL, ":"); + + if (!n1) { + return -EINVAL; + } + + copy = strtol(n1, NULL, 10); + if (copy > SD_MAX_COPIES || copy < 1) { + return -EINVAL; + } + if (!n2) { + inode->copy_policy = 0; + inode->nr_copies = copy; + return 0; + } + + if (copy != 2 && copy != 4 && copy != 8 && copy != 16) { + return -EINVAL; + } + + parity = strtol(n2, NULL, 10); + if (parity >= SD_EC_MAX_STRIP || parity < 1) { + return -EINVAL; + } + + /* + * 4 bits for parity and 4 bits for data. + * We have to compress upper data bits because it can't represent 16 + */ + inode->copy_policy = ((copy / 2) << 4) + parity; + inode->nr_copies = copy + parity; + + return 0; +} + +static int parse_block_size_shift(BDRVSheepdogState *s, QemuOpts *opt) +{ + struct SheepdogInode *inode = &s->inode; + uint64_t object_size; + int obj_order; + + object_size = qemu_opt_get_size_del(opt, BLOCK_OPT_OBJECT_SIZE, 0); + if (object_size) { + if ((object_size - 1) & object_size) { /* not a power of 2? */ + return -EINVAL; + } + obj_order = ctz32(object_size); + if (obj_order < 20 || obj_order > 31) { + return -EINVAL; + } + inode->block_size_shift = (uint8_t)obj_order; + } + + return 0; +} + +static int sd_create(const char *filename, QemuOpts *opts, + Error **errp) +{ + int ret = 0; + uint32_t vid = 0; + char *backing_file = NULL; + char *buf = NULL; + BDRVSheepdogState *s; + char tag[SD_MAX_VDI_TAG_LEN]; + uint32_t snapid; + uint64_t max_vdi_size; + bool prealloc = false; + + s = g_new0(BDRVSheepdogState, 1); + + memset(tag, 0, sizeof(tag)); + if (strstr(filename, "://")) { + ret = sd_parse_uri(s, filename, s->name, &snapid, tag); + } else { + ret = parse_vdiname(s, filename, s->name, &snapid, tag); + } + if (ret < 0) { + error_setg(errp, "Can't parse filename"); + goto out; + } + + s->inode.vdi_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + buf = qemu_opt_get_del(opts, BLOCK_OPT_PREALLOC); + if (!buf || !strcmp(buf, "off")) { + prealloc = false; + } else if (!strcmp(buf, "full")) { + prealloc = true; + } else { + error_setg(errp, "Invalid preallocation mode: '%s'", buf); + ret = -EINVAL; + goto out; + } + + g_free(buf); + buf = qemu_opt_get_del(opts, BLOCK_OPT_REDUNDANCY); + if (buf) { + ret = parse_redundancy(s, buf); + if (ret < 0) { + error_setg(errp, "Invalid redundancy mode: '%s'", buf); + goto out; + } + } + ret = parse_block_size_shift(s, opts); + if (ret < 0) { + error_setg(errp, "Invalid object_size." + " obect_size needs to be power of 2" + " and be limited from 2^20 to 2^31"); + goto out; + } + + if (backing_file) { + BlockDriverState *bs; + BDRVSheepdogState *base; + BlockDriver *drv; + + /* Currently, only Sheepdog backing image is supported. */ + drv = bdrv_find_protocol(backing_file, true, NULL); + if (!drv || strcmp(drv->protocol_name, "sheepdog") != 0) { + error_setg(errp, "backing_file must be a sheepdog image"); + ret = -EINVAL; + goto out; + } + + bs = NULL; + ret = bdrv_open(&bs, backing_file, NULL, NULL, BDRV_O_PROTOCOL, errp); + if (ret < 0) { + goto out; + } + + base = bs->opaque; + + if (!is_snapshot(&base->inode)) { + error_setg(errp, "cannot clone from a non snapshot vdi"); + bdrv_unref(bs); + ret = -EINVAL; + goto out; + } + s->inode.vdi_id = base->inode.vdi_id; + bdrv_unref(bs); + } + + s->aio_context = qemu_get_aio_context(); + + /* if block_size_shift is not specified, get cluster default value */ + if (s->inode.block_size_shift == 0) { + SheepdogVdiReq hdr; + SheepdogClusterRsp *rsp = (SheepdogClusterRsp *)&hdr; + Error *local_err = NULL; + int fd; + unsigned int wlen = 0, rlen = 0; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report("%s", error_get_pretty(local_err)); + error_free(local_err); + ret = -EIO; + goto out; + } + + memset(&hdr, 0, sizeof(hdr)); + hdr.opcode = SD_OP_GET_CLUSTER_DEFAULT; + hdr.proto_ver = SD_PROTO_VER; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + NULL, &wlen, &rlen); + closesocket(fd); + if (ret) { + error_setg_errno(errp, -ret, "failed to get cluster default"); + goto out; + } + if (rsp->result == SD_RES_SUCCESS) { + s->inode.block_size_shift = rsp->block_size_shift; + } else { + s->inode.block_size_shift = SD_DEFAULT_BLOCK_SIZE_SHIFT; + } + } + + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; + + if (s->inode.vdi_size > max_vdi_size) { + error_setg(errp, "An image is too large." + " The maximum image size is %"PRIu64 "GB", + max_vdi_size / 1024 / 1024 / 1024); + ret = -EINVAL; + goto out; + } + + ret = do_sd_create(s, &vid, 0, errp); + if (ret) { + goto out; + } + + if (prealloc) { + ret = sd_prealloc(filename, errp); + } +out: + g_free(backing_file); + g_free(buf); + g_free(s); + return ret; +} + +static void sd_close(BlockDriverState *bs) +{ + Error *local_err = NULL; + BDRVSheepdogState *s = bs->opaque; + SheepdogVdiReq hdr; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + unsigned int wlen, rlen = 0; + int fd, ret; + + DPRINTF("%s\n", s->name); + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return; + } + + memset(&hdr, 0, sizeof(hdr)); + + hdr.opcode = SD_OP_RELEASE_VDI; + hdr.type = LOCK_TYPE_NORMAL; + hdr.base_vdi_id = s->inode.vdi_id; + wlen = strlen(s->name) + 1; + hdr.data_length = wlen; + hdr.flags = SD_FLAG_CMD_WRITE; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + s->name, &wlen, &rlen); + + closesocket(fd); + + if (!ret && rsp->result != SD_RES_SUCCESS && + rsp->result != SD_RES_VDI_NOT_LOCKED) { + error_report("%s, %s", sd_strerror(rsp->result), s->name); + } + + aio_set_fd_handler(bdrv_get_aio_context(bs), s->fd, + false, NULL, NULL, NULL); + closesocket(s->fd); + g_free(s->host_spec); +} + +static int64_t sd_getlength(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + + return s->inode.vdi_size; +} + +static int sd_truncate(BlockDriverState *bs, int64_t offset) +{ + Error *local_err = NULL; + BDRVSheepdogState *s = bs->opaque; + int ret, fd; + unsigned int datalen; + uint64_t max_vdi_size; + + max_vdi_size = (UINT64_C(1) << s->inode.block_size_shift) * MAX_DATA_OBJS; + if (offset < s->inode.vdi_size) { + error_report("shrinking is not supported"); + return -EINVAL; + } else if (offset > max_vdi_size) { + error_report("too big image size"); + return -EINVAL; + } + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return fd; + } + + /* we don't need to update entire object */ + datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); + s->inode.vdi_size = offset; + ret = write_object(fd, s->aio_context, (char *)&s->inode, + vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, + datalen, 0, false, s->cache_flags); + close(fd); + + if (ret < 0) { + error_report("failed to update an inode."); + } + + return ret; +} + +/* + * This function is called after writing data objects. If we need to + * update metadata, this sends a write request to the vdi object. + * Otherwise, this switches back to sd_co_readv/writev. + */ +static void coroutine_fn sd_write_done(SheepdogAIOCB *acb) +{ + BDRVSheepdogState *s = acb->common.bs->opaque; + struct iovec iov; + AIOReq *aio_req; + uint32_t offset, data_len, mn, mx; + + mn = acb->min_dirty_data_idx; + mx = acb->max_dirty_data_idx; + if (mn <= mx) { + /* we need to update the vdi object. */ + offset = sizeof(s->inode) - sizeof(s->inode.data_vdi_id) + + mn * sizeof(s->inode.data_vdi_id[0]); + data_len = (mx - mn + 1) * sizeof(s->inode.data_vdi_id[0]); + + acb->min_dirty_data_idx = UINT32_MAX; + acb->max_dirty_data_idx = 0; + + iov.iov_base = &s->inode; + iov.iov_len = sizeof(s->inode); + aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), + data_len, offset, 0, false, 0, offset); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + add_aio_request(s, aio_req, &iov, 1, AIOCB_WRITE_UDATA); + + acb->aio_done_func = sd_finish_aiocb; + acb->aiocb_type = AIOCB_WRITE_UDATA; + return; + } + + sd_finish_aiocb(acb); +} + +/* Delete current working VDI on the snapshot chain */ +static bool sd_delete(BDRVSheepdogState *s) +{ + Error *local_err = NULL; + unsigned int wlen = SD_MAX_VDI_LEN, rlen = 0; + SheepdogVdiReq hdr = { + .opcode = SD_OP_DEL_VDI, + .base_vdi_id = s->inode.vdi_id, + .data_length = wlen, + .flags = SD_FLAG_CMD_WRITE, + }; + SheepdogVdiRsp *rsp = (SheepdogVdiRsp *)&hdr; + int fd, ret; + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return false; + } + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&hdr, + s->name, &wlen, &rlen); + closesocket(fd); + if (ret) { + return false; + } + switch (rsp->result) { + case SD_RES_NO_VDI: + error_report("%s was already deleted", s->name); + /* fall through */ + case SD_RES_SUCCESS: + break; + default: + error_report("%s, %s", sd_strerror(rsp->result), s->name); + return false; + } + + return true; +} + +/* + * Create a writable VDI from a snapshot + */ +static int sd_create_branch(BDRVSheepdogState *s) +{ + Error *local_err = NULL; + int ret, fd; + uint32_t vid; + char *buf; + bool deleted; + + DPRINTF("%" PRIx32 " is snapshot.\n", s->inode.vdi_id); + + buf = g_malloc(SD_INODE_SIZE); + + /* + * Even If deletion fails, we will just create extra snapshot based on + * the working VDI which was supposed to be deleted. So no need to + * false bail out. + */ + deleted = sd_delete(s); + ret = do_sd_create(s, &vid, !deleted, &local_err); + if (ret) { + error_report_err(local_err); + goto out; + } + + DPRINTF("%" PRIx32 " is created.\n", vid); + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + ret = fd; + goto out; + } + + ret = read_object(fd, s->aio_context, buf, vid_to_vdi_oid(vid), + s->inode.nr_copies, SD_INODE_SIZE, 0, s->cache_flags); + + closesocket(fd); + + if (ret < 0) { + goto out; + } + + memcpy(&s->inode, buf, sizeof(s->inode)); + + s->is_snapshot = false; + ret = 0; + DPRINTF("%" PRIx32 " was newly created.\n", s->inode.vdi_id); + +out: + g_free(buf); + + return ret; +} + +/* + * Send I/O requests to the server. + * + * This function sends requests to the server, links the requests to + * the inflight_list in BDRVSheepdogState, and exits without + * waiting the response. The responses are received in the + * `aio_read_response' function which is called from the main loop as + * a fd handler. + * + * Returns 1 when we need to wait a response, 0 when there is no sent + * request and -errno in error cases. + */ +static int coroutine_fn sd_co_rw_vector(void *p) +{ + SheepdogAIOCB *acb = p; + int ret = 0; + unsigned long len, done = 0, total = acb->nb_sectors * BDRV_SECTOR_SIZE; + unsigned long idx; + uint32_t object_size; + uint64_t oid; + uint64_t offset; + BDRVSheepdogState *s = acb->common.bs->opaque; + SheepdogInode *inode = &s->inode; + AIOReq *aio_req; + + if (acb->aiocb_type == AIOCB_WRITE_UDATA && s->is_snapshot) { + /* + * In the case we open the snapshot VDI, Sheepdog creates the + * writable VDI when we do a write operation first. + */ + ret = sd_create_branch(s); + if (ret) { + acb->ret = -EIO; + goto out; + } + } + + object_size = (UINT32_C(1) << inode->block_size_shift); + idx = acb->sector_num * BDRV_SECTOR_SIZE / object_size; + offset = (acb->sector_num * BDRV_SECTOR_SIZE) % object_size; + + /* + * Make sure we don't free the aiocb before we are done with all requests. + * This additional reference is dropped at the end of this function. + */ + acb->nr_pending++; + + while (done != total) { + uint8_t flags = 0; + uint64_t old_oid = 0; + bool create = false; + + oid = vid_to_data_oid(inode->data_vdi_id[idx], idx); + + len = MIN(total - done, object_size - offset); + + switch (acb->aiocb_type) { + case AIOCB_READ_UDATA: + if (!inode->data_vdi_id[idx]) { + qemu_iovec_memset(acb->qiov, done, 0, len); + goto done; + } + break; + case AIOCB_WRITE_UDATA: + if (!inode->data_vdi_id[idx]) { + create = true; + } else if (!is_data_obj_writable(inode, idx)) { + /* Copy-On-Write */ + create = true; + old_oid = oid; + flags = SD_FLAG_CMD_COW; + } + break; + case AIOCB_DISCARD_OBJ: + /* + * We discard the object only when the whole object is + * 1) allocated 2) trimmed. Otherwise, simply skip it. + */ + if (len != object_size || inode->data_vdi_id[idx] == 0) { + goto done; + } + break; + default: + break; + } + + if (create) { + DPRINTF("update ino (%" PRIu32 ") %" PRIu64 " %" PRIu64 " %ld\n", + inode->vdi_id, oid, + vid_to_data_oid(inode->data_vdi_id[idx], idx), idx); + oid = vid_to_data_oid(inode->vdi_id, idx); + DPRINTF("new oid %" PRIx64 "\n", oid); + } + + aio_req = alloc_aio_req(s, acb, oid, len, offset, flags, create, + old_oid, + acb->aiocb_type == AIOCB_DISCARD_OBJ ? + 0 : done); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + + add_aio_request(s, aio_req, acb->qiov->iov, acb->qiov->niov, + acb->aiocb_type); + done: + offset = 0; + idx++; + done += len; + } +out: + if (!--acb->nr_pending) { + return acb->ret; + } + return 1; +} + +static bool check_overlapping_aiocb(BDRVSheepdogState *s, SheepdogAIOCB *aiocb) +{ + SheepdogAIOCB *cb; + + QLIST_FOREACH(cb, &s->inflight_aiocb_head, aiocb_siblings) { + if (AIOCBOverlapping(aiocb, cb)) { + return true; + } + } + + QLIST_INSERT_HEAD(&s->inflight_aiocb_head, aiocb, aiocb_siblings); + return false; +} + +static coroutine_fn int sd_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + SheepdogAIOCB *acb; + int ret; + int64_t offset = (sector_num + nb_sectors) * BDRV_SECTOR_SIZE; + BDRVSheepdogState *s = bs->opaque; + + if (offset > s->inode.vdi_size) { + ret = sd_truncate(bs, offset); + if (ret < 0) { + return ret; + } + } + + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); + acb->aio_done_func = sd_write_done; + acb->aiocb_type = AIOCB_WRITE_UDATA; + +retry: + if (check_overlapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overlapping_queue); + goto retry; + } + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overlapping_queue); + qemu_aio_unref(acb); + return ret; + } + + qemu_coroutine_yield(); + + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overlapping_queue); + + return acb->ret; +} + +static coroutine_fn int sd_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + SheepdogAIOCB *acb; + int ret; + BDRVSheepdogState *s = bs->opaque; + + acb = sd_aio_setup(bs, qiov, sector_num, nb_sectors); + acb->aiocb_type = AIOCB_READ_UDATA; + acb->aio_done_func = sd_finish_aiocb; + +retry: + if (check_overlapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overlapping_queue); + goto retry; + } + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overlapping_queue); + qemu_aio_unref(acb); + return ret; + } + + qemu_coroutine_yield(); + + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overlapping_queue); + return acb->ret; +} + +static int coroutine_fn sd_co_flush_to_disk(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogAIOCB *acb; + AIOReq *aio_req; + + if (s->cache_flags != SD_FLAG_CMD_CACHE) { + return 0; + } + + acb = sd_aio_setup(bs, NULL, 0, 0); + acb->aiocb_type = AIOCB_FLUSH_CACHE; + acb->aio_done_func = sd_finish_aiocb; + + aio_req = alloc_aio_req(s, acb, vid_to_vdi_oid(s->inode.vdi_id), + 0, 0, 0, false, 0, 0); + QLIST_INSERT_HEAD(&s->inflight_aio_head, aio_req, aio_siblings); + add_aio_request(s, aio_req, NULL, 0, acb->aiocb_type); + + qemu_coroutine_yield(); + return acb->ret; +} + +static int sd_snapshot_create(BlockDriverState *bs, QEMUSnapshotInfo *sn_info) +{ + Error *local_err = NULL; + BDRVSheepdogState *s = bs->opaque; + int ret, fd; + uint32_t new_vid; + SheepdogInode *inode; + unsigned int datalen; + + DPRINTF("sn_info: name %s id_str %s s: name %s vm_state_size %" PRId64 " " + "is_snapshot %d\n", sn_info->name, sn_info->id_str, + s->name, sn_info->vm_state_size, s->is_snapshot); + + if (s->is_snapshot) { + error_report("You can't create a snapshot of a snapshot VDI, " + "%s (%" PRIu32 ").", s->name, s->inode.vdi_id); + + return -EINVAL; + } + + DPRINTF("%s %s\n", sn_info->name, sn_info->id_str); + + s->inode.vm_state_size = sn_info->vm_state_size; + s->inode.vm_clock_nsec = sn_info->vm_clock_nsec; + /* It appears that inode.tag does not require a NUL terminator, + * which means this use of strncpy is ok. + */ + strncpy(s->inode.tag, sn_info->name, sizeof(s->inode.tag)); + /* we don't need to update entire object */ + datalen = SD_INODE_SIZE - sizeof(s->inode.data_vdi_id); + inode = g_malloc(datalen); + + /* refresh inode. */ + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + ret = fd; + goto cleanup; + } + + ret = write_object(fd, s->aio_context, (char *)&s->inode, + vid_to_vdi_oid(s->inode.vdi_id), s->inode.nr_copies, + datalen, 0, false, s->cache_flags); + if (ret < 0) { + error_report("failed to write snapshot's inode."); + goto cleanup; + } + + ret = do_sd_create(s, &new_vid, 1, &local_err); + if (ret < 0) { + error_report("failed to create inode for snapshot: %s", + error_get_pretty(local_err)); + error_free(local_err); + goto cleanup; + } + + ret = read_object(fd, s->aio_context, (char *)inode, + vid_to_vdi_oid(new_vid), s->inode.nr_copies, datalen, 0, + s->cache_flags); + + if (ret < 0) { + error_report("failed to read new inode info. %s", strerror(errno)); + goto cleanup; + } + + memcpy(&s->inode, inode, datalen); + DPRINTF("s->inode: name %s snap_id %x oid %x\n", + s->inode.name, s->inode.snap_id, s->inode.vdi_id); + +cleanup: + g_free(inode); + closesocket(fd); + return ret; +} + +/* + * We implement rollback(loadvm) operation to the specified snapshot by + * 1) switch to the snapshot + * 2) rely on sd_create_branch to delete working VDI and + * 3) create a new working VDI based on the specified snapshot + */ +static int sd_snapshot_goto(BlockDriverState *bs, const char *snapshot_id) +{ + BDRVSheepdogState *s = bs->opaque; + BDRVSheepdogState *old_s; + char tag[SD_MAX_VDI_TAG_LEN]; + uint32_t snapid = 0; + int ret = 0; + + old_s = g_new(BDRVSheepdogState, 1); + + memcpy(old_s, s, sizeof(BDRVSheepdogState)); + + snapid = strtoul(snapshot_id, NULL, 10); + if (snapid) { + tag[0] = 0; + } else { + pstrcpy(tag, sizeof(tag), snapshot_id); + } + + ret = reload_inode(s, snapid, tag); + if (ret) { + goto out; + } + + ret = sd_create_branch(s); + if (ret) { + goto out; + } + + g_free(old_s); + + return 0; +out: + /* recover bdrv_sd_state */ + memcpy(s, old_s, sizeof(BDRVSheepdogState)); + g_free(old_s); + + error_report("failed to open. recover old bdrv_sd_state."); + + return ret; +} + +static int sd_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) +{ + /* FIXME: Delete specified snapshot id. */ + return 0; +} + +static int sd_snapshot_list(BlockDriverState *bs, QEMUSnapshotInfo **psn_tab) +{ + Error *local_err = NULL; + BDRVSheepdogState *s = bs->opaque; + SheepdogReq req; + int fd, nr = 1024, ret, max = BITS_TO_LONGS(SD_NR_VDIS) * sizeof(long); + QEMUSnapshotInfo *sn_tab = NULL; + unsigned wlen, rlen; + int found = 0; + static SheepdogInode inode; + unsigned long *vdi_inuse; + unsigned int start_nr; + uint64_t hval; + uint32_t vid; + + vdi_inuse = g_malloc(max); + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + ret = fd; + goto out; + } + + rlen = max; + wlen = 0; + + memset(&req, 0, sizeof(req)); + + req.opcode = SD_OP_READ_VDIS; + req.data_length = max; + + ret = do_req(fd, s->aio_context, (SheepdogReq *)&req, + vdi_inuse, &wlen, &rlen); + + closesocket(fd); + if (ret) { + goto out; + } + + sn_tab = g_new0(QEMUSnapshotInfo, nr); + + /* calculate a vdi id with hash function */ + hval = fnv_64a_buf(s->name, strlen(s->name), FNV1A_64_INIT); + start_nr = hval & (SD_NR_VDIS - 1); + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + ret = fd; + goto out; + } + + for (vid = start_nr; found < nr; vid = (vid + 1) % SD_NR_VDIS) { + if (!test_bit(vid, vdi_inuse)) { + break; + } + + /* we don't need to read entire object */ + ret = read_object(fd, s->aio_context, (char *)&inode, + vid_to_vdi_oid(vid), + 0, SD_INODE_SIZE - sizeof(inode.data_vdi_id), 0, + s->cache_flags); + + if (ret) { + continue; + } + + if (!strcmp(inode.name, s->name) && is_snapshot(&inode)) { + sn_tab[found].date_sec = inode.snap_ctime >> 32; + sn_tab[found].date_nsec = inode.snap_ctime & 0xffffffff; + sn_tab[found].vm_state_size = inode.vm_state_size; + sn_tab[found].vm_clock_nsec = inode.vm_clock_nsec; + + snprintf(sn_tab[found].id_str, sizeof(sn_tab[found].id_str), + "%" PRIu32, inode.snap_id); + pstrcpy(sn_tab[found].name, + MIN(sizeof(sn_tab[found].name), sizeof(inode.tag)), + inode.tag); + found++; + } + } + + closesocket(fd); +out: + *psn_tab = sn_tab; + + g_free(vdi_inuse); + + if (ret < 0) { + return ret; + } + + return found; +} + +static int do_load_save_vmstate(BDRVSheepdogState *s, uint8_t *data, + int64_t pos, int size, int load) +{ + Error *local_err = NULL; + bool create; + int fd, ret = 0, remaining = size; + unsigned int data_len; + uint64_t vmstate_oid; + uint64_t offset; + uint32_t vdi_index; + uint32_t vdi_id = load ? s->inode.parent_vdi_id : s->inode.vdi_id; + uint32_t object_size = (UINT32_C(1) << s->inode.block_size_shift); + + fd = connect_to_sdog(s, &local_err); + if (fd < 0) { + error_report_err(local_err); + return fd; + } + + while (remaining) { + vdi_index = pos / object_size; + offset = pos % object_size; + + data_len = MIN(remaining, object_size - offset); + + vmstate_oid = vid_to_vmstate_oid(vdi_id, vdi_index); + + create = (offset == 0); + if (load) { + ret = read_object(fd, s->aio_context, (char *)data, vmstate_oid, + s->inode.nr_copies, data_len, offset, + s->cache_flags); + } else { + ret = write_object(fd, s->aio_context, (char *)data, vmstate_oid, + s->inode.nr_copies, data_len, offset, create, + s->cache_flags); + } + + if (ret < 0) { + error_report("failed to save vmstate %s", strerror(errno)); + goto cleanup; + } + + pos += data_len; + data += data_len; + remaining -= data_len; + } + ret = size; +cleanup: + closesocket(fd); + return ret; +} + +static int sd_save_vmstate(BlockDriverState *bs, QEMUIOVector *qiov, + int64_t pos) +{ + BDRVSheepdogState *s = bs->opaque; + void *buf; + int ret; + + buf = qemu_blockalign(bs, qiov->size); + qemu_iovec_to_buf(qiov, 0, buf, qiov->size); + ret = do_load_save_vmstate(s, (uint8_t *) buf, pos, qiov->size, 0); + qemu_vfree(buf); + + return ret; +} + +static int sd_load_vmstate(BlockDriverState *bs, uint8_t *data, + int64_t pos, int size) +{ + BDRVSheepdogState *s = bs->opaque; + + return do_load_save_vmstate(s, data, pos, size, 1); +} + + +static coroutine_fn int sd_co_discard(BlockDriverState *bs, int64_t sector_num, + int nb_sectors) +{ + SheepdogAIOCB *acb; + BDRVSheepdogState *s = bs->opaque; + int ret; + QEMUIOVector discard_iov; + struct iovec iov; + uint32_t zero = 0; + + if (!s->discard_supported) { + return 0; + } + + memset(&discard_iov, 0, sizeof(discard_iov)); + memset(&iov, 0, sizeof(iov)); + iov.iov_base = &zero; + iov.iov_len = sizeof(zero); + discard_iov.iov = &iov; + discard_iov.niov = 1; + acb = sd_aio_setup(bs, &discard_iov, sector_num, nb_sectors); + acb->aiocb_type = AIOCB_DISCARD_OBJ; + acb->aio_done_func = sd_finish_aiocb; + +retry: + if (check_overlapping_aiocb(s, acb)) { + qemu_co_queue_wait(&s->overlapping_queue); + goto retry; + } + + ret = sd_co_rw_vector(acb); + if (ret <= 0) { + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overlapping_queue); + qemu_aio_unref(acb); + return ret; + } + + qemu_coroutine_yield(); + + QLIST_REMOVE(acb, aiocb_siblings); + qemu_co_queue_restart_all(&s->overlapping_queue); + + return acb->ret; +} + +static coroutine_fn int64_t +sd_co_get_block_status(BlockDriverState *bs, int64_t sector_num, int nb_sectors, + int *pnum) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); + uint64_t offset = sector_num * BDRV_SECTOR_SIZE; + unsigned long start = offset / object_size, + end = DIV_ROUND_UP((sector_num + nb_sectors) * + BDRV_SECTOR_SIZE, object_size); + unsigned long idx; + int64_t ret = BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; + + for (idx = start; idx < end; idx++) { + if (inode->data_vdi_id[idx] == 0) { + break; + } + } + if (idx == start) { + /* Get the longest length of unallocated sectors */ + ret = 0; + for (idx = start + 1; idx < end; idx++) { + if (inode->data_vdi_id[idx] != 0) { + break; + } + } + } + + *pnum = (idx - start) * object_size / BDRV_SECTOR_SIZE; + if (*pnum > nb_sectors) { + *pnum = nb_sectors; + } + return ret; +} + +static int64_t sd_get_allocated_file_size(BlockDriverState *bs) +{ + BDRVSheepdogState *s = bs->opaque; + SheepdogInode *inode = &s->inode; + uint32_t object_size = (UINT32_C(1) << inode->block_size_shift); + unsigned long i, last = DIV_ROUND_UP(inode->vdi_size, object_size); + uint64_t size = 0; + + for (i = 0; i < last; i++) { + if (inode->data_vdi_id[i] == 0) { + continue; + } + size += object_size; + } + return size; +} + +static QemuOptsList sd_create_opts = { + .name = "sheepdog-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(sd_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_PREALLOC, + .type = QEMU_OPT_STRING, + .help = "Preallocation mode (allowed values: off, full)" + }, + { + .name = BLOCK_OPT_REDUNDANCY, + .type = QEMU_OPT_STRING, + .help = "Redundancy of the image" + }, + { + .name = BLOCK_OPT_OBJECT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Object size of the image" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_sheepdog = { + .format_name = "sheepdog", + .protocol_name = "sheepdog", + .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, + .bdrv_file_open = sd_open, + .bdrv_reopen_prepare = sd_reopen_prepare, + .bdrv_reopen_commit = sd_reopen_commit, + .bdrv_reopen_abort = sd_reopen_abort, + .bdrv_close = sd_close, + .bdrv_create = sd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, + .bdrv_truncate = sd_truncate, + + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, + .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, + .bdrv_co_get_block_status = sd_co_get_block_status, + + .bdrv_snapshot_create = sd_snapshot_create, + .bdrv_snapshot_goto = sd_snapshot_goto, + .bdrv_snapshot_delete = sd_snapshot_delete, + .bdrv_snapshot_list = sd_snapshot_list, + + .bdrv_save_vmstate = sd_save_vmstate, + .bdrv_load_vmstate = sd_load_vmstate, + + .bdrv_detach_aio_context = sd_detach_aio_context, + .bdrv_attach_aio_context = sd_attach_aio_context, + + .create_opts = &sd_create_opts, +}; + +static BlockDriver bdrv_sheepdog_tcp = { + .format_name = "sheepdog", + .protocol_name = "sheepdog+tcp", + .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, + .bdrv_file_open = sd_open, + .bdrv_reopen_prepare = sd_reopen_prepare, + .bdrv_reopen_commit = sd_reopen_commit, + .bdrv_reopen_abort = sd_reopen_abort, + .bdrv_close = sd_close, + .bdrv_create = sd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, + .bdrv_truncate = sd_truncate, + + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, + .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, + .bdrv_co_get_block_status = sd_co_get_block_status, + + .bdrv_snapshot_create = sd_snapshot_create, + .bdrv_snapshot_goto = sd_snapshot_goto, + .bdrv_snapshot_delete = sd_snapshot_delete, + .bdrv_snapshot_list = sd_snapshot_list, + + .bdrv_save_vmstate = sd_save_vmstate, + .bdrv_load_vmstate = sd_load_vmstate, + + .bdrv_detach_aio_context = sd_detach_aio_context, + .bdrv_attach_aio_context = sd_attach_aio_context, + + .create_opts = &sd_create_opts, +}; + +static BlockDriver bdrv_sheepdog_unix = { + .format_name = "sheepdog", + .protocol_name = "sheepdog+unix", + .instance_size = sizeof(BDRVSheepdogState), + .bdrv_needs_filename = true, + .bdrv_file_open = sd_open, + .bdrv_reopen_prepare = sd_reopen_prepare, + .bdrv_reopen_commit = sd_reopen_commit, + .bdrv_reopen_abort = sd_reopen_abort, + .bdrv_close = sd_close, + .bdrv_create = sd_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_getlength = sd_getlength, + .bdrv_get_allocated_file_size = sd_get_allocated_file_size, + .bdrv_truncate = sd_truncate, + + .bdrv_co_readv = sd_co_readv, + .bdrv_co_writev = sd_co_writev, + .bdrv_co_flush_to_disk = sd_co_flush_to_disk, + .bdrv_co_discard = sd_co_discard, + .bdrv_co_get_block_status = sd_co_get_block_status, + + .bdrv_snapshot_create = sd_snapshot_create, + .bdrv_snapshot_goto = sd_snapshot_goto, + .bdrv_snapshot_delete = sd_snapshot_delete, + .bdrv_snapshot_list = sd_snapshot_list, + + .bdrv_save_vmstate = sd_save_vmstate, + .bdrv_load_vmstate = sd_load_vmstate, + + .bdrv_detach_aio_context = sd_detach_aio_context, + .bdrv_attach_aio_context = sd_attach_aio_context, + + .create_opts = &sd_create_opts, +}; + +static void bdrv_sheepdog_init(void) +{ + bdrv_register(&bdrv_sheepdog); + bdrv_register(&bdrv_sheepdog_tcp); + bdrv_register(&bdrv_sheepdog_unix); +} +block_init(bdrv_sheepdog_init); diff --git a/src/block/snapshot.c b/src/block/snapshot.c new file mode 100644 index 0000000..6e9fa8d --- /dev/null +++ b/src/block/snapshot.c @@ -0,0 +1,486 @@ +/* + * Block layer snapshot related functions + * + * Copyright (c) 2003-2008 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "block/snapshot.h" +#include "block/block_int.h" +#include "qapi/qmp/qerror.h" + +QemuOptsList internal_snapshot_opts = { + .name = "snapshot", + .head = QTAILQ_HEAD_INITIALIZER(internal_snapshot_opts.head), + .desc = { + { + .name = SNAPSHOT_OPT_ID, + .type = QEMU_OPT_STRING, + .help = "snapshot id" + },{ + .name = SNAPSHOT_OPT_NAME, + .type = QEMU_OPT_STRING, + .help = "snapshot name" + },{ + /* end of list */ + } + }, +}; + +int bdrv_snapshot_find(BlockDriverState *bs, QEMUSnapshotInfo *sn_info, + const char *name) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i, ret; + + ret = -ENOENT; + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + return ret; + } + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, name) || !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = 0; + break; + } + } + g_free(sn_tab); + return ret; +} + +/** + * Look up an internal snapshot by @id and @name. + * @bs: block device to search + * @id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @sn_info: location to store information on the snapshot found + * @errp: location to store error, will be set only for exception + * + * This function will traverse snapshot list in @bs to search the matching + * one, @id and @name are the matching condition: + * If both @id and @name are specified, find the first one with id @id and + * name @name. + * If only @id is specified, find the first one with id @id. + * If only @name is specified, find the first one with name @name. + * if none is specified, abort(). + * + * Returns: true when a snapshot is found and @sn_info will be filled, false + * when error or not found. If all operation succeed but no matching one is + * found, @errp will NOT be set. + */ +bool bdrv_snapshot_find_by_id_and_name(BlockDriverState *bs, + const char *id, + const char *name, + QEMUSnapshotInfo *sn_info, + Error **errp) +{ + QEMUSnapshotInfo *sn_tab, *sn; + int nb_sns, i; + bool ret = false; + + assert(id || name); + + nb_sns = bdrv_snapshot_list(bs, &sn_tab); + if (nb_sns < 0) { + error_setg_errno(errp, -nb_sns, "Failed to get a snapshot list"); + return false; + } else if (nb_sns == 0) { + return false; + } + + if (id && name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id) && !strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (id) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->id_str, id)) { + *sn_info = *sn; + ret = true; + break; + } + } + } else if (name) { + for (i = 0; i < nb_sns; i++) { + sn = &sn_tab[i]; + if (!strcmp(sn->name, name)) { + *sn_info = *sn; + ret = true; + break; + } + } + } + + g_free(sn_tab); + return ret; +} + +int bdrv_can_snapshot(BlockDriverState *bs) +{ + BlockDriver *drv = bs->drv; + if (!drv || !bdrv_is_inserted(bs) || bdrv_is_read_only(bs)) { + return 0; + } + + if (!drv->bdrv_snapshot_create) { + if (bs->file != NULL) { + return bdrv_can_snapshot(bs->file->bs); + } + return 0; + } + + return 1; +} + +int bdrv_snapshot_create(BlockDriverState *bs, + QEMUSnapshotInfo *sn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_create) { + return drv->bdrv_snapshot_create(bs, sn_info); + } + if (bs->file) { + return bdrv_snapshot_create(bs->file->bs, sn_info); + } + return -ENOTSUP; +} + +int bdrv_snapshot_goto(BlockDriverState *bs, + const char *snapshot_id) +{ + BlockDriver *drv = bs->drv; + int ret, open_ret; + + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_goto) { + return drv->bdrv_snapshot_goto(bs, snapshot_id); + } + + if (bs->file) { + drv->bdrv_close(bs); + ret = bdrv_snapshot_goto(bs->file->bs, snapshot_id); + open_ret = drv->bdrv_open(bs, NULL, bs->open_flags, NULL); + if (open_ret < 0) { + bdrv_unref(bs->file->bs); + bs->drv = NULL; + return open_ret; + } + return ret; + } + + return -ENOTSUP; +} + +/** + * Delete an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, delete the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, delete the first one with id + * @snapshot_id. + * If only @name is specified, delete the first one with name @name. + * if none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on failure. If @bs is not inserted, return + * -ENOMEDIUM. If @snapshot_id and @name are both NULL, return -EINVAL. If @bs + * does not support internal snapshot deletion, return -ENOTSUP. If @bs does + * not support parameter @snapshot_id or @name, or one of them is not correctly + * specified, return -EINVAL. If @bs can't find one matching @id and @name, + * return -ENOENT. If @errp != NULL, it will always be filled with error + * message on failure. + */ +int bdrv_snapshot_delete(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); + return -ENOMEDIUM; + } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } + + /* drain all pending i/o before deleting snapshot */ + bdrv_drain(bs); + + if (drv->bdrv_snapshot_delete) { + return drv->bdrv_snapshot_delete(bs, snapshot_id, name, errp); + } + if (bs->file) { + return bdrv_snapshot_delete(bs->file->bs, snapshot_id, name, errp); + } + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support internal snapshot deletion", + drv->format_name, bdrv_get_device_name(bs)); + return -ENOTSUP; +} + +int bdrv_snapshot_delete_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_delete(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_delete(bs, NULL, id_or_name, &local_err); + } + + if (ret < 0) { + error_propagate(errp, local_err); + } + return ret; +} + +int bdrv_snapshot_list(BlockDriverState *bs, + QEMUSnapshotInfo **psn_info) +{ + BlockDriver *drv = bs->drv; + if (!drv) { + return -ENOMEDIUM; + } + if (drv->bdrv_snapshot_list) { + return drv->bdrv_snapshot_list(bs, psn_info); + } + if (bs->file) { + return bdrv_snapshot_list(bs->file->bs, psn_info); + } + return -ENOTSUP; +} + +/** + * Temporarily load an internal snapshot by @snapshot_id and @name. + * @bs: block device used in the operation + * @snapshot_id: unique snapshot ID, or NULL + * @name: snapshot name, or NULL + * @errp: location to store error + * + * If both @snapshot_id and @name are specified, load the first one with + * id @snapshot_id and name @name. + * If only @snapshot_id is specified, load the first one with id + * @snapshot_id. + * If only @name is specified, load the first one with name @name. + * if none is specified, return -EINVAL. + * + * Returns: 0 on success, -errno on fail. If @bs is not inserted, return + * -ENOMEDIUM. If @bs is not readonly, return -EINVAL. If @bs did not support + * internal snapshot, return -ENOTSUP. If qemu can't find a matching @id and + * @name, return -ENOENT. If @errp != NULL, it will always be filled on + * failure. + */ +int bdrv_snapshot_load_tmp(BlockDriverState *bs, + const char *snapshot_id, + const char *name, + Error **errp) +{ + BlockDriver *drv = bs->drv; + + if (!drv) { + error_setg(errp, QERR_DEVICE_HAS_NO_MEDIUM, bdrv_get_device_name(bs)); + return -ENOMEDIUM; + } + if (!snapshot_id && !name) { + error_setg(errp, "snapshot_id and name are both NULL"); + return -EINVAL; + } + if (!bs->read_only) { + error_setg(errp, "Device is not readonly"); + return -EINVAL; + } + if (drv->bdrv_snapshot_load_tmp) { + return drv->bdrv_snapshot_load_tmp(bs, snapshot_id, name, errp); + } + error_setg(errp, "Block format '%s' used by device '%s' " + "does not support temporarily loading internal snapshots", + drv->format_name, bdrv_get_device_name(bs)); + return -ENOTSUP; +} + +int bdrv_snapshot_load_tmp_by_id_or_name(BlockDriverState *bs, + const char *id_or_name, + Error **errp) +{ + int ret; + Error *local_err = NULL; + + ret = bdrv_snapshot_load_tmp(bs, id_or_name, NULL, &local_err); + if (ret == -ENOENT || ret == -EINVAL) { + error_free(local_err); + local_err = NULL; + ret = bdrv_snapshot_load_tmp(bs, NULL, id_or_name, &local_err); + } + + if (local_err) { + error_propagate(errp, local_err); + } + + return ret; +} + + +/* Group operations. All block drivers are involved. + * These functions will properly handle dataplane (take aio_context_acquire + * when appropriate for appropriate block drivers) */ + +bool bdrv_all_can_snapshot(BlockDriverState **first_bad_bs) +{ + bool ok = true; + BlockDriverState *bs = NULL; + + while (ok && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_is_inserted(bs) && !bdrv_is_read_only(bs)) { + ok = bdrv_can_snapshot(bs); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return ok; +} + +int bdrv_all_delete_snapshot(const char *name, BlockDriverState **first_bad_bs, + Error **err) +{ + int ret = 0; + BlockDriverState *bs = NULL; + QEMUSnapshotInfo sn1, *snapshot = &sn1; + + while (ret == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_can_snapshot(bs) && + bdrv_snapshot_find(bs, snapshot, name) >= 0) { + ret = bdrv_snapshot_delete_by_id_or_name(bs, name, err); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return ret; +} + + +int bdrv_all_goto_snapshot(const char *name, BlockDriverState **first_bad_bs) +{ + int err = 0; + BlockDriverState *bs = NULL; + + while (err == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_can_snapshot(bs)) { + err = bdrv_snapshot_goto(bs, name); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return err; +} + +int bdrv_all_find_snapshot(const char *name, BlockDriverState **first_bad_bs) +{ + QEMUSnapshotInfo sn; + int err = 0; + BlockDriverState *bs = NULL; + + while (err == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bdrv_can_snapshot(bs)) { + err = bdrv_snapshot_find(bs, &sn, name); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return err; +} + +int bdrv_all_create_snapshot(QEMUSnapshotInfo *sn, + BlockDriverState *vm_state_bs, + uint64_t vm_state_size, + BlockDriverState **first_bad_bs) +{ + int err = 0; + BlockDriverState *bs = NULL; + + while (err == 0 && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + if (bs == vm_state_bs) { + sn->vm_state_size = vm_state_size; + err = bdrv_snapshot_create(bs, sn); + } else if (bdrv_can_snapshot(bs)) { + sn->vm_state_size = 0; + err = bdrv_snapshot_create(bs, sn); + } + aio_context_release(ctx); + } + + *first_bad_bs = bs; + return err; +} + +BlockDriverState *bdrv_all_find_vmstate_bs(void) +{ + bool not_found = true; + BlockDriverState *bs = NULL; + + while (not_found && (bs = bdrv_next(bs))) { + AioContext *ctx = bdrv_get_aio_context(bs); + + aio_context_acquire(ctx); + not_found = !bdrv_can_snapshot(bs); + aio_context_release(ctx); + } + return bs; +} diff --git a/src/block/ssh.c b/src/block/ssh.c new file mode 100644 index 0000000..af025c0 --- /dev/null +++ b/src/block/ssh.c @@ -0,0 +1,1112 @@ +/* + * Secure Shell (ssh) backend for QEMU. + * + * Copyright (C) 2013 Red Hat Inc., Richard W.M. Jones <rjones@redhat.com> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> + +#include <libssh2.h> +#include <libssh2_sftp.h> + +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/sockets.h" +#include "qemu/uri.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qstring.h" + +/* DEBUG_SSH=1 enables the DPRINTF (debugging printf) statements in + * this block driver code. + * + * TRACE_LIBSSH2=<bitmask> enables tracing in libssh2 itself. Note + * that this requires that libssh2 was specially compiled with the + * `./configure --enable-debug' option, so most likely you will have + * to compile it yourself. The meaning of <bitmask> is described + * here: http://www.libssh2.org/libssh2_trace.html + */ +#define DEBUG_SSH 0 +#define TRACE_LIBSSH2 0 /* or try: LIBSSH2_TRACE_SFTP */ + +#define DPRINTF(fmt, ...) \ + do { \ + if (DEBUG_SSH) { \ + fprintf(stderr, "ssh: %-15s " fmt "\n", \ + __func__, ##__VA_ARGS__); \ + } \ + } while (0) + +typedef struct BDRVSSHState { + /* Coroutine. */ + CoMutex lock; + + /* SSH connection. */ + int sock; /* socket */ + LIBSSH2_SESSION *session; /* ssh session */ + LIBSSH2_SFTP *sftp; /* sftp session */ + LIBSSH2_SFTP_HANDLE *sftp_handle; /* sftp remote file handle */ + + /* See ssh_seek() function below. */ + int64_t offset; + bool offset_op_read; + + /* File attributes at open. We try to keep the .filesize field + * updated if it changes (eg by writing at the end of the file). + */ + LIBSSH2_SFTP_ATTRIBUTES attrs; + + /* Used to warn if 'flush' is not supported. */ + char *hostport; + bool unsafe_flush_warning; +} BDRVSSHState; + +static void ssh_state_init(BDRVSSHState *s) +{ + memset(s, 0, sizeof *s); + s->sock = -1; + s->offset = -1; + qemu_co_mutex_init(&s->lock); +} + +static void ssh_state_free(BDRVSSHState *s) +{ + g_free(s->hostport); + if (s->sftp_handle) { + libssh2_sftp_close(s->sftp_handle); + } + if (s->sftp) { + libssh2_sftp_shutdown(s->sftp); + } + if (s->session) { + libssh2_session_disconnect(s->session, + "from qemu ssh client: " + "user closed the connection"); + libssh2_session_free(s->session); + } + if (s->sock >= 0) { + close(s->sock); + } +} + +static void GCC_FMT_ATTR(3, 4) +session_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) +{ + va_list args; + char *msg; + + va_start(args, fs); + msg = g_strdup_vprintf(fs, args); + va_end(args); + + if (s->session) { + char *ssh_err; + int ssh_err_code; + + /* This is not an errno. See <libssh2.h>. */ + ssh_err_code = libssh2_session_last_error(s->session, + &ssh_err, NULL, 0); + error_setg(errp, "%s: %s (libssh2 error code: %d)", + msg, ssh_err, ssh_err_code); + } else { + error_setg(errp, "%s", msg); + } + g_free(msg); +} + +static void GCC_FMT_ATTR(3, 4) +sftp_error_setg(Error **errp, BDRVSSHState *s, const char *fs, ...) +{ + va_list args; + char *msg; + + va_start(args, fs); + msg = g_strdup_vprintf(fs, args); + va_end(args); + + if (s->sftp) { + char *ssh_err; + int ssh_err_code; + unsigned long sftp_err_code; + + /* This is not an errno. See <libssh2.h>. */ + ssh_err_code = libssh2_session_last_error(s->session, + &ssh_err, NULL, 0); + /* See <libssh2_sftp.h>. */ + sftp_err_code = libssh2_sftp_last_error((s)->sftp); + + error_setg(errp, + "%s: %s (libssh2 error code: %d, sftp error code: %lu)", + msg, ssh_err, ssh_err_code, sftp_err_code); + } else { + error_setg(errp, "%s", msg); + } + g_free(msg); +} + +static void GCC_FMT_ATTR(2, 3) +sftp_error_report(BDRVSSHState *s, const char *fs, ...) +{ + va_list args; + + va_start(args, fs); + error_vprintf(fs, args); + + if ((s)->sftp) { + char *ssh_err; + int ssh_err_code; + unsigned long sftp_err_code; + + /* This is not an errno. See <libssh2.h>. */ + ssh_err_code = libssh2_session_last_error(s->session, + &ssh_err, NULL, 0); + /* See <libssh2_sftp.h>. */ + sftp_err_code = libssh2_sftp_last_error((s)->sftp); + + error_printf(": %s (libssh2 error code: %d, sftp error code: %lu)", + ssh_err, ssh_err_code, sftp_err_code); + } + + va_end(args); + error_printf("\n"); +} + +static int parse_uri(const char *filename, QDict *options, Error **errp) +{ + URI *uri = NULL; + QueryParams *qp; + int i; + + uri = uri_parse(filename); + if (!uri) { + return -EINVAL; + } + + if (strcmp(uri->scheme, "ssh") != 0) { + error_setg(errp, "URI scheme must be 'ssh'"); + goto err; + } + + if (!uri->server || strcmp(uri->server, "") == 0) { + error_setg(errp, "missing hostname in URI"); + goto err; + } + + if (!uri->path || strcmp(uri->path, "") == 0) { + error_setg(errp, "missing remote path in URI"); + goto err; + } + + qp = query_params_parse(uri->query); + if (!qp) { + error_setg(errp, "could not parse query parameters"); + goto err; + } + + if(uri->user && strcmp(uri->user, "") != 0) { + qdict_put(options, "user", qstring_from_str(uri->user)); + } + + qdict_put(options, "host", qstring_from_str(uri->server)); + + if (uri->port) { + qdict_put(options, "port", qint_from_int(uri->port)); + } + + qdict_put(options, "path", qstring_from_str(uri->path)); + + /* Pick out any query parameters that we understand, and ignore + * the rest. + */ + for (i = 0; i < qp->n; ++i) { + if (strcmp(qp->p[i].name, "host_key_check") == 0) { + qdict_put(options, "host_key_check", + qstring_from_str(qp->p[i].value)); + } + } + + query_params_free(qp); + uri_free(uri); + return 0; + + err: + if (uri) { + uri_free(uri); + } + return -EINVAL; +} + +static void ssh_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + if (qdict_haskey(options, "user") || + qdict_haskey(options, "host") || + qdict_haskey(options, "port") || + qdict_haskey(options, "path") || + qdict_haskey(options, "host_key_check")) { + error_setg(errp, "user, host, port, path, host_key_check cannot be used at the same time as a file option"); + return; + } + + parse_uri(filename, options, errp); +} + +static int check_host_key_knownhosts(BDRVSSHState *s, + const char *host, int port, Error **errp) +{ + const char *home; + char *knh_file = NULL; + LIBSSH2_KNOWNHOSTS *knh = NULL; + struct libssh2_knownhost *found; + int ret, r; + const char *hostkey; + size_t len; + int type; + + hostkey = libssh2_session_hostkey(s->session, &len, &type); + if (!hostkey) { + ret = -EINVAL; + session_error_setg(errp, s, "failed to read remote host key"); + goto out; + } + + knh = libssh2_knownhost_init(s->session); + if (!knh) { + ret = -EINVAL; + session_error_setg(errp, s, + "failed to initialize known hosts support"); + goto out; + } + + home = getenv("HOME"); + if (home) { + knh_file = g_strdup_printf("%s/.ssh/known_hosts", home); + } else { + knh_file = g_strdup_printf("/root/.ssh/known_hosts"); + } + + /* Read all known hosts from OpenSSH-style known_hosts file. */ + libssh2_knownhost_readfile(knh, knh_file, LIBSSH2_KNOWNHOST_FILE_OPENSSH); + + r = libssh2_knownhost_checkp(knh, host, port, hostkey, len, + LIBSSH2_KNOWNHOST_TYPE_PLAIN| + LIBSSH2_KNOWNHOST_KEYENC_RAW, + &found); + switch (r) { + case LIBSSH2_KNOWNHOST_CHECK_MATCH: + /* OK */ + DPRINTF("host key OK: %s", found->key); + break; + case LIBSSH2_KNOWNHOST_CHECK_MISMATCH: + ret = -EINVAL; + session_error_setg(errp, s, + "host key does not match the one in known_hosts" + " (found key %s)", found->key); + goto out; + case LIBSSH2_KNOWNHOST_CHECK_NOTFOUND: + ret = -EINVAL; + session_error_setg(errp, s, "no host key was found in known_hosts"); + goto out; + case LIBSSH2_KNOWNHOST_CHECK_FAILURE: + ret = -EINVAL; + session_error_setg(errp, s, + "failure matching the host key with known_hosts"); + goto out; + default: + ret = -EINVAL; + session_error_setg(errp, s, "unknown error matching the host key" + " with known_hosts (%d)", r); + goto out; + } + + /* known_hosts checking successful. */ + ret = 0; + + out: + if (knh != NULL) { + libssh2_knownhost_free(knh); + } + g_free(knh_file); + return ret; +} + +static unsigned hex2decimal(char ch) +{ + if (ch >= '0' && ch <= '9') { + return (ch - '0'); + } else if (ch >= 'a' && ch <= 'f') { + return 10 + (ch - 'a'); + } else if (ch >= 'A' && ch <= 'F') { + return 10 + (ch - 'A'); + } + + return -1; +} + +/* Compare the binary fingerprint (hash of host key) with the + * host_key_check parameter. + */ +static int compare_fingerprint(const unsigned char *fingerprint, size_t len, + const char *host_key_check) +{ + unsigned c; + + while (len > 0) { + while (*host_key_check == ':') + host_key_check++; + if (!qemu_isxdigit(host_key_check[0]) || + !qemu_isxdigit(host_key_check[1])) + return 1; + c = hex2decimal(host_key_check[0]) * 16 + + hex2decimal(host_key_check[1]); + if (c - *fingerprint != 0) + return c - *fingerprint; + fingerprint++; + len--; + host_key_check += 2; + } + return *host_key_check - '\0'; +} + +static int +check_host_key_hash(BDRVSSHState *s, const char *hash, + int hash_type, size_t fingerprint_len, Error **errp) +{ + const char *fingerprint; + + fingerprint = libssh2_hostkey_hash(s->session, hash_type); + if (!fingerprint) { + session_error_setg(errp, s, "failed to read remote host key"); + return -EINVAL; + } + + if(compare_fingerprint((unsigned char *) fingerprint, fingerprint_len, + hash) != 0) { + error_setg(errp, "remote host key does not match host_key_check '%s'", + hash); + return -EPERM; + } + + return 0; +} + +static int check_host_key(BDRVSSHState *s, const char *host, int port, + const char *host_key_check, Error **errp) +{ + /* host_key_check=no */ + if (strcmp(host_key_check, "no") == 0) { + return 0; + } + + /* host_key_check=md5:xx:yy:zz:... */ + if (strncmp(host_key_check, "md5:", 4) == 0) { + return check_host_key_hash(s, &host_key_check[4], + LIBSSH2_HOSTKEY_HASH_MD5, 16, errp); + } + + /* host_key_check=sha1:xx:yy:zz:... */ + if (strncmp(host_key_check, "sha1:", 5) == 0) { + return check_host_key_hash(s, &host_key_check[5], + LIBSSH2_HOSTKEY_HASH_SHA1, 20, errp); + } + + /* host_key_check=yes */ + if (strcmp(host_key_check, "yes") == 0) { + return check_host_key_knownhosts(s, host, port, errp); + } + + error_setg(errp, "unknown host_key_check setting (%s)", host_key_check); + return -EINVAL; +} + +static int authenticate(BDRVSSHState *s, const char *user, Error **errp) +{ + int r, ret; + const char *userauthlist; + LIBSSH2_AGENT *agent = NULL; + struct libssh2_agent_publickey *identity; + struct libssh2_agent_publickey *prev_identity = NULL; + + userauthlist = libssh2_userauth_list(s->session, user, strlen(user)); + if (strstr(userauthlist, "publickey") == NULL) { + ret = -EPERM; + error_setg(errp, + "remote server does not support \"publickey\" authentication"); + goto out; + } + + /* Connect to ssh-agent and try each identity in turn. */ + agent = libssh2_agent_init(s->session); + if (!agent) { + ret = -EINVAL; + session_error_setg(errp, s, "failed to initialize ssh-agent support"); + goto out; + } + if (libssh2_agent_connect(agent)) { + ret = -ECONNREFUSED; + session_error_setg(errp, s, "failed to connect to ssh-agent"); + goto out; + } + if (libssh2_agent_list_identities(agent)) { + ret = -EINVAL; + session_error_setg(errp, s, + "failed requesting identities from ssh-agent"); + goto out; + } + + for(;;) { + r = libssh2_agent_get_identity(agent, &identity, prev_identity); + if (r == 1) { /* end of list */ + break; + } + if (r < 0) { + ret = -EINVAL; + session_error_setg(errp, s, + "failed to obtain identity from ssh-agent"); + goto out; + } + r = libssh2_agent_userauth(agent, user, identity); + if (r == 0) { + /* Authenticated! */ + ret = 0; + goto out; + } + /* Failed to authenticate with this identity, try the next one. */ + prev_identity = identity; + } + + ret = -EPERM; + error_setg(errp, "failed to authenticate using publickey authentication " + "and the identities held by your ssh-agent"); + + out: + if (agent != NULL) { + /* Note: libssh2 implementation implicitly calls + * libssh2_agent_disconnect if necessary. + */ + libssh2_agent_free(agent); + } + + return ret; +} + +static int connect_to_ssh(BDRVSSHState *s, QDict *options, + int ssh_flags, int creat_mode, Error **errp) +{ + int r, ret; + const char *host, *user, *path, *host_key_check; + int port; + + if (!qdict_haskey(options, "host")) { + ret = -EINVAL; + error_setg(errp, "No hostname was specified"); + goto err; + } + host = qdict_get_str(options, "host"); + + if (qdict_haskey(options, "port")) { + port = qdict_get_int(options, "port"); + } else { + port = 22; + } + + if (!qdict_haskey(options, "path")) { + ret = -EINVAL; + error_setg(errp, "No path was specified"); + goto err; + } + path = qdict_get_str(options, "path"); + + if (qdict_haskey(options, "user")) { + user = qdict_get_str(options, "user"); + } else { + user = g_get_user_name(); + if (!user) { + error_setg_errno(errp, errno, "Can't get user name"); + ret = -errno; + goto err; + } + } + + if (qdict_haskey(options, "host_key_check")) { + host_key_check = qdict_get_str(options, "host_key_check"); + } else { + host_key_check = "yes"; + } + + /* Construct the host:port name for inet_connect. */ + g_free(s->hostport); + s->hostport = g_strdup_printf("%s:%d", host, port); + + /* Open the socket and connect. */ + s->sock = inet_connect(s->hostport, errp); + if (s->sock < 0) { + ret = -EIO; + goto err; + } + + /* Create SSH session. */ + s->session = libssh2_session_init(); + if (!s->session) { + ret = -EINVAL; + session_error_setg(errp, s, "failed to initialize libssh2 session"); + goto err; + } + +#if TRACE_LIBSSH2 != 0 + libssh2_trace(s->session, TRACE_LIBSSH2); +#endif + + r = libssh2_session_handshake(s->session, s->sock); + if (r != 0) { + ret = -EINVAL; + session_error_setg(errp, s, "failed to establish SSH session"); + goto err; + } + + /* Check the remote host's key against known_hosts. */ + ret = check_host_key(s, host, port, host_key_check, errp); + if (ret < 0) { + goto err; + } + + /* Authenticate. */ + ret = authenticate(s, user, errp); + if (ret < 0) { + goto err; + } + + /* Start SFTP. */ + s->sftp = libssh2_sftp_init(s->session); + if (!s->sftp) { + session_error_setg(errp, s, "failed to initialize sftp handle"); + ret = -EINVAL; + goto err; + } + + /* Open the remote file. */ + DPRINTF("opening file %s flags=0x%x creat_mode=0%o", + path, ssh_flags, creat_mode); + s->sftp_handle = libssh2_sftp_open(s->sftp, path, ssh_flags, creat_mode); + if (!s->sftp_handle) { + session_error_setg(errp, s, "failed to open remote file '%s'", path); + ret = -EINVAL; + goto err; + } + + r = libssh2_sftp_fstat(s->sftp_handle, &s->attrs); + if (r < 0) { + sftp_error_setg(errp, s, "failed to read file attributes"); + return -EINVAL; + } + + /* Delete the options we've used; any not deleted will cause the + * block layer to give an error about unused options. + */ + qdict_del(options, "host"); + qdict_del(options, "port"); + qdict_del(options, "user"); + qdict_del(options, "path"); + qdict_del(options, "host_key_check"); + + return 0; + + err: + if (s->sftp_handle) { + libssh2_sftp_close(s->sftp_handle); + } + s->sftp_handle = NULL; + if (s->sftp) { + libssh2_sftp_shutdown(s->sftp); + } + s->sftp = NULL; + if (s->session) { + libssh2_session_disconnect(s->session, + "from qemu ssh client: " + "error opening connection"); + libssh2_session_free(s->session); + } + s->session = NULL; + + return ret; +} + +static int ssh_file_open(BlockDriverState *bs, QDict *options, int bdrv_flags, + Error **errp) +{ + BDRVSSHState *s = bs->opaque; + int ret; + int ssh_flags; + + ssh_state_init(s); + + ssh_flags = LIBSSH2_FXF_READ; + if (bdrv_flags & BDRV_O_RDWR) { + ssh_flags |= LIBSSH2_FXF_WRITE; + } + + /* Start up SSH. */ + ret = connect_to_ssh(s, options, ssh_flags, 0, errp); + if (ret < 0) { + goto err; + } + + /* Go non-blocking. */ + libssh2_session_set_blocking(s->session, 0); + + return 0; + + err: + if (s->sock >= 0) { + close(s->sock); + } + s->sock = -1; + + return ret; +} + +static QemuOptsList ssh_create_opts = { + .name = "ssh-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(ssh_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { /* end of list */ } + } +}; + +static int ssh_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int r, ret; + int64_t total_size = 0; + QDict *uri_options = NULL; + BDRVSSHState s; + ssize_t r2; + char c[1] = { '\0' }; + + ssh_state_init(&s); + + /* Get desired file size. */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + DPRINTF("total_size=%" PRIi64, total_size); + + uri_options = qdict_new(); + r = parse_uri(filename, uri_options, errp); + if (r < 0) { + ret = r; + goto out; + } + + r = connect_to_ssh(&s, uri_options, + LIBSSH2_FXF_READ|LIBSSH2_FXF_WRITE| + LIBSSH2_FXF_CREAT|LIBSSH2_FXF_TRUNC, + 0644, errp); + if (r < 0) { + ret = r; + goto out; + } + + if (total_size > 0) { + libssh2_sftp_seek64(s.sftp_handle, total_size-1); + r2 = libssh2_sftp_write(s.sftp_handle, c, 1); + if (r2 < 0) { + sftp_error_setg(errp, &s, "truncate failed"); + ret = -EINVAL; + goto out; + } + s.attrs.filesize = total_size; + } + + ret = 0; + + out: + ssh_state_free(&s); + if (uri_options != NULL) { + QDECREF(uri_options); + } + return ret; +} + +static void ssh_close(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + + ssh_state_free(s); +} + +static int ssh_has_zero_init(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + /* Assume false, unless we can positively prove it's true. */ + int has_zero_init = 0; + + if (s->attrs.flags & LIBSSH2_SFTP_ATTR_PERMISSIONS) { + if (s->attrs.permissions & LIBSSH2_SFTP_S_IFREG) { + has_zero_init = 1; + } + } + + return has_zero_init; +} + +static void restart_coroutine(void *opaque) +{ + Coroutine *co = opaque; + + DPRINTF("co=%p", co); + + qemu_coroutine_enter(co, NULL); +} + +static coroutine_fn void set_fd_handler(BDRVSSHState *s, BlockDriverState *bs) +{ + int r; + IOHandler *rd_handler = NULL, *wr_handler = NULL; + Coroutine *co = qemu_coroutine_self(); + + r = libssh2_session_block_directions(s->session); + + if (r & LIBSSH2_SESSION_BLOCK_INBOUND) { + rd_handler = restart_coroutine; + } + if (r & LIBSSH2_SESSION_BLOCK_OUTBOUND) { + wr_handler = restart_coroutine; + } + + DPRINTF("s->sock=%d rd_handler=%p wr_handler=%p", s->sock, + rd_handler, wr_handler); + + aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, + false, rd_handler, wr_handler, co); +} + +static coroutine_fn void clear_fd_handler(BDRVSSHState *s, + BlockDriverState *bs) +{ + DPRINTF("s->sock=%d", s->sock); + aio_set_fd_handler(bdrv_get_aio_context(bs), s->sock, + false, NULL, NULL, NULL); +} + +/* A non-blocking call returned EAGAIN, so yield, ensuring the + * handlers are set up so that we'll be rescheduled when there is an + * interesting event on the socket. + */ +static coroutine_fn void co_yield(BDRVSSHState *s, BlockDriverState *bs) +{ + set_fd_handler(s, bs); + qemu_coroutine_yield(); + clear_fd_handler(s, bs); +} + +/* SFTP has a function `libssh2_sftp_seek64' which seeks to a position + * in the remote file. Notice that it just updates a field in the + * sftp_handle structure, so there is no network traffic and it cannot + * fail. + * + * However, `libssh2_sftp_seek64' does have a catastrophic effect on + * performance since it causes the handle to throw away all in-flight + * reads and buffered readahead data. Therefore this function tries + * to be intelligent about when to call the underlying libssh2 function. + */ +#define SSH_SEEK_WRITE 0 +#define SSH_SEEK_READ 1 +#define SSH_SEEK_FORCE 2 + +static void ssh_seek(BDRVSSHState *s, int64_t offset, int flags) +{ + bool op_read = (flags & SSH_SEEK_READ) != 0; + bool force = (flags & SSH_SEEK_FORCE) != 0; + + if (force || op_read != s->offset_op_read || offset != s->offset) { + DPRINTF("seeking to offset=%" PRIi64, offset); + libssh2_sftp_seek64(s->sftp_handle, offset); + s->offset = offset; + s->offset_op_read = op_read; + } +} + +static coroutine_fn int ssh_read(BDRVSSHState *s, BlockDriverState *bs, + int64_t offset, size_t size, + QEMUIOVector *qiov) +{ + ssize_t r; + size_t got; + char *buf, *end_of_vec; + struct iovec *i; + + DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); + + ssh_seek(s, offset, SSH_SEEK_READ); + + /* This keeps track of the current iovec element ('i'), where we + * will write to next ('buf'), and the end of the current iovec + * ('end_of_vec'). + */ + i = &qiov->iov[0]; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + + /* libssh2 has a hard-coded limit of 2000 bytes per request, + * although it will also do readahead behind our backs. Therefore + * we may have to do repeated reads here until we have read 'size' + * bytes. + */ + for (got = 0; got < size; ) { + again: + DPRINTF("sftp_read buf=%p size=%zu", buf, end_of_vec - buf); + r = libssh2_sftp_read(s->sftp_handle, buf, end_of_vec - buf); + DPRINTF("sftp_read returned %zd", r); + + if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { + co_yield(s, bs); + goto again; + } + if (r < 0) { + sftp_error_report(s, "read failed"); + s->offset = -1; + return -EIO; + } + if (r == 0) { + /* EOF: Short read so pad the buffer with zeroes and return it. */ + qemu_iovec_memset(qiov, got, 0, size - got); + return 0; + } + + got += r; + buf += r; + s->offset += r; + if (buf >= end_of_vec && got < size) { + i++; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + } + } + + return 0; +} + +static coroutine_fn int ssh_co_readv(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVSSHState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = ssh_read(s, bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, qiov); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static int ssh_write(BDRVSSHState *s, BlockDriverState *bs, + int64_t offset, size_t size, + QEMUIOVector *qiov) +{ + ssize_t r; + size_t written; + char *buf, *end_of_vec; + struct iovec *i; + + DPRINTF("offset=%" PRIi64 " size=%zu", offset, size); + + ssh_seek(s, offset, SSH_SEEK_WRITE); + + /* This keeps track of the current iovec element ('i'), where we + * will read from next ('buf'), and the end of the current iovec + * ('end_of_vec'). + */ + i = &qiov->iov[0]; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + + for (written = 0; written < size; ) { + again: + DPRINTF("sftp_write buf=%p size=%zu", buf, end_of_vec - buf); + r = libssh2_sftp_write(s->sftp_handle, buf, end_of_vec - buf); + DPRINTF("sftp_write returned %zd", r); + + if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { + co_yield(s, bs); + goto again; + } + if (r < 0) { + sftp_error_report(s, "write failed"); + s->offset = -1; + return -EIO; + } + /* The libssh2 API is very unclear about this. A comment in + * the code says "nothing was acked, and no EAGAIN was + * received!" which apparently means that no data got sent + * out, and the underlying channel didn't return any EAGAIN + * indication. I think this is a bug in either libssh2 or + * OpenSSH (server-side). In any case, forcing a seek (to + * discard libssh2 internal buffers), and then trying again + * works for me. + */ + if (r == 0) { + ssh_seek(s, offset + written, SSH_SEEK_WRITE|SSH_SEEK_FORCE); + co_yield(s, bs); + goto again; + } + + written += r; + buf += r; + s->offset += r; + if (buf >= end_of_vec && written < size) { + i++; + buf = i->iov_base; + end_of_vec = i->iov_base + i->iov_len; + } + + if (offset + written > s->attrs.filesize) + s->attrs.filesize = offset + written; + } + + return 0; +} + +static coroutine_fn int ssh_co_writev(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVSSHState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = ssh_write(s, bs, sector_num * BDRV_SECTOR_SIZE, + nb_sectors * BDRV_SECTOR_SIZE, qiov); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +static void unsafe_flush_warning(BDRVSSHState *s, const char *what) +{ + if (!s->unsafe_flush_warning) { + error_report("warning: ssh server %s does not support fsync", + s->hostport); + if (what) { + error_report("to support fsync, you need %s", what); + } + s->unsafe_flush_warning = true; + } +} + +#ifdef HAS_LIBSSH2_SFTP_FSYNC + +static coroutine_fn int ssh_flush(BDRVSSHState *s, BlockDriverState *bs) +{ + int r; + + DPRINTF("fsync"); + again: + r = libssh2_sftp_fsync(s->sftp_handle); + if (r == LIBSSH2_ERROR_EAGAIN || r == LIBSSH2_ERROR_TIMEOUT) { + co_yield(s, bs); + goto again; + } + if (r == LIBSSH2_ERROR_SFTP_PROTOCOL && + libssh2_sftp_last_error(s->sftp) == LIBSSH2_FX_OP_UNSUPPORTED) { + unsafe_flush_warning(s, "OpenSSH >= 6.3"); + return 0; + } + if (r < 0) { + sftp_error_report(s, "fsync failed"); + return -EIO; + } + + return 0; +} + +static coroutine_fn int ssh_co_flush(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + int ret; + + qemu_co_mutex_lock(&s->lock); + ret = ssh_flush(s, bs); + qemu_co_mutex_unlock(&s->lock); + + return ret; +} + +#else /* !HAS_LIBSSH2_SFTP_FSYNC */ + +static coroutine_fn int ssh_co_flush(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + + unsafe_flush_warning(s, "libssh2 >= 1.4.4"); + return 0; +} + +#endif /* !HAS_LIBSSH2_SFTP_FSYNC */ + +static int64_t ssh_getlength(BlockDriverState *bs) +{ + BDRVSSHState *s = bs->opaque; + int64_t length; + + /* Note we cannot make a libssh2 call here. */ + length = (int64_t) s->attrs.filesize; + DPRINTF("length=%" PRIi64, length); + + return length; +} + +static BlockDriver bdrv_ssh = { + .format_name = "ssh", + .protocol_name = "ssh", + .instance_size = sizeof(BDRVSSHState), + .bdrv_parse_filename = ssh_parse_filename, + .bdrv_file_open = ssh_file_open, + .bdrv_create = ssh_create, + .bdrv_close = ssh_close, + .bdrv_has_zero_init = ssh_has_zero_init, + .bdrv_co_readv = ssh_co_readv, + .bdrv_co_writev = ssh_co_writev, + .bdrv_getlength = ssh_getlength, + .bdrv_co_flush_to_disk = ssh_co_flush, + .create_opts = &ssh_create_opts, +}; + +static void bdrv_ssh_init(void) +{ + int r; + + r = libssh2_init(0); + if (r != 0) { + fprintf(stderr, "libssh2 initialization failed, %d\n", r); + exit(EXIT_FAILURE); + } + + bdrv_register(&bdrv_ssh); +} + +block_init(bdrv_ssh_init); diff --git a/src/block/stream.c b/src/block/stream.c new file mode 100644 index 0000000..25af7ef --- /dev/null +++ b/src/block/stream.c @@ -0,0 +1,243 @@ +/* + * Image streaming + * + * Copyright IBM, Corp. 2011 + * + * Authors: + * Stefan Hajnoczi <stefanha@linux.vnet.ibm.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "trace.h" +#include "block/block_int.h" +#include "block/blockjob.h" +#include "qapi/qmp/qerror.h" +#include "qemu/ratelimit.h" +#include "sysemu/block-backend.h" + +enum { + /* + * Size of data buffer for populating the image file. This should be large + * enough to process multiple clusters in a single call, so that populating + * contiguous regions of the image is efficient. + */ + STREAM_BUFFER_SIZE = 512 * 1024, /* in bytes */ +}; + +#define SLICE_TIME 100000000ULL /* ns */ + +typedef struct StreamBlockJob { + BlockJob common; + RateLimit limit; + BlockDriverState *base; + BlockdevOnError on_error; + char *backing_file_str; +} StreamBlockJob; + +static int coroutine_fn stream_populate(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, + void *buf) +{ + struct iovec iov = { + .iov_base = buf, + .iov_len = nb_sectors * BDRV_SECTOR_SIZE, + }; + QEMUIOVector qiov; + + qemu_iovec_init_external(&qiov, &iov, 1); + + /* Copy-on-read the unallocated clusters */ + return bdrv_co_copy_on_readv(bs, sector_num, nb_sectors, &qiov); +} + +typedef struct { + int ret; + bool reached_end; +} StreamCompleteData; + +static void stream_complete(BlockJob *job, void *opaque) +{ + StreamBlockJob *s = container_of(job, StreamBlockJob, common); + StreamCompleteData *data = opaque; + BlockDriverState *base = s->base; + + if (!block_job_is_cancelled(&s->common) && data->reached_end && + data->ret == 0) { + const char *base_id = NULL, *base_fmt = NULL; + if (base) { + base_id = s->backing_file_str; + if (base->drv) { + base_fmt = base->drv->format_name; + } + } + data->ret = bdrv_change_backing_file(job->bs, base_id, base_fmt); + bdrv_set_backing_hd(job->bs, base); + } + + g_free(s->backing_file_str); + block_job_completed(&s->common, data->ret); + g_free(data); +} + +static void coroutine_fn stream_run(void *opaque) +{ + StreamBlockJob *s = opaque; + StreamCompleteData *data; + BlockDriverState *bs = s->common.bs; + BlockDriverState *base = s->base; + int64_t sector_num, end; + int error = 0; + int ret = 0; + int n = 0; + void *buf; + + if (!bs->backing) { + block_job_completed(&s->common, 0); + return; + } + + s->common.len = bdrv_getlength(bs); + if (s->common.len < 0) { + block_job_completed(&s->common, s->common.len); + return; + } + + end = s->common.len >> BDRV_SECTOR_BITS; + buf = qemu_blockalign(bs, STREAM_BUFFER_SIZE); + + /* Turn on copy-on-read for the whole block device so that guest read + * requests help us make progress. Only do this when copying the entire + * backing chain since the copy-on-read operation does not take base into + * account. + */ + if (!base) { + bdrv_enable_copy_on_read(bs); + } + + for (sector_num = 0; sector_num < end; sector_num += n) { + uint64_t delay_ns = 0; + bool copy; + +wait: + /* Note that even when no rate limit is applied we need to yield + * with no pending I/O here so that bdrv_drain_all() returns. + */ + block_job_sleep_ns(&s->common, QEMU_CLOCK_REALTIME, delay_ns); + if (block_job_is_cancelled(&s->common)) { + break; + } + + copy = false; + + ret = bdrv_is_allocated(bs, sector_num, + STREAM_BUFFER_SIZE / BDRV_SECTOR_SIZE, &n); + if (ret == 1) { + /* Allocated in the top, no need to copy. */ + } else if (ret >= 0) { + /* Copy if allocated in the intermediate images. Limit to the + * known-unallocated area [sector_num, sector_num+n). */ + ret = bdrv_is_allocated_above(backing_bs(bs), base, + sector_num, n, &n); + + /* Finish early if end of backing file has been reached */ + if (ret == 0 && n == 0) { + n = end - sector_num; + } + + copy = (ret == 1); + } + trace_stream_one_iteration(s, sector_num, n, ret); + if (copy) { + if (s->common.speed) { + delay_ns = ratelimit_calculate_delay(&s->limit, n); + if (delay_ns > 0) { + goto wait; + } + } + ret = stream_populate(bs, sector_num, n, buf); + } + if (ret < 0) { + BlockErrorAction action = + block_job_error_action(&s->common, s->common.bs, s->on_error, + true, -ret); + if (action == BLOCK_ERROR_ACTION_STOP) { + n = 0; + continue; + } + if (error == 0) { + error = ret; + } + if (action == BLOCK_ERROR_ACTION_REPORT) { + break; + } + } + ret = 0; + + /* Publish progress */ + s->common.offset += n * BDRV_SECTOR_SIZE; + } + + if (!base) { + bdrv_disable_copy_on_read(bs); + } + + /* Do not remove the backing file if an error was there but ignored. */ + ret = error; + + qemu_vfree(buf); + + /* Modify backing chain and close BDSes in main loop */ + data = g_malloc(sizeof(*data)); + data->ret = ret; + data->reached_end = sector_num == end; + block_job_defer_to_main_loop(&s->common, stream_complete, data); +} + +static void stream_set_speed(BlockJob *job, int64_t speed, Error **errp) +{ + StreamBlockJob *s = container_of(job, StreamBlockJob, common); + + if (speed < 0) { + error_setg(errp, QERR_INVALID_PARAMETER, "speed"); + return; + } + ratelimit_set_speed(&s->limit, speed / BDRV_SECTOR_SIZE, SLICE_TIME); +} + +static const BlockJobDriver stream_job_driver = { + .instance_size = sizeof(StreamBlockJob), + .job_type = BLOCK_JOB_TYPE_STREAM, + .set_speed = stream_set_speed, +}; + +void stream_start(BlockDriverState *bs, BlockDriverState *base, + const char *backing_file_str, int64_t speed, + BlockdevOnError on_error, + BlockCompletionFunc *cb, + void *opaque, Error **errp) +{ + StreamBlockJob *s; + + if ((on_error == BLOCKDEV_ON_ERROR_STOP || + on_error == BLOCKDEV_ON_ERROR_ENOSPC) && + (!bs->blk || !blk_iostatus_is_enabled(bs->blk))) { + error_setg(errp, QERR_INVALID_PARAMETER, "on-error"); + return; + } + + s = block_job_create(&stream_job_driver, bs, speed, cb, opaque, errp); + if (!s) { + return; + } + + s->base = base; + s->backing_file_str = g_strdup(backing_file_str); + + s->on_error = on_error; + s->common.co = qemu_coroutine_create(stream_run); + trace_stream_start(bs, base, s, s->common.co, opaque); + qemu_coroutine_enter(s->common.co, s); +} diff --git a/src/block/throttle-groups.c b/src/block/throttle-groups.c new file mode 100644 index 0000000..13b5baa --- /dev/null +++ b/src/block/throttle-groups.c @@ -0,0 +1,482 @@ +/* + * QEMU block throttling group infrastructure + * + * Copyright (C) Nodalink, EURL. 2014 + * Copyright (C) Igalia, S.L. 2015 + * + * Authors: + * Benoît Canet <benoit.canet@nodalink.com> + * Alberto Garcia <berto@igalia.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 or + * (at your option) version 3 of the License. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see <http://www.gnu.org/licenses/>. + */ + +#include "block/throttle-groups.h" +#include "qemu/queue.h" +#include "qemu/thread.h" +#include "sysemu/qtest.h" + +/* The ThrottleGroup structure (with its ThrottleState) is shared + * among different BlockDriverState and it's independent from + * AioContext, so in order to use it from different threads it needs + * its own locking. + * + * This locking is however handled internally in this file, so it's + * transparent to outside users. + * + * The whole ThrottleGroup structure is private and invisible to + * outside users, that only use it through its ThrottleState. + * + * In addition to the ThrottleGroup structure, BlockDriverState has + * fields that need to be accessed by other members of the group and + * therefore also need to be protected by this lock. Once a BDS is + * registered in a group those fields can be accessed by other threads + * any time. + * + * Again, all this is handled internally and is mostly transparent to + * the outside. The 'throttle_timers' field however has an additional + * constraint because it may be temporarily invalid (see for example + * bdrv_set_aio_context()). Therefore in this file a thread will + * access some other BDS's timers only after verifying that that BDS + * has throttled requests in the queue. + */ +typedef struct ThrottleGroup { + char *name; /* This is constant during the lifetime of the group */ + + QemuMutex lock; /* This lock protects the following four fields */ + ThrottleState ts; + QLIST_HEAD(, BlockDriverState) head; + BlockDriverState *tokens[2]; + bool any_timer_armed[2]; + + /* These two are protected by the global throttle_groups_lock */ + unsigned refcount; + QTAILQ_ENTRY(ThrottleGroup) list; +} ThrottleGroup; + +static QemuMutex throttle_groups_lock; +static QTAILQ_HEAD(, ThrottleGroup) throttle_groups = + QTAILQ_HEAD_INITIALIZER(throttle_groups); + +/* Increments the reference count of a ThrottleGroup given its name. + * + * If no ThrottleGroup is found with the given name a new one is + * created. + * + * @name: the name of the ThrottleGroup + * @ret: the ThrottleState member of the ThrottleGroup + */ +ThrottleState *throttle_group_incref(const char *name) +{ + ThrottleGroup *tg = NULL; + ThrottleGroup *iter; + + qemu_mutex_lock(&throttle_groups_lock); + + /* Look for an existing group with that name */ + QTAILQ_FOREACH(iter, &throttle_groups, list) { + if (!strcmp(name, iter->name)) { + tg = iter; + break; + } + } + + /* Create a new one if not found */ + if (!tg) { + tg = g_new0(ThrottleGroup, 1); + tg->name = g_strdup(name); + qemu_mutex_init(&tg->lock); + throttle_init(&tg->ts); + QLIST_INIT(&tg->head); + + QTAILQ_INSERT_TAIL(&throttle_groups, tg, list); + } + + tg->refcount++; + + qemu_mutex_unlock(&throttle_groups_lock); + + return &tg->ts; +} + +/* Decrease the reference count of a ThrottleGroup. + * + * When the reference count reaches zero the ThrottleGroup is + * destroyed. + * + * @ts: The ThrottleGroup to unref, given by its ThrottleState member + */ +void throttle_group_unref(ThrottleState *ts) +{ + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + + qemu_mutex_lock(&throttle_groups_lock); + if (--tg->refcount == 0) { + QTAILQ_REMOVE(&throttle_groups, tg, list); + qemu_mutex_destroy(&tg->lock); + g_free(tg->name); + g_free(tg); + } + qemu_mutex_unlock(&throttle_groups_lock); +} + +/* Get the name from a BlockDriverState's ThrottleGroup. The name (and + * the pointer) is guaranteed to remain constant during the lifetime + * of the group. + * + * @bs: a BlockDriverState that is member of a throttling group + * @ret: the name of the group. + */ +const char *throttle_group_get_name(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + return tg->name; +} + +/* Return the next BlockDriverState in the round-robin sequence, + * simulating a circular list. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @ret: the next BlockDriverState in the sequence + */ +static BlockDriverState *throttle_group_next_bs(BlockDriverState *bs) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + BlockDriverState *next = QLIST_NEXT(bs, round_robin); + + if (!next) { + return QLIST_FIRST(&tg->head); + } + + return next; +} + +/* Return the next BlockDriverState in the round-robin sequence with + * pending I/O requests. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + * @ret: the next BlockDriverState with pending requests, or bs + * if there is none. + */ +static BlockDriverState *next_throttle_token(BlockDriverState *bs, + bool is_write) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + BlockDriverState *token, *start; + + start = token = tg->tokens[is_write]; + + /* get next bs round in round robin style */ + token = throttle_group_next_bs(token); + while (token != start && !token->pending_reqs[is_write]) { + token = throttle_group_next_bs(token); + } + + /* If no IO are queued for scheduling on the next round robin token + * then decide the token is the current bs because chances are + * the current bs get the current request queued. + */ + if (token == start && !token->pending_reqs[is_write]) { + token = bs; + } + + return token; +} + +/* Check if the next I/O request for a BlockDriverState needs to be + * throttled or not. If there's no timer set in this group, set one + * and update the token accordingly. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + * @ret: whether the I/O request needs to be throttled or not + */ +static bool throttle_group_schedule_timer(BlockDriverState *bs, + bool is_write) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleTimers *tt = &bs->throttle_timers; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + bool must_wait; + + /* Check if any of the timers in this group is already armed */ + if (tg->any_timer_armed[is_write]) { + return true; + } + + must_wait = throttle_schedule_timer(ts, tt, is_write); + + /* If a timer just got armed, set bs as the current token */ + if (must_wait) { + tg->tokens[is_write] = bs; + tg->any_timer_armed[is_write] = true; + } + + return must_wait; +} + +/* Look for the next pending I/O request and schedule it. + * + * This assumes that tg->lock is held. + * + * @bs: the current BlockDriverState + * @is_write: the type of operation (read/write) + */ +static void schedule_next_request(BlockDriverState *bs, bool is_write) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + bool must_wait; + BlockDriverState *token; + + /* Check if there's any pending request to schedule next */ + token = next_throttle_token(bs, is_write); + if (!token->pending_reqs[is_write]) { + return; + } + + /* Set a timer for the request if it needs to be throttled */ + must_wait = throttle_group_schedule_timer(token, is_write); + + /* If it doesn't have to wait, queue it for immediate execution */ + if (!must_wait) { + /* Give preference to requests from the current bs */ + if (qemu_in_coroutine() && + qemu_co_queue_next(&bs->throttled_reqs[is_write])) { + token = bs; + } else { + ThrottleTimers *tt = &token->throttle_timers; + int64_t now = qemu_clock_get_ns(tt->clock_type); + timer_mod(tt->timers[is_write], now + 1); + tg->any_timer_armed[is_write] = true; + } + tg->tokens[is_write] = token; + } +} + +/* Check if an I/O request needs to be throttled, wait and set a timer + * if necessary, and schedule the next request using a round robin + * algorithm. + * + * @bs: the current BlockDriverState + * @bytes: the number of bytes for this I/O + * @is_write: the type of operation (read/write) + */ +void coroutine_fn throttle_group_co_io_limits_intercept(BlockDriverState *bs, + unsigned int bytes, + bool is_write) +{ + bool must_wait; + BlockDriverState *token; + + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + + /* First we check if this I/O has to be throttled. */ + token = next_throttle_token(bs, is_write); + must_wait = throttle_group_schedule_timer(token, is_write); + + /* Wait if there's a timer set or queued requests of this type */ + if (must_wait || bs->pending_reqs[is_write]) { + bs->pending_reqs[is_write]++; + qemu_mutex_unlock(&tg->lock); + qemu_co_queue_wait(&bs->throttled_reqs[is_write]); + qemu_mutex_lock(&tg->lock); + bs->pending_reqs[is_write]--; + } + + /* The I/O will be executed, so do the accounting */ + throttle_account(bs->throttle_state, is_write, bytes); + + /* Schedule the next request */ + schedule_next_request(bs, is_write); + + qemu_mutex_unlock(&tg->lock); +} + +/* Update the throttle configuration for a particular group. Similar + * to throttle_config(), but guarantees atomicity within the + * throttling group. + * + * @bs: a BlockDriverState that is member of the group + * @cfg: the configuration to set + */ +void throttle_group_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ + ThrottleTimers *tt = &bs->throttle_timers; + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + /* throttle_config() cancels the timers */ + if (timer_pending(tt->timers[0])) { + tg->any_timer_armed[0] = false; + } + if (timer_pending(tt->timers[1])) { + tg->any_timer_armed[1] = false; + } + throttle_config(ts, tt, cfg); + qemu_mutex_unlock(&tg->lock); +} + +/* Get the throttle configuration from a particular group. Similar to + * throttle_get_config(), but guarantees atomicity within the + * throttling group. + * + * @bs: a BlockDriverState that is member of the group + * @cfg: the configuration will be written here + */ +void throttle_group_get_config(BlockDriverState *bs, ThrottleConfig *cfg) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + qemu_mutex_lock(&tg->lock); + throttle_get_config(ts, cfg); + qemu_mutex_unlock(&tg->lock); +} + +/* ThrottleTimers callback. This wakes up a request that was waiting + * because it had been throttled. + * + * @bs: the BlockDriverState whose request had been throttled + * @is_write: the type of operation (read/write) + */ +static void timer_cb(BlockDriverState *bs, bool is_write) +{ + ThrottleState *ts = bs->throttle_state; + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + bool empty_queue; + + /* The timer has just been fired, so we can update the flag */ + qemu_mutex_lock(&tg->lock); + tg->any_timer_armed[is_write] = false; + qemu_mutex_unlock(&tg->lock); + + /* Run the request that was waiting for this timer */ + empty_queue = !qemu_co_enter_next(&bs->throttled_reqs[is_write]); + + /* If the request queue was empty then we have to take care of + * scheduling the next one */ + if (empty_queue) { + qemu_mutex_lock(&tg->lock); + schedule_next_request(bs, is_write); + qemu_mutex_unlock(&tg->lock); + } +} + +static void read_timer_cb(void *opaque) +{ + timer_cb(opaque, false); +} + +static void write_timer_cb(void *opaque) +{ + timer_cb(opaque, true); +} + +/* Register a BlockDriverState in the throttling group, also + * initializing its timers and updating its throttle_state pointer to + * point to it. If a throttling group with that name does not exist + * yet, it will be created. + * + * @bs: the BlockDriverState to insert + * @groupname: the name of the group + */ +void throttle_group_register_bs(BlockDriverState *bs, const char *groupname) +{ + int i; + ThrottleState *ts = throttle_group_incref(groupname); + ThrottleGroup *tg = container_of(ts, ThrottleGroup, ts); + int clock_type = QEMU_CLOCK_REALTIME; + + if (qtest_enabled()) { + /* For testing block IO throttling only */ + clock_type = QEMU_CLOCK_VIRTUAL; + } + + bs->throttle_state = ts; + + qemu_mutex_lock(&tg->lock); + /* If the ThrottleGroup is new set this BlockDriverState as the token */ + for (i = 0; i < 2; i++) { + if (!tg->tokens[i]) { + tg->tokens[i] = bs; + } + } + + QLIST_INSERT_HEAD(&tg->head, bs, round_robin); + + throttle_timers_init(&bs->throttle_timers, + bdrv_get_aio_context(bs), + clock_type, + read_timer_cb, + write_timer_cb, + bs); + + qemu_mutex_unlock(&tg->lock); +} + +/* Unregister a BlockDriverState from its group, removing it from the + * list, destroying the timers and setting the throttle_state pointer + * to NULL. + * + * The BlockDriverState must not have pending throttled requests, so + * the caller has to drain them first. + * + * The group will be destroyed if it's empty after this operation. + * + * @bs: the BlockDriverState to remove + */ +void throttle_group_unregister_bs(BlockDriverState *bs) +{ + ThrottleGroup *tg = container_of(bs->throttle_state, ThrottleGroup, ts); + int i; + + assert(bs->pending_reqs[0] == 0 && bs->pending_reqs[1] == 0); + assert(qemu_co_queue_empty(&bs->throttled_reqs[0])); + assert(qemu_co_queue_empty(&bs->throttled_reqs[1])); + + qemu_mutex_lock(&tg->lock); + for (i = 0; i < 2; i++) { + if (tg->tokens[i] == bs) { + BlockDriverState *token = throttle_group_next_bs(bs); + /* Take care of the case where this is the last bs in the group */ + if (token == bs) { + token = NULL; + } + tg->tokens[i] = token; + } + } + + /* remove the current bs from the list */ + QLIST_REMOVE(bs, round_robin); + throttle_timers_destroy(&bs->throttle_timers); + qemu_mutex_unlock(&tg->lock); + + throttle_group_unref(&tg->ts); + bs->throttle_state = NULL; +} + +static void throttle_groups_init(void) +{ + qemu_mutex_init(&throttle_groups_lock); +} + +block_init(throttle_groups_init); diff --git a/src/block/vdi.c b/src/block/vdi.c new file mode 100644 index 0000000..17f435f --- /dev/null +++ b/src/block/vdi.c @@ -0,0 +1,915 @@ +/* + * Block driver for the Virtual Disk Image (VDI) format + * + * Copyright (c) 2009, 2012 Stefan Weil + * + * This program is free software: you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation, either version 2 of the License, or + * (at your option) version 3 or any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + * + * Reference: + * http://forums.virtualbox.org/viewtopic.php?t=8046 + * + * This driver supports create / read / write operations on VDI images. + * + * Todo (see also TODO in code): + * + * Some features like snapshots are still missing. + * + * Deallocation of zero-filled blocks and shrinking images are missing, too + * (might be added to common block layer). + * + * Allocation of blocks could be optimized (less writes to block map and + * header). + * + * Read and write of adjacent blocks could be done in one operation + * (current code uses one operation per block (1 MiB). + * + * The code is not thread safe (missing locks for changes in header and + * block table, no problem with current QEMU). + * + * Hints: + * + * Blocks (VDI documentation) correspond to clusters (QEMU). + * QEMU's backing files could be implemented using VDI snapshot files (TODO). + * VDI snapshot files may also contain the complete machine state. + * Maybe this machine state can be converted to QEMU PC machine snapshot data. + * + * The driver keeps a block cache (little endian entries) in memory. + * For the standard block size (1 MiB), a 1 TiB disk will use 4 MiB RAM, + * so this seems to be reasonable. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include "qemu/coroutine.h" + +#if defined(CONFIG_UUID) +#include <uuid/uuid.h> +#else +/* TODO: move uuid emulation to some central place in QEMU. */ +#include "sysemu/sysemu.h" /* UUID_FMT */ +typedef unsigned char uuid_t[16]; +#endif + +/* Code configuration options. */ + +/* Enable debug messages. */ +//~ #define CONFIG_VDI_DEBUG + +/* Support write operations on VDI images. */ +#define CONFIG_VDI_WRITE + +/* Support non-standard block (cluster) size. This is untested. + * Maybe it will be needed for very large images. + */ +//~ #define CONFIG_VDI_BLOCK_SIZE + +/* Support static (fixed, pre-allocated) images. */ +#define CONFIG_VDI_STATIC_IMAGE + +/* Command line option for static images. */ +#define BLOCK_OPT_STATIC "static" + +#define KiB 1024 +#define MiB (KiB * KiB) + +#define SECTOR_SIZE 512 +#define DEFAULT_CLUSTER_SIZE (1 * MiB) + +#if defined(CONFIG_VDI_DEBUG) +#define logout(fmt, ...) \ + fprintf(stderr, "vdi\t%-24s" fmt, __func__, ##__VA_ARGS__) +#else +#define logout(fmt, ...) ((void)0) +#endif + +/* Image signature. */ +#define VDI_SIGNATURE 0xbeda107f + +/* Image version. */ +#define VDI_VERSION_1_1 0x00010001 + +/* Image type. */ +#define VDI_TYPE_DYNAMIC 1 +#define VDI_TYPE_STATIC 2 + +/* Innotek / SUN images use these strings in header.text: + * "<<< innotek VirtualBox Disk Image >>>\n" + * "<<< Sun xVM VirtualBox Disk Image >>>\n" + * "<<< Sun VirtualBox Disk Image >>>\n" + * The value does not matter, so QEMU created images use a different text. + */ +#define VDI_TEXT "<<< QEMU VM Virtual Disk Image >>>\n" + +/* A never-allocated block; semantically arbitrary content. */ +#define VDI_UNALLOCATED 0xffffffffU + +/* A discarded (no longer allocated) block; semantically zero-filled. */ +#define VDI_DISCARDED 0xfffffffeU + +#define VDI_IS_ALLOCATED(X) ((X) < VDI_DISCARDED) + +/* The bmap will take up VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) bytes; since + * the bmap is read and written in a single operation, its size needs to be + * limited to INT_MAX; furthermore, when opening an image, the bmap size is + * rounded up to be aligned on BDRV_SECTOR_SIZE. + * Therefore this should satisfy the following: + * VDI_BLOCKS_IN_IMAGE_MAX * sizeof(uint32_t) + BDRV_SECTOR_SIZE == INT_MAX + 1 + * (INT_MAX + 1 is the first value not representable as an int) + * This guarantees that any value below or equal to the constant will, when + * multiplied by sizeof(uint32_t) and rounded up to a BDRV_SECTOR_SIZE boundary, + * still be below or equal to INT_MAX. */ +#define VDI_BLOCKS_IN_IMAGE_MAX \ + ((unsigned)((INT_MAX + 1u - BDRV_SECTOR_SIZE) / sizeof(uint32_t))) +#define VDI_DISK_SIZE_MAX ((uint64_t)VDI_BLOCKS_IN_IMAGE_MAX * \ + (uint64_t)DEFAULT_CLUSTER_SIZE) + +#if !defined(CONFIG_UUID) +static inline void uuid_generate(uuid_t out) +{ + memset(out, 0, sizeof(uuid_t)); +} + +static inline int uuid_is_null(const uuid_t uu) +{ + uuid_t null_uuid = { 0 }; + return memcmp(uu, null_uuid, sizeof(uuid_t)) == 0; +} + +# if defined(CONFIG_VDI_DEBUG) +static inline void uuid_unparse(const uuid_t uu, char *out) +{ + snprintf(out, 37, UUID_FMT, + uu[0], uu[1], uu[2], uu[3], uu[4], uu[5], uu[6], uu[7], + uu[8], uu[9], uu[10], uu[11], uu[12], uu[13], uu[14], uu[15]); +} +# endif +#endif + +typedef struct { + char text[0x40]; + uint32_t signature; + uint32_t version; + uint32_t header_size; + uint32_t image_type; + uint32_t image_flags; + char description[256]; + uint32_t offset_bmap; + uint32_t offset_data; + uint32_t cylinders; /* disk geometry, unused here */ + uint32_t heads; /* disk geometry, unused here */ + uint32_t sectors; /* disk geometry, unused here */ + uint32_t sector_size; + uint32_t unused1; + uint64_t disk_size; + uint32_t block_size; + uint32_t block_extra; /* unused here */ + uint32_t blocks_in_image; + uint32_t blocks_allocated; + uuid_t uuid_image; + uuid_t uuid_last_snap; + uuid_t uuid_link; + uuid_t uuid_parent; + uint64_t unused2[7]; +} QEMU_PACKED VdiHeader; + +typedef struct { + /* The block map entries are little endian (even in memory). */ + uint32_t *bmap; + /* Size of block (bytes). */ + uint32_t block_size; + /* Size of block (sectors). */ + uint32_t block_sectors; + /* First sector of block map. */ + uint32_t bmap_sector; + /* VDI header (converted to host endianness). */ + VdiHeader header; + + CoMutex write_lock; + + Error *migration_blocker; +} BDRVVdiState; + +/* Change UUID from little endian (IPRT = VirtualBox format) to big endian + * format (network byte order, standard, see RFC 4122) and vice versa. + */ +static void uuid_convert(uuid_t uuid) +{ + bswap32s((uint32_t *)&uuid[0]); + bswap16s((uint16_t *)&uuid[4]); + bswap16s((uint16_t *)&uuid[6]); +} + +static void vdi_header_to_cpu(VdiHeader *header) +{ + le32_to_cpus(&header->signature); + le32_to_cpus(&header->version); + le32_to_cpus(&header->header_size); + le32_to_cpus(&header->image_type); + le32_to_cpus(&header->image_flags); + le32_to_cpus(&header->offset_bmap); + le32_to_cpus(&header->offset_data); + le32_to_cpus(&header->cylinders); + le32_to_cpus(&header->heads); + le32_to_cpus(&header->sectors); + le32_to_cpus(&header->sector_size); + le64_to_cpus(&header->disk_size); + le32_to_cpus(&header->block_size); + le32_to_cpus(&header->block_extra); + le32_to_cpus(&header->blocks_in_image); + le32_to_cpus(&header->blocks_allocated); + uuid_convert(header->uuid_image); + uuid_convert(header->uuid_last_snap); + uuid_convert(header->uuid_link); + uuid_convert(header->uuid_parent); +} + +static void vdi_header_to_le(VdiHeader *header) +{ + cpu_to_le32s(&header->signature); + cpu_to_le32s(&header->version); + cpu_to_le32s(&header->header_size); + cpu_to_le32s(&header->image_type); + cpu_to_le32s(&header->image_flags); + cpu_to_le32s(&header->offset_bmap); + cpu_to_le32s(&header->offset_data); + cpu_to_le32s(&header->cylinders); + cpu_to_le32s(&header->heads); + cpu_to_le32s(&header->sectors); + cpu_to_le32s(&header->sector_size); + cpu_to_le64s(&header->disk_size); + cpu_to_le32s(&header->block_size); + cpu_to_le32s(&header->block_extra); + cpu_to_le32s(&header->blocks_in_image); + cpu_to_le32s(&header->blocks_allocated); + uuid_convert(header->uuid_image); + uuid_convert(header->uuid_last_snap); + uuid_convert(header->uuid_link); + uuid_convert(header->uuid_parent); +} + +#if defined(CONFIG_VDI_DEBUG) +static void vdi_header_print(VdiHeader *header) +{ + char uuid[37]; + logout("text %s", header->text); + logout("signature 0x%08x\n", header->signature); + logout("header size 0x%04x\n", header->header_size); + logout("image type 0x%04x\n", header->image_type); + logout("image flags 0x%04x\n", header->image_flags); + logout("description %s\n", header->description); + logout("offset bmap 0x%04x\n", header->offset_bmap); + logout("offset data 0x%04x\n", header->offset_data); + logout("cylinders 0x%04x\n", header->cylinders); + logout("heads 0x%04x\n", header->heads); + logout("sectors 0x%04x\n", header->sectors); + logout("sector size 0x%04x\n", header->sector_size); + logout("image size 0x%" PRIx64 " B (%" PRIu64 " MiB)\n", + header->disk_size, header->disk_size / MiB); + logout("block size 0x%04x\n", header->block_size); + logout("block extra 0x%04x\n", header->block_extra); + logout("blocks tot. 0x%04x\n", header->blocks_in_image); + logout("blocks all. 0x%04x\n", header->blocks_allocated); + uuid_unparse(header->uuid_image, uuid); + logout("uuid image %s\n", uuid); + uuid_unparse(header->uuid_last_snap, uuid); + logout("uuid snap %s\n", uuid); + uuid_unparse(header->uuid_link, uuid); + logout("uuid link %s\n", uuid); + uuid_unparse(header->uuid_parent, uuid); + logout("uuid parent %s\n", uuid); +} +#endif + +static int vdi_check(BlockDriverState *bs, BdrvCheckResult *res, + BdrvCheckMode fix) +{ + /* TODO: additional checks possible. */ + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; + uint32_t blocks_allocated = 0; + uint32_t block; + uint32_t *bmap; + logout("\n"); + + if (fix) { + return -ENOTSUP; + } + + bmap = g_try_new(uint32_t, s->header.blocks_in_image); + if (s->header.blocks_in_image && bmap == NULL) { + res->check_errors++; + return -ENOMEM; + } + + memset(bmap, 0xff, s->header.blocks_in_image * sizeof(uint32_t)); + + /* Check block map and value of blocks_allocated. */ + for (block = 0; block < s->header.blocks_in_image; block++) { + uint32_t bmap_entry = le32_to_cpu(s->bmap[block]); + if (VDI_IS_ALLOCATED(bmap_entry)) { + if (bmap_entry < s->header.blocks_in_image) { + blocks_allocated++; + if (!VDI_IS_ALLOCATED(bmap[bmap_entry])) { + bmap[bmap_entry] = bmap_entry; + } else { + fprintf(stderr, "ERROR: block index %" PRIu32 + " also used by %" PRIu32 "\n", bmap[bmap_entry], bmap_entry); + res->corruptions++; + } + } else { + fprintf(stderr, "ERROR: block index %" PRIu32 + " too large, is %" PRIu32 "\n", block, bmap_entry); + res->corruptions++; + } + } + } + if (blocks_allocated != s->header.blocks_allocated) { + fprintf(stderr, "ERROR: allocated blocks mismatch, is %" PRIu32 + ", should be %" PRIu32 "\n", + blocks_allocated, s->header.blocks_allocated); + res->corruptions++; + } + + g_free(bmap); + + return 0; +} + +static int vdi_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + /* TODO: vdi_get_info would be needed for machine snapshots. + vm_state_offset is still missing. */ + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; + logout("\n"); + bdi->cluster_size = s->block_size; + bdi->vm_state_offset = 0; + bdi->unallocated_blocks_are_zero = true; + return 0; +} + +static int vdi_make_empty(BlockDriverState *bs) +{ + /* TODO: missing code. */ + logout("\n"); + /* The return value for missing code must be 0, see block.c. */ + return 0; +} + +static int vdi_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + const VdiHeader *header = (const VdiHeader *)buf; + int ret = 0; + + logout("\n"); + + if (buf_size < sizeof(*header)) { + /* Header too small, no VDI. */ + } else if (le32_to_cpu(header->signature) == VDI_SIGNATURE) { + ret = 100; + } + + if (ret == 0) { + logout("no vdi image\n"); + } else { + logout("%s", header->text); + } + + return ret; +} + +static int vdi_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVVdiState *s = bs->opaque; + VdiHeader header; + size_t bmap_size; + int ret; + + logout("\n"); + + ret = bdrv_read(bs->file->bs, 0, (uint8_t *)&header, 1); + if (ret < 0) { + goto fail; + } + + vdi_header_to_cpu(&header); +#if defined(CONFIG_VDI_DEBUG) + vdi_header_print(&header); +#endif + + if (header.disk_size > VDI_DISK_SIZE_MAX) { + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + header.disk_size, VDI_DISK_SIZE_MAX); + ret = -ENOTSUP; + goto fail; + } + + if (header.disk_size % SECTOR_SIZE != 0) { + /* 'VBoxManage convertfromraw' can create images with odd disk sizes. + We accept them but round the disk size to the next multiple of + SECTOR_SIZE. */ + logout("odd disk size %" PRIu64 " B, round up\n", header.disk_size); + header.disk_size = ROUND_UP(header.disk_size, SECTOR_SIZE); + } + + if (header.signature != VDI_SIGNATURE) { + error_setg(errp, "Image not in VDI format (bad signature %08" PRIx32 + ")", header.signature); + ret = -EINVAL; + goto fail; + } else if (header.version != VDI_VERSION_1_1) { + error_setg(errp, "unsupported VDI image (version %" PRIu32 ".%" PRIu32 + ")", header.version >> 16, header.version & 0xffff); + ret = -ENOTSUP; + goto fail; + } else if (header.offset_bmap % SECTOR_SIZE != 0) { + /* We only support block maps which start on a sector boundary. */ + error_setg(errp, "unsupported VDI image (unaligned block map offset " + "0x%" PRIx32 ")", header.offset_bmap); + ret = -ENOTSUP; + goto fail; + } else if (header.offset_data % SECTOR_SIZE != 0) { + /* We only support data blocks which start on a sector boundary. */ + error_setg(errp, "unsupported VDI image (unaligned data offset 0x%" + PRIx32 ")", header.offset_data); + ret = -ENOTSUP; + goto fail; + } else if (header.sector_size != SECTOR_SIZE) { + error_setg(errp, "unsupported VDI image (sector size %" PRIu32 + " is not %u)", header.sector_size, SECTOR_SIZE); + ret = -ENOTSUP; + goto fail; + } else if (header.block_size != DEFAULT_CLUSTER_SIZE) { + error_setg(errp, "unsupported VDI image (block size %" PRIu32 + " is not %u)", header.block_size, DEFAULT_CLUSTER_SIZE); + ret = -ENOTSUP; + goto fail; + } else if (header.disk_size > + (uint64_t)header.blocks_in_image * header.block_size) { + error_setg(errp, "unsupported VDI image (disk size %" PRIu64 ", " + "image bitmap has room for %" PRIu64 ")", + header.disk_size, + (uint64_t)header.blocks_in_image * header.block_size); + ret = -ENOTSUP; + goto fail; + } else if (!uuid_is_null(header.uuid_link)) { + error_setg(errp, "unsupported VDI image (non-NULL link UUID)"); + ret = -ENOTSUP; + goto fail; + } else if (!uuid_is_null(header.uuid_parent)) { + error_setg(errp, "unsupported VDI image (non-NULL parent UUID)"); + ret = -ENOTSUP; + goto fail; + } else if (header.blocks_in_image > VDI_BLOCKS_IN_IMAGE_MAX) { + error_setg(errp, "unsupported VDI image " + "(too many blocks %u, max is %u)", + header.blocks_in_image, VDI_BLOCKS_IN_IMAGE_MAX); + ret = -ENOTSUP; + goto fail; + } + + bs->total_sectors = header.disk_size / SECTOR_SIZE; + + s->block_size = header.block_size; + s->block_sectors = header.block_size / SECTOR_SIZE; + s->bmap_sector = header.offset_bmap / SECTOR_SIZE; + s->header = header; + + bmap_size = header.blocks_in_image * sizeof(uint32_t); + bmap_size = DIV_ROUND_UP(bmap_size, SECTOR_SIZE); + s->bmap = qemu_try_blockalign(bs->file->bs, bmap_size * SECTOR_SIZE); + if (s->bmap == NULL) { + ret = -ENOMEM; + goto fail; + } + + ret = bdrv_read(bs->file->bs, s->bmap_sector, (uint8_t *)s->bmap, + bmap_size); + if (ret < 0) { + goto fail_free_bmap; + } + + /* Disable migration when vdi images are used */ + error_setg(&s->migration_blocker, "The vdi format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); + migrate_add_blocker(s->migration_blocker); + + qemu_co_mutex_init(&s->write_lock); + + return 0; + + fail_free_bmap: + qemu_vfree(s->bmap); + + fail: + return ret; +} + +static int vdi_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +static int64_t coroutine_fn vdi_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + /* TODO: Check for too large sector_num (in bdrv_is_allocated or here). */ + BDRVVdiState *s = (BDRVVdiState *)bs->opaque; + size_t bmap_index = sector_num / s->block_sectors; + size_t sector_in_block = sector_num % s->block_sectors; + int n_sectors = s->block_sectors - sector_in_block; + uint32_t bmap_entry = le32_to_cpu(s->bmap[bmap_index]); + uint64_t offset; + int result; + + logout("%p, %" PRId64 ", %d, %p\n", bs, sector_num, nb_sectors, pnum); + if (n_sectors > nb_sectors) { + n_sectors = nb_sectors; + } + *pnum = n_sectors; + result = VDI_IS_ALLOCATED(bmap_entry); + if (!result) { + return 0; + } + + offset = s->header.offset_data + + (uint64_t)bmap_entry * s->block_size + + sector_in_block * SECTOR_SIZE; + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | offset; +} + +static int vdi_co_read(BlockDriverState *bs, + int64_t sector_num, uint8_t *buf, int nb_sectors) +{ + BDRVVdiState *s = bs->opaque; + uint32_t bmap_entry; + uint32_t block_index; + uint32_t sector_in_block; + uint32_t n_sectors; + int ret = 0; + + logout("\n"); + + while (ret >= 0 && nb_sectors > 0) { + block_index = sector_num / s->block_sectors; + sector_in_block = sector_num % s->block_sectors; + n_sectors = s->block_sectors - sector_in_block; + if (n_sectors > nb_sectors) { + n_sectors = nb_sectors; + } + + logout("will read %u sectors starting at sector %" PRIu64 "\n", + n_sectors, sector_num); + + /* prepare next AIO request */ + bmap_entry = le32_to_cpu(s->bmap[block_index]); + if (!VDI_IS_ALLOCATED(bmap_entry)) { + /* Block not allocated, return zeros, no need to wait. */ + memset(buf, 0, n_sectors * SECTOR_SIZE); + ret = 0; + } else { + uint64_t offset = s->header.offset_data / SECTOR_SIZE + + (uint64_t)bmap_entry * s->block_sectors + + sector_in_block; + ret = bdrv_read(bs->file->bs, offset, buf, n_sectors); + } + logout("%u sectors read\n", n_sectors); + + nb_sectors -= n_sectors; + sector_num += n_sectors; + buf += n_sectors * SECTOR_SIZE; + } + + return ret; +} + +static int vdi_co_write(BlockDriverState *bs, + int64_t sector_num, const uint8_t *buf, int nb_sectors) +{ + BDRVVdiState *s = bs->opaque; + uint32_t bmap_entry; + uint32_t block_index; + uint32_t sector_in_block; + uint32_t n_sectors; + uint32_t bmap_first = VDI_UNALLOCATED; + uint32_t bmap_last = VDI_UNALLOCATED; + uint8_t *block = NULL; + int ret = 0; + + logout("\n"); + + while (ret >= 0 && nb_sectors > 0) { + block_index = sector_num / s->block_sectors; + sector_in_block = sector_num % s->block_sectors; + n_sectors = s->block_sectors - sector_in_block; + if (n_sectors > nb_sectors) { + n_sectors = nb_sectors; + } + + logout("will write %u sectors starting at sector %" PRIu64 "\n", + n_sectors, sector_num); + + /* prepare next AIO request */ + bmap_entry = le32_to_cpu(s->bmap[block_index]); + if (!VDI_IS_ALLOCATED(bmap_entry)) { + /* Allocate new block and write to it. */ + uint64_t offset; + bmap_entry = s->header.blocks_allocated; + s->bmap[block_index] = cpu_to_le32(bmap_entry); + s->header.blocks_allocated++; + offset = s->header.offset_data / SECTOR_SIZE + + (uint64_t)bmap_entry * s->block_sectors; + if (block == NULL) { + block = g_malloc(s->block_size); + bmap_first = block_index; + } + bmap_last = block_index; + /* Copy data to be written to new block and zero unused parts. */ + memset(block, 0, sector_in_block * SECTOR_SIZE); + memcpy(block + sector_in_block * SECTOR_SIZE, + buf, n_sectors * SECTOR_SIZE); + memset(block + (sector_in_block + n_sectors) * SECTOR_SIZE, 0, + (s->block_sectors - n_sectors - sector_in_block) * SECTOR_SIZE); + + /* Note that this coroutine does not yield anywhere from reading the + * bmap entry until here, so in regards to all the coroutines trying + * to write to this cluster, the one doing the allocation will + * always be the first to try to acquire the lock. + * Therefore, it is also the first that will actually be able to + * acquire the lock and thus the padded cluster is written before + * the other coroutines can write to the affected area. */ + qemu_co_mutex_lock(&s->write_lock); + ret = bdrv_write(bs->file->bs, offset, block, s->block_sectors); + qemu_co_mutex_unlock(&s->write_lock); + } else { + uint64_t offset = s->header.offset_data / SECTOR_SIZE + + (uint64_t)bmap_entry * s->block_sectors + + sector_in_block; + qemu_co_mutex_lock(&s->write_lock); + /* This lock is only used to make sure the following write operation + * is executed after the write issued by the coroutine allocating + * this cluster, therefore we do not need to keep it locked. + * As stated above, the allocating coroutine will always try to lock + * the mutex before all the other concurrent accesses to that + * cluster, therefore at this point we can be absolutely certain + * that that write operation has returned (there may be other writes + * in flight, but they do not concern this very operation). */ + qemu_co_mutex_unlock(&s->write_lock); + ret = bdrv_write(bs->file->bs, offset, buf, n_sectors); + } + + nb_sectors -= n_sectors; + sector_num += n_sectors; + buf += n_sectors * SECTOR_SIZE; + + logout("%u sectors written\n", n_sectors); + } + + logout("finished data write\n"); + if (ret < 0) { + return ret; + } + + if (block) { + /* One or more new blocks were allocated. */ + VdiHeader *header = (VdiHeader *) block; + uint8_t *base; + uint64_t offset; + + logout("now writing modified header\n"); + assert(VDI_IS_ALLOCATED(bmap_first)); + *header = s->header; + vdi_header_to_le(header); + ret = bdrv_write(bs->file->bs, 0, block, 1); + g_free(block); + block = NULL; + + if (ret < 0) { + return ret; + } + + logout("now writing modified block map entry %u...%u\n", + bmap_first, bmap_last); + /* Write modified sectors from block map. */ + bmap_first /= (SECTOR_SIZE / sizeof(uint32_t)); + bmap_last /= (SECTOR_SIZE / sizeof(uint32_t)); + n_sectors = bmap_last - bmap_first + 1; + offset = s->bmap_sector + bmap_first; + base = ((uint8_t *)&s->bmap[0]) + bmap_first * SECTOR_SIZE; + logout("will write %u block map sectors starting from entry %u\n", + n_sectors, bmap_first); + ret = bdrv_write(bs->file->bs, offset, base, n_sectors); + } + + return ret; +} + +static int vdi_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int ret = 0; + uint64_t bytes = 0; + uint32_t blocks; + size_t block_size = DEFAULT_CLUSTER_SIZE; + uint32_t image_type = VDI_TYPE_DYNAMIC; + VdiHeader header; + size_t i; + size_t bmap_size; + int64_t offset = 0; + Error *local_err = NULL; + BlockDriverState *bs = NULL; + uint32_t *bmap = NULL; + + logout("\n"); + + /* Read out options. */ + bytes = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); +#if defined(CONFIG_VDI_BLOCK_SIZE) + /* TODO: Additional checks (SECTOR_SIZE * 2^n, ...). */ + block_size = qemu_opt_get_size_del(opts, + BLOCK_OPT_CLUSTER_SIZE, + DEFAULT_CLUSTER_SIZE); +#endif +#if defined(CONFIG_VDI_STATIC_IMAGE) + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_STATIC, false)) { + image_type = VDI_TYPE_STATIC; + } +#endif + + if (bytes > VDI_DISK_SIZE_MAX) { + ret = -ENOTSUP; + error_setg(errp, "Unsupported VDI image size (size is 0x%" PRIx64 + ", max supported is 0x%" PRIx64 ")", + bytes, VDI_DISK_SIZE_MAX); + goto exit; + } + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + /* We need enough blocks to store the given disk size, + so always round up. */ + blocks = DIV_ROUND_UP(bytes, block_size); + + bmap_size = blocks * sizeof(uint32_t); + bmap_size = ROUND_UP(bmap_size, SECTOR_SIZE); + + memset(&header, 0, sizeof(header)); + pstrcpy(header.text, sizeof(header.text), VDI_TEXT); + header.signature = VDI_SIGNATURE; + header.version = VDI_VERSION_1_1; + header.header_size = 0x180; + header.image_type = image_type; + header.offset_bmap = 0x200; + header.offset_data = 0x200 + bmap_size; + header.sector_size = SECTOR_SIZE; + header.disk_size = bytes; + header.block_size = block_size; + header.blocks_in_image = blocks; + if (image_type == VDI_TYPE_STATIC) { + header.blocks_allocated = blocks; + } + uuid_generate(header.uuid_image); + uuid_generate(header.uuid_last_snap); + /* There is no need to set header.uuid_link or header.uuid_parent here. */ +#if defined(CONFIG_VDI_DEBUG) + vdi_header_print(&header); +#endif + vdi_header_to_le(&header); + ret = bdrv_pwrite_sync(bs, offset, &header, sizeof(header)); + if (ret < 0) { + error_setg(errp, "Error writing header to %s", filename); + goto exit; + } + offset += sizeof(header); + + if (bmap_size > 0) { + bmap = g_try_malloc0(bmap_size); + if (bmap == NULL) { + ret = -ENOMEM; + error_setg(errp, "Could not allocate bmap"); + goto exit; + } + for (i = 0; i < blocks; i++) { + if (image_type == VDI_TYPE_STATIC) { + bmap[i] = i; + } else { + bmap[i] = VDI_UNALLOCATED; + } + } + ret = bdrv_pwrite_sync(bs, offset, bmap, bmap_size); + if (ret < 0) { + error_setg(errp, "Error writing bmap to %s", filename); + goto exit; + } + offset += bmap_size; + } + + if (image_type == VDI_TYPE_STATIC) { + ret = bdrv_truncate(bs, offset + blocks * block_size); + if (ret < 0) { + error_setg(errp, "Failed to statically allocate %s", filename); + goto exit; + } + } + +exit: + bdrv_unref(bs); + g_free(bmap); + return ret; +} + +static void vdi_close(BlockDriverState *bs) +{ + BDRVVdiState *s = bs->opaque; + + qemu_vfree(s->bmap); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static QemuOptsList vdi_create_opts = { + .name = "vdi-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vdi_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, +#if defined(CONFIG_VDI_BLOCK_SIZE) + { + .name = BLOCK_OPT_CLUSTER_SIZE, + .type = QEMU_OPT_SIZE, + .help = "VDI cluster (block) size", + .def_value_str = stringify(DEFAULT_CLUSTER_SIZE) + }, +#endif +#if defined(CONFIG_VDI_STATIC_IMAGE) + { + .name = BLOCK_OPT_STATIC, + .type = QEMU_OPT_BOOL, + .help = "VDI static (pre-allocated) image", + .def_value_str = "off" + }, +#endif + /* TODO: An additional option to set UUID values might be useful. */ + { /* end of list */ } + } +}; + +static BlockDriver bdrv_vdi = { + .format_name = "vdi", + .instance_size = sizeof(BDRVVdiState), + .bdrv_probe = vdi_probe, + .bdrv_open = vdi_open, + .bdrv_close = vdi_close, + .bdrv_reopen_prepare = vdi_reopen_prepare, + .bdrv_create = vdi_create, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + .bdrv_co_get_block_status = vdi_co_get_block_status, + .bdrv_make_empty = vdi_make_empty, + + .bdrv_read = vdi_co_read, +#if defined(CONFIG_VDI_WRITE) + .bdrv_write = vdi_co_write, +#endif + + .bdrv_get_info = vdi_get_info, + + .create_opts = &vdi_create_opts, + .bdrv_check = vdi_check, +}; + +static void bdrv_vdi_init(void) +{ + logout("\n"); + bdrv_register(&bdrv_vdi); +} + +block_init(bdrv_vdi_init); diff --git a/src/block/vhdx-endian.c b/src/block/vhdx-endian.c new file mode 100644 index 0000000..0640d3f --- /dev/null +++ b/src/block/vhdx-endian.c @@ -0,0 +1,223 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "block/vhdx.h" + +#include <uuid/uuid.h> + + +/* + * All the VHDX formats on disk are little endian - the following + * are helper import/export functions to correctly convert + * endianness from disk read to native cpu format, and back again. + */ + + +/* VHDX File Header */ + + +void vhdx_header_le_import(VHDXHeader *h) +{ + assert(h != NULL); + + le32_to_cpus(&h->signature); + le32_to_cpus(&h->checksum); + le64_to_cpus(&h->sequence_number); + + leguid_to_cpus(&h->file_write_guid); + leguid_to_cpus(&h->data_write_guid); + leguid_to_cpus(&h->log_guid); + + le16_to_cpus(&h->log_version); + le16_to_cpus(&h->version); + le32_to_cpus(&h->log_length); + le64_to_cpus(&h->log_offset); +} + +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h) +{ + assert(orig_h != NULL); + assert(new_h != NULL); + + new_h->signature = cpu_to_le32(orig_h->signature); + new_h->checksum = cpu_to_le32(orig_h->checksum); + new_h->sequence_number = cpu_to_le64(orig_h->sequence_number); + + new_h->file_write_guid = orig_h->file_write_guid; + new_h->data_write_guid = orig_h->data_write_guid; + new_h->log_guid = orig_h->log_guid; + + cpu_to_leguids(&new_h->file_write_guid); + cpu_to_leguids(&new_h->data_write_guid); + cpu_to_leguids(&new_h->log_guid); + + new_h->log_version = cpu_to_le16(orig_h->log_version); + new_h->version = cpu_to_le16(orig_h->version); + new_h->log_length = cpu_to_le32(orig_h->log_length); + new_h->log_offset = cpu_to_le64(orig_h->log_offset); +} + + +/* VHDX Log Headers */ + + +void vhdx_log_desc_le_import(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->signature); + le64_to_cpus(&d->file_offset); + le64_to_cpus(&d->sequence_number); +} + +void vhdx_log_desc_le_export(VHDXLogDescriptor *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->signature); + cpu_to_le32s(&d->trailing_bytes); + cpu_to_le64s(&d->leading_bytes); + cpu_to_le64s(&d->file_offset); + cpu_to_le64s(&d->sequence_number); +} + +void vhdx_log_data_le_import(VHDXLogDataSector *d) +{ + assert(d != NULL); + + le32_to_cpus(&d->data_signature); + le32_to_cpus(&d->sequence_high); + le32_to_cpus(&d->sequence_low); +} + +void vhdx_log_data_le_export(VHDXLogDataSector *d) +{ + assert(d != NULL); + + cpu_to_le32s(&d->data_signature); + cpu_to_le32s(&d->sequence_high); + cpu_to_le32s(&d->sequence_low); +} + +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_length); + le32_to_cpus(&hdr->tail); + le64_to_cpus(&hdr->sequence_number); + le32_to_cpus(&hdr->descriptor_count); + leguid_to_cpus(&hdr->log_guid); + le64_to_cpus(&hdr->flushed_file_offset); + le64_to_cpus(&hdr->last_file_offset); +} + +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_length); + cpu_to_le32s(&hdr->tail); + cpu_to_le64s(&hdr->sequence_number); + cpu_to_le32s(&hdr->descriptor_count); + cpu_to_leguids(&hdr->log_guid); + cpu_to_le64s(&hdr->flushed_file_offset); + cpu_to_le64s(&hdr->last_file_offset); +} + + +/* Region table entries */ +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + le32_to_cpus(&hdr->signature); + le32_to_cpus(&hdr->checksum); + le32_to_cpus(&hdr->entry_count); +} + +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le32s(&hdr->signature); + cpu_to_le32s(&hdr->checksum); + cpu_to_le32s(&hdr->entry_count); +} + +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->guid); + le64_to_cpus(&e->file_offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} + +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->guid); + cpu_to_le64s(&e->file_offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} + + +/* Metadata headers & table */ +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + le64_to_cpus(&hdr->signature); + le16_to_cpus(&hdr->entry_count); +} + +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr) +{ + assert(hdr != NULL); + + cpu_to_le64s(&hdr->signature); + cpu_to_le16s(&hdr->entry_count); +} + +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + leguid_to_cpus(&e->item_id); + le32_to_cpus(&e->offset); + le32_to_cpus(&e->length); + le32_to_cpus(&e->data_bits); +} +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e) +{ + assert(e != NULL); + + cpu_to_leguids(&e->item_id); + cpu_to_le32s(&e->offset); + cpu_to_le32s(&e->length); + cpu_to_le32s(&e->data_bits); +} diff --git a/src/block/vhdx-log.c b/src/block/vhdx-log.c new file mode 100644 index 0000000..47ae4b1 --- /dev/null +++ b/src/block/vhdx-log.c @@ -0,0 +1,1040 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This file covers the functionality of the metadata log writing, parsing, and + * replay. + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "block/vhdx.h" + + +typedef struct VHDXLogSequence { + bool valid; + uint32_t count; + VHDXLogEntries log; + VHDXLogEntryHeader hdr; +} VHDXLogSequence; + +typedef struct VHDXLogDescEntries { + VHDXLogEntryHeader hdr; + VHDXLogDescriptor desc[]; +} VHDXLogDescEntries; + +static const MSGUID zero_guid = { 0 }; + +/* The log located on the disk is circular buffer containing + * sectors of 4096 bytes each. + * + * It is assumed for the read/write functions below that the + * circular buffer scheme uses a 'one sector open' to indicate + * the buffer is full. Given the validation methods used for each + * sector, this method should be compatible with other methods that + * do not waste a sector. + */ + + +/* Allow peeking at the hdr entry at the beginning of the current + * read index, without advancing the read index */ +static int vhdx_log_peek_hdr(BlockDriverState *bs, VHDXLogEntries *log, + VHDXLogEntryHeader *hdr) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + assert(hdr != NULL); + + /* peek is only supported on sector boundaries */ + if (log->read % VHDX_LOG_SECTOR_SIZE) { + ret = -EFAULT; + goto exit; + } + + read = log->read; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB. So, there + * is always a round number of sectors in the buffer */ + if ((read + sizeof(VHDXLogEntryHeader)) > log->length) { + read = 0; + } + + if (read == log->write) { + ret = -EINVAL; + goto exit; + } + + offset = log->offset + read; + + ret = bdrv_pread(bs->file->bs, offset, hdr, sizeof(VHDXLogEntryHeader)); + if (ret < 0) { + goto exit; + } + vhdx_log_entry_hdr_le_import(hdr); + +exit: + return ret; +} + +/* Index increment for log, based on sector boundaries */ +static int vhdx_log_inc_idx(uint32_t idx, uint64_t length) +{ + idx += VHDX_LOG_SECTOR_SIZE; + /* we are guaranteed that a) log sectors are 4096 bytes, + * and b) the log length is a multiple of 1MB. So, there + * is always a round number of sectors in the buffer */ + return idx >= length ? 0 : idx; +} + + +/* Reset the log to empty */ +static void vhdx_log_reset(BlockDriverState *bs, BDRVVHDXState *s) +{ + MSGUID guid = { 0 }; + s->log.read = s->log.write = 0; + /* a log guid of 0 indicates an empty log to any parser of v0 + * VHDX logs */ + vhdx_update_headers(bs, s, false, &guid); +} + +/* Reads num_sectors from the log (all log sectors are 4096 bytes), + * into buffer 'buffer'. Upon return, *sectors_read will contain + * the number of sectors successfully read. + * + * It is assumed that 'buffer' is already allocated, and of sufficient + * size (i.e. >= 4096*num_sectors). + * + * If 'peek' is true, then the tail (read) pointer for the circular buffer is + * not modified. + * + * 0 is returned on success, -errno otherwise. */ +static int vhdx_log_read_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_read, void *buffer, + uint32_t num_sectors, bool peek) +{ + int ret = 0; + uint64_t offset; + uint32_t read; + + read = log->read; + + *sectors_read = 0; + while (num_sectors) { + if (read == log->write) { + /* empty */ + break; + } + offset = log->offset + read; + + ret = bdrv_pread(bs->file->bs, offset, buffer, VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + read = vhdx_log_inc_idx(read, log->length); + + *sectors_read = *sectors_read + 1; + num_sectors--; + } + +exit: + if (!peek) { + log->read = read; + } + return ret; +} + +/* Writes num_sectors to the log (all log sectors are 4096 bytes), + * from buffer 'buffer'. Upon return, *sectors_written will contain + * the number of sectors successfully written. + * + * It is assumed that 'buffer' is at least 4096*num_sectors large. + * + * 0 is returned on success, -errno otherwise */ +static int vhdx_log_write_sectors(BlockDriverState *bs, VHDXLogEntries *log, + uint32_t *sectors_written, void *buffer, + uint32_t num_sectors) +{ + int ret = 0; + uint64_t offset; + uint32_t write; + void *buffer_tmp; + BDRVVHDXState *s = bs->opaque; + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + write = log->write; + + buffer_tmp = buffer; + while (num_sectors) { + + offset = log->offset + write; + write = vhdx_log_inc_idx(write, log->length); + if (write == log->read) { + /* full */ + break; + } + ret = bdrv_pwrite(bs->file->bs, offset, buffer_tmp, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + buffer_tmp += VHDX_LOG_SECTOR_SIZE; + + log->write = write; + *sectors_written = *sectors_written + 1; + num_sectors--; + } + +exit: + return ret; +} + + +/* Validates a log entry header */ +static bool vhdx_log_hdr_is_valid(VHDXLogEntries *log, VHDXLogEntryHeader *hdr, + BDRVVHDXState *s) +{ + int valid = false; + + if (hdr->signature != VHDX_LOG_SIGNATURE) { + goto exit; + } + + /* if the individual entry length is larger than the whole log + * buffer, that is obviously invalid */ + if (log->length < hdr->entry_length) { + goto exit; + } + + /* length of entire entry must be in units of 4KB (log sector size) */ + if (hdr->entry_length % (VHDX_LOG_SECTOR_SIZE)) { + goto exit; + } + + /* per spec, sequence # must be > 0 */ + if (hdr->sequence_number == 0) { + goto exit; + } + + /* log entries are only valid if they match the file-wide log guid + * found in the active header */ + if (!guid_eq(hdr->log_guid, s->headers[s->curr_header]->log_guid)) { + goto exit; + } + + if (hdr->descriptor_count * sizeof(VHDXLogDescriptor) > hdr->entry_length) { + goto exit; + } + + valid = true; + +exit: + return valid; +} + +/* + * Given a log header, this will validate that the descriptors and the + * corresponding data sectors (if applicable) + * + * Validation consists of: + * 1. Making sure the sequence numbers matches the entry header + * 2. Verifying a valid signature ('zero' or 'desc' for descriptors) + * 3. File offset field is a multiple of 4KB + * 4. If a data descriptor, the corresponding data sector + * has its signature ('data') and matching sequence number + * + * @desc: the data buffer containing the descriptor + * @hdr: the log entry header + * + * Returns true if valid + */ +static bool vhdx_log_desc_is_valid(VHDXLogDescriptor *desc, + VHDXLogEntryHeader *hdr) +{ + bool ret = false; + + if (desc->sequence_number != hdr->sequence_number) { + goto exit; + } + if (desc->file_offset % VHDX_LOG_SECTOR_SIZE) { + goto exit; + } + + if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { + if (desc->zero_length % VHDX_LOG_SECTOR_SIZE == 0) { + /* valid */ + ret = true; + } + } else if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { + /* valid */ + ret = true; + } + +exit: + return ret; +} + + +/* Prior to sector data for a log entry, there is the header + * and the descriptors referenced in the header: + * + * [] = 4KB sector + * + * [ hdr, desc ][ desc ][ ... ][ data ][ ... ] + * + * The first sector in a log entry has a 64 byte header, and + * up to 126 32-byte descriptors. If more descriptors than + * 126 are required, then subsequent sectors can have up to 128 + * descriptors. Each sector is 4KB. Data follows the descriptor + * sectors. + * + * This will return the number of sectors needed to encompass + * the passed number of descriptors in desc_cnt. + * + * This will never return 0, even if desc_cnt is 0. + */ +static int vhdx_compute_desc_sectors(uint32_t desc_cnt) +{ + uint32_t desc_sectors; + + desc_cnt += 2; /* account for header in first sector */ + desc_sectors = desc_cnt / 128; + if (desc_cnt % 128) { + desc_sectors++; + } + + return desc_sectors; +} + + +/* Reads the log header, and subsequent descriptors (if any). This + * will allocate all the space for buffer, which must be NULL when + * passed into this function. Each descriptor will also be validated, + * and error returned if any are invalid. */ +static int vhdx_log_read_desc(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, VHDXLogDescEntries **buffer, + bool convert_endian) +{ + int ret = 0; + uint32_t desc_sectors; + uint32_t sectors_read; + VHDXLogEntryHeader hdr; + VHDXLogDescEntries *desc_entries = NULL; + VHDXLogDescriptor desc; + int i; + + assert(*buffer == NULL); + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto exit; + } + + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + ret = -EINVAL; + goto exit; + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + desc_entries = qemu_try_blockalign(bs->file->bs, + desc_sectors * VHDX_LOG_SECTOR_SIZE); + if (desc_entries == NULL) { + ret = -ENOMEM; + goto exit; + } + + ret = vhdx_log_read_sectors(bs, log, §ors_read, desc_entries, + desc_sectors, false); + if (ret < 0) { + goto free_and_exit; + } + if (sectors_read != desc_sectors) { + ret = -EINVAL; + goto free_and_exit; + } + + /* put in proper endianness, and validate each desc */ + for (i = 0; i < hdr.descriptor_count; i++) { + desc = desc_entries->desc[i]; + vhdx_log_desc_le_import(&desc); + if (convert_endian) { + desc_entries->desc[i] = desc; + } + if (vhdx_log_desc_is_valid(&desc, &hdr) == false) { + ret = -EINVAL; + goto free_and_exit; + } + } + if (convert_endian) { + desc_entries->hdr = hdr; + } + + *buffer = desc_entries; + goto exit; + +free_and_exit: + qemu_vfree(desc_entries); +exit: + return ret; +} + + +/* Flushes the descriptor described by desc to the VHDX image file. + * If the descriptor is a data descriptor, than 'data' must be non-NULL, + * and >= 4096 bytes (VHDX_LOG_SECTOR_SIZE), containing the data to be + * written. + * + * Verification is performed to make sure the sequence numbers of a data + * descriptor match the sequence number in the desc. + * + * For a zero descriptor, it may describe multiple sectors to fill with zeroes. + * In this case, it should be noted that zeroes are written to disk, and the + * image file is not extended as a sparse file. */ +static int vhdx_log_flush_desc(BlockDriverState *bs, VHDXLogDescriptor *desc, + VHDXLogDataSector *data) +{ + int ret = 0; + uint64_t seq, file_offset; + uint32_t offset = 0; + void *buffer = NULL; + uint64_t count = 1; + int i; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + if (desc->signature == VHDX_LOG_DESC_SIGNATURE) { + /* data sector */ + if (data == NULL) { + ret = -EFAULT; + goto exit; + } + + /* The sequence number of the data sector must match that + * in the descriptor */ + seq = data->sequence_high; + seq <<= 32; + seq |= data->sequence_low & 0xffffffff; + + if (seq != desc->sequence_number) { + ret = -EINVAL; + goto exit; + } + + /* Each data sector is in total 4096 bytes, however the first + * 8 bytes, and last 4 bytes, are located in the descriptor */ + memcpy(buffer, &desc->leading_bytes, 8); + offset += 8; + + memcpy(buffer+offset, data->data, 4084); + offset += 4084; + + memcpy(buffer+offset, &desc->trailing_bytes, 4); + + } else if (desc->signature == VHDX_LOG_ZERO_SIGNATURE) { + /* write 'count' sectors of sector */ + memset(buffer, 0, VHDX_LOG_SECTOR_SIZE); + count = desc->zero_length / VHDX_LOG_SECTOR_SIZE; + } else { + error_report("Invalid VHDX log descriptor entry signature 0x%" PRIx32, + desc->signature); + ret = -EINVAL; + goto exit; + } + + file_offset = desc->file_offset; + + /* count is only > 1 if we are writing zeroes */ + for (i = 0; i < count; i++) { + ret = bdrv_pwrite_sync(bs->file->bs, file_offset, buffer, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + file_offset += VHDX_LOG_SECTOR_SIZE; + } + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Flush the entire log (as described by 'logs') to the VHDX image + * file, and then set the log to 'empty' status once complete. + * + * The log entries should be validate prior to flushing */ +static int vhdx_log_flush(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + int i; + uint32_t cnt, sectors_read; + uint64_t new_file_size; + void *data = NULL; + VHDXLogDescEntries *desc_entries = NULL; + VHDXLogEntryHeader hdr_tmp = { 0 }; + + cnt = logs->count; + + data = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + /* each iteration represents one log sequence, which may span multiple + * sectors */ + while (cnt--) { + ret = vhdx_log_peek_hdr(bs, &logs->log, &hdr_tmp); + if (ret < 0) { + goto exit; + } + /* if the log shows a FlushedFileOffset larger than our current file + * size, then that means the file has been truncated / corrupted, and + * we must refused to open it / use it */ + if (hdr_tmp.flushed_file_offset > bdrv_getlength(bs->file->bs)) { + ret = -EINVAL; + goto exit; + } + + ret = vhdx_log_read_desc(bs, s, &logs->log, &desc_entries, true); + if (ret < 0) { + goto exit; + } + + for (i = 0; i < desc_entries->hdr.descriptor_count; i++) { + if (desc_entries->desc[i].signature == VHDX_LOG_DESC_SIGNATURE) { + /* data sector, so read a sector to flush */ + ret = vhdx_log_read_sectors(bs, &logs->log, §ors_read, + data, 1, false); + if (ret < 0) { + goto exit; + } + if (sectors_read != 1) { + ret = -EINVAL; + goto exit; + } + vhdx_log_data_le_import(data); + } + + ret = vhdx_log_flush_desc(bs, &desc_entries->desc[i], data); + if (ret < 0) { + goto exit; + } + } + if (bdrv_getlength(bs->file->bs) < desc_entries->hdr.last_file_offset) { + new_file_size = desc_entries->hdr.last_file_offset; + if (new_file_size % (1024*1024)) { + /* round up to nearest 1MB boundary */ + new_file_size = ((new_file_size >> 20) + 1) << 20; + bdrv_truncate(bs->file->bs, new_file_size); + } + } + qemu_vfree(desc_entries); + desc_entries = NULL; + } + + bdrv_flush(bs); + /* once the log is fully flushed, indicate that we have an empty log + * now. This also sets the log guid to 0, to indicate an empty log */ + vhdx_log_reset(bs, s); + +exit: + qemu_vfree(data); + qemu_vfree(desc_entries); + return ret; +} + +static int vhdx_validate_log_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogEntries *log, uint64_t seq, + bool *valid, VHDXLogEntryHeader *entry) +{ + int ret = 0; + VHDXLogEntryHeader hdr; + void *buffer = NULL; + uint32_t i, desc_sectors, total_sectors, crc; + uint32_t sectors_read = 0; + VHDXLogDescEntries *desc_buffer = NULL; + + *valid = false; + + ret = vhdx_log_peek_hdr(bs, log, &hdr); + if (ret < 0) { + goto inc_and_exit; + } + + if (vhdx_log_hdr_is_valid(log, &hdr, s) == false) { + goto inc_and_exit; + } + + if (seq > 0) { + if (hdr.sequence_number != seq + 1) { + goto inc_and_exit; + } + } + + desc_sectors = vhdx_compute_desc_sectors(hdr.descriptor_count); + + /* Read all log sectors, and calculate log checksum */ + + total_sectors = hdr.entry_length / VHDX_LOG_SECTOR_SIZE; + + + /* read_desc() will increment the read idx */ + ret = vhdx_log_read_desc(bs, s, log, &desc_buffer, false); + if (ret < 0) { + goto free_and_exit; + } + + crc = vhdx_checksum_calc(0xffffffff, (void *)desc_buffer, + desc_sectors * VHDX_LOG_SECTOR_SIZE, 4); + crc ^= 0xffffffff; + + buffer = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + if (total_sectors > desc_sectors) { + for (i = 0; i < total_sectors - desc_sectors; i++) { + sectors_read = 0; + ret = vhdx_log_read_sectors(bs, log, §ors_read, buffer, + 1, false); + if (ret < 0 || sectors_read != 1) { + goto free_and_exit; + } + crc = vhdx_checksum_calc(crc, buffer, VHDX_LOG_SECTOR_SIZE, -1); + crc ^= 0xffffffff; + } + } + crc ^= 0xffffffff; + if (crc != hdr.checksum) { + goto free_and_exit; + } + + *valid = true; + *entry = hdr; + goto free_and_exit; + +inc_and_exit: + log->read = vhdx_log_inc_idx(log->read, log->length); + +free_and_exit: + qemu_vfree(buffer); + qemu_vfree(desc_buffer); + return ret; +} + +/* Search through the log circular buffer, and find the valid, active + * log sequence, if any exists + * */ +static int vhdx_log_search(BlockDriverState *bs, BDRVVHDXState *s, + VHDXLogSequence *logs) +{ + int ret = 0; + uint32_t tail; + bool seq_valid = false; + VHDXLogSequence candidate = { 0 }; + VHDXLogEntryHeader hdr = { 0 }; + VHDXLogEntries curr_log; + + memcpy(&curr_log, &s->log, sizeof(VHDXLogEntries)); + curr_log.write = curr_log.length; /* assume log is full */ + curr_log.read = 0; + + + /* now we will go through the whole log sector by sector, until + * we find a valid, active log sequence, or reach the end of the + * log buffer */ + for (;;) { + uint64_t curr_seq = 0; + VHDXLogSequence current = { 0 }; + + tail = curr_log.read; + + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + + if (seq_valid) { + current.valid = true; + current.log = curr_log; + current.log.read = tail; + current.log.write = curr_log.read; + current.count = 1; + current.hdr = hdr; + + + for (;;) { + ret = vhdx_validate_log_entry(bs, s, &curr_log, curr_seq, + &seq_valid, &hdr); + if (ret < 0) { + goto exit; + } + if (seq_valid == false) { + break; + } + current.log.write = curr_log.read; + current.count++; + + curr_seq = hdr.sequence_number; + } + } + + if (current.valid) { + if (candidate.valid == false || + current.hdr.sequence_number > candidate.hdr.sequence_number) { + candidate = current; + } + } + + if (curr_log.read < tail) { + break; + } + } + + *logs = candidate; + + if (candidate.valid) { + /* this is the next sequence number, for writes */ + s->log.sequence = candidate.hdr.sequence_number + 1; + } + + +exit: + return ret; +} + +/* Parse the replay log. Per the VHDX spec, if the log is present + * it must be replayed prior to opening the file, even read-only. + * + * If read-only, we must replay the log in RAM (or refuse to open + * a dirty VHDX file read-only) */ +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp) +{ + int ret = 0; + VHDXHeader *hdr; + VHDXLogSequence logs = { 0 }; + + hdr = s->headers[s->curr_header]; + + *flushed = false; + + /* s->log.hdr is freed in vhdx_close() */ + if (s->log.hdr == NULL) { + s->log.hdr = qemu_blockalign(bs, sizeof(VHDXLogEntryHeader)); + } + + s->log.offset = hdr->log_offset; + s->log.length = hdr->log_length; + + if (s->log.offset < VHDX_LOG_MIN_SIZE || + s->log.offset % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + /* per spec, only log version of 0 is supported */ + if (hdr->log_version != 0) { + ret = -EINVAL; + goto exit; + } + + /* If either the log guid, or log length is zero, + * then a replay log is not present */ + if (guid_eq(hdr->log_guid, zero_guid)) { + goto exit; + } + + if (hdr->log_length == 0) { + goto exit; + } + + if (hdr->log_length % VHDX_LOG_MIN_SIZE) { + ret = -EINVAL; + goto exit; + } + + + /* The log is present, we need to find if and where there is an active + * sequence of valid entries present in the log. */ + + ret = vhdx_log_search(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + if (logs.valid) { + if (bs->read_only) { + ret = -EPERM; + error_setg_errno(errp, EPERM, + "VHDX image file '%s' opened read-only, but " + "contains a log that needs to be replayed. To " + "replay the log, execute:\n qemu-img check -r " + "all '%s'", + bs->filename, bs->filename); + goto exit; + } + /* now flush the log */ + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + *flushed = true; + } + + +exit: + return ret; +} + + + +static void vhdx_log_raw_to_le_sector(VHDXLogDescriptor *desc, + VHDXLogDataSector *sector, void *data, + uint64_t seq) +{ + /* 8 + 4084 + 4 = 4096, 1 log sector */ + memcpy(&desc->leading_bytes, data, 8); + data += 8; + cpu_to_le64s(&desc->leading_bytes); + memcpy(sector->data, data, 4084); + data += 4084; + memcpy(&desc->trailing_bytes, data, 4); + cpu_to_le32s(&desc->trailing_bytes); + data += 4; + + sector->sequence_high = (uint32_t) (seq >> 32); + sector->sequence_low = (uint32_t) (seq & 0xffffffff); + sector->data_signature = VHDX_LOG_DATA_SIGNATURE; + + vhdx_log_desc_le_export(desc); + vhdx_log_data_le_export(sector); +} + + +static int vhdx_log_write(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + void *buffer = NULL; + void *merged_sector = NULL; + void *data_tmp, *sector_write; + unsigned int i; + int sector_offset; + uint32_t desc_sectors, sectors, total_length; + uint32_t sectors_written = 0; + uint32_t aligned_length; + uint32_t leading_length = 0; + uint32_t trailing_length = 0; + uint32_t partial_sectors = 0; + uint32_t bytes_written = 0; + uint64_t file_offset; + VHDXHeader *header; + VHDXLogEntryHeader new_hdr; + VHDXLogDescriptor *new_desc = NULL; + VHDXLogDataSector *data_sector = NULL; + MSGUID new_guid = { 0 }; + + header = s->headers[s->curr_header]; + + /* need to have offset read data, and be on 4096 byte boundary */ + + if (length > header->log_length) { + /* no log present. we could create a log here instead of failing */ + ret = -EINVAL; + goto exit; + } + + if (guid_eq(header->log_guid, zero_guid)) { + vhdx_guid_generate(&new_guid); + vhdx_update_headers(bs, s, false, &new_guid); + } else { + /* currently, we require that the log be flushed after + * every write. */ + ret = -ENOTSUP; + goto exit; + } + + /* 0 is an invalid sequence number, but may also represent the first + * log write (or a wrapped seq) */ + if (s->log.sequence == 0) { + s->log.sequence = 1; + } + + sector_offset = offset % VHDX_LOG_SECTOR_SIZE; + file_offset = (offset / VHDX_LOG_SECTOR_SIZE) * VHDX_LOG_SECTOR_SIZE; + + aligned_length = length; + + /* add in the unaligned head and tail bytes */ + if (sector_offset) { + leading_length = (VHDX_LOG_SECTOR_SIZE - sector_offset); + leading_length = leading_length > length ? length : leading_length; + aligned_length -= leading_length; + partial_sectors++; + } + + sectors = aligned_length / VHDX_LOG_SECTOR_SIZE; + trailing_length = aligned_length - (sectors * VHDX_LOG_SECTOR_SIZE); + if (trailing_length) { + partial_sectors++; + } + + sectors += partial_sectors; + + /* sectors is now how many sectors the data itself takes, not + * including the header and descriptor metadata */ + + new_hdr = (VHDXLogEntryHeader) { + .signature = VHDX_LOG_SIGNATURE, + .tail = s->log.tail, + .sequence_number = s->log.sequence, + .descriptor_count = sectors, + .reserved = 0, + .flushed_file_offset = bdrv_getlength(bs->file->bs), + .last_file_offset = bdrv_getlength(bs->file->bs), + }; + + new_hdr.log_guid = header->log_guid; + + desc_sectors = vhdx_compute_desc_sectors(new_hdr.descriptor_count); + + total_length = (desc_sectors + sectors) * VHDX_LOG_SECTOR_SIZE; + new_hdr.entry_length = total_length; + + vhdx_log_entry_hdr_le_export(&new_hdr); + + buffer = qemu_blockalign(bs, total_length); + memcpy(buffer, &new_hdr, sizeof(new_hdr)); + + new_desc = buffer + sizeof(new_hdr); + data_sector = buffer + (desc_sectors * VHDX_LOG_SECTOR_SIZE); + data_tmp = data; + + /* All log sectors are 4KB, so for any partial sectors we must + * merge the data with preexisting data from the final file + * destination */ + merged_sector = qemu_blockalign(bs, VHDX_LOG_SECTOR_SIZE); + + for (i = 0; i < sectors; i++) { + new_desc->signature = VHDX_LOG_DESC_SIGNATURE; + new_desc->sequence_number = s->log.sequence; + new_desc->file_offset = file_offset; + + if (i == 0 && leading_length) { + /* partial sector at the front of the buffer */ + ret = bdrv_pread(bs->file->bs, file_offset, merged_sector, + VHDX_LOG_SECTOR_SIZE); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector + sector_offset, data_tmp, leading_length); + bytes_written = leading_length; + sector_write = merged_sector; + } else if (i == sectors - 1 && trailing_length) { + /* partial sector at the end of the buffer */ + ret = bdrv_pread(bs->file->bs, + file_offset, + merged_sector + trailing_length, + VHDX_LOG_SECTOR_SIZE - trailing_length); + if (ret < 0) { + goto exit; + } + memcpy(merged_sector, data_tmp, trailing_length); + bytes_written = trailing_length; + sector_write = merged_sector; + } else { + bytes_written = VHDX_LOG_SECTOR_SIZE; + sector_write = data_tmp; + } + + /* populate the raw sector data into the proper structures, + * as well as update the descriptor, and convert to proper + * endianness */ + vhdx_log_raw_to_le_sector(new_desc, data_sector, sector_write, + s->log.sequence); + + data_tmp += bytes_written; + data_sector++; + new_desc++; + file_offset += VHDX_LOG_SECTOR_SIZE; + } + + /* checksum covers entire entry, from the log header through the + * last data sector */ + vhdx_update_checksum(buffer, total_length, + offsetof(VHDXLogEntryHeader, checksum)); + + /* now write to the log */ + ret = vhdx_log_write_sectors(bs, &s->log, §ors_written, buffer, + desc_sectors + sectors); + if (ret < 0) { + goto exit; + } + + if (sectors_written != desc_sectors + sectors) { + /* instead of failing, we could flush the log here */ + ret = -EINVAL; + goto exit; + } + + s->log.sequence++; + /* write new tail */ + s->log.tail = s->log.write; + +exit: + qemu_vfree(buffer); + qemu_vfree(merged_sector); + return ret; +} + +/* Perform a log write, and then immediately flush the entire log */ +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset) +{ + int ret = 0; + VHDXLogSequence logs = { .valid = true, + .count = 1, + .hdr = { 0 } }; + + + /* Make sure data written (new and/or changed blocks) is stable + * on disk, before creating log entry */ + bdrv_flush(bs); + ret = vhdx_log_write(bs, s, data, length, offset); + if (ret < 0) { + goto exit; + } + logs.log = s->log; + + /* Make sure log is stable on disk */ + bdrv_flush(bs); + ret = vhdx_log_flush(bs, s, &logs); + if (ret < 0) { + goto exit; + } + + s->log = logs.log; + +exit: + return ret; +} + diff --git a/src/block/vhdx.c b/src/block/vhdx.c new file mode 100644 index 0000000..2fe9a5e --- /dev/null +++ b/src/block/vhdx.c @@ -0,0 +1,1982 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "qemu/crc32c.h" +#include "block/vhdx.h" +#include "migration/migration.h" + +#include <uuid/uuid.h> +#include <glib.h> + +/* Options for VHDX creation */ + +#define VHDX_BLOCK_OPT_LOG_SIZE "log_size" +#define VHDX_BLOCK_OPT_BLOCK_SIZE "block_size" +#define VHDX_BLOCK_OPT_ZERO "block_state_zero" + +typedef enum VHDXImageType { + VHDX_TYPE_DYNAMIC = 0, + VHDX_TYPE_FIXED, + VHDX_TYPE_DIFFERENCING, /* Currently unsupported */ +} VHDXImageType; + +/* Several metadata and region table data entries are identified by + * guids in a MS-specific GUID format. */ + + +/* ------- Known Region Table GUIDs ---------------------- */ +static const MSGUID bat_guid = { .data1 = 0x2dc27766, + .data2 = 0xf623, + .data3 = 0x4200, + .data4 = { 0x9d, 0x64, 0x11, 0x5e, + 0x9b, 0xfd, 0x4a, 0x08} }; + +static const MSGUID metadata_guid = { .data1 = 0x8b7ca206, + .data2 = 0x4790, + .data3 = 0x4b9a, + .data4 = { 0xb8, 0xfe, 0x57, 0x5f, + 0x05, 0x0f, 0x88, 0x6e} }; + + + +/* ------- Known Metadata Entry GUIDs ---------------------- */ +static const MSGUID file_param_guid = { .data1 = 0xcaa16737, + .data2 = 0xfa36, + .data3 = 0x4d43, + .data4 = { 0xb3, 0xb6, 0x33, 0xf0, + 0xaa, 0x44, 0xe7, 0x6b} }; + +static const MSGUID virtual_size_guid = { .data1 = 0x2FA54224, + .data2 = 0xcd1b, + .data3 = 0x4876, + .data4 = { 0xb2, 0x11, 0x5d, 0xbe, + 0xd8, 0x3b, 0xf4, 0xb8} }; + +static const MSGUID page83_guid = { .data1 = 0xbeca12ab, + .data2 = 0xb2e6, + .data3 = 0x4523, + .data4 = { 0x93, 0xef, 0xc3, 0x09, + 0xe0, 0x00, 0xc7, 0x46} }; + + +static const MSGUID phys_sector_guid = { .data1 = 0xcda348c7, + .data2 = 0x445d, + .data3 = 0x4471, + .data4 = { 0x9c, 0xc9, 0xe9, 0x88, + 0x52, 0x51, 0xc5, 0x56} }; + +static const MSGUID parent_locator_guid = { .data1 = 0xa8d35f2d, + .data2 = 0xb30b, + .data3 = 0x454d, + .data4 = { 0xab, 0xf7, 0xd3, + 0xd8, 0x48, 0x34, + 0xab, 0x0c} }; + +static const MSGUID logical_sector_guid = { .data1 = 0x8141bf1d, + .data2 = 0xa96f, + .data3 = 0x4709, + .data4 = { 0xba, 0x47, 0xf2, + 0x33, 0xa8, 0xfa, + 0xab, 0x5f} }; + +/* Each parent type must have a valid GUID; this is for parent images + * of type 'VHDX'. If we were to allow e.g. a QCOW2 parent, we would + * need to make up our own QCOW2 GUID type */ +static const MSGUID parent_vhdx_guid __attribute__((unused)) + = { .data1 = 0xb04aefb7, + .data2 = 0xd19e, + .data3 = 0x4a81, + .data4 = { 0xb7, 0x89, 0x25, 0xb8, + 0xe9, 0x44, 0x59, 0x13} }; + + +#define META_FILE_PARAMETER_PRESENT 0x01 +#define META_VIRTUAL_DISK_SIZE_PRESENT 0x02 +#define META_PAGE_83_PRESENT 0x04 +#define META_LOGICAL_SECTOR_SIZE_PRESENT 0x08 +#define META_PHYS_SECTOR_SIZE_PRESENT 0x10 +#define META_PARENT_LOCATOR_PRESENT 0x20 + +#define META_ALL_PRESENT \ + (META_FILE_PARAMETER_PRESENT | META_VIRTUAL_DISK_SIZE_PRESENT | \ + META_PAGE_83_PRESENT | META_LOGICAL_SECTOR_SIZE_PRESENT | \ + META_PHYS_SECTOR_SIZE_PRESENT) + + +typedef struct VHDXSectorInfo { + uint32_t bat_idx; /* BAT entry index */ + uint32_t sectors_avail; /* sectors available in payload block */ + uint32_t bytes_left; /* bytes left in the block after data to r/w */ + uint32_t bytes_avail; /* bytes available in payload block */ + uint64_t file_offset; /* absolute offset in bytes, in file */ + uint64_t block_offset; /* block offset, in bytes */ +} VHDXSectorInfo; + +/* Calculates new checksum. + * + * Zero is substituted during crc calculation for the original crc field + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * Note: The buffer should have all multi-byte data in little-endian format, + * and the resulting checksum is in little endian format. + */ +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset) +{ + uint32_t crc; + + assert(buf != NULL); + assert(size > (crc_offset + sizeof(crc))); + + memset(buf + crc_offset, 0, sizeof(crc)); + crc = crc32c(0xffffffff, buf, size); + cpu_to_le32s(&crc); + memcpy(buf + crc_offset, &crc, sizeof(crc)); + + return crc; +} + +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, + int crc_offset) +{ + uint32_t crc_new; + uint32_t crc_orig; + assert(buf != NULL); + + if (crc_offset > 0) { + memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); + memset(buf + crc_offset, 0, sizeof(crc_orig)); + } + + crc_new = crc32c(crc, buf, size); + if (crc_offset > 0) { + memcpy(buf + crc_offset, &crc_orig, sizeof(crc_orig)); + } + + return crc_new; +} + +/* Validates the checksum of the buffer, with an in-place CRC. + * + * Zero is substituted during crc calculation for the original crc field, + * and the crc field is restored afterwards. But the buffer will be modifed + * during the calculation, so this may not be not suitable for multi-threaded + * use. + * + * crc_offset: byte offset in buf of the buffer crc + * buf: buffer pointer + * size: size of buffer (must be > crc_offset+4) + * + * returns true if checksum is valid, false otherwise + */ +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset) +{ + uint32_t crc_orig; + uint32_t crc; + + assert(buf != NULL); + assert(size > (crc_offset + 4)); + + memcpy(&crc_orig, buf + crc_offset, sizeof(crc_orig)); + crc_orig = le32_to_cpu(crc_orig); + + crc = vhdx_checksum_calc(0xffffffff, buf, size, crc_offset); + + return crc == crc_orig; +} + + +/* + * This generates a UUID that is compliant with the MS GUIDs used + * in the VHDX spec (and elsewhere). + */ +void vhdx_guid_generate(MSGUID *guid) +{ + uuid_t uuid; + assert(guid != NULL); + + uuid_generate(uuid); + memcpy(guid, uuid, sizeof(MSGUID)); +} + +/* Check for region overlaps inside the VHDX image */ +static int vhdx_region_check(BDRVVHDXState *s, uint64_t start, uint64_t length) +{ + int ret = 0; + uint64_t end; + VHDXRegionEntry *r; + + end = start + length; + QLIST_FOREACH(r, &s->regions, entries) { + if (!((start >= r->end) || (end <= r->start))) { + ret = -EINVAL; + goto exit; + } + } + +exit: + return ret; +} + +/* Register a region for future checks */ +static void vhdx_region_register(BDRVVHDXState *s, + uint64_t start, uint64_t length) +{ + VHDXRegionEntry *r; + + r = g_malloc0(sizeof(*r)); + + r->start = start; + r->end = start + length; + + QLIST_INSERT_HEAD(&s->regions, r, entries); +} + +/* Free all registered regions */ +static void vhdx_region_unregister_all(BDRVVHDXState *s) +{ + VHDXRegionEntry *r, *r_next; + + QLIST_FOREACH_SAFE(r, &s->regions, entries, r_next) { + QLIST_REMOVE(r, entries); + g_free(r); + } +} + +static void vhdx_set_shift_bits(BDRVVHDXState *s) +{ + s->logical_sector_size_bits = 31 - clz32(s->logical_sector_size); + s->sectors_per_block_bits = 31 - clz32(s->sectors_per_block); + s->chunk_ratio_bits = 63 - clz64(s->chunk_ratio); + s->block_size_bits = 31 - clz32(s->block_size); +} + +/* + * Per the MS VHDX Specification, for every VHDX file: + * - The header section is fixed size - 1 MB + * - The header section is always the first "object" + * - The first 64KB of the header is the File Identifier + * - The first uint64 (8 bytes) is the VHDX Signature ("vhdxfile") + * - The following 512 bytes constitute a UTF-16 string identifiying the + * software that created the file, and is optional and diagnostic only. + * + * Therefore, we probe by looking for the vhdxfile signature "vhdxfile" + */ +static int vhdx_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + if (buf_size >= 8 && !memcmp(buf, "vhdxfile", 8)) { + return 100; + } + return 0; +} + +/* + * Writes the header to the specified offset. + * + * This will optionally read in buffer data from disk (otherwise zero-fill), + * and then update the header checksum. Header is converted to proper + * endianness before being written to the specified file offset + */ +static int vhdx_write_header(BlockDriverState *bs_file, VHDXHeader *hdr, + uint64_t offset, bool read) +{ + uint8_t *buffer = NULL; + int ret; + VHDXHeader *header_le; + + assert(bs_file != NULL); + assert(hdr != NULL); + + /* the header checksum is not over just the packed size of VHDXHeader, + * but rather over the entire 'reserved' range for the header, which is + * 4KB (VHDX_HEADER_SIZE). */ + + buffer = qemu_blockalign(bs_file, VHDX_HEADER_SIZE); + if (read) { + /* if true, we can't assume the extra reserved bytes are 0 */ + ret = bdrv_pread(bs_file, offset, buffer, VHDX_HEADER_SIZE); + if (ret < 0) { + goto exit; + } + } else { + memset(buffer, 0, VHDX_HEADER_SIZE); + } + + /* overwrite the actual VHDXHeader portion */ + header_le = (VHDXHeader *)buffer; + memcpy(header_le, hdr, sizeof(VHDXHeader)); + vhdx_header_le_export(hdr, header_le); + vhdx_update_checksum(buffer, VHDX_HEADER_SIZE, + offsetof(VHDXHeader, checksum)); + ret = bdrv_pwrite_sync(bs_file, offset, header_le, sizeof(VHDXHeader)); + +exit: + qemu_vfree(buffer); + return ret; +} + +/* Update the VHDX headers + * + * This follows the VHDX spec procedures for header updates. + * + * - non-current header is updated with largest sequence number + */ +static int vhdx_update_header(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) +{ + int ret = 0; + int hdr_idx = 0; + uint64_t header_offset = VHDX_HEADER1_OFFSET; + + VHDXHeader *active_header; + VHDXHeader *inactive_header; + + /* operate on the non-current header */ + if (s->curr_header == 0) { + hdr_idx = 1; + header_offset = VHDX_HEADER2_OFFSET; + } + + active_header = s->headers[s->curr_header]; + inactive_header = s->headers[hdr_idx]; + + inactive_header->sequence_number = active_header->sequence_number + 1; + + /* a new file guid must be generated before any file write, including + * headers */ + inactive_header->file_write_guid = s->session_guid; + + /* a new data guid only needs to be generated before any guest-visible + * writes (i.e. something observable via virtual disk read) */ + if (generate_data_write_guid) { + vhdx_guid_generate(&inactive_header->data_write_guid); + } + + /* update the log guid if present */ + if (log_guid) { + inactive_header->log_guid = *log_guid; + } + + ret = vhdx_write_header(bs->file->bs, inactive_header, header_offset, true); + if (ret < 0) { + goto exit; + } + s->curr_header = hdr_idx; + +exit: + return ret; +} + +/* + * The VHDX spec calls for header updates to be performed twice, so that both + * the current and non-current header have valid info + */ +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, + bool generate_data_write_guid, MSGUID *log_guid) +{ + int ret; + + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + if (ret < 0) { + return ret; + } + ret = vhdx_update_header(bs, s, generate_data_write_guid, log_guid); + return ret; +} + +/* opens the specified header block from the VHDX file header section */ +static void vhdx_parse_header(BlockDriverState *bs, BDRVVHDXState *s, + Error **errp) +{ + int ret; + VHDXHeader *header1; + VHDXHeader *header2; + bool h1_valid = false; + bool h2_valid = false; + uint64_t h1_seq = 0; + uint64_t h2_seq = 0; + uint8_t *buffer; + + /* header1 & header2 are freed in vhdx_close() */ + header1 = qemu_blockalign(bs, sizeof(VHDXHeader)); + header2 = qemu_blockalign(bs, sizeof(VHDXHeader)); + + buffer = qemu_blockalign(bs, VHDX_HEADER_SIZE); + + s->headers[0] = header1; + s->headers[1] = header2; + + /* We have to read the whole VHDX_HEADER_SIZE instead of + * sizeof(VHDXHeader), because the checksum is over the whole + * region */ + ret = bdrv_pread(bs->file->bs, VHDX_HEADER1_OFFSET, buffer, + VHDX_HEADER_SIZE); + if (ret < 0) { + goto fail; + } + /* copy over just the relevant portion that we need */ + memcpy(header1, buffer, sizeof(VHDXHeader)); + + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { + vhdx_header_le_import(header1); + if (header1->signature == VHDX_HEADER_SIGNATURE && + header1->version == 1) { + h1_seq = header1->sequence_number; + h1_valid = true; + } + } + + ret = bdrv_pread(bs->file->bs, VHDX_HEADER2_OFFSET, buffer, + VHDX_HEADER_SIZE); + if (ret < 0) { + goto fail; + } + /* copy over just the relevant portion that we need */ + memcpy(header2, buffer, sizeof(VHDXHeader)); + + if (vhdx_checksum_is_valid(buffer, VHDX_HEADER_SIZE, 4)) { + vhdx_header_le_import(header2); + if (header2->signature == VHDX_HEADER_SIGNATURE && + header2->version == 1) { + h2_seq = header2->sequence_number; + h2_valid = true; + } + } + + /* If there is only 1 valid header (or no valid headers), we + * don't care what the sequence numbers are */ + if (h1_valid && !h2_valid) { + s->curr_header = 0; + } else if (!h1_valid && h2_valid) { + s->curr_header = 1; + } else if (!h1_valid && !h2_valid) { + goto fail; + } else { + /* If both headers are valid, then we choose the active one by the + * highest sequence number. If the sequence numbers are equal, that is + * invalid */ + if (h1_seq > h2_seq) { + s->curr_header = 0; + } else if (h2_seq > h1_seq) { + s->curr_header = 1; + } else { + /* The Microsoft Disk2VHD tool will create 2 identical + * headers, with identical sequence numbers. If the headers are + * identical, don't consider the file corrupt */ + if (!memcmp(header1, header2, sizeof(VHDXHeader))) { + s->curr_header = 0; + } else { + goto fail; + } + } + } + + vhdx_region_register(s, s->headers[s->curr_header]->log_offset, + s->headers[s->curr_header]->log_length); + goto exit; + +fail: + error_setg_errno(errp, -ret, "No valid VHDX header found"); + qemu_vfree(header1); + qemu_vfree(header2); + s->headers[0] = NULL; + s->headers[1] = NULL; +exit: + qemu_vfree(buffer); +} + + +static int vhdx_open_region_tables(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + uint8_t *buffer; + int offset = 0; + VHDXRegionTableEntry rt_entry; + uint32_t i; + bool bat_rt_found = false; + bool metadata_rt_found = false; + + /* We have to read the whole 64KB block, because the crc32 is over the + * whole block */ + buffer = qemu_blockalign(bs, VHDX_HEADER_BLOCK_SIZE); + + ret = bdrv_pread(bs->file->bs, VHDX_REGION_TABLE_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto fail; + } + memcpy(&s->rt, buffer, sizeof(s->rt)); + offset += sizeof(s->rt); + + if (!vhdx_checksum_is_valid(buffer, VHDX_HEADER_BLOCK_SIZE, 4)) { + ret = -EINVAL; + goto fail; + } + + vhdx_region_header_le_import(&s->rt); + + if (s->rt.signature != VHDX_REGION_SIGNATURE) { + ret = -EINVAL; + goto fail; + } + + + /* Per spec, maximum region table entry count is 2047 */ + if (s->rt.entry_count > 2047) { + ret = -EINVAL; + goto fail; + } + + for (i = 0; i < s->rt.entry_count; i++) { + memcpy(&rt_entry, buffer + offset, sizeof(rt_entry)); + offset += sizeof(rt_entry); + + vhdx_region_entry_le_import(&rt_entry); + + /* check for region overlap between these entries, and any + * other memory regions in the file */ + ret = vhdx_region_check(s, rt_entry.file_offset, rt_entry.length); + if (ret < 0) { + goto fail; + } + + vhdx_region_register(s, rt_entry.file_offset, rt_entry.length); + + /* see if we recognize the entry */ + if (guid_eq(rt_entry.guid, bat_guid)) { + /* must be unique; if we have already found it this is invalid */ + if (bat_rt_found) { + ret = -EINVAL; + goto fail; + } + bat_rt_found = true; + s->bat_rt = rt_entry; + continue; + } + + if (guid_eq(rt_entry.guid, metadata_guid)) { + /* must be unique; if we have already found it this is invalid */ + if (metadata_rt_found) { + ret = -EINVAL; + goto fail; + } + metadata_rt_found = true; + s->metadata_rt = rt_entry; + continue; + } + + if (rt_entry.data_bits & VHDX_REGION_ENTRY_REQUIRED) { + /* cannot read vhdx file - required region table entry that + * we do not understand. per spec, we must fail to open */ + ret = -ENOTSUP; + goto fail; + } + } + + if (!bat_rt_found || !metadata_rt_found) { + ret = -EINVAL; + goto fail; + } + + ret = 0; + +fail: + qemu_vfree(buffer); + return ret; +} + + + +/* Metadata initial parser + * + * This loads all the metadata entry fields. This may cause additional + * fields to be processed (e.g. parent locator, etc..). + * + * There are 5 Metadata items that are always required: + * - File Parameters (block size, has a parent) + * - Virtual Disk Size (size, in bytes, of the virtual drive) + * - Page 83 Data (scsi page 83 guid) + * - Logical Sector Size (logical sector size in bytes, either 512 or + * 4096. We only support 512 currently) + * - Physical Sector Size (512 or 4096) + * + * Also, if the File Parameters indicate this is a differencing file, + * we must also look for the Parent Locator metadata item. + */ +static int vhdx_parse_metadata(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + uint8_t *buffer; + int offset = 0; + uint32_t i = 0; + VHDXMetadataTableEntry md_entry; + + buffer = qemu_blockalign(bs, VHDX_METADATA_TABLE_MAX_SIZE); + + ret = bdrv_pread(bs->file->bs, s->metadata_rt.file_offset, buffer, + VHDX_METADATA_TABLE_MAX_SIZE); + if (ret < 0) { + goto exit; + } + memcpy(&s->metadata_hdr, buffer, sizeof(s->metadata_hdr)); + offset += sizeof(s->metadata_hdr); + + vhdx_metadata_header_le_import(&s->metadata_hdr); + + if (s->metadata_hdr.signature != VHDX_METADATA_SIGNATURE) { + ret = -EINVAL; + goto exit; + } + + s->metadata_entries.present = 0; + + if ((s->metadata_hdr.entry_count * sizeof(md_entry)) > + (VHDX_METADATA_TABLE_MAX_SIZE - offset)) { + ret = -EINVAL; + goto exit; + } + + for (i = 0; i < s->metadata_hdr.entry_count; i++) { + memcpy(&md_entry, buffer + offset, sizeof(md_entry)); + offset += sizeof(md_entry); + + vhdx_metadata_entry_le_import(&md_entry); + + if (guid_eq(md_entry.item_id, file_param_guid)) { + if (s->metadata_entries.present & META_FILE_PARAMETER_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.file_parameters_entry = md_entry; + s->metadata_entries.present |= META_FILE_PARAMETER_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, virtual_size_guid)) { + if (s->metadata_entries.present & META_VIRTUAL_DISK_SIZE_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.virtual_disk_size_entry = md_entry; + s->metadata_entries.present |= META_VIRTUAL_DISK_SIZE_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, page83_guid)) { + if (s->metadata_entries.present & META_PAGE_83_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.page83_data_entry = md_entry; + s->metadata_entries.present |= META_PAGE_83_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, logical_sector_guid)) { + if (s->metadata_entries.present & + META_LOGICAL_SECTOR_SIZE_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.logical_sector_size_entry = md_entry; + s->metadata_entries.present |= META_LOGICAL_SECTOR_SIZE_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, phys_sector_guid)) { + if (s->metadata_entries.present & META_PHYS_SECTOR_SIZE_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.phys_sector_size_entry = md_entry; + s->metadata_entries.present |= META_PHYS_SECTOR_SIZE_PRESENT; + continue; + } + + if (guid_eq(md_entry.item_id, parent_locator_guid)) { + if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { + ret = -EINVAL; + goto exit; + } + s->metadata_entries.parent_locator_entry = md_entry; + s->metadata_entries.present |= META_PARENT_LOCATOR_PRESENT; + continue; + } + + if (md_entry.data_bits & VHDX_META_FLAGS_IS_REQUIRED) { + /* cannot read vhdx file - required region table entry that + * we do not understand. per spec, we must fail to open */ + ret = -ENOTSUP; + goto exit; + } + } + + if (s->metadata_entries.present != META_ALL_PRESENT) { + ret = -ENOTSUP; + goto exit; + } + + ret = bdrv_pread(bs->file->bs, + s->metadata_entries.file_parameters_entry.offset + + s->metadata_rt.file_offset, + &s->params, + sizeof(s->params)); + + if (ret < 0) { + goto exit; + } + + le32_to_cpus(&s->params.block_size); + le32_to_cpus(&s->params.data_bits); + + + /* We now have the file parameters, so we can tell if this is a + * differencing file (i.e.. has_parent), is dynamic or fixed + * sized (leave_blocks_allocated), and the block size */ + + /* The parent locator required iff the file parameters has_parent set */ + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + if (s->metadata_entries.present & META_PARENT_LOCATOR_PRESENT) { + /* TODO: parse parent locator fields */ + ret = -ENOTSUP; /* temp, until differencing files are supported */ + goto exit; + } else { + /* if has_parent is set, but there is not parent locator present, + * then that is an invalid combination */ + ret = -EINVAL; + goto exit; + } + } + + /* determine virtual disk size, logical sector size, + * and phys sector size */ + + ret = bdrv_pread(bs->file->bs, + s->metadata_entries.virtual_disk_size_entry.offset + + s->metadata_rt.file_offset, + &s->virtual_disk_size, + sizeof(uint64_t)); + if (ret < 0) { + goto exit; + } + ret = bdrv_pread(bs->file->bs, + s->metadata_entries.logical_sector_size_entry.offset + + s->metadata_rt.file_offset, + &s->logical_sector_size, + sizeof(uint32_t)); + if (ret < 0) { + goto exit; + } + ret = bdrv_pread(bs->file->bs, + s->metadata_entries.phys_sector_size_entry.offset + + s->metadata_rt.file_offset, + &s->physical_sector_size, + sizeof(uint32_t)); + if (ret < 0) { + goto exit; + } + + le64_to_cpus(&s->virtual_disk_size); + le32_to_cpus(&s->logical_sector_size); + le32_to_cpus(&s->physical_sector_size); + + if (s->params.block_size < VHDX_BLOCK_SIZE_MIN || + s->params.block_size > VHDX_BLOCK_SIZE_MAX) { + ret = -EINVAL; + goto exit; + } + + /* only 2 supported sector sizes */ + if (s->logical_sector_size != 512 && s->logical_sector_size != 4096) { + ret = -EINVAL; + goto exit; + } + + /* Both block_size and sector_size are guaranteed powers of 2, below. + Due to range checks above, s->sectors_per_block can never be < 256 */ + s->sectors_per_block = s->params.block_size / s->logical_sector_size; + s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * + (uint64_t)s->logical_sector_size / + (uint64_t)s->params.block_size; + + /* These values are ones we will want to use for division / multiplication + * later on, and they are all guaranteed (per the spec) to be powers of 2, + * so we can take advantage of that for shift operations during + * reads/writes */ + if (s->logical_sector_size & (s->logical_sector_size - 1)) { + ret = -EINVAL; + goto exit; + } + if (s->sectors_per_block & (s->sectors_per_block - 1)) { + ret = -EINVAL; + goto exit; + } + if (s->chunk_ratio & (s->chunk_ratio - 1)) { + ret = -EINVAL; + goto exit; + } + s->block_size = s->params.block_size; + if (s->block_size & (s->block_size - 1)) { + ret = -EINVAL; + goto exit; + } + + vhdx_set_shift_bits(s); + + ret = 0; + +exit: + qemu_vfree(buffer); + return ret; +} + +/* + * Calculate the number of BAT entries, including sector + * bitmap entries. + */ +static void vhdx_calc_bat_entries(BDRVVHDXState *s) +{ + uint32_t data_blocks_cnt, bitmap_blocks_cnt; + + data_blocks_cnt = s->virtual_disk_size >> s->block_size_bits; + if (s->virtual_disk_size - (data_blocks_cnt << s->block_size_bits)) { + data_blocks_cnt++; + } + bitmap_blocks_cnt = data_blocks_cnt >> s->chunk_ratio_bits; + if (data_blocks_cnt - (bitmap_blocks_cnt << s->chunk_ratio_bits)) { + bitmap_blocks_cnt++; + } + + if (s->parent_entries) { + s->bat_entries = bitmap_blocks_cnt * (s->chunk_ratio + 1); + } else { + s->bat_entries = data_blocks_cnt + + ((data_blocks_cnt - 1) >> s->chunk_ratio_bits); + } + +} + +static void vhdx_close(BlockDriverState *bs) +{ + BDRVVHDXState *s = bs->opaque; + qemu_vfree(s->headers[0]); + s->headers[0] = NULL; + qemu_vfree(s->headers[1]); + s->headers[1] = NULL; + qemu_vfree(s->bat); + s->bat = NULL; + qemu_vfree(s->parent_entries); + s->parent_entries = NULL; + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + qemu_vfree(s->log.hdr); + s->log.hdr = NULL; + vhdx_region_unregister_all(s); +} + +static int vhdx_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVVHDXState *s = bs->opaque; + int ret = 0; + uint32_t i; + uint64_t signature; + Error *local_err = NULL; + + s->bat = NULL; + s->first_visible_write = true; + + qemu_co_mutex_init(&s->lock); + QLIST_INIT(&s->regions); + + /* validate the file signature */ + ret = bdrv_pread(bs->file->bs, 0, &signature, sizeof(uint64_t)); + if (ret < 0) { + goto fail; + } + if (memcmp(&signature, "vhdxfile", 8)) { + ret = -EINVAL; + goto fail; + } + + /* This is used for any header updates, for the file_write_guid. + * The spec dictates that a new value should be used for the first + * header update */ + vhdx_guid_generate(&s->session_guid); + + vhdx_parse_header(bs, s, &local_err); + if (local_err != NULL) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + ret = vhdx_parse_log(bs, s, &s->log_replayed_on_open, errp); + if (ret < 0) { + goto fail; + } + + ret = vhdx_open_region_tables(bs, s); + if (ret < 0) { + goto fail; + } + + ret = vhdx_parse_metadata(bs, s); + if (ret < 0) { + goto fail; + } + + s->block_size = s->params.block_size; + + /* the VHDX spec dictates that virtual_disk_size is always a multiple of + * logical_sector_size */ + bs->total_sectors = s->virtual_disk_size >> s->logical_sector_size_bits; + + vhdx_calc_bat_entries(s); + + s->bat_offset = s->bat_rt.file_offset; + + if (s->bat_entries > s->bat_rt.length / sizeof(VHDXBatEntry)) { + /* BAT allocation is not large enough for all entries */ + ret = -EINVAL; + goto fail; + } + + /* s->bat is freed in vhdx_close() */ + s->bat = qemu_try_blockalign(bs->file->bs, s->bat_rt.length); + if (s->bat == NULL) { + ret = -ENOMEM; + goto fail; + } + + ret = bdrv_pread(bs->file->bs, s->bat_offset, s->bat, s->bat_rt.length); + if (ret < 0) { + goto fail; + } + + uint64_t payblocks = s->chunk_ratio; + /* endian convert, and verify populated BAT field file offsets against + * region table and log entries */ + for (i = 0; i < s->bat_entries; i++) { + le64_to_cpus(&s->bat[i]); + if (payblocks--) { + /* payload bat entries */ + if ((s->bat[i] & VHDX_BAT_STATE_BIT_MASK) == + PAYLOAD_BLOCK_FULLY_PRESENT) { + ret = vhdx_region_check(s, s->bat[i] & VHDX_BAT_FILE_OFF_MASK, + s->block_size); + if (ret < 0) { + goto fail; + } + } + } else { + payblocks = s->chunk_ratio; + /* Once differencing files are supported, verify sector bitmap + * blocks here */ + } + } + + if (flags & BDRV_O_RDWR) { + ret = vhdx_update_headers(bs, s, false, NULL); + if (ret < 0) { + goto fail; + } + } + + /* TODO: differencing files */ + + /* Disable migration when VHDX images are used */ + error_setg(&s->migration_blocker, "The vhdx format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); + migrate_add_blocker(s->migration_blocker); + + return 0; +fail: + vhdx_close(bs); + return ret; +} + +static int vhdx_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + + +/* + * Perform sector to block offset translations, to get various + * sector and file offsets into the image. See VHDXSectorInfo + */ +static void vhdx_block_translate(BDRVVHDXState *s, int64_t sector_num, + int nb_sectors, VHDXSectorInfo *sinfo) +{ + uint32_t block_offset; + + sinfo->bat_idx = sector_num >> s->sectors_per_block_bits; + /* effectively a modulo - this gives us the offset into the block + * (in sector sizes) for our sector number */ + block_offset = sector_num - (sinfo->bat_idx << s->sectors_per_block_bits); + /* the chunk ratio gives us the interleaving of the sector + * bitmaps, so we need to advance our page block index by the + * sector bitmaps entry number */ + sinfo->bat_idx += sinfo->bat_idx >> s->chunk_ratio_bits; + + /* the number of sectors we can read/write in this cycle */ + sinfo->sectors_avail = s->sectors_per_block - block_offset; + + sinfo->bytes_left = sinfo->sectors_avail << s->logical_sector_size_bits; + + if (sinfo->sectors_avail > nb_sectors) { + sinfo->sectors_avail = nb_sectors; + } + + sinfo->bytes_avail = sinfo->sectors_avail << s->logical_sector_size_bits; + + sinfo->file_offset = s->bat[sinfo->bat_idx] & VHDX_BAT_FILE_OFF_MASK; + + sinfo->block_offset = block_offset << s->logical_sector_size_bits; + + /* The file offset must be past the header section, so must be > 0 */ + if (sinfo->file_offset == 0) { + return; + } + + /* block offset is the offset in vhdx logical sectors, in + * the payload data block. Convert that to a byte offset + * in the block, and add in the payload data block offset + * in the file, in bytes, to get the final read address */ + + sinfo->file_offset += sinfo->block_offset; +} + + +static int vhdx_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVHDXState *s = bs->opaque; + + bdi->cluster_size = s->block_size; + + bdi->unallocated_blocks_are_zero = + (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) == 0; + + return 0; +} + + +static coroutine_fn int vhdx_co_readv(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + BDRVVHDXState *s = bs->opaque; + int ret = 0; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + QEMUIOVector hd_qiov; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + while (nb_sectors > 0) { + /* We are a differencing file, so we need to inspect the sector bitmap + * to see if we have the data or not */ + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + + qemu_iovec_reset(&hd_qiov); + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, sinfo.bytes_avail); + + /* check the payload block state */ + switch (s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK) { + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNDEFINED: + case PAYLOAD_BLOCK_UNMAPPED: + case PAYLOAD_BLOCK_UNMAPPED_v095: + case PAYLOAD_BLOCK_ZERO: + /* return zero */ + qemu_iovec_memset(&hd_qiov, 0, 0, sinfo.bytes_avail); + break; + case PAYLOAD_BLOCK_FULLY_PRESENT: + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_readv(bs->file->bs, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sinfo.sectors_avail, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto exit; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + } + } + ret = 0; +exit: + qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&hd_qiov); + return ret; +} + +/* + * Allocate a new payload block at the end of the file. + * + * Allocation will happen at 1MB alignment inside the file + * + * Returns the file offset start of the new payload block + */ +static int vhdx_allocate_block(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t *new_offset) +{ + *new_offset = bdrv_getlength(bs->file->bs); + + /* per the spec, the address for a block is in units of 1MB */ + *new_offset = ROUND_UP(*new_offset, 1024 * 1024); + + return bdrv_truncate(bs->file->bs, *new_offset + s->block_size); +} + +/* + * Update the BAT table entry with the new file offset, and the new entry + * state */ +static void vhdx_update_bat_table_entry(BlockDriverState *bs, BDRVVHDXState *s, + VHDXSectorInfo *sinfo, + uint64_t *bat_entry_le, + uint64_t *bat_offset, int state) +{ + /* The BAT entry is a uint64, with 44 bits for the file offset in units of + * 1MB, and 3 bits for the block state. */ + if ((state == PAYLOAD_BLOCK_ZERO) || + (state == PAYLOAD_BLOCK_UNDEFINED) || + (state == PAYLOAD_BLOCK_NOT_PRESENT) || + (state == PAYLOAD_BLOCK_UNMAPPED)) { + s->bat[sinfo->bat_idx] = 0; /* For PAYLOAD_BLOCK_ZERO, the + FileOffsetMB field is denoted as + 'reserved' in the v1.0 spec. If it is + non-zero, MS Hyper-V will fail to read + the disk image */ + } else { + s->bat[sinfo->bat_idx] = sinfo->file_offset; + } + + s->bat[sinfo->bat_idx] |= state & VHDX_BAT_STATE_BIT_MASK; + + *bat_entry_le = cpu_to_le64(s->bat[sinfo->bat_idx]); + *bat_offset = s->bat_offset + sinfo->bat_idx * sizeof(VHDXBatEntry); + +} + +/* Per the spec, on the first write of guest-visible data to the file the + * data write guid must be updated in the header */ +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s) +{ + int ret = 0; + if (s->first_visible_write) { + s->first_visible_write = false; + ret = vhdx_update_headers(bs, s, true, NULL); + } + return ret; +} + +static coroutine_fn int vhdx_co_writev(BlockDriverState *bs, int64_t sector_num, + int nb_sectors, QEMUIOVector *qiov) +{ + int ret = -ENOTSUP; + BDRVVHDXState *s = bs->opaque; + VHDXSectorInfo sinfo; + uint64_t bytes_done = 0; + uint64_t bat_entry = 0; + uint64_t bat_entry_offset = 0; + QEMUIOVector hd_qiov; + struct iovec iov1 = { 0 }; + struct iovec iov2 = { 0 }; + int sectors_to_write; + int bat_state; + uint64_t bat_prior_offset = 0; + bool bat_update = false; + + qemu_iovec_init(&hd_qiov, qiov->niov); + + qemu_co_mutex_lock(&s->lock); + + ret = vhdx_user_visible_write(bs, s); + if (ret < 0) { + goto exit; + } + + while (nb_sectors > 0) { + bool use_zero_buffers = false; + bat_update = false; + if (s->params.data_bits & VHDX_PARAMS_HAS_PARENT) { + /* not supported yet */ + ret = -ENOTSUP; + goto exit; + } else { + vhdx_block_translate(s, sector_num, nb_sectors, &sinfo); + sectors_to_write = sinfo.sectors_avail; + + qemu_iovec_reset(&hd_qiov); + /* check the payload block state */ + bat_state = s->bat[sinfo.bat_idx] & VHDX_BAT_STATE_BIT_MASK; + switch (bat_state) { + case PAYLOAD_BLOCK_ZERO: + /* in this case, we need to preserve zero writes for + * data that is not part of this write, so we must pad + * the rest of the buffer to zeroes */ + + /* if we are on a posix system with ftruncate() that extends + * a file, then it is zero-filled for us. On Win32, the raw + * layer uses SetFilePointer and SetFileEnd, which does not + * zero fill AFAIK */ + + /* Queue another write of zero buffers if the underlying file + * does not zero-fill on file extension */ + + if (bdrv_has_zero_init(bs->file->bs) == 0) { + use_zero_buffers = true; + + /* zero fill the front, if any */ + if (sinfo.block_offset) { + iov1.iov_len = sinfo.block_offset; + iov1.iov_base = qemu_blockalign(bs, iov1.iov_len); + memset(iov1.iov_base, 0, iov1.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov1, 1, 0, + iov1.iov_len); + sectors_to_write += iov1.iov_len >> BDRV_SECTOR_BITS; + } + + /* our actual data */ + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + + /* zero fill the back, if any */ + if ((sinfo.bytes_avail - sinfo.block_offset) < + s->block_size) { + iov2.iov_len = s->block_size - + (sinfo.bytes_avail + sinfo.block_offset); + iov2.iov_base = qemu_blockalign(bs, iov2.iov_len); + memset(iov2.iov_base, 0, iov2.iov_len); + qemu_iovec_concat_iov(&hd_qiov, &iov2, 1, 0, + iov2.iov_len); + sectors_to_write += iov2.iov_len >> BDRV_SECTOR_BITS; + } + } + /* fall through */ + case PAYLOAD_BLOCK_NOT_PRESENT: /* fall through */ + case PAYLOAD_BLOCK_UNMAPPED: + case PAYLOAD_BLOCK_UNMAPPED_v095: + case PAYLOAD_BLOCK_UNDEFINED: + bat_prior_offset = sinfo.file_offset; + ret = vhdx_allocate_block(bs, s, &sinfo.file_offset); + if (ret < 0) { + goto exit; + } + /* once we support differencing files, this may also be + * partially present */ + /* update block state to the newly specified state */ + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, + PAYLOAD_BLOCK_FULLY_PRESENT); + bat_update = true; + /* since we just allocated a block, file_offset is the + * beginning of the payload block. It needs to be the + * write address, which includes the offset into the block */ + if (!use_zero_buffers) { + sinfo.file_offset += sinfo.block_offset; + } + /* fall through */ + case PAYLOAD_BLOCK_FULLY_PRESENT: + /* if the file offset address is in the header zone, + * there is a problem */ + if (sinfo.file_offset < (1024 * 1024)) { + ret = -EFAULT; + goto error_bat_restore; + } + + if (!use_zero_buffers) { + qemu_iovec_concat(&hd_qiov, qiov, bytes_done, + sinfo.bytes_avail); + } + /* block exists, so we can just overwrite it */ + qemu_co_mutex_unlock(&s->lock); + ret = bdrv_co_writev(bs->file->bs, + sinfo.file_offset >> BDRV_SECTOR_BITS, + sectors_to_write, &hd_qiov); + qemu_co_mutex_lock(&s->lock); + if (ret < 0) { + goto error_bat_restore; + } + break; + case PAYLOAD_BLOCK_PARTIALLY_PRESENT: + /* we don't yet support difference files, fall through + * to error */ + default: + ret = -EIO; + goto exit; + break; + } + + if (bat_update) { + /* this will update the BAT entry into the log journal, and + * then flush the log journal out to disk */ + ret = vhdx_log_write_and_flush(bs, s, &bat_entry, + sizeof(VHDXBatEntry), + bat_entry_offset); + if (ret < 0) { + goto exit; + } + } + + nb_sectors -= sinfo.sectors_avail; + sector_num += sinfo.sectors_avail; + bytes_done += sinfo.bytes_avail; + + } + } + + goto exit; + +error_bat_restore: + if (bat_update) { + /* keep metadata in sync, and restore the bat entry state + * if error. */ + sinfo.file_offset = bat_prior_offset; + vhdx_update_bat_table_entry(bs, s, &sinfo, &bat_entry, + &bat_entry_offset, bat_state); + } +exit: + qemu_vfree(iov1.iov_base); + qemu_vfree(iov2.iov_base); + qemu_co_mutex_unlock(&s->lock); + qemu_iovec_destroy(&hd_qiov); + return ret; +} + + + +/* + * Create VHDX Headers + * + * There are 2 headers, and the highest sequence number will represent + * the active header + */ +static int vhdx_create_new_headers(BlockDriverState *bs, uint64_t image_size, + uint32_t log_size) +{ + int ret = 0; + VHDXHeader *hdr = NULL; + + hdr = g_new0(VHDXHeader, 1); + + hdr->signature = VHDX_HEADER_SIGNATURE; + hdr->sequence_number = g_random_int(); + hdr->log_version = 0; + hdr->version = 1; + hdr->log_length = log_size; + hdr->log_offset = VHDX_HEADER_SECTION_END; + vhdx_guid_generate(&hdr->file_write_guid); + vhdx_guid_generate(&hdr->data_write_guid); + + ret = vhdx_write_header(bs, hdr, VHDX_HEADER1_OFFSET, false); + if (ret < 0) { + goto exit; + } + hdr->sequence_number++; + ret = vhdx_write_header(bs, hdr, VHDX_HEADER2_OFFSET, false); + if (ret < 0) { + goto exit; + } + +exit: + g_free(hdr); + return ret; +} + +#define VHDX_METADATA_ENTRY_BUFFER_SIZE \ + (sizeof(VHDXFileParameters) +\ + sizeof(VHDXVirtualDiskSize) +\ + sizeof(VHDXPage83Data) +\ + sizeof(VHDXVirtualDiskLogicalSectorSize) +\ + sizeof(VHDXVirtualDiskPhysicalSectorSize)) + +/* + * Create the Metadata entries. + * + * For more details on the entries, see section 3.5 (pg 29) in the + * VHDX 1.00 specification. + * + * We support 5 metadata entries (all required by spec): + * File Parameters, + * Virtual Disk Size, + * Page 83 Data, + * Logical Sector Size, + * Physical Sector Size + * + * The first 64KB of the Metadata section is reserved for the metadata + * header and entries; beyond that, the metadata items themselves reside. + */ +static int vhdx_create_new_metadata(BlockDriverState *bs, + uint64_t image_size, + uint32_t block_size, + uint32_t sector_size, + uint64_t metadata_offset, + VHDXImageType type) +{ + int ret = 0; + uint32_t offset = 0; + void *buffer = NULL; + void *entry_buffer; + VHDXMetadataTableHeader *md_table; + VHDXMetadataTableEntry *md_table_entry; + + /* Metadata entries */ + VHDXFileParameters *mt_file_params; + VHDXVirtualDiskSize *mt_virtual_size; + VHDXPage83Data *mt_page83; + VHDXVirtualDiskLogicalSectorSize *mt_log_sector_size; + VHDXVirtualDiskPhysicalSectorSize *mt_phys_sector_size; + + entry_buffer = g_malloc0(VHDX_METADATA_ENTRY_BUFFER_SIZE); + + mt_file_params = entry_buffer; + offset += sizeof(VHDXFileParameters); + mt_virtual_size = entry_buffer + offset; + offset += sizeof(VHDXVirtualDiskSize); + mt_page83 = entry_buffer + offset; + offset += sizeof(VHDXPage83Data); + mt_log_sector_size = entry_buffer + offset; + offset += sizeof(VHDXVirtualDiskLogicalSectorSize); + mt_phys_sector_size = entry_buffer + offset; + + mt_file_params->block_size = cpu_to_le32(block_size); + if (type == VHDX_TYPE_FIXED) { + mt_file_params->data_bits |= VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED; + cpu_to_le32s(&mt_file_params->data_bits); + } + + vhdx_guid_generate(&mt_page83->page_83_data); + cpu_to_leguids(&mt_page83->page_83_data); + mt_virtual_size->virtual_disk_size = cpu_to_le64(image_size); + mt_log_sector_size->logical_sector_size = cpu_to_le32(sector_size); + mt_phys_sector_size->physical_sector_size = cpu_to_le32(sector_size); + + buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); + md_table = buffer; + + md_table->signature = VHDX_METADATA_SIGNATURE; + md_table->entry_count = 5; + vhdx_metadata_header_le_export(md_table); + + + /* This will reference beyond the reserved table portion */ + offset = 64 * KiB; + + md_table_entry = buffer + sizeof(VHDXMetadataTableHeader); + + md_table_entry[0].item_id = file_param_guid; + md_table_entry[0].offset = offset; + md_table_entry[0].length = sizeof(VHDXFileParameters); + md_table_entry[0].data_bits |= VHDX_META_FLAGS_IS_REQUIRED; + offset += md_table_entry[0].length; + vhdx_metadata_entry_le_export(&md_table_entry[0]); + + md_table_entry[1].item_id = virtual_size_guid; + md_table_entry[1].offset = offset; + md_table_entry[1].length = sizeof(VHDXVirtualDiskSize); + md_table_entry[1].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + offset += md_table_entry[1].length; + vhdx_metadata_entry_le_export(&md_table_entry[1]); + + md_table_entry[2].item_id = page83_guid; + md_table_entry[2].offset = offset; + md_table_entry[2].length = sizeof(VHDXPage83Data); + md_table_entry[2].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + offset += md_table_entry[2].length; + vhdx_metadata_entry_le_export(&md_table_entry[2]); + + md_table_entry[3].item_id = logical_sector_guid; + md_table_entry[3].offset = offset; + md_table_entry[3].length = sizeof(VHDXVirtualDiskLogicalSectorSize); + md_table_entry[3].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + offset += md_table_entry[3].length; + vhdx_metadata_entry_le_export(&md_table_entry[3]); + + md_table_entry[4].item_id = phys_sector_guid; + md_table_entry[4].offset = offset; + md_table_entry[4].length = sizeof(VHDXVirtualDiskPhysicalSectorSize); + md_table_entry[4].data_bits |= VHDX_META_FLAGS_IS_REQUIRED | + VHDX_META_FLAGS_IS_VIRTUAL_DISK; + vhdx_metadata_entry_le_export(&md_table_entry[4]); + + ret = bdrv_pwrite(bs, metadata_offset, buffer, VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + + ret = bdrv_pwrite(bs, metadata_offset + (64 * KiB), entry_buffer, + VHDX_METADATA_ENTRY_BUFFER_SIZE); + if (ret < 0) { + goto exit; + } + + +exit: + g_free(buffer); + g_free(entry_buffer); + return ret; +} + +/* This create the actual BAT itself. We currently only support + * 'Dynamic' and 'Fixed' image types. + * + * Dynamic images: default state of the BAT is all zeroes. + * + * Fixed images: default state of the BAT is fully populated, with + * file offsets and state PAYLOAD_BLOCK_FULLY_PRESENT. + */ +static int vhdx_create_bat(BlockDriverState *bs, BDRVVHDXState *s, + uint64_t image_size, VHDXImageType type, + bool use_zero_blocks, uint64_t file_offset, + uint32_t length) +{ + int ret = 0; + uint64_t data_file_offset; + uint64_t total_sectors = 0; + uint64_t sector_num = 0; + uint64_t unused; + int block_state; + VHDXSectorInfo sinfo; + + assert(s->bat == NULL); + + /* this gives a data start after BAT/bitmap entries, and well + * past any metadata entries (with a 4 MB buffer for future + * expansion */ + data_file_offset = file_offset + length + 5 * MiB; + total_sectors = image_size >> s->logical_sector_size_bits; + + if (type == VHDX_TYPE_DYNAMIC) { + /* All zeroes, so we can just extend the file - the end of the BAT + * is the furthest thing we have written yet */ + ret = bdrv_truncate(bs, data_file_offset); + if (ret < 0) { + goto exit; + } + } else if (type == VHDX_TYPE_FIXED) { + ret = bdrv_truncate(bs, data_file_offset + image_size); + if (ret < 0) { + goto exit; + } + } else { + ret = -ENOTSUP; + goto exit; + } + + if (type == VHDX_TYPE_FIXED || + use_zero_blocks || + bdrv_has_zero_init(bs) == 0) { + /* for a fixed file, the default BAT entry is not zero */ + s->bat = g_try_malloc0(length); + if (length && s->bat == NULL) { + ret = -ENOMEM; + goto exit; + } + block_state = type == VHDX_TYPE_FIXED ? PAYLOAD_BLOCK_FULLY_PRESENT : + PAYLOAD_BLOCK_NOT_PRESENT; + block_state = use_zero_blocks ? PAYLOAD_BLOCK_ZERO : block_state; + /* fill the BAT by emulating sector writes of sectors_per_block size */ + while (sector_num < total_sectors) { + vhdx_block_translate(s, sector_num, s->sectors_per_block, &sinfo); + sinfo.file_offset = data_file_offset + + (sector_num << s->logical_sector_size_bits); + sinfo.file_offset = ROUND_UP(sinfo.file_offset, MiB); + vhdx_update_bat_table_entry(bs, s, &sinfo, &unused, &unused, + block_state); + cpu_to_le64s(&s->bat[sinfo.bat_idx]); + sector_num += s->sectors_per_block; + } + ret = bdrv_pwrite(bs, file_offset, s->bat, length); + if (ret < 0) { + goto exit; + } + } + + + +exit: + g_free(s->bat); + return ret; +} + +/* Creates the region table header, and region table entries. + * There are 2 supported region table entries: BAT, and Metadata/ + * + * As the calculations for the BAT region table are also needed + * to create the BAT itself, we will also cause the BAT to be + * created. + */ +static int vhdx_create_new_region_table(BlockDriverState *bs, + uint64_t image_size, + uint32_t block_size, + uint32_t sector_size, + uint32_t log_size, + bool use_zero_blocks, + VHDXImageType type, + uint64_t *metadata_offset) +{ + int ret = 0; + uint32_t offset = 0; + void *buffer = NULL; + uint64_t bat_file_offset; + uint32_t bat_length; + BDRVVHDXState *s = NULL; + VHDXRegionTableHeader *region_table; + VHDXRegionTableEntry *rt_bat; + VHDXRegionTableEntry *rt_metadata; + + assert(metadata_offset != NULL); + + /* Populate enough of the BDRVVHDXState to be able to use the + * pre-existing BAT calculation, translation, and update functions */ + s = g_new0(BDRVVHDXState, 1); + + s->chunk_ratio = (VHDX_MAX_SECTORS_PER_BLOCK) * + (uint64_t) sector_size / (uint64_t) block_size; + + s->sectors_per_block = block_size / sector_size; + s->virtual_disk_size = image_size; + s->block_size = block_size; + s->logical_sector_size = sector_size; + + vhdx_set_shift_bits(s); + + vhdx_calc_bat_entries(s); + + /* At this point the VHDX state is populated enough for creation */ + + /* a single buffer is used so we can calculate the checksum over the + * entire 64KB block */ + buffer = g_malloc0(VHDX_HEADER_BLOCK_SIZE); + region_table = buffer; + offset += sizeof(VHDXRegionTableHeader); + rt_bat = buffer + offset; + offset += sizeof(VHDXRegionTableEntry); + rt_metadata = buffer + offset; + + region_table->signature = VHDX_REGION_SIGNATURE; + region_table->entry_count = 2; /* BAT and Metadata */ + + rt_bat->guid = bat_guid; + rt_bat->length = ROUND_UP(s->bat_entries * sizeof(VHDXBatEntry), MiB); + rt_bat->file_offset = ROUND_UP(VHDX_HEADER_SECTION_END + log_size, MiB); + s->bat_offset = rt_bat->file_offset; + + rt_metadata->guid = metadata_guid; + rt_metadata->file_offset = ROUND_UP(rt_bat->file_offset + rt_bat->length, + MiB); + rt_metadata->length = 1 * MiB; /* min size, and more than enough */ + *metadata_offset = rt_metadata->file_offset; + + bat_file_offset = rt_bat->file_offset; + bat_length = rt_bat->length; + + vhdx_region_header_le_export(region_table); + vhdx_region_entry_le_export(rt_bat); + vhdx_region_entry_le_export(rt_metadata); + + vhdx_update_checksum(buffer, VHDX_HEADER_BLOCK_SIZE, + offsetof(VHDXRegionTableHeader, checksum)); + + + /* The region table gives us the data we need to create the BAT, + * so do that now */ + ret = vhdx_create_bat(bs, s, image_size, type, use_zero_blocks, + bat_file_offset, bat_length); + if (ret < 0) { + goto exit; + } + + /* Now write out the region headers to disk */ + ret = bdrv_pwrite(bs, VHDX_REGION_TABLE_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + + ret = bdrv_pwrite(bs, VHDX_REGION_TABLE2_OFFSET, buffer, + VHDX_HEADER_BLOCK_SIZE); + if (ret < 0) { + goto exit; + } + +exit: + g_free(s); + g_free(buffer); + return ret; +} + +/* We need to create the following elements: + * + * .-----------------------------------------------------------------. + * | (A) | (B) | (C) | (D) | (E) | + * | File ID | Header1 | Header 2 | Region Tbl 1 | Region Tbl 2 | + * | | | | | | + * .-----------------------------------------------------------------. + * 0 64KB 128KB 192KB 256KB 320KB + * + * + * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. + * | (F) | (G) | (H) | | + * | Journal Log | BAT / Bitmap | Metadata | .... data ...... | + * | | | | | + * .---- ~ ----------- ~ ------------ ~ ---------------- ~ -----------. + * 1MB + */ +static int vhdx_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int ret = 0; + uint64_t image_size = (uint64_t) 2 * GiB; + uint32_t log_size = 1 * MiB; + uint32_t block_size = 0; + uint64_t signature; + uint64_t metadata_offset; + bool use_zero_blocks = false; + + gunichar2 *creator = NULL; + glong creator_items; + BlockDriverState *bs; + char *type = NULL; + VHDXImageType image_type; + Error *local_err = NULL; + + image_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + log_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_LOG_SIZE, 0); + block_size = qemu_opt_get_size_del(opts, VHDX_BLOCK_OPT_BLOCK_SIZE, 0); + type = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); + use_zero_blocks = qemu_opt_get_bool_del(opts, VHDX_BLOCK_OPT_ZERO, true); + + if (image_size > VHDX_MAX_IMAGE_SIZE) { + error_setg_errno(errp, EINVAL, "Image size too large; max of 64TB"); + ret = -EINVAL; + goto exit; + } + + if (type == NULL) { + type = g_strdup("dynamic"); + } + + if (!strcmp(type, "dynamic")) { + image_type = VHDX_TYPE_DYNAMIC; + } else if (!strcmp(type, "fixed")) { + image_type = VHDX_TYPE_FIXED; + } else if (!strcmp(type, "differencing")) { + error_setg_errno(errp, ENOTSUP, + "Differencing files not yet supported"); + ret = -ENOTSUP; + goto exit; + } else { + ret = -EINVAL; + goto exit; + } + + /* These are pretty arbitrary, and mainly designed to keep the BAT + * size reasonable to load into RAM */ + if (block_size == 0) { + if (image_size > 32 * TiB) { + block_size = 64 * MiB; + } else if (image_size > (uint64_t) 100 * GiB) { + block_size = 32 * MiB; + } else if (image_size > 1 * GiB) { + block_size = 16 * MiB; + } else { + block_size = 8 * MiB; + } + } + + + /* make the log size close to what was specified, but must be + * min 1MB, and multiple of 1MB */ + log_size = ROUND_UP(log_size, MiB); + + block_size = ROUND_UP(block_size, MiB); + block_size = block_size > VHDX_BLOCK_SIZE_MAX ? VHDX_BLOCK_SIZE_MAX : + block_size; + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + bs = NULL; + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + /* Create (A) */ + + /* The creator field is optional, but may be useful for + * debugging / diagnostics */ + creator = g_utf8_to_utf16("QEMU v" QEMU_VERSION, -1, NULL, + &creator_items, NULL); + signature = cpu_to_le64(VHDX_FILE_SIGNATURE); + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET, &signature, sizeof(signature)); + if (ret < 0) { + goto delete_and_exit; + } + if (creator) { + ret = bdrv_pwrite(bs, VHDX_FILE_ID_OFFSET + sizeof(signature), + creator, creator_items * sizeof(gunichar2)); + if (ret < 0) { + goto delete_and_exit; + } + } + + + /* Creates (B),(C) */ + ret = vhdx_create_new_headers(bs, image_size, log_size); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (D),(E),(G) explicitly. (F) created as by-product */ + ret = vhdx_create_new_region_table(bs, image_size, block_size, 512, + log_size, use_zero_blocks, image_type, + &metadata_offset); + if (ret < 0) { + goto delete_and_exit; + } + + /* Creates (H) */ + ret = vhdx_create_new_metadata(bs, image_size, block_size, 512, + metadata_offset, image_type); + if (ret < 0) { + goto delete_and_exit; + } + + +delete_and_exit: + bdrv_unref(bs); +exit: + g_free(type); + g_free(creator); + return ret; +} + +/* If opened r/w, the VHDX driver will automatically replay the log, + * if one is present, inside the vhdx_open() call. + * + * If qemu-img check -r all is called, the image is automatically opened + * r/w and any log has already been replayed, so there is nothing (currently) + * for us to do here + */ +static int vhdx_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVVHDXState *s = bs->opaque; + + if (s->log_replayed_on_open) { + result->corruptions_fixed++; + } + return 0; +} + +static QemuOptsList vhdx_create_opts = { + .name = "vhdx-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vhdx_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size; max of 64TB." + }, + { + .name = VHDX_BLOCK_OPT_LOG_SIZE, + .type = QEMU_OPT_SIZE, + .def_value_str = stringify(DEFAULT_LOG_SIZE), + .help = "Log size; min 1MB." + }, + { + .name = VHDX_BLOCK_OPT_BLOCK_SIZE, + .type = QEMU_OPT_SIZE, + .def_value_str = stringify(0), + .help = "Block Size; min 1MB, max 256MB. " \ + "0 means auto-calculate based on image size." + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = QEMU_OPT_STRING, + .help = "VHDX format type, can be either 'dynamic' or 'fixed'. "\ + "Default is 'dynamic'." + }, + { + .name = VHDX_BLOCK_OPT_ZERO, + .type = QEMU_OPT_BOOL, + .help = "Force use of payload blocks of type 'ZERO'. "\ + "Non-standard, but default. Do not set to 'off' when "\ + "using 'qemu-img convert' with subformat=dynamic." + }, + { NULL } + } +}; + +static BlockDriver bdrv_vhdx = { + .format_name = "vhdx", + .instance_size = sizeof(BDRVVHDXState), + .bdrv_probe = vhdx_probe, + .bdrv_open = vhdx_open, + .bdrv_close = vhdx_close, + .bdrv_reopen_prepare = vhdx_reopen_prepare, + .bdrv_co_readv = vhdx_co_readv, + .bdrv_co_writev = vhdx_co_writev, + .bdrv_create = vhdx_create, + .bdrv_get_info = vhdx_get_info, + .bdrv_check = vhdx_check, + .bdrv_has_zero_init = bdrv_has_zero_init_1, + + .create_opts = &vhdx_create_opts, +}; + +static void bdrv_vhdx_init(void) +{ + bdrv_register(&bdrv_vhdx); +} + +block_init(bdrv_vhdx_init); diff --git a/src/block/vhdx.h b/src/block/vhdx.h new file mode 100644 index 0000000..7003ab7 --- /dev/null +++ b/src/block/vhdx.h @@ -0,0 +1,453 @@ +/* + * Block driver for Hyper-V VHDX Images + * + * Copyright (c) 2013 Red Hat, Inc., + * + * Authors: + * Jeff Cody <jcody@redhat.com> + * + * This is based on the "VHDX Format Specification v1.00", published 8/25/2012 + * by Microsoft: + * https://www.microsoft.com/en-us/download/details.aspx?id=34750 + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + * + */ + +#ifndef BLOCK_VHDX_H +#define BLOCK_VHDX_H + +#define KiB (1 * 1024) +#define MiB (KiB * 1024) +#define GiB (MiB * 1024) +#define TiB ((uint64_t) GiB * 1024) + +#define DEFAULT_LOG_SIZE 1048576 /* 1MiB */ +/* Structures and fields present in the VHDX file */ + +/* The header section has the following blocks, + * each block is 64KB: + * + * _____________________________________________________________________________ + * | File Id. | Header 1 | Header 2 | Region Table | Reserved (768KB) | + * |----------|---------------|------------|--------------|--------------------| + * | | | | | | + * 0.........64KB...........128KB........192KB..........256KB................1MB + */ + +#define VHDX_HEADER_BLOCK_SIZE (64 * 1024) + +#define VHDX_FILE_ID_OFFSET 0 +#define VHDX_HEADER1_OFFSET (VHDX_HEADER_BLOCK_SIZE * 1) +#define VHDX_HEADER2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 2) +#define VHDX_REGION_TABLE_OFFSET (VHDX_HEADER_BLOCK_SIZE * 3) +#define VHDX_REGION_TABLE2_OFFSET (VHDX_HEADER_BLOCK_SIZE * 4) + +#define VHDX_HEADER_SECTION_END (1 * MiB) +/* + * A note on the use of MS-GUID fields. For more details on the GUID, + * please see: https://en.wikipedia.org/wiki/Globally_unique_identifier. + * + * The VHDX specification only states that these are MS GUIDs, and which + * bytes are data1-data4. It makes no mention of what algorithm should be used + * to generate the GUID, nor what standard. However, looking at the specified + * known GUID fields, it appears the GUIDs are: + * Standard/DCE GUID type (noted by 10b in the MSB of byte 0 of .data4) + * Random algorithm (noted by 0x4XXX for .data3) + */ + +/* ---- HEADER SECTION STRUCTURES ---- */ + +/* These structures are ones that are defined in the VHDX specification + * document */ + +#define VHDX_FILE_SIGNATURE 0x656C696678646876ULL /* "vhdxfile" in ASCII */ +typedef struct VHDXFileIdentifier { + uint64_t signature; /* "vhdxfile" in ASCII */ + uint16_t creator[256]; /* optional; utf-16 string to identify + the vhdx file creator. Diagnostic + only */ +} VHDXFileIdentifier; + + +/* the guid is a 16 byte unique ID - the definition for this used by + * Microsoft is not just 16 bytes though - it is a structure that is defined, + * so we need to follow it here so that endianness does not trip us up */ + +typedef struct QEMU_PACKED MSGUID { + uint32_t data1; + uint16_t data2; + uint16_t data3; + uint8_t data4[8]; +} MSGUID; + +#define guid_eq(a, b) \ + (memcmp(&(a), &(b), sizeof(MSGUID)) == 0) + +#define VHDX_HEADER_SIZE (4 * 1024) /* although the vhdx_header struct in disk + is only 582 bytes, for purposes of crc + the header is the first 4KB of the 64KB + block */ + +/* The full header is 4KB, although the actual header data is much smaller. + * But for the checksum calculation, it is over the entire 4KB structure, + * not just the defined portion of it */ +#define VHDX_HEADER_SIGNATURE 0x64616568 +typedef struct QEMU_PACKED VHDXHeader { + uint32_t signature; /* "head" in ASCII */ + uint32_t checksum; /* CRC-32C hash of the whole header */ + uint64_t sequence_number; /* Seq number of this header. Each + VHDX file has 2 of these headers, + and only the header with the highest + sequence number is valid */ + MSGUID file_write_guid; /* 128 bit unique identifier. Must be + updated to new, unique value before + the first modification is made to + file */ + MSGUID data_write_guid; /* 128 bit unique identifier. Must be + updated to new, unique value before + the first modification is made to + visible data. Visbile data is + defined as: + - system & user metadata + - raw block data + - disk size + - any change that will + cause the virtual disk + sector read to differ + + This does not need to change if + blocks are re-arranged */ + MSGUID log_guid; /* 128 bit unique identifier. If zero, + there is no valid log. If non-zero, + log entries with this guid are + valid. */ + uint16_t log_version; /* version of the log format. Must be + set to zero */ + uint16_t version; /* version of the vhdx file. Currently, + only supported version is "1" */ + uint32_t log_length; /* length of the log. Must be multiple + of 1MB */ + uint64_t log_offset; /* byte offset in the file of the log. + Must also be a multiple of 1MB */ +} VHDXHeader; + +/* Header for the region table block */ +#define VHDX_REGION_SIGNATURE 0x69676572 /* "regi" in ASCII */ +typedef struct QEMU_PACKED VHDXRegionTableHeader { + uint32_t signature; /* "regi" in ASCII */ + uint32_t checksum; /* CRC-32C hash of the 64KB table */ + uint32_t entry_count; /* number of valid entries */ + uint32_t reserved; +} VHDXRegionTableHeader; + +/* Individual region table entry. There may be a maximum of 2047 of these + * + * There are two known region table properties. Both are required. + * BAT (block allocation table): 2DC27766F62342009D64115E9BFD4A08 + * Metadata: 8B7CA20647904B9AB8FE575F050F886E + */ +#define VHDX_REGION_ENTRY_REQUIRED 0x01 /* if set, parser must understand + this entry in order to open + file */ +typedef struct QEMU_PACKED VHDXRegionTableEntry { + MSGUID guid; /* 128-bit unique identifier */ + uint64_t file_offset; /* offset of the object in the file. + Must be multiple of 1MB */ + uint32_t length; /* length, in bytes, of the object */ + uint32_t data_bits; +} VHDXRegionTableEntry; + + +/* ---- LOG ENTRY STRUCTURES ---- */ +#define VHDX_LOG_MIN_SIZE (1024 * 1024) +#define VHDX_LOG_SECTOR_SIZE 4096 +#define VHDX_LOG_HDR_SIZE 64 +#define VHDX_LOG_SIGNATURE 0x65676f6c +typedef struct QEMU_PACKED VHDXLogEntryHeader { + uint32_t signature; /* "loge" in ASCII */ + uint32_t checksum; /* CRC-32C hash of the 64KB table */ + uint32_t entry_length; /* length in bytes, multiple of 1MB */ + uint32_t tail; /* byte offset of first log entry of a + seq, where this entry is the last + entry */ + uint64_t sequence_number; /* incremented with each log entry. + May not be zero. */ + uint32_t descriptor_count; /* number of descriptors in this log + entry, must be >= 0 */ + uint32_t reserved; + MSGUID log_guid; /* value of the log_guid from + vhdx_header. If not found in + vhdx_header, it is invalid */ + uint64_t flushed_file_offset; /* see spec for full details - this + should be vhdx file size in bytes */ + uint64_t last_file_offset; /* size in bytes that all allocated + file structures fit into */ +} VHDXLogEntryHeader; + +#define VHDX_LOG_DESC_SIZE 32 +#define VHDX_LOG_DESC_SIGNATURE 0x63736564 +#define VHDX_LOG_ZERO_SIGNATURE 0x6f72657a +typedef struct QEMU_PACKED VHDXLogDescriptor { + uint32_t signature; /* "zero" or "desc" in ASCII */ + union { + uint32_t reserved; /* zero desc */ + uint32_t trailing_bytes; /* data desc: bytes 4092-4096 of the + data sector */ + }; + union { + uint64_t zero_length; /* zero desc: length of the section to + zero */ + uint64_t leading_bytes; /* data desc: bytes 0-7 of the data + sector */ + }; + uint64_t file_offset; /* file offset to write zeros - multiple + of 4kB */ + uint64_t sequence_number; /* must match same field in + vhdx_log_entry_header */ +} VHDXLogDescriptor; + +#define VHDX_LOG_DATA_SIGNATURE 0x61746164 +typedef struct QEMU_PACKED VHDXLogDataSector { + uint32_t data_signature; /* "data" in ASCII */ + uint32_t sequence_high; /* 4 MSB of 8 byte sequence_number */ + uint8_t data[4084]; /* raw data, bytes 8-4091 (inclusive). + see the data descriptor field for the + other mising bytes */ + uint32_t sequence_low; /* 4 LSB of 8 byte sequence_number */ +} VHDXLogDataSector; + + + +/* block states - different state values depending on whether it is a + * payload block, or a sector block. */ + +#define PAYLOAD_BLOCK_NOT_PRESENT 0 +#define PAYLOAD_BLOCK_UNDEFINED 1 +#define PAYLOAD_BLOCK_ZERO 2 +#define PAYLOAD_BLOCK_UNMAPPED 3 +#define PAYLOAD_BLOCK_UNMAPPED_v095 5 +#define PAYLOAD_BLOCK_FULLY_PRESENT 6 +#define PAYLOAD_BLOCK_PARTIALLY_PRESENT 7 + +#define SB_BLOCK_NOT_PRESENT 0 +#define SB_BLOCK_PRESENT 6 + +/* per the spec */ +#define VHDX_MAX_SECTORS_PER_BLOCK (1 << 23) + +/* upper 44 bits are the file offset in 1MB units lower 3 bits are the state + other bits are reserved */ +#define VHDX_BAT_STATE_BIT_MASK 0x07 +#define VHDX_BAT_FILE_OFF_MASK 0xFFFFFFFFFFF00000ULL /* upper 44 bits */ +typedef uint64_t VHDXBatEntry; + +/* ---- METADATA REGION STRUCTURES ---- */ + +#define VHDX_METADATA_ENTRY_SIZE 32 +#define VHDX_METADATA_MAX_ENTRIES 2047 /* not including the header */ +#define VHDX_METADATA_TABLE_MAX_SIZE \ + (VHDX_METADATA_ENTRY_SIZE * (VHDX_METADATA_MAX_ENTRIES+1)) +#define VHDX_METADATA_SIGNATURE 0x617461646174656DULL /* "metadata" in ASCII */ +typedef struct QEMU_PACKED VHDXMetadataTableHeader { + uint64_t signature; /* "metadata" in ASCII */ + uint16_t reserved; + uint16_t entry_count; /* number table entries. <= 2047 */ + uint32_t reserved2[5]; +} VHDXMetadataTableHeader; + +#define VHDX_META_FLAGS_IS_USER 0x01 /* max 1024 entries */ +#define VHDX_META_FLAGS_IS_VIRTUAL_DISK 0x02 /* virtual disk metadata if set, + otherwise file metdata */ +#define VHDX_META_FLAGS_IS_REQUIRED 0x04 /* parse must understand this + entry to open the file */ +typedef struct QEMU_PACKED VHDXMetadataTableEntry { + MSGUID item_id; /* 128-bit identifier for metadata */ + uint32_t offset; /* byte offset of the metadata. At + least 64kB. Relative to start of + metadata region */ + /* note: if length = 0, so is offset */ + uint32_t length; /* length of metadata. <= 1MB. */ + uint32_t data_bits; /* least-significant 3 bits are flags, + the rest are reserved (see above) */ + uint32_t reserved2; +} VHDXMetadataTableEntry; + +#define VHDX_PARAMS_LEAVE_BLOCKS_ALLOCED 0x01 /* Do not change any blocks to + be BLOCK_NOT_PRESENT. + If set indicates a fixed + size VHDX file */ +#define VHDX_PARAMS_HAS_PARENT 0x02 /* has parent / backing file */ +#define VHDX_BLOCK_SIZE_MIN (1 * MiB) +#define VHDX_BLOCK_SIZE_MAX (256 * MiB) +typedef struct QEMU_PACKED VHDXFileParameters { + uint32_t block_size; /* size of each payload block, always + power of 2, <= 256MB and >= 1MB. */ + uint32_t data_bits; /* least-significant 2 bits are flags, + the rest are reserved (see above) */ +} VHDXFileParameters; + +#define VHDX_MAX_IMAGE_SIZE ((uint64_t) 64 * TiB) +typedef struct QEMU_PACKED VHDXVirtualDiskSize { + uint64_t virtual_disk_size; /* Size of the virtual disk, in bytes. + Must be multiple of the sector size, + max of 64TB */ +} VHDXVirtualDiskSize; + +typedef struct QEMU_PACKED VHDXPage83Data { + MSGUID page_83_data; /* unique id for scsi devices that + support page 0x83 */ +} VHDXPage83Data; + +typedef struct QEMU_PACKED VHDXVirtualDiskLogicalSectorSize { + uint32_t logical_sector_size; /* virtual disk sector size (in bytes). + Can only be 512 or 4096 bytes */ +} VHDXVirtualDiskLogicalSectorSize; + +typedef struct QEMU_PACKED VHDXVirtualDiskPhysicalSectorSize { + uint32_t physical_sector_size; /* physical sector size (in bytes). + Can only be 512 or 4096 bytes */ +} VHDXVirtualDiskPhysicalSectorSize; + +typedef struct QEMU_PACKED VHDXParentLocatorHeader { + MSGUID locator_type; /* type of the parent virtual disk. */ + uint16_t reserved; + uint16_t key_value_count; /* number of key/value pairs for this + locator */ +} VHDXParentLocatorHeader; + +/* key and value strings are UNICODE strings, UTF-16 LE encoding, no NULs */ +typedef struct QEMU_PACKED VHDXParentLocatorEntry { + uint32_t key_offset; /* offset in metadata for key, > 0 */ + uint32_t value_offset; /* offset in metadata for value, >0 */ + uint16_t key_length; /* length of entry key, > 0 */ + uint16_t value_length; /* length of entry value, > 0 */ +} VHDXParentLocatorEntry; + + +/* ----- END VHDX SPECIFICATION STRUCTURES ---- */ + +typedef struct VHDXMetadataEntries { + VHDXMetadataTableEntry file_parameters_entry; + VHDXMetadataTableEntry virtual_disk_size_entry; + VHDXMetadataTableEntry page83_data_entry; + VHDXMetadataTableEntry logical_sector_size_entry; + VHDXMetadataTableEntry phys_sector_size_entry; + VHDXMetadataTableEntry parent_locator_entry; + uint16_t present; +} VHDXMetadataEntries; + +typedef struct VHDXLogEntries { + uint64_t offset; + uint64_t length; + uint32_t write; + uint32_t read; + VHDXLogEntryHeader *hdr; + void *desc_buffer; + uint64_t sequence; + uint32_t tail; +} VHDXLogEntries; + +typedef struct VHDXRegionEntry { + uint64_t start; + uint64_t end; + QLIST_ENTRY(VHDXRegionEntry) entries; +} VHDXRegionEntry; + +typedef struct BDRVVHDXState { + CoMutex lock; + + int curr_header; + VHDXHeader *headers[2]; + + VHDXRegionTableHeader rt; + VHDXRegionTableEntry bat_rt; /* region table for the BAT */ + VHDXRegionTableEntry metadata_rt; /* region table for the metadata */ + + VHDXMetadataTableHeader metadata_hdr; + VHDXMetadataEntries metadata_entries; + + VHDXFileParameters params; + uint32_t block_size; + uint32_t block_size_bits; + uint32_t sectors_per_block; + uint32_t sectors_per_block_bits; + + uint64_t virtual_disk_size; + uint32_t logical_sector_size; + uint32_t physical_sector_size; + + uint64_t chunk_ratio; + uint32_t chunk_ratio_bits; + uint32_t logical_sector_size_bits; + + uint32_t bat_entries; + VHDXBatEntry *bat; + uint64_t bat_offset; + + bool first_visible_write; + MSGUID session_guid; + + VHDXLogEntries log; + + VHDXParentLocatorHeader parent_header; + VHDXParentLocatorEntry *parent_entries; + + Error *migration_blocker; + + bool log_replayed_on_open; + + QLIST_HEAD(VHDXRegionHead, VHDXRegionEntry) regions; +} BDRVVHDXState; + +void vhdx_guid_generate(MSGUID *guid); + +int vhdx_update_headers(BlockDriverState *bs, BDRVVHDXState *s, bool rw, + MSGUID *log_guid); + +uint32_t vhdx_update_checksum(uint8_t *buf, size_t size, int crc_offset); +uint32_t vhdx_checksum_calc(uint32_t crc, uint8_t *buf, size_t size, + int crc_offset); + +bool vhdx_checksum_is_valid(uint8_t *buf, size_t size, int crc_offset); + +int vhdx_parse_log(BlockDriverState *bs, BDRVVHDXState *s, bool *flushed, + Error **errp); + +int vhdx_log_write_and_flush(BlockDriverState *bs, BDRVVHDXState *s, + void *data, uint32_t length, uint64_t offset); + +static inline void leguid_to_cpus(MSGUID *guid) +{ + le32_to_cpus(&guid->data1); + le16_to_cpus(&guid->data2); + le16_to_cpus(&guid->data3); +} + +static inline void cpu_to_leguids(MSGUID *guid) +{ + cpu_to_le32s(&guid->data1); + cpu_to_le16s(&guid->data2); + cpu_to_le16s(&guid->data3); +} + +void vhdx_header_le_import(VHDXHeader *h); +void vhdx_header_le_export(VHDXHeader *orig_h, VHDXHeader *new_h); +void vhdx_log_desc_le_import(VHDXLogDescriptor *d); +void vhdx_log_desc_le_export(VHDXLogDescriptor *d); +void vhdx_log_data_le_import(VHDXLogDataSector *d); +void vhdx_log_data_le_export(VHDXLogDataSector *d); +void vhdx_log_entry_hdr_le_import(VHDXLogEntryHeader *hdr); +void vhdx_log_entry_hdr_le_export(VHDXLogEntryHeader *hdr); +void vhdx_region_header_le_import(VHDXRegionTableHeader *hdr); +void vhdx_region_header_le_export(VHDXRegionTableHeader *hdr); +void vhdx_region_entry_le_import(VHDXRegionTableEntry *e); +void vhdx_region_entry_le_export(VHDXRegionTableEntry *e); +void vhdx_metadata_header_le_import(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_header_le_export(VHDXMetadataTableHeader *hdr); +void vhdx_metadata_entry_le_import(VHDXMetadataTableEntry *e); +void vhdx_metadata_entry_le_export(VHDXMetadataTableEntry *e); +int vhdx_user_visible_write(BlockDriverState *bs, BDRVVHDXState *s); + +#endif diff --git a/src/block/vmdk.c b/src/block/vmdk.c new file mode 100644 index 0000000..e46271a --- /dev/null +++ b/src/block/vmdk.c @@ -0,0 +1,2319 @@ +/* + * Block driver for the VMDK format + * + * Copyright (c) 2004 Fabrice Bellard + * Copyright (c) 2005 Filip Navara + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ + +#include "qemu-common.h" +#include "block/block_int.h" +#include "qapi/qmp/qerror.h" +#include "qemu/error-report.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include <zlib.h> +#include <glib.h> + +#define VMDK3_MAGIC (('C' << 24) | ('O' << 16) | ('W' << 8) | 'D') +#define VMDK4_MAGIC (('K' << 24) | ('D' << 16) | ('M' << 8) | 'V') +#define VMDK4_COMPRESSION_DEFLATE 1 +#define VMDK4_FLAG_NL_DETECT (1 << 0) +#define VMDK4_FLAG_RGD (1 << 1) +/* Zeroed-grain enable bit */ +#define VMDK4_FLAG_ZERO_GRAIN (1 << 2) +#define VMDK4_FLAG_COMPRESS (1 << 16) +#define VMDK4_FLAG_MARKER (1 << 17) +#define VMDK4_GD_AT_END 0xffffffffffffffffULL + +#define VMDK_GTE_ZEROED 0x1 + +/* VMDK internal error codes */ +#define VMDK_OK 0 +#define VMDK_ERROR (-1) +/* Cluster not allocated */ +#define VMDK_UNALLOC (-2) +#define VMDK_ZEROED (-3) + +#define BLOCK_OPT_ZEROED_GRAIN "zeroed_grain" + +typedef struct { + uint32_t version; + uint32_t flags; + uint32_t disk_sectors; + uint32_t granularity; + uint32_t l1dir_offset; + uint32_t l1dir_size; + uint32_t file_sectors; + uint32_t cylinders; + uint32_t heads; + uint32_t sectors_per_track; +} QEMU_PACKED VMDK3Header; + +typedef struct { + uint32_t version; + uint32_t flags; + uint64_t capacity; + uint64_t granularity; + uint64_t desc_offset; + uint64_t desc_size; + /* Number of GrainTableEntries per GrainTable */ + uint32_t num_gtes_per_gt; + uint64_t rgd_offset; + uint64_t gd_offset; + uint64_t grain_offset; + char filler[1]; + char check_bytes[4]; + uint16_t compressAlgorithm; +} QEMU_PACKED VMDK4Header; + +#define L2_CACHE_SIZE 16 + +typedef struct VmdkExtent { + BdrvChild *file; + bool flat; + bool compressed; + bool has_marker; + bool has_zero_grain; + int version; + int64_t sectors; + int64_t end_sector; + int64_t flat_start_offset; + int64_t l1_table_offset; + int64_t l1_backup_table_offset; + uint32_t *l1_table; + uint32_t *l1_backup_table; + unsigned int l1_size; + uint32_t l1_entry_sectors; + + unsigned int l2_size; + uint32_t *l2_cache; + uint32_t l2_cache_offsets[L2_CACHE_SIZE]; + uint32_t l2_cache_counts[L2_CACHE_SIZE]; + + int64_t cluster_sectors; + int64_t next_cluster_sector; + char *type; +} VmdkExtent; + +typedef struct BDRVVmdkState { + CoMutex lock; + uint64_t desc_offset; + bool cid_updated; + bool cid_checked; + uint32_t cid; + uint32_t parent_cid; + int num_extents; + /* Extent array with num_extents entries, ascend ordered by address */ + VmdkExtent *extents; + Error *migration_blocker; + char *create_type; +} BDRVVmdkState; + +typedef struct VmdkMetaData { + unsigned int l1_index; + unsigned int l2_index; + unsigned int l2_offset; + int valid; + uint32_t *l2_cache_entry; +} VmdkMetaData; + +typedef struct VmdkGrainMarker { + uint64_t lba; + uint32_t size; + uint8_t data[0]; +} QEMU_PACKED VmdkGrainMarker; + +enum { + MARKER_END_OF_STREAM = 0, + MARKER_GRAIN_TABLE = 1, + MARKER_GRAIN_DIRECTORY = 2, + MARKER_FOOTER = 3, +}; + +static int vmdk_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + uint32_t magic; + + if (buf_size < 4) { + return 0; + } + magic = be32_to_cpu(*(uint32_t *)buf); + if (magic == VMDK3_MAGIC || + magic == VMDK4_MAGIC) { + return 100; + } else { + const char *p = (const char *)buf; + const char *end = p + buf_size; + while (p < end) { + if (*p == '#') { + /* skip comment line */ + while (p < end && *p != '\n') { + p++; + } + p++; + continue; + } + if (*p == ' ') { + while (p < end && *p == ' ') { + p++; + } + /* skip '\r' if windows line endings used. */ + if (p < end && *p == '\r') { + p++; + } + /* only accept blank lines before 'version=' line */ + if (p == end || *p != '\n') { + return 0; + } + p++; + continue; + } + if (end - p >= strlen("version=X\n")) { + if (strncmp("version=1\n", p, strlen("version=1\n")) == 0 || + strncmp("version=2\n", p, strlen("version=2\n")) == 0) { + return 100; + } + } + if (end - p >= strlen("version=X\r\n")) { + if (strncmp("version=1\r\n", p, strlen("version=1\r\n")) == 0 || + strncmp("version=2\r\n", p, strlen("version=2\r\n")) == 0) { + return 100; + } + } + return 0; + } + return 0; + } +} + +#define SECTOR_SIZE 512 +#define DESC_SIZE (20 * SECTOR_SIZE) /* 20 sectors of 512 bytes each */ +#define BUF_SIZE 4096 +#define HEADER_SIZE 512 /* first sector of 512 bytes */ + +static void vmdk_free_extents(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + VmdkExtent *e; + + for (i = 0; i < s->num_extents; i++) { + e = &s->extents[i]; + g_free(e->l1_table); + g_free(e->l2_cache); + g_free(e->l1_backup_table); + g_free(e->type); + if (e->file != bs->file) { + bdrv_unref_child(bs, e->file); + } + } + g_free(s->extents); +} + +static void vmdk_free_last_extent(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + + if (s->num_extents == 0) { + return; + } + s->num_extents--; + s->extents = g_renew(VmdkExtent, s->extents, s->num_extents); +} + +static uint32_t vmdk_read_cid(BlockDriverState *bs, int parent) +{ + char desc[DESC_SIZE]; + uint32_t cid = 0xffffffff; + const char *p_name, *cid_str; + size_t cid_str_size; + BDRVVmdkState *s = bs->opaque; + int ret; + + ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return 0; + } + + if (parent) { + cid_str = "parentCID"; + cid_str_size = sizeof("parentCID"); + } else { + cid_str = "CID"; + cid_str_size = sizeof("CID"); + } + + desc[DESC_SIZE - 1] = '\0'; + p_name = strstr(desc, cid_str); + if (p_name != NULL) { + p_name += cid_str_size; + sscanf(p_name, "%" SCNx32, &cid); + } + + return cid; +} + +static int vmdk_write_cid(BlockDriverState *bs, uint32_t cid) +{ + char desc[DESC_SIZE], tmp_desc[DESC_SIZE]; + char *p_name, *tmp_str; + BDRVVmdkState *s = bs->opaque; + int ret; + + ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return ret; + } + + desc[DESC_SIZE - 1] = '\0'; + tmp_str = strstr(desc, "parentCID"); + if (tmp_str == NULL) { + return -EINVAL; + } + + pstrcpy(tmp_desc, sizeof(tmp_desc), tmp_str); + p_name = strstr(desc, "CID"); + if (p_name != NULL) { + p_name += sizeof("CID"); + snprintf(p_name, sizeof(desc) - (p_name - desc), "%" PRIx32 "\n", cid); + pstrcat(desc, sizeof(desc), tmp_desc); + } + + ret = bdrv_pwrite_sync(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return ret; + } + + return 0; +} + +static int vmdk_is_cid_valid(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + uint32_t cur_pcid; + + if (!s->cid_checked && bs->backing) { + BlockDriverState *p_bs = bs->backing->bs; + + cur_pcid = vmdk_read_cid(p_bs, 0); + if (s->parent_cid != cur_pcid) { + /* CID not valid */ + return 0; + } + } + s->cid_checked = true; + /* CID valid */ + return 1; +} + +/* We have nothing to do for VMDK reopen, stubs just return success */ +static int vmdk_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + assert(state != NULL); + assert(state->bs != NULL); + return 0; +} + +static int vmdk_parent_open(BlockDriverState *bs) +{ + char *p_name; + char desc[DESC_SIZE + 1]; + BDRVVmdkState *s = bs->opaque; + int ret; + + desc[DESC_SIZE] = '\0'; + ret = bdrv_pread(bs->file->bs, s->desc_offset, desc, DESC_SIZE); + if (ret < 0) { + return ret; + } + + p_name = strstr(desc, "parentFileNameHint"); + if (p_name != NULL) { + char *end_name; + + p_name += sizeof("parentFileNameHint") + 1; + end_name = strchr(p_name, '\"'); + if (end_name == NULL) { + return -EINVAL; + } + if ((end_name - p_name) > sizeof(bs->backing_file) - 1) { + return -EINVAL; + } + + pstrcpy(bs->backing_file, end_name - p_name + 1, p_name); + } + + return 0; +} + +/* Create and append extent to the extent array. Return the added VmdkExtent + * address. return NULL if allocation failed. */ +static int vmdk_add_extent(BlockDriverState *bs, + BdrvChild *file, bool flat, int64_t sectors, + int64_t l1_offset, int64_t l1_backup_offset, + uint32_t l1_size, + int l2_size, uint64_t cluster_sectors, + VmdkExtent **new_extent, + Error **errp) +{ + VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; + int64_t nb_sectors; + + if (cluster_sectors > 0x200000) { + /* 0x200000 * 512Bytes = 1GB for one cluster is unrealistic */ + error_setg(errp, "Invalid granularity, image may be corrupt"); + return -EFBIG; + } + if (l1_size > 512 * 1024 * 1024) { + /* Although with big capacity and small l1_entry_sectors, we can get a + * big l1_size, we don't want unbounded value to allocate the table. + * Limit it to 512M, which is 16PB for default cluster and L2 table + * size */ + error_setg(errp, "L1 size too big"); + return -EFBIG; + } + + nb_sectors = bdrv_nb_sectors(file->bs); + if (nb_sectors < 0) { + return nb_sectors; + } + + s->extents = g_renew(VmdkExtent, s->extents, s->num_extents + 1); + extent = &s->extents[s->num_extents]; + s->num_extents++; + + memset(extent, 0, sizeof(VmdkExtent)); + extent->file = file; + extent->flat = flat; + extent->sectors = sectors; + extent->l1_table_offset = l1_offset; + extent->l1_backup_table_offset = l1_backup_offset; + extent->l1_size = l1_size; + extent->l1_entry_sectors = l2_size * cluster_sectors; + extent->l2_size = l2_size; + extent->cluster_sectors = flat ? sectors : cluster_sectors; + extent->next_cluster_sector = ROUND_UP(nb_sectors, cluster_sectors); + + if (s->num_extents > 1) { + extent->end_sector = (*(extent - 1)).end_sector + extent->sectors; + } else { + extent->end_sector = extent->sectors; + } + bs->total_sectors = extent->end_sector; + if (new_extent) { + *new_extent = extent; + } + return 0; +} + +static int vmdk_init_tables(BlockDriverState *bs, VmdkExtent *extent, + Error **errp) +{ + int ret; + size_t l1_size; + int i; + + /* read the L1 table */ + l1_size = extent->l1_size * sizeof(uint32_t); + extent->l1_table = g_try_malloc(l1_size); + if (l1_size && extent->l1_table == NULL) { + return -ENOMEM; + } + + ret = bdrv_pread(extent->file->bs, + extent->l1_table_offset, + extent->l1_table, + l1_size); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 table from extent '%s'", + extent->file->bs->filename); + goto fail_l1; + } + for (i = 0; i < extent->l1_size; i++) { + le32_to_cpus(&extent->l1_table[i]); + } + + if (extent->l1_backup_table_offset) { + extent->l1_backup_table = g_try_malloc(l1_size); + if (l1_size && extent->l1_backup_table == NULL) { + ret = -ENOMEM; + goto fail_l1; + } + ret = bdrv_pread(extent->file->bs, + extent->l1_backup_table_offset, + extent->l1_backup_table, + l1_size); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read l1 backup table from extent '%s'", + extent->file->bs->filename); + goto fail_l1b; + } + for (i = 0; i < extent->l1_size; i++) { + le32_to_cpus(&extent->l1_backup_table[i]); + } + } + + extent->l2_cache = + g_new(uint32_t, extent->l2_size * L2_CACHE_SIZE); + return 0; + fail_l1b: + g_free(extent->l1_backup_table); + fail_l1: + g_free(extent->l1_table); + return ret; +} + +static int vmdk_open_vmfs_sparse(BlockDriverState *bs, + BdrvChild *file, + int flags, Error **errp) +{ + int ret; + uint32_t magic; + VMDK3Header header; + VmdkExtent *extent; + + ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->bs->filename); + return ret; + } + ret = vmdk_add_extent(bs, file, false, + le32_to_cpu(header.disk_sectors), + (int64_t)le32_to_cpu(header.l1dir_offset) << 9, + 0, + le32_to_cpu(header.l1dir_size), + 4096, + le32_to_cpu(header.granularity), + &extent, + errp); + if (ret < 0) { + return ret; + } + ret = vmdk_init_tables(bs, extent, errp); + if (ret) { + /* free extent allocated by vmdk_add_extent */ + vmdk_free_last_extent(bs); + } + return ret; +} + +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + QDict *options, Error **errp); + +static char *vmdk_read_desc(BlockDriverState *file, uint64_t desc_offset, + Error **errp) +{ + int64_t size; + char *buf; + int ret; + + size = bdrv_getlength(file); + if (size < 0) { + error_setg_errno(errp, -size, "Could not access file"); + return NULL; + } + + if (size < 4) { + /* Both descriptor file and sparse image must be much larger than 4 + * bytes, also callers of vmdk_read_desc want to compare the first 4 + * bytes with VMDK4_MAGIC, let's error out if less is read. */ + error_setg(errp, "File is too small, not a valid image"); + return NULL; + } + + size = MIN(size, (1 << 20) - 1); /* avoid unbounded allocation */ + buf = g_malloc(size + 1); + + ret = bdrv_pread(file, desc_offset, buf, size); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not read from file"); + g_free(buf); + return NULL; + } + buf[ret] = 0; + + return buf; +} + +static int vmdk_open_vmdk4(BlockDriverState *bs, + BdrvChild *file, + int flags, QDict *options, Error **errp) +{ + int ret; + uint32_t magic; + uint32_t l1_size, l1_entry_sectors; + VMDK4Header header; + VmdkExtent *extent; + BDRVVmdkState *s = bs->opaque; + int64_t l1_backup_offset = 0; + bool compressed; + + ret = bdrv_pread(file->bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_setg_errno(errp, -ret, + "Could not read header from file '%s'", + file->bs->filename); + return -EINVAL; + } + if (header.capacity == 0) { + uint64_t desc_offset = le64_to_cpu(header.desc_offset); + if (desc_offset) { + char *buf = vmdk_read_desc(file->bs, desc_offset << 9, errp); + if (!buf) { + return -EINVAL; + } + ret = vmdk_open_desc_file(bs, flags, buf, options, errp); + g_free(buf); + return ret; + } + } + + if (!s->create_type) { + s->create_type = g_strdup("monolithicSparse"); + } + + if (le64_to_cpu(header.gd_offset) == VMDK4_GD_AT_END) { + /* + * The footer takes precedence over the header, so read it in. The + * footer starts at offset -1024 from the end: One sector for the + * footer, and another one for the end-of-stream marker. + */ + struct { + struct { + uint64_t val; + uint32_t size; + uint32_t type; + uint8_t pad[512 - 16]; + } QEMU_PACKED footer_marker; + + uint32_t magic; + VMDK4Header header; + uint8_t pad[512 - 4 - sizeof(VMDK4Header)]; + + struct { + uint64_t val; + uint32_t size; + uint32_t type; + uint8_t pad[512 - 16]; + } QEMU_PACKED eos_marker; + } QEMU_PACKED footer; + + ret = bdrv_pread(file->bs, + bs->file->bs->total_sectors * 512 - 1536, + &footer, sizeof(footer)); + if (ret < 0) { + error_setg_errno(errp, -ret, "Failed to read footer"); + return ret; + } + + /* Some sanity checks for the footer */ + if (be32_to_cpu(footer.magic) != VMDK4_MAGIC || + le32_to_cpu(footer.footer_marker.size) != 0 || + le32_to_cpu(footer.footer_marker.type) != MARKER_FOOTER || + le64_to_cpu(footer.eos_marker.val) != 0 || + le32_to_cpu(footer.eos_marker.size) != 0 || + le32_to_cpu(footer.eos_marker.type) != MARKER_END_OF_STREAM) + { + error_setg(errp, "Invalid footer"); + return -EINVAL; + } + + header = footer.header; + } + + compressed = + le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + if (le32_to_cpu(header.version) > 3) { + char buf[64]; + snprintf(buf, sizeof(buf), "VMDK version %" PRId32, + le32_to_cpu(header.version)); + error_setg(errp, QERR_UNKNOWN_BLOCK_FORMAT_FEATURE, + bdrv_get_device_or_node_name(bs), "vmdk", buf); + return -ENOTSUP; + } else if (le32_to_cpu(header.version) == 3 && (flags & BDRV_O_RDWR) && + !compressed) { + /* VMware KB 2064959 explains that version 3 added support for + * persistent changed block tracking (CBT), and backup software can + * read it as version=1 if it doesn't care about the changed area + * information. So we are safe to enable read only. */ + error_setg(errp, "VMDK version 3 must be read only"); + return -EINVAL; + } + + if (le32_to_cpu(header.num_gtes_per_gt) > 512) { + error_setg(errp, "L2 table size too big"); + return -EINVAL; + } + + l1_entry_sectors = le32_to_cpu(header.num_gtes_per_gt) + * le64_to_cpu(header.granularity); + if (l1_entry_sectors == 0) { + error_setg(errp, "L1 entry size is invalid"); + return -EINVAL; + } + l1_size = (le64_to_cpu(header.capacity) + l1_entry_sectors - 1) + / l1_entry_sectors; + if (le32_to_cpu(header.flags) & VMDK4_FLAG_RGD) { + l1_backup_offset = le64_to_cpu(header.rgd_offset) << 9; + } + if (bdrv_nb_sectors(file->bs) < le64_to_cpu(header.grain_offset)) { + error_setg(errp, "File truncated, expecting at least %" PRId64 " bytes", + (int64_t)(le64_to_cpu(header.grain_offset) + * BDRV_SECTOR_SIZE)); + return -EINVAL; + } + + ret = vmdk_add_extent(bs, file, false, + le64_to_cpu(header.capacity), + le64_to_cpu(header.gd_offset) << 9, + l1_backup_offset, + l1_size, + le32_to_cpu(header.num_gtes_per_gt), + le64_to_cpu(header.granularity), + &extent, + errp); + if (ret < 0) { + return ret; + } + extent->compressed = + le16_to_cpu(header.compressAlgorithm) == VMDK4_COMPRESSION_DEFLATE; + if (extent->compressed) { + g_free(s->create_type); + s->create_type = g_strdup("streamOptimized"); + } + extent->has_marker = le32_to_cpu(header.flags) & VMDK4_FLAG_MARKER; + extent->version = le32_to_cpu(header.version); + extent->has_zero_grain = le32_to_cpu(header.flags) & VMDK4_FLAG_ZERO_GRAIN; + ret = vmdk_init_tables(bs, extent, errp); + if (ret) { + /* free extent allocated by vmdk_add_extent */ + vmdk_free_last_extent(bs); + } + return ret; +} + +/* find an option value out of descriptor file */ +static int vmdk_parse_description(const char *desc, const char *opt_name, + char *buf, int buf_size) +{ + char *opt_pos, *opt_end; + const char *end = desc + strlen(desc); + + opt_pos = strstr(desc, opt_name); + if (!opt_pos) { + return VMDK_ERROR; + } + /* Skip "=\"" following opt_name */ + opt_pos += strlen(opt_name) + 2; + if (opt_pos >= end) { + return VMDK_ERROR; + } + opt_end = opt_pos; + while (opt_end < end && *opt_end != '"') { + opt_end++; + } + if (opt_end == end || buf_size < opt_end - opt_pos + 1) { + return VMDK_ERROR; + } + pstrcpy(buf, opt_end - opt_pos + 1, opt_pos); + return VMDK_OK; +} + +/* Open an extent file and append to bs array */ +static int vmdk_open_sparse(BlockDriverState *bs, BdrvChild *file, int flags, + char *buf, QDict *options, Error **errp) +{ + uint32_t magic; + + magic = ldl_be_p(buf); + switch (magic) { + case VMDK3_MAGIC: + return vmdk_open_vmfs_sparse(bs, file, flags, errp); + break; + case VMDK4_MAGIC: + return vmdk_open_vmdk4(bs, file, flags, options, errp); + break; + default: + error_setg(errp, "Image not in VMDK format"); + return -EINVAL; + break; + } +} + +static int vmdk_parse_extents(const char *desc, BlockDriverState *bs, + const char *desc_file_path, QDict *options, + Error **errp) +{ + int ret; + int matches; + char access[11]; + char type[11]; + char fname[512]; + const char *p = desc; + int64_t sectors = 0; + int64_t flat_offset; + char *extent_path; + BdrvChild *extent_file; + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent; + char extent_opt_prefix[32]; + Error *local_err = NULL; + + while (*p) { + /* parse extent line in one of below formats: + * + * RW [size in sectors] FLAT "file-name.vmdk" OFFSET + * RW [size in sectors] SPARSE "file-name.vmdk" + * RW [size in sectors] VMFS "file-name.vmdk" + * RW [size in sectors] VMFSSPARSE "file-name.vmdk" + */ + flat_offset = -1; + matches = sscanf(p, "%10s %" SCNd64 " %10s \"%511[^\n\r\"]\" %" SCNd64, + access, §ors, type, fname, &flat_offset); + if (matches < 4 || strcmp(access, "RW")) { + goto next_line; + } else if (!strcmp(type, "FLAT")) { + if (matches != 5 || flat_offset < 0) { + error_setg(errp, "Invalid extent lines: \n%s", p); + return -EINVAL; + } + } else if (!strcmp(type, "VMFS")) { + if (matches == 4) { + flat_offset = 0; + } else { + error_setg(errp, "Invalid extent lines:\n%s", p); + return -EINVAL; + } + } else if (matches != 4) { + error_setg(errp, "Invalid extent lines:\n%s", p); + return -EINVAL; + } + + if (sectors <= 0 || + (strcmp(type, "FLAT") && strcmp(type, "SPARSE") && + strcmp(type, "VMFS") && strcmp(type, "VMFSSPARSE")) || + (strcmp(access, "RW"))) { + goto next_line; + } + + if (!path_is_absolute(fname) && !path_has_protocol(fname) && + !desc_file_path[0]) + { + error_setg(errp, "Cannot use relative extent paths with VMDK " + "descriptor file '%s'", bs->file->bs->filename); + return -EINVAL; + } + + extent_path = g_malloc0(PATH_MAX); + path_combine(extent_path, PATH_MAX, desc_file_path, fname); + + ret = snprintf(extent_opt_prefix, 32, "extents.%d", s->num_extents); + assert(ret < 32); + + extent_file = bdrv_open_child(extent_path, options, extent_opt_prefix, + bs, &child_file, false, &local_err); + g_free(extent_path); + if (local_err) { + error_propagate(errp, local_err); + return -EINVAL; + } + + /* save to extents array */ + if (!strcmp(type, "FLAT") || !strcmp(type, "VMFS")) { + /* FLAT extent */ + + ret = vmdk_add_extent(bs, extent_file, true, sectors, + 0, 0, 0, 0, 0, &extent, errp); + if (ret < 0) { + bdrv_unref_child(bs, extent_file); + return ret; + } + extent->flat_start_offset = flat_offset << 9; + } else if (!strcmp(type, "SPARSE") || !strcmp(type, "VMFSSPARSE")) { + /* SPARSE extent and VMFSSPARSE extent are both "COWD" sparse file*/ + char *buf = vmdk_read_desc(extent_file->bs, 0, errp); + if (!buf) { + ret = -EINVAL; + } else { + ret = vmdk_open_sparse(bs, extent_file, bs->open_flags, buf, + options, errp); + } + g_free(buf); + if (ret) { + bdrv_unref_child(bs, extent_file); + return ret; + } + extent = &s->extents[s->num_extents - 1]; + } else { + error_setg(errp, "Unsupported extent type '%s'", type); + bdrv_unref_child(bs, extent_file); + return -ENOTSUP; + } + extent->type = g_strdup(type); +next_line: + /* move to next line */ + while (*p) { + if (*p == '\n') { + p++; + break; + } + p++; + } + } + return 0; +} + +static int vmdk_open_desc_file(BlockDriverState *bs, int flags, char *buf, + QDict *options, Error **errp) +{ + int ret; + char ct[128]; + BDRVVmdkState *s = bs->opaque; + + if (vmdk_parse_description(buf, "createType", ct, sizeof(ct))) { + error_setg(errp, "invalid VMDK image descriptor"); + ret = -EINVAL; + goto exit; + } + if (strcmp(ct, "monolithicFlat") && + strcmp(ct, "vmfs") && + strcmp(ct, "vmfsSparse") && + strcmp(ct, "twoGbMaxExtentSparse") && + strcmp(ct, "twoGbMaxExtentFlat")) { + error_setg(errp, "Unsupported image type '%s'", ct); + ret = -ENOTSUP; + goto exit; + } + s->create_type = g_strdup(ct); + s->desc_offset = 0; + ret = vmdk_parse_extents(buf, bs, bs->file->bs->exact_filename, options, + errp); +exit: + return ret; +} + +static int vmdk_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + char *buf; + int ret; + BDRVVmdkState *s = bs->opaque; + uint32_t magic; + + buf = vmdk_read_desc(bs->file->bs, 0, errp); + if (!buf) { + return -EINVAL; + } + + magic = ldl_be_p(buf); + switch (magic) { + case VMDK3_MAGIC: + case VMDK4_MAGIC: + ret = vmdk_open_sparse(bs, bs->file, flags, buf, options, + errp); + s->desc_offset = 0x200; + break; + default: + ret = vmdk_open_desc_file(bs, flags, buf, options, errp); + break; + } + if (ret) { + goto fail; + } + + /* try to open parent images, if exist */ + ret = vmdk_parent_open(bs); + if (ret) { + goto fail; + } + s->cid = vmdk_read_cid(bs, 0); + s->parent_cid = vmdk_read_cid(bs, 1); + qemu_co_mutex_init(&s->lock); + + /* Disable migration when VMDK images are used */ + error_setg(&s->migration_blocker, "The vmdk format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); + migrate_add_blocker(s->migration_blocker); + g_free(buf); + return 0; + +fail: + g_free(buf); + g_free(s->create_type); + s->create_type = NULL; + vmdk_free_extents(bs); + return ret; +} + + +static void vmdk_refresh_limits(BlockDriverState *bs, Error **errp) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + if (!s->extents[i].flat) { + bs->bl.write_zeroes_alignment = + MAX(bs->bl.write_zeroes_alignment, + s->extents[i].cluster_sectors); + } + } +} + +/** + * get_whole_cluster + * + * Copy backing file's cluster that covers @sector_num, otherwise write zero, + * to the cluster at @cluster_sector_num. + * + * If @skip_start_sector < @skip_end_sector, the relative range + * [@skip_start_sector, @skip_end_sector) is not copied or written, and leave + * it for call to write user data in the request. + */ +static int get_whole_cluster(BlockDriverState *bs, + VmdkExtent *extent, + uint64_t cluster_sector_num, + uint64_t sector_num, + uint64_t skip_start_sector, + uint64_t skip_end_sector) +{ + int ret = VMDK_OK; + int64_t cluster_bytes; + uint8_t *whole_grain; + + /* For COW, align request sector_num to cluster start */ + sector_num = QEMU_ALIGN_DOWN(sector_num, extent->cluster_sectors); + cluster_bytes = extent->cluster_sectors << BDRV_SECTOR_BITS; + whole_grain = qemu_blockalign(bs, cluster_bytes); + + if (!bs->backing) { + memset(whole_grain, 0, skip_start_sector << BDRV_SECTOR_BITS); + memset(whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), 0, + cluster_bytes - (skip_end_sector << BDRV_SECTOR_BITS)); + } + + assert(skip_end_sector <= extent->cluster_sectors); + /* we will be here if it's first write on non-exist grain(cluster). + * try to read from parent image, if exist */ + if (bs->backing && !vmdk_is_cid_valid(bs)) { + ret = VMDK_ERROR; + goto exit; + } + + /* Read backing data before skip range */ + if (skip_start_sector > 0) { + if (bs->backing) { + ret = bdrv_read(bs->backing->bs, sector_num, + whole_grain, skip_start_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file->bs, cluster_sector_num, whole_grain, + skip_start_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + /* Read backing data after skip range */ + if (skip_end_sector < extent->cluster_sectors) { + if (bs->backing) { + ret = bdrv_read(bs->backing->bs, sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + ret = bdrv_write(extent->file->bs, cluster_sector_num + skip_end_sector, + whole_grain + (skip_end_sector << BDRV_SECTOR_BITS), + extent->cluster_sectors - skip_end_sector); + if (ret < 0) { + ret = VMDK_ERROR; + goto exit; + } + } + +exit: + qemu_vfree(whole_grain); + return ret; +} + +static int vmdk_L2update(VmdkExtent *extent, VmdkMetaData *m_data, + uint32_t offset) +{ + offset = cpu_to_le32(offset); + /* update L2 table */ + if (bdrv_pwrite_sync( + extent->file->bs, + ((int64_t)m_data->l2_offset * 512) + + (m_data->l2_index * sizeof(offset)), + &offset, sizeof(offset)) < 0) { + return VMDK_ERROR; + } + /* update backup L2 table */ + if (extent->l1_backup_table_offset != 0) { + m_data->l2_offset = extent->l1_backup_table[m_data->l1_index]; + if (bdrv_pwrite_sync( + extent->file->bs, + ((int64_t)m_data->l2_offset * 512) + + (m_data->l2_index * sizeof(offset)), + &offset, sizeof(offset)) < 0) { + return VMDK_ERROR; + } + } + if (m_data->l2_cache_entry) { + *m_data->l2_cache_entry = offset; + } + + return VMDK_OK; +} + +/** + * get_cluster_offset + * + * Look up cluster offset in extent file by sector number, and store in + * @cluster_offset. + * + * For flat extents, the start offset as parsed from the description file is + * returned. + * + * For sparse extents, look up in L1, L2 table. If allocate is true, return an + * offset for a new cluster and update L2 cache. If there is a backing file, + * COW is done before returning; otherwise, zeroes are written to the allocated + * cluster. Both COW and zero writing skips the sector range + * [@skip_start_sector, @skip_end_sector) passed in by caller, because caller + * has new data to write there. + * + * Returns: VMDK_OK if cluster exists and mapped in the image. + * VMDK_UNALLOC if cluster is not mapped and @allocate is false. + * VMDK_ERROR if failed. + */ +static int get_cluster_offset(BlockDriverState *bs, + VmdkExtent *extent, + VmdkMetaData *m_data, + uint64_t offset, + bool allocate, + uint64_t *cluster_offset, + uint64_t skip_start_sector, + uint64_t skip_end_sector) +{ + unsigned int l1_index, l2_offset, l2_index; + int min_index, i, j; + uint32_t min_count, *l2_table; + bool zeroed = false; + int64_t ret; + int64_t cluster_sector; + + if (m_data) { + m_data->valid = 0; + } + if (extent->flat) { + *cluster_offset = extent->flat_start_offset; + return VMDK_OK; + } + + offset -= (extent->end_sector - extent->sectors) * SECTOR_SIZE; + l1_index = (offset >> 9) / extent->l1_entry_sectors; + if (l1_index >= extent->l1_size) { + return VMDK_ERROR; + } + l2_offset = extent->l1_table[l1_index]; + if (!l2_offset) { + return VMDK_UNALLOC; + } + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (l2_offset == extent->l2_cache_offsets[i]) { + /* increment the hit count */ + if (++extent->l2_cache_counts[i] == 0xffffffff) { + for (j = 0; j < L2_CACHE_SIZE; j++) { + extent->l2_cache_counts[j] >>= 1; + } + } + l2_table = extent->l2_cache + (i * extent->l2_size); + goto found; + } + } + /* not found: load a new entry in the least used one */ + min_index = 0; + min_count = 0xffffffff; + for (i = 0; i < L2_CACHE_SIZE; i++) { + if (extent->l2_cache_counts[i] < min_count) { + min_count = extent->l2_cache_counts[i]; + min_index = i; + } + } + l2_table = extent->l2_cache + (min_index * extent->l2_size); + if (bdrv_pread( + extent->file->bs, + (int64_t)l2_offset * 512, + l2_table, + extent->l2_size * sizeof(uint32_t) + ) != extent->l2_size * sizeof(uint32_t)) { + return VMDK_ERROR; + } + + extent->l2_cache_offsets[min_index] = l2_offset; + extent->l2_cache_counts[min_index] = 1; + found: + l2_index = ((offset >> 9) / extent->cluster_sectors) % extent->l2_size; + cluster_sector = le32_to_cpu(l2_table[l2_index]); + + if (m_data) { + m_data->valid = 1; + m_data->l1_index = l1_index; + m_data->l2_index = l2_index; + m_data->l2_offset = l2_offset; + m_data->l2_cache_entry = &l2_table[l2_index]; + } + if (extent->has_zero_grain && cluster_sector == VMDK_GTE_ZEROED) { + zeroed = true; + } + + if (!cluster_sector || zeroed) { + if (!allocate) { + return zeroed ? VMDK_ZEROED : VMDK_UNALLOC; + } + + cluster_sector = extent->next_cluster_sector; + extent->next_cluster_sector += extent->cluster_sectors; + + /* First of all we write grain itself, to avoid race condition + * that may to corrupt the image. + * This problem may occur because of insufficient space on host disk + * or inappropriate VM shutdown. + */ + ret = get_whole_cluster(bs, extent, + cluster_sector, + offset >> BDRV_SECTOR_BITS, + skip_start_sector, skip_end_sector); + if (ret) { + return ret; + } + } + *cluster_offset = cluster_sector << BDRV_SECTOR_BITS; + return VMDK_OK; +} + +static VmdkExtent *find_extent(BDRVVmdkState *s, + int64_t sector_num, VmdkExtent *start_hint) +{ + VmdkExtent *extent = start_hint; + + if (!extent) { + extent = &s->extents[0]; + } + while (extent < &s->extents[s->num_extents]) { + if (sector_num < extent->end_sector) { + return extent; + } + extent++; + } + return NULL; +} + +static inline uint64_t vmdk_find_index_in_cluster(VmdkExtent *extent, + int64_t sector_num) +{ + uint64_t index_in_cluster, extent_begin_sector, extent_relative_sector_num; + + extent_begin_sector = extent->end_sector - extent->sectors; + extent_relative_sector_num = sector_num - extent_begin_sector; + index_in_cluster = extent_relative_sector_num % extent->cluster_sectors; + return index_in_cluster; +} + +static int64_t coroutine_fn vmdk_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVVmdkState *s = bs->opaque; + int64_t index_in_cluster, n, ret; + uint64_t offset; + VmdkExtent *extent; + + extent = find_extent(s, sector_num, NULL); + if (!extent) { + return 0; + } + qemu_co_mutex_lock(&s->lock); + ret = get_cluster_offset(bs, extent, NULL, + sector_num * 512, false, &offset, + 0, 0); + qemu_co_mutex_unlock(&s->lock); + + switch (ret) { + case VMDK_ERROR: + ret = -EIO; + break; + case VMDK_UNALLOC: + ret = 0; + break; + case VMDK_ZEROED: + ret = BDRV_BLOCK_ZERO; + break; + case VMDK_OK: + ret = BDRV_BLOCK_DATA; + if (extent->file == bs->file && !extent->compressed) { + ret |= BDRV_BLOCK_OFFSET_VALID | offset; + } + + break; + } + + index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + *pnum = n; + return ret; +} + +static int vmdk_write_extent(VmdkExtent *extent, int64_t cluster_offset, + int64_t offset_in_cluster, const uint8_t *buf, + int nb_sectors, int64_t sector_num) +{ + int ret; + VmdkGrainMarker *data = NULL; + uLongf buf_len; + const uint8_t *write_buf = buf; + int write_len = nb_sectors * 512; + int64_t write_offset; + int64_t write_end_sector; + + if (extent->compressed) { + if (!extent->has_marker) { + ret = -EINVAL; + goto out; + } + buf_len = (extent->cluster_sectors << 9) * 2; + data = g_malloc(buf_len + sizeof(VmdkGrainMarker)); + if (compress(data->data, &buf_len, buf, nb_sectors << 9) != Z_OK || + buf_len == 0) { + ret = -EINVAL; + goto out; + } + data->lba = sector_num; + data->size = buf_len; + write_buf = (uint8_t *)data; + write_len = buf_len + sizeof(VmdkGrainMarker); + } + write_offset = cluster_offset + offset_in_cluster, + ret = bdrv_pwrite(extent->file->bs, write_offset, write_buf, write_len); + + write_end_sector = DIV_ROUND_UP(write_offset + write_len, BDRV_SECTOR_SIZE); + + if (extent->compressed) { + extent->next_cluster_sector = write_end_sector; + } else { + extent->next_cluster_sector = MAX(extent->next_cluster_sector, + write_end_sector); + } + + if (ret != write_len) { + ret = ret < 0 ? ret : -EIO; + goto out; + } + ret = 0; + out: + g_free(data); + return ret; +} + +static int vmdk_read_extent(VmdkExtent *extent, int64_t cluster_offset, + int64_t offset_in_cluster, uint8_t *buf, + int nb_sectors) +{ + int ret; + int cluster_bytes, buf_bytes; + uint8_t *cluster_buf, *compressed_data; + uint8_t *uncomp_buf; + uint32_t data_len; + VmdkGrainMarker *marker; + uLongf buf_len; + + + if (!extent->compressed) { + ret = bdrv_pread(extent->file->bs, + cluster_offset + offset_in_cluster, + buf, nb_sectors * 512); + if (ret == nb_sectors * 512) { + return 0; + } else { + return -EIO; + } + } + cluster_bytes = extent->cluster_sectors * 512; + /* Read two clusters in case GrainMarker + compressed data > one cluster */ + buf_bytes = cluster_bytes * 2; + cluster_buf = g_malloc(buf_bytes); + uncomp_buf = g_malloc(cluster_bytes); + ret = bdrv_pread(extent->file->bs, + cluster_offset, + cluster_buf, buf_bytes); + if (ret < 0) { + goto out; + } + compressed_data = cluster_buf; + buf_len = cluster_bytes; + data_len = cluster_bytes; + if (extent->has_marker) { + marker = (VmdkGrainMarker *)cluster_buf; + compressed_data = marker->data; + data_len = le32_to_cpu(marker->size); + } + if (!data_len || data_len > buf_bytes) { + ret = -EINVAL; + goto out; + } + ret = uncompress(uncomp_buf, &buf_len, compressed_data, data_len); + if (ret != Z_OK) { + ret = -EINVAL; + goto out; + + } + if (offset_in_cluster < 0 || + offset_in_cluster + nb_sectors * 512 > buf_len) { + ret = -EINVAL; + goto out; + } + memcpy(buf, uncomp_buf + offset_in_cluster, nb_sectors * 512); + ret = 0; + + out: + g_free(uncomp_buf); + g_free(cluster_buf); + return ret; +} + +static int vmdk_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + int ret; + uint64_t n, index_in_cluster; + VmdkExtent *extent = NULL; + uint64_t cluster_offset; + + while (nb_sectors > 0) { + extent = find_extent(s, sector_num, extent); + if (!extent) { + return -EIO; + } + ret = get_cluster_offset(bs, extent, NULL, + sector_num << 9, false, &cluster_offset, + 0, 0); + index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + if (ret != VMDK_OK) { + /* if not allocated, try to read from parent image, if exist */ + if (bs->backing && ret != VMDK_ZEROED) { + if (!vmdk_is_cid_valid(bs)) { + return -EINVAL; + } + ret = bdrv_read(bs->backing->bs, sector_num, buf, n); + if (ret < 0) { + return ret; + } + } else { + memset(buf, 0, 512 * n); + } + } else { + ret = vmdk_read_extent(extent, + cluster_offset, index_in_cluster * 512, + buf, n); + if (ret) { + return ret; + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + } + return 0; +} + +static coroutine_fn int vmdk_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vmdk_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +/** + * vmdk_write: + * @zeroed: buf is ignored (data is zero), use zeroed_grain GTE feature + * if possible, otherwise return -ENOTSUP. + * @zero_dry_run: used for zeroed == true only, don't update L2 table, just try + * with each cluster. By dry run we can find if the zero write + * is possible without modifying image data. + * + * Returns: error code with 0 for success. + */ +static int vmdk_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors, + bool zeroed, bool zero_dry_run) +{ + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent = NULL; + int ret; + int64_t index_in_cluster, n; + uint64_t cluster_offset; + VmdkMetaData m_data; + + if (sector_num > bs->total_sectors) { + error_report("Wrong offset: sector_num=0x%" PRIx64 + " total_sectors=0x%" PRIx64 "\n", + sector_num, bs->total_sectors); + return -EIO; + } + + while (nb_sectors > 0) { + extent = find_extent(s, sector_num, extent); + if (!extent) { + return -EIO; + } + index_in_cluster = vmdk_find_index_in_cluster(extent, sector_num); + n = extent->cluster_sectors - index_in_cluster; + if (n > nb_sectors) { + n = nb_sectors; + } + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + !(extent->compressed || zeroed), + &cluster_offset, + index_in_cluster, index_in_cluster + n); + if (extent->compressed) { + if (ret == VMDK_OK) { + /* Refuse write to allocated cluster for streamOptimized */ + error_report("Could not write to allocated cluster" + " for streamOptimized"); + return -EIO; + } else { + /* allocate */ + ret = get_cluster_offset(bs, extent, &m_data, sector_num << 9, + true, &cluster_offset, 0, 0); + } + } + if (ret == VMDK_ERROR) { + return -EINVAL; + } + if (zeroed) { + /* Do zeroed write, buf is ignored */ + if (extent->has_zero_grain && + index_in_cluster == 0 && + n >= extent->cluster_sectors) { + n = extent->cluster_sectors; + if (!zero_dry_run) { + /* update L2 tables */ + if (vmdk_L2update(extent, &m_data, VMDK_GTE_ZEROED) + != VMDK_OK) { + return -EIO; + } + } + } else { + return -ENOTSUP; + } + } else { + ret = vmdk_write_extent(extent, + cluster_offset, index_in_cluster * 512, + buf, n, sector_num); + if (ret) { + return ret; + } + if (m_data.valid) { + /* update L2 tables */ + if (vmdk_L2update(extent, &m_data, + cluster_offset >> BDRV_SECTOR_BITS) + != VMDK_OK) { + return -EIO; + } + } + } + nb_sectors -= n; + sector_num += n; + buf += n * 512; + + /* update CID on the first write every time the virtual disk is + * opened */ + if (!s->cid_updated) { + ret = vmdk_write_cid(bs, g_random_int()); + if (ret < 0) { + return ret; + } + s->cid_updated = true; + } + } + return 0; +} + +static coroutine_fn int vmdk_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int vmdk_write_compressed(BlockDriverState *bs, + int64_t sector_num, + const uint8_t *buf, + int nb_sectors) +{ + BDRVVmdkState *s = bs->opaque; + if (s->num_extents == 1 && s->extents[0].compressed) { + return vmdk_write(bs, sector_num, buf, nb_sectors, false, false); + } else { + return -ENOTSUP; + } +} + +static int coroutine_fn vmdk_co_write_zeroes(BlockDriverState *bs, + int64_t sector_num, + int nb_sectors, + BdrvRequestFlags flags) +{ + int ret; + BDRVVmdkState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + /* write zeroes could fail if sectors not aligned to cluster, test it with + * dry_run == true before really updating image */ + ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, true); + if (!ret) { + ret = vmdk_write(bs, sector_num, NULL, nb_sectors, true, false); + } + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int vmdk_create_extent(const char *filename, int64_t filesize, + bool flat, bool compress, bool zeroed_grain, + QemuOpts *opts, Error **errp) +{ + int ret, i; + BlockDriverState *bs = NULL; + VMDK4Header header; + Error *local_err = NULL; + uint32_t tmp, magic, grains, gd_sectors, gt_size, gt_count; + uint32_t *gd_buf = NULL; + int gd_buf_size; + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + assert(bs == NULL); + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + + if (flat) { + ret = bdrv_truncate(bs, filesize); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not truncate file"); + } + goto exit; + } + magic = cpu_to_be32(VMDK4_MAGIC); + memset(&header, 0, sizeof(header)); + if (compress) { + header.version = 3; + } else if (zeroed_grain) { + header.version = 2; + } else { + header.version = 1; + } + header.flags = VMDK4_FLAG_RGD | VMDK4_FLAG_NL_DETECT + | (compress ? VMDK4_FLAG_COMPRESS | VMDK4_FLAG_MARKER : 0) + | (zeroed_grain ? VMDK4_FLAG_ZERO_GRAIN : 0); + header.compressAlgorithm = compress ? VMDK4_COMPRESSION_DEFLATE : 0; + header.capacity = filesize / BDRV_SECTOR_SIZE; + header.granularity = 128; + header.num_gtes_per_gt = BDRV_SECTOR_SIZE; + + grains = DIV_ROUND_UP(filesize / BDRV_SECTOR_SIZE, header.granularity); + gt_size = DIV_ROUND_UP(header.num_gtes_per_gt * sizeof(uint32_t), + BDRV_SECTOR_SIZE); + gt_count = DIV_ROUND_UP(grains, header.num_gtes_per_gt); + gd_sectors = DIV_ROUND_UP(gt_count * sizeof(uint32_t), BDRV_SECTOR_SIZE); + + header.desc_offset = 1; + header.desc_size = 20; + header.rgd_offset = header.desc_offset + header.desc_size; + header.gd_offset = header.rgd_offset + gd_sectors + (gt_size * gt_count); + header.grain_offset = + ROUND_UP(header.gd_offset + gd_sectors + (gt_size * gt_count), + header.granularity); + /* swap endianness for all header fields */ + header.version = cpu_to_le32(header.version); + header.flags = cpu_to_le32(header.flags); + header.capacity = cpu_to_le64(header.capacity); + header.granularity = cpu_to_le64(header.granularity); + header.num_gtes_per_gt = cpu_to_le32(header.num_gtes_per_gt); + header.desc_offset = cpu_to_le64(header.desc_offset); + header.desc_size = cpu_to_le64(header.desc_size); + header.rgd_offset = cpu_to_le64(header.rgd_offset); + header.gd_offset = cpu_to_le64(header.gd_offset); + header.grain_offset = cpu_to_le64(header.grain_offset); + header.compressAlgorithm = cpu_to_le16(header.compressAlgorithm); + + header.check_bytes[0] = 0xa; + header.check_bytes[1] = 0x20; + header.check_bytes[2] = 0xd; + header.check_bytes[3] = 0xa; + + /* write all the data */ + ret = bdrv_pwrite(bs, 0, &magic, sizeof(magic)); + if (ret < 0) { + error_setg(errp, QERR_IO_ERROR); + goto exit; + } + ret = bdrv_pwrite(bs, sizeof(magic), &header, sizeof(header)); + if (ret < 0) { + error_setg(errp, QERR_IO_ERROR); + goto exit; + } + + ret = bdrv_truncate(bs, le64_to_cpu(header.grain_offset) << 9); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not truncate file"); + goto exit; + } + + /* write grain directory */ + gd_buf_size = gd_sectors * BDRV_SECTOR_SIZE; + gd_buf = g_malloc0(gd_buf_size); + for (i = 0, tmp = le64_to_cpu(header.rgd_offset) + gd_sectors; + i < gt_count; i++, tmp += gt_size) { + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.rgd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_setg(errp, QERR_IO_ERROR); + goto exit; + } + + /* write backup grain directory */ + for (i = 0, tmp = le64_to_cpu(header.gd_offset) + gd_sectors; + i < gt_count; i++, tmp += gt_size) { + gd_buf[i] = cpu_to_le32(tmp); + } + ret = bdrv_pwrite(bs, le64_to_cpu(header.gd_offset) * BDRV_SECTOR_SIZE, + gd_buf, gd_buf_size); + if (ret < 0) { + error_setg(errp, QERR_IO_ERROR); + goto exit; + } + + ret = 0; +exit: + if (bs) { + bdrv_unref(bs); + } + g_free(gd_buf); + return ret; +} + +static int filename_decompose(const char *filename, char *path, char *prefix, + char *postfix, size_t buf_len, Error **errp) +{ + const char *p, *q; + + if (filename == NULL || !strlen(filename)) { + error_setg(errp, "No filename provided"); + return VMDK_ERROR; + } + p = strrchr(filename, '/'); + if (p == NULL) { + p = strrchr(filename, '\\'); + } + if (p == NULL) { + p = strrchr(filename, ':'); + } + if (p != NULL) { + p++; + if (p - filename >= buf_len) { + return VMDK_ERROR; + } + pstrcpy(path, p - filename + 1, filename); + } else { + p = filename; + path[0] = '\0'; + } + q = strrchr(p, '.'); + if (q == NULL) { + pstrcpy(prefix, buf_len, p); + postfix[0] = '\0'; + } else { + if (q - p >= buf_len) { + return VMDK_ERROR; + } + pstrcpy(prefix, q - p + 1, p); + pstrcpy(postfix, buf_len, q); + } + return VMDK_OK; +} + +static int vmdk_create(const char *filename, QemuOpts *opts, Error **errp) +{ + int idx = 0; + BlockDriverState *new_bs = NULL; + Error *local_err = NULL; + char *desc = NULL; + int64_t total_size = 0, filesize; + char *adapter_type = NULL; + char *backing_file = NULL; + char *fmt = NULL; + int flags = 0; + int ret = 0; + bool flat, split, compress; + GString *ext_desc_lines; + char *path = g_malloc0(PATH_MAX); + char *prefix = g_malloc0(PATH_MAX); + char *postfix = g_malloc0(PATH_MAX); + char *desc_line = g_malloc0(BUF_SIZE); + char *ext_filename = g_malloc0(PATH_MAX); + char *desc_filename = g_malloc0(PATH_MAX); + const int64_t split_size = 0x80000000; /* VMDK has constant split size */ + const char *desc_extent_line; + char *parent_desc_line = g_malloc0(BUF_SIZE); + uint32_t parent_cid = 0xffffffff; + uint32_t number_heads = 16; + bool zeroed_grain = false; + uint32_t desc_offset = 0, desc_len; + const char desc_template[] = + "# Disk DescriptorFile\n" + "version=1\n" + "CID=%" PRIx32 "\n" + "parentCID=%" PRIx32 "\n" + "createType=\"%s\"\n" + "%s" + "\n" + "# Extent description\n" + "%s" + "\n" + "# The Disk Data Base\n" + "#DDB\n" + "\n" + "ddb.virtualHWVersion = \"%d\"\n" + "ddb.geometry.cylinders = \"%" PRId64 "\"\n" + "ddb.geometry.heads = \"%" PRIu32 "\"\n" + "ddb.geometry.sectors = \"63\"\n" + "ddb.adapterType = \"%s\"\n"; + + ext_desc_lines = g_string_new(NULL); + + if (filename_decompose(filename, path, prefix, postfix, PATH_MAX, errp)) { + ret = -EINVAL; + goto exit; + } + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + adapter_type = qemu_opt_get_del(opts, BLOCK_OPT_ADAPTER_TYPE); + backing_file = qemu_opt_get_del(opts, BLOCK_OPT_BACKING_FILE); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_COMPAT6, false)) { + flags |= BLOCK_FLAG_COMPAT6; + } + fmt = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); + if (qemu_opt_get_bool_del(opts, BLOCK_OPT_ZEROED_GRAIN, false)) { + zeroed_grain = true; + } + + if (!adapter_type) { + adapter_type = g_strdup("ide"); + } else if (strcmp(adapter_type, "ide") && + strcmp(adapter_type, "buslogic") && + strcmp(adapter_type, "lsilogic") && + strcmp(adapter_type, "legacyESX")) { + error_setg(errp, "Unknown adapter type: '%s'", adapter_type); + ret = -EINVAL; + goto exit; + } + if (strcmp(adapter_type, "ide") != 0) { + /* that's the number of heads with which vmware operates when + creating, exporting, etc. vmdk files with a non-ide adapter type */ + number_heads = 255; + } + if (!fmt) { + /* Default format to monolithicSparse */ + fmt = g_strdup("monolithicSparse"); + } else if (strcmp(fmt, "monolithicFlat") && + strcmp(fmt, "monolithicSparse") && + strcmp(fmt, "twoGbMaxExtentSparse") && + strcmp(fmt, "twoGbMaxExtentFlat") && + strcmp(fmt, "streamOptimized")) { + error_setg(errp, "Unknown subformat: '%s'", fmt); + ret = -EINVAL; + goto exit; + } + split = !(strcmp(fmt, "twoGbMaxExtentFlat") && + strcmp(fmt, "twoGbMaxExtentSparse")); + flat = !(strcmp(fmt, "monolithicFlat") && + strcmp(fmt, "twoGbMaxExtentFlat")); + compress = !strcmp(fmt, "streamOptimized"); + if (flat) { + desc_extent_line = "RW %" PRId64 " FLAT \"%s\" 0\n"; + } else { + desc_extent_line = "RW %" PRId64 " SPARSE \"%s\"\n"; + } + if (flat && backing_file) { + error_setg(errp, "Flat image can't have backing file"); + ret = -ENOTSUP; + goto exit; + } + if (flat && zeroed_grain) { + error_setg(errp, "Flat image can't enable zeroed grain"); + ret = -ENOTSUP; + goto exit; + } + if (backing_file) { + BlockDriverState *bs = NULL; + char *full_backing = g_new0(char, PATH_MAX); + bdrv_get_full_backing_filename_from_filename(filename, backing_file, + full_backing, PATH_MAX, + &local_err); + if (local_err) { + g_free(full_backing); + error_propagate(errp, local_err); + ret = -ENOENT; + goto exit; + } + ret = bdrv_open(&bs, full_backing, NULL, NULL, BDRV_O_NO_BACKING, errp); + g_free(full_backing); + if (ret != 0) { + goto exit; + } + if (strcmp(bs->drv->format_name, "vmdk")) { + bdrv_unref(bs); + ret = -EINVAL; + goto exit; + } + parent_cid = vmdk_read_cid(bs, 0); + bdrv_unref(bs); + snprintf(parent_desc_line, BUF_SIZE, + "parentFileNameHint=\"%s\"", backing_file); + } + + /* Create extents */ + filesize = total_size; + while (filesize > 0) { + int64_t size = filesize; + + if (split && size > split_size) { + size = split_size; + } + if (split) { + snprintf(desc_filename, PATH_MAX, "%s-%c%03d%s", + prefix, flat ? 'f' : 's', ++idx, postfix); + } else if (flat) { + snprintf(desc_filename, PATH_MAX, "%s-flat%s", prefix, postfix); + } else { + snprintf(desc_filename, PATH_MAX, "%s%s", prefix, postfix); + } + snprintf(ext_filename, PATH_MAX, "%s%s", path, desc_filename); + + if (vmdk_create_extent(ext_filename, size, + flat, compress, zeroed_grain, opts, errp)) { + ret = -EINVAL; + goto exit; + } + filesize -= size; + + /* Format description line */ + snprintf(desc_line, BUF_SIZE, + desc_extent_line, size / BDRV_SECTOR_SIZE, desc_filename); + g_string_append(ext_desc_lines, desc_line); + } + /* generate descriptor file */ + desc = g_strdup_printf(desc_template, + g_random_int(), + parent_cid, + fmt, + parent_desc_line, + ext_desc_lines->str, + (flags & BLOCK_FLAG_COMPAT6 ? 6 : 4), + total_size / + (int64_t)(63 * number_heads * BDRV_SECTOR_SIZE), + number_heads, + adapter_type); + desc_len = strlen(desc); + /* the descriptor offset = 0x200 */ + if (!split && !flat) { + desc_offset = 0x200; + } else { + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + } + assert(new_bs == NULL); + ret = bdrv_open(&new_bs, filename, NULL, NULL, + BDRV_O_RDWR | BDRV_O_PROTOCOL, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto exit; + } + ret = bdrv_pwrite(new_bs, desc_offset, desc, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not write description"); + goto exit; + } + /* bdrv_pwrite write padding zeros to align to sector, we don't need that + * for description file */ + if (desc_offset == 0) { + ret = bdrv_truncate(new_bs, desc_len); + if (ret < 0) { + error_setg_errno(errp, -ret, "Could not truncate file"); + } + } +exit: + if (new_bs) { + bdrv_unref(new_bs); + } + g_free(adapter_type); + g_free(backing_file); + g_free(fmt); + g_free(desc); + g_free(path); + g_free(prefix); + g_free(postfix); + g_free(desc_line); + g_free(ext_filename); + g_free(desc_filename); + g_free(parent_desc_line); + g_string_free(ext_desc_lines, true); + return ret; +} + +static void vmdk_close(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + + vmdk_free_extents(bs); + g_free(s->create_type); + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static coroutine_fn int vmdk_co_flush(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i, err; + int ret = 0; + + for (i = 0; i < s->num_extents; i++) { + err = bdrv_co_flush(s->extents[i].file->bs); + if (err < 0) { + ret = err; + } + } + return ret; +} + +static int64_t vmdk_get_allocated_file_size(BlockDriverState *bs) +{ + int i; + int64_t ret = 0; + int64_t r; + BDRVVmdkState *s = bs->opaque; + + ret = bdrv_get_allocated_file_size(bs->file->bs); + if (ret < 0) { + return ret; + } + for (i = 0; i < s->num_extents; i++) { + if (s->extents[i].file == bs->file) { + continue; + } + r = bdrv_get_allocated_file_size(s->extents[i].file->bs); + if (r < 0) { + return r; + } + ret += r; + } + return ret; +} + +static int vmdk_has_zero_init(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + + /* If has a flat extent and its underlying storage doesn't have zero init, + * return 0. */ + for (i = 0; i < s->num_extents; i++) { + if (s->extents[i].flat) { + if (!bdrv_has_zero_init(s->extents[i].file->bs)) { + return 0; + } + } + } + return 1; +} + +static ImageInfo *vmdk_get_extent_info(VmdkExtent *extent) +{ + ImageInfo *info = g_new0(ImageInfo, 1); + + *info = (ImageInfo){ + .filename = g_strdup(extent->file->bs->filename), + .format = g_strdup(extent->type), + .virtual_size = extent->sectors * BDRV_SECTOR_SIZE, + .compressed = extent->compressed, + .has_compressed = extent->compressed, + .cluster_size = extent->cluster_sectors * BDRV_SECTOR_SIZE, + .has_cluster_size = !extent->flat, + }; + + return info; +} + +static int vmdk_check(BlockDriverState *bs, BdrvCheckResult *result, + BdrvCheckMode fix) +{ + BDRVVmdkState *s = bs->opaque; + VmdkExtent *extent = NULL; + int64_t sector_num = 0; + int64_t total_sectors = bdrv_nb_sectors(bs); + int ret; + uint64_t cluster_offset; + + if (fix) { + return -ENOTSUP; + } + + for (;;) { + if (sector_num >= total_sectors) { + return 0; + } + extent = find_extent(s, sector_num, extent); + if (!extent) { + fprintf(stderr, + "ERROR: could not find extent for sector %" PRId64 "\n", + sector_num); + break; + } + ret = get_cluster_offset(bs, extent, NULL, + sector_num << BDRV_SECTOR_BITS, + false, &cluster_offset, 0, 0); + if (ret == VMDK_ERROR) { + fprintf(stderr, + "ERROR: could not get cluster_offset for sector %" + PRId64 "\n", sector_num); + break; + } + if (ret == VMDK_OK && + cluster_offset >= bdrv_getlength(extent->file->bs)) + { + fprintf(stderr, + "ERROR: cluster offset for sector %" + PRId64 " points after EOF\n", sector_num); + break; + } + sector_num += extent->cluster_sectors; + } + + result->corruptions++; + return 0; +} + +static ImageInfoSpecific *vmdk_get_specific_info(BlockDriverState *bs) +{ + int i; + BDRVVmdkState *s = bs->opaque; + ImageInfoSpecific *spec_info = g_new0(ImageInfoSpecific, 1); + ImageInfoList **next; + + *spec_info = (ImageInfoSpecific){ + .type = IMAGE_INFO_SPECIFIC_KIND_VMDK, + { + .vmdk = g_new0(ImageInfoSpecificVmdk, 1), + }, + }; + + *spec_info->u.vmdk = (ImageInfoSpecificVmdk) { + .create_type = g_strdup(s->create_type), + .cid = s->cid, + .parent_cid = s->parent_cid, + }; + + next = &spec_info->u.vmdk->extents; + for (i = 0; i < s->num_extents; i++) { + *next = g_new0(ImageInfoList, 1); + (*next)->value = vmdk_get_extent_info(&s->extents[i]); + (*next)->next = NULL; + next = &(*next)->next; + } + + return spec_info; +} + +static bool vmdk_extents_type_eq(const VmdkExtent *a, const VmdkExtent *b) +{ + return a->flat == b->flat && + a->compressed == b->compressed && + (a->flat || a->cluster_sectors == b->cluster_sectors); +} + +static int vmdk_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + int i; + BDRVVmdkState *s = bs->opaque; + assert(s->num_extents); + + /* See if we have multiple extents but they have different cases */ + for (i = 1; i < s->num_extents; i++) { + if (!vmdk_extents_type_eq(&s->extents[0], &s->extents[i])) { + return -ENOTSUP; + } + } + bdi->needs_compressed_writes = s->extents[0].compressed; + if (!s->extents[0].flat) { + bdi->cluster_size = s->extents[0].cluster_sectors << BDRV_SECTOR_BITS; + } + return 0; +} + +static void vmdk_detach_aio_context(BlockDriverState *bs) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + bdrv_detach_aio_context(s->extents[i].file->bs); + } +} + +static void vmdk_attach_aio_context(BlockDriverState *bs, + AioContext *new_context) +{ + BDRVVmdkState *s = bs->opaque; + int i; + + for (i = 0; i < s->num_extents; i++) { + bdrv_attach_aio_context(s->extents[i].file->bs, new_context); + } +} + +static QemuOptsList vmdk_create_opts = { + .name = "vmdk-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vmdk_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_ADAPTER_TYPE, + .type = QEMU_OPT_STRING, + .help = "Virtual adapter type, can be one of " + "ide (default), lsilogic, buslogic or legacyESX" + }, + { + .name = BLOCK_OPT_BACKING_FILE, + .type = QEMU_OPT_STRING, + .help = "File name of a base image" + }, + { + .name = BLOCK_OPT_COMPAT6, + .type = QEMU_OPT_BOOL, + .help = "VMDK version 6 image", + .def_value_str = "off" + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = QEMU_OPT_STRING, + .help = + "VMDK flat extent format, can be one of " + "{monolithicSparse (default) | monolithicFlat | twoGbMaxExtentSparse | twoGbMaxExtentFlat | streamOptimized} " + }, + { + .name = BLOCK_OPT_ZEROED_GRAIN, + .type = QEMU_OPT_BOOL, + .help = "Enable efficient zero writes " + "using the zeroed-grain GTE feature" + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_vmdk = { + .format_name = "vmdk", + .instance_size = sizeof(BDRVVmdkState), + .bdrv_probe = vmdk_probe, + .bdrv_open = vmdk_open, + .bdrv_check = vmdk_check, + .bdrv_reopen_prepare = vmdk_reopen_prepare, + .bdrv_read = vmdk_co_read, + .bdrv_write = vmdk_co_write, + .bdrv_write_compressed = vmdk_write_compressed, + .bdrv_co_write_zeroes = vmdk_co_write_zeroes, + .bdrv_close = vmdk_close, + .bdrv_create = vmdk_create, + .bdrv_co_flush_to_disk = vmdk_co_flush, + .bdrv_co_get_block_status = vmdk_co_get_block_status, + .bdrv_get_allocated_file_size = vmdk_get_allocated_file_size, + .bdrv_has_zero_init = vmdk_has_zero_init, + .bdrv_get_specific_info = vmdk_get_specific_info, + .bdrv_refresh_limits = vmdk_refresh_limits, + .bdrv_get_info = vmdk_get_info, + .bdrv_detach_aio_context = vmdk_detach_aio_context, + .bdrv_attach_aio_context = vmdk_attach_aio_context, + + .supports_backing = true, + .create_opts = &vmdk_create_opts, +}; + +static void bdrv_vmdk_init(void) +{ + bdrv_register(&bdrv_vmdk); +} + +block_init(bdrv_vmdk_init); diff --git a/src/block/vpc.c b/src/block/vpc.c new file mode 100644 index 0000000..299d373 --- /dev/null +++ b/src/block/vpc.c @@ -0,0 +1,946 @@ +/* + * Block driver for Connectix / Microsoft Virtual PC images + * + * Copyright (c) 2005 Alex Beregszaszi + * Copyright (c) 2009 Kevin Wolf <kwolf@suse.de> + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#if defined(CONFIG_UUID) +#include <uuid/uuid.h> +#endif + +/**************************************************************/ + +#define HEADER_SIZE 512 + +//#define CACHE + +enum vhd_type { + VHD_FIXED = 2, + VHD_DYNAMIC = 3, + VHD_DIFFERENCING = 4, +}; + +// Seconds since Jan 1, 2000 0:00:00 (UTC) +#define VHD_TIMESTAMP_BASE 946684800 + +#define VHD_MAX_SECTORS (65535LL * 255 * 255) +#define VHD_MAX_GEOMETRY (65535LL * 16 * 255) + +// always big-endian +typedef struct vhd_footer { + char creator[8]; // "conectix" + uint32_t features; + uint32_t version; + + // Offset of next header structure, 0xFFFFFFFF if none + uint64_t data_offset; + + // Seconds since Jan 1, 2000 0:00:00 (UTC) + uint32_t timestamp; + + char creator_app[4]; // "vpc " + uint16_t major; + uint16_t minor; + char creator_os[4]; // "Wi2k" + + uint64_t orig_size; + uint64_t current_size; + + uint16_t cyls; + uint8_t heads; + uint8_t secs_per_cyl; + + uint32_t type; + + // Checksum of the Hard Disk Footer ("one's complement of the sum of all + // the bytes in the footer without the checksum field") + uint32_t checksum; + + // UUID used to identify a parent hard disk (backing file) + uint8_t uuid[16]; + + uint8_t in_saved_state; +} QEMU_PACKED VHDFooter; + +typedef struct vhd_dyndisk_header { + char magic[8]; // "cxsparse" + + // Offset of next header structure, 0xFFFFFFFF if none + uint64_t data_offset; + + // Offset of the Block Allocation Table (BAT) + uint64_t table_offset; + + uint32_t version; + uint32_t max_table_entries; // 32bit/entry + + // 2 MB by default, must be a power of two + uint32_t block_size; + + uint32_t checksum; + uint8_t parent_uuid[16]; + uint32_t parent_timestamp; + uint32_t reserved; + + // Backing file name (in UTF-16) + uint8_t parent_name[512]; + + struct { + uint32_t platform; + uint32_t data_space; + uint32_t data_length; + uint32_t reserved; + uint64_t data_offset; + } parent_locator[8]; +} QEMU_PACKED VHDDynDiskHeader; + +typedef struct BDRVVPCState { + CoMutex lock; + uint8_t footer_buf[HEADER_SIZE]; + uint64_t free_data_block_offset; + int max_table_entries; + uint32_t *pagetable; + uint64_t bat_offset; + uint64_t last_bitmap_offset; + + uint32_t block_size; + uint32_t bitmap_size; + +#ifdef CACHE + uint8_t *pageentry_u8; + uint32_t *pageentry_u32; + uint16_t *pageentry_u16; + + uint64_t last_bitmap; +#endif + + Error *migration_blocker; +} BDRVVPCState; + +static uint32_t vpc_checksum(uint8_t* buf, size_t size) +{ + uint32_t res = 0; + int i; + + for (i = 0; i < size; i++) + res += buf[i]; + + return ~res; +} + + +static int vpc_probe(const uint8_t *buf, int buf_size, const char *filename) +{ + if (buf_size >= 8 && !strncmp((char *)buf, "conectix", 8)) + return 100; + return 0; +} + +static int vpc_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVVPCState *s = bs->opaque; + int i; + VHDFooter *footer; + VHDDynDiskHeader *dyndisk_header; + uint8_t buf[HEADER_SIZE]; + uint32_t checksum; + uint64_t computed_size; + uint64_t pagetable_size; + int disk_type = VHD_DYNAMIC; + int ret; + + ret = bdrv_pread(bs->file->bs, 0, s->footer_buf, HEADER_SIZE); + if (ret < 0) { + goto fail; + } + + footer = (VHDFooter *) s->footer_buf; + if (strncmp(footer->creator, "conectix", 8)) { + int64_t offset = bdrv_getlength(bs->file->bs); + if (offset < 0) { + ret = offset; + goto fail; + } else if (offset < HEADER_SIZE) { + ret = -EINVAL; + goto fail; + } + + /* If a fixed disk, the footer is found only at the end of the file */ + ret = bdrv_pread(bs->file->bs, offset-HEADER_SIZE, s->footer_buf, + HEADER_SIZE); + if (ret < 0) { + goto fail; + } + if (strncmp(footer->creator, "conectix", 8)) { + error_setg(errp, "invalid VPC image"); + ret = -EINVAL; + goto fail; + } + disk_type = VHD_FIXED; + } + + checksum = be32_to_cpu(footer->checksum); + footer->checksum = 0; + if (vpc_checksum(s->footer_buf, HEADER_SIZE) != checksum) + fprintf(stderr, "block-vpc: The header checksum of '%s' is " + "incorrect.\n", bs->filename); + + /* Write 'checksum' back to footer, or else will leave it with zero. */ + footer->checksum = cpu_to_be32(checksum); + + // The visible size of a image in Virtual PC depends on the geometry + // rather than on the size stored in the footer (the size in the footer + // is too large usually) + bs->total_sectors = (int64_t) + be16_to_cpu(footer->cyls) * footer->heads * footer->secs_per_cyl; + + /* Images that have exactly the maximum geometry are probably bigger and + * would be truncated if we adhered to the geometry for them. Rely on + * footer->current_size for them. */ + if (bs->total_sectors == VHD_MAX_GEOMETRY) { + bs->total_sectors = be64_to_cpu(footer->current_size) / + BDRV_SECTOR_SIZE; + } + + /* Allow a maximum disk size of approximately 2 TB */ + if (bs->total_sectors >= VHD_MAX_SECTORS) { + ret = -EFBIG; + goto fail; + } + + if (disk_type == VHD_DYNAMIC) { + ret = bdrv_pread(bs->file->bs, be64_to_cpu(footer->data_offset), buf, + HEADER_SIZE); + if (ret < 0) { + goto fail; + } + + dyndisk_header = (VHDDynDiskHeader *) buf; + + if (strncmp(dyndisk_header->magic, "cxsparse", 8)) { + ret = -EINVAL; + goto fail; + } + + s->block_size = be32_to_cpu(dyndisk_header->block_size); + if (!is_power_of_2(s->block_size) || s->block_size < BDRV_SECTOR_SIZE) { + error_setg(errp, "Invalid block size %" PRIu32, s->block_size); + ret = -EINVAL; + goto fail; + } + s->bitmap_size = ((s->block_size / (8 * 512)) + 511) & ~511; + + s->max_table_entries = be32_to_cpu(dyndisk_header->max_table_entries); + + if ((bs->total_sectors * 512) / s->block_size > 0xffffffffU) { + ret = -EINVAL; + goto fail; + } + if (s->max_table_entries > (VHD_MAX_SECTORS * 512) / s->block_size) { + ret = -EINVAL; + goto fail; + } + + computed_size = (uint64_t) s->max_table_entries * s->block_size; + if (computed_size < bs->total_sectors * 512) { + ret = -EINVAL; + goto fail; + } + + if (s->max_table_entries > SIZE_MAX / 4 || + s->max_table_entries > (int) INT_MAX / 4) { + error_setg(errp, "Max Table Entries too large (%" PRId32 ")", + s->max_table_entries); + ret = -EINVAL; + goto fail; + } + + pagetable_size = (uint64_t) s->max_table_entries * 4; + + s->pagetable = qemu_try_blockalign(bs->file->bs, pagetable_size); + if (s->pagetable == NULL) { + ret = -ENOMEM; + goto fail; + } + + s->bat_offset = be64_to_cpu(dyndisk_header->table_offset); + + ret = bdrv_pread(bs->file->bs, s->bat_offset, s->pagetable, + pagetable_size); + if (ret < 0) { + goto fail; + } + + s->free_data_block_offset = + ROUND_UP(s->bat_offset + pagetable_size, 512); + + for (i = 0; i < s->max_table_entries; i++) { + be32_to_cpus(&s->pagetable[i]); + if (s->pagetable[i] != 0xFFFFFFFF) { + int64_t next = (512 * (int64_t) s->pagetable[i]) + + s->bitmap_size + s->block_size; + + if (next > s->free_data_block_offset) { + s->free_data_block_offset = next; + } + } + } + + if (s->free_data_block_offset > bdrv_getlength(bs->file->bs)) { + error_setg(errp, "block-vpc: free_data_block_offset points after " + "the end of file. The image has been truncated."); + ret = -EINVAL; + goto fail; + } + + s->last_bitmap_offset = (int64_t) -1; + +#ifdef CACHE + s->pageentry_u8 = g_malloc(512); + s->pageentry_u32 = s->pageentry_u8; + s->pageentry_u16 = s->pageentry_u8; + s->last_pagetable = -1; +#endif + } + + qemu_co_mutex_init(&s->lock); + + /* Disable migration when VHD images are used */ + error_setg(&s->migration_blocker, "The vpc format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); + migrate_add_blocker(s->migration_blocker); + + return 0; + +fail: + qemu_vfree(s->pagetable); +#ifdef CACHE + g_free(s->pageentry_u8); +#endif + return ret; +} + +static int vpc_reopen_prepare(BDRVReopenState *state, + BlockReopenQueue *queue, Error **errp) +{ + return 0; +} + +/* + * Returns the absolute byte offset of the given sector in the image file. + * If the sector is not allocated, -1 is returned instead. + * + * The parameter write must be 1 if the offset will be used for a write + * operation (the block bitmaps is updated then), 0 otherwise. + */ +static inline int64_t get_sector_offset(BlockDriverState *bs, + int64_t sector_num, int write) +{ + BDRVVPCState *s = bs->opaque; + uint64_t offset = sector_num * 512; + uint64_t bitmap_offset, block_offset; + uint32_t pagetable_index, pageentry_index; + + pagetable_index = offset / s->block_size; + pageentry_index = (offset % s->block_size) / 512; + + if (pagetable_index >= s->max_table_entries || s->pagetable[pagetable_index] == 0xffffffff) + return -1; // not allocated + + bitmap_offset = 512 * (uint64_t) s->pagetable[pagetable_index]; + block_offset = bitmap_offset + s->bitmap_size + (512 * pageentry_index); + + // We must ensure that we don't write to any sectors which are marked as + // unused in the bitmap. We get away with setting all bits in the block + // bitmap each time we write to a new block. This might cause Virtual PC to + // miss sparse read optimization, but it's not a problem in terms of + // correctness. + if (write && (s->last_bitmap_offset != bitmap_offset)) { + uint8_t bitmap[s->bitmap_size]; + + s->last_bitmap_offset = bitmap_offset; + memset(bitmap, 0xff, s->bitmap_size); + bdrv_pwrite_sync(bs->file->bs, bitmap_offset, bitmap, s->bitmap_size); + } + + return block_offset; +} + +/* + * Writes the footer to the end of the image file. This is needed when the + * file grows as it overwrites the old footer + * + * Returns 0 on success and < 0 on error + */ +static int rewrite_footer(BlockDriverState* bs) +{ + int ret; + BDRVVPCState *s = bs->opaque; + int64_t offset = s->free_data_block_offset; + + ret = bdrv_pwrite_sync(bs->file->bs, offset, s->footer_buf, HEADER_SIZE); + if (ret < 0) + return ret; + + return 0; +} + +/* + * Allocates a new block. This involves writing a new footer and updating + * the Block Allocation Table to use the space at the old end of the image + * file (overwriting the old footer) + * + * Returns the sectors' offset in the image file on success and < 0 on error + */ +static int64_t alloc_block(BlockDriverState* bs, int64_t sector_num) +{ + BDRVVPCState *s = bs->opaque; + int64_t bat_offset; + uint32_t index, bat_value; + int ret; + uint8_t bitmap[s->bitmap_size]; + + // Check if sector_num is valid + if ((sector_num < 0) || (sector_num > bs->total_sectors)) + return -1; + + // Write entry into in-memory BAT + index = (sector_num * 512) / s->block_size; + if (s->pagetable[index] != 0xFFFFFFFF) + return -1; + + s->pagetable[index] = s->free_data_block_offset / 512; + + // Initialize the block's bitmap + memset(bitmap, 0xff, s->bitmap_size); + ret = bdrv_pwrite_sync(bs->file->bs, s->free_data_block_offset, bitmap, + s->bitmap_size); + if (ret < 0) { + return ret; + } + + // Write new footer (the old one will be overwritten) + s->free_data_block_offset += s->block_size + s->bitmap_size; + ret = rewrite_footer(bs); + if (ret < 0) + goto fail; + + // Write BAT entry to disk + bat_offset = s->bat_offset + (4 * index); + bat_value = cpu_to_be32(s->pagetable[index]); + ret = bdrv_pwrite_sync(bs->file->bs, bat_offset, &bat_value, 4); + if (ret < 0) + goto fail; + + return get_sector_offset(bs, sector_num, 0); + +fail: + s->free_data_block_offset -= (s->block_size + s->bitmap_size); + return -1; +} + +static int vpc_get_info(BlockDriverState *bs, BlockDriverInfo *bdi) +{ + BDRVVPCState *s = (BDRVVPCState *)bs->opaque; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (be32_to_cpu(footer->type) != VHD_FIXED) { + bdi->cluster_size = s->block_size; + } + + bdi->unallocated_blocks_are_zero = true; + return 0; +} + +static int vpc_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVPCState *s = bs->opaque; + int ret; + int64_t offset; + int64_t sectors, sectors_per_block; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (be32_to_cpu(footer->type) == VHD_FIXED) { + return bdrv_read(bs->file->bs, sector_num, buf, nb_sectors); + } + while (nb_sectors > 0) { + offset = get_sector_offset(bs, sector_num, 0); + + sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; + sectors = sectors_per_block - (sector_num % sectors_per_block); + if (sectors > nb_sectors) { + sectors = nb_sectors; + } + + if (offset == -1) { + memset(buf, 0, sectors * BDRV_SECTOR_SIZE); + } else { + ret = bdrv_pread(bs->file->bs, offset, buf, + sectors * BDRV_SECTOR_SIZE); + if (ret != sectors * BDRV_SECTOR_SIZE) { + return -1; + } + } + + nb_sectors -= sectors; + sector_num += sectors; + buf += sectors * BDRV_SECTOR_SIZE; + } + return 0; +} + +static coroutine_fn int vpc_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVPCState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vpc_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int vpc_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVPCState *s = bs->opaque; + int64_t offset; + int64_t sectors, sectors_per_block; + int ret; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (be32_to_cpu(footer->type) == VHD_FIXED) { + return bdrv_write(bs->file->bs, sector_num, buf, nb_sectors); + } + while (nb_sectors > 0) { + offset = get_sector_offset(bs, sector_num, 1); + + sectors_per_block = s->block_size >> BDRV_SECTOR_BITS; + sectors = sectors_per_block - (sector_num % sectors_per_block); + if (sectors > nb_sectors) { + sectors = nb_sectors; + } + + if (offset == -1) { + offset = alloc_block(bs, sector_num); + if (offset < 0) + return -1; + } + + ret = bdrv_pwrite(bs->file->bs, offset, buf, + sectors * BDRV_SECTOR_SIZE); + if (ret != sectors * BDRV_SECTOR_SIZE) { + return -1; + } + + nb_sectors -= sectors; + sector_num += sectors; + buf += sectors * BDRV_SECTOR_SIZE; + } + + return 0; +} + +static coroutine_fn int vpc_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVPCState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vpc_write(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int64_t coroutine_fn vpc_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int *pnum) +{ + BDRVVPCState *s = bs->opaque; + VHDFooter *footer = (VHDFooter*) s->footer_buf; + int64_t start, offset; + bool allocated; + int n; + + if (be32_to_cpu(footer->type) == VHD_FIXED) { + *pnum = nb_sectors; + return BDRV_BLOCK_RAW | BDRV_BLOCK_OFFSET_VALID | BDRV_BLOCK_DATA | + (sector_num << BDRV_SECTOR_BITS); + } + + offset = get_sector_offset(bs, sector_num, 0); + start = offset; + allocated = (offset != -1); + *pnum = 0; + + do { + /* All sectors in a block are contiguous (without using the bitmap) */ + n = ROUND_UP(sector_num + 1, s->block_size / BDRV_SECTOR_SIZE) + - sector_num; + n = MIN(n, nb_sectors); + + *pnum += n; + sector_num += n; + nb_sectors -= n; + /* *pnum can't be greater than one block for allocated + * sectors since there is always a bitmap in between. */ + if (allocated) { + return BDRV_BLOCK_DATA | BDRV_BLOCK_OFFSET_VALID | start; + } + if (nb_sectors == 0) { + break; + } + offset = get_sector_offset(bs, sector_num, 0); + } while (offset == -1); + + return 0; +} + +/* + * Calculates the number of cylinders, heads and sectors per cylinder + * based on a given number of sectors. This is the algorithm described + * in the VHD specification. + * + * Note that the geometry doesn't always exactly match total_sectors but + * may round it down. + * + * Returns 0 on success, -EFBIG if the size is larger than ~2 TB. Override + * the hardware EIDE and ATA-2 limit of 16 heads (max disk size of 127 GB) + * and instead allow up to 255 heads. + */ +static int calculate_geometry(int64_t total_sectors, uint16_t* cyls, + uint8_t* heads, uint8_t* secs_per_cyl) +{ + uint32_t cyls_times_heads; + + total_sectors = MIN(total_sectors, VHD_MAX_GEOMETRY); + + if (total_sectors >= 65535LL * 16 * 63) { + *secs_per_cyl = 255; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } else { + *secs_per_cyl = 17; + cyls_times_heads = total_sectors / *secs_per_cyl; + *heads = (cyls_times_heads + 1023) / 1024; + + if (*heads < 4) { + *heads = 4; + } + + if (cyls_times_heads >= (*heads * 1024) || *heads > 16) { + *secs_per_cyl = 31; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } + + if (cyls_times_heads >= (*heads * 1024)) { + *secs_per_cyl = 63; + *heads = 16; + cyls_times_heads = total_sectors / *secs_per_cyl; + } + } + + *cyls = cyls_times_heads / *heads; + + return 0; +} + +static int create_dynamic_disk(BlockDriverState *bs, uint8_t *buf, + int64_t total_sectors) +{ + VHDDynDiskHeader *dyndisk_header = + (VHDDynDiskHeader *) buf; + size_t block_size, num_bat_entries; + int i; + int ret; + int64_t offset = 0; + + // Write the footer (twice: at the beginning and at the end) + block_size = 0x200000; + num_bat_entries = (total_sectors + block_size / 512) / (block_size / 512); + + ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + if (ret) { + goto fail; + } + + offset = 1536 + ((num_bat_entries * 4 + 511) & ~511); + ret = bdrv_pwrite_sync(bs, offset, buf, HEADER_SIZE); + if (ret < 0) { + goto fail; + } + + // Write the initial BAT + offset = 3 * 512; + + memset(buf, 0xFF, 512); + for (i = 0; i < (num_bat_entries * 4 + 511) / 512; i++) { + ret = bdrv_pwrite_sync(bs, offset, buf, 512); + if (ret < 0) { + goto fail; + } + offset += 512; + } + + // Prepare the Dynamic Disk Header + memset(buf, 0, 1024); + + memcpy(dyndisk_header->magic, "cxsparse", 8); + + /* + * Note: The spec is actually wrong here for data_offset, it says + * 0xFFFFFFFF, but MS tools expect all 64 bits to be set. + */ + dyndisk_header->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL); + dyndisk_header->table_offset = cpu_to_be64(3 * 512); + dyndisk_header->version = cpu_to_be32(0x00010000); + dyndisk_header->block_size = cpu_to_be32(block_size); + dyndisk_header->max_table_entries = cpu_to_be32(num_bat_entries); + + dyndisk_header->checksum = cpu_to_be32(vpc_checksum(buf, 1024)); + + // Write the header + offset = 512; + + ret = bdrv_pwrite_sync(bs, offset, buf, 1024); + if (ret < 0) { + goto fail; + } + + fail: + return ret; +} + +static int create_fixed_disk(BlockDriverState *bs, uint8_t *buf, + int64_t total_size) +{ + int ret; + + /* Add footer to total size */ + total_size += HEADER_SIZE; + + ret = bdrv_truncate(bs, total_size); + if (ret < 0) { + return ret; + } + + ret = bdrv_pwrite_sync(bs, total_size - HEADER_SIZE, buf, HEADER_SIZE); + if (ret < 0) { + return ret; + } + + return ret; +} + +static int vpc_create(const char *filename, QemuOpts *opts, Error **errp) +{ + uint8_t buf[1024]; + VHDFooter *footer = (VHDFooter *) buf; + char *disk_type_param; + int i; + uint16_t cyls = 0; + uint8_t heads = 0; + uint8_t secs_per_cyl = 0; + int64_t total_sectors; + int64_t total_size; + int disk_type; + int ret = -EIO; + Error *local_err = NULL; + BlockDriverState *bs = NULL; + + /* Read out options */ + total_size = ROUND_UP(qemu_opt_get_size_del(opts, BLOCK_OPT_SIZE, 0), + BDRV_SECTOR_SIZE); + disk_type_param = qemu_opt_get_del(opts, BLOCK_OPT_SUBFMT); + if (disk_type_param) { + if (!strcmp(disk_type_param, "dynamic")) { + disk_type = VHD_DYNAMIC; + } else if (!strcmp(disk_type_param, "fixed")) { + disk_type = VHD_FIXED; + } else { + ret = -EINVAL; + goto out; + } + } else { + disk_type = VHD_DYNAMIC; + } + + ret = bdrv_create_file(filename, opts, &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; + } + ret = bdrv_open(&bs, filename, NULL, NULL, BDRV_O_RDWR | BDRV_O_PROTOCOL, + &local_err); + if (ret < 0) { + error_propagate(errp, local_err); + goto out; + } + + /* + * Calculate matching total_size and geometry. Increase the number of + * sectors requested until we get enough (or fail). This ensures that + * qemu-img convert doesn't truncate images, but rather rounds up. + * + * If the image size can't be represented by a spec conform CHS geometry, + * we set the geometry to 65535 x 16 x 255 (CxHxS) sectors and use + * the image size from the VHD footer to calculate total_sectors. + */ + total_sectors = MIN(VHD_MAX_GEOMETRY, total_size / BDRV_SECTOR_SIZE); + for (i = 0; total_sectors > (int64_t)cyls * heads * secs_per_cyl; i++) { + calculate_geometry(total_sectors + i, &cyls, &heads, &secs_per_cyl); + } + + if ((int64_t)cyls * heads * secs_per_cyl == VHD_MAX_GEOMETRY) { + total_sectors = total_size / BDRV_SECTOR_SIZE; + /* Allow a maximum disk size of approximately 2 TB */ + if (total_sectors > VHD_MAX_SECTORS) { + ret = -EFBIG; + goto out; + } + } else { + total_sectors = (int64_t)cyls * heads * secs_per_cyl; + total_size = total_sectors * BDRV_SECTOR_SIZE; + } + + /* Prepare the Hard Disk Footer */ + memset(buf, 0, 1024); + + memcpy(footer->creator, "conectix", 8); + /* TODO Check if "qemu" creator_app is ok for VPC */ + memcpy(footer->creator_app, "qemu", 4); + memcpy(footer->creator_os, "Wi2k", 4); + + footer->features = cpu_to_be32(0x02); + footer->version = cpu_to_be32(0x00010000); + if (disk_type == VHD_DYNAMIC) { + footer->data_offset = cpu_to_be64(HEADER_SIZE); + } else { + footer->data_offset = cpu_to_be64(0xFFFFFFFFFFFFFFFFULL); + } + footer->timestamp = cpu_to_be32(time(NULL) - VHD_TIMESTAMP_BASE); + + /* Version of Virtual PC 2007 */ + footer->major = cpu_to_be16(0x0005); + footer->minor = cpu_to_be16(0x0003); + footer->orig_size = cpu_to_be64(total_size); + footer->current_size = cpu_to_be64(total_size); + footer->cyls = cpu_to_be16(cyls); + footer->heads = heads; + footer->secs_per_cyl = secs_per_cyl; + + footer->type = cpu_to_be32(disk_type); + +#if defined(CONFIG_UUID) + uuid_generate(footer->uuid); +#endif + + footer->checksum = cpu_to_be32(vpc_checksum(buf, HEADER_SIZE)); + + if (disk_type == VHD_DYNAMIC) { + ret = create_dynamic_disk(bs, buf, total_sectors); + } else { + ret = create_fixed_disk(bs, buf, total_size); + } + +out: + bdrv_unref(bs); + g_free(disk_type_param); + return ret; +} + +static int vpc_has_zero_init(BlockDriverState *bs) +{ + BDRVVPCState *s = bs->opaque; + VHDFooter *footer = (VHDFooter *) s->footer_buf; + + if (be32_to_cpu(footer->type) == VHD_FIXED) { + return bdrv_has_zero_init(bs->file->bs); + } else { + return 1; + } +} + +static void vpc_close(BlockDriverState *bs) +{ + BDRVVPCState *s = bs->opaque; + qemu_vfree(s->pagetable); +#ifdef CACHE + g_free(s->pageentry_u8); +#endif + + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); +} + +static QemuOptsList vpc_create_opts = { + .name = "vpc-create-opts", + .head = QTAILQ_HEAD_INITIALIZER(vpc_create_opts.head), + .desc = { + { + .name = BLOCK_OPT_SIZE, + .type = QEMU_OPT_SIZE, + .help = "Virtual disk size" + }, + { + .name = BLOCK_OPT_SUBFMT, + .type = QEMU_OPT_STRING, + .help = + "Type of virtual hard disk format. Supported formats are " + "{dynamic (default) | fixed} " + }, + { /* end of list */ } + } +}; + +static BlockDriver bdrv_vpc = { + .format_name = "vpc", + .instance_size = sizeof(BDRVVPCState), + + .bdrv_probe = vpc_probe, + .bdrv_open = vpc_open, + .bdrv_close = vpc_close, + .bdrv_reopen_prepare = vpc_reopen_prepare, + .bdrv_create = vpc_create, + + .bdrv_read = vpc_co_read, + .bdrv_write = vpc_co_write, + .bdrv_co_get_block_status = vpc_co_get_block_status, + + .bdrv_get_info = vpc_get_info, + + .create_opts = &vpc_create_opts, + .bdrv_has_zero_init = vpc_has_zero_init, +}; + +static void bdrv_vpc_init(void) +{ + bdrv_register(&bdrv_vpc); +} + +block_init(bdrv_vpc_init); diff --git a/src/block/vvfat.c b/src/block/vvfat.c new file mode 100644 index 0000000..b184eca --- /dev/null +++ b/src/block/vvfat.c @@ -0,0 +1,3043 @@ +/* vim:set shiftwidth=4 ts=4: */ +/* + * QEMU Block driver for virtual VFAT (shadows a local directory) + * + * Copyright (c) 2004,2005 Johannes E. Schindelin + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include <sys/stat.h> +#include <dirent.h> +#include "qemu-common.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "migration/migration.h" +#include "qapi/qmp/qint.h" +#include "qapi/qmp/qbool.h" +#include "qapi/qmp/qstring.h" + +#ifndef S_IWGRP +#define S_IWGRP 0 +#endif +#ifndef S_IWOTH +#define S_IWOTH 0 +#endif + +/* TODO: add ":bootsector=blabla.img:" */ +/* LATER TODO: add automatic boot sector generation from + BOOTEASY.ASM and Ranish Partition Manager + Note that DOS assumes the system files to be the first files in the + file system (test if the boot sector still relies on that fact)! */ +/* MAYBE TODO: write block-visofs.c */ +/* TODO: call try_commit() only after a timeout */ + +/* #define DEBUG */ + +#ifdef DEBUG + +#define DLOG(a) a + +static void checkpoint(void); + +#ifdef __MINGW32__ +void nonono(const char* file, int line, const char* msg) { + fprintf(stderr, "Nonono! %s:%d %s\n", file, line, msg); + exit(-5); +} +#undef assert +#define assert(a) do {if (!(a)) nonono(__FILE__, __LINE__, #a);}while(0) +#endif + +#else + +#define DLOG(a) + +#endif + +/* dynamic array functions */ +typedef struct array_t { + char* pointer; + unsigned int size,next,item_size; +} array_t; + +static inline void array_init(array_t* array,unsigned int item_size) +{ + array->pointer = NULL; + array->size=0; + array->next=0; + array->item_size=item_size; +} + +static inline void array_free(array_t* array) +{ + g_free(array->pointer); + array->size=array->next=0; +} + +/* does not automatically grow */ +static inline void* array_get(array_t* array,unsigned int index) { + assert(index < array->next); + return array->pointer + index * array->item_size; +} + +static inline int array_ensure_allocated(array_t* array, int index) +{ + if((index + 1) * array->item_size > array->size) { + int new_size = (index + 32) * array->item_size; + array->pointer = g_realloc(array->pointer, new_size); + if (!array->pointer) + return -1; + array->size = new_size; + array->next = index + 1; + } + + return 0; +} + +static inline void* array_get_next(array_t* array) { + unsigned int next = array->next; + void* result; + + if (array_ensure_allocated(array, next) < 0) + return NULL; + + array->next = next + 1; + result = array_get(array, next); + + return result; +} + +static inline void* array_insert(array_t* array,unsigned int index,unsigned int count) { + if((array->next+count)*array->item_size>array->size) { + int increment=count*array->item_size; + array->pointer=g_realloc(array->pointer,array->size+increment); + if(!array->pointer) + return NULL; + array->size+=increment; + } + memmove(array->pointer+(index+count)*array->item_size, + array->pointer+index*array->item_size, + (array->next-index)*array->item_size); + array->next+=count; + return array->pointer+index*array->item_size; +} + +/* this performs a "roll", so that the element which was at index_from becomes + * index_to, but the order of all other elements is preserved. */ +static inline int array_roll(array_t* array,int index_to,int index_from,int count) +{ + char* buf; + char* from; + char* to; + int is; + + if(!array || + index_to<0 || index_to>=array->next || + index_from<0 || index_from>=array->next) + return -1; + + if(index_to==index_from) + return 0; + + is=array->item_size; + from=array->pointer+index_from*is; + to=array->pointer+index_to*is; + buf=g_malloc(is*count); + memcpy(buf,from,is*count); + + if(index_to<index_from) + memmove(to+is*count,to,from-to); + else + memmove(from,from+is*count,to-from); + + memcpy(to,buf,is*count); + + g_free(buf); + + return 0; +} + +static inline int array_remove_slice(array_t* array,int index, int count) +{ + assert(index >=0); + assert(count > 0); + assert(index + count <= array->next); + if(array_roll(array,array->next-1,index,count)) + return -1; + array->next -= count; + return 0; +} + +static int array_remove(array_t* array,int index) +{ + return array_remove_slice(array, index, 1); +} + +/* return the index for a given member */ +static int array_index(array_t* array, void* pointer) +{ + size_t offset = (char*)pointer - array->pointer; + assert((offset % array->item_size) == 0); + assert(offset/array->item_size < array->next); + return offset/array->item_size; +} + +/* These structures are used to fake a disk and the VFAT filesystem. + * For this reason we need to use QEMU_PACKED. */ + +typedef struct bootsector_t { + uint8_t jump[3]; + uint8_t name[8]; + uint16_t sector_size; + uint8_t sectors_per_cluster; + uint16_t reserved_sectors; + uint8_t number_of_fats; + uint16_t root_entries; + uint16_t total_sectors16; + uint8_t media_type; + uint16_t sectors_per_fat; + uint16_t sectors_per_track; + uint16_t number_of_heads; + uint32_t hidden_sectors; + uint32_t total_sectors; + union { + struct { + uint8_t drive_number; + uint8_t current_head; + uint8_t signature; + uint32_t id; + uint8_t volume_label[11]; + } QEMU_PACKED fat16; + struct { + uint32_t sectors_per_fat; + uint16_t flags; + uint8_t major,minor; + uint32_t first_cluster_of_root_directory; + uint16_t info_sector; + uint16_t backup_boot_sector; + uint16_t ignored; + } QEMU_PACKED fat32; + } u; + uint8_t fat_type[8]; + uint8_t ignored[0x1c0]; + uint8_t magic[2]; +} QEMU_PACKED bootsector_t; + +typedef struct { + uint8_t head; + uint8_t sector; + uint8_t cylinder; +} mbr_chs_t; + +typedef struct partition_t { + uint8_t attributes; /* 0x80 = bootable */ + mbr_chs_t start_CHS; + uint8_t fs_type; /* 0x1 = FAT12, 0x6 = FAT16, 0xe = FAT16_LBA, 0xb = FAT32, 0xc = FAT32_LBA */ + mbr_chs_t end_CHS; + uint32_t start_sector_long; + uint32_t length_sector_long; +} QEMU_PACKED partition_t; + +typedef struct mbr_t { + uint8_t ignored[0x1b8]; + uint32_t nt_id; + uint8_t ignored2[2]; + partition_t partition[4]; + uint8_t magic[2]; +} QEMU_PACKED mbr_t; + +typedef struct direntry_t { + uint8_t name[8 + 3]; + uint8_t attributes; + uint8_t reserved[2]; + uint16_t ctime; + uint16_t cdate; + uint16_t adate; + uint16_t begin_hi; + uint16_t mtime; + uint16_t mdate; + uint16_t begin; + uint32_t size; +} QEMU_PACKED direntry_t; + +/* this structure are used to transparently access the files */ + +typedef struct mapping_t { + /* begin is the first cluster, end is the last+1 */ + uint32_t begin,end; + /* as s->directory is growable, no pointer may be used here */ + unsigned int dir_index; + /* the clusters of a file may be in any order; this points to the first */ + int first_mapping_index; + union { + /* offset is + * - the offset in the file (in clusters) for a file, or + * - the next cluster of the directory for a directory, and + * - the address of the buffer for a faked entry + */ + struct { + uint32_t offset; + } file; + struct { + int parent_mapping_index; + int first_dir_index; + } dir; + } info; + /* path contains the full path, i.e. it always starts with s->path */ + char* path; + + enum { MODE_UNDEFINED = 0, MODE_NORMAL = 1, MODE_MODIFIED = 2, + MODE_DIRECTORY = 4, MODE_FAKED = 8, + MODE_DELETED = 16, MODE_RENAMED = 32 } mode; + int read_only; +} mapping_t; + +#ifdef DEBUG +static void print_direntry(const struct direntry_t*); +static void print_mapping(const struct mapping_t* mapping); +#endif + +/* here begins the real VVFAT driver */ + +typedef struct BDRVVVFATState { + CoMutex lock; + BlockDriverState* bs; /* pointer to parent */ + unsigned int first_sectors_number; /* 1 for a single partition, 0x40 for a disk with partition table */ + unsigned char first_sectors[0x40*0x200]; + + int fat_type; /* 16 or 32 */ + array_t fat,directory,mapping; + char volume_label[11]; + + unsigned int cluster_size; + unsigned int sectors_per_cluster; + unsigned int sectors_per_fat; + unsigned int sectors_of_root_directory; + uint32_t last_cluster_of_root_directory; + unsigned int faked_sectors; /* how many sectors are faked before file data */ + uint32_t sector_count; /* total number of sectors of the partition */ + uint32_t cluster_count; /* total number of clusters of this partition */ + uint32_t max_fat_value; + + int current_fd; + mapping_t* current_mapping; + unsigned char* cluster; /* points to current cluster */ + unsigned char* cluster_buffer; /* points to a buffer to hold temp data */ + unsigned int current_cluster; + + /* write support */ + BlockDriverState* write_target; + char* qcow_filename; + BlockDriverState* qcow; + void* fat2; + char* used_clusters; + array_t commits; + const char* path; + int downcase_short_names; + + Error *migration_blocker; +} BDRVVVFATState; + +/* take the sector position spos and convert it to Cylinder/Head/Sector position + * if the position is outside the specified geometry, fill maximum value for CHS + * and return 1 to signal overflow. + */ +static int sector2CHS(mbr_chs_t *chs, int spos, int cyls, int heads, int secs) +{ + int head,sector; + sector = spos % secs; spos /= secs; + head = spos % heads; spos /= heads; + if (spos >= cyls) { + /* Overflow, + it happens if 32bit sector positions are used, while CHS is only 24bit. + Windows/Dos is said to take 1023/255/63 as nonrepresentable CHS */ + chs->head = 0xFF; + chs->sector = 0xFF; + chs->cylinder = 0xFF; + return 1; + } + chs->head = (uint8_t)head; + chs->sector = (uint8_t)( (sector+1) | ((spos>>8)<<6) ); + chs->cylinder = (uint8_t)spos; + return 0; +} + +static void init_mbr(BDRVVVFATState *s, int cyls, int heads, int secs) +{ + /* TODO: if the files mbr.img and bootsect.img exist, use them */ + mbr_t* real_mbr=(mbr_t*)s->first_sectors; + partition_t* partition = &(real_mbr->partition[0]); + int lba; + + memset(s->first_sectors,0,512); + + /* Win NT Disk Signature */ + real_mbr->nt_id= cpu_to_le32(0xbe1afdfa); + + partition->attributes=0x80; /* bootable */ + + /* LBA is used when partition is outside the CHS geometry */ + lba = sector2CHS(&partition->start_CHS, s->first_sectors_number - 1, + cyls, heads, secs); + lba |= sector2CHS(&partition->end_CHS, s->bs->total_sectors - 1, + cyls, heads, secs); + + /*LBA partitions are identified only by start/length_sector_long not by CHS*/ + partition->start_sector_long = cpu_to_le32(s->first_sectors_number - 1); + partition->length_sector_long = cpu_to_le32(s->bs->total_sectors + - s->first_sectors_number + 1); + + /* FAT12/FAT16/FAT32 */ + /* DOS uses different types when partition is LBA, + probably to prevent older versions from using CHS on them */ + partition->fs_type= s->fat_type==12 ? 0x1: + s->fat_type==16 ? (lba?0xe:0x06): + /*fat_tyoe==32*/ (lba?0xc:0x0b); + + real_mbr->magic[0]=0x55; real_mbr->magic[1]=0xaa; +} + +/* direntry functions */ + +/* dest is assumed to hold 258 bytes, and pads with 0xffff up to next multiple of 26 */ +static inline int short2long_name(char* dest,const char* src) +{ + int i; + int len; + for(i=0;i<129 && src[i];i++) { + dest[2*i]=src[i]; + dest[2*i+1]=0; + } + len=2*i; + dest[2*i]=dest[2*i+1]=0; + for(i=2*i+2;(i%26);i++) + dest[i]=0xff; + return len; +} + +static inline direntry_t* create_long_filename(BDRVVVFATState* s,const char* filename) +{ + char buffer[258]; + int length=short2long_name(buffer,filename), + number_of_entries=(length+25)/26,i; + direntry_t* entry; + + for(i=0;i<number_of_entries;i++) { + entry=array_get_next(&(s->directory)); + entry->attributes=0xf; + entry->reserved[0]=0; + entry->begin=0; + entry->name[0]=(number_of_entries-i)|(i==0?0x40:0); + } + for(i=0;i<26*number_of_entries;i++) { + int offset=(i%26); + if(offset<10) offset=1+offset; + else if(offset<22) offset=14+offset-10; + else offset=28+offset-22; + entry=array_get(&(s->directory),s->directory.next-1-(i/26)); + entry->name[offset]=buffer[i]; + } + return array_get(&(s->directory),s->directory.next-number_of_entries); +} + +static char is_free(const direntry_t* direntry) +{ + return direntry->name[0]==0xe5 || direntry->name[0]==0x00; +} + +static char is_volume_label(const direntry_t* direntry) +{ + return direntry->attributes == 0x28; +} + +static char is_long_name(const direntry_t* direntry) +{ + return direntry->attributes == 0xf; +} + +static char is_short_name(const direntry_t* direntry) +{ + return !is_volume_label(direntry) && !is_long_name(direntry) + && !is_free(direntry); +} + +static char is_directory(const direntry_t* direntry) +{ + return direntry->attributes & 0x10 && direntry->name[0] != 0xe5; +} + +static inline char is_dot(const direntry_t* direntry) +{ + return is_short_name(direntry) && direntry->name[0] == '.'; +} + +static char is_file(const direntry_t* direntry) +{ + return is_short_name(direntry) && !is_directory(direntry); +} + +static inline uint32_t begin_of_direntry(const direntry_t* direntry) +{ + return le16_to_cpu(direntry->begin)|(le16_to_cpu(direntry->begin_hi)<<16); +} + +static inline uint32_t filesize_of_direntry(const direntry_t* direntry) +{ + return le32_to_cpu(direntry->size); +} + +static void set_begin_of_direntry(direntry_t* direntry, uint32_t begin) +{ + direntry->begin = cpu_to_le16(begin & 0xffff); + direntry->begin_hi = cpu_to_le16((begin >> 16) & 0xffff); +} + +/* fat functions */ + +static inline uint8_t fat_chksum(const direntry_t* entry) +{ + uint8_t chksum=0; + int i; + + for (i = 0; i < ARRAY_SIZE(entry->name); i++) { + chksum = (((chksum & 0xfe) >> 1) | + ((chksum & 0x01) ? 0x80 : 0)) + entry->name[i]; + } + + return chksum; +} + +/* if return_time==0, this returns the fat_date, else the fat_time */ +static uint16_t fat_datetime(time_t time,int return_time) { + struct tm* t; + struct tm t1; + t = &t1; + localtime_r(&time,t); + if(return_time) + return cpu_to_le16((t->tm_sec/2)|(t->tm_min<<5)|(t->tm_hour<<11)); + return cpu_to_le16((t->tm_mday)|((t->tm_mon+1)<<5)|((t->tm_year-80)<<9)); +} + +static inline void fat_set(BDRVVVFATState* s,unsigned int cluster,uint32_t value) +{ + if(s->fat_type==32) { + uint32_t* entry=array_get(&(s->fat),cluster); + *entry=cpu_to_le32(value); + } else if(s->fat_type==16) { + uint16_t* entry=array_get(&(s->fat),cluster); + *entry=cpu_to_le16(value&0xffff); + } else { + int offset = (cluster*3/2); + unsigned char* p = array_get(&(s->fat), offset); + switch (cluster&1) { + case 0: + p[0] = value&0xff; + p[1] = (p[1]&0xf0) | ((value>>8)&0xf); + break; + case 1: + p[0] = (p[0]&0xf) | ((value&0xf)<<4); + p[1] = (value>>4); + break; + } + } +} + +static inline uint32_t fat_get(BDRVVVFATState* s,unsigned int cluster) +{ + if(s->fat_type==32) { + uint32_t* entry=array_get(&(s->fat),cluster); + return le32_to_cpu(*entry); + } else if(s->fat_type==16) { + uint16_t* entry=array_get(&(s->fat),cluster); + return le16_to_cpu(*entry); + } else { + const uint8_t* x=(uint8_t*)(s->fat.pointer)+cluster*3/2; + return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; + } +} + +static inline int fat_eof(BDRVVVFATState* s,uint32_t fat_entry) +{ + if(fat_entry>s->max_fat_value-8) + return -1; + return 0; +} + +static inline void init_fat(BDRVVVFATState* s) +{ + if (s->fat_type == 12) { + array_init(&(s->fat),1); + array_ensure_allocated(&(s->fat), + s->sectors_per_fat * 0x200 * 3 / 2 - 1); + } else { + array_init(&(s->fat),(s->fat_type==32?4:2)); + array_ensure_allocated(&(s->fat), + s->sectors_per_fat * 0x200 / s->fat.item_size - 1); + } + memset(s->fat.pointer,0,s->fat.size); + + switch(s->fat_type) { + case 12: s->max_fat_value=0xfff; break; + case 16: s->max_fat_value=0xffff; break; + case 32: s->max_fat_value=0x0fffffff; break; + default: s->max_fat_value=0; /* error... */ + } + +} + +/* TODO: in create_short_filename, 0xe5->0x05 is not yet handled! */ +/* TODO: in parse_short_filename, 0x05->0xe5 is not yet handled! */ +static inline direntry_t* create_short_and_long_name(BDRVVVFATState* s, + unsigned int directory_start, const char* filename, int is_dot) +{ + int i,j,long_index=s->directory.next; + direntry_t* entry = NULL; + direntry_t* entry_long = NULL; + + if(is_dot) { + entry=array_get_next(&(s->directory)); + memset(entry->name, 0x20, sizeof(entry->name)); + memcpy(entry->name,filename,strlen(filename)); + return entry; + } + + entry_long=create_long_filename(s,filename); + + i = strlen(filename); + for(j = i - 1; j>0 && filename[j]!='.';j--); + if (j > 0) + i = (j > 8 ? 8 : j); + else if (i > 8) + i = 8; + + entry=array_get_next(&(s->directory)); + memset(entry->name, 0x20, sizeof(entry->name)); + memcpy(entry->name, filename, i); + + if (j > 0) { + for (i = 0; i < 3 && filename[j + 1 + i]; i++) { + entry->name[8 + i] = filename[j + 1 + i]; + } + } + + /* upcase & remove unwanted characters */ + for(i=10;i>=0;i--) { + if(i==10 || i==7) for(;i>0 && entry->name[i]==' ';i--); + if(entry->name[i]<=' ' || entry->name[i]>0x7f + || strchr(".*?<>|\":/\\[];,+='",entry->name[i])) + entry->name[i]='_'; + else if(entry->name[i]>='a' && entry->name[i]<='z') + entry->name[i]+='A'-'a'; + } + + /* mangle duplicates */ + while(1) { + direntry_t* entry1=array_get(&(s->directory),directory_start); + int j; + + for(;entry1<entry;entry1++) + if(!is_long_name(entry1) && !memcmp(entry1->name,entry->name,11)) + break; /* found dupe */ + if(entry1==entry) /* no dupe found */ + break; + + /* use all 8 characters of name */ + if(entry->name[7]==' ') { + int j; + for(j=6;j>0 && entry->name[j]==' ';j--) + entry->name[j]='~'; + } + + /* increment number */ + for(j=7;j>0 && entry->name[j]=='9';j--) + entry->name[j]='0'; + if(j>0) { + if(entry->name[j]<'0' || entry->name[j]>'9') + entry->name[j]='0'; + else + entry->name[j]++; + } + } + + /* calculate checksum; propagate to long name */ + if(entry_long) { + uint8_t chksum=fat_chksum(entry); + + /* calculate anew, because realloc could have taken place */ + entry_long=array_get(&(s->directory),long_index); + while(entry_long<entry && is_long_name(entry_long)) { + entry_long->reserved[1]=chksum; + entry_long++; + } + } + + return entry; +} + +/* + * Read a directory. (the index of the corresponding mapping must be passed). + */ +static int read_directory(BDRVVVFATState* s, int mapping_index) +{ + mapping_t* mapping = array_get(&(s->mapping), mapping_index); + direntry_t* direntry; + const char* dirname = mapping->path; + int first_cluster = mapping->begin; + int parent_index = mapping->info.dir.parent_mapping_index; + mapping_t* parent_mapping = (mapping_t*) + (parent_index >= 0 ? array_get(&(s->mapping), parent_index) : NULL); + int first_cluster_of_parent = parent_mapping ? parent_mapping->begin : -1; + + DIR* dir=opendir(dirname); + struct dirent* entry; + int i; + + assert(mapping->mode & MODE_DIRECTORY); + + if(!dir) { + mapping->end = mapping->begin; + return -1; + } + + i = mapping->info.dir.first_dir_index = + first_cluster == 0 ? 0 : s->directory.next; + + /* actually read the directory, and allocate the mappings */ + while((entry=readdir(dir))) { + unsigned int length=strlen(dirname)+2+strlen(entry->d_name); + char* buffer; + direntry_t* direntry; + struct stat st; + int is_dot=!strcmp(entry->d_name,"."); + int is_dotdot=!strcmp(entry->d_name,".."); + + if(first_cluster == 0 && (is_dotdot || is_dot)) + continue; + + buffer = g_malloc(length); + snprintf(buffer,length,"%s/%s",dirname,entry->d_name); + + if(stat(buffer,&st)<0) { + g_free(buffer); + continue; + } + + /* create directory entry for this file */ + direntry=create_short_and_long_name(s, i, entry->d_name, + is_dot || is_dotdot); + direntry->attributes=(S_ISDIR(st.st_mode)?0x10:0x20); + direntry->reserved[0]=direntry->reserved[1]=0; + direntry->ctime=fat_datetime(st.st_ctime,1); + direntry->cdate=fat_datetime(st.st_ctime,0); + direntry->adate=fat_datetime(st.st_atime,0); + direntry->begin_hi=0; + direntry->mtime=fat_datetime(st.st_mtime,1); + direntry->mdate=fat_datetime(st.st_mtime,0); + if(is_dotdot) + set_begin_of_direntry(direntry, first_cluster_of_parent); + else if(is_dot) + set_begin_of_direntry(direntry, first_cluster); + else + direntry->begin=0; /* do that later */ + if (st.st_size > 0x7fffffff) { + fprintf(stderr, "File %s is larger than 2GB\n", buffer); + g_free(buffer); + closedir(dir); + return -2; + } + direntry->size=cpu_to_le32(S_ISDIR(st.st_mode)?0:st.st_size); + + /* create mapping for this file */ + if(!is_dot && !is_dotdot && (S_ISDIR(st.st_mode) || st.st_size)) { + s->current_mapping = array_get_next(&(s->mapping)); + s->current_mapping->begin=0; + s->current_mapping->end=st.st_size; + /* + * we get the direntry of the most recent direntry, which + * contains the short name and all the relevant information. + */ + s->current_mapping->dir_index=s->directory.next-1; + s->current_mapping->first_mapping_index = -1; + if (S_ISDIR(st.st_mode)) { + s->current_mapping->mode = MODE_DIRECTORY; + s->current_mapping->info.dir.parent_mapping_index = + mapping_index; + } else { + s->current_mapping->mode = MODE_UNDEFINED; + s->current_mapping->info.file.offset = 0; + } + s->current_mapping->path=buffer; + s->current_mapping->read_only = + (st.st_mode & (S_IWUSR | S_IWGRP | S_IWOTH)) == 0; + } else { + g_free(buffer); + } + } + closedir(dir); + + /* fill with zeroes up to the end of the cluster */ + while(s->directory.next%(0x10*s->sectors_per_cluster)) { + direntry_t* direntry=array_get_next(&(s->directory)); + memset(direntry,0,sizeof(direntry_t)); + } + +/* TODO: if there are more entries, bootsector has to be adjusted! */ +#define ROOT_ENTRIES (0x02 * 0x10 * s->sectors_per_cluster) + if (mapping_index == 0 && s->directory.next < ROOT_ENTRIES) { + /* root directory */ + int cur = s->directory.next; + array_ensure_allocated(&(s->directory), ROOT_ENTRIES - 1); + s->directory.next = ROOT_ENTRIES; + memset(array_get(&(s->directory), cur), 0, + (ROOT_ENTRIES - cur) * sizeof(direntry_t)); + } + + /* reget the mapping, since s->mapping was possibly realloc()ed */ + mapping = array_get(&(s->mapping), mapping_index); + first_cluster += (s->directory.next - mapping->info.dir.first_dir_index) + * 0x20 / s->cluster_size; + mapping->end = first_cluster; + + direntry = array_get(&(s->directory), mapping->dir_index); + set_begin_of_direntry(direntry, mapping->begin); + + return 0; +} + +static inline uint32_t sector2cluster(BDRVVVFATState* s,off_t sector_num) +{ + return (sector_num-s->faked_sectors)/s->sectors_per_cluster; +} + +static inline off_t cluster2sector(BDRVVVFATState* s, uint32_t cluster_num) +{ + return s->faked_sectors + s->sectors_per_cluster * cluster_num; +} + +static int init_directories(BDRVVVFATState* s, + const char *dirname, int heads, int secs, + Error **errp) +{ + bootsector_t* bootsector; + mapping_t* mapping; + unsigned int i; + unsigned int cluster; + + memset(&(s->first_sectors[0]),0,0x40*0x200); + + s->cluster_size=s->sectors_per_cluster*0x200; + s->cluster_buffer=g_malloc(s->cluster_size); + + /* + * The formula: sc = spf+1+spf*spc*(512*8/fat_type), + * where sc is sector_count, + * spf is sectors_per_fat, + * spc is sectors_per_clusters, and + * fat_type = 12, 16 or 32. + */ + i = 1+s->sectors_per_cluster*0x200*8/s->fat_type; + s->sectors_per_fat=(s->sector_count+i)/i; /* round up */ + + array_init(&(s->mapping),sizeof(mapping_t)); + array_init(&(s->directory),sizeof(direntry_t)); + + /* add volume label */ + { + direntry_t* entry=array_get_next(&(s->directory)); + entry->attributes=0x28; /* archive | volume label */ + memcpy(entry->name, s->volume_label, sizeof(entry->name)); + } + + /* Now build FAT, and write back information into directory */ + init_fat(s); + + s->faked_sectors=s->first_sectors_number+s->sectors_per_fat*2; + s->cluster_count=sector2cluster(s, s->sector_count); + + mapping = array_get_next(&(s->mapping)); + mapping->begin = 0; + mapping->dir_index = 0; + mapping->info.dir.parent_mapping_index = -1; + mapping->first_mapping_index = -1; + mapping->path = g_strdup(dirname); + i = strlen(mapping->path); + if (i > 0 && mapping->path[i - 1] == '/') + mapping->path[i - 1] = '\0'; + mapping->mode = MODE_DIRECTORY; + mapping->read_only = 0; + s->path = mapping->path; + + for (i = 0, cluster = 0; i < s->mapping.next; i++) { + /* MS-DOS expects the FAT to be 0 for the root directory + * (except for the media byte). */ + /* LATER TODO: still true for FAT32? */ + int fix_fat = (i != 0); + mapping = array_get(&(s->mapping), i); + + if (mapping->mode & MODE_DIRECTORY) { + mapping->begin = cluster; + if(read_directory(s, i)) { + error_setg(errp, "Could not read directory %s", + mapping->path); + return -1; + } + mapping = array_get(&(s->mapping), i); + } else { + assert(mapping->mode == MODE_UNDEFINED); + mapping->mode=MODE_NORMAL; + mapping->begin = cluster; + if (mapping->end > 0) { + direntry_t* direntry = array_get(&(s->directory), + mapping->dir_index); + + mapping->end = cluster + 1 + (mapping->end-1)/s->cluster_size; + set_begin_of_direntry(direntry, mapping->begin); + } else { + mapping->end = cluster + 1; + fix_fat = 0; + } + } + + assert(mapping->begin < mapping->end); + + /* next free cluster */ + cluster = mapping->end; + + if(cluster > s->cluster_count) { + error_setg(errp, + "Directory does not fit in FAT%d (capacity %.2f MB)", + s->fat_type, s->sector_count / 2000.0); + return -1; + } + + /* fix fat for entry */ + if (fix_fat) { + int j; + for(j = mapping->begin; j < mapping->end - 1; j++) + fat_set(s, j, j+1); + fat_set(s, mapping->end - 1, s->max_fat_value); + } + } + + mapping = array_get(&(s->mapping), 0); + s->sectors_of_root_directory = mapping->end * s->sectors_per_cluster; + s->last_cluster_of_root_directory = mapping->end; + + /* the FAT signature */ + fat_set(s,0,s->max_fat_value); + fat_set(s,1,s->max_fat_value); + + s->current_mapping = NULL; + + bootsector=(bootsector_t*)(s->first_sectors+(s->first_sectors_number-1)*0x200); + bootsector->jump[0]=0xeb; + bootsector->jump[1]=0x3e; + bootsector->jump[2]=0x90; + memcpy(bootsector->name,"QEMU ",8); + bootsector->sector_size=cpu_to_le16(0x200); + bootsector->sectors_per_cluster=s->sectors_per_cluster; + bootsector->reserved_sectors=cpu_to_le16(1); + bootsector->number_of_fats=0x2; /* number of FATs */ + bootsector->root_entries=cpu_to_le16(s->sectors_of_root_directory*0x10); + bootsector->total_sectors16=s->sector_count>0xffff?0:cpu_to_le16(s->sector_count); + bootsector->media_type=(s->first_sectors_number>1?0xf8:0xf0); /* media descriptor (f8=hd, f0=3.5 fd)*/ + s->fat.pointer[0] = bootsector->media_type; + bootsector->sectors_per_fat=cpu_to_le16(s->sectors_per_fat); + bootsector->sectors_per_track = cpu_to_le16(secs); + bootsector->number_of_heads = cpu_to_le16(heads); + bootsector->hidden_sectors=cpu_to_le32(s->first_sectors_number==1?0:0x3f); + bootsector->total_sectors=cpu_to_le32(s->sector_count>0xffff?s->sector_count:0); + + /* LATER TODO: if FAT32, this is wrong */ + bootsector->u.fat16.drive_number=s->first_sectors_number==1?0:0x80; /* fda=0, hda=0x80 */ + bootsector->u.fat16.current_head=0; + bootsector->u.fat16.signature=0x29; + bootsector->u.fat16.id=cpu_to_le32(0xfabe1afd); + + memcpy(bootsector->u.fat16.volume_label, s->volume_label, + sizeof(bootsector->u.fat16.volume_label)); + memcpy(bootsector->fat_type,(s->fat_type==12?"FAT12 ":s->fat_type==16?"FAT16 ":"FAT32 "),8); + bootsector->magic[0]=0x55; bootsector->magic[1]=0xaa; + + return 0; +} + +#ifdef DEBUG +static BDRVVVFATState *vvv = NULL; +#endif + +static int enable_write_target(BDRVVVFATState *s, Error **errp); +static int is_consistent(BDRVVVFATState *s); + +static QemuOptsList runtime_opts = { + .name = "vvfat", + .head = QTAILQ_HEAD_INITIALIZER(runtime_opts.head), + .desc = { + { + .name = "dir", + .type = QEMU_OPT_STRING, + .help = "Host directory to map to the vvfat device", + }, + { + .name = "fat-type", + .type = QEMU_OPT_NUMBER, + .help = "FAT type (12, 16 or 32)", + }, + { + .name = "floppy", + .type = QEMU_OPT_BOOL, + .help = "Create a floppy rather than a hard disk image", + }, + { + .name = "label", + .type = QEMU_OPT_STRING, + .help = "Use a volume label other than QEMU VVFAT", + }, + { + .name = "rw", + .type = QEMU_OPT_BOOL, + .help = "Make the image writable", + }, + { /* end of list */ } + }, +}; + +static void vvfat_parse_filename(const char *filename, QDict *options, + Error **errp) +{ + int fat_type = 0; + bool floppy = false; + bool rw = false; + int i; + + if (!strstart(filename, "fat:", NULL)) { + error_setg(errp, "File name string must start with 'fat:'"); + return; + } + + /* Parse options */ + if (strstr(filename, ":32:")) { + fat_type = 32; + } else if (strstr(filename, ":16:")) { + fat_type = 16; + } else if (strstr(filename, ":12:")) { + fat_type = 12; + } + + if (strstr(filename, ":floppy:")) { + floppy = true; + } + + if (strstr(filename, ":rw:")) { + rw = true; + } + + /* Get the directory name without options */ + i = strrchr(filename, ':') - filename; + assert(i >= 3); + if (filename[i - 2] == ':' && qemu_isalpha(filename[i - 1])) { + /* workaround for DOS drive names */ + filename += i - 1; + } else { + filename += i + 1; + } + + /* Fill in the options QDict */ + qdict_put(options, "dir", qstring_from_str(filename)); + qdict_put(options, "fat-type", qint_from_int(fat_type)); + qdict_put(options, "floppy", qbool_from_bool(floppy)); + qdict_put(options, "rw", qbool_from_bool(rw)); +} + +static int vvfat_open(BlockDriverState *bs, QDict *options, int flags, + Error **errp) +{ + BDRVVVFATState *s = bs->opaque; + int cyls, heads, secs; + bool floppy; + const char *dirname, *label; + QemuOpts *opts; + Error *local_err = NULL; + int ret; + +#ifdef DEBUG + vvv = s; +#endif + + opts = qemu_opts_create(&runtime_opts, NULL, 0, &error_abort); + qemu_opts_absorb_qdict(opts, options, &local_err); + if (local_err) { + error_propagate(errp, local_err); + ret = -EINVAL; + goto fail; + } + + dirname = qemu_opt_get(opts, "dir"); + if (!dirname) { + error_setg(errp, "vvfat block driver requires a 'dir' option"); + ret = -EINVAL; + goto fail; + } + + s->fat_type = qemu_opt_get_number(opts, "fat-type", 0); + floppy = qemu_opt_get_bool(opts, "floppy", false); + + memset(s->volume_label, ' ', sizeof(s->volume_label)); + label = qemu_opt_get(opts, "label"); + if (label) { + size_t label_length = strlen(label); + if (label_length > 11) { + error_setg(errp, "vvfat label cannot be longer than 11 bytes"); + ret = -EINVAL; + goto fail; + } + memcpy(s->volume_label, label, label_length); + } + + if (floppy) { + /* 1.44MB or 2.88MB floppy. 2.88MB can be FAT12 (default) or FAT16. */ + if (!s->fat_type) { + s->fat_type = 12; + secs = 36; + s->sectors_per_cluster = 2; + } else { + secs = s->fat_type == 12 ? 18 : 36; + s->sectors_per_cluster = 1; + } + s->first_sectors_number = 1; + cyls = 80; + heads = 2; + } else { + /* 32MB or 504MB disk*/ + if (!s->fat_type) { + s->fat_type = 16; + } + s->first_sectors_number = 0x40; + cyls = s->fat_type == 12 ? 64 : 1024; + heads = 16; + secs = 63; + } + + switch (s->fat_type) { + case 32: + fprintf(stderr, "Big fat greek warning: FAT32 has not been tested. " + "You are welcome to do so!\n"); + break; + case 16: + case 12: + break; + default: + error_setg(errp, "Valid FAT types are only 12, 16 and 32"); + ret = -EINVAL; + goto fail; + } + + + s->bs = bs; + + /* LATER TODO: if FAT32, adjust */ + s->sectors_per_cluster=0x10; + + s->current_cluster=0xffffffff; + + /* read only is the default for safety */ + bs->read_only = 1; + s->qcow = s->write_target = NULL; + s->qcow_filename = NULL; + s->fat2 = NULL; + s->downcase_short_names = 1; + + fprintf(stderr, "vvfat %s chs %d,%d,%d\n", + dirname, cyls, heads, secs); + + s->sector_count = cyls * heads * secs - (s->first_sectors_number - 1); + + if (qemu_opt_get_bool(opts, "rw", false)) { + ret = enable_write_target(s, errp); + if (ret < 0) { + goto fail; + } + bs->read_only = 0; + } + + bs->total_sectors = cyls * heads * secs; + + if (init_directories(s, dirname, heads, secs, errp)) { + ret = -EIO; + goto fail; + } + + s->sector_count = s->faked_sectors + s->sectors_per_cluster*s->cluster_count; + + if (s->first_sectors_number == 0x40) { + init_mbr(s, cyls, heads, secs); + } + + // assert(is_consistent(s)); + qemu_co_mutex_init(&s->lock); + + /* Disable migration when vvfat is used rw */ + if (s->qcow) { + error_setg(&s->migration_blocker, + "The vvfat (rw) format used by node '%s' " + "does not support live migration", + bdrv_get_device_or_node_name(bs)); + migrate_add_blocker(s->migration_blocker); + } + + ret = 0; +fail: + qemu_opts_del(opts); + return ret; +} + +static inline void vvfat_close_current_file(BDRVVVFATState *s) +{ + if(s->current_mapping) { + s->current_mapping = NULL; + if (s->current_fd) { + qemu_close(s->current_fd); + s->current_fd = 0; + } + } + s->current_cluster = -1; +} + +/* mappings between index1 and index2-1 are supposed to be ordered + * return value is the index of the last mapping for which end>cluster_num + */ +static inline int find_mapping_for_cluster_aux(BDRVVVFATState* s,int cluster_num,int index1,int index2) +{ + while(1) { + int index3; + mapping_t* mapping; + index3=(index1+index2)/2; + mapping=array_get(&(s->mapping),index3); + assert(mapping->begin < mapping->end); + if(mapping->begin>=cluster_num) { + assert(index2!=index3 || index2==0); + if(index2==index3) + return index1; + index2=index3; + } else { + if(index1==index3) + return mapping->end<=cluster_num ? index2 : index1; + index1=index3; + } + assert(index1<=index2); + DLOG(mapping=array_get(&(s->mapping),index1); + assert(mapping->begin<=cluster_num); + assert(index2 >= s->mapping.next || + ((mapping = array_get(&(s->mapping),index2)) && + mapping->end>cluster_num))); + } +} + +static inline mapping_t* find_mapping_for_cluster(BDRVVVFATState* s,int cluster_num) +{ + int index=find_mapping_for_cluster_aux(s,cluster_num,0,s->mapping.next); + mapping_t* mapping; + if(index>=s->mapping.next) + return NULL; + mapping=array_get(&(s->mapping),index); + if(mapping->begin>cluster_num) + return NULL; + assert(mapping->begin<=cluster_num && mapping->end>cluster_num); + return mapping; +} + +static int open_file(BDRVVVFATState* s,mapping_t* mapping) +{ + if(!mapping) + return -1; + if(!s->current_mapping || + strcmp(s->current_mapping->path,mapping->path)) { + /* open file */ + int fd = qemu_open(mapping->path, O_RDONLY | O_BINARY | O_LARGEFILE); + if(fd<0) + return -1; + vvfat_close_current_file(s); + s->current_fd = fd; + s->current_mapping = mapping; + } + return 0; +} + +static inline int read_cluster(BDRVVVFATState *s,int cluster_num) +{ + if(s->current_cluster != cluster_num) { + int result=0; + off_t offset; + assert(!s->current_mapping || s->current_fd || (s->current_mapping->mode & MODE_DIRECTORY)); + if(!s->current_mapping + || s->current_mapping->begin>cluster_num + || s->current_mapping->end<=cluster_num) { + /* binary search of mappings for file */ + mapping_t* mapping=find_mapping_for_cluster(s,cluster_num); + + assert(!mapping || (cluster_num>=mapping->begin && cluster_num<mapping->end)); + + if (mapping && mapping->mode & MODE_DIRECTORY) { + vvfat_close_current_file(s); + s->current_mapping = mapping; +read_cluster_directory: + offset = s->cluster_size*(cluster_num-s->current_mapping->begin); + s->cluster = (unsigned char*)s->directory.pointer+offset + + 0x20*s->current_mapping->info.dir.first_dir_index; + assert(((s->cluster-(unsigned char*)s->directory.pointer)%s->cluster_size)==0); + assert((char*)s->cluster+s->cluster_size <= s->directory.pointer+s->directory.next*s->directory.item_size); + s->current_cluster = cluster_num; + return 0; + } + + if(open_file(s,mapping)) + return -2; + } else if (s->current_mapping->mode & MODE_DIRECTORY) + goto read_cluster_directory; + + assert(s->current_fd); + + offset=s->cluster_size*(cluster_num-s->current_mapping->begin)+s->current_mapping->info.file.offset; + if(lseek(s->current_fd, offset, SEEK_SET)!=offset) + return -3; + s->cluster=s->cluster_buffer; + result=read(s->current_fd,s->cluster,s->cluster_size); + if(result<0) { + s->current_cluster = -1; + return -1; + } + s->current_cluster = cluster_num; + } + return 0; +} + +#ifdef DEBUG +static void print_direntry(const direntry_t* direntry) +{ + int j = 0; + char buffer[1024]; + + fprintf(stderr, "direntry %p: ", direntry); + if(!direntry) + return; + if(is_long_name(direntry)) { + unsigned char* c=(unsigned char*)direntry; + int i; + for(i=1;i<11 && c[i] && c[i]!=0xff;i+=2) +#define ADD_CHAR(c) {buffer[j] = (c); if (buffer[j] < ' ') buffer[j] = 0xb0; j++;} + ADD_CHAR(c[i]); + for(i=14;i<26 && c[i] && c[i]!=0xff;i+=2) + ADD_CHAR(c[i]); + for(i=28;i<32 && c[i] && c[i]!=0xff;i+=2) + ADD_CHAR(c[i]); + buffer[j] = 0; + fprintf(stderr, "%s\n", buffer); + } else { + int i; + for(i=0;i<11;i++) + ADD_CHAR(direntry->name[i]); + buffer[j] = 0; + fprintf(stderr,"%s attributes=0x%02x begin=%d size=%d\n", + buffer, + direntry->attributes, + begin_of_direntry(direntry),le32_to_cpu(direntry->size)); + } +} + +static void print_mapping(const mapping_t* mapping) +{ + fprintf(stderr, "mapping (%p): begin, end = %d, %d, dir_index = %d, " + "first_mapping_index = %d, name = %s, mode = 0x%x, " , + mapping, mapping->begin, mapping->end, mapping->dir_index, + mapping->first_mapping_index, mapping->path, mapping->mode); + + if (mapping->mode & MODE_DIRECTORY) + fprintf(stderr, "parent_mapping_index = %d, first_dir_index = %d\n", mapping->info.dir.parent_mapping_index, mapping->info.dir.first_dir_index); + else + fprintf(stderr, "offset = %d\n", mapping->info.file.offset); +} +#endif + +static int vvfat_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + BDRVVVFATState *s = bs->opaque; + int i; + + for(i=0;i<nb_sectors;i++,sector_num++) { + if (sector_num >= bs->total_sectors) + return -1; + if (s->qcow) { + int n; + if (bdrv_is_allocated(s->qcow, sector_num, nb_sectors-i, &n)) { +DLOG(fprintf(stderr, "sectors %d+%d allocated\n", (int)sector_num, n)); + if (bdrv_read(s->qcow, sector_num, buf + i*0x200, n)) { + return -1; + } + i += n - 1; + sector_num += n - 1; + continue; + } +DLOG(fprintf(stderr, "sector %d not allocated\n", (int)sector_num)); + } + if(sector_num<s->faked_sectors) { + if(sector_num<s->first_sectors_number) + memcpy(buf+i*0x200,&(s->first_sectors[sector_num*0x200]),0x200); + else if(sector_num-s->first_sectors_number<s->sectors_per_fat) + memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number)*0x200]),0x200); + else if(sector_num-s->first_sectors_number-s->sectors_per_fat<s->sectors_per_fat) + memcpy(buf+i*0x200,&(s->fat.pointer[(sector_num-s->first_sectors_number-s->sectors_per_fat)*0x200]),0x200); + } else { + uint32_t sector=sector_num-s->faked_sectors, + sector_offset_in_cluster=(sector%s->sectors_per_cluster), + cluster_num=sector/s->sectors_per_cluster; + if(cluster_num > s->cluster_count || read_cluster(s, cluster_num) != 0) { + /* LATER TODO: strict: return -1; */ + memset(buf+i*0x200,0,0x200); + continue; + } + memcpy(buf+i*0x200,s->cluster+sector_offset_in_cluster*0x200,0x200); + } + } + return 0; +} + +static coroutine_fn int vvfat_co_read(BlockDriverState *bs, int64_t sector_num, + uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVVFATState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vvfat_read(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +/* LATER TODO: statify all functions */ + +/* + * Idea of the write support (use snapshot): + * + * 1. check if all data is consistent, recording renames, modifications, + * new files and directories (in s->commits). + * + * 2. if the data is not consistent, stop committing + * + * 3. handle renames, and create new files and directories (do not yet + * write their contents) + * + * 4. walk the directories, fixing the mapping and direntries, and marking + * the handled mappings as not deleted + * + * 5. commit the contents of the files + * + * 6. handle deleted files and directories + * + */ + +typedef struct commit_t { + char* path; + union { + struct { uint32_t cluster; } rename; + struct { int dir_index; uint32_t modified_offset; } writeout; + struct { uint32_t first_cluster; } new_file; + struct { uint32_t cluster; } mkdir; + } param; + /* DELETEs and RMDIRs are handled differently: see handle_deletes() */ + enum { + ACTION_RENAME, ACTION_WRITEOUT, ACTION_NEW_FILE, ACTION_MKDIR + } action; +} commit_t; + +static void clear_commits(BDRVVVFATState* s) +{ + int i; +DLOG(fprintf(stderr, "clear_commits (%d commits)\n", s->commits.next)); + for (i = 0; i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + assert(commit->path || commit->action == ACTION_WRITEOUT); + if (commit->action != ACTION_WRITEOUT) { + assert(commit->path); + g_free(commit->path); + } else + assert(commit->path == NULL); + } + s->commits.next = 0; +} + +static void schedule_rename(BDRVVVFATState* s, + uint32_t cluster, char* new_path) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = new_path; + commit->param.rename.cluster = cluster; + commit->action = ACTION_RENAME; +} + +static void schedule_writeout(BDRVVVFATState* s, + int dir_index, uint32_t modified_offset) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = NULL; + commit->param.writeout.dir_index = dir_index; + commit->param.writeout.modified_offset = modified_offset; + commit->action = ACTION_WRITEOUT; +} + +static void schedule_new_file(BDRVVVFATState* s, + char* path, uint32_t first_cluster) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = path; + commit->param.new_file.first_cluster = first_cluster; + commit->action = ACTION_NEW_FILE; +} + +static void schedule_mkdir(BDRVVVFATState* s, uint32_t cluster, char* path) +{ + commit_t* commit = array_get_next(&(s->commits)); + commit->path = path; + commit->param.mkdir.cluster = cluster; + commit->action = ACTION_MKDIR; +} + +typedef struct { + /* + * Since the sequence number is at most 0x3f, and the filename + * length is at most 13 times the sequence number, the maximal + * filename length is 0x3f * 13 bytes. + */ + unsigned char name[0x3f * 13 + 1]; + int checksum, len; + int sequence_number; +} long_file_name; + +static void lfn_init(long_file_name* lfn) +{ + lfn->sequence_number = lfn->len = 0; + lfn->checksum = 0x100; +} + +/* return 0 if parsed successfully, > 0 if no long name, < 0 if error */ +static int parse_long_name(long_file_name* lfn, + const direntry_t* direntry) +{ + int i, j, offset; + const unsigned char* pointer = (const unsigned char*)direntry; + + if (!is_long_name(direntry)) + return 1; + + if (pointer[0] & 0x40) { + lfn->sequence_number = pointer[0] & 0x3f; + lfn->checksum = pointer[13]; + lfn->name[0] = 0; + lfn->name[lfn->sequence_number * 13] = 0; + } else if ((pointer[0] & 0x3f) != --lfn->sequence_number) + return -1; + else if (pointer[13] != lfn->checksum) + return -2; + else if (pointer[12] || pointer[26] || pointer[27]) + return -3; + + offset = 13 * (lfn->sequence_number - 1); + for (i = 0, j = 1; i < 13; i++, j+=2) { + if (j == 11) + j = 14; + else if (j == 26) + j = 28; + + if (pointer[j+1] == 0) + lfn->name[offset + i] = pointer[j]; + else if (pointer[j+1] != 0xff || (pointer[0] & 0x40) == 0) + return -4; + else + lfn->name[offset + i] = 0; + } + + if (pointer[0] & 0x40) + lfn->len = offset + strlen((char*)lfn->name + offset); + + return 0; +} + +/* returns 0 if successful, >0 if no short_name, and <0 on error */ +static int parse_short_name(BDRVVVFATState* s, + long_file_name* lfn, direntry_t* direntry) +{ + int i, j; + + if (!is_short_name(direntry)) + return 1; + + for (j = 7; j >= 0 && direntry->name[j] == ' '; j--); + for (i = 0; i <= j; i++) { + if (direntry->name[i] <= ' ' || direntry->name[i] > 0x7f) + return -1; + else if (s->downcase_short_names) + lfn->name[i] = qemu_tolower(direntry->name[i]); + else + lfn->name[i] = direntry->name[i]; + } + + for (j = 2; j >= 0 && direntry->name[8 + j] == ' '; j--) { + } + if (j >= 0) { + lfn->name[i++] = '.'; + lfn->name[i + j + 1] = '\0'; + for (;j >= 0; j--) { + uint8_t c = direntry->name[8 + j]; + if (c <= ' ' || c > 0x7f) { + return -2; + } else if (s->downcase_short_names) { + lfn->name[i + j] = qemu_tolower(c); + } else { + lfn->name[i + j] = c; + } + } + } else + lfn->name[i + j + 1] = '\0'; + + lfn->len = strlen((char*)lfn->name); + + return 0; +} + +static inline uint32_t modified_fat_get(BDRVVVFATState* s, + unsigned int cluster) +{ + if (cluster < s->last_cluster_of_root_directory) { + if (cluster + 1 == s->last_cluster_of_root_directory) + return s->max_fat_value; + else + return cluster + 1; + } + + if (s->fat_type==32) { + uint32_t* entry=((uint32_t*)s->fat2)+cluster; + return le32_to_cpu(*entry); + } else if (s->fat_type==16) { + uint16_t* entry=((uint16_t*)s->fat2)+cluster; + return le16_to_cpu(*entry); + } else { + const uint8_t* x=s->fat2+cluster*3/2; + return ((x[0]|(x[1]<<8))>>(cluster&1?4:0))&0x0fff; + } +} + +static inline int cluster_was_modified(BDRVVVFATState* s, uint32_t cluster_num) +{ + int was_modified = 0; + int i, dummy; + + if (s->qcow == NULL) + return 0; + + for (i = 0; !was_modified && i < s->sectors_per_cluster; i++) + was_modified = bdrv_is_allocated(s->qcow, + cluster2sector(s, cluster_num) + i, 1, &dummy); + + return was_modified; +} + +static const char* get_basename(const char* path) +{ + char* basename = strrchr(path, '/'); + if (basename == NULL) + return path; + else + return basename + 1; /* strip '/' */ +} + +/* + * The array s->used_clusters holds the states of the clusters. If it is + * part of a file, it has bit 2 set, in case of a directory, bit 1. If it + * was modified, bit 3 is set. + * If any cluster is allocated, but not part of a file or directory, this + * driver refuses to commit. + */ +typedef enum { + USED_DIRECTORY = 1, USED_FILE = 2, USED_ANY = 3, USED_ALLOCATED = 4 +} used_t; + +/* + * get_cluster_count_for_direntry() not only determines how many clusters + * are occupied by direntry, but also if it was renamed or modified. + * + * A file is thought to be renamed *only* if there already was a file with + * exactly the same first cluster, but a different name. + * + * Further, the files/directories handled by this function are + * assumed to be *not* deleted (and *only* those). + */ +static uint32_t get_cluster_count_for_direntry(BDRVVVFATState* s, + direntry_t* direntry, const char* path) +{ + /* + * This is a little bit tricky: + * IF the guest OS just inserts a cluster into the file chain, + * and leaves the rest alone, (i.e. the original file had clusters + * 15 -> 16, but now has 15 -> 32 -> 16), then the following happens: + * + * - do_commit will write the cluster into the file at the given + * offset, but + * + * - the cluster which is overwritten should be moved to a later + * position in the file. + * + * I am not aware that any OS does something as braindead, but this + * situation could happen anyway when not committing for a long time. + * Just to be sure that this does not bite us, detect it, and copy the + * contents of the clusters to-be-overwritten into the qcow. + */ + int copy_it = 0; + int was_modified = 0; + int32_t ret = 0; + + uint32_t cluster_num = begin_of_direntry(direntry); + uint32_t offset = 0; + int first_mapping_index = -1; + mapping_t* mapping = NULL; + const char* basename2 = NULL; + + vvfat_close_current_file(s); + + /* the root directory */ + if (cluster_num == 0) + return 0; + + /* write support */ + if (s->qcow) { + basename2 = get_basename(path); + + mapping = find_mapping_for_cluster(s, cluster_num); + + if (mapping) { + const char* basename; + + assert(mapping->mode & MODE_DELETED); + mapping->mode &= ~MODE_DELETED; + + basename = get_basename(mapping->path); + + assert(mapping->mode & MODE_NORMAL); + + /* rename */ + if (strcmp(basename, basename2)) + schedule_rename(s, cluster_num, g_strdup(path)); + } else if (is_file(direntry)) + /* new file */ + schedule_new_file(s, g_strdup(path), cluster_num); + else { + abort(); + return 0; + } + } + + while(1) { + if (s->qcow) { + if (!copy_it && cluster_was_modified(s, cluster_num)) { + if (mapping == NULL || + mapping->begin > cluster_num || + mapping->end <= cluster_num) + mapping = find_mapping_for_cluster(s, cluster_num); + + + if (mapping && + (mapping->mode & MODE_DIRECTORY) == 0) { + + /* was modified in qcow */ + if (offset != mapping->info.file.offset + s->cluster_size + * (cluster_num - mapping->begin)) { + /* offset of this cluster in file chain has changed */ + abort(); + copy_it = 1; + } else if (offset == 0) { + const char* basename = get_basename(mapping->path); + + if (strcmp(basename, basename2)) + copy_it = 1; + first_mapping_index = array_index(&(s->mapping), mapping); + } + + if (mapping->first_mapping_index != first_mapping_index + && mapping->info.file.offset > 0) { + abort(); + copy_it = 1; + } + + /* need to write out? */ + if (!was_modified && is_file(direntry)) { + was_modified = 1; + schedule_writeout(s, mapping->dir_index, offset); + } + } + } + + if (copy_it) { + int i, dummy; + /* + * This is horribly inefficient, but that is okay, since + * it is rarely executed, if at all. + */ + int64_t offset = cluster2sector(s, cluster_num); + + vvfat_close_current_file(s); + for (i = 0; i < s->sectors_per_cluster; i++) { + if (!bdrv_is_allocated(s->qcow, offset + i, 1, &dummy)) { + if (vvfat_read(s->bs, offset, s->cluster_buffer, 1)) { + return -1; + } + if (bdrv_write(s->qcow, offset, s->cluster_buffer, 1)) { + return -2; + } + } + } + } + } + + ret++; + if (s->used_clusters[cluster_num] & USED_ANY) + return 0; + s->used_clusters[cluster_num] = USED_FILE; + + cluster_num = modified_fat_get(s, cluster_num); + + if (fat_eof(s, cluster_num)) + return ret; + else if (cluster_num < 2 || cluster_num > s->max_fat_value - 16) + return -1; + + offset += s->cluster_size; + } +} + +/* + * This function looks at the modified data (qcow). + * It returns 0 upon inconsistency or error, and the number of clusters + * used by the directory, its subdirectories and their files. + */ +static int check_directory_consistency(BDRVVVFATState *s, + int cluster_num, const char* path) +{ + int ret = 0; + unsigned char* cluster = g_malloc(s->cluster_size); + direntry_t* direntries = (direntry_t*)cluster; + mapping_t* mapping = find_mapping_for_cluster(s, cluster_num); + + long_file_name lfn; + int path_len = strlen(path); + char path2[PATH_MAX + 1]; + + assert(path_len < PATH_MAX); /* len was tested before! */ + pstrcpy(path2, sizeof(path2), path); + path2[path_len] = '/'; + path2[path_len + 1] = '\0'; + + if (mapping) { + const char* basename = get_basename(mapping->path); + const char* basename2 = get_basename(path); + + assert(mapping->mode & MODE_DIRECTORY); + + assert(mapping->mode & MODE_DELETED); + mapping->mode &= ~MODE_DELETED; + + if (strcmp(basename, basename2)) + schedule_rename(s, cluster_num, g_strdup(path)); + } else + /* new directory */ + schedule_mkdir(s, cluster_num, g_strdup(path)); + + lfn_init(&lfn); + do { + int i; + int subret = 0; + + ret++; + + if (s->used_clusters[cluster_num] & USED_ANY) { + fprintf(stderr, "cluster %d used more than once\n", (int)cluster_num); + goto fail; + } + s->used_clusters[cluster_num] = USED_DIRECTORY; + +DLOG(fprintf(stderr, "read cluster %d (sector %d)\n", (int)cluster_num, (int)cluster2sector(s, cluster_num))); + subret = vvfat_read(s->bs, cluster2sector(s, cluster_num), cluster, + s->sectors_per_cluster); + if (subret) { + fprintf(stderr, "Error fetching direntries\n"); + fail: + g_free(cluster); + return 0; + } + + for (i = 0; i < 0x10 * s->sectors_per_cluster; i++) { + int cluster_count = 0; + +DLOG(fprintf(stderr, "check direntry %d:\n", i); print_direntry(direntries + i)); + if (is_volume_label(direntries + i) || is_dot(direntries + i) || + is_free(direntries + i)) + continue; + + subret = parse_long_name(&lfn, direntries + i); + if (subret < 0) { + fprintf(stderr, "Error in long name\n"); + goto fail; + } + if (subret == 0 || is_free(direntries + i)) + continue; + + if (fat_chksum(direntries+i) != lfn.checksum) { + subret = parse_short_name(s, &lfn, direntries + i); + if (subret < 0) { + fprintf(stderr, "Error in short name (%d)\n", subret); + goto fail; + } + if (subret > 0 || !strcmp((char*)lfn.name, ".") + || !strcmp((char*)lfn.name, "..")) + continue; + } + lfn.checksum = 0x100; /* cannot use long name twice */ + + if (path_len + 1 + lfn.len >= PATH_MAX) { + fprintf(stderr, "Name too long: %s/%s\n", path, lfn.name); + goto fail; + } + pstrcpy(path2 + path_len + 1, sizeof(path2) - path_len - 1, + (char*)lfn.name); + + if (is_directory(direntries + i)) { + if (begin_of_direntry(direntries + i) == 0) { + DLOG(fprintf(stderr, "invalid begin for directory: %s\n", path2); print_direntry(direntries + i)); + goto fail; + } + cluster_count = check_directory_consistency(s, + begin_of_direntry(direntries + i), path2); + if (cluster_count == 0) { + DLOG(fprintf(stderr, "problem in directory %s:\n", path2); print_direntry(direntries + i)); + goto fail; + } + } else if (is_file(direntries + i)) { + /* check file size with FAT */ + cluster_count = get_cluster_count_for_direntry(s, direntries + i, path2); + if (cluster_count != + (le32_to_cpu(direntries[i].size) + s->cluster_size + - 1) / s->cluster_size) { + DLOG(fprintf(stderr, "Cluster count mismatch\n")); + goto fail; + } + } else + abort(); /* cluster_count = 0; */ + + ret += cluster_count; + } + + cluster_num = modified_fat_get(s, cluster_num); + } while(!fat_eof(s, cluster_num)); + + g_free(cluster); + return ret; +} + +/* returns 1 on success */ +static int is_consistent(BDRVVVFATState* s) +{ + int i, check; + int used_clusters_count = 0; + +DLOG(checkpoint()); + /* + * - get modified FAT + * - compare the two FATs (TODO) + * - get buffer for marking used clusters + * - recurse direntries from root (using bs->bdrv_read to make + * sure to get the new data) + * - check that the FAT agrees with the size + * - count the number of clusters occupied by this directory and + * its files + * - check that the cumulative used cluster count agrees with the + * FAT + * - if all is fine, return number of used clusters + */ + if (s->fat2 == NULL) { + int size = 0x200 * s->sectors_per_fat; + s->fat2 = g_malloc(size); + memcpy(s->fat2, s->fat.pointer, size); + } + check = vvfat_read(s->bs, + s->first_sectors_number, s->fat2, s->sectors_per_fat); + if (check) { + fprintf(stderr, "Could not copy fat\n"); + return 0; + } + assert (s->used_clusters); + for (i = 0; i < sector2cluster(s, s->sector_count); i++) + s->used_clusters[i] &= ~USED_ANY; + + clear_commits(s); + + /* mark every mapped file/directory as deleted. + * (check_directory_consistency() will unmark those still present). */ + if (s->qcow) + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->first_mapping_index < 0) + mapping->mode |= MODE_DELETED; + } + + used_clusters_count = check_directory_consistency(s, 0, s->path); + if (used_clusters_count <= 0) { + DLOG(fprintf(stderr, "problem in directory\n")); + return 0; + } + + check = s->last_cluster_of_root_directory; + for (i = check; i < sector2cluster(s, s->sector_count); i++) { + if (modified_fat_get(s, i)) { + if(!s->used_clusters[i]) { + DLOG(fprintf(stderr, "FAT was modified (%d), but cluster is not used?\n", i)); + return 0; + } + check++; + } + + if (s->used_clusters[i] == USED_ALLOCATED) { + /* allocated, but not used... */ + DLOG(fprintf(stderr, "unused, modified cluster: %d\n", i)); + return 0; + } + } + + if (check != used_clusters_count) + return 0; + + return used_clusters_count; +} + +static inline void adjust_mapping_indices(BDRVVVFATState* s, + int offset, int adjust) +{ + int i; + + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + +#define ADJUST_MAPPING_INDEX(name) \ + if (mapping->name >= offset) \ + mapping->name += adjust + + ADJUST_MAPPING_INDEX(first_mapping_index); + if (mapping->mode & MODE_DIRECTORY) + ADJUST_MAPPING_INDEX(info.dir.parent_mapping_index); + } +} + +/* insert or update mapping */ +static mapping_t* insert_mapping(BDRVVVFATState* s, + uint32_t begin, uint32_t end) +{ + /* + * - find mapping where mapping->begin >= begin, + * - if mapping->begin > begin: insert + * - adjust all references to mappings! + * - else: adjust + * - replace name + */ + int index = find_mapping_for_cluster_aux(s, begin, 0, s->mapping.next); + mapping_t* mapping = NULL; + mapping_t* first_mapping = array_get(&(s->mapping), 0); + + if (index < s->mapping.next && (mapping = array_get(&(s->mapping), index)) + && mapping->begin < begin) { + mapping->end = begin; + index++; + mapping = array_get(&(s->mapping), index); + } + if (index >= s->mapping.next || mapping->begin > begin) { + mapping = array_insert(&(s->mapping), index, 1); + mapping->path = NULL; + adjust_mapping_indices(s, index, +1); + } + + mapping->begin = begin; + mapping->end = end; + +DLOG(mapping_t* next_mapping; +assert(index + 1 >= s->mapping.next || +((next_mapping = array_get(&(s->mapping), index + 1)) && + next_mapping->begin >= end))); + + if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) + s->current_mapping = array_get(&(s->mapping), + s->current_mapping - first_mapping); + + return mapping; +} + +static int remove_mapping(BDRVVVFATState* s, int mapping_index) +{ + mapping_t* mapping = array_get(&(s->mapping), mapping_index); + mapping_t* first_mapping = array_get(&(s->mapping), 0); + + /* free mapping */ + if (mapping->first_mapping_index < 0) { + g_free(mapping->path); + } + + /* remove from s->mapping */ + array_remove(&(s->mapping), mapping_index); + + /* adjust all references to mappings */ + adjust_mapping_indices(s, mapping_index, -1); + + if (s->current_mapping && first_mapping != (mapping_t*)s->mapping.pointer) + s->current_mapping = array_get(&(s->mapping), + s->current_mapping - first_mapping); + + return 0; +} + +static void adjust_dirindices(BDRVVVFATState* s, int offset, int adjust) +{ + int i; + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->dir_index >= offset) + mapping->dir_index += adjust; + if ((mapping->mode & MODE_DIRECTORY) && + mapping->info.dir.first_dir_index >= offset) + mapping->info.dir.first_dir_index += adjust; + } +} + +static direntry_t* insert_direntries(BDRVVVFATState* s, + int dir_index, int count) +{ + /* + * make room in s->directory, + * adjust_dirindices + */ + direntry_t* result = array_insert(&(s->directory), dir_index, count); + if (result == NULL) + return NULL; + adjust_dirindices(s, dir_index, count); + return result; +} + +static int remove_direntries(BDRVVVFATState* s, int dir_index, int count) +{ + int ret = array_remove_slice(&(s->directory), dir_index, count); + if (ret) + return ret; + adjust_dirindices(s, dir_index, -count); + return 0; +} + +/* + * Adapt the mappings of the cluster chain starting at first cluster + * (i.e. if a file starts at first_cluster, the chain is followed according + * to the modified fat, and the corresponding entries in s->mapping are + * adjusted) + */ +static int commit_mappings(BDRVVVFATState* s, + uint32_t first_cluster, int dir_index) +{ + mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t cluster = first_cluster; + + vvfat_close_current_file(s); + + assert(mapping); + assert(mapping->begin == first_cluster); + mapping->first_mapping_index = -1; + mapping->dir_index = dir_index; + mapping->mode = (dir_index <= 0 || is_directory(direntry)) ? + MODE_DIRECTORY : MODE_NORMAL; + + while (!fat_eof(s, cluster)) { + uint32_t c, c1; + + for (c = cluster, c1 = modified_fat_get(s, c); c + 1 == c1; + c = c1, c1 = modified_fat_get(s, c1)); + + c++; + if (c > mapping->end) { + int index = array_index(&(s->mapping), mapping); + int i, max_i = s->mapping.next - index; + for (i = 1; i < max_i && mapping[i].begin < c; i++); + while (--i > 0) + remove_mapping(s, index + 1); + } + assert(mapping == array_get(&(s->mapping), s->mapping.next - 1) + || mapping[1].begin >= c); + mapping->end = c; + + if (!fat_eof(s, c1)) { + int i = find_mapping_for_cluster_aux(s, c1, 0, s->mapping.next); + mapping_t* next_mapping = i >= s->mapping.next ? NULL : + array_get(&(s->mapping), i); + + if (next_mapping == NULL || next_mapping->begin > c1) { + int i1 = array_index(&(s->mapping), mapping); + + next_mapping = insert_mapping(s, c1, c1+1); + + if (c1 < c) + i1++; + mapping = array_get(&(s->mapping), i1); + } + + next_mapping->dir_index = mapping->dir_index; + next_mapping->first_mapping_index = + mapping->first_mapping_index < 0 ? + array_index(&(s->mapping), mapping) : + mapping->first_mapping_index; + next_mapping->path = mapping->path; + next_mapping->mode = mapping->mode; + next_mapping->read_only = mapping->read_only; + if (mapping->mode & MODE_DIRECTORY) { + next_mapping->info.dir.parent_mapping_index = + mapping->info.dir.parent_mapping_index; + next_mapping->info.dir.first_dir_index = + mapping->info.dir.first_dir_index + + 0x10 * s->sectors_per_cluster * + (mapping->end - mapping->begin); + } else + next_mapping->info.file.offset = mapping->info.file.offset + + mapping->end - mapping->begin; + + mapping = next_mapping; + } + + cluster = c1; + } + + return 0; +} + +static int commit_direntries(BDRVVVFATState* s, + int dir_index, int parent_mapping_index) +{ + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t first_cluster = dir_index == 0 ? 0 : begin_of_direntry(direntry); + mapping_t* mapping = find_mapping_for_cluster(s, first_cluster); + + int factor = 0x10 * s->sectors_per_cluster; + int old_cluster_count, new_cluster_count; + int current_dir_index = mapping->info.dir.first_dir_index; + int first_dir_index = current_dir_index; + int ret, i; + uint32_t c; + +DLOG(fprintf(stderr, "commit_direntries for %s, parent_mapping_index %d\n", mapping->path, parent_mapping_index)); + + assert(direntry); + assert(mapping); + assert(mapping->begin == first_cluster); + assert(mapping->info.dir.first_dir_index < s->directory.next); + assert(mapping->mode & MODE_DIRECTORY); + assert(dir_index == 0 || is_directory(direntry)); + + mapping->info.dir.parent_mapping_index = parent_mapping_index; + + if (first_cluster == 0) { + old_cluster_count = new_cluster_count = + s->last_cluster_of_root_directory; + } else { + for (old_cluster_count = 0, c = first_cluster; !fat_eof(s, c); + c = fat_get(s, c)) + old_cluster_count++; + + for (new_cluster_count = 0, c = first_cluster; !fat_eof(s, c); + c = modified_fat_get(s, c)) + new_cluster_count++; + } + + if (new_cluster_count > old_cluster_count) { + if (insert_direntries(s, + current_dir_index + factor * old_cluster_count, + factor * (new_cluster_count - old_cluster_count)) == NULL) + return -1; + } else if (new_cluster_count < old_cluster_count) + remove_direntries(s, + current_dir_index + factor * new_cluster_count, + factor * (old_cluster_count - new_cluster_count)); + + for (c = first_cluster; !fat_eof(s, c); c = modified_fat_get(s, c)) { + void* direntry = array_get(&(s->directory), current_dir_index); + int ret = vvfat_read(s->bs, cluster2sector(s, c), direntry, + s->sectors_per_cluster); + if (ret) + return ret; + assert(!strncmp(s->directory.pointer, "QEMU", 4)); + current_dir_index += factor; + } + + ret = commit_mappings(s, first_cluster, dir_index); + if (ret) + return ret; + + /* recurse */ + for (i = 0; i < factor * new_cluster_count; i++) { + direntry = array_get(&(s->directory), first_dir_index + i); + if (is_directory(direntry) && !is_dot(direntry)) { + mapping = find_mapping_for_cluster(s, first_cluster); + assert(mapping->mode & MODE_DIRECTORY); + ret = commit_direntries(s, first_dir_index + i, + array_index(&(s->mapping), mapping)); + if (ret) + return ret; + } + } + + return 0; +} + +/* commit one file (adjust contents, adjust mapping), + return first_mapping_index */ +static int commit_one_file(BDRVVVFATState* s, + int dir_index, uint32_t offset) +{ + direntry_t* direntry = array_get(&(s->directory), dir_index); + uint32_t c = begin_of_direntry(direntry); + uint32_t first_cluster = c; + mapping_t* mapping = find_mapping_for_cluster(s, c); + uint32_t size = filesize_of_direntry(direntry); + char* cluster = g_malloc(s->cluster_size); + uint32_t i; + int fd = 0; + + assert(offset < size); + assert((offset % s->cluster_size) == 0); + + for (i = s->cluster_size; i < offset; i += s->cluster_size) + c = modified_fat_get(s, c); + + fd = qemu_open(mapping->path, O_RDWR | O_CREAT | O_BINARY, 0666); + if (fd < 0) { + fprintf(stderr, "Could not open %s... (%s, %d)\n", mapping->path, + strerror(errno), errno); + g_free(cluster); + return fd; + } + if (offset > 0) { + if (lseek(fd, offset, SEEK_SET) != offset) { + qemu_close(fd); + g_free(cluster); + return -3; + } + } + + while (offset < size) { + uint32_t c1; + int rest_size = (size - offset > s->cluster_size ? + s->cluster_size : size - offset); + int ret; + + c1 = modified_fat_get(s, c); + + assert((size - offset == 0 && fat_eof(s, c)) || + (size > offset && c >=2 && !fat_eof(s, c))); + + ret = vvfat_read(s->bs, cluster2sector(s, c), + (uint8_t*)cluster, (rest_size + 0x1ff) / 0x200); + + if (ret < 0) { + qemu_close(fd); + g_free(cluster); + return ret; + } + + if (write(fd, cluster, rest_size) < 0) { + qemu_close(fd); + g_free(cluster); + return -2; + } + + offset += rest_size; + c = c1; + } + + if (ftruncate(fd, size)) { + perror("ftruncate()"); + qemu_close(fd); + g_free(cluster); + return -4; + } + qemu_close(fd); + g_free(cluster); + + return commit_mappings(s, first_cluster, dir_index); +} + +#ifdef DEBUG +/* test, if all mappings point to valid direntries */ +static void check1(BDRVVVFATState* s) +{ + int i; + for (i = 0; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->mode & MODE_DELETED) { + fprintf(stderr, "deleted\n"); + continue; + } + assert(mapping->dir_index < s->directory.next); + direntry_t* direntry = array_get(&(s->directory), mapping->dir_index); + assert(mapping->begin == begin_of_direntry(direntry) || mapping->first_mapping_index >= 0); + if (mapping->mode & MODE_DIRECTORY) { + assert(mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster * (mapping->end - mapping->begin) <= s->directory.next); + assert((mapping->info.dir.first_dir_index % (0x10 * s->sectors_per_cluster)) == 0); + } + } +} + +/* test, if all direntries have mappings */ +static void check2(BDRVVVFATState* s) +{ + int i; + int first_mapping = -1; + + for (i = 0; i < s->directory.next; i++) { + direntry_t* direntry = array_get(&(s->directory), i); + + if (is_short_name(direntry) && begin_of_direntry(direntry)) { + mapping_t* mapping = find_mapping_for_cluster(s, begin_of_direntry(direntry)); + assert(mapping); + assert(mapping->dir_index == i || is_dot(direntry)); + assert(mapping->begin == begin_of_direntry(direntry) || is_dot(direntry)); + } + + if ((i % (0x10 * s->sectors_per_cluster)) == 0) { + /* cluster start */ + int j, count = 0; + + for (j = 0; j < s->mapping.next; j++) { + mapping_t* mapping = array_get(&(s->mapping), j); + if (mapping->mode & MODE_DELETED) + continue; + if (mapping->mode & MODE_DIRECTORY) { + if (mapping->info.dir.first_dir_index <= i && mapping->info.dir.first_dir_index + 0x10 * s->sectors_per_cluster > i) { + assert(++count == 1); + if (mapping->first_mapping_index == -1) + first_mapping = array_index(&(s->mapping), mapping); + else + assert(first_mapping == mapping->first_mapping_index); + if (mapping->info.dir.parent_mapping_index < 0) + assert(j == 0); + else { + mapping_t* parent = array_get(&(s->mapping), mapping->info.dir.parent_mapping_index); + assert(parent->mode & MODE_DIRECTORY); + assert(parent->info.dir.first_dir_index < mapping->info.dir.first_dir_index); + } + } + } + } + if (count == 0) + first_mapping = -1; + } + } +} +#endif + +static int handle_renames_and_mkdirs(BDRVVVFATState* s) +{ + int i; + +#ifdef DEBUG + fprintf(stderr, "handle_renames\n"); + for (i = 0; i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + fprintf(stderr, "%d, %s (%d, %d)\n", i, commit->path ? commit->path : "(null)", commit->param.rename.cluster, commit->action); + } +#endif + + for (i = 0; i < s->commits.next;) { + commit_t* commit = array_get(&(s->commits), i); + if (commit->action == ACTION_RENAME) { + mapping_t* mapping = find_mapping_for_cluster(s, + commit->param.rename.cluster); + char* old_path = mapping->path; + + assert(commit->path); + mapping->path = commit->path; + if (rename(old_path, mapping->path)) + return -2; + + if (mapping->mode & MODE_DIRECTORY) { + int l1 = strlen(mapping->path); + int l2 = strlen(old_path); + int diff = l1 - l2; + direntry_t* direntry = array_get(&(s->directory), + mapping->info.dir.first_dir_index); + uint32_t c = mapping->begin; + int i = 0; + + /* recurse */ + while (!fat_eof(s, c)) { + do { + direntry_t* d = direntry + i; + + if (is_file(d) || (is_directory(d) && !is_dot(d))) { + mapping_t* m = find_mapping_for_cluster(s, + begin_of_direntry(d)); + int l = strlen(m->path); + char* new_path = g_malloc(l + diff + 1); + + assert(!strncmp(m->path, mapping->path, l2)); + + pstrcpy(new_path, l + diff + 1, mapping->path); + pstrcpy(new_path + l1, l + diff + 1 - l1, + m->path + l2); + + schedule_rename(s, m->begin, new_path); + } + i++; + } while((i % (0x10 * s->sectors_per_cluster)) != 0); + c = fat_get(s, c); + } + } + + g_free(old_path); + array_remove(&(s->commits), i); + continue; + } else if (commit->action == ACTION_MKDIR) { + mapping_t* mapping; + int j, parent_path_len; + +#ifdef __MINGW32__ + if (mkdir(commit->path)) + return -5; +#else + if (mkdir(commit->path, 0755)) + return -5; +#endif + + mapping = insert_mapping(s, commit->param.mkdir.cluster, + commit->param.mkdir.cluster + 1); + if (mapping == NULL) + return -6; + + mapping->mode = MODE_DIRECTORY; + mapping->read_only = 0; + mapping->path = commit->path; + j = s->directory.next; + assert(j); + insert_direntries(s, s->directory.next, + 0x10 * s->sectors_per_cluster); + mapping->info.dir.first_dir_index = j; + + parent_path_len = strlen(commit->path) + - strlen(get_basename(commit->path)) - 1; + for (j = 0; j < s->mapping.next; j++) { + mapping_t* m = array_get(&(s->mapping), j); + if (m->first_mapping_index < 0 && m != mapping && + !strncmp(m->path, mapping->path, parent_path_len) && + strlen(m->path) == parent_path_len) + break; + } + assert(j < s->mapping.next); + mapping->info.dir.parent_mapping_index = j; + + array_remove(&(s->commits), i); + continue; + } + + i++; + } + return 0; +} + +/* + * TODO: make sure that the short name is not matching *another* file + */ +static int handle_commits(BDRVVVFATState* s) +{ + int i, fail = 0; + + vvfat_close_current_file(s); + + for (i = 0; !fail && i < s->commits.next; i++) { + commit_t* commit = array_get(&(s->commits), i); + switch(commit->action) { + case ACTION_RENAME: case ACTION_MKDIR: + abort(); + fail = -2; + break; + case ACTION_WRITEOUT: { +#ifndef NDEBUG + /* these variables are only used by assert() below */ + direntry_t* entry = array_get(&(s->directory), + commit->param.writeout.dir_index); + uint32_t begin = begin_of_direntry(entry); + mapping_t* mapping = find_mapping_for_cluster(s, begin); +#endif + + assert(mapping); + assert(mapping->begin == begin); + assert(commit->path == NULL); + + if (commit_one_file(s, commit->param.writeout.dir_index, + commit->param.writeout.modified_offset)) + fail = -3; + + break; + } + case ACTION_NEW_FILE: { + int begin = commit->param.new_file.first_cluster; + mapping_t* mapping = find_mapping_for_cluster(s, begin); + direntry_t* entry; + int i; + + /* find direntry */ + for (i = 0; i < s->directory.next; i++) { + entry = array_get(&(s->directory), i); + if (is_file(entry) && begin_of_direntry(entry) == begin) + break; + } + + if (i >= s->directory.next) { + fail = -6; + continue; + } + + /* make sure there exists an initial mapping */ + if (mapping && mapping->begin != begin) { + mapping->end = begin; + mapping = NULL; + } + if (mapping == NULL) { + mapping = insert_mapping(s, begin, begin+1); + } + /* most members will be fixed in commit_mappings() */ + assert(commit->path); + mapping->path = commit->path; + mapping->read_only = 0; + mapping->mode = MODE_NORMAL; + mapping->info.file.offset = 0; + + if (commit_one_file(s, i, 0)) + fail = -7; + + break; + } + default: + abort(); + } + } + if (i > 0 && array_remove_slice(&(s->commits), 0, i)) + return -1; + return fail; +} + +static int handle_deletes(BDRVVVFATState* s) +{ + int i, deferred = 1, deleted = 1; + + /* delete files corresponding to mappings marked as deleted */ + /* handle DELETEs and unused mappings (modified_fat_get(s, mapping->begin) == 0) */ + while (deferred && deleted) { + deferred = 0; + deleted = 0; + + for (i = 1; i < s->mapping.next; i++) { + mapping_t* mapping = array_get(&(s->mapping), i); + if (mapping->mode & MODE_DELETED) { + direntry_t* entry = array_get(&(s->directory), + mapping->dir_index); + + if (is_free(entry)) { + /* remove file/directory */ + if (mapping->mode & MODE_DIRECTORY) { + int j, next_dir_index = s->directory.next, + first_dir_index = mapping->info.dir.first_dir_index; + + if (rmdir(mapping->path) < 0) { + if (errno == ENOTEMPTY) { + deferred++; + continue; + } else + return -5; + } + + for (j = 1; j < s->mapping.next; j++) { + mapping_t* m = array_get(&(s->mapping), j); + if (m->mode & MODE_DIRECTORY && + m->info.dir.first_dir_index > + first_dir_index && + m->info.dir.first_dir_index < + next_dir_index) + next_dir_index = + m->info.dir.first_dir_index; + } + remove_direntries(s, first_dir_index, + next_dir_index - first_dir_index); + + deleted++; + } + } else { + if (unlink(mapping->path)) + return -4; + deleted++; + } + DLOG(fprintf(stderr, "DELETE (%d)\n", i); print_mapping(mapping); print_direntry(entry)); + remove_mapping(s, i); + } + } + } + + return 0; +} + +/* + * synchronize mapping with new state: + * + * - copy FAT (with bdrv_read) + * - mark all filenames corresponding to mappings as deleted + * - recurse direntries from root (using bs->bdrv_read) + * - delete files corresponding to mappings marked as deleted + */ +static int do_commit(BDRVVVFATState* s) +{ + int ret = 0; + + /* the real meat are the commits. Nothing to do? Move along! */ + if (s->commits.next == 0) + return 0; + + vvfat_close_current_file(s); + + ret = handle_renames_and_mkdirs(s); + if (ret) { + fprintf(stderr, "Error handling renames (%d)\n", ret); + abort(); + return ret; + } + + /* copy FAT (with bdrv_read) */ + memcpy(s->fat.pointer, s->fat2, 0x200 * s->sectors_per_fat); + + /* recurse direntries from root (using bs->bdrv_read) */ + ret = commit_direntries(s, 0, -1); + if (ret) { + fprintf(stderr, "Fatal: error while committing (%d)\n", ret); + abort(); + return ret; + } + + ret = handle_commits(s); + if (ret) { + fprintf(stderr, "Error handling commits (%d)\n", ret); + abort(); + return ret; + } + + ret = handle_deletes(s); + if (ret) { + fprintf(stderr, "Error deleting\n"); + abort(); + return ret; + } + + if (s->qcow->drv->bdrv_make_empty) { + s->qcow->drv->bdrv_make_empty(s->qcow); + } + + memset(s->used_clusters, 0, sector2cluster(s, s->sector_count)); + +DLOG(checkpoint()); + return 0; +} + +static int try_commit(BDRVVVFATState* s) +{ + vvfat_close_current_file(s); +DLOG(checkpoint()); + if(!is_consistent(s)) + return -1; + return do_commit(s); +} + +static int vvfat_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + BDRVVVFATState *s = bs->opaque; + int i, ret; + +DLOG(checkpoint()); + + /* Check if we're operating in read-only mode */ + if (s->qcow == NULL) { + return -EACCES; + } + + vvfat_close_current_file(s); + + /* + * Some sanity checks: + * - do not allow writing to the boot sector + * - do not allow to write non-ASCII filenames + */ + + if (sector_num < s->first_sectors_number) + return -1; + + for (i = sector2cluster(s, sector_num); + i <= sector2cluster(s, sector_num + nb_sectors - 1);) { + mapping_t* mapping = find_mapping_for_cluster(s, i); + if (mapping) { + if (mapping->read_only) { + fprintf(stderr, "Tried to write to write-protected file %s\n", + mapping->path); + return -1; + } + + if (mapping->mode & MODE_DIRECTORY) { + int begin = cluster2sector(s, i); + int end = begin + s->sectors_per_cluster, k; + int dir_index; + const direntry_t* direntries; + long_file_name lfn; + + lfn_init(&lfn); + + if (begin < sector_num) + begin = sector_num; + if (end > sector_num + nb_sectors) + end = sector_num + nb_sectors; + dir_index = mapping->dir_index + + 0x10 * (begin - mapping->begin * s->sectors_per_cluster); + direntries = (direntry_t*)(buf + 0x200 * (begin - sector_num)); + + for (k = 0; k < (end - begin) * 0x10; k++) { + /* do not allow non-ASCII filenames */ + if (parse_long_name(&lfn, direntries + k) < 0) { + fprintf(stderr, "Warning: non-ASCII filename\n"); + return -1; + } + /* no access to the direntry of a read-only file */ + else if (is_short_name(direntries+k) && + (direntries[k].attributes & 1)) { + if (memcmp(direntries + k, + array_get(&(s->directory), dir_index + k), + sizeof(direntry_t))) { + fprintf(stderr, "Warning: tried to write to write-protected file\n"); + return -1; + } + } + } + } + i = mapping->end; + } else + i++; + } + + /* + * Use qcow backend. Commit later. + */ +DLOG(fprintf(stderr, "Write to qcow backend: %d + %d\n", (int)sector_num, nb_sectors)); + ret = bdrv_write(s->qcow, sector_num, buf, nb_sectors); + if (ret < 0) { + fprintf(stderr, "Error writing to qcow backend\n"); + return ret; + } + + for (i = sector2cluster(s, sector_num); + i <= sector2cluster(s, sector_num + nb_sectors - 1); i++) + if (i >= 0) + s->used_clusters[i] |= USED_ALLOCATED; + +DLOG(checkpoint()); + /* TODO: add timeout */ + try_commit(s); + +DLOG(checkpoint()); + return 0; +} + +static coroutine_fn int vvfat_co_write(BlockDriverState *bs, int64_t sector_num, + const uint8_t *buf, int nb_sectors) +{ + int ret; + BDRVVVFATState *s = bs->opaque; + qemu_co_mutex_lock(&s->lock); + ret = vvfat_write(bs, sector_num, buf, nb_sectors); + qemu_co_mutex_unlock(&s->lock); + return ret; +} + +static int64_t coroutine_fn vvfat_co_get_block_status(BlockDriverState *bs, + int64_t sector_num, int nb_sectors, int* n) +{ + BDRVVVFATState* s = bs->opaque; + *n = s->sector_count - sector_num; + if (*n > nb_sectors) { + *n = nb_sectors; + } else if (*n < 0) { + return 0; + } + return BDRV_BLOCK_DATA; +} + +static int write_target_commit(BlockDriverState *bs, int64_t sector_num, + const uint8_t* buffer, int nb_sectors) { + BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); + return try_commit(s); +} + +static void write_target_close(BlockDriverState *bs) { + BDRVVVFATState* s = *((BDRVVVFATState**) bs->opaque); + bdrv_unref(s->qcow); + g_free(s->qcow_filename); +} + +static BlockDriver vvfat_write_target = { + .format_name = "vvfat_write_target", + .bdrv_write = write_target_commit, + .bdrv_close = write_target_close, +}; + +static int enable_write_target(BDRVVVFATState *s, Error **errp) +{ + BlockDriver *bdrv_qcow = NULL; + BlockDriverState *backing; + QemuOpts *opts = NULL; + int ret; + int size = sector2cluster(s, s->sector_count); + QDict *options; + + s->used_clusters = calloc(size, 1); + + array_init(&(s->commits), sizeof(commit_t)); + + s->qcow_filename = g_malloc(PATH_MAX); + ret = get_tmp_filename(s->qcow_filename, PATH_MAX); + if (ret < 0) { + error_setg_errno(errp, -ret, "can't create temporary file"); + goto err; + } + + bdrv_qcow = bdrv_find_format("qcow"); + if (!bdrv_qcow) { + error_setg(errp, "Failed to locate qcow driver"); + ret = -ENOENT; + goto err; + } + + opts = qemu_opts_create(bdrv_qcow->create_opts, NULL, 0, &error_abort); + qemu_opt_set_number(opts, BLOCK_OPT_SIZE, s->sector_count * 512, + &error_abort); + qemu_opt_set(opts, BLOCK_OPT_BACKING_FILE, "fat:", &error_abort); + + ret = bdrv_create(bdrv_qcow, s->qcow_filename, opts, errp); + qemu_opts_del(opts); + if (ret < 0) { + goto err; + } + + s->qcow = NULL; + options = qdict_new(); + qdict_put(options, "driver", qstring_from_str("qcow")); + ret = bdrv_open(&s->qcow, s->qcow_filename, NULL, options, + BDRV_O_RDWR | BDRV_O_CACHE_WB | BDRV_O_NO_FLUSH, + errp); + if (ret < 0) { + goto err; + } + +#ifndef _WIN32 + unlink(s->qcow_filename); +#endif + + backing = bdrv_new(); + bdrv_set_backing_hd(s->bs, backing); + bdrv_unref(backing); + + s->bs->backing->bs->drv = &vvfat_write_target; + s->bs->backing->bs->opaque = g_new(void *, 1); + *(void**)s->bs->backing->bs->opaque = s; + + return 0; + +err: + g_free(s->qcow_filename); + s->qcow_filename = NULL; + return ret; +} + +static void vvfat_close(BlockDriverState *bs) +{ + BDRVVVFATState *s = bs->opaque; + + vvfat_close_current_file(s); + array_free(&(s->fat)); + array_free(&(s->directory)); + array_free(&(s->mapping)); + g_free(s->cluster_buffer); + + if (s->qcow) { + migrate_del_blocker(s->migration_blocker); + error_free(s->migration_blocker); + } +} + +static BlockDriver bdrv_vvfat = { + .format_name = "vvfat", + .protocol_name = "fat", + .instance_size = sizeof(BDRVVVFATState), + + .bdrv_parse_filename = vvfat_parse_filename, + .bdrv_file_open = vvfat_open, + .bdrv_close = vvfat_close, + + .bdrv_read = vvfat_co_read, + .bdrv_write = vvfat_co_write, + .bdrv_co_get_block_status = vvfat_co_get_block_status, +}; + +static void bdrv_vvfat_init(void) +{ + bdrv_register(&bdrv_vvfat); +} + +block_init(bdrv_vvfat_init); + +#ifdef DEBUG +static void checkpoint(void) { + assert(((mapping_t*)array_get(&(vvv->mapping), 0))->end == 2); + check1(vvv); + check2(vvv); + assert(!vvv->current_mapping || vvv->current_fd || (vvv->current_mapping->mode & MODE_DIRECTORY)); +#if 0 + if (((direntry_t*)vvv->directory.pointer)[1].attributes != 0xf) + fprintf(stderr, "Nonono!\n"); + mapping_t* mapping; + direntry_t* direntry; + assert(vvv->mapping.size >= vvv->mapping.item_size * vvv->mapping.next); + assert(vvv->directory.size >= vvv->directory.item_size * vvv->directory.next); + if (vvv->mapping.next<47) + return; + assert((mapping = array_get(&(vvv->mapping), 47))); + assert(mapping->dir_index < vvv->directory.next); + direntry = array_get(&(vvv->directory), mapping->dir_index); + assert(!memcmp(direntry->name, "USB H ", 11) || direntry->name[0]==0); +#endif +} +#endif diff --git a/src/block/win32-aio.c b/src/block/win32-aio.c new file mode 100644 index 0000000..bbf2f01 --- /dev/null +++ b/src/block/win32-aio.c @@ -0,0 +1,218 @@ +/* + * Block driver for RAW files (win32) + * + * Copyright (c) 2006 Fabrice Bellard + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + */ +#include "qemu-common.h" +#include "qemu/timer.h" +#include "block/block_int.h" +#include "qemu/module.h" +#include "block/aio.h" +#include "raw-aio.h" +#include "qemu/event_notifier.h" +#include "qemu/iov.h" +#include <windows.h> +#include <winioctl.h> + +#define FTYPE_FILE 0 +#define FTYPE_CD 1 +#define FTYPE_HARDDISK 2 + +struct QEMUWin32AIOState { + HANDLE hIOCP; + EventNotifier e; + int count; + bool is_aio_context_attached; +}; + +typedef struct QEMUWin32AIOCB { + BlockAIOCB common; + struct QEMUWin32AIOState *ctx; + int nbytes; + OVERLAPPED ov; + QEMUIOVector *qiov; + void *buf; + bool is_read; + bool is_linear; +} QEMUWin32AIOCB; + +/* + * Completes an AIO request (calls the callback and frees the ACB). + */ +static void win32_aio_process_completion(QEMUWin32AIOState *s, + QEMUWin32AIOCB *waiocb, DWORD count) +{ + int ret; + s->count--; + + if (waiocb->ov.Internal != 0) { + ret = -EIO; + } else { + ret = 0; + if (count < waiocb->nbytes) { + /* Short reads mean EOF, pad with zeros. */ + if (waiocb->is_read) { + qemu_iovec_memset(waiocb->qiov, count, 0, + waiocb->qiov->size - count); + } else { + ret = -EINVAL; + } + } + } + + if (!waiocb->is_linear) { + if (ret == 0 && waiocb->is_read) { + QEMUIOVector *qiov = waiocb->qiov; + iov_from_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); + } + qemu_vfree(waiocb->buf); + } + + + waiocb->common.cb(waiocb->common.opaque, ret); + qemu_aio_unref(waiocb); +} + +static void win32_aio_completion_cb(EventNotifier *e) +{ + QEMUWin32AIOState *s = container_of(e, QEMUWin32AIOState, e); + DWORD count; + ULONG_PTR key; + OVERLAPPED *ov; + + event_notifier_test_and_clear(&s->e); + while (GetQueuedCompletionStatus(s->hIOCP, &count, &key, &ov, 0)) { + QEMUWin32AIOCB *waiocb = container_of(ov, QEMUWin32AIOCB, ov); + + win32_aio_process_completion(s, waiocb, count); + } +} + +static const AIOCBInfo win32_aiocb_info = { + .aiocb_size = sizeof(QEMUWin32AIOCB), +}; + +BlockAIOCB *win32_aio_submit(BlockDriverState *bs, + QEMUWin32AIOState *aio, HANDLE hfile, + int64_t sector_num, QEMUIOVector *qiov, int nb_sectors, + BlockCompletionFunc *cb, void *opaque, int type) +{ + struct QEMUWin32AIOCB *waiocb; + uint64_t offset = sector_num * 512; + DWORD rc; + + waiocb = qemu_aio_get(&win32_aiocb_info, bs, cb, opaque); + waiocb->nbytes = nb_sectors * 512; + waiocb->qiov = qiov; + waiocb->is_read = (type == QEMU_AIO_READ); + + if (qiov->niov > 1) { + waiocb->buf = qemu_try_blockalign(bs, qiov->size); + if (waiocb->buf == NULL) { + goto out; + } + if (type & QEMU_AIO_WRITE) { + iov_to_buf(qiov->iov, qiov->niov, 0, waiocb->buf, qiov->size); + } + waiocb->is_linear = false; + } else { + waiocb->buf = qiov->iov[0].iov_base; + waiocb->is_linear = true; + } + + memset(&waiocb->ov, 0, sizeof(waiocb->ov)); + waiocb->ov.Offset = (DWORD)offset; + waiocb->ov.OffsetHigh = (DWORD)(offset >> 32); + waiocb->ov.hEvent = event_notifier_get_handle(&aio->e); + + aio->count++; + + if (type & QEMU_AIO_READ) { + rc = ReadFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov); + } else { + rc = WriteFile(hfile, waiocb->buf, waiocb->nbytes, NULL, &waiocb->ov); + } + if(rc == 0 && GetLastError() != ERROR_IO_PENDING) { + goto out_dec_count; + } + return &waiocb->common; + +out_dec_count: + aio->count--; +out: + qemu_aio_unref(waiocb); + return NULL; +} + +int win32_aio_attach(QEMUWin32AIOState *aio, HANDLE hfile) +{ + if (CreateIoCompletionPort(hfile, aio->hIOCP, (ULONG_PTR) 0, 0) == NULL) { + return -EINVAL; + } else { + return 0; + } +} + +void win32_aio_detach_aio_context(QEMUWin32AIOState *aio, + AioContext *old_context) +{ + aio_set_event_notifier(old_context, &aio->e, false, NULL); + aio->is_aio_context_attached = false; +} + +void win32_aio_attach_aio_context(QEMUWin32AIOState *aio, + AioContext *new_context) +{ + aio->is_aio_context_attached = true; + aio_set_event_notifier(new_context, &aio->e, false, + win32_aio_completion_cb); +} + +QEMUWin32AIOState *win32_aio_init(void) +{ + QEMUWin32AIOState *s; + + s = g_malloc0(sizeof(*s)); + if (event_notifier_init(&s->e, false) < 0) { + goto out_free_state; + } + + s->hIOCP = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0); + if (s->hIOCP == NULL) { + goto out_close_efd; + } + + return s; + +out_close_efd: + event_notifier_cleanup(&s->e); +out_free_state: + g_free(s); + return NULL; +} + +void win32_aio_cleanup(QEMUWin32AIOState *aio) +{ + assert(!aio->is_aio_context_attached); + CloseHandle(aio->hIOCP); + event_notifier_cleanup(&aio->e); + g_free(aio); +} diff --git a/src/block/write-threshold.c b/src/block/write-threshold.c new file mode 100644 index 0000000..0fe3891 --- /dev/null +++ b/src/block/write-threshold.c @@ -0,0 +1,125 @@ +/* + * QEMU System Emulator block write threshold notification + * + * Copyright Red Hat, Inc. 2014 + * + * Authors: + * Francesco Romani <fromani@redhat.com> + * + * This work is licensed under the terms of the GNU LGPL, version 2 or later. + * See the COPYING.LIB file in the top-level directory. + */ + +#include "block/block_int.h" +#include "qemu/coroutine.h" +#include "block/write-threshold.h" +#include "qemu/notify.h" +#include "qapi-event.h" +#include "qmp-commands.h" + + +uint64_t bdrv_write_threshold_get(const BlockDriverState *bs) +{ + return bs->write_threshold_offset; +} + +bool bdrv_write_threshold_is_set(const BlockDriverState *bs) +{ + return bs->write_threshold_offset > 0; +} + +static void write_threshold_disable(BlockDriverState *bs) +{ + if (bdrv_write_threshold_is_set(bs)) { + notifier_with_return_remove(&bs->write_threshold_notifier); + bs->write_threshold_offset = 0; + } +} + +uint64_t bdrv_write_threshold_exceeded(const BlockDriverState *bs, + const BdrvTrackedRequest *req) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (req->offset > bs->write_threshold_offset) { + return (req->offset - bs->write_threshold_offset) + req->bytes; + } + if ((req->offset + req->bytes) > bs->write_threshold_offset) { + return (req->offset + req->bytes) - bs->write_threshold_offset; + } + } + return 0; +} + +static int coroutine_fn before_write_notify(NotifierWithReturn *notifier, + void *opaque) +{ + BdrvTrackedRequest *req = opaque; + BlockDriverState *bs = req->bs; + uint64_t amount = 0; + + amount = bdrv_write_threshold_exceeded(bs, req); + if (amount > 0) { + qapi_event_send_block_write_threshold( + bs->node_name, + amount, + bs->write_threshold_offset, + &error_abort); + + /* autodisable to avoid flooding the monitor */ + write_threshold_disable(bs); + } + + return 0; /* should always let other notifiers run */ +} + +static void write_threshold_register_notifier(BlockDriverState *bs) +{ + bs->write_threshold_notifier.notify = before_write_notify; + notifier_with_return_list_add(&bs->before_write_notifiers, + &bs->write_threshold_notifier); +} + +static void write_threshold_update(BlockDriverState *bs, + int64_t threshold_bytes) +{ + bs->write_threshold_offset = threshold_bytes; +} + +void bdrv_write_threshold_set(BlockDriverState *bs, uint64_t threshold_bytes) +{ + if (bdrv_write_threshold_is_set(bs)) { + if (threshold_bytes > 0) { + write_threshold_update(bs, threshold_bytes); + } else { + write_threshold_disable(bs); + } + } else { + if (threshold_bytes > 0) { + /* avoid multiple registration */ + write_threshold_register_notifier(bs); + write_threshold_update(bs, threshold_bytes); + } + /* discard bogus disable request */ + } +} + +void qmp_block_set_write_threshold(const char *node_name, + uint64_t threshold_bytes, + Error **errp) +{ + BlockDriverState *bs; + AioContext *aio_context; + + bs = bdrv_find_node(node_name); + if (!bs) { + error_setg(errp, "Device '%s' not found", node_name); + return; + } + + aio_context = bdrv_get_aio_context(bs); + aio_context_acquire(aio_context); + + bdrv_write_threshold_set(bs, threshold_bytes); + + aio_context_release(aio_context); +} |