diff options
-rw-r--r-- | share/man/man4/pass.4 | 132 | ||||
-rw-r--r-- | sys/cam/ata/ata_da.c | 25 | ||||
-rw-r--r-- | sys/cam/cam_ccb.h | 3 | ||||
-rw-r--r-- | sys/cam/cam_xpt.c | 11 | ||||
-rw-r--r-- | sys/cam/cam_xpt.h | 4 | ||||
-rw-r--r-- | sys/cam/scsi/scsi_da.c | 29 | ||||
-rw-r--r-- | sys/cam/scsi/scsi_pass.c | 1604 | ||||
-rw-r--r-- | sys/cam/scsi/scsi_pass.h | 8 | ||||
-rw-r--r-- | sys/dev/md/md.c | 307 | ||||
-rw-r--r-- | sys/geom/geom_disk.c | 188 | ||||
-rw-r--r-- | sys/geom/geom_io.c | 9 | ||||
-rw-r--r-- | sys/ia64/include/bus.h | 3 | ||||
-rw-r--r-- | sys/kern/subr_bus_dma.c | 69 | ||||
-rw-r--r-- | sys/kern/subr_uio.c | 54 | ||||
-rw-r--r-- | sys/pc98/include/bus.h | 6 | ||||
-rw-r--r-- | sys/sys/bio.h | 1 | ||||
-rw-r--r-- | sys/sys/uio.h | 5 | ||||
-rw-r--r-- | usr.sbin/Makefile | 1 | ||||
-rw-r--r-- | usr.sbin/camdd/Makefile | 11 | ||||
-rw-r--r-- | usr.sbin/camdd/camdd.8 | 283 | ||||
-rw-r--r-- | usr.sbin/camdd/camdd.c | 3428 |
21 files changed, 5983 insertions, 198 deletions
diff --git a/share/man/man4/pass.4 b/share/man/man4/pass.4 index 7819ea3..00b9ccd 100644 --- a/share/man/man4/pass.4 +++ b/share/man/man4/pass.4 @@ -27,7 +27,7 @@ .\" .\" $FreeBSD$ .\" -.Dd October 10, 1998 +.Dd March 17, 2015 .Dt PASS 4 .Os .Sh NAME @@ -53,9 +53,13 @@ The .Nm driver attaches to every .Tn SCSI +and +.Tn ATA device found in the system. Since it attaches to every device, it provides a generic means of accessing .Tn SCSI +and +.Tn ATA devices, and allows the user to access devices which have no "standard" peripheral driver associated with them. .Sh KERNEL CONFIGURATION @@ -65,10 +69,12 @@ device in the kernel; .Nm devices are automatically allocated as .Tn SCSI +and +.Tn ATA devices are found. .Sh IOCTLS -.Bl -tag -width 012345678901234 -.It CAMIOCOMMAND +.Bl -tag -width 5n +.It CAMIOCOMMAND union ccb * This ioctl takes most kinds of CAM CCBs and passes them through to the CAM transport layer for action. Note that some CCB types are not allowed @@ -79,7 +85,7 @@ Some examples of xpt-only CCBs are XPT_SCAN_BUS, XPT_DEV_MATCH, XPT_RESET_BUS, XPT_SCAN_LUN, XPT_ENG_INQ, and XPT_ENG_EXEC. These CCB types have various attributes that make it illogical or impossible to service them through the passthrough interface. -.It CAMGETPASSTHRU +.It CAMGETPASSTHRU union ccb * This ioctl takes an XPT_GDEVLIST CCB, and returns the passthrough device corresponding to the device in question. Although this ioctl is available through the @@ -90,6 +96,109 @@ ioctl. It is probably more useful to issue this ioctl through the .Xr xpt 4 device. +.It CAMIOQUEUE union ccb * +Queue a CCB to the +.Xr pass 4 +driver to be executed asynchronously. +The caller may use +.Xr select 2 , +.Xr poll 2 +or +.Xr kevent 2 +to receive notification when the CCB has completed. +.Pp +This ioctl takes most CAM CCBs, but some CCB types are not allowed through +the pass device, and must be sent through the +.Xr xpt 4 +device instead. 
+Some examples of xpt-only CCBs are XPT_SCAN_BUS, +XPT_DEV_MATCH, XPT_RESET_BUS, XPT_SCAN_LUN, XPT_ENG_INQ, and XPT_ENG_EXEC. +These CCB types have various attributes that make it illogical or +impossible to service them through the passthrough interface. +.Pp +Although the +.Dv CAMIOQUEUE +ioctl is not defined to take an argument, it does require a +pointer to a union ccb. +It is not defined to take an argument to avoid an extra malloc and copy +inside the generic +.Xr ioctl 2 +handler. +.Pp +The completed CCB will be returned via the +.Dv CAMIOGET +ioctl. +An error will only be returned from the +.Dv CAMIOQUEUE +ioctl if there is an error allocating memory for the request or copying +memory from userland. +All other errors will be reported as standard CAM CCB status errors. +Since the CCB is not copied back to the user process from the pass driver +in the +.Dv CAMIOQUEUE +ioctl, the user's passed-in CCB will not be modified. +This is the case even with immediate CCBs. +Instead, the completed CCB must be retrieved via the +.Dv CAMIOGET +ioctl and the status examined. +.Pp +Multiple CCBs may be queued via the +.Dv CAMIOQUEUE +ioctl at any given time, and they may complete in a different order than +the order that they were submitted. +The caller must take steps to identify CCBs that are queued and completed. +The +.Dv periph_priv +structure inside struct ccb_hdr is available for userland use with the +.Dv CAMIOQUEUE +and +.Dv CAMIOGET +ioctls, and will be preserved across calls. +Also, the periph_links linked list pointers inside struct ccb_hdr are +available for userland use with the +.Dv CAMIOQUEUE +and +.Dv CAMIOGET +ioctls and will be preserved across calls. +.It CAMIOGET union ccb * +Retrieve completed CAM CCBs queued via the +.Dv CAMIOQUEUE +ioctl. +An error will only be returned from the +.Dv CAMIOGET +ioctl if the +.Xr pass 4 +driver fails to copy data to the user process or if there are no completed +CCBs available to retrieve. 
+If no CCBs are available to retrieve, +errno will be set to +.Dv ENOENT . +.Pp +All other errors will be reported as standard CAM CCB status errors. +.Pp +Although the +.Dv CAMIOGET +ioctl is not defined to take an argument, it does require a +pointer to a union ccb. +It is not defined to take an argument to avoid an extra malloc and copy +inside the generic +.Xr ioctl 2 +handler. +.Pp +The pass driver will report via +.Xr select 2 , +.Xr poll 2 +or +.Xr kevent 2 +when a CCB has completed. +One CCB may be retrieved per +.Dv CAMIOGET +call. +CCBs may be returned in an order different than the order they were +submitted. +So the caller should use the +.Dv periph_priv +area inside the CCB header to store pointers to identifying information. .El .Sh FILES .Bl -tag -width /dev/passn -compact @@ -103,18 +212,21 @@ CAM subsystem. .Sh DIAGNOSTICS None. .Sh SEE ALSO +.Xr kqueue 2 , +.Xr poll 2 , +.Xr select 2 , .Xr cam 3 , .Xr cam 4 , .Xr cam_cdbparse 3 , +.Xr cd 4 , +.Xr ctl 4 , +.Xr da 4 , +.Xr sa 4 , .Xr xpt 4 , -.Xr camcontrol 8 +.Xr camcontrol 8 , +.Xr camdd 8 .Sh HISTORY The CAM passthrough driver first appeared in .Fx 3.0 . .Sh AUTHORS .An Kenneth Merry Aq ken@FreeBSD.org -.Sh BUGS -It might be nice to have a way to asynchronously send CCBs through the -passthrough driver. -This would probably require some sort of read/write -interface or an asynchronous ioctl interface. 
diff --git a/sys/cam/ata/ata_da.c b/sys/cam/ata/ata_da.c index f88899e..005c684 100644 --- a/sys/cam/ata/ata_da.c +++ b/sys/cam/ata/ata_da.c @@ -1573,12 +1573,26 @@ adastart(struct cam_periph *periph, union ccb *start_ccb) } switch (bp->bio_cmd) { case BIO_WRITE: - softc->flags |= ADA_FLAG_DIRTY; - /* FALLTHROUGH */ case BIO_READ: { uint64_t lba = bp->bio_pblkno; uint16_t count = bp->bio_bcount / softc->params.secsize; + void *data_ptr; + int rw_op; + + if (bp->bio_cmd == BIO_WRITE) { + softc->flags |= ADA_FLAG_DIRTY; + rw_op = CAM_DIR_OUT; + } else { + rw_op = CAM_DIR_IN; + } + + data_ptr = bp->bio_data; + if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) { + rw_op |= CAM_DATA_BIO; + data_ptr = bp; + } + #ifdef ADA_TEST_FAILURE int fail = 0; @@ -1623,12 +1637,9 @@ adastart(struct cam_periph *periph, union ccb *start_ccb) cam_fill_ataio(ataio, ada_retry_count, adadone, - (bp->bio_cmd == BIO_READ ? CAM_DIR_IN : - CAM_DIR_OUT) | ((bp->bio_flags & BIO_UNMAPPED) - != 0 ? CAM_DATA_BIO : 0), + rw_op, tag_code, - ((bp->bio_flags & BIO_UNMAPPED) != 0) ? 
(void *)bp : - bp->bio_data, + data_ptr, bp->bio_bcount, ada_default_timeout*1000); diff --git a/sys/cam/cam_ccb.h b/sys/cam/cam_ccb.h index 98bb9ea..12d3803 100644 --- a/sys/cam/cam_ccb.h +++ b/sys/cam/cam_ccb.h @@ -111,6 +111,9 @@ typedef enum { typedef enum { CAM_EXTLUN_VALID = 0x00000001,/* 64bit lun field is valid */ + CAM_USER_DATA_ADDR = 0x00000002,/* Userspace data pointers */ + CAM_SG_FORMAT_IOVEC = 0x00000004,/* iovec instead of busdma S/G*/ + CAM_UNMAPPED_BUF = 0x00000008 /* use unmapped I/O */ } ccb_xflags; /* XPT Opcodes for xpt_action */ diff --git a/sys/cam/cam_xpt.c b/sys/cam/cam_xpt.c index ba0863a..6773829 100644 --- a/sys/cam/cam_xpt.c +++ b/sys/cam/cam_xpt.c @@ -3337,7 +3337,8 @@ xpt_merge_ccb(union ccb *master_ccb, union ccb *slave_ccb) } void -xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority) +xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, struct cam_path *path, + u_int32_t priority, u_int32_t flags) { CAM_DEBUG(path, CAM_DEBUG_TRACE, ("xpt_setup_ccb\n")); @@ -3355,10 +3356,16 @@ xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority) ccb_h->target_lun = CAM_TARGET_WILDCARD; } ccb_h->pinfo.index = CAM_UNQUEUED_INDEX; - ccb_h->flags = 0; + ccb_h->flags = flags; ccb_h->xflags = 0; } +void +xpt_setup_ccb(struct ccb_hdr *ccb_h, struct cam_path *path, u_int32_t priority) +{ + xpt_setup_ccb_flags(ccb_h, path, priority, /*flags*/ 0); +} + /* Path manipulation functions */ cam_status xpt_create_path(struct cam_path **new_path_ptr, struct cam_periph *perph, diff --git a/sys/cam/cam_xpt.h b/sys/cam/cam_xpt.h index 1d983c9..ca7dccc 100644 --- a/sys/cam/cam_xpt.h +++ b/sys/cam/cam_xpt.h @@ -70,6 +70,10 @@ void xpt_action_default(union ccb *new_ccb); union ccb *xpt_alloc_ccb(void); union ccb *xpt_alloc_ccb_nowait(void); void xpt_free_ccb(union ccb *free_ccb); +void xpt_setup_ccb_flags(struct ccb_hdr *ccb_h, + struct cam_path *path, + u_int32_t priority, + u_int32_t flags); void xpt_setup_ccb(struct ccb_hdr 
*ccb_h, struct cam_path *path, u_int32_t priority); diff --git a/sys/cam/scsi/scsi_da.c b/sys/cam/scsi/scsi_da.c index 4e3fe76..1cd687a 100644 --- a/sys/cam/scsi/scsi_da.c +++ b/sys/cam/scsi/scsi_da.c @@ -2332,29 +2332,40 @@ skipstate: switch (bp->bio_cmd) { case BIO_WRITE: - softc->flags |= DA_FLAG_DIRTY; - /* FALLTHROUGH */ case BIO_READ: + { + void *data_ptr; + int rw_op; + + if (bp->bio_cmd == BIO_WRITE) { + softc->flags |= DA_FLAG_DIRTY; + rw_op = SCSI_RW_WRITE; + } else { + rw_op = SCSI_RW_READ; + } + + data_ptr = bp->bio_data; + if ((bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0) { + rw_op |= SCSI_RW_BIO; + data_ptr = bp; + } + scsi_read_write(&start_ccb->csio, /*retries*/da_retry_count, /*cbfcnp*/dadone, /*tag_action*/tag_code, - /*read_op*/(bp->bio_cmd == BIO_READ ? - SCSI_RW_READ : SCSI_RW_WRITE) | - ((bp->bio_flags & BIO_UNMAPPED) != 0 ? - SCSI_RW_BIO : 0), + rw_op, /*byte2*/0, softc->minimum_cmd_size, /*lba*/bp->bio_pblkno, /*block_count*/bp->bio_bcount / softc->params.secsize, - /*data_ptr*/ (bp->bio_flags & - BIO_UNMAPPED) != 0 ? 
(void *)bp : - bp->bio_data, + data_ptr, /*dxfer_len*/ bp->bio_bcount, /*sense_len*/SSD_FULL_SIZE, da_default_timeout * 1000); break; + } case BIO_FLUSH: /* * BIO_FLUSH doesn't currently communicate diff --git a/sys/cam/scsi/scsi_pass.c b/sys/cam/scsi/scsi_pass.c index 174151e..09cda5b 100644 --- a/sys/cam/scsi/scsi_pass.c +++ b/sys/cam/scsi/scsi_pass.c @@ -28,27 +28,39 @@ #include <sys/cdefs.h> __FBSDID("$FreeBSD$"); +#include "opt_kdtrace.h" + #include <sys/param.h> #include <sys/systm.h> #include <sys/kernel.h> +#include <sys/conf.h> #include <sys/types.h> #include <sys/bio.h> -#include <sys/malloc.h> -#include <sys/fcntl.h> -#include <sys/conf.h> -#include <sys/errno.h> +#include <sys/bus.h> #include <sys/devicestat.h> +#include <sys/errno.h> +#include <sys/fcntl.h> +#include <sys/malloc.h> #include <sys/proc.h> +#include <sys/poll.h> +#include <sys/selinfo.h> +#include <sys/sdt.h> #include <sys/taskqueue.h> +#include <vm/uma.h> +#include <vm/vm.h> +#include <vm/vm_extern.h> + +#include <machine/bus.h> #include <cam/cam.h> #include <cam/cam_ccb.h> #include <cam/cam_periph.h> #include <cam/cam_queue.h> +#include <cam/cam_xpt.h> #include <cam/cam_xpt_periph.h> #include <cam/cam_debug.h> -#include <cam/cam_sim.h> #include <cam/cam_compat.h> +#include <cam/cam_xpt_periph.h> #include <cam/scsi/scsi_all.h> #include <cam/scsi/scsi_pass.h> @@ -57,7 +69,11 @@ typedef enum { PASS_FLAG_OPEN = 0x01, PASS_FLAG_LOCKED = 0x02, PASS_FLAG_INVALID = 0x04, - PASS_FLAG_INITIAL_PHYSPATH = 0x08 + PASS_FLAG_INITIAL_PHYSPATH = 0x08, + PASS_FLAG_ZONE_INPROG = 0x10, + PASS_FLAG_ZONE_VALID = 0x20, + PASS_FLAG_UNMAPPED_CAPABLE = 0x40, + PASS_FLAG_ABANDONED_REF_SET = 0x80 } pass_flags; typedef enum { @@ -65,38 +81,104 @@ typedef enum { } pass_state; typedef enum { - PASS_CCB_BUFFER_IO + PASS_CCB_BUFFER_IO, + PASS_CCB_QUEUED_IO } pass_ccb_types; #define ccb_type ppriv_field0 -#define ccb_bp ppriv_ptr1 +#define ccb_ioreq ppriv_ptr1 -struct pass_softc { - pass_state state; - pass_flags flags; 
- u_int8_t pd_type; - union ccb saved_ccb; - int open_count; - u_int maxio; - struct devstat *device_stats; - struct cdev *dev; - struct cdev *alias_dev; - struct task add_physpath_task; +/* + * The maximum number of memory segments we preallocate. + */ +#define PASS_MAX_SEGS 16 + +typedef enum { + PASS_IO_NONE = 0x00, + PASS_IO_USER_SEG_MALLOC = 0x01, + PASS_IO_KERN_SEG_MALLOC = 0x02, + PASS_IO_ABANDONED = 0x04 +} pass_io_flags; + +struct pass_io_req { + union ccb ccb; + union ccb *alloced_ccb; + union ccb *user_ccb_ptr; + camq_entry user_periph_links; + ccb_ppriv_area user_periph_priv; + struct cam_periph_map_info mapinfo; + pass_io_flags flags; + ccb_flags data_flags; + int num_user_segs; + bus_dma_segment_t user_segs[PASS_MAX_SEGS]; + int num_kern_segs; + bus_dma_segment_t kern_segs[PASS_MAX_SEGS]; + bus_dma_segment_t *user_segptr; + bus_dma_segment_t *kern_segptr; + int num_bufs; + uint32_t dirs[CAM_PERIPH_MAXMAPS]; + uint32_t lengths[CAM_PERIPH_MAXMAPS]; + uint8_t *user_bufs[CAM_PERIPH_MAXMAPS]; + uint8_t *kern_bufs[CAM_PERIPH_MAXMAPS]; + struct bintime start_time; + TAILQ_ENTRY(pass_io_req) links; }; +struct pass_softc { + pass_state state; + pass_flags flags; + u_int8_t pd_type; + union ccb saved_ccb; + int open_count; + u_int maxio; + struct devstat *device_stats; + struct cdev *dev; + struct cdev *alias_dev; + struct task add_physpath_task; + struct task shutdown_kqueue_task; + struct selinfo read_select; + TAILQ_HEAD(, pass_io_req) incoming_queue; + TAILQ_HEAD(, pass_io_req) active_queue; + TAILQ_HEAD(, pass_io_req) abandoned_queue; + TAILQ_HEAD(, pass_io_req) done_queue; + struct cam_periph *periph; + char zone_name[12]; + char io_zone_name[12]; + uma_zone_t pass_zone; + uma_zone_t pass_io_zone; + size_t io_zone_size; +}; static d_open_t passopen; static d_close_t passclose; static d_ioctl_t passioctl; static d_ioctl_t passdoioctl; +static d_poll_t passpoll; +static d_kqfilter_t passkqfilter; +static void passreadfiltdetach(struct knote *kn); +static 
int passreadfilt(struct knote *kn, long hint); static periph_init_t passinit; static periph_ctor_t passregister; static periph_oninv_t passoninvalidate; static periph_dtor_t passcleanup; -static void pass_add_physpath(void *context, int pending); +static periph_start_t passstart; +static void pass_shutdown_kqueue(void *context, int pending); +static void pass_add_physpath(void *context, int pending); static void passasync(void *callback_arg, u_int32_t code, struct cam_path *path, void *arg); +static void passdone(struct cam_periph *periph, + union ccb *done_ccb); +static int passcreatezone(struct cam_periph *periph); +static void passiocleanup(struct pass_softc *softc, + struct pass_io_req *io_req); +static int passcopysglist(struct cam_periph *periph, + struct pass_io_req *io_req, + ccb_flags direction); +static int passmemsetup(struct cam_periph *periph, + struct pass_io_req *io_req); +static int passmemdone(struct cam_periph *periph, + struct pass_io_req *io_req); static int passerror(union ccb *ccb, u_int32_t cam_flags, u_int32_t sense_flags); static int passsendccb(struct cam_periph *periph, union ccb *ccb, @@ -116,9 +198,19 @@ static struct cdevsw pass_cdevsw = { .d_open = passopen, .d_close = passclose, .d_ioctl = passioctl, + .d_poll = passpoll, + .d_kqfilter = passkqfilter, .d_name = "pass", }; +static struct filterops passread_filtops = { + .f_isfd = 1, + .f_detach = passreadfiltdetach, + .f_event = passreadfilt +}; + +static MALLOC_DEFINE(M_SCSIPASS, "scsi_pass", "scsi passthrough buffers"); + static void passinit(void) { @@ -138,6 +230,60 @@ passinit(void) } static void +passrejectios(struct cam_periph *periph) +{ + struct pass_io_req *io_req, *io_req2; + struct pass_softc *softc; + + softc = (struct pass_softc *)periph->softc; + + /* + * The user can no longer get status for I/O on the done queue, so + * clean up all outstanding I/O on the done queue. 
+ */ + TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { + TAILQ_REMOVE(&softc->done_queue, io_req, links); + passiocleanup(softc, io_req); + uma_zfree(softc->pass_zone, io_req); + } + + /* + * The underlying device is gone, so we can't issue these I/Os. + * The devfs node has been shut down, so we can't return status to + * the user. Free any I/O left on the incoming queue. + */ + TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, io_req2) { + TAILQ_REMOVE(&softc->incoming_queue, io_req, links); + passiocleanup(softc, io_req); + uma_zfree(softc->pass_zone, io_req); + } + + /* + * Normally we would put I/Os on the abandoned queue and acquire a + * reference when we saw the final close. But, the device went + * away and devfs may have moved everything off to deadfs by the + * time the I/O done callback is called; as a result, we won't see + * any more closes. So, if we have any active I/Os, we need to put + * them on the abandoned queue. When the abandoned queue is empty, + * we'll release the remaining reference (see below) to the peripheral. + */ + TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, io_req2) { + TAILQ_REMOVE(&softc->active_queue, io_req, links); + io_req->flags |= PASS_IO_ABANDONED; + TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, links); + } + + /* + * If we put any I/O on the abandoned queue, acquire a reference. + */ + if ((!TAILQ_EMPTY(&softc->abandoned_queue)) + && ((softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0)) { + cam_periph_doacquire(periph); + softc->flags |= PASS_FLAG_ABANDONED_REF_SET; + } +} + +static void passdevgonecb(void *arg) { struct cam_periph *periph; @@ -165,17 +311,26 @@ passdevgonecb(void *arg) /* * Release the reference held for the device node, it is gone now. + * Accordingly, inform all queued I/Os of their fate. 
*/ cam_periph_release_locked(periph); + passrejectios(periph); /* - * We reference the lock directly here, instead of using + * We reference the SIM lock directly here, instead of using * cam_periph_unlock(). The reason is that the final call to * cam_periph_release_locked() above could result in the periph * getting freed. If that is the case, dereferencing the periph * with a cam_periph_unlock() call would cause a page fault. */ mtx_unlock(mtx); + + /* + * We have to remove our kqueue context from a thread because it + * may sleep. It would be nice if we could get a callback from + * kqueue when it is done cleaning up resources. + */ + taskqueue_enqueue(taskqueue_thread, &softc->shutdown_kqueue_task); } static void @@ -197,12 +352,6 @@ passoninvalidate(struct cam_periph *periph) * when it has cleaned up its state. */ destroy_dev_sched_cb(softc->dev, passdevgonecb, periph); - - /* - * XXX Return all queued I/O with ENXIO. - * XXX Handle any transactions queued to the card - * with XPT_ABORT_CCB. - */ } static void @@ -212,9 +361,40 @@ passcleanup(struct cam_periph *periph) softc = (struct pass_softc *)periph->softc; + cam_periph_assert(periph, MA_OWNED); + KASSERT(TAILQ_EMPTY(&softc->active_queue), + ("%s called when there are commands on the active queue!\n", + __func__)); + KASSERT(TAILQ_EMPTY(&softc->abandoned_queue), + ("%s called when there are commands on the abandoned queue!\n", + __func__)); + KASSERT(TAILQ_EMPTY(&softc->incoming_queue), + ("%s called when there are commands on the incoming queue!\n", + __func__)); + KASSERT(TAILQ_EMPTY(&softc->done_queue), + ("%s called when there are commands on the done queue!\n", + __func__)); + devstat_remove_entry(softc->device_stats); cam_periph_unlock(periph); + + /* + * We call taskqueue_drain() for the physpath task to make sure it + * is complete. We drop the lock because this can potentially + * sleep. XXX KDM that is bad. Need a way to get a callback when + * a taskqueue is drained. 
+ * + * Note that we don't drain the kqueue shutdown task queue. This + * is because we hold a reference on the periph for kqueue, and + * release that reference from the kqueue shutdown task queue. So + * we cannot come into this routine unless we've released that + * reference. Also, because that could be the last reference, we + * could be called from the cam_periph_release() call in + * pass_shutdown_kqueue(). In that case, the taskqueue_drain() + * would deadlock. It would be preferable if we had a way to + * get a callback when a taskqueue is done. + */ taskqueue_drain(taskqueue_thread, &softc->add_physpath_task); cam_periph_lock(periph); @@ -223,10 +403,29 @@ passcleanup(struct cam_periph *periph) } static void +pass_shutdown_kqueue(void *context, int pending) +{ + struct cam_periph *periph; + struct pass_softc *softc; + + periph = context; + softc = periph->softc; + + knlist_clear(&softc->read_select.si_note, /*is_locked*/ 0); + knlist_destroy(&softc->read_select.si_note); + + /* + * Release the reference we held for kqueue. 
+ */ + cam_periph_release(periph); +} + +static void pass_add_physpath(void *context, int pending) { struct cam_periph *periph; struct pass_softc *softc; + struct mtx *mtx; char *physpath; /* @@ -236,34 +435,38 @@ pass_add_physpath(void *context, int pending) periph = context; softc = periph->softc; physpath = malloc(MAXPATHLEN, M_DEVBUF, M_WAITOK); - cam_periph_lock(periph); - if (periph->flags & CAM_PERIPH_INVALID) { - cam_periph_unlock(periph); + mtx = cam_periph_mtx(periph); + mtx_lock(mtx); + + if (periph->flags & CAM_PERIPH_INVALID) goto out; - } + if (xpt_getattr(physpath, MAXPATHLEN, "GEOM::physpath", periph->path) == 0 && strlen(physpath) != 0) { - cam_periph_unlock(periph); + mtx_unlock(mtx); make_dev_physpath_alias(MAKEDEV_WAITOK, &softc->alias_dev, softc->dev, softc->alias_dev, physpath); - cam_periph_lock(periph); + mtx_lock(mtx); } +out: /* * Now that we've made our alias, we no longer have to have a * reference to the device. */ - if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0) { + if ((softc->flags & PASS_FLAG_INITIAL_PHYSPATH) == 0) softc->flags |= PASS_FLAG_INITIAL_PHYSPATH; - cam_periph_unlock(periph); - dev_rel(softc->dev); - } - else - cam_periph_unlock(periph); -out: + /* + * We always acquire a reference to the periph before queueing this + * task queue function, so it won't go away before we run. + */ + while (pending-- > 0) + cam_periph_release_locked(periph); + mtx_unlock(mtx); + free(physpath, M_DEVBUF); } @@ -291,7 +494,7 @@ passasync(void *callback_arg, u_int32_t code, * process. 
*/ status = cam_periph_alloc(passregister, passoninvalidate, - passcleanup, NULL, "pass", + passcleanup, passstart, "pass", CAM_PERIPH_BIO, path, passasync, AC_FOUND_DEVICE, cgd); @@ -315,8 +518,19 @@ passasync(void *callback_arg, u_int32_t code, buftype = (uintptr_t)arg; if (buftype == CDAI_TYPE_PHYS_PATH) { struct pass_softc *softc; + cam_status status; softc = (struct pass_softc *)periph->softc; + /* + * Acquire a reference to the periph before we + * start the taskqueue, so that we don't run into + * a situation where the periph goes away before + * the task queue has a chance to run. + */ + status = cam_periph_acquire(periph); + if (status != CAM_REQ_CMP) + break; + taskqueue_enqueue(taskqueue_thread, &softc->add_physpath_task); } @@ -361,6 +575,17 @@ passregister(struct cam_periph *periph, void *arg) softc->pd_type = T_DIRECT; periph->softc = softc; + softc->periph = periph; + TAILQ_INIT(&softc->incoming_queue); + TAILQ_INIT(&softc->active_queue); + TAILQ_INIT(&softc->abandoned_queue); + TAILQ_INIT(&softc->done_queue); + snprintf(softc->zone_name, sizeof(softc->zone_name), "%s%d", + periph->periph_name, periph->unit_number); + snprintf(softc->io_zone_name, sizeof(softc->io_zone_name), "%s%dIO", + periph->periph_name, periph->unit_number); + softc->io_zone_size = MAXPHYS; + knlist_init_mtx(&softc->read_select.si_note, cam_periph_mtx(periph)); bzero(&cpi, sizeof(cpi)); xpt_setup_ccb(&cpi.ccb_h, periph->path, CAM_PRIORITY_NORMAL); @@ -374,6 +599,9 @@ passregister(struct cam_periph *periph, void *arg) else softc->maxio = cpi.maxio; /* real value */ + if (cpi.hba_misc & PIM_UNMAPPED) + softc->flags |= PASS_FLAG_UNMAPPED_CAPABLE; + /* * We pass in 0 for a blocksize, since we don't * know what the blocksize of this device is, if @@ -391,6 +619,23 @@ passregister(struct cam_periph *periph, void *arg) DEVSTAT_PRIORITY_PASS); /* + * Initialize the taskqueue handler for shutting down kqueue. 
+ */ + TASK_INIT(&softc->shutdown_kqueue_task, /*priority*/ 0, + pass_shutdown_kqueue, periph); + + /* + * Acquire a reference to the periph that we can release once we've + * cleaned up the kqueue. + */ + if (cam_periph_acquire(periph) != CAM_REQ_CMP) { + xpt_print(periph->path, "%s: lost periph during " + "registration!\n", __func__); + cam_periph_lock(periph); + return (CAM_REQ_CMP_ERR); + } + + /* * Acquire a reference to the periph before we create the devfs * instance for it. We'll release this reference once the devfs * instance has been freed. @@ -408,12 +653,15 @@ passregister(struct cam_periph *periph, void *arg) periph->periph_name, periph->unit_number); /* - * Now that we have made the devfs instance, hold a reference to it - * until the task queue has run to setup the physical path alias. - * That way devfs won't get rid of the device before we add our - * alias. + * Hold a reference to the periph before we create the physical + * path alias so it can't go away. */ - dev_ref(softc->dev); + if (cam_periph_acquire(periph) != CAM_REQ_CMP) { + xpt_print(periph->path, "%s: lost periph during " + "registration!\n", __func__); + cam_periph_lock(periph); + return (CAM_REQ_CMP_ERR); + } cam_periph_lock(periph); softc->dev->si_drv1 = periph; @@ -514,6 +762,55 @@ passclose(struct cdev *dev, int flag, int fmt, struct thread *td) softc = periph->softc; softc->open_count--; + if (softc->open_count == 0) { + struct pass_io_req *io_req, *io_req2; + int need_unlock; + + need_unlock = 0; + + TAILQ_FOREACH_SAFE(io_req, &softc->done_queue, links, io_req2) { + TAILQ_REMOVE(&softc->done_queue, io_req, links); + passiocleanup(softc, io_req); + uma_zfree(softc->pass_zone, io_req); + } + + TAILQ_FOREACH_SAFE(io_req, &softc->incoming_queue, links, + io_req2) { + TAILQ_REMOVE(&softc->incoming_queue, io_req, links); + passiocleanup(softc, io_req); + uma_zfree(softc->pass_zone, io_req); + } + + /* + * If there are any active I/Os, we need to forcibly acquire a + * reference to the 
peripheral so that we don't go away + * before they complete. We'll release the reference when + * the abandoned queue is empty. + */ + io_req = TAILQ_FIRST(&softc->active_queue); + if ((io_req != NULL) + && (softc->flags & PASS_FLAG_ABANDONED_REF_SET) == 0) { + cam_periph_doacquire(periph); + softc->flags |= PASS_FLAG_ABANDONED_REF_SET; + } + + /* + * Since the I/O in the active queue is not under our + * control, just set a flag so that we can clean it up when + * it completes and put it on the abandoned queue. This + * will prevent our sending spurious completions in the + * event that the device is opened again before these I/Os + * complete. + */ + TAILQ_FOREACH_SAFE(io_req, &softc->active_queue, links, + io_req2) { + TAILQ_REMOVE(&softc->active_queue, io_req, links); + io_req->flags |= PASS_IO_ABANDONED; + TAILQ_INSERT_TAIL(&softc->abandoned_queue, io_req, + links); + } + } + cam_periph_release_locked(periph); /* @@ -533,6 +830,915 @@ passclose(struct cdev *dev, int flag, int fmt, struct thread *td) return (0); } + +static void +passstart(struct cam_periph *periph, union ccb *start_ccb) +{ + struct pass_softc *softc; + + softc = (struct pass_softc *)periph->softc; + + switch (softc->state) { + case PASS_STATE_NORMAL: { + struct pass_io_req *io_req; + + /* + * Check for any queued I/O requests that require an + * allocated slot. + */ + io_req = TAILQ_FIRST(&softc->incoming_queue); + if (io_req == NULL) { + xpt_release_ccb(start_ccb); + break; + } + TAILQ_REMOVE(&softc->incoming_queue, io_req, links); + TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); + /* + * Merge the user's CCB into the allocated CCB. 
+ */ + xpt_merge_ccb(start_ccb, &io_req->ccb); + start_ccb->ccb_h.ccb_type = PASS_CCB_QUEUED_IO; + start_ccb->ccb_h.ccb_ioreq = io_req; + start_ccb->ccb_h.cbfcnp = passdone; + io_req->alloced_ccb = start_ccb; + binuptime(&io_req->start_time); + devstat_start_transaction(softc->device_stats, + &io_req->start_time); + + xpt_action(start_ccb); + + /* + * If we have any more I/O waiting, schedule ourselves again. + */ + if (!TAILQ_EMPTY(&softc->incoming_queue)) + xpt_schedule(periph, CAM_PRIORITY_NORMAL); + break; + } + default: + break; + } +} + +static void +passdone(struct cam_periph *periph, union ccb *done_ccb) +{ + struct pass_softc *softc; + struct ccb_scsiio *csio; + + softc = (struct pass_softc *)periph->softc; + + cam_periph_assert(periph, MA_OWNED); + + csio = &done_ccb->csio; + switch (csio->ccb_h.ccb_type) { + case PASS_CCB_QUEUED_IO: { + struct pass_io_req *io_req; + + io_req = done_ccb->ccb_h.ccb_ioreq; +#if 0 + xpt_print(periph->path, "%s: called for user CCB %p\n", + __func__, io_req->user_ccb_ptr); +#endif + if (((done_ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) + && (done_ccb->ccb_h.flags & CAM_PASS_ERR_RECOVER) + && ((io_req->flags & PASS_IO_ABANDONED) == 0)) { + int error; + + error = passerror(done_ccb, CAM_RETRY_SELTO, + SF_RETRY_UA | SF_NO_PRINT); + + if (error == ERESTART) { + /* + * A retry was scheduled, so + * just return. + */ + return; + } + } + + /* + * Copy the allocated CCB contents back to the malloced CCB + * so we can give status back to the user when he requests it. + */ + bcopy(done_ccb, &io_req->ccb, sizeof(*done_ccb)); + + /* + * Log data/transaction completion with devstat(9). + */ + switch (done_ccb->ccb_h.func_code) { + case XPT_SCSI_IO: + devstat_end_transaction(softc->device_stats, + done_ccb->csio.dxfer_len - done_ccb->csio.resid, + done_ccb->csio.tag_action & 0x3, + ((done_ccb->ccb_h.flags & CAM_DIR_MASK) == + CAM_DIR_NONE) ? DEVSTAT_NO_DATA : + (done_ccb->ccb_h.flags & CAM_DIR_OUT) ? 
+ DEVSTAT_WRITE : DEVSTAT_READ, NULL, + &io_req->start_time); + break; + case XPT_ATA_IO: + devstat_end_transaction(softc->device_stats, + done_ccb->ataio.dxfer_len - done_ccb->ataio.resid, + done_ccb->ataio.tag_action & 0x3, + ((done_ccb->ccb_h.flags & CAM_DIR_MASK) == + CAM_DIR_NONE) ? DEVSTAT_NO_DATA : + (done_ccb->ccb_h.flags & CAM_DIR_OUT) ? + DEVSTAT_WRITE : DEVSTAT_READ, NULL, + &io_req->start_time); + break; + case XPT_SMP_IO: + /* + * XXX KDM this isn't quite right, but there isn't + * currently an easy way to represent a bidirectional + * transfer in devstat. The only way to do it + * and have the byte counts come out right would + * mean that we would have to record two + * transactions, one for the request and one for the + * response. For now, so that we report something, + * just treat the entire thing as a read. + */ + devstat_end_transaction(softc->device_stats, + done_ccb->smpio.smp_request_len + + done_ccb->smpio.smp_response_len, + DEVSTAT_TAG_SIMPLE, DEVSTAT_READ, NULL, + &io_req->start_time); + break; + default: + devstat_end_transaction(softc->device_stats, 0, + DEVSTAT_TAG_NONE, DEVSTAT_NO_DATA, NULL, + &io_req->start_time); + break; + } + + /* + * In the normal case, take the completed I/O off of the + * active queue and put it on the done queue. Notify the + * user that we have a completed I/O. + */ + if ((io_req->flags & PASS_IO_ABANDONED) == 0) { + TAILQ_REMOVE(&softc->active_queue, io_req, links); + TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); + selwakeuppri(&softc->read_select, PRIBIO); + KNOTE_LOCKED(&softc->read_select.si_note, 0); + } else { + /* + * In the case of an abandoned I/O (final close + * without fetching the I/O), take it off of the + * abandoned queue and free it. 
+ */ + TAILQ_REMOVE(&softc->abandoned_queue, io_req, links); + passiocleanup(softc, io_req); + uma_zfree(softc->pass_zone, io_req); + + /* + * Release the done_ccb here, since we may wind up + * freeing the peripheral when we decrement the + * reference count below. + */ + xpt_release_ccb(done_ccb); + + /* + * If the abandoned queue is empty, we can release + * our reference to the periph since we won't have + * any more completions coming. + */ + if ((TAILQ_EMPTY(&softc->abandoned_queue)) + && (softc->flags & PASS_FLAG_ABANDONED_REF_SET)) { + softc->flags &= ~PASS_FLAG_ABANDONED_REF_SET; + cam_periph_release_locked(periph); + } + + /* + * We have already released the CCB, so we can + * return. + */ + return; + } + break; + } + } + xpt_release_ccb(done_ccb); +} + +static int +passcreatezone(struct cam_periph *periph) +{ + struct pass_softc *softc; + int error; + + error = 0; + softc = (struct pass_softc *)periph->softc; + + cam_periph_assert(periph, MA_OWNED); + KASSERT(((softc->flags & PASS_FLAG_ZONE_VALID) == 0), + ("%s called when the pass(4) zone is valid!\n", __func__)); + KASSERT((softc->pass_zone == NULL), + ("%s called when the pass(4) zone is allocated!\n", __func__)); + + if ((softc->flags & PASS_FLAG_ZONE_INPROG) == 0) { + + /* + * We're the first context through, so we need to create + * the pass(4) UMA zone for I/O requests. + */ + softc->flags |= PASS_FLAG_ZONE_INPROG; + + /* + * uma_zcreate() does a blocking (M_WAITOK) allocation, + * so we cannot hold a mutex while we call it. 
+ */ + cam_periph_unlock(periph); + + softc->pass_zone = uma_zcreate(softc->zone_name, + sizeof(struct pass_io_req), NULL, NULL, NULL, NULL, + /*align*/ 0, /*flags*/ 0); + + softc->pass_io_zone = uma_zcreate(softc->io_zone_name, + softc->io_zone_size, NULL, NULL, NULL, NULL, + /*align*/ 0, /*flags*/ 0); + + cam_periph_lock(periph); + + if ((softc->pass_zone == NULL) + || (softc->pass_io_zone == NULL)) { + if (softc->pass_zone == NULL) + xpt_print(periph->path, "unable to allocate " + "IO Req UMA zone\n"); + else + xpt_print(periph->path, "unable to allocate " + "IO UMA zone\n"); + softc->flags &= ~PASS_FLAG_ZONE_INPROG; + goto bailout; + } + + /* + * Set the flags appropriately and notify any other waiters. + */ + softc->flags &= PASS_FLAG_ZONE_INPROG; + softc->flags |= PASS_FLAG_ZONE_VALID; + wakeup(&softc->pass_zone); + } else { + /* + * In this case, the UMA zone has not yet been created, but + * another context is in the process of creating it. We + * need to sleep until the creation is either done or has + * failed. + */ + while ((softc->flags & PASS_FLAG_ZONE_INPROG) + && ((softc->flags & PASS_FLAG_ZONE_VALID) == 0)) { + error = msleep(&softc->pass_zone, + cam_periph_mtx(periph), PRIBIO, + "paszon", 0); + if (error != 0) + goto bailout; + } + /* + * If the zone creation failed, no luck for the user. 
+ */ + if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0){ + error = ENOMEM; + goto bailout; + } + } +bailout: + return (error); +} + +static void +passiocleanup(struct pass_softc *softc, struct pass_io_req *io_req) +{ + union ccb *ccb; + u_int8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; + int i, numbufs; + + ccb = &io_req->ccb; + + switch (ccb->ccb_h.func_code) { + case XPT_DEV_MATCH: + numbufs = min(io_req->num_bufs, 2); + + if (numbufs == 1) { + data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; + } else { + data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; + data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; + } + break; + case XPT_SCSI_IO: + case XPT_CONT_TARGET_IO: + data_ptrs[0] = &ccb->csio.data_ptr; + numbufs = min(io_req->num_bufs, 1); + break; + case XPT_ATA_IO: + data_ptrs[0] = &ccb->ataio.data_ptr; + numbufs = min(io_req->num_bufs, 1); + break; + case XPT_SMP_IO: + numbufs = min(io_req->num_bufs, 2); + data_ptrs[0] = &ccb->smpio.smp_request; + data_ptrs[1] = &ccb->smpio.smp_response; + break; + case XPT_DEV_ADVINFO: + numbufs = min(io_req->num_bufs, 1); + data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; + break; + default: + /* allow ourselves to be swapped once again */ + return; + break; /* NOTREACHED */ + } + + if (io_req->flags & PASS_IO_USER_SEG_MALLOC) { + free(io_req->user_segptr, M_SCSIPASS); + io_req->user_segptr = NULL; + } + + /* + * We only want to free memory we malloced. 
+ */ + if (io_req->data_flags == CAM_DATA_VADDR) { + for (i = 0; i < io_req->num_bufs; i++) { + if (io_req->kern_bufs[i] == NULL) + continue; + + free(io_req->kern_bufs[i], M_SCSIPASS); + io_req->kern_bufs[i] = NULL; + } + } else if (io_req->data_flags == CAM_DATA_SG) { + for (i = 0; i < io_req->num_kern_segs; i++) { + if ((uint8_t *)(uintptr_t) + io_req->kern_segptr[i].ds_addr == NULL) + continue; + + uma_zfree(softc->pass_io_zone, (uint8_t *)(uintptr_t) + io_req->kern_segptr[i].ds_addr); + io_req->kern_segptr[i].ds_addr = 0; + } + } + + if (io_req->flags & PASS_IO_KERN_SEG_MALLOC) { + free(io_req->kern_segptr, M_SCSIPASS); + io_req->kern_segptr = NULL; + } + + if (io_req->data_flags != CAM_DATA_PADDR) { + for (i = 0; i < numbufs; i++) { + /* + * Restore the user's buffer pointers to their + * previous values. + */ + if (io_req->user_bufs[i] != NULL) + *data_ptrs[i] = io_req->user_bufs[i]; + } + } + +} + +static int +passcopysglist(struct cam_periph *periph, struct pass_io_req *io_req, + ccb_flags direction) +{ + bus_size_t kern_watermark, user_watermark, len_copied, len_to_copy; + bus_dma_segment_t *user_sglist, *kern_sglist; + int i, j, error; + + error = 0; + kern_watermark = 0; + user_watermark = 0; + len_to_copy = 0; + len_copied = 0; + user_sglist = io_req->user_segptr; + kern_sglist = io_req->kern_segptr; + + for (i = 0, j = 0; i < io_req->num_user_segs && + j < io_req->num_kern_segs;) { + uint8_t *user_ptr, *kern_ptr; + + len_to_copy = min(user_sglist[i].ds_len -user_watermark, + kern_sglist[j].ds_len - kern_watermark); + + user_ptr = (uint8_t *)(uintptr_t)user_sglist[i].ds_addr; + user_ptr = user_ptr + user_watermark; + kern_ptr = (uint8_t *)(uintptr_t)kern_sglist[j].ds_addr; + kern_ptr = kern_ptr + kern_watermark; + + user_watermark += len_to_copy; + kern_watermark += len_to_copy; + + if (!useracc(user_ptr, len_to_copy, + (direction == CAM_DIR_IN) ? 
VM_PROT_WRITE : VM_PROT_READ)) { + xpt_print(periph->path, "%s: unable to access user " + "S/G list element %p len %zu\n", __func__, + user_ptr, len_to_copy); + error = EFAULT; + goto bailout; + } + + if (direction == CAM_DIR_IN) { + error = copyout(kern_ptr, user_ptr, len_to_copy); + if (error != 0) { + xpt_print(periph->path, "%s: copyout of %u " + "bytes from %p to %p failed with " + "error %d\n", __func__, len_to_copy, + kern_ptr, user_ptr, error); + goto bailout; + } + } else { + error = copyin(user_ptr, kern_ptr, len_to_copy); + if (error != 0) { + xpt_print(periph->path, "%s: copyin of %u " + "bytes from %p to %p failed with " + "error %d\n", __func__, len_to_copy, + user_ptr, kern_ptr, error); + goto bailout; + } + } + + len_copied += len_to_copy; + + if (user_sglist[i].ds_len == user_watermark) { + i++; + user_watermark = 0; + } + + if (kern_sglist[j].ds_len == kern_watermark) { + j++; + kern_watermark = 0; + } + } + +bailout: + + return (error); +} + +static int +passmemsetup(struct cam_periph *periph, struct pass_io_req *io_req) +{ + union ccb *ccb; + struct pass_softc *softc; + int numbufs, i; + uint8_t **data_ptrs[CAM_PERIPH_MAXMAPS]; + uint32_t lengths[CAM_PERIPH_MAXMAPS]; + uint32_t dirs[CAM_PERIPH_MAXMAPS]; + uint32_t num_segs; + uint16_t *seg_cnt_ptr; + size_t maxmap; + int error; + + cam_periph_assert(periph, MA_NOTOWNED); + + softc = periph->softc; + + error = 0; + ccb = &io_req->ccb; + maxmap = 0; + num_segs = 0; + seg_cnt_ptr = NULL; + + switch(ccb->ccb_h.func_code) { + case XPT_DEV_MATCH: + if (ccb->cdm.match_buf_len == 0) { + printf("%s: invalid match buffer length 0\n", __func__); + return(EINVAL); + } + if (ccb->cdm.pattern_buf_len > 0) { + data_ptrs[0] = (u_int8_t **)&ccb->cdm.patterns; + lengths[0] = ccb->cdm.pattern_buf_len; + dirs[0] = CAM_DIR_OUT; + data_ptrs[1] = (u_int8_t **)&ccb->cdm.matches; + lengths[1] = ccb->cdm.match_buf_len; + dirs[1] = CAM_DIR_IN; + numbufs = 2; + } else { + data_ptrs[0] = (u_int8_t **)&ccb->cdm.matches; + 
lengths[0] = ccb->cdm.match_buf_len; + dirs[0] = CAM_DIR_IN; + numbufs = 1; + } + io_req->data_flags = CAM_DATA_VADDR; + break; + case XPT_SCSI_IO: + case XPT_CONT_TARGET_IO: + if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) + return(0); + + /* + * The user shouldn't be able to supply a bio. + */ + if ((ccb->ccb_h.flags & CAM_DATA_MASK) == CAM_DATA_BIO) + return (EINVAL); + + io_req->data_flags = ccb->ccb_h.flags & CAM_DATA_MASK; + + data_ptrs[0] = &ccb->csio.data_ptr; + lengths[0] = ccb->csio.dxfer_len; + dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; + num_segs = ccb->csio.sglist_cnt; + seg_cnt_ptr = &ccb->csio.sglist_cnt; + numbufs = 1; + maxmap = softc->maxio; + break; + case XPT_ATA_IO: + if ((ccb->ccb_h.flags & CAM_DIR_MASK) == CAM_DIR_NONE) + return(0); + + /* + * We only support a single virtual address for ATA I/O. + */ + if ((ccb->ccb_h.flags & CAM_DATA_MASK) != CAM_DATA_VADDR) + return (EINVAL); + + io_req->data_flags = CAM_DATA_VADDR; + + data_ptrs[0] = &ccb->ataio.data_ptr; + lengths[0] = ccb->ataio.dxfer_len; + dirs[0] = ccb->ccb_h.flags & CAM_DIR_MASK; + numbufs = 1; + maxmap = softc->maxio; + break; + case XPT_SMP_IO: + io_req->data_flags = CAM_DATA_VADDR; + + data_ptrs[0] = &ccb->smpio.smp_request; + lengths[0] = ccb->smpio.smp_request_len; + dirs[0] = CAM_DIR_OUT; + data_ptrs[1] = &ccb->smpio.smp_response; + lengths[1] = ccb->smpio.smp_response_len; + dirs[1] = CAM_DIR_IN; + numbufs = 2; + maxmap = softc->maxio; + break; + case XPT_DEV_ADVINFO: + if (ccb->cdai.bufsiz == 0) + return (0); + + io_req->data_flags = CAM_DATA_VADDR; + + data_ptrs[0] = (uint8_t **)&ccb->cdai.buf; + lengths[0] = ccb->cdai.bufsiz; + dirs[0] = CAM_DIR_IN; + numbufs = 1; + break; + default: + return(EINVAL); + break; /* NOTREACHED */ + } + + io_req->num_bufs = numbufs; + + /* + * If there is a maximum, check to make sure that the user's + * request fits within the limit. In general, we should only have + * a maximum length for requests that go to hardware. 
Otherwise it + * is whatever we're able to malloc. + */ + for (i = 0; i < numbufs; i++) { + io_req->user_bufs[i] = *data_ptrs[i]; + io_req->dirs[i] = dirs[i]; + io_req->lengths[i] = lengths[i]; + + if (maxmap == 0) + continue; + + if (lengths[i] <= maxmap) + continue; + + xpt_print(periph->path, "%s: data length %u > max allowed %u " + "bytes\n", __func__, lengths[i], maxmap); + error = EINVAL; + goto bailout; + } + + switch (io_req->data_flags) { + case CAM_DATA_VADDR: + /* Map or copy the buffer into kernel address space */ + for (i = 0; i < numbufs; i++) { + uint8_t *tmp_buf; + + /* + * If for some reason no length is specified, we + * don't need to allocate anything. + */ + if (io_req->lengths[i] == 0) + continue; + + /* + * Make sure that the user's buffer is accessible + * to that process. + */ + if (!useracc(io_req->user_bufs[i], io_req->lengths[i], + (io_req->dirs[i] == CAM_DIR_IN) ? VM_PROT_WRITE : + VM_PROT_READ)) { + xpt_print(periph->path, "%s: user address %p " + "length %u is not accessible\n", __func__, + io_req->user_bufs[i], io_req->lengths[i]); + error = EFAULT; + goto bailout; + } + + tmp_buf = malloc(lengths[i], M_SCSIPASS, + M_WAITOK | M_ZERO); + io_req->kern_bufs[i] = tmp_buf; + *data_ptrs[i] = tmp_buf; + +#if 0 + xpt_print(periph->path, "%s: malloced %p len %u, user " + "buffer %p, operation: %s\n", __func__, + tmp_buf, lengths[i], io_req->user_bufs[i], + (dirs[i] == CAM_DIR_IN) ? "read" : "write"); +#endif + /* + * We only need to copy in if the user is writing. 
+ */ + if (dirs[i] != CAM_DIR_OUT) + continue; + + error = copyin(io_req->user_bufs[i], + io_req->kern_bufs[i], lengths[i]); + if (error != 0) { + xpt_print(periph->path, "%s: copy of user " + "buffer from %p to %p failed with " + "error %d\n", __func__, + io_req->user_bufs[i], + io_req->kern_bufs[i], error); + goto bailout; + } + } + break; + case CAM_DATA_PADDR: + /* Pass down the pointer as-is */ + break; + case CAM_DATA_SG: { + size_t sg_length, size_to_go, alloc_size; + uint32_t num_segs_needed; + + /* + * Copy the user S/G list in, and then copy in the + * individual segments. + */ + /* + * We shouldn't see this, but check just in case. + */ + if (numbufs != 1) { + xpt_print(periph->path, "%s: cannot currently handle " + "more than one S/G list per CCB\n", __func__); + error = EINVAL; + goto bailout; + } + + /* + * We have to have at least one segment. + */ + if (num_segs == 0) { + xpt_print(periph->path, "%s: CAM_DATA_SG flag set, " + "but sglist_cnt=0!\n", __func__); + error = EINVAL; + goto bailout; + } + + /* + * Make sure the user specified the total length and didn't + * just leave it to us to decode the S/G list. + */ + if (lengths[0] == 0) { + xpt_print(periph->path, "%s: no dxfer_len specified, " + "but CAM_DATA_SG flag is set!\n", __func__); + error = EINVAL; + goto bailout; + } + + /* + * We allocate buffers in io_zone_size increments for an + * S/G list. This will generally be MAXPHYS. 
+ */ + if (lengths[0] <= softc->io_zone_size) + num_segs_needed = 1; + else { + num_segs_needed = lengths[0] / softc->io_zone_size; + if ((lengths[0] % softc->io_zone_size) != 0) + num_segs_needed++; + } + + /* Figure out the size of the S/G list */ + sg_length = num_segs * sizeof(bus_dma_segment_t); + io_req->num_user_segs = num_segs; + io_req->num_kern_segs = num_segs_needed; + + /* Save the user's S/G list pointer for later restoration */ + io_req->user_bufs[0] = *data_ptrs[0]; + + /* + * If we have enough segments allocated by default to handle + * the length of the user's S/G list, + */ + if (num_segs > PASS_MAX_SEGS) { + io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * + num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); + io_req->flags |= PASS_IO_USER_SEG_MALLOC; + } else + io_req->user_segptr = io_req->user_segs; + + if (!useracc(*data_ptrs[0], sg_length, VM_PROT_READ)) { + xpt_print(periph->path, "%s: unable to access user " + "S/G list at %p\n", __func__, *data_ptrs[0]); + error = EFAULT; + goto bailout; + } + + error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); + if (error != 0) { + xpt_print(periph->path, "%s: copy of user S/G list " + "from %p to %p failed with error %d\n", + __func__, *data_ptrs[0], io_req->user_segptr, + error); + goto bailout; + } + + if (num_segs_needed > PASS_MAX_SEGS) { + io_req->kern_segptr = malloc(sizeof(bus_dma_segment_t) * + num_segs_needed, M_SCSIPASS, M_WAITOK | M_ZERO); + io_req->flags |= PASS_IO_KERN_SEG_MALLOC; + } else { + io_req->kern_segptr = io_req->kern_segs; + } + + /* + * Allocate the kernel S/G list. 
+ */ + for (size_to_go = lengths[0], i = 0; + size_to_go > 0 && i < num_segs_needed; + i++, size_to_go -= alloc_size) { + uint8_t *kern_ptr; + + alloc_size = min(size_to_go, softc->io_zone_size); + kern_ptr = uma_zalloc(softc->pass_io_zone, M_WAITOK); + io_req->kern_segptr[i].ds_addr = + (bus_addr_t)(uintptr_t)kern_ptr; + io_req->kern_segptr[i].ds_len = alloc_size; + } + if (size_to_go > 0) { + printf("%s: size_to_go = %zu, software error!\n", + __func__, size_to_go); + error = EINVAL; + goto bailout; + } + + *data_ptrs[0] = (uint8_t *)io_req->kern_segptr; + *seg_cnt_ptr = io_req->num_kern_segs; + + /* + * We only need to copy data here if the user is writing. + */ + if (dirs[0] == CAM_DIR_OUT) + error = passcopysglist(periph, io_req, dirs[0]); + break; + } + case CAM_DATA_SG_PADDR: { + size_t sg_length; + + /* + * We shouldn't see this, but check just in case. + */ + if (numbufs != 1) { + printf("%s: cannot currently handle more than one " + "S/G list per CCB\n", __func__); + error = EINVAL; + goto bailout; + } + + /* + * We have to have at least one segment. + */ + if (num_segs == 0) { + xpt_print(periph->path, "%s: CAM_DATA_SG_PADDR flag " + "set, but sglist_cnt=0!\n", __func__); + error = EINVAL; + goto bailout; + } + + /* + * Make sure the user specified the total length and didn't + * just leave it to us to decode the S/G list. 
+ */ + if (lengths[0] == 0) { + xpt_print(periph->path, "%s: no dxfer_len specified, " + "but CAM_DATA_SG flag is set!\n", __func__); + error = EINVAL; + goto bailout; + } + + /* Figure out the size of the S/G list */ + sg_length = num_segs * sizeof(bus_dma_segment_t); + io_req->num_user_segs = num_segs; + io_req->num_kern_segs = io_req->num_user_segs; + + /* Save the user's S/G list pointer for later restoration */ + io_req->user_bufs[0] = *data_ptrs[0]; + + if (num_segs > PASS_MAX_SEGS) { + io_req->user_segptr = malloc(sizeof(bus_dma_segment_t) * + num_segs, M_SCSIPASS, M_WAITOK | M_ZERO); + io_req->flags |= PASS_IO_USER_SEG_MALLOC; + } else + io_req->user_segptr = io_req->user_segs; + + io_req->kern_segptr = io_req->user_segptr; + + error = copyin(*data_ptrs[0], io_req->user_segptr, sg_length); + if (error != 0) { + xpt_print(periph->path, "%s: copy of user S/G list " + "from %p to %p failed with error %d\n", + __func__, *data_ptrs[0], io_req->user_segptr, + error); + goto bailout; + } + break; + } + default: + case CAM_DATA_BIO: + /* + * A user shouldn't be attaching a bio to the CCB. It + * isn't a user-accessible structure. + */ + error = EINVAL; + break; + } + +bailout: + if (error != 0) + passiocleanup(softc, io_req); + + return (error); +} + +static int +passmemdone(struct cam_periph *periph, struct pass_io_req *io_req) +{ + struct pass_softc *softc; + union ccb *ccb; + int error; + int i; + + error = 0; + softc = (struct pass_softc *)periph->softc; + ccb = &io_req->ccb; + + switch (io_req->data_flags) { + case CAM_DATA_VADDR: + /* + * Copy back to the user buffer if this was a read. 
+ */ + for (i = 0; i < io_req->num_bufs; i++) { + if (io_req->dirs[i] != CAM_DIR_IN) + continue; + + error = copyout(io_req->kern_bufs[i], + io_req->user_bufs[i], io_req->lengths[i]); + if (error != 0) { + xpt_print(periph->path, "Unable to copy %u " + "bytes from %p to user address %p\n", + io_req->lengths[i], + io_req->kern_bufs[i], + io_req->user_bufs[i]); + goto bailout; + } + + } + break; + case CAM_DATA_PADDR: + /* Do nothing. The pointer is a physical address already */ + break; + case CAM_DATA_SG: + /* + * Copy back to the user buffer if this was a read. + * Restore the user's S/G list buffer pointer. + */ + if (io_req->dirs[0] == CAM_DIR_IN) + error = passcopysglist(periph, io_req, io_req->dirs[0]); + break; + case CAM_DATA_SG_PADDR: + /* + * Restore the user's S/G list buffer pointer. No need to + * copy. + */ + break; + default: + case CAM_DATA_BIO: + error = EINVAL; + break; + } + +bailout: + /* + * Reset the user's pointers to their original values and free + * allocated memory. + */ + passiocleanup(softc, io_req); + + return (error); +} + static int passioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread *td) { @@ -622,15 +1828,317 @@ passdoioctl(struct cdev *dev, u_long cmd, caddr_t addr, int flag, struct thread break; } + case CAMIOQUEUE: + { + struct pass_io_req *io_req; + union ccb **user_ccb, *ccb; + xpt_opcode fc; + + if ((softc->flags & PASS_FLAG_ZONE_VALID) == 0) { + error = passcreatezone(periph); + if (error != 0) + goto bailout; + } + + /* + * We're going to do a blocking allocation for this I/O + * request, so we have to drop the lock. 
+ */ + cam_periph_unlock(periph); + + io_req = uma_zalloc(softc->pass_zone, M_WAITOK | M_ZERO); + ccb = &io_req->ccb; + user_ccb = (union ccb **)addr; + + /* + * Unlike the CAMIOCOMMAND ioctl above, we only have a + * pointer to the user's CCB, so we have to copy the whole + * thing in to a buffer we have allocated (above) instead + * of allowing the ioctl code to malloc a buffer and copy + * it in. + * + * This is an advantage for this asynchronous interface, + * since we don't want the memory to get freed while the + * CCB is outstanding. + */ +#if 0 + xpt_print(periph->path, "Copying user CCB %p to " + "kernel address %p\n", *user_ccb, ccb); +#endif + error = copyin(*user_ccb, ccb, sizeof(*ccb)); + if (error != 0) { + xpt_print(periph->path, "Copy of user CCB %p to " + "kernel address %p failed with error %d\n", + *user_ccb, ccb, error); + uma_zfree(softc->pass_zone, io_req); + cam_periph_lock(periph); + break; + } + + /* + * Some CCB types, like scan bus and scan lun can only go + * through the transport layer device. + */ + if (ccb->ccb_h.func_code & XPT_FC_XPT_ONLY) { + xpt_print(periph->path, "CCB function code %#x is " + "restricted to the XPT device\n", + ccb->ccb_h.func_code); + uma_zfree(softc->pass_zone, io_req); + cam_periph_lock(periph); + error = ENODEV; + break; + } + + /* + * Save the user's CCB pointer as well as his linked list + * pointers and peripheral private area so that we can + * restore these later. + */ + io_req->user_ccb_ptr = *user_ccb; + io_req->user_periph_links = ccb->ccb_h.periph_links; + io_req->user_periph_priv = ccb->ccb_h.periph_priv; + + /* + * Now that we've saved the user's values, we can set our + * own peripheral private entry. + */ + ccb->ccb_h.ccb_ioreq = io_req; + + /* Compatibility for RL/priority-unaware code. */ + priority = ccb->ccb_h.pinfo.priority; + if (priority <= CAM_PRIORITY_OOB) + priority += CAM_PRIORITY_OOB + 1; + + /* + * Setup fields in the CCB like the path and the priority. 
+ * The path in particular cannot be done in userland, since + * it is a pointer to a kernel data structure. + */ + xpt_setup_ccb_flags(&ccb->ccb_h, periph->path, priority, + ccb->ccb_h.flags); + + /* + * Setup our done routine. There is no way for the user to + * have a valid pointer here. + */ + ccb->ccb_h.cbfcnp = passdone; + + fc = ccb->ccb_h.func_code; + /* + * If this function code has memory that can be mapped in + * or out, we need to call passmemsetup(). + */ + if ((fc == XPT_SCSI_IO) || (fc == XPT_ATA_IO) + || (fc == XPT_SMP_IO) || (fc == XPT_DEV_MATCH) + || (fc == XPT_DEV_ADVINFO)) { + error = passmemsetup(periph, io_req); + if (error != 0) { + uma_zfree(softc->pass_zone, io_req); + cam_periph_lock(periph); + break; + } + } else + io_req->mapinfo.num_bufs_used = 0; + + cam_periph_lock(periph); + + /* + * Everything goes on the incoming queue initially. + */ + TAILQ_INSERT_TAIL(&softc->incoming_queue, io_req, links); + + /* + * If the CCB is queued, and is not a user CCB, then + * we need to allocate a slot for it. Call xpt_schedule() + * so that our start routine will get called when a CCB is + * available. + */ + if ((fc & XPT_FC_QUEUED) + && ((fc & XPT_FC_USER_CCB) == 0)) { + xpt_schedule(periph, priority); + break; + } + + /* + * At this point, the CCB in question is either an + * immediate CCB (like XPT_DEV_ADVINFO) or it is a user CCB + * and therefore should be malloced, not allocated via a slot. + * Remove the CCB from the incoming queue and add it to the + * active queue. + */ + TAILQ_REMOVE(&softc->incoming_queue, io_req, links); + TAILQ_INSERT_TAIL(&softc->active_queue, io_req, links); + + xpt_action(ccb); + + /* + * If this is not a queued CCB (i.e. it is an immediate CCB), + * then it is already done. We need to put it on the done + * queue for the user to fetch. 
+ */ + if ((fc & XPT_FC_QUEUED) == 0) { + TAILQ_REMOVE(&softc->active_queue, io_req, links); + TAILQ_INSERT_TAIL(&softc->done_queue, io_req, links); + } + break; + } + case CAMIOGET: + { + union ccb **user_ccb; + struct pass_io_req *io_req; + int old_error; + + user_ccb = (union ccb **)addr; + old_error = 0; + + io_req = TAILQ_FIRST(&softc->done_queue); + if (io_req == NULL) { + error = ENOENT; + break; + } + + /* + * Remove the I/O from the done queue. + */ + TAILQ_REMOVE(&softc->done_queue, io_req, links); + + /* + * We have to drop the lock during the copyout because the + * copyout can result in VM faults that require sleeping. + */ + cam_periph_unlock(periph); + + /* + * Do any needed copies (e.g. for reads) and revert the + * pointers in the CCB back to the user's pointers. + */ + error = passmemdone(periph, io_req); + + old_error = error; + + io_req->ccb.ccb_h.periph_links = io_req->user_periph_links; + io_req->ccb.ccb_h.periph_priv = io_req->user_periph_priv; + +#if 0 + xpt_print(periph->path, "Copying to user CCB %p from " + "kernel address %p\n", *user_ccb, &io_req->ccb); +#endif + + error = copyout(&io_req->ccb, *user_ccb, sizeof(union ccb)); + if (error != 0) { + xpt_print(periph->path, "Copy to user CCB %p from " + "kernel address %p failed with error %d\n", + *user_ccb, &io_req->ccb, error); + } + + /* + * Prefer the first error we got back, and make sure we + * don't overwrite bad status with good. + */ + if (old_error != 0) + error = old_error; + + cam_periph_lock(periph); + + /* + * At this point, if there was an error, we could potentially + * re-queue the I/O and try again. But why? The error + * would almost certainly happen again. We might as well + * not leak memory. 
+ */ + uma_zfree(softc->pass_zone, io_req); + break; + } default: error = cam_periph_ioctl(periph, cmd, addr, passerror); break; } +bailout: cam_periph_unlock(periph); + return(error); } +static int +passpoll(struct cdev *dev, int poll_events, struct thread *td) +{ + struct cam_periph *periph; + struct pass_softc *softc; + int revents; + + periph = (struct cam_periph *)dev->si_drv1; + if (periph == NULL) + return (ENXIO); + + softc = (struct pass_softc *)periph->softc; + + revents = poll_events & (POLLOUT | POLLWRNORM); + if ((poll_events & (POLLIN | POLLRDNORM)) != 0) { + cam_periph_lock(periph); + + if (!TAILQ_EMPTY(&softc->done_queue)) { + revents |= poll_events & (POLLIN | POLLRDNORM); + } + cam_periph_unlock(periph); + if (revents == 0) + selrecord(td, &softc->read_select); + } + + return (revents); +} + +static int +passkqfilter(struct cdev *dev, struct knote *kn) +{ + struct cam_periph *periph; + struct pass_softc *softc; + + periph = (struct cam_periph *)dev->si_drv1; + if (periph == NULL) + return (ENXIO); + + softc = (struct pass_softc *)periph->softc; + + kn->kn_hook = (caddr_t)periph; + kn->kn_fop = &passread_filtops; + knlist_add(&softc->read_select.si_note, kn, 0); + + return (0); +} + +static void +passreadfiltdetach(struct knote *kn) +{ + struct cam_periph *periph; + struct pass_softc *softc; + + periph = (struct cam_periph *)kn->kn_hook; + softc = (struct pass_softc *)periph->softc; + + knlist_remove(&softc->read_select.si_note, kn, 0); +} + +static int +passreadfilt(struct knote *kn, long hint) +{ + struct cam_periph *periph; + struct pass_softc *softc; + int retval; + + periph = (struct cam_periph *)kn->kn_hook; + softc = (struct pass_softc *)periph->softc; + + cam_periph_assert(periph, MA_OWNED); + + if (TAILQ_EMPTY(&softc->done_queue)) + retval = 0; + else + retval = 1; + + return (retval); +} + /* * Generally, "ccb" should be the CCB supplied by the kernel. "inccb" * should be the CCB that is copied in from the user. 
@@ -652,6 +2160,10 @@ passsendccb(struct cam_periph *periph, union ccb *ccb, union ccb *inccb) xpt_merge_ccb(ccb, inccb); /* + */ + ccb->ccb_h.cbfcnp = passdone; + + /* * Let cam_periph_mapmem do a sanity check on the data pointer format. * Even if no data transfer is needed, it's a cheap check and it * simplifies the code. diff --git a/sys/cam/scsi/scsi_pass.h b/sys/cam/scsi/scsi_pass.h index ae0e058..797ef08 100644 --- a/sys/cam/scsi/scsi_pass.h +++ b/sys/cam/scsi/scsi_pass.h @@ -39,4 +39,12 @@ #define CAMIOCOMMAND _IOWR(CAM_VERSION, 2, union ccb) #define CAMGETPASSTHRU _IOWR(CAM_VERSION, 3, union ccb) +/* + * These two ioctls take a union ccb *, but that is not explicitly declared + * to avoid having the ioctl handling code malloc and free their own copy + * of the CCB or the CCB pointer. + */ +#define CAMIOQUEUE _IO(CAM_VERSION, 4) +#define CAMIOGET _IO(CAM_VERSION, 5) + #endif diff --git a/sys/dev/md/md.c b/sys/dev/md/md.c index dccd5b3..27ef8b3 100644 --- a/sys/dev/md/md.c +++ b/sys/dev/md/md.c @@ -99,6 +99,8 @@ #include <vm/swap_pager.h> #include <vm/uma.h> +#include <machine/bus.h> + #define MD_MODVER 1 #define MD_SHUTDOWN 0x10000 /* Tell worker thread to terminate. 
*/ @@ -435,7 +437,7 @@ g_md_start(struct bio *bp) #define MD_MALLOC_MOVE_CMP 5 static int -md_malloc_move(vm_page_t **mp, int *ma_offs, unsigned sectorsize, +md_malloc_move_ma(vm_page_t **mp, int *ma_offs, unsigned sectorsize, void *ptr, u_char fill, int op) { struct sf_buf *sf; @@ -497,7 +499,7 @@ md_malloc_move(vm_page_t **mp, int *ma_offs, unsigned sectorsize, } break; default: - KASSERT(0, ("md_malloc_move unknown op %d\n", op)); + KASSERT(0, ("md_malloc_move_ma unknown op %d\n", op)); break; } if (error != 0) @@ -520,10 +522,68 @@ md_malloc_move(vm_page_t **mp, int *ma_offs, unsigned sectorsize, } static int +md_malloc_move_vlist(bus_dma_segment_t **pvlist, int *pma_offs, + unsigned len, void *ptr, u_char fill, int op) +{ + bus_dma_segment_t *vlist; + uint8_t *p, *end, first; + off_t *uc; + int ma_offs, seg_len; + + vlist = *pvlist; + ma_offs = *pma_offs; + uc = ptr; + + for (; len != 0; len -= seg_len) { + seg_len = imin(vlist->ds_len - ma_offs, len); + p = (uint8_t *)(uintptr_t)vlist->ds_addr + ma_offs; + switch (op) { + case MD_MALLOC_MOVE_ZERO: + bzero(p, seg_len); + break; + case MD_MALLOC_MOVE_FILL: + memset(p, fill, seg_len); + break; + case MD_MALLOC_MOVE_READ: + bcopy(ptr, p, seg_len); + cpu_flush_dcache(p, seg_len); + break; + case MD_MALLOC_MOVE_WRITE: + bcopy(p, ptr, seg_len); + break; + case MD_MALLOC_MOVE_CMP: + end = p + seg_len; + first = *uc = *p; + /* Confirm all following bytes match the first */ + while (++p < end) { + if (*p != first) + return (EDOOFUS); + } + break; + default: + KASSERT(0, ("md_malloc_move_vlist unknown op %d\n", op)); + break; + } + + ma_offs += seg_len; + if (ma_offs == vlist->ds_len) { + ma_offs = 0; + vlist++; + } + ptr = (uint8_t *)ptr + seg_len; + } + *pvlist = vlist; + *pma_offs = ma_offs; + + return (0); +} + +static int mdstart_malloc(struct md_s *sc, struct bio *bp) { u_char *dst; vm_page_t *m; + bus_dma_segment_t *vlist; int i, error, error1, ma_offs, notmapped; off_t secno, nsec, uc; uintptr_t sp, osp; @@ 
-538,10 +598,16 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) } notmapped = (bp->bio_flags & BIO_UNMAPPED) != 0; + vlist = (bp->bio_flags & BIO_VLIST) != 0 ? + (bus_dma_segment_t *)bp->bio_data : NULL; if (notmapped) { m = bp->bio_ma; ma_offs = bp->bio_ma_offset; dst = NULL; + KASSERT(vlist == NULL, ("vlists cannot be unmapped")); + } else if (vlist != NULL) { + ma_offs = bp->bio_ma_offset; + dst = NULL; } else { dst = bp->bio_data; } @@ -557,23 +623,36 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) } else if (bp->bio_cmd == BIO_READ) { if (osp == 0) { if (notmapped) { - error = md_malloc_move(&m, &ma_offs, + error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, NULL, 0, MD_MALLOC_MOVE_ZERO); + } else if (vlist != NULL) { + error = md_malloc_move_vlist(&vlist, + &ma_offs, sc->sectorsize, NULL, 0, + MD_MALLOC_MOVE_ZERO); } else bzero(dst, sc->sectorsize); } else if (osp <= 255) { if (notmapped) { - error = md_malloc_move(&m, &ma_offs, + error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, NULL, osp, MD_MALLOC_MOVE_FILL); + } else if (vlist != NULL) { + error = md_malloc_move_vlist(&vlist, + &ma_offs, sc->sectorsize, NULL, osp, + MD_MALLOC_MOVE_FILL); } else memset(dst, osp, sc->sectorsize); } else { if (notmapped) { - error = md_malloc_move(&m, &ma_offs, + error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_READ); + } else if (vlist != NULL) { + error = md_malloc_move_vlist(&vlist, + &ma_offs, sc->sectorsize, + (void *)osp, 0, + MD_MALLOC_MOVE_READ); } else { bcopy((void *)osp, dst, sc->sectorsize); cpu_flush_dcache(dst, sc->sectorsize); @@ -583,10 +662,15 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) } else if (bp->bio_cmd == BIO_WRITE) { if (sc->flags & MD_COMPRESS) { if (notmapped) { - error1 = md_malloc_move(&m, &ma_offs, + error1 = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, &uc, 0, MD_MALLOC_MOVE_CMP); i = error1 == 0 ? 
sc->sectorsize : 0; + } else if (vlist != NULL) { + error1 = md_malloc_move_vlist(&vlist, + &ma_offs, sc->sectorsize, &uc, 0, + MD_MALLOC_MOVE_CMP); + i = error1 == 0 ? sc->sectorsize : 0; } else { uc = dst[0]; for (i = 1; i < sc->sectorsize; i++) { @@ -611,10 +695,15 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) break; } if (notmapped) { - error = md_malloc_move(&m, + error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)sp, 0, MD_MALLOC_MOVE_WRITE); + } else if (vlist != NULL) { + error = md_malloc_move_vlist( + &vlist, &ma_offs, + sc->sectorsize, (void *)sp, + 0, MD_MALLOC_MOVE_WRITE); } else { bcopy(dst, (void *)sp, sc->sectorsize); @@ -622,10 +711,15 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) error = s_write(sc->indir, secno, sp); } else { if (notmapped) { - error = md_malloc_move(&m, + error = md_malloc_move_ma(&m, &ma_offs, sc->sectorsize, (void *)osp, 0, MD_MALLOC_MOVE_WRITE); + } else if (vlist != NULL) { + error = md_malloc_move_vlist( + &vlist, &ma_offs, + sc->sectorsize, (void *)osp, + 0, MD_MALLOC_MOVE_WRITE); } else { bcopy(dst, (void *)osp, sc->sectorsize); @@ -641,26 +735,78 @@ mdstart_malloc(struct md_s *sc, struct bio *bp) if (error != 0) break; secno++; - if (!notmapped) + if (!notmapped && vlist == NULL) dst += sc->sectorsize; } bp->bio_resid = 0; return (error); } +static void +mdcopyto_vlist(void *src, bus_dma_segment_t *vlist, off_t offset, off_t len) +{ + off_t seg_len; + + while (offset >= vlist->ds_len) { + offset -= vlist->ds_len; + vlist++; + } + + while (len != 0) { + seg_len = omin(len, vlist->ds_len - offset); + bcopy(src, (void *)(uintptr_t)(vlist->ds_addr + offset), + seg_len); + offset = 0; + src = (uint8_t *)src + seg_len; + len -= seg_len; + vlist++; + } +} + +static void +mdcopyfrom_vlist(bus_dma_segment_t *vlist, off_t offset, void *dst, off_t len) +{ + off_t seg_len; + + while (offset >= vlist->ds_len) { + offset -= vlist->ds_len; + vlist++; + } + + while (len != 0) { + seg_len = omin(len, vlist->ds_len 
- offset); + bcopy((void *)(uintptr_t)(vlist->ds_addr + offset), dst, + seg_len); + offset = 0; + dst = (uint8_t *)dst + seg_len; + len -= seg_len; + vlist++; + } +} + static int mdstart_preload(struct md_s *sc, struct bio *bp) { + uint8_t *p; + p = sc->pl_ptr + bp->bio_offset; switch (bp->bio_cmd) { case BIO_READ: - bcopy(sc->pl_ptr + bp->bio_offset, bp->bio_data, - bp->bio_length); + if ((bp->bio_flags & BIO_VLIST) != 0) { + mdcopyto_vlist(p, (bus_dma_segment_t *)bp->bio_data, + bp->bio_ma_offset, bp->bio_length); + } else { + bcopy(p, bp->bio_data, bp->bio_length); + } cpu_flush_dcache(bp->bio_data, bp->bio_length); break; case BIO_WRITE: - bcopy(bp->bio_data, sc->pl_ptr + bp->bio_offset, - bp->bio_length); + if ((bp->bio_flags & BIO_VLIST) != 0) { + mdcopyfrom_vlist((bus_dma_segment_t *)bp->bio_data, + bp->bio_ma_offset, p, bp->bio_length); + } else { + bcopy(bp->bio_data, p, bp->bio_length); + } break; } bp->bio_resid = 0; @@ -673,16 +819,23 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) int error; struct uio auio; struct iovec aiov; + struct iovec *piov; struct mount *mp; struct vnode *vp; struct buf *pb; + bus_dma_segment_t *vlist; struct thread *td; - off_t end, zerosize; + off_t len, zerosize; + int ma_offs; switch (bp->bio_cmd) { case BIO_READ: + auio.uio_rw = UIO_READ; + break; case BIO_WRITE: case BIO_DELETE: + auio.uio_rw = UIO_WRITE; + break; case BIO_FLUSH: break; default: @@ -691,6 +844,9 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) td = curthread; vp = sc->vnode; + pb = NULL; + piov = NULL; + ma_offs = bp->bio_ma_offset; /* * VNODE I/O @@ -709,73 +865,66 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) return (error); } - bzero(&auio, sizeof(auio)); + auio.uio_offset = (vm_ooffset_t)bp->bio_offset; + auio.uio_resid = bp->bio_length; + auio.uio_segflg = UIO_SYSSPACE; + auio.uio_td = td; - /* - * Special case for BIO_DELETE. 
On the surface, this is very - * similar to BIO_WRITE, except that we write from our own - * fixed-length buffer, so we have to loop. The net result is - * that the two cases end up having very little in common. - */ if (bp->bio_cmd == BIO_DELETE) { + /* + * Emulate BIO_DELETE by writing zeros. + */ zerosize = ZERO_REGION_SIZE - (ZERO_REGION_SIZE % sc->sectorsize); - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = (vm_ooffset_t)bp->bio_offset; - auio.uio_segflg = UIO_SYSSPACE; - auio.uio_rw = UIO_WRITE; - auio.uio_td = td; - end = bp->bio_offset + bp->bio_length; - (void) vn_start_write(vp, &mp, V_WAIT); - vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); - error = 0; - while (auio.uio_offset < end) { - aiov.iov_base = __DECONST(void *, zero_region); - aiov.iov_len = end - auio.uio_offset; - if (aiov.iov_len > zerosize) - aiov.iov_len = zerosize; - auio.uio_resid = aiov.iov_len; - error = VOP_WRITE(vp, &auio, - sc->flags & MD_ASYNC ? 0 : IO_SYNC, sc->cred); - if (error != 0) - break; + auio.uio_iovcnt = howmany(bp->bio_length, zerosize); + piov = malloc(sizeof(*piov) * auio.uio_iovcnt, M_MD, M_WAITOK); + auio.uio_iov = piov; + len = bp->bio_length; + while (len > 0) { + piov->iov_base = __DECONST(void *, zero_region); + piov->iov_len = len; + if (len > zerosize) + piov->iov_len = zerosize; + len -= piov->iov_len; + piov++; } - VOP_UNLOCK(vp, 0); - vn_finished_write(mp); - bp->bio_resid = end - auio.uio_offset; - return (error); - } - - KASSERT(bp->bio_length <= MAXPHYS, ("bio_length %jd", - (uintmax_t)bp->bio_length)); - if ((bp->bio_flags & BIO_UNMAPPED) == 0) { - pb = NULL; - aiov.iov_base = bp->bio_data; - } else { + piov = auio.uio_iov; + } else if ((bp->bio_flags & BIO_VLIST) != 0) { + piov = malloc(sizeof(*piov) * bp->bio_ma_n, M_MD, M_WAITOK); + auio.uio_iov = piov; + vlist = (bus_dma_segment_t *)bp->bio_data; + len = bp->bio_length; + while (len > 0) { + piov->iov_base = (void *)(uintptr_t)(vlist->ds_addr + + ma_offs); + piov->iov_len = vlist->ds_len - 
ma_offs; + if (piov->iov_len > len) + piov->iov_len = len; + len -= piov->iov_len; + ma_offs = 0; + vlist++; + piov++; + } + auio.uio_iovcnt = piov - auio.uio_iov; + piov = auio.uio_iov; + } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pb = getpbuf(&md_vnode_pbuf_freecnt); pmap_qenter((vm_offset_t)pb->b_data, bp->bio_ma, bp->bio_ma_n); - aiov.iov_base = (void *)((vm_offset_t)pb->b_data + - bp->bio_ma_offset); + aiov.iov_base = (void *)((vm_offset_t)pb->b_data + ma_offs); + aiov.iov_len = bp->bio_length; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; + } else { + aiov.iov_base = bp->bio_data; + aiov.iov_len = bp->bio_length; + auio.uio_iov = &aiov; + auio.uio_iovcnt = 1; } - aiov.iov_len = bp->bio_length; - auio.uio_iov = &aiov; - auio.uio_iovcnt = 1; - auio.uio_offset = (vm_ooffset_t)bp->bio_offset; - auio.uio_segflg = UIO_SYSSPACE; - if (bp->bio_cmd == BIO_READ) - auio.uio_rw = UIO_READ; - else if (bp->bio_cmd == BIO_WRITE) - auio.uio_rw = UIO_WRITE; - else - panic("wrong BIO_OP in mdstart_vnode"); - auio.uio_resid = bp->bio_length; - auio.uio_td = td; /* * When reading set IO_DIRECT to try to avoid double-caching * the data. When writing IO_DIRECT is not optimal. 
*/ - if (bp->bio_cmd == BIO_READ) { + if (auio.uio_rw == UIO_READ) { vn_lock(vp, LK_EXCLUSIVE | LK_RETRY); error = VOP_READ(vp, &auio, IO_DIRECT, sc->cred); VOP_UNLOCK(vp, 0); @@ -787,10 +936,15 @@ mdstart_vnode(struct md_s *sc, struct bio *bp) VOP_UNLOCK(vp, 0); vn_finished_write(mp); } - if ((bp->bio_flags & BIO_UNMAPPED) != 0) { + + if (pb) { pmap_qremove((vm_offset_t)pb->b_data, bp->bio_ma_n); relpbuf(pb, &md_vnode_pbuf_freecnt); } + + if (piov != NULL) + free(piov, M_MD); + bp->bio_resid = auio.uio_resid; return (error); } @@ -801,6 +955,7 @@ mdstart_swap(struct md_s *sc, struct bio *bp) vm_page_t m; u_char *p; vm_pindex_t i, lastp; + bus_dma_segment_t *vlist; int rv, ma_offs, offs, len, lastend; switch (bp->bio_cmd) { @@ -813,7 +968,10 @@ mdstart_swap(struct md_s *sc, struct bio *bp) } p = bp->bio_data; - ma_offs = (bp->bio_flags & BIO_UNMAPPED) == 0 ? 0 : bp->bio_ma_offset; + ma_offs = (bp->bio_flags & (BIO_UNMAPPED|BIO_VLIST)) != 0 ? + bp->bio_ma_offset : 0; + vlist = (bp->bio_flags & BIO_VLIST) != 0 ? 
+ (bus_dma_segment_t *)bp->bio_data : NULL; /* * offs is the offset at which to start operating on the @@ -853,6 +1011,10 @@ mdstart_swap(struct md_s *sc, struct bio *bp) if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(&m, offs, bp->bio_ma, ma_offs, len); + } else if ((bp->bio_flags & BIO_VLIST) != 0) { + physcopyout_vlist(VM_PAGE_TO_PHYS(m) + offs, + vlist, ma_offs, len); + cpu_flush_dcache(p, len); } else { physcopyout(VM_PAGE_TO_PHYS(m) + offs, p, len); cpu_flush_dcache(p, len); @@ -869,6 +1031,9 @@ mdstart_swap(struct md_s *sc, struct bio *bp) if ((bp->bio_flags & BIO_UNMAPPED) != 0) { pmap_copy_pages(bp->bio_ma, ma_offs, &m, offs, len); + } else if ((bp->bio_flags & BIO_VLIST) != 0) { + physcopyin_vlist(vlist, ma_offs, + VM_PAGE_TO_PHYS(m) + offs, len); } else { physcopyin(p, VM_PAGE_TO_PHYS(m) + offs, len); } diff --git a/sys/geom/geom_disk.c b/sys/geom/geom_disk.c index 9319b97..1a879f7 100644 --- a/sys/geom/geom_disk.c +++ b/sys/geom/geom_disk.c @@ -58,6 +58,8 @@ __FBSDID("$FreeBSD$"); #include <dev/led/led.h> +#include <machine/bus.h> + struct g_disk_softc { struct mtx done_mtx; struct disk *dp; @@ -273,6 +275,145 @@ g_disk_ioctl(struct g_provider *pp, u_long cmd, void * data, int fflag, struct t return (error); } +static off_t +g_disk_maxsize(struct disk *dp, struct bio *bp) +{ + if (bp->bio_cmd == BIO_DELETE) + return (dp->d_delmaxsize); + return (dp->d_maxsize); +} + +static int +g_disk_maxsegs(struct disk *dp, struct bio *bp) +{ + return ((g_disk_maxsize(dp, bp) / PAGE_SIZE) + 1); +} + +static void +g_disk_advance(struct disk *dp, struct bio *bp, off_t off) +{ + + bp->bio_offset += off; + bp->bio_length -= off; + + if ((bp->bio_flags & BIO_VLIST) != 0) { + bus_dma_segment_t *seg, *end; + + seg = (bus_dma_segment_t *)bp->bio_data; + end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; + off += bp->bio_ma_offset; + while (off >= seg->ds_len) { + KASSERT((seg != end), + ("vlist request runs off the end")); + off -= seg->ds_len; + seg++; + } 
+ bp->bio_ma_offset = off; + bp->bio_ma_n = end - seg; + bp->bio_data = (void *)seg; + } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { + bp->bio_ma += off / PAGE_SIZE; + bp->bio_ma_offset += off; + bp->bio_ma_offset %= PAGE_SIZE; + bp->bio_ma_n -= off / PAGE_SIZE; + } else { + bp->bio_data += off; + } +} + +static void +g_disk_seg_limit(bus_dma_segment_t *seg, off_t *poffset, + off_t *plength, int *ppages) +{ + uintptr_t seg_page_base; + uintptr_t seg_page_end; + off_t offset; + off_t length; + int seg_pages; + + offset = *poffset; + length = *plength; + + if (length > seg->ds_len - offset) + length = seg->ds_len - offset; + + seg_page_base = trunc_page(seg->ds_addr + offset); + seg_page_end = round_page(seg->ds_addr + offset + length); + seg_pages = (seg_page_end - seg_page_base) >> PAGE_SHIFT; + + if (seg_pages > *ppages) { + seg_pages = *ppages; + length = (seg_page_base + (seg_pages << PAGE_SHIFT)) - + (seg->ds_addr + offset); + } + + *poffset = 0; + *plength -= length; + *ppages -= seg_pages; +} + +static off_t +g_disk_vlist_limit(struct disk *dp, struct bio *bp, bus_dma_segment_t **pendseg) +{ + bus_dma_segment_t *seg, *end; + off_t residual; + off_t offset; + int pages; + + seg = (bus_dma_segment_t *)bp->bio_data; + end = (bus_dma_segment_t *)bp->bio_data + bp->bio_ma_n; + residual = bp->bio_length; + offset = bp->bio_ma_offset; + pages = g_disk_maxsegs(dp, bp); + while (residual != 0 && pages != 0) { + KASSERT((seg != end), + ("vlist limit runs off the end")); + g_disk_seg_limit(seg, &offset, &residual, &pages); + seg++; + } + if (pendseg != NULL) + *pendseg = seg; + return (residual); +} + +static bool +g_disk_limit(struct disk *dp, struct bio *bp) +{ + bool limited = false; + off_t maxsz; + + maxsz = g_disk_maxsize(dp, bp); + + /* + * XXX: If we have a stripesize we should really use it here. + * Care should be taken in the delete case if this is done + * as deletes can be very sensitive to size given how they + * are processed. 
+ */ + if (bp->bio_length > maxsz) { + bp->bio_length = maxsz; + limited = true; + } + + if ((bp->bio_flags & BIO_VLIST) != 0) { + bus_dma_segment_t *firstseg, *endseg; + off_t residual; + + firstseg = (bus_dma_segment_t*)bp->bio_data; + residual = g_disk_vlist_limit(dp, bp, &endseg); + if (residual != 0) { + bp->bio_ma_n = endseg - firstseg; + bp->bio_length -= residual; + limited = true; + } + } else if ((bp->bio_flags & BIO_UNMAPPED) != 0) { + bp->bio_ma_n = + howmany(bp->bio_ma_offset + bp->bio_length, PAGE_SIZE); + } + + return (limited); +} + static void g_disk_start(struct bio *bp) { @@ -297,6 +438,9 @@ g_disk_start(struct bio *bp) /* fall-through */ case BIO_READ: case BIO_WRITE: + KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) != 0 || + (bp->bio_flags & BIO_UNMAPPED) == 0, + ("unmapped bio not supported by disk %s", dp->d_name)); off = 0; bp3 = NULL; bp2 = g_clone_bio(bp); @@ -304,39 +448,10 @@ g_disk_start(struct bio *bp) error = ENOMEM; break; } - do { - off_t d_maxsize; - - d_maxsize = (bp->bio_cmd == BIO_DELETE) ? - dp->d_delmaxsize : dp->d_maxsize; - bp2->bio_offset += off; - bp2->bio_length -= off; - if ((bp->bio_flags & BIO_UNMAPPED) == 0) { - bp2->bio_data += off; - } else { - KASSERT((dp->d_flags & DISKFLAG_UNMAPPED_BIO) - != 0, - ("unmapped bio not supported by disk %s", - dp->d_name)); - bp2->bio_ma += off / PAGE_SIZE; - bp2->bio_ma_offset += off; - bp2->bio_ma_offset %= PAGE_SIZE; - bp2->bio_ma_n -= off / PAGE_SIZE; - } - if (bp2->bio_length > d_maxsize) { - /* - * XXX: If we have a stripesize we should really - * use it here. Care should be taken in the delete - * case if this is done as deletes can be very - * sensitive to size given how they are processed. 
- */ - bp2->bio_length = d_maxsize; - if ((bp->bio_flags & BIO_UNMAPPED) != 0) { - bp2->bio_ma_n = howmany( - bp2->bio_ma_offset + - bp2->bio_length, PAGE_SIZE); - } - off += d_maxsize; + for (;;) { + if (g_disk_limit(dp, bp2)) { + off += bp2->bio_length; + /* * To avoid a race, we need to grab the next bio * before we schedule this one. See "notes". @@ -355,9 +470,14 @@ g_disk_start(struct bio *bp) g_disk_lock_giant(dp); dp->d_strategy(bp2); g_disk_unlock_giant(dp); + + if (bp3 == NULL) + break; + bp2 = bp3; bp3 = NULL; - } while (bp2 != NULL); + g_disk_advance(dp, bp2, off); + } break; case BIO_GETATTR: /* Give the driver a chance to override */ diff --git a/sys/geom/geom_io.c b/sys/geom/geom_io.c index f1edc70..9dff151 100644 --- a/sys/geom/geom_io.c +++ b/sys/geom/geom_io.c @@ -205,11 +205,12 @@ g_clone_bio(struct bio *bp) /* * BIO_ORDERED flag may be used by disk drivers to enforce * ordering restrictions, so this flag needs to be cloned. - * BIO_UNMAPPED should be inherited, to properly indicate - * which way the buffer is passed. + * BIO_UNMAPPED and BIO_VLIST should be inherited, to properly + * indicate which way the buffer is passed. * Other bio flags are not suitable for cloning. 
*/ - bp2->bio_flags = bp->bio_flags & (BIO_ORDERED | BIO_UNMAPPED); + bp2->bio_flags = bp->bio_flags & + (BIO_ORDERED | BIO_UNMAPPED | BIO_VLIST); bp2->bio_length = bp->bio_length; bp2->bio_offset = bp->bio_offset; bp2->bio_data = bp->bio_data; @@ -240,7 +241,7 @@ g_duplicate_bio(struct bio *bp) struct bio *bp2; bp2 = uma_zalloc(biozone, M_WAITOK | M_ZERO); - bp2->bio_flags = bp->bio_flags & BIO_UNMAPPED; + bp2->bio_flags = bp->bio_flags & (BIO_UNMAPPED | BIO_VLIST); bp2->bio_parent = bp; bp2->bio_cmd = bp->bio_cmd; bp2->bio_length = bp->bio_length; diff --git a/sys/ia64/include/bus.h b/sys/ia64/include/bus.h index 966a75d3..a9b09c6 100644 --- a/sys/ia64/include/bus.h +++ b/sys/ia64/include/bus.h @@ -123,6 +123,7 @@ #define BUS_SPACE_UNRESTRICTED (~0) +#ifdef _KERNEL /* * Map and unmap a region of device bus space into CPU virtual address space. @@ -815,6 +816,8 @@ bus_space_copy_region_8(bus_space_tag_t bst, bus_space_handle_t sbsh, #define bus_space_copy_region_stream_4 bus_space_copy_region_4 #define bus_space_copy_region_stream_8 bus_space_copy_region_8 +#endif /* _KERNEL */ + #include <machine/bus_dma.h> #endif /* _MACHINE_BUS_H_ */ diff --git a/sys/kern/subr_bus_dma.c b/sys/kern/subr_bus_dma.c index a16d8c8..ae30276 100644 --- a/sys/kern/subr_bus_dma.c +++ b/sys/kern/subr_bus_dma.c @@ -54,19 +54,32 @@ __FBSDID("$FreeBSD$"); #include <machine/bus.h> /* - * Load a list of virtual addresses. + * Load up data starting at offset within a region specified by a + * list of virtual address ranges until either length or the region + * are exhausted. 
*/ static int _bus_dmamap_load_vlist(bus_dma_tag_t dmat, bus_dmamap_t map, bus_dma_segment_t *list, int sglist_cnt, struct pmap *pmap, int *nsegs, - int flags) + int flags, size_t offset, size_t length) { int error; error = 0; - for (; sglist_cnt > 0; sglist_cnt--, list++) { - error = _bus_dmamap_load_buffer(dmat, map, - (void *)(uintptr_t)list->ds_addr, list->ds_len, pmap, + for (; sglist_cnt > 0 && length != 0; sglist_cnt--, list++) { + char *addr; + size_t ds_len; + + KASSERT((offset < list->ds_len), + ("Invalid mid-segment offset")); + addr = (char *)(uintptr_t)list->ds_addr + offset; + ds_len = list->ds_len - offset; + offset = 0; + if (ds_len > length) + ds_len = length; + length -= ds_len; + KASSERT((ds_len != 0), ("Segment length is zero")); + error = _bus_dmamap_load_buffer(dmat, map, addr, ds_len, pmap, flags, NULL, nsegs); if (error) break; @@ -118,22 +131,48 @@ _bus_dmamap_load_mbuf_sg(bus_dma_tag_t dmat, bus_dmamap_t map, } /* + * Load tlen data starting at offset within a region specified by a list of + * physical pages. + */ +static int +_bus_dmamap_load_pages(bus_dma_tag_t dmat, bus_dmamap_t map, + vm_page_t *pages, bus_size_t tlen, int offset, int *nsegs, int flags) +{ + vm_paddr_t paddr; + bus_size_t len; + int error, i; + + for (i = 0, error = 0; error == 0 && tlen > 0; i++, tlen -= len) { + len = min(PAGE_SIZE - offset, tlen); + paddr = VM_PAGE_TO_PHYS(pages[i]) + offset; + error = _bus_dmamap_load_phys(dmat, map, paddr, len, + flags, NULL, nsegs); + offset = 0; + } + return (error); +} + +/* * Load from block io. 
*/ static int _bus_dmamap_load_bio(bus_dma_tag_t dmat, bus_dmamap_t map, struct bio *bio, int *nsegs, int flags) { - int error; - if ((bio->bio_flags & BIO_UNMAPPED) == 0) { - error = _bus_dmamap_load_buffer(dmat, map, bio->bio_data, - bio->bio_bcount, kernel_pmap, flags, NULL, nsegs); - } else { - error = _bus_dmamap_load_ma(dmat, map, bio->bio_ma, - bio->bio_bcount, bio->bio_ma_offset, flags, NULL, nsegs); + if ((bio->bio_flags & BIO_VLIST) != 0) { + bus_dma_segment_t *segs = (bus_dma_segment_t *)bio->bio_data; + return (_bus_dmamap_load_vlist(dmat, map, segs, bio->bio_ma_n, + kernel_pmap, nsegs, flags, bio->bio_ma_offset, + bio->bio_bcount)); } - return (error); + + if ((bio->bio_flags & BIO_UNMAPPED) != 0) + return (_bus_dmamap_load_pages(dmat, map, bio->bio_ma, + bio->bio_bcount, bio->bio_ma_offset, nsegs, flags)); + + return (_bus_dmamap_load_buffer(dmat, map, bio->bio_data, + bio->bio_bcount, kernel_pmap, flags, NULL, nsegs)); } int @@ -219,7 +258,7 @@ _bus_dmamap_load_ccb(bus_dma_tag_t dmat, bus_dmamap_t map, union ccb *ccb, case CAM_DATA_SG: error = _bus_dmamap_load_vlist(dmat, map, (bus_dma_segment_t *)data_ptr, sglist_cnt, kernel_pmap, - nsegs, flags); + nsegs, flags, 0, dxfer_len); break; case CAM_DATA_SG_PADDR: error = _bus_dmamap_load_plist(dmat, map, @@ -494,7 +533,7 @@ bus_dmamap_load_mem(bus_dma_tag_t dmat, bus_dmamap_t map, break; case MEMDESC_VLIST: error = _bus_dmamap_load_vlist(dmat, map, mem->u.md_list, - mem->md_opaque, kernel_pmap, &nsegs, flags); + mem->md_opaque, kernel_pmap, &nsegs, flags, 0, SIZE_T_MAX); break; case MEMDESC_PLIST: error = _bus_dmamap_load_plist(dmat, map, mem->u.md_list, diff --git a/sys/kern/subr_uio.c b/sys/kern/subr_uio.c index 87892fd..3712f92 100644 --- a/sys/kern/subr_uio.c +++ b/sys/kern/subr_uio.c @@ -62,6 +62,8 @@ __FBSDID("$FreeBSD$"); #include <vm/vm_pageout.h> #include <vm/vm_map.h> +#include <machine/bus.h> + SYSCTL_INT(_kern, KERN_IOV_MAX, iov_max, CTLFLAG_RD, SYSCTL_NULL_INT_PTR, UIO_MAXIOV, "Maximum 
number of elements in an I/O vector; sysconf(_SC_IOV_MAX)"); @@ -136,6 +138,58 @@ physcopyout(vm_paddr_t src, void *dst, size_t len) #undef PHYS_PAGE_COUNT int +physcopyin_vlist(bus_dma_segment_t *src, off_t offset, vm_paddr_t dst, + size_t len) +{ + size_t seg_len; + int error; + + error = 0; + while (offset >= src->ds_len) { + offset -= src->ds_len; + src++; + } + + while (len > 0 && error == 0) { + seg_len = MIN(src->ds_len - offset, len); + error = physcopyin((void *)(uintptr_t)(src->ds_addr + offset), + dst, seg_len); + offset = 0; + src++; + len -= seg_len; + dst += seg_len; + } + + return (error); +} + +int +physcopyout_vlist(vm_paddr_t src, bus_dma_segment_t *dst, off_t offset, + size_t len) +{ + size_t seg_len; + int error; + + error = 0; + while (offset >= dst->ds_len) { + offset -= dst->ds_len; + dst++; + } + + while (len > 0 && error == 0) { + seg_len = MIN(dst->ds_len - offset, len); + error = physcopyout(src, (void *)(uintptr_t)(dst->ds_addr + + offset), seg_len); + offset = 0; + dst++; + len -= seg_len; + src += seg_len; + } + + return (error); +} + +int uiomove(void *cp, int n, struct uio *uio) { diff --git a/sys/pc98/include/bus.h b/sys/pc98/include/bus.h index 3292474..2060414 100644 --- a/sys/pc98/include/bus.h +++ b/sys/pc98/include/bus.h @@ -78,7 +78,9 @@ #ifndef _PC98_BUS_H_ #define _PC98_BUS_H_ +#ifdef _KERNEL #include <sys/systm.h> +#endif /* _KERNEL */ #include <machine/_bus.h> #include <machine/cpufunc.h> @@ -92,6 +94,8 @@ #define BUS_SPACE_UNRESTRICTED (~0) +#ifdef _KERNEL + /* * address relocation table */ @@ -639,4 +643,6 @@ bus_space_barrier(bus_space_tag_t tag, bus_space_handle_t bsh, #define bus_space_copy_region_stream_4(t, h1, o1, h2, o2, c) \ bus_space_copy_region_4((t), (h1), (o1), (h2), (o2), (c)) +#endif /* _KERNEL */ + #endif /* _PC98_BUS_H_ */ diff --git a/sys/sys/bio.h b/sys/sys/bio.h index 535ce61..8b3a5fc 100644 --- a/sys/sys/bio.h +++ b/sys/sys/bio.h @@ -61,6 +61,7 @@ #define BIO_ORDERED 0x08 #define BIO_UNMAPPED 0x10 
#define BIO_TRANSIENT_MAPPING 0x20 +#define BIO_VLIST 0x40 #ifdef _KERNEL struct disk; diff --git a/sys/sys/uio.h b/sys/sys/uio.h index 271a2f7..ff21b09 100644 --- a/sys/sys/uio.h +++ b/sys/sys/uio.h @@ -85,6 +85,7 @@ struct uio { struct vm_object; struct vm_page; +struct bus_dma_segment; struct uio *cloneuio(struct uio *uiop); int copyinfrom(const void * __restrict src, void * __restrict dst, @@ -98,6 +99,10 @@ int copyout_map(struct thread *td, vm_offset_t *addr, size_t sz); int copyout_unmap(struct thread *td, vm_offset_t addr, size_t sz); int physcopyin(void *src, vm_paddr_t dst, size_t len); int physcopyout(vm_paddr_t src, void *dst, size_t len); +int physcopyin_vlist(struct bus_dma_segment *src, off_t offset, + vm_paddr_t dst, size_t len); +int physcopyout_vlist(vm_paddr_t src, struct bus_dma_segment *dst, + off_t offset, size_t len); int uiomove(void *cp, int n, struct uio *uio); int uiomove_frombuf(void *buf, int buflen, struct uio *uio); int uiomove_fromphys(struct vm_page *ma[], vm_offset_t offset, int n, diff --git a/usr.sbin/Makefile b/usr.sbin/Makefile index 57effb8..8e97961 100644 --- a/usr.sbin/Makefile +++ b/usr.sbin/Makefile @@ -7,6 +7,7 @@ SUBDIR= adduser \ arp \ binmiscctl \ bsdconfig \ + camdd \ cdcontrol \ chkgrp \ chown \ diff --git a/usr.sbin/camdd/Makefile b/usr.sbin/camdd/Makefile new file mode 100644 index 0000000..0028668 --- /dev/null +++ b/usr.sbin/camdd/Makefile @@ -0,0 +1,11 @@ +# $FreeBSD$ + +PROG= camdd +SRCS= camdd.c +SDIR= ${.CURDIR}/../../sys +DPADD= ${LIBCAM} ${LIBMT} ${LIBSBUF} ${LIBBSDXML} ${LIBUTIL} ${LIBTHR} +LDADD= -lcam -lmt -lsbuf -lbsdxml -lutil -lthr +NO_WTHREAD_SAFETY= 1 +MAN= camdd.8 + +.include <bsd.prog.mk> diff --git a/usr.sbin/camdd/camdd.8 b/usr.sbin/camdd/camdd.8 new file mode 100644 index 0000000..af556bb --- /dev/null +++ b/usr.sbin/camdd/camdd.8 @@ -0,0 +1,283 @@ +.\" +.\" Copyright (c) 2015 Spectra Logic Corporation +.\" All rights reserved. 
+.\" +.\" Redistribution and use in source and binary forms, with or without +.\" modification, are permitted provided that the following conditions +.\" are met: +.\" 1. Redistributions of source code must retain the above copyright +.\" notice, this list of conditions, and the following disclaimer, +.\" without modification. +.\" 2. Redistributions in binary form must reproduce at minimum a disclaimer +.\" substantially similar to the "NO WARRANTY" disclaimer below +.\" ("Disclaimer") and any redistribution must be conditioned upon +.\" including a substantially similar Disclaimer requirement for further +.\" binary redistribution. +.\" +.\" NO WARRANTY +.\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +.\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +.\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR +.\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +.\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS +.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +.\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +.\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +.\" POSSIBILITY OF SUCH DAMAGES. +.\" +.\" Authors: Ken Merry (Spectra Logic Corporation) +.\" +.\" $FreeBSD$ +.\" +.Dd November 11, 2015 +.Dt CAMDD 8 +.Os +.Sh NAME +.Nm camdd +.Nd CAM data transfer utility +.Sh SYNOPSIS +.Nm +.Aq Fl i|o Ar pass=pass_dev|file=filename,bs=blocksize,[...] +.Op Fl C Ar retry_count +.Op Fl E +.Op Fl m Ar max_io +.Op Fl t Ar timeout +.Op Fl v +.Op Fl h +.Sh DESCRIPTION +The +.Nm +utility is a sequential data transfer utility that offers standard +.Xr read 2 +and +.Xr write 2 +operation in addition to a mode that uses the asynchronous +.Xr pass 4 +API. 
+The asynchronous +.Xr pass 4 +API allows multiple requests to be queued to a device simultaneously. +.Pp +.Nm +collects performance information and will display it when the transfer +completes, when +.Nm +is terminated or when it receives a SIGINFO signal. +.Pp +The following options are available: +.Bl -tag -width 12n +.It Fl i | o Ar args +Specify the input and output device or file. +Both +.Fl i +and +.Fl o +must be specified. +There are a number of parameters that can be specified. +One of the first two (file or pass) MUST be specified to indicate which I/O +method to use on the device in question. +.Bl -tag -width 9n +.It pass=dev +Specify a +.Xr pass 4 +device to operate on. +This requests that +.Nm +access the device in question via the asynchronous +.Xr pass 4 +interface. +.Pp +The device name can be a +.Xr pass 4 +name and unit number, for instance +.Dq pass0 , +or a regular peripheral driver name and unit number, for instance +.Dq da5 . +It can also be the path of a +.Xr pass 4 +or other disk device, like +.Dq /dev/da5 . +It may also be a bus:target:lun, for example: +.Dq 0:5:0 . +.Pp +Only +.Xr pass 4 +devices for +.Tn SCSI +disk-like devices are supported. +.Tn ATA +devices are not currently supported, but support could be added later. +Specifically, +.Tn SCSI +Direct Access (type 0), WORM (type 4), CDROM (type 5), and RBC (Reduced +Block Command, type 14) devices are supported. +Tape drives, medium changers, enclosures etc. are not supported. +.It file=path +Specify a file or device to operate on. +This requests that the file or device in question be accessed using the +standard +.Xr read 2 +and +.Xr write 2 +system calls. +The file interface does not support queueing multiple commands at a time. +It does support probing disk sector size and capacity information, and tape +blocksize and maximum transfer size information. +The file interface supports standard files, disks, tape drives, special +devices, pipes and standard input and output.
+If the file is specified as a +.Dq - , +standard input or standard output is used. +For tape devices, the specified blocksize will be the size that +.Nm +attempts to use to write to or read from the tape. +When writing to a tape device, the blocksize is treated like a disk sector +size. +So, that means +.Nm +will not write anything smaller than the sector size. +At the end of a transfer, if there isn't sufficient data from the reader +to yield a full block, +.Nm +will add zeros on the end of the data from the reader to make up a full +block. +.It bs=N +Specify the blocksize to use for transfers. +.Nm +will attempt to read or write using the requested blocksize. +.Pp +Note that the blocksize given only applies to either the input or the +output path. +To use the same blocksize for the input and output transfers, you must +specify that blocksize with both the +.Fl i +and +.Fl o +arguments. +.Pp +The blocksize may be specified in bytes, or using any suffix (e.g. k, M, G) +supported by +.Xr expand_number 3 . +.It offset=N +Specify the starting offset for the input or output device or file. +The offset may be specified in bytes, or by using any suffix (e.g. k, M, G) +supported by +.Xr expand_number 3 . +.It depth=N +Specify a desired queue depth for the input or output path. +.Nm +will attempt to keep the requested number of requests of the specified +blocksize queued to the input or output device. +Queue depths greater than 1 are only supported for the asynchronous +.Xr pass 4 +I/O method. +The queue depth is maintained on a best effort basis, and may not be +possible to maintain for especially fast devices. +For writes, maintaining the queue depth also depends on a sufficiently +fast reading device. +.It mcs=N +Specify the minimum command size to use for +.Xr pass 4 +devices. +Some devices do not support 6 byte +.Tn SCSI +commands.
+The +.Xr da 4 +device handles this restriction automatically, but the +.Xr pass 4 +device allows the user to specify the +.Tn SCSI +command used. +If a device does not accept 6 byte +.Tn SCSI +READ/WRITE commands (which is the default at lower LBAs), it will generally +accept 10 byte +.Tn SCSI +commands instead. +.It debug=N +Specify the debug level for this device. +There is currently only one debug level setting, so setting this to any +non-zero value will turn on debugging. +The debug facility may be expanded in the future. +.El +.It Fl C Ar count +Specify the retry count for commands sent via the asynchronous +.Xr pass 4 +interface. +This does not apply to commands sent via the file interface. +.It Fl E +Enable kernel error recovery for the +.Xr pass 4 +driver. +If error recovery is not enabled, unit attention conditions and other +transient failures may cause the transfer to fail. +.It Fl m Ar size +Specify the maximum amount of data to be transferred. +This may be specified in bytes, or by using any suffix (e.g. K, M, G) +supported by +.Xr expand_number 3 . +.It Fl t Ar timeout +Specify the command timeout in seconds to use for commands sent via the +.Xr pass 4 +driver. +.It Fl v +Enable verbose reporting of errors. +This is recommended to aid in debugging any +.Tn SCSI +issues that come up. +.It Fl h +Display the +.Nm +usage message. +.El +.Pp +If +.Nm +receives a SIGINFO signal, it will print the current input and output byte +counts, elapsed runtime and average throughput. +If +.Nm +receives a SIGINT signal, it will print the current input and output byte +counts, elapsed runtime and average throughput and then exit. +.Sh EXAMPLES +.Dl camdd -i pass=da8,bs=512k,depth=4 -o pass=da3,bs=512k,depth=4 +.Pp +Copy all data from da8 to da3 using a blocksize of 512k for both drives, +and attempt to maintain a queue depth of 4 on both the input and output +devices. +The transfer will stop when the end of either device is reached. 
+.Pp +.Dl camdd -i file=/dev/zero,bs=1M -o pass=da5,bs=1M,depth=4 -m 100M +.Pp +Read 1MB blocks of zeros from /dev/zero, and write them to da5 with a +desired queue depth of 4. +Stop the transfer after 100MB has been written. +.Pp +.Dl camdd -i pass=da8,bs=1M,depth=3 -o file=disk.img +.Pp +Copy disk da8 using a 1MB blocksize and desired queue depth of 3 to the +file disk.img. +.Pp +.Dl camdd -i file=/etc/rc -o file=- +.Pp +Read the file /etc/rc and write it to standard output. +.Pp +.Dl camdd -i pass=da10,bs=64k,depth=16 -o file=/dev/nsa0,bs=128k +.Pp +Copy 64K blocks from the disk da10 with a queue depth of 16, and write +to the tape drive sa0 with a 128k blocksize. +The copy will stop when either the end of the disk or tape is reached. +.Sh SEE ALSO +.Xr cam 3 , +.Xr cam 4 , +.Xr pass 4 , +.Xr camcontrol 8 +.Sh HISTORY +.Nm +first appeared in +.Fx 10.2 +.Sh AUTHORS +.An Kenneth Merry Aq Mt ken@FreeBSD.org diff --git a/usr.sbin/camdd/camdd.c b/usr.sbin/camdd/camdd.c new file mode 100644 index 0000000..573214e --- /dev/null +++ b/usr.sbin/camdd/camdd.c @@ -0,0 +1,3428 @@ +/*- + * Copyright (c) 1997-2007 Kenneth D. Merry + * Copyright (c) 2013, 2014, 2015 Spectra Logic Corporation + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + * + * Authors: Ken Merry (Spectra Logic Corporation) + */ + +/* + * This is eventually intended to be: + * - A basic data transfer/copy utility + * - A simple benchmark utility + * - An example of how to use the asynchronous pass(4) driver interface. 
+ */ +#include <sys/cdefs.h> +__FBSDID("$FreeBSD$"); + +#include <sys/ioctl.h> +#include <sys/stdint.h> +#include <sys/types.h> +#include <sys/endian.h> +#include <sys/param.h> +#include <sys/sbuf.h> +#include <sys/stat.h> +#include <sys/event.h> +#include <sys/time.h> +#include <sys/uio.h> +#include <vm/vm.h> +#include <machine/bus.h> +#include <sys/bus.h> +#include <sys/bus_dma.h> +#include <sys/mtio.h> +#include <sys/conf.h> +#include <sys/disk.h> + +#include <stdio.h> +#include <stdlib.h> +#include <semaphore.h> +#include <string.h> +#include <unistd.h> +#include <inttypes.h> +#include <limits.h> +#include <fcntl.h> +#include <ctype.h> +#include <err.h> +#include <libutil.h> +#include <pthread.h> +#include <assert.h> +#include <bsdxml.h> + +#include <cam/cam.h> +#include <cam/cam_debug.h> +#include <cam/cam_ccb.h> +#include <cam/scsi/scsi_all.h> +#include <cam/scsi/scsi_da.h> +#include <cam/scsi/scsi_pass.h> +#include <cam/scsi/scsi_message.h> +#include <cam/scsi/smp_all.h> +#include <camlib.h> +#include <mtlib.h> +#include <zlib.h> + +typedef enum { + CAMDD_CMD_NONE = 0x00000000, + CAMDD_CMD_HELP = 0x00000001, + CAMDD_CMD_WRITE = 0x00000002, + CAMDD_CMD_READ = 0x00000003 +} camdd_cmdmask; + +typedef enum { + CAMDD_ARG_NONE = 0x00000000, + CAMDD_ARG_VERBOSE = 0x00000001, + CAMDD_ARG_DEVICE = 0x00000002, + CAMDD_ARG_BUS = 0x00000004, + CAMDD_ARG_TARGET = 0x00000008, + CAMDD_ARG_LUN = 0x00000010, + CAMDD_ARG_UNIT = 0x00000020, + CAMDD_ARG_TIMEOUT = 0x00000040, + CAMDD_ARG_ERR_RECOVER = 0x00000080, + CAMDD_ARG_RETRIES = 0x00000100 +} camdd_argmask; + +typedef enum { + CAMDD_DEV_NONE = 0x00, + CAMDD_DEV_PASS = 0x01, + CAMDD_DEV_FILE = 0x02 +} camdd_dev_type; + +struct camdd_io_opts { + camdd_dev_type dev_type; + char *dev_name; + uint64_t blocksize; + uint64_t queue_depth; + uint64_t offset; + int min_cmd_size; + int write_dev; + uint64_t debug; +}; + +typedef enum { + CAMDD_BUF_NONE, + CAMDD_BUF_DATA, + CAMDD_BUF_INDIRECT +} camdd_buf_type; + +struct 
camdd_buf_indirect { + /* + * Pointer to the source buffer. + */ + struct camdd_buf *src_buf; + + /* + * Offset into the source buffer, in bytes. + */ + uint64_t offset; + /* + * Pointer to the starting point in the source buffer. + */ + uint8_t *start_ptr; + + /* + * Length of this chunk in bytes. + */ + size_t len; +}; + +struct camdd_buf_data { + /* + * Buffer allocated when we allocate this camdd_buf. This should + * be the size of the blocksize for this device. + */ + uint8_t *buf; + + /* + * The amount of backing store allocated in buf. Generally this + * will be the blocksize of the device. + */ + uint32_t alloc_len; + + /* + * The amount of data that was put into the buffer (on reads) or + * the amount of data we have put onto the src_list so far (on + * writes). + */ + uint32_t fill_len; + + /* + * The amount of data that was not transferred. + */ + uint32_t resid; + + /* + * Starting byte offset on the reader. + */ + uint64_t src_start_offset; + + /* + * CCB used for pass(4) device targets. + */ + union ccb ccb; + + /* + * Number of scatter/gather segments. + */ + int sg_count; + + /* + * Set if we had to tack on an extra buffer to round the transfer + * up to a sector size. + */ + int extra_buf; + + /* + * Scatter/gather list used generally when we're the writer for a + * pass(4) device. + */ + bus_dma_segment_t *segs; + + /* + * Scatter/gather list used generally when we're the writer for a + * file or block device; + */ + struct iovec *iovec; +}; + +union camdd_buf_types { + struct camdd_buf_indirect indirect; + struct camdd_buf_data data; +}; + +typedef enum { + CAMDD_STATUS_NONE, + CAMDD_STATUS_OK, + CAMDD_STATUS_SHORT_IO, + CAMDD_STATUS_EOF, + CAMDD_STATUS_ERROR +} camdd_buf_status; + +struct camdd_buf { + camdd_buf_type buf_type; + union camdd_buf_types buf_type_spec; + + camdd_buf_status status; + + uint64_t lba; + size_t len; + + /* + * A reference count of how many indirect buffers point to this + * buffer. 
+ */ + int refcount; + + /* + * A link back to our parent device. + */ + struct camdd_dev *dev; + STAILQ_ENTRY(camdd_buf) links; + STAILQ_ENTRY(camdd_buf) work_links; + + /* + * A count of the buffers on the src_list. + */ + int src_count; + + /* + * List of buffers from our partner thread that are the components + * of this buffer for the I/O. Uses src_links. + */ + STAILQ_HEAD(,camdd_buf) src_list; + STAILQ_ENTRY(camdd_buf) src_links; +}; + +#define NUM_DEV_TYPES 2 + +struct camdd_dev_pass { + int scsi_dev_type; + struct cam_device *dev; + uint64_t max_sector; + uint32_t block_len; + uint32_t cpi_maxio; +}; + +typedef enum { + CAMDD_FILE_NONE, + CAMDD_FILE_REG, + CAMDD_FILE_STD, + CAMDD_FILE_PIPE, + CAMDD_FILE_DISK, + CAMDD_FILE_TAPE, + CAMDD_FILE_TTY, + CAMDD_FILE_MEM +} camdd_file_type; + +typedef enum { + CAMDD_FF_NONE = 0x00, + CAMDD_FF_CAN_SEEK = 0x01 +} camdd_file_flags; + +struct camdd_dev_file { + int fd; + struct stat sb; + char filename[MAXPATHLEN + 1]; + camdd_file_type file_type; + camdd_file_flags file_flags; + uint8_t *tmp_buf; +}; + +struct camdd_dev_block { + int fd; + uint64_t size_bytes; + uint32_t block_len; +}; + +union camdd_dev_spec { + struct camdd_dev_pass pass; + struct camdd_dev_file file; + struct camdd_dev_block block; +}; + +typedef enum { + CAMDD_DEV_FLAG_NONE = 0x00, + CAMDD_DEV_FLAG_EOF = 0x01, + CAMDD_DEV_FLAG_PEER_EOF = 0x02, + CAMDD_DEV_FLAG_ACTIVE = 0x04, + CAMDD_DEV_FLAG_EOF_SENT = 0x08, + CAMDD_DEV_FLAG_EOF_QUEUED = 0x10 +} camdd_dev_flags; + +struct camdd_dev { + camdd_dev_type dev_type; + union camdd_dev_spec dev_spec; + camdd_dev_flags flags; + char device_name[MAXPATHLEN+1]; + uint32_t blocksize; + uint32_t sector_size; + uint64_t max_sector; + uint64_t sector_io_limit; + int min_cmd_size; + int write_dev; + int retry_count; + int io_timeout; + int debug; + uint64_t start_offset_bytes; + uint64_t next_io_pos_bytes; + uint64_t next_peer_pos_bytes; + uint64_t next_completion_pos_bytes; + uint64_t peer_bytes_queued; + 
uint64_t bytes_transferred; + uint32_t target_queue_depth; + uint32_t cur_active_io; + uint8_t *extra_buf; + uint32_t extra_buf_len; + struct camdd_dev *peer_dev; + pthread_mutex_t mutex; + pthread_cond_t cond; + int kq; + + int (*run)(struct camdd_dev *dev); + int (*fetch)(struct camdd_dev *dev); + + /* + * Buffers that are available for I/O. Uses links. + */ + STAILQ_HEAD(,camdd_buf) free_queue; + + /* + * Free indirect buffers. These are used for breaking a large + * buffer into multiple pieces. + */ + STAILQ_HEAD(,camdd_buf) free_indirect_queue; + + /* + * Buffers that have been queued to the kernel. Uses links. + */ + STAILQ_HEAD(,camdd_buf) active_queue; + + /* + * Will generally contain one of our buffers that is waiting for enough + * I/O from our partner thread to be able to execute. This will + * generally happen when our per-I/O-size is larger than the + * partner thread's per-I/O-size. Uses links. + */ + STAILQ_HEAD(,camdd_buf) pending_queue; + + /* + * Number of buffers on the pending queue + */ + int num_pending_queue; + + /* + * Buffers that are filled and ready to execute. This is used when + * our partner (reader) thread sends us blocks that are larger than + * our blocksize, and so we have to split them into multiple pieces. + */ + STAILQ_HEAD(,camdd_buf) run_queue; + + /* + * Number of buffers on the run queue. + */ + int num_run_queue; + + STAILQ_HEAD(,camdd_buf) reorder_queue; + + int num_reorder_queue; + + /* + * Buffers that have been queued to us by our partner thread + * (generally the reader thread) to be written out. Uses + * work_links. + */ + STAILQ_HEAD(,camdd_buf) work_queue; + + /* + * Buffers that have been completed by our partner thread. Uses + * work_links. + */ + STAILQ_HEAD(,camdd_buf) peer_done_queue; + + /* + * Number of buffers on the peer done queue. + */ + uint32_t num_peer_done_queue; + + /* + * A list of buffers that we have queued to our peer thread. Uses + * links. 
+ */ + STAILQ_HEAD(,camdd_buf) peer_work_queue; + + /* + * Number of buffers on the peer work queue. + */ + uint32_t num_peer_work_queue; +}; + +static sem_t camdd_sem; +static int need_exit = 0; +static int error_exit = 0; +static int need_status = 0; + +#ifndef min +#define min(a, b) (a < b) ? a : b +#endif + +/* + * XXX KDM private copy of timespecsub(). This is normally defined in + * sys/time.h, but is only enabled in the kernel. If that definition is + * enabled in userland, it breaks the build of libnetbsd. + */ +#ifndef timespecsub +#define timespecsub(vvp, uvp) \ + do { \ + (vvp)->tv_sec -= (uvp)->tv_sec; \ + (vvp)->tv_nsec -= (uvp)->tv_nsec; \ + if ((vvp)->tv_nsec < 0) { \ + (vvp)->tv_sec--; \ + (vvp)->tv_nsec += 1000000000; \ + } \ + } while (0) +#endif + + +/* Generically usefull offsets into the peripheral private area */ +#define ppriv_ptr0 periph_priv.entries[0].ptr +#define ppriv_ptr1 periph_priv.entries[1].ptr +#define ppriv_field0 periph_priv.entries[0].field +#define ppriv_field1 periph_priv.entries[1].field + +#define ccb_buf ppriv_ptr0 + +#define CAMDD_FILE_DEFAULT_BLOCK 524288 +#define CAMDD_FILE_DEFAULT_DEPTH 1 +#define CAMDD_PASS_MAX_BLOCK 1048576 +#define CAMDD_PASS_DEFAULT_DEPTH 6 +#define CAMDD_PASS_RW_TIMEOUT 60 * 1000 + +static int parse_btl(char *tstr, int *bus, int *target, int *lun, + camdd_argmask *arglst); +void camdd_free_dev(struct camdd_dev *dev); +struct camdd_dev *camdd_alloc_dev(camdd_dev_type dev_type, + struct kevent *new_ke, int num_ke, + int retry_count, int timeout); +static struct camdd_buf *camdd_alloc_buf(struct camdd_dev *dev, + camdd_buf_type buf_type); +void camdd_release_buf(struct camdd_buf *buf); +struct camdd_buf *camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type); +int camdd_buf_sg_create(struct camdd_buf *buf, int iovec, + uint32_t sector_size, uint32_t *num_sectors_used, + int *double_buf_needed); +uint32_t camdd_buf_get_len(struct camdd_buf *buf); +void camdd_buf_add_child(struct camdd_buf *buf, 
struct camdd_buf *child_buf); +int camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize, + uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran); +struct camdd_dev *camdd_probe_file(int fd, struct camdd_io_opts *io_opts, + int retry_count, int timeout); +struct camdd_dev *camdd_probe_pass(struct cam_device *cam_dev, + struct camdd_io_opts *io_opts, + camdd_argmask arglist, int probe_retry_count, + int probe_timeout, int io_retry_count, + int io_timeout); +void *camdd_file_worker(void *arg); +camdd_buf_status camdd_ccb_status(union ccb *ccb); +int camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf); +int camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf); +void camdd_peer_done(struct camdd_buf *buf); +void camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf, + int *error_count); +int camdd_pass_fetch(struct camdd_dev *dev); +int camdd_file_run(struct camdd_dev *dev); +int camdd_pass_run(struct camdd_dev *dev); +int camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len); +int camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf); +void camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth, + uint32_t *peer_depth, uint32_t *our_bytes, + uint32_t *peer_bytes); +void *camdd_worker(void *arg); +void camdd_sig_handler(int sig); +void camdd_print_status(struct camdd_dev *camdd_dev, + struct camdd_dev *other_dev, + struct timespec *start_time); +int camdd_rw(struct camdd_io_opts *io_opts, int num_io_opts, + uint64_t max_io, int retry_count, int timeout); +int camdd_parse_io_opts(char *args, int is_write, + struct camdd_io_opts *io_opts); +void usage(void); + +/* + * Parse out a bus, or a bus, target and lun in the following + * format: + * bus + * bus:target + * bus:target:lun + * + * Returns the number of parsed components, or 0. 
+ */ +static int +parse_btl(char *tstr, int *bus, int *target, int *lun, camdd_argmask *arglst) +{ + char *tmpstr; + int convs = 0; + + while (isspace(*tstr) && (*tstr != '\0')) + tstr++; + + tmpstr = (char *)strtok(tstr, ":"); + if ((tmpstr != NULL) && (*tmpstr != '\0')) { + *bus = strtol(tmpstr, NULL, 0); + *arglst |= CAMDD_ARG_BUS; + convs++; + tmpstr = (char *)strtok(NULL, ":"); + if ((tmpstr != NULL) && (*tmpstr != '\0')) { + *target = strtol(tmpstr, NULL, 0); + *arglst |= CAMDD_ARG_TARGET; + convs++; + tmpstr = (char *)strtok(NULL, ":"); + if ((tmpstr != NULL) && (*tmpstr != '\0')) { + *lun = strtol(tmpstr, NULL, 0); + *arglst |= CAMDD_ARG_LUN; + convs++; + } + } + } + + return convs; +} + +/* + * XXX KDM clean up and free all of the buffers on the queue! + */ +void +camdd_free_dev(struct camdd_dev *dev) +{ + if (dev == NULL) + return; + + switch (dev->dev_type) { + case CAMDD_DEV_FILE: { + struct camdd_dev_file *file_dev = &dev->dev_spec.file; + + if (file_dev->fd != -1) + close(file_dev->fd); + free(file_dev->tmp_buf); + break; + } + case CAMDD_DEV_PASS: { + struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass; + + if (pass_dev->dev != NULL) + cam_close_device(pass_dev->dev); + break; + } + default: + break; + } + + free(dev); +} + +struct camdd_dev * +camdd_alloc_dev(camdd_dev_type dev_type, struct kevent *new_ke, int num_ke, + int retry_count, int timeout) +{ + struct camdd_dev *dev = NULL; + struct kevent *ke; + size_t ke_size; + int retval = 0; + + dev = malloc(sizeof(*dev)); + if (dev == NULL) { + warn("%s: unable to malloc %zu bytes", __func__, sizeof(*dev)); + goto bailout; + } + + bzero(dev, sizeof(*dev)); + + dev->dev_type = dev_type; + dev->io_timeout = timeout; + dev->retry_count = retry_count; + STAILQ_INIT(&dev->free_queue); + STAILQ_INIT(&dev->free_indirect_queue); + STAILQ_INIT(&dev->active_queue); + STAILQ_INIT(&dev->pending_queue); + STAILQ_INIT(&dev->run_queue); + STAILQ_INIT(&dev->reorder_queue); + STAILQ_INIT(&dev->work_queue); + 
STAILQ_INIT(&dev->peer_done_queue); + STAILQ_INIT(&dev->peer_work_queue); + retval = pthread_mutex_init(&dev->mutex, NULL); + if (retval != 0) { + warnc(retval, "%s: failed to initialize mutex", __func__); + goto bailout; + } + + retval = pthread_cond_init(&dev->cond, NULL); + if (retval != 0) { + warnc(retval, "%s: failed to initialize condition variable", + __func__); + goto bailout; + } + + dev->kq = kqueue(); + if (dev->kq == -1) { + warn("%s: Unable to create kqueue", __func__); + goto bailout; + } + + ke_size = sizeof(struct kevent) * (num_ke + 4); + ke = malloc(ke_size); + if (ke == NULL) { + warn("%s: unable to malloc %zu bytes", __func__, ke_size); + goto bailout; + } + bzero(ke, ke_size); + if (num_ke > 0) + bcopy(new_ke, ke, num_ke * sizeof(struct kevent)); + + EV_SET(&ke[num_ke++], (uintptr_t)&dev->work_queue, EVFILT_USER, + EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0); + EV_SET(&ke[num_ke++], (uintptr_t)&dev->peer_done_queue, EVFILT_USER, + EV_ADD|EV_ENABLE|EV_CLEAR, 0,0, 0); + EV_SET(&ke[num_ke++], SIGINFO, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0); + EV_SET(&ke[num_ke++], SIGINT, EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0,0,0); + + retval = kevent(dev->kq, ke, num_ke, NULL, 0, NULL); + if (retval == -1) { + warn("%s: Unable to register kevents", __func__); + goto bailout; + } + + + return (dev); + +bailout: + free(dev); + + return (NULL); +} + +static struct camdd_buf * +camdd_alloc_buf(struct camdd_dev *dev, camdd_buf_type buf_type) +{ + struct camdd_buf *buf = NULL; + uint8_t *data_ptr = NULL; + + /* + * We only need to allocate data space for data buffers. 
+ */ + switch (buf_type) { + case CAMDD_BUF_DATA: + data_ptr = malloc(dev->blocksize); + if (data_ptr == NULL) { + warn("unable to allocate %u bytes", dev->blocksize); + goto bailout_error; + } + break; + default: + break; + } + + buf = malloc(sizeof(*buf)); + if (buf == NULL) { + warn("unable to allocate %zu bytes", sizeof(*buf)); + goto bailout_error; + } + + bzero(buf, sizeof(*buf)); + buf->buf_type = buf_type; + buf->dev = dev; + switch (buf_type) { + case CAMDD_BUF_DATA: { + struct camdd_buf_data *data; + + data = &buf->buf_type_spec.data; + + data->alloc_len = dev->blocksize; + data->buf = data_ptr; + break; + } + case CAMDD_BUF_INDIRECT: + break; + default: + break; + } + STAILQ_INIT(&buf->src_list); + + return (buf); + +bailout_error: + if (data_ptr != NULL) + free(data_ptr); + + if (buf != NULL) + free(buf); + + return (NULL); +} + +void +camdd_release_buf(struct camdd_buf *buf) +{ + struct camdd_dev *dev; + + dev = buf->dev; + + switch (buf->buf_type) { + case CAMDD_BUF_DATA: { + struct camdd_buf_data *data; + + data = &buf->buf_type_spec.data; + + if (data->segs != NULL) { + if (data->extra_buf != 0) { + void *extra_buf; + + extra_buf = (void *) + data->segs[data->sg_count - 1].ds_addr; + free(extra_buf); + data->extra_buf = 0; + } + free(data->segs); + data->segs = NULL; + data->sg_count = 0; + } else if (data->iovec != NULL) { + if (data->extra_buf != 0) { + free(data->iovec[data->sg_count - 1].iov_base); + data->extra_buf = 0; + } + free(data->iovec); + data->iovec = NULL; + data->sg_count = 0; + } + STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); + break; + } + case CAMDD_BUF_INDIRECT: + STAILQ_INSERT_TAIL(&dev->free_indirect_queue, buf, links); + break; + default: + err(1, "%s: Invalid buffer type %d for released buffer", + __func__, buf->buf_type); + break; + } +} + +struct camdd_buf * +camdd_get_buf(struct camdd_dev *dev, camdd_buf_type buf_type) +{ + struct camdd_buf *buf = NULL; + + switch (buf_type) { + case CAMDD_BUF_DATA: + buf = 
STAILQ_FIRST(&dev->free_queue); + if (buf != NULL) { + struct camdd_buf_data *data; + uint8_t *data_ptr; + uint32_t alloc_len; + + STAILQ_REMOVE_HEAD(&dev->free_queue, links); + data = &buf->buf_type_spec.data; + data_ptr = data->buf; + alloc_len = data->alloc_len; + bzero(buf, sizeof(*buf)); + data->buf = data_ptr; + data->alloc_len = alloc_len; + } + break; + case CAMDD_BUF_INDIRECT: + buf = STAILQ_FIRST(&dev->free_indirect_queue); + if (buf != NULL) { + STAILQ_REMOVE_HEAD(&dev->free_indirect_queue, links); + + bzero(buf, sizeof(*buf)); + } + break; + default: + warnx("Unknown buffer type %d requested", buf_type); + break; + } + + + if (buf == NULL) + return (camdd_alloc_buf(dev, buf_type)); + else { + STAILQ_INIT(&buf->src_list); + buf->dev = dev; + buf->buf_type = buf_type; + + return (buf); + } +} + +int +camdd_buf_sg_create(struct camdd_buf *buf, int iovec, uint32_t sector_size, + uint32_t *num_sectors_used, int *double_buf_needed) +{ + struct camdd_buf *tmp_buf; + struct camdd_buf_data *data; + uint8_t *extra_buf = NULL; + size_t extra_buf_len = 0; + int i, retval = 0; + + data = &buf->buf_type_spec.data; + + data->sg_count = buf->src_count; + /* + * Compose a scatter/gather list from all of the buffers in the list. + * If the length of the buffer isn't a multiple of the sector size, + * we'll have to add an extra buffer. This should only happen + * at the end of a transfer. 
+ */ + if ((data->fill_len % sector_size) != 0) { + extra_buf_len = sector_size - (data->fill_len % sector_size); + extra_buf = calloc(extra_buf_len, 1); + if (extra_buf == NULL) { + warn("%s: unable to allocate %zu bytes for extra " + "buffer space", __func__, extra_buf_len); + retval = 1; + goto bailout; + } + data->extra_buf = 1; + data->sg_count++; + } + if (iovec == 0) { + data->segs = calloc(data->sg_count, sizeof(bus_dma_segment_t)); + if (data->segs == NULL) { + warn("%s: unable to allocate %zu bytes for S/G list", + __func__, sizeof(bus_dma_segment_t) * + data->sg_count); + retval = 1; + goto bailout; + } + + } else { + data->iovec = calloc(data->sg_count, sizeof(struct iovec)); + if (data->iovec == NULL) { + warn("%s: unable to allocate %zu bytes for S/G list", + __func__, sizeof(struct iovec) * data->sg_count); + retval = 1; + goto bailout; + } + } + + for (i = 0, tmp_buf = STAILQ_FIRST(&buf->src_list); + i < buf->src_count && tmp_buf != NULL; i++, + tmp_buf = STAILQ_NEXT(tmp_buf, src_links)) { + + if (tmp_buf->buf_type == CAMDD_BUF_DATA) { + struct camdd_buf_data *tmp_data; + + tmp_data = &tmp_buf->buf_type_spec.data; + if (iovec == 0) { + data->segs[i].ds_addr = + (bus_addr_t) tmp_data->buf; + data->segs[i].ds_len = tmp_data->fill_len - + tmp_data->resid; + } else { + data->iovec[i].iov_base = tmp_data->buf; + data->iovec[i].iov_len = tmp_data->fill_len - + tmp_data->resid; + } + if (((tmp_data->fill_len - tmp_data->resid) % + sector_size) != 0) + *double_buf_needed = 1; + } else { + struct camdd_buf_indirect *tmp_ind; + + tmp_ind = &tmp_buf->buf_type_spec.indirect; + if (iovec == 0) { + data->segs[i].ds_addr = + (bus_addr_t)tmp_ind->start_ptr; + data->segs[i].ds_len = tmp_ind->len; + } else { + data->iovec[i].iov_base = tmp_ind->start_ptr; + data->iovec[i].iov_len = tmp_ind->len; + } + if ((tmp_ind->len % sector_size) != 0) + *double_buf_needed = 1; + } + } + + if (extra_buf != NULL) { + if (iovec == 0) { + data->segs[i].ds_addr = 
(bus_addr_t)extra_buf; + data->segs[i].ds_len = extra_buf_len; + } else { + data->iovec[i].iov_base = extra_buf; + data->iovec[i].iov_len = extra_buf_len; + } + i++; + } + if ((tmp_buf != NULL) || (i != data->sg_count)) { + warnx("buffer source count does not match " + "number of buffers in list!"); + retval = 1; + goto bailout; + } + +bailout: + if (retval == 0) { + *num_sectors_used = (data->fill_len + extra_buf_len) / + sector_size; + } + return (retval); +} + +uint32_t +camdd_buf_get_len(struct camdd_buf *buf) +{ + uint32_t len = 0; + + if (buf->buf_type != CAMDD_BUF_DATA) { + struct camdd_buf_indirect *indirect; + + indirect = &buf->buf_type_spec.indirect; + len = indirect->len; + } else { + struct camdd_buf_data *data; + + data = &buf->buf_type_spec.data; + len = data->fill_len; + } + + return (len); +} + +void +camdd_buf_add_child(struct camdd_buf *buf, struct camdd_buf *child_buf) +{ + struct camdd_buf_data *data; + + assert(buf->buf_type == CAMDD_BUF_DATA); + + data = &buf->buf_type_spec.data; + + STAILQ_INSERT_TAIL(&buf->src_list, child_buf, src_links); + buf->src_count++; + + data->fill_len += camdd_buf_get_len(child_buf); +} + +typedef enum { + CAMDD_TS_MAX_BLK, + CAMDD_TS_MIN_BLK, + CAMDD_TS_BLK_GRAN, + CAMDD_TS_EFF_IOSIZE +} camdd_status_item_index; + +static struct camdd_status_items { + const char *name; + struct mt_status_entry *entry; +} req_status_items[] = { + { "max_blk", NULL }, + { "min_blk", NULL }, + { "blk_gran", NULL }, + { "max_effective_iosize", NULL } +}; + +int +camdd_probe_tape(int fd, char *filename, uint64_t *max_iosize, + uint64_t *max_blk, uint64_t *min_blk, uint64_t *blk_gran) +{ + struct mt_status_data status_data; + char *xml_str = NULL; + unsigned int i; + int retval = 0; + + retval = mt_get_xml_str(fd, MTIOCEXTGET, &xml_str); + if (retval != 0) + err(1, "Couldn't get XML string from %s", filename); + + retval = mt_get_status(xml_str, &status_data); + if (retval != XML_STATUS_OK) { + warn("couldn't get status for %s", 
filename); + retval = 1; + goto bailout; + } else + retval = 0; + + if (status_data.error != 0) { + warnx("%s", status_data.error_str); + retval = 1; + goto bailout; + } + + for (i = 0; i < sizeof(req_status_items) / + sizeof(req_status_items[0]); i++) { + char *name; + + name = __DECONST(char *, req_status_items[i].name); + req_status_items[i].entry = mt_status_entry_find(&status_data, + name); + if (req_status_items[i].entry == NULL) { + errx(1, "Cannot find status entry %s", + req_status_items[i].name); + } + } + + *max_iosize = req_status_items[CAMDD_TS_EFF_IOSIZE].entry->value_unsigned; + *max_blk= req_status_items[CAMDD_TS_MAX_BLK].entry->value_unsigned; + *min_blk= req_status_items[CAMDD_TS_MIN_BLK].entry->value_unsigned; + *blk_gran = req_status_items[CAMDD_TS_BLK_GRAN].entry->value_unsigned; +bailout: + + free(xml_str); + mt_status_free(&status_data); + + return (retval); +} + +struct camdd_dev * +camdd_probe_file(int fd, struct camdd_io_opts *io_opts, int retry_count, + int timeout) +{ + struct camdd_dev *dev = NULL; + struct camdd_dev_file *file_dev; + uint64_t blocksize = io_opts->blocksize; + + dev = camdd_alloc_dev(CAMDD_DEV_FILE, NULL, 0, retry_count, timeout); + if (dev == NULL) + goto bailout; + + file_dev = &dev->dev_spec.file; + file_dev->fd = fd; + strlcpy(file_dev->filename, io_opts->dev_name, + sizeof(file_dev->filename)); + strlcpy(dev->device_name, io_opts->dev_name, sizeof(dev->device_name)); + if (blocksize == 0) + dev->blocksize = CAMDD_FILE_DEFAULT_BLOCK; + else + dev->blocksize = blocksize; + + if ((io_opts->queue_depth != 0) + && (io_opts->queue_depth != 1)) { + warnx("Queue depth %ju for %s ignored, only 1 outstanding " + "command supported", (uintmax_t)io_opts->queue_depth, + io_opts->dev_name); + } + dev->target_queue_depth = CAMDD_FILE_DEFAULT_DEPTH; + dev->run = camdd_file_run; + dev->fetch = NULL; + + /* + * We can effectively access files on byte boundaries. 
We'll reset + * this for devices like disks that can be accessed on sector + * boundaries. + */ + dev->sector_size = 1; + + if ((fd != STDIN_FILENO) + && (fd != STDOUT_FILENO)) { + int retval; + + retval = fstat(fd, &file_dev->sb); + if (retval != 0) { + warn("Cannot stat %s", dev->device_name); + goto bailout; + camdd_free_dev(dev); + dev = NULL; + } + if (S_ISREG(file_dev->sb.st_mode)) { + file_dev->file_type = CAMDD_FILE_REG; + } else if (S_ISCHR(file_dev->sb.st_mode)) { + int type; + + if (ioctl(fd, FIODTYPE, &type) == -1) + err(1, "FIODTYPE ioctl failed on %s", + dev->device_name); + else { + if (type & D_TAPE) + file_dev->file_type = CAMDD_FILE_TAPE; + else if (type & D_DISK) + file_dev->file_type = CAMDD_FILE_DISK; + else if (type & D_MEM) + file_dev->file_type = CAMDD_FILE_MEM; + else if (type & D_TTY) + file_dev->file_type = CAMDD_FILE_TTY; + } + } else if (S_ISDIR(file_dev->sb.st_mode)) { + errx(1, "cannot operate on directory %s", + dev->device_name); + } else if (S_ISFIFO(file_dev->sb.st_mode)) { + file_dev->file_type = CAMDD_FILE_PIPE; + } else + errx(1, "Cannot determine file type for %s", + dev->device_name); + + switch (file_dev->file_type) { + case CAMDD_FILE_REG: + if (file_dev->sb.st_size != 0) + dev->max_sector = file_dev->sb.st_size - 1; + else + dev->max_sector = 0; + file_dev->file_flags |= CAMDD_FF_CAN_SEEK; + break; + case CAMDD_FILE_TAPE: { + uint64_t max_iosize, max_blk, min_blk, blk_gran; + /* + * Check block limits and maximum effective iosize. + * Make sure the blocksize is within the block + * limits (and a multiple of the minimum blocksize) + * and that the blocksize is <= maximum effective + * iosize. + */ + retval = camdd_probe_tape(fd, dev->device_name, + &max_iosize, &max_blk, &min_blk, &blk_gran); + if (retval != 0) + errx(1, "Unable to probe tape %s", + dev->device_name); + + /* + * The blocksize needs to be <= the maximum + * effective I/O size of the tape device. 
Note + * that this also takes into account the maximum + * blocksize reported by READ BLOCK LIMITS. + */ + if (dev->blocksize > max_iosize) { + warnx("Blocksize %u too big for %s, limiting " + "to %ju", dev->blocksize, dev->device_name, + max_iosize); + dev->blocksize = max_iosize; + } + + /* + * The blocksize needs to be at least min_blk; + */ + if (dev->blocksize < min_blk) { + warnx("Blocksize %u too small for %s, " + "increasing to %ju", dev->blocksize, + dev->device_name, min_blk); + dev->blocksize = min_blk; + } + + /* + * And the blocksize needs to be a multiple of + * the block granularity. + */ + if ((blk_gran != 0) + && (dev->blocksize % (1 << blk_gran))) { + warnx("Blocksize %u for %s not a multiple of " + "%d, adjusting to %d", dev->blocksize, + dev->device_name, (1 << blk_gran), + dev->blocksize & ~((1 << blk_gran) - 1)); + dev->blocksize &= ~((1 << blk_gran) - 1); + } + + if (dev->blocksize == 0) { + errx(1, "Unable to derive valid blocksize for " + "%s", dev->device_name); + } + + /* + * For tape drives, set the sector size to the + * blocksize so that we make sure not to write + * less than the blocksize out to the drive. 
+ */ + dev->sector_size = dev->blocksize; + break; + } + case CAMDD_FILE_DISK: { + off_t media_size; + unsigned int sector_size; + + file_dev->file_flags |= CAMDD_FF_CAN_SEEK; + + /* + * Ask the disk for its sector size and media size; both + * must be non-zero for the sector math below. + */ + if (ioctl(fd, DIOCGSECTORSIZE, &sector_size) == -1) { + err(1, "DIOCGSECTORSIZE ioctl failed on %s", + dev->device_name); + } + + if (sector_size == 0) { + errx(1, "DIOCGSECTORSIZE ioctl returned " + "invalid sector size %u for %s", + sector_size, dev->device_name); + } + + if (ioctl(fd, DIOCGMEDIASIZE, &media_size) == -1) { + err(1, "DIOCGMEDIASIZE ioctl failed on %s", + dev->device_name); + } + + if (media_size == 0) { + errx(1, "DIOCGMEDIASIZE ioctl returned " + "invalid media size %ju for %s", + (uintmax_t)media_size, dev->device_name); + } + + if (dev->blocksize % sector_size) { + errx(1, "%s blocksize %u not a multiple of " + "sector size %u", dev->device_name, + dev->blocksize, sector_size); + } + + dev->sector_size = sector_size; + dev->max_sector = (media_size / sector_size) - 1; + break; + } + case CAMDD_FILE_MEM: + file_dev->file_flags |= CAMDD_FF_CAN_SEEK; + break; + default: + break; + } + } + + if ((io_opts->offset != 0) + && ((file_dev->file_flags & CAMDD_FF_CAN_SEEK) == 0)) { + warnx("Offset %ju specified for %s, but we cannot seek on %s", + io_opts->offset, io_opts->dev_name, io_opts->dev_name); + goto bailout_error; + } +#if 0 + else if ((io_opts->offset != 0) + && ((io_opts->offset % dev->sector_size) != 0)) { + warnx("Offset %ju for %s is not a multiple of the " + "sector size %u", io_opts->offset, + io_opts->dev_name, dev->sector_size); + goto bailout_error; + } else { + dev->start_offset_bytes = io_opts->offset; + } +#endif + +bailout: + return (dev); + +bailout_error: + camdd_free_dev(dev); + return (NULL); +} + +/* + * Need to implement this. Do a basic probe: + * - Check the inquiry data, make sure we're talking to a device that we + * can reasonably expect to talk to -- direct, RBC, CD, WORM. + * - Send a test unit ready, make sure the device is available. 
+ * - Get the capacity and block size. + */ +struct camdd_dev * +camdd_probe_pass(struct cam_device *cam_dev, struct camdd_io_opts *io_opts, + camdd_argmask arglist, int probe_retry_count, + int probe_timeout, int io_retry_count, int io_timeout) +{ + union ccb *ccb; + uint64_t maxsector; + uint32_t cpi_maxio, max_iosize, pass_numblocks; + uint32_t block_len; + struct scsi_read_capacity_data rcap; + struct scsi_read_capacity_data_long rcaplong; + struct camdd_dev *dev; + struct camdd_dev_pass *pass_dev; + struct kevent ke; + int scsi_dev_type; + int retval; + + dev = NULL; + + scsi_dev_type = SID_TYPE(&cam_dev->inq_data); + maxsector = 0; + block_len = 0; + + /* + * For devices that support READ CAPACITY, we'll attempt to get the + * capacity. Otherwise, we really don't support tape or other + * devices via SCSI passthrough, so just return an error in that case. + */ + switch (scsi_dev_type) { + case T_DIRECT: + case T_WORM: + case T_CDROM: + case T_OPTICAL: + case T_RBC: + break; + default: + errx(1, "Unsupported SCSI device type %d", scsi_dev_type); + break; /*NOTREACHED*/ + } + + ccb = cam_getccb(cam_dev); + + if (ccb == NULL) { + warnx("%s: error allocating ccb", __func__); + goto bailout; + } + + bzero(&(&ccb->ccb_h)[1], + sizeof(struct ccb_scsiio) - sizeof(struct ccb_hdr)); + + scsi_read_capacity(&ccb->csio, + /*retries*/ probe_retry_count, + /*cbfcnp*/ NULL, + /*tag_action*/ MSG_SIMPLE_Q_TAG, + &rcap, + SSD_FULL_SIZE, + /*timeout*/ probe_timeout ? 
probe_timeout : 5000); + + /* Disable freezing the device queue */ + ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; + + if (arglist & CAMDD_ARG_ERR_RECOVER) + ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; + + if (cam_send_ccb(cam_dev, ccb) < 0) { + warn("error sending READ CAPACITY command"); + + cam_error_print(cam_dev, ccb, CAM_ESF_ALL, + CAM_EPF_ALL, stderr); + + goto bailout; + } + + if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { + cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); + retval = 1; + goto bailout; + } + + maxsector = scsi_4btoul(rcap.addr); + block_len = scsi_4btoul(rcap.length); + + /* + * A last block of 2^32-1 means that the true capacity is over 2TB, + * and we need to issue the long READ CAPACITY to get the real + * capacity. Otherwise, we're all set. + */ + if (maxsector != 0xffffffff) + goto rcap_done; + + scsi_read_capacity_16(&ccb->csio, + /*retries*/ probe_retry_count, + /*cbfcnp*/ NULL, + /*tag_action*/ MSG_SIMPLE_Q_TAG, + /*lba*/ 0, + /*reladdr*/ 0, + /*pmi*/ 0, + (uint8_t *)&rcaplong, + sizeof(rcaplong), + /*sense_len*/ SSD_FULL_SIZE, + /*timeout*/ probe_timeout ? 
probe_timeout : 5000); + + /* Disable freezing the device queue */ + ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; + + if (arglist & CAMDD_ARG_ERR_RECOVER) + ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; + + if (cam_send_ccb(cam_dev, ccb) < 0) { + warn("error sending READ CAPACITY (16) command"); + + cam_error_print(cam_dev, ccb, CAM_ESF_ALL, + CAM_EPF_ALL, stderr); + + retval = 1; + goto bailout; + } + + if ((ccb->ccb_h.status & CAM_STATUS_MASK) != CAM_REQ_CMP) { + cam_error_print(cam_dev, ccb, CAM_ESF_ALL, CAM_EPF_ALL, stderr); + goto bailout; + } + + maxsector = scsi_8btou64(rcaplong.addr); + block_len = scsi_4btoul(rcaplong.length); + +rcap_done: + + bzero(&(&ccb->ccb_h)[1], + sizeof(struct ccb_scsiio) - sizeof(struct ccb_hdr)); + + ccb->ccb_h.func_code = XPT_PATH_INQ; + ccb->ccb_h.flags = CAM_DIR_NONE; + ccb->ccb_h.retry_count = 1; + + if (cam_send_ccb(cam_dev, ccb) < 0) { + warn("error sending XPT_PATH_INQ CCB"); + + cam_error_print(cam_dev, ccb, CAM_ESF_ALL, + CAM_EPF_ALL, stderr); + goto bailout; + } + + EV_SET(&ke, cam_dev->fd, EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, 0); + + dev = camdd_alloc_dev(CAMDD_DEV_PASS, &ke, 1, io_retry_count, + io_timeout); + if (dev == NULL) + goto bailout; + + pass_dev = &dev->dev_spec.pass; + pass_dev->scsi_dev_type = scsi_dev_type; + pass_dev->dev = cam_dev; + pass_dev->max_sector = maxsector; + pass_dev->block_len = block_len; + pass_dev->cpi_maxio = ccb->cpi.maxio; + snprintf(dev->device_name, sizeof(dev->device_name), "%s%u", + pass_dev->dev->device_name, pass_dev->dev->dev_unit_num); + dev->sector_size = block_len; + dev->max_sector = maxsector; + + + /* + * Determine the optimal blocksize to use for this device. + */ + + /* + * If the controller has not specified a maximum I/O size, + * just go with 128K as a somewhat conservative value. 
+ */ + if (pass_dev->cpi_maxio == 0) + cpi_maxio = 131072; + else + cpi_maxio = pass_dev->cpi_maxio; + + /* + * If the controller has a large maximum I/O size, limit it + * to something smaller so that the kernel doesn't have trouble + * allocating buffers to copy data in and out for us. + * XXX KDM this is until we have unmapped I/O support in the kernel. + */ + max_iosize = min(cpi_maxio, CAMDD_PASS_MAX_BLOCK); + + /* + * If we weren't able to get a block size for some reason, + * default to 512 bytes. + */ + block_len = pass_dev->block_len; + if (block_len == 0) { + block_len = 512; + /* + * Keep the generic sector size in step with the fallback + * so the modulo and offset checks below never see zero. + */ + dev->sector_size = block_len; + } + + /* + * Figure out how many blocksize chunks will fit in the + * maximum I/O size. + */ + pass_numblocks = max_iosize / block_len; + + /* + * And finally, multiply the number of blocks by the LBA + * length to get our maximum block size. + */ + dev->blocksize = pass_numblocks * block_len; + + if (io_opts->blocksize != 0) { + if ((io_opts->blocksize % dev->sector_size) != 0) { + warnx("Blocksize %ju for %s is not a multiple of " + "sector size %u", (uintmax_t)io_opts->blocksize, + dev->device_name, dev->sector_size); + goto bailout_error; + } + dev->blocksize = io_opts->blocksize; + } + dev->target_queue_depth = CAMDD_PASS_DEFAULT_DEPTH; + if (io_opts->queue_depth != 0) + dev->target_queue_depth = io_opts->queue_depth; + + if (io_opts->offset != 0) { + if (io_opts->offset > (dev->max_sector * dev->sector_size)) { + warnx("Offset %ju is past the end of device %s", + io_opts->offset, dev->device_name); + goto bailout_error; + } +#if 0 + else if ((io_opts->offset % dev->sector_size) != 0) { + warnx("Offset %ju for %s is not a multiple of the " + "sector size %u", io_opts->offset, + dev->device_name, dev->sector_size); + goto bailout_error; + } + dev->start_offset_bytes = io_opts->offset; +#endif + } + + dev->min_cmd_size = io_opts->min_cmd_size; + + dev->run = camdd_pass_run; + dev->fetch = camdd_pass_fetch; + +bailout: + cam_freeccb(ccb); + + return (dev); + +bailout_error: + 
cam_freeccb(ccb); + + camdd_free_dev(dev); + + return (NULL); +} + +void * +camdd_worker(void *arg) +{ + struct camdd_dev *dev = arg; + struct camdd_buf *buf; + struct timespec ts, *kq_ts; + + ts.tv_sec = 0; + ts.tv_nsec = 0; + + pthread_mutex_lock(&dev->mutex); + + dev->flags |= CAMDD_DEV_FLAG_ACTIVE; + + for (;;) { + struct kevent ke; + int retval = 0; + + /* + * XXX KDM check the reorder queue depth? + */ + if (dev->write_dev == 0) { + uint32_t our_depth, peer_depth, peer_bytes, our_bytes; + uint32_t target_depth = dev->target_queue_depth; + uint32_t peer_target_depth = + dev->peer_dev->target_queue_depth; + uint32_t peer_blocksize = dev->peer_dev->blocksize; + + camdd_get_depth(dev, &our_depth, &peer_depth, + &our_bytes, &peer_bytes); + +#if 0 + while (((our_depth < target_depth) + && (peer_depth < peer_target_depth)) + || ((peer_bytes + our_bytes) < + (peer_blocksize * 2))) { +#endif + while (((our_depth + peer_depth) < + (target_depth + peer_target_depth)) + || ((peer_bytes + our_bytes) < + (peer_blocksize * 3))) { + + retval = camdd_queue(dev, NULL); + if (retval == 1) + break; + else if (retval != 0) { + error_exit = 1; + goto bailout; + } + + camdd_get_depth(dev, &our_depth, &peer_depth, + &our_bytes, &peer_bytes); + } + } + /* + * See if we have any I/O that is ready to execute. + */ + buf = STAILQ_FIRST(&dev->run_queue); + if (buf != NULL) { + while (dev->target_queue_depth > dev->cur_active_io) { + retval = dev->run(dev); + if (retval == -1) { + dev->flags |= CAMDD_DEV_FLAG_EOF; + error_exit = 1; + break; + } else if (retval != 0) { + break; + } + } + } + + /* + * We've reached EOF, or our partner has reached EOF. + */ + if ((dev->flags & CAMDD_DEV_FLAG_EOF) + || (dev->flags & CAMDD_DEV_FLAG_PEER_EOF)) { + if (dev->write_dev != 0) { + if ((STAILQ_EMPTY(&dev->work_queue)) + && (dev->num_run_queue == 0) + && (dev->cur_active_io == 0)) { + goto bailout; + } + } else { + /* + * If we're the reader, and the writer + * got EOF, he is already done. 
If we got + * the EOF, then we need to wait until + * everything is flushed out for the writer. + */ + if (dev->flags & CAMDD_DEV_FLAG_PEER_EOF) { + goto bailout; + } else if ((dev->num_peer_work_queue == 0) + && (dev->num_peer_done_queue == 0) + && (dev->cur_active_io == 0) + && (dev->num_run_queue == 0)) { + goto bailout; + } + } + /* + * XXX KDM need to do something about the pending + * queue and cleanup resources. + */ + } + + if ((dev->write_dev == 0) + && (dev->cur_active_io == 0) + && (dev->peer_bytes_queued < dev->peer_dev->blocksize)) + kq_ts = &ts; + else + kq_ts = NULL; + + /* + * Run kevent to see if there are events to process. The + * device mutex is released for the duration of the call and + * retaken before the result is examined. + */ + pthread_mutex_unlock(&dev->mutex); + retval = kevent(dev->kq, NULL, 0, &ke, 1, kq_ts); + pthread_mutex_lock(&dev->mutex); + if (retval == -1) { + warn("%s: error returned from kevent",__func__); + goto bailout; + } else if (retval != 0) { + switch (ke.filter) { + case EVFILT_READ: + if (dev->fetch != NULL) { + retval = dev->fetch(dev); + if (retval == -1) { + error_exit = 1; + goto bailout; + } + } + break; + case EVFILT_SIGNAL: + /* + * We register for this so we don't get + * an error as a result of a SIGINFO or a + * SIGINT. It will actually get handled + * by the signal handler. If we get a + * SIGINT, bail out without printing an + * error message. Any other signals + * will result in the error message above. + */ + if (ke.ident == SIGINT) + goto bailout; + break; + case EVFILT_USER: + retval = 0; + /* + * Check to see if the other thread has + * queued any I/O for us to do. (In this + * case we're the writer.) + */ + for (buf = STAILQ_FIRST(&dev->work_queue); + buf != NULL; + buf = STAILQ_FIRST(&dev->work_queue)) { + STAILQ_REMOVE_HEAD(&dev->work_queue, + work_links); + retval = camdd_queue(dev, buf); + /* + * We keep going unless we get an + * actual error. If we get EOF, we + * still want to remove the buffers + * from the queue and send them back + * to the reader thread. 
+ */ + if (retval == -1) { + error_exit = 1; + goto bailout; + } else + retval = 0; + } + + /* + * Next check to see if the other thread has + * queued any completed buffers back to us. + * (In this case we're the reader.) + */ + for (buf = STAILQ_FIRST(&dev->peer_done_queue); + buf != NULL; + buf = STAILQ_FIRST(&dev->peer_done_queue)){ + STAILQ_REMOVE_HEAD( + &dev->peer_done_queue, work_links); + dev->num_peer_done_queue--; + camdd_peer_done(buf); + } + break; + default: + warnx("%s: unknown kevent filter %d", + __func__, ke.filter); + break; + } + } + } + +bailout: + + dev->flags &= ~CAMDD_DEV_FLAG_ACTIVE; + + /* XXX KDM cleanup resources here? */ + + pthread_mutex_unlock(&dev->mutex); + + need_exit = 1; + sem_post(&camdd_sem); + + return (NULL); +} + +/* + * Simplistic translation of CCB status to our local status. + */ +camdd_buf_status +camdd_ccb_status(union ccb *ccb) +{ + camdd_buf_status status = CAMDD_STATUS_NONE; + cam_status ccb_status; + + ccb_status = ccb->ccb_h.status & CAM_STATUS_MASK; + + switch (ccb_status) { + case CAM_REQ_CMP: { + if (ccb->csio.resid == 0) { + status = CAMDD_STATUS_OK; + } else if (ccb->csio.dxfer_len > ccb->csio.resid) { + status = CAMDD_STATUS_SHORT_IO; + } else { + status = CAMDD_STATUS_EOF; + } + break; + } + case CAM_SCSI_STATUS_ERROR: { + switch (ccb->csio.scsi_status) { + case SCSI_STATUS_OK: + case SCSI_STATUS_COND_MET: + case SCSI_STATUS_INTERMED: + case SCSI_STATUS_INTERMED_COND_MET: + status = CAMDD_STATUS_OK; + break; + case SCSI_STATUS_CMD_TERMINATED: + case SCSI_STATUS_CHECK_COND: + case SCSI_STATUS_QUEUE_FULL: + case SCSI_STATUS_BUSY: + case SCSI_STATUS_RESERV_CONFLICT: + default: + status = CAMDD_STATUS_ERROR; + break; + } + break; + } + default: + status = CAMDD_STATUS_ERROR; + break; + } + + return (status); +} + +/* + * Queue a buffer to our peer's work thread for writing. + * + * Returns 0 for success, -1 for failure, 1 if the other thread exited. 
+ */ +int +camdd_queue_peer_buf(struct camdd_dev *dev, struct camdd_buf *buf) +{ + struct kevent ke; + STAILQ_HEAD(, camdd_buf) local_queue; + struct camdd_buf *buf1, *buf2; + struct camdd_buf_data *data = NULL; + uint64_t peer_bytes_queued = 0; + int active = 1; + int retval = 0; + + STAILQ_INIT(&local_queue); + + /* + * Since we're the reader, we need to queue our I/O to the writer + * in sequential order in order to make sure it gets written out + * in sequential order. + * + * Check the next expected I/O starting offset. If this doesn't + * match, put it on the reorder queue. + */ + if ((buf->lba * dev->sector_size) != dev->next_completion_pos_bytes) { + + /* + * If there is nothing on the queue, there is no sorting + * needed. + */ + if (STAILQ_EMPTY(&dev->reorder_queue)) { + STAILQ_INSERT_TAIL(&dev->reorder_queue, buf, links); + dev->num_reorder_queue++; + goto bailout; + } + + /* + * Sort in ascending order by starting LBA. There should + * be no identical LBAs. + */ + for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL; + buf1 = buf2) { + buf2 = STAILQ_NEXT(buf1, links); + if (buf->lba < buf1->lba) { + /* + * If we're less than the first one, then + * we insert at the head of the list + * because this has to be the first element + * on the list. + */ + STAILQ_INSERT_HEAD(&dev->reorder_queue, + buf, links); + dev->num_reorder_queue++; + break; + } else if (buf->lba > buf1->lba) { + if (buf2 == NULL) { + STAILQ_INSERT_TAIL(&dev->reorder_queue, + buf, links); + dev->num_reorder_queue++; + break; + } else if (buf->lba < buf2->lba) { + STAILQ_INSERT_AFTER(&dev->reorder_queue, + buf1, buf, links); + dev->num_reorder_queue++; + break; + } + } else { + errx(1, "Found buffers with duplicate LBA %ju!", + buf->lba); + } + } + goto bailout; + } else { + + /* + * We're the next expected I/O completion, so put ourselves + * on the local queue to be sent to the writer. 
We use + * work_links here so that we can queue this to the + * peer_work_queue before taking the buffer off of the + * local_queue. + */ + dev->next_completion_pos_bytes += buf->len; + STAILQ_INSERT_TAIL(&local_queue, buf, work_links); + + /* + * Go through the reorder queue looking for more sequential + * I/O and add it to the local queue. + */ + for (buf1 = STAILQ_FIRST(&dev->reorder_queue); buf1 != NULL; + buf1 = STAILQ_FIRST(&dev->reorder_queue)) { + /* + * As soon as we see an I/O that is out of sequence, + * we're done. + */ + if ((buf1->lba * dev->sector_size) != + dev->next_completion_pos_bytes) + break; + + STAILQ_REMOVE_HEAD(&dev->reorder_queue, links); + dev->num_reorder_queue--; + STAILQ_INSERT_TAIL(&local_queue, buf1, work_links); + dev->next_completion_pos_bytes += buf1->len; + } + } + + /* + * Setup the event to let the other thread know that it has work + * pending. + */ + EV_SET(&ke, (uintptr_t)&dev->peer_dev->work_queue, EVFILT_USER, 0, + NOTE_TRIGGER, 0, NULL); + + /* + * Put this on our shadow queue so that we know what we've queued + * to the other thread. + */ + STAILQ_FOREACH_SAFE(buf1, &local_queue, work_links, buf2) { + if (buf1->buf_type != CAMDD_BUF_DATA) { + errx(1, "%s: should have a data buffer, not an " + "indirect buffer", __func__); + } + data = &buf1->buf_type_spec.data; + + /* + * We only need to send one EOF to the writer, and don't + * need to continue sending EOFs after that. 
+ */ + if (buf1->status == CAMDD_STATUS_EOF) { + if (dev->flags & CAMDD_DEV_FLAG_EOF_SENT) { + STAILQ_REMOVE(&local_queue, buf1, camdd_buf, + work_links); + camdd_release_buf(buf1); + retval = 1; + continue; + } + dev->flags |= CAMDD_DEV_FLAG_EOF_SENT; + } + + + STAILQ_INSERT_TAIL(&dev->peer_work_queue, buf1, links); + peer_bytes_queued += (data->fill_len - data->resid); + dev->peer_bytes_queued += (data->fill_len - data->resid); + dev->num_peer_work_queue++; + } + + if (STAILQ_FIRST(&local_queue) == NULL) + goto bailout; + + /* + * Drop our mutex and pick up the other thread's mutex. We need to + * do this to avoid deadlocks. + */ + pthread_mutex_unlock(&dev->mutex); + pthread_mutex_lock(&dev->peer_dev->mutex); + + if (dev->peer_dev->flags & CAMDD_DEV_FLAG_ACTIVE) { + /* + * Put the buffers on the other thread's incoming work queue. + */ + for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL; + buf1 = STAILQ_FIRST(&local_queue)) { + STAILQ_REMOVE_HEAD(&local_queue, work_links); + STAILQ_INSERT_TAIL(&dev->peer_dev->work_queue, buf1, + work_links); + } + /* + * Send an event to the other thread's kqueue to let it know + * that there is something on the work queue. + */ + retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL); + if (retval == -1) + warn("%s: unable to add peer work_queue kevent", + __func__); + else + retval = 0; + } else + active = 0; + + pthread_mutex_unlock(&dev->peer_dev->mutex); + pthread_mutex_lock(&dev->mutex); + + /* + * If the other side isn't active, run through the queue and + * release all of the buffers. 
+ */ + if (active == 0) { + for (buf1 = STAILQ_FIRST(&local_queue); buf1 != NULL; + buf1 = STAILQ_FIRST(&local_queue)) { + STAILQ_REMOVE_HEAD(&local_queue, work_links); + STAILQ_REMOVE(&dev->peer_work_queue, buf1, camdd_buf, + links); + dev->num_peer_work_queue--; + camdd_release_buf(buf1); + } + dev->peer_bytes_queued -= peer_bytes_queued; + retval = 1; + } + +bailout: + return (retval); +} + +/* + * Return a buffer to the reader thread when we have completed writing it. + * + * The buffer is appended to the peer's peer_done_queue and an EVFILT_USER + * event is posted to the peer's kqueue. This device's mutex is released + * while the peer's mutex is held and is retaken before returning, so the + * caller is expected to hold it on entry. + * + * Returns 0 on success, or -1 if the kevent() notification failed. + */ +int +camdd_complete_peer_buf(struct camdd_dev *dev, struct camdd_buf *peer_buf) +{ + struct kevent ke; + int retval = 0; + + /* + * Setup the event to let the other thread know that we have + * completed a buffer. + */ + EV_SET(&ke, (uintptr_t)&dev->peer_dev->peer_done_queue, EVFILT_USER, 0, + NOTE_TRIGGER, 0, NULL); + + /* + * Drop our lock and acquire the other thread's lock before + * manipulating its peer done queue. + */ + pthread_mutex_unlock(&dev->mutex); + pthread_mutex_lock(&dev->peer_dev->mutex); + + /* + * Put the buffer on the reader thread's peer done queue now that + * we have completed it. + */ + STAILQ_INSERT_TAIL(&dev->peer_dev->peer_done_queue, peer_buf, + work_links); + dev->peer_dev->num_peer_done_queue++; + + /* + * Send an event to the peer thread to let it know that we've added + * something to its peer done queue. + */ + retval = kevent(dev->peer_dev->kq, &ke, 1, NULL, 0, NULL); + if (retval == -1) + warn("%s: unable to add peer_done_queue kevent", __func__); + else + retval = 0; + + /* + * Drop the other thread's lock and reacquire ours. + */ + pthread_mutex_unlock(&dev->peer_dev->mutex); + pthread_mutex_lock(&dev->mutex); + + return (retval); +} + +/* + * Free a buffer that was written out by the writer thread and returned to + * the reader thread. 
+ */ +void +camdd_peer_done(struct camdd_buf *buf) +{ + struct camdd_dev *dev; + struct camdd_buf_data *data; + + dev = buf->dev; + if (buf->buf_type != CAMDD_BUF_DATA) { + errx(1, "%s: should have a data buffer, not an " + "indirect buffer", __func__); + } + + data = &buf->buf_type_spec.data; + + STAILQ_REMOVE(&dev->peer_work_queue, buf, camdd_buf, links); + dev->num_peer_work_queue--; + dev->peer_bytes_queued -= (data->fill_len - data->resid); + + if (buf->status == CAMDD_STATUS_EOF) + dev->flags |= CAMDD_DEV_FLAG_PEER_EOF; + + STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); +} + +/* + * Assumes caller holds the lock for this device. + */ +void +camdd_complete_buf(struct camdd_dev *dev, struct camdd_buf *buf, + int *error_count) +{ + int retval = 0; + + /* + * If we're the reader, we need to send the completed I/O + * to the writer. If we're the writer, we need to just + * free up resources, or let the reader know if we've + * encountered an error. + */ + if (dev->write_dev == 0) { + retval = camdd_queue_peer_buf(dev, buf); + if (retval != 0) + (*error_count)++; + } else { + struct camdd_buf *tmp_buf, *next_buf; + + STAILQ_FOREACH_SAFE(tmp_buf, &buf->src_list, src_links, + next_buf) { + struct camdd_buf *src_buf; + struct camdd_buf_indirect *indirect; + + STAILQ_REMOVE(&buf->src_list, tmp_buf, + camdd_buf, src_links); + + tmp_buf->status = buf->status; + + if (tmp_buf->buf_type == CAMDD_BUF_DATA) { + camdd_complete_peer_buf(dev, tmp_buf); + continue; + } + + indirect = &tmp_buf->buf_type_spec.indirect; + src_buf = indirect->src_buf; + src_buf->refcount--; + /* + * XXX KDM we probably need to account for + * exactly how many bytes we were able to + * write. Allocate the residual to the + * first N buffers? Or just track the + * number of bytes written? Right now the reader + * doesn't do anything with a residual. 
+ */ + src_buf->status = buf->status; + if (src_buf->refcount <= 0) + camdd_complete_peer_buf(dev, src_buf); + STAILQ_INSERT_TAIL(&dev->free_indirect_queue, + tmp_buf, links); + } + + STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); + } +} + +/* + * Fetch all completed commands from the pass(4) device. + * + * Returns the number of commands received, or -1 if any of the commands + * completed with an error. Returns 0 if no commands are available. + */ +int +camdd_pass_fetch(struct camdd_dev *dev) +{ + struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass; + union ccb ccb; + int retval = 0, num_fetched = 0, error_count = 0; + + pthread_mutex_unlock(&dev->mutex); + /* + * XXX KDM we don't distinguish between EFAULT and ENOENT. + */ + while ((retval = ioctl(pass_dev->dev->fd, CAMIOGET, &ccb)) != -1) { + struct camdd_buf *buf; + struct camdd_buf_data *data; + cam_status ccb_status; + union ccb *buf_ccb; + + buf = ccb.ccb_h.ccb_buf; + data = &buf->buf_type_spec.data; + buf_ccb = &data->ccb; + + num_fetched++; + + /* + * Copy the CCB back out so we get status, sense data, etc. + */ + bcopy(&ccb, buf_ccb, sizeof(ccb)); + + pthread_mutex_lock(&dev->mutex); + + /* + * We're now done, so take this off the active queue. + */ + STAILQ_REMOVE(&dev->active_queue, buf, camdd_buf, links); + dev->cur_active_io--; + + ccb_status = ccb.ccb_h.status & CAM_STATUS_MASK; + if (ccb_status != CAM_REQ_CMP) { + cam_error_print(pass_dev->dev, &ccb, CAM_ESF_ALL, + CAM_EPF_ALL, stderr); + } + + data->resid = ccb.csio.resid; + dev->bytes_transferred += (ccb.csio.dxfer_len - ccb.csio.resid); + + if (buf->status == CAMDD_STATUS_NONE) + buf->status = camdd_ccb_status(&ccb); + if (buf->status == CAMDD_STATUS_ERROR) + error_count++; + else if (buf->status == CAMDD_STATUS_EOF) { + /* + * Once we queue this buffer to our partner thread, + * he will know that we've hit EOF. 
+ */ + dev->flags |= CAMDD_DEV_FLAG_EOF; + } + + camdd_complete_buf(dev, buf, &error_count); + + /* + * Unlock in preparation for the ioctl call. + */ + pthread_mutex_unlock(&dev->mutex); + } + + pthread_mutex_lock(&dev->mutex); + + if (error_count > 0) + return (-1); + else + return (num_fetched); +} + +/* + * Returns -1 for error, 0 for success/continue, and 1 for resource + * shortage/stop processing. + */ +int +camdd_file_run(struct camdd_dev *dev) +{ + struct camdd_dev_file *file_dev = &dev->dev_spec.file; + struct camdd_buf_data *data; + struct camdd_buf *buf; + off_t io_offset; + int retval = 0, write_dev = dev->write_dev; + int error_count = 0, no_resources = 0, double_buf_needed = 0; + uint32_t num_sectors = 0, db_len = 0; + + buf = STAILQ_FIRST(&dev->run_queue); + if (buf == NULL) { + no_resources = 1; + goto bailout; + } else if ((dev->write_dev == 0) + && (dev->flags & (CAMDD_DEV_FLAG_EOF | + CAMDD_DEV_FLAG_EOF_SENT))) { + STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links); + dev->num_run_queue--; + buf->status = CAMDD_STATUS_EOF; + error_count++; + goto bailout; + } + + /* + * If we're writing, we need to go through the source buffer list + * and create an S/G list. + */ + if (write_dev != 0) { + retval = camdd_buf_sg_create(buf, /*iovec*/ 1, + dev->sector_size, &num_sectors, &double_buf_needed); + if (retval != 0) { + no_resources = 1; + goto bailout; + } + } + + STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links); + dev->num_run_queue--; + + data = &buf->buf_type_spec.data; + + /* + * pread(2) and pwrite(2) offsets are byte offsets. + */ + io_offset = buf->lba * dev->sector_size; + + /* + * Unlock the mutex while we read or write. + */ + pthread_mutex_unlock(&dev->mutex); + + /* + * Note that we don't need to double buffer if we're the reader + * because in that case, we have allocated a single buffer of + * sufficient size to do the read. 
This copy is necessary on + * writes because if one of the components of the S/G list is not + * a sector size multiple, the kernel will reject the write. This + * is unfortunate but not surprising. So this will make sure that + * we're using a single buffer that is a multiple of the sector size. + */ + if ((double_buf_needed != 0) + && (data->sg_count > 1) + && (write_dev != 0)) { + uint32_t cur_offset; + int i; + + if (file_dev->tmp_buf == NULL) + file_dev->tmp_buf = calloc(dev->blocksize, 1); + if (file_dev->tmp_buf == NULL) { + buf->status = CAMDD_STATUS_ERROR; + error_count++; + goto bailout; + } + for (i = 0, cur_offset = 0; i < data->sg_count; i++) { + bcopy(data->iovec[i].iov_base, + &file_dev->tmp_buf[cur_offset], + data->iovec[i].iov_len); + cur_offset += data->iovec[i].iov_len; + } + db_len = cur_offset; + } + + if (file_dev->file_flags & CAMDD_FF_CAN_SEEK) { + if (write_dev == 0) { + /* + * XXX KDM is there any way we would need a S/G + * list here? + */ + retval = pread(file_dev->fd, data->buf, + buf->len, io_offset); + } else { + if (double_buf_needed != 0) { + retval = pwrite(file_dev->fd, file_dev->tmp_buf, + db_len, io_offset); + } else if (data->sg_count == 0) { + retval = pwrite(file_dev->fd, data->buf, + data->fill_len, io_offset); + } else { + retval = pwritev(file_dev->fd, data->iovec, + data->sg_count, io_offset); + } + } + } else { + if (write_dev == 0) { + /* + * XXX KDM is there any way we would need a S/G + * list here? 
+ */ + retval = read(file_dev->fd, data->buf, buf->len); + } else { + if (double_buf_needed != 0) { + retval = write(file_dev->fd, file_dev->tmp_buf, + db_len); + } else if (data->sg_count == 0) { + retval = write(file_dev->fd, data->buf, + data->fill_len); + } else { + retval = writev(file_dev->fd, data->iovec, + data->sg_count); + } + } + } + + /* We're done, re-acquire the lock */ + pthread_mutex_lock(&dev->mutex); + + if (retval >= (ssize_t)data->fill_len) { + /* + * If the bytes transferred is more than the request size, + * that indicates an overrun, which should only happen at + * the end of a transfer if we have to round up to a sector + * boundary. + */ + if (buf->status == CAMDD_STATUS_NONE) + buf->status = CAMDD_STATUS_OK; + data->resid = 0; + dev->bytes_transferred += retval; + } else if (retval == -1) { + warn("Error %s %s", (write_dev) ? "writing to" : + "reading from", file_dev->filename); + + buf->status = CAMDD_STATUS_ERROR; + data->resid = data->fill_len; + error_count++; + + if (dev->debug == 0) + goto bailout; + + if ((double_buf_needed != 0) + && (write_dev != 0)) { + fprintf(stderr, "%s: fd %d, DB buf %p, len %u lba %ju " + "offset %ju\n", __func__, file_dev->fd, + file_dev->tmp_buf, db_len, (uintmax_t)buf->lba, + (uintmax_t)io_offset); + } else if (data->sg_count == 0) { + fprintf(stderr, "%s: fd %d, buf %p, len %u, lba %ju " + "offset %ju\n", __func__, file_dev->fd, data->buf, + data->fill_len, (uintmax_t)buf->lba, + (uintmax_t)io_offset); + } else { + int i; + + fprintf(stderr, "%s: fd %d, len %u, lba %ju " + "offset %ju\n", __func__, file_dev->fd, + data->fill_len, (uintmax_t)buf->lba, + (uintmax_t)io_offset); + + for (i = 0; i < data->sg_count; i++) { + fprintf(stderr, "index %d ptr %p len %zu\n", + i, data->iovec[i].iov_base, + data->iovec[i].iov_len); + } + } + } else if (retval == 0) { + buf->status = CAMDD_STATUS_EOF; + if (dev->debug != 0) + printf("%s: got EOF from %s!\n", __func__, + file_dev->filename); + data->resid = 
data->fill_len; + error_count++; + } else if (retval < (ssize_t)data->fill_len) { + if (buf->status == CAMDD_STATUS_NONE) + buf->status = CAMDD_STATUS_SHORT_IO; + data->resid = data->fill_len - retval; + dev->bytes_transferred += retval; + } + +bailout: + if (buf != NULL) { + if (buf->status == CAMDD_STATUS_EOF) { + struct camdd_buf *buf2; + dev->flags |= CAMDD_DEV_FLAG_EOF; + STAILQ_FOREACH(buf2, &dev->run_queue, links) + buf2->status = CAMDD_STATUS_EOF; + } + + camdd_complete_buf(dev, buf, &error_count); + } + + if (error_count != 0) + return (-1); + else if (no_resources != 0) + return (1); + else + return (0); +} + +/* + * Execute one command from the run queue. Returns 0 for success, 1 for + * stop processing, and -1 for error. + */ +int +camdd_pass_run(struct camdd_dev *dev) +{ + struct camdd_buf *buf = NULL; + struct camdd_dev_pass *pass_dev = &dev->dev_spec.pass; + struct camdd_buf_data *data; + uint32_t num_blocks, sectors_used = 0; + union ccb *ccb; + int retval = 0, is_write = dev->write_dev; + int double_buf_needed = 0; + + buf = STAILQ_FIRST(&dev->run_queue); + if (buf == NULL) { + retval = 1; + goto bailout; + } + + /* + * If we're writing, we need to go through the source buffer list + * and create an S/G list. + */ + if (is_write != 0) { + retval = camdd_buf_sg_create(buf, /*iovec*/ 0,dev->sector_size, + §ors_used, &double_buf_needed); + if (retval != 0) { + retval = -1; + goto bailout; + } + } + + STAILQ_REMOVE(&dev->run_queue, buf, camdd_buf, links); + dev->num_run_queue--; + + data = &buf->buf_type_spec.data; + + ccb = &data->ccb; + bzero(&(&ccb->ccb_h)[1], + sizeof(struct ccb_scsiio) - sizeof(struct ccb_hdr)); + + /* + * In almost every case the number of blocks should be the device + * block size. The exception may be at the end of an I/O stream + * for a partial block or at the end of a device. 
+ */ + if (is_write != 0) + num_blocks = sectors_used; + else + num_blocks = data->fill_len / pass_dev->block_len; + + scsi_read_write(&ccb->csio, + /*retries*/ dev->retry_count, + /*cbfcnp*/ NULL, + /*tag_action*/ MSG_SIMPLE_Q_TAG, + /*readop*/ (dev->write_dev == 0) ? SCSI_RW_READ : + SCSI_RW_WRITE, + /*byte2*/ 0, + /*minimum_cmd_size*/ dev->min_cmd_size, + /*lba*/ buf->lba, + /*block_count*/ num_blocks, + /*data_ptr*/ (data->sg_count != 0) ? + (uint8_t *)data->segs : data->buf, + /*dxfer_len*/ (num_blocks * pass_dev->block_len), + /*sense_len*/ SSD_FULL_SIZE, + /*timeout*/ dev->io_timeout); + + /* Disable freezing the device queue */ + ccb->ccb_h.flags |= CAM_DEV_QFRZDIS; + + if (dev->retry_count != 0) + ccb->ccb_h.flags |= CAM_PASS_ERR_RECOVER; + + if (data->sg_count != 0) { + ccb->csio.sglist_cnt = data->sg_count; + ccb->ccb_h.flags |= CAM_DATA_SG; + } + + /* + * Store a pointer to the buffer in the CCB. The kernel will + * restore this when we get it back, and we'll use it to identify + * the buffer this CCB came from. + */ + ccb->ccb_h.ccb_buf = buf; + + /* + * Unlock our mutex in preparation for issuing the ioctl. + */ + pthread_mutex_unlock(&dev->mutex); + /* + * Queue the CCB to the pass(4) driver. 
+ */ + if (ioctl(pass_dev->dev->fd, CAMIOQUEUE, ccb) == -1) { + pthread_mutex_lock(&dev->mutex); + + warn("%s: error sending CAMIOQUEUE ioctl to %s%u", __func__, + pass_dev->dev->device_name, pass_dev->dev->dev_unit_num); + warn("%s: CCB address is %p", __func__, ccb); + retval = -1; + + STAILQ_INSERT_TAIL(&dev->free_queue, buf, links); + } else { + pthread_mutex_lock(&dev->mutex); + + dev->cur_active_io++; + STAILQ_INSERT_TAIL(&dev->active_queue, buf, links); + } + +bailout: + return (retval); +} + +int +camdd_get_next_lba_len(struct camdd_dev *dev, uint64_t *lba, ssize_t *len) +{ + struct camdd_dev_pass *pass_dev; + uint32_t num_blocks; + int retval = 0; + + pass_dev = &dev->dev_spec.pass; + + *lba = dev->next_io_pos_bytes / dev->sector_size; + *len = dev->blocksize; + num_blocks = *len / dev->sector_size; + + /* + * If max_sector is 0, then we have no set limit. This can happen + * if we're writing to a file in a filesystem, or reading from + * something like /dev/zero. + */ + if ((dev->max_sector != 0) + || (dev->sector_io_limit != 0)) { + uint64_t max_sector; + + if ((dev->max_sector != 0) + && (dev->sector_io_limit != 0)) + max_sector = min(dev->sector_io_limit, dev->max_sector); + else if (dev->max_sector != 0) + max_sector = dev->max_sector; + else + max_sector = dev->sector_io_limit; + + + /* + * Check to see whether we're starting off past the end of + * the device. If so, we need to just send an EOF + * notification to the writer. + */ + if (*lba > max_sector) { + *len = 0; + retval = 1; + } else if (((*lba + num_blocks) > max_sector + 1) + || ((*lba + num_blocks) < *lba)) { + /* + * If we get here (but pass the first check), we + * can trim the request length down to go to the + * end of the device. + */ + num_blocks = (max_sector + 1) - *lba; + *len = num_blocks * dev->sector_size; + retval = 1; + } + } + + dev->next_io_pos_bytes += *len; + + return (retval); +} + +/* + * Returns 0 for success, 1 for EOF detected, and -1 for failure. 
+ */ +int +camdd_queue(struct camdd_dev *dev, struct camdd_buf *read_buf) +{ + struct camdd_buf *buf = NULL; + struct camdd_buf_data *data; + struct camdd_dev_pass *pass_dev; + size_t new_len; + struct camdd_buf_data *rb_data; + int is_write = dev->write_dev; + int eof_flush_needed = 0; + int retval = 0; + int error; + + pass_dev = &dev->dev_spec.pass; + + /* + * If we've gotten EOF or our partner has, we should not continue + * queueing I/O. If we're a writer, though, we should continue + * to write any buffers that don't have EOF status. + */ + if ((dev->flags & CAMDD_DEV_FLAG_EOF) + || ((dev->flags & CAMDD_DEV_FLAG_PEER_EOF) + && (is_write == 0))) { + /* + * Tell the worker thread that we have seen EOF. + */ + retval = 1; + + /* + * If we're the writer, send the buffer back with EOF status. + */ + if (is_write) { + read_buf->status = CAMDD_STATUS_EOF; + + error = camdd_complete_peer_buf(dev, read_buf); + } + goto bailout; + } + + if (is_write == 0) { + buf = camdd_get_buf(dev, CAMDD_BUF_DATA); + if (buf == NULL) { + retval = -1; + goto bailout; + } + data = &buf->buf_type_spec.data; + + retval = camdd_get_next_lba_len(dev, &buf->lba, &buf->len); + if (retval != 0) { + buf->status = CAMDD_STATUS_EOF; + + if ((buf->len == 0) + && ((dev->flags & (CAMDD_DEV_FLAG_EOF_SENT | + CAMDD_DEV_FLAG_EOF_QUEUED)) != 0)) { + camdd_release_buf(buf); + goto bailout; + } + dev->flags |= CAMDD_DEV_FLAG_EOF_QUEUED; + } + + data->fill_len = buf->len; + data->src_start_offset = buf->lba * dev->sector_size; + + /* + * Put this on the run queue. + */ + STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); + dev->num_run_queue++; + + /* We're done. */ + goto bailout; + } + + /* + * Check for new EOF status from the reader. 
+ */ + if ((read_buf->status == CAMDD_STATUS_EOF) + || (read_buf->status == CAMDD_STATUS_ERROR)) { + dev->flags |= CAMDD_DEV_FLAG_PEER_EOF; + if ((STAILQ_FIRST(&dev->pending_queue) == NULL) + && (read_buf->len == 0)) { + camdd_complete_peer_buf(dev, read_buf); + retval = 1; + goto bailout; + } else + eof_flush_needed = 1; + } + + /* + * See if we have a buffer we're composing with pieces from our + * partner thread. + */ + buf = STAILQ_FIRST(&dev->pending_queue); + if (buf == NULL) { + uint64_t lba; + ssize_t len; + + retval = camdd_get_next_lba_len(dev, &lba, &len); + if (retval != 0) { + read_buf->status = CAMDD_STATUS_EOF; + + if (len == 0) { + dev->flags |= CAMDD_DEV_FLAG_EOF; + error = camdd_complete_peer_buf(dev, read_buf); + goto bailout; + } + } + + /* + * If we don't have a pending buffer, we need to grab a new + * one from the free list or allocate another one. + */ + buf = camdd_get_buf(dev, CAMDD_BUF_DATA); + if (buf == NULL) { + retval = 1; + goto bailout; + } + + buf->lba = lba; + buf->len = len; + + STAILQ_INSERT_TAIL(&dev->pending_queue, buf, links); + dev->num_pending_queue++; + } + + data = &buf->buf_type_spec.data; + + rb_data = &read_buf->buf_type_spec.data; + + if ((rb_data->src_start_offset != dev->next_peer_pos_bytes) + && (dev->debug != 0)) { + printf("%s: WARNING: reader offset %#jx != expected offset " + "%#jx\n", __func__, (uintmax_t)rb_data->src_start_offset, + (uintmax_t)dev->next_peer_pos_bytes); + } + dev->next_peer_pos_bytes = rb_data->src_start_offset + + (rb_data->fill_len - rb_data->resid); + + new_len = (rb_data->fill_len - rb_data->resid) + data->fill_len; + if (new_len < buf->len) { + /* + * There are three cases here: + * 1. We need more data to fill up a block, so we put + * this I/O on the queue and wait for more I/O. + * 2. We have a pending buffer in the queue that is + * smaller than our blocksize, but we got an EOF. So we + * need to go ahead and flush the write out. + * 3. We got an error. 
+ */ + + /* + * Increment our fill length. + */ + data->fill_len += (rb_data->fill_len - rb_data->resid); + + /* + * Add the new read buffer to the list for writing. + */ + STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links); + + /* Increment the count */ + buf->src_count++; + + if (eof_flush_needed == 0) { + /* + * We need to exit, because we don't have enough + * data yet. + */ + goto bailout; + } else { + /* + * Take the buffer off of the pending queue. + */ + STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, + links); + dev->num_pending_queue--; + + /* + * If we need an EOF flush, but there is no data + * to flush, go ahead and return this buffer. + */ + if (data->fill_len == 0) { + camdd_complete_buf(dev, buf, /*error_count*/0); + retval = 1; + goto bailout; + } + + /* + * Put this on the next queue for execution. + */ + STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); + dev->num_run_queue++; + } + } else if (new_len == buf->len) { + /* + * We have enough data to completey fill one block, + * so we're ready to issue the I/O. + */ + + /* + * Take the buffer off of the pending queue. + */ + STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, links); + dev->num_pending_queue--; + + /* + * Add the new read buffer to the list for writing. + */ + STAILQ_INSERT_TAIL(&buf->src_list, read_buf, src_links); + + /* Increment the count */ + buf->src_count++; + + /* + * Increment our fill length. + */ + data->fill_len += (rb_data->fill_len - rb_data->resid); + + /* + * Put this on the next queue for execution. 
+ */ + STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); + dev->num_run_queue++; + } else { + struct camdd_buf *idb; + struct camdd_buf_indirect *indirect; + uint32_t len_to_go, cur_offset; + + + idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT); + if (idb == NULL) { + retval = 1; + goto bailout; + } + indirect = &idb->buf_type_spec.indirect; + indirect->src_buf = read_buf; + read_buf->refcount++; + indirect->offset = 0; + indirect->start_ptr = rb_data->buf; + /* + * We've already established that there is more + * data in read_buf than we have room for in our + * current write request. So this particular chunk + * of the request should just be the remainder + * needed to fill up a block. + */ + indirect->len = buf->len - (data->fill_len - data->resid); + + camdd_buf_add_child(buf, idb); + + /* + * This buffer is ready to execute, so we can take + * it off the pending queue and put it on the run + * queue. + */ + STAILQ_REMOVE(&dev->pending_queue, buf, camdd_buf, + links); + dev->num_pending_queue--; + STAILQ_INSERT_TAIL(&dev->run_queue, buf, links); + dev->num_run_queue++; + + cur_offset = indirect->offset + indirect->len; + + /* + * The resulting I/O would be too large to fit in + * one block. We need to split this I/O into + * multiple pieces. Allocate as many buffers as needed. + */ + for (len_to_go = rb_data->fill_len - rb_data->resid - + indirect->len; len_to_go > 0;) { + struct camdd_buf *new_buf; + struct camdd_buf_data *new_data; + uint64_t lba; + ssize_t len; + + retval = camdd_get_next_lba_len(dev, &lba, &len); + if ((retval != 0) + && (len == 0)) { + /* + * The device has already been marked + * as EOF, and there is no space left. 
+ */ + goto bailout; + } + + new_buf = camdd_get_buf(dev, CAMDD_BUF_DATA); + if (new_buf == NULL) { + retval = 1; + goto bailout; + } + + new_buf->lba = lba; + new_buf->len = len; + + idb = camdd_get_buf(dev, CAMDD_BUF_INDIRECT); + if (idb == NULL) { + retval = 1; + goto bailout; + } + + indirect = &idb->buf_type_spec.indirect; + + indirect->src_buf = read_buf; + read_buf->refcount++; + indirect->offset = cur_offset; + indirect->start_ptr = rb_data->buf + cur_offset; + indirect->len = min(len_to_go, new_buf->len); +#if 0 + if (((indirect->len % dev->sector_size) != 0) + || ((indirect->offset % dev->sector_size) != 0)) { + warnx("offset %ju len %ju not aligned with " + "sector size %u", indirect->offset, + (uintmax_t)indirect->len, dev->sector_size); + } +#endif + cur_offset += indirect->len; + len_to_go -= indirect->len; + + camdd_buf_add_child(new_buf, idb); + + new_data = &new_buf->buf_type_spec.data; + + if ((new_data->fill_len == new_buf->len) + || (eof_flush_needed != 0)) { + STAILQ_INSERT_TAIL(&dev->run_queue, + new_buf, links); + dev->num_run_queue++; + } else if (new_data->fill_len < buf->len) { + STAILQ_INSERT_TAIL(&dev->pending_queue, + new_buf, links); + dev->num_pending_queue++; + } else { + warnx("%s: too much data in new " + "buffer!", __func__); + retval = 1; + goto bailout; + } + } + } + +bailout: + return (retval); +} + +void +camdd_get_depth(struct camdd_dev *dev, uint32_t *our_depth, + uint32_t *peer_depth, uint32_t *our_bytes, uint32_t *peer_bytes) +{ + *our_depth = dev->cur_active_io + dev->num_run_queue; + if (dev->num_peer_work_queue > + dev->num_peer_done_queue) + *peer_depth = dev->num_peer_work_queue - + dev->num_peer_done_queue; + else + *peer_depth = 0; + *our_bytes = *our_depth * dev->blocksize; + *peer_bytes = dev->peer_bytes_queued; +} + +void +camdd_sig_handler(int sig) +{ + if (sig == SIGINFO) + need_status = 1; + else { + need_exit = 1; + error_exit = 1; + } + + sem_post(&camdd_sem); +} + +void +camdd_print_status(struct camdd_dev 
*camdd_dev, struct camdd_dev *other_dev, + struct timespec *start_time) +{ + struct timespec done_time; + uint64_t total_ns; + long double mb_sec, total_sec; + int error = 0; + + error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &done_time); + if (error != 0) { + warn("Unable to get done time"); + return; + } + + timespecsub(&done_time, start_time); + + total_ns = done_time.tv_nsec + (done_time.tv_sec * 1000000000); + total_sec = total_ns; + total_sec /= 1000000000; + + fprintf(stderr, "%ju bytes %s %s\n%ju bytes %s %s\n" + "%.4Lf seconds elapsed\n", + (uintmax_t)camdd_dev->bytes_transferred, + (camdd_dev->write_dev == 0) ? "read from" : "written to", + camdd_dev->device_name, + (uintmax_t)other_dev->bytes_transferred, + (other_dev->write_dev == 0) ? "read from" : "written to", + other_dev->device_name, total_sec); + + mb_sec = min(other_dev->bytes_transferred,camdd_dev->bytes_transferred); + mb_sec /= 1024 * 1024; + mb_sec *= 1000000000; + mb_sec /= total_ns; + fprintf(stderr, "%.2Lf MB/sec\n", mb_sec); +} + +int +camdd_rw(struct camdd_io_opts *io_opts, int num_io_opts, uint64_t max_io, + int retry_count, int timeout) +{ + char *device = NULL; + struct cam_device *new_cam_dev = NULL; + struct camdd_dev *devs[2]; + struct timespec start_time; + pthread_t threads[2]; + int unit = 0; + int error = 0; + int i; + + if (num_io_opts != 2) { + warnx("Must have one input and one output path"); + error = 1; + goto bailout; + } + + bzero(devs, sizeof(devs)); + + for (i = 0; i < num_io_opts; i++) { + switch (io_opts[i].dev_type) { + case CAMDD_DEV_PASS: { + camdd_argmask new_arglist = CAMDD_ARG_NONE; + int bus = 0, target = 0, lun = 0; + char name[30]; + int rv; + + if (isdigit(io_opts[i].dev_name[0])) { + /* device specified as bus:target[:lun] */ + rv = parse_btl(io_opts[i].dev_name, &bus, + &target, &lun, &new_arglist); + if (rv < 2) { + warnx("numeric device specification " + "must be either bus:target, or " + "bus:target:lun"); + error = 1; + goto bailout; + } + /* default 
to 0 if lun was not specified */ + if ((new_arglist & CAMDD_ARG_LUN) == 0) { + lun = 0; + new_arglist |= CAMDD_ARG_LUN; + } + } else { + if (cam_get_device(io_opts[i].dev_name, name, + sizeof name, &unit) == -1) { + warnx("%s", cam_errbuf); + error = 1; + goto bailout; + } + device = strdup(name); + new_arglist |= CAMDD_ARG_DEVICE |CAMDD_ARG_UNIT; + } + + if (new_arglist & (CAMDD_ARG_BUS | CAMDD_ARG_TARGET)) + new_cam_dev = cam_open_btl(bus, target, lun, + O_RDWR, NULL); + else + new_cam_dev = cam_open_spec_device(device, unit, + O_RDWR, NULL); + if (new_cam_dev == NULL) { + warnx("%s", cam_errbuf); + error = 1; + goto bailout; + } + + devs[i] = camdd_probe_pass(new_cam_dev, + /*io_opts*/ &io_opts[i], + CAMDD_ARG_ERR_RECOVER, + /*probe_retry_count*/ 3, + /*probe_timeout*/ 5000, + /*io_retry_count*/ retry_count, + /*io_timeout*/ timeout); + if (devs[i] == NULL) { + warn("Unable to probe device %s%u", + new_cam_dev->device_name, + new_cam_dev->dev_unit_num); + error = 1; + goto bailout; + } + break; + } + case CAMDD_DEV_FILE: { + int fd = -1; + + if (io_opts[i].dev_name[0] == '-') { + if (io_opts[i].write_dev != 0) + fd = STDOUT_FILENO; + else + fd = STDIN_FILENO; + } else { + if (io_opts[i].write_dev != 0) { + fd = open(io_opts[i].dev_name, + O_RDWR | O_CREAT, S_IWUSR |S_IRUSR); + } else { + fd = open(io_opts[i].dev_name, + O_RDONLY); + } + } + if (fd == -1) { + warn("error opening file %s", + io_opts[i].dev_name); + error = 1; + goto bailout; + } + + devs[i] = camdd_probe_file(fd, &io_opts[i], + retry_count, timeout); + if (devs[i] == NULL) { + error = 1; + goto bailout; + } + + break; + } + default: + warnx("Unknown device type %d (%s)", + io_opts[i].dev_type, io_opts[i].dev_name); + error = 1; + goto bailout; + break; /*NOTREACHED */ + } + + devs[i]->write_dev = io_opts[i].write_dev; + + devs[i]->start_offset_bytes = io_opts[i].offset; + + if (max_io != 0) { + devs[i]->sector_io_limit = + (devs[i]->start_offset_bytes / + devs[i]->sector_size) + + (max_io / 
devs[i]->sector_size) - 1; + devs[i]->sector_io_limit = + (devs[i]->start_offset_bytes / + devs[i]->sector_size) + + (max_io / devs[i]->sector_size) - 1; + } + + devs[i]->next_io_pos_bytes = devs[i]->start_offset_bytes; + devs[i]->next_completion_pos_bytes =devs[i]->start_offset_bytes; + } + + devs[0]->peer_dev = devs[1]; + devs[1]->peer_dev = devs[0]; + devs[0]->next_peer_pos_bytes = devs[0]->peer_dev->next_io_pos_bytes; + devs[1]->next_peer_pos_bytes = devs[1]->peer_dev->next_io_pos_bytes; + + sem_init(&camdd_sem, /*pshared*/ 0, 0); + + signal(SIGINFO, camdd_sig_handler); + signal(SIGINT, camdd_sig_handler); + + error = clock_gettime(CLOCK_MONOTONIC_PRECISE, &start_time); + if (error != 0) { + warn("Unable to get start time"); + goto bailout; + } + + for (i = 0; i < num_io_opts; i++) { + error = pthread_create(&threads[i], NULL, camdd_worker, + (void *)devs[i]); + if (error != 0) { + warnc(error, "pthread_create() failed"); + goto bailout; + } + } + + for (;;) { + if ((sem_wait(&camdd_sem) == -1) + || (need_exit != 0)) { + struct kevent ke; + + for (i = 0; i < num_io_opts; i++) { + EV_SET(&ke, (uintptr_t)&devs[i]->work_queue, + EVFILT_USER, 0, NOTE_TRIGGER, 0, NULL); + + devs[i]->flags |= CAMDD_DEV_FLAG_EOF; + + error = kevent(devs[i]->kq, &ke, 1, NULL, 0, + NULL); + if (error == -1) + warn("%s: unable to wake up thread", + __func__); + error = 0; + } + break; + } else if (need_status != 0) { + camdd_print_status(devs[0], devs[1], &start_time); + need_status = 0; + } + } + for (i = 0; i < num_io_opts; i++) { + pthread_join(threads[i], NULL); + } + + camdd_print_status(devs[0], devs[1], &start_time); + +bailout: + + for (i = 0; i < num_io_opts; i++) + camdd_free_dev(devs[i]); + + return (error + error_exit); +} + +void +usage(void) +{ + fprintf(stderr, +"usage: camdd <-i|-o pass=pass0,bs=1M,offset=1M,depth=4>\n" +" <-i|-o file=/tmp/file,bs=512K,offset=1M>\n" +" <-i|-o file=/dev/da0,bs=512K,offset=1M>\n" +" <-i|-o file=/dev/nsa0,bs=512K>\n" +" [-C 
retry_count][-E][-m max_io_amt][-t timeout_secs][-v][-h]\n" +"Option description\n" +"-i <arg=val> Specify input device/file and parameters\n" +"-o <arg=val> Specify output device/file and parameters\n" +"Input and Output parameters\n" +"pass=name Specify a pass(4) device like pass0 or /dev/pass0\n" +"file=name Specify a file or device, /tmp/foo, /dev/da0, /dev/null\n" +" or - for stdin/stdout\n" +"bs=blocksize Specify blocksize in bytes, or using K, M, G, etc. suffix\n" +"offset=len Specify starting offset in bytes or using K, M, G suffix\n" +" NOTE: offset cannot be specified on tapes, pipes, stdin/out\n" +"depth=N Specify a numeric queue depth. This only applies to pass(4)\n" +"mcs=N Specify a minimum cmd size for pass(4) read/write commands\n" +"Optional arguments\n" +"-C retry_cnt Specify a retry count for pass(4) devices\n" +"-E Enable CAM error recovery for pass(4) devices\n" +"-m max_io Specify the maximum amount to be transferred in bytes or\n" +" using K, G, M, etc. suffixes\n" +"-t timeout Specify the I/O timeout to use with pass(4) devices\n" +"-v Enable verbose error recovery\n" +"-h Print this message\n"); +} + + +int +camdd_parse_io_opts(char *args, int is_write, struct camdd_io_opts *io_opts) +{ + char *tmpstr, *tmpstr2; + char *orig_tmpstr = NULL; + int retval = 0; + + io_opts->write_dev = is_write; + + tmpstr = strdup(args); + if (tmpstr == NULL) { + warn("strdup failed"); + retval = 1; + goto bailout; + } + orig_tmpstr = tmpstr; + while ((tmpstr2 = strsep(&tmpstr, ",")) != NULL) { + char *name, *value; + + /* + * If the user creates an empty parameter by putting in two + * commas, skip over it and look for the next field. 
+ */ + if (*tmpstr2 == '\0') + continue; + + name = strsep(&tmpstr2, "="); + if (*name == '\0') { + warnx("Got empty I/O parameter name"); + retval = 1; + goto bailout; + } + value = strsep(&tmpstr2, "="); + if ((value == NULL) + || (*value == '\0')) { + warnx("Empty I/O parameter value for %s", name); + retval = 1; + goto bailout; + } + if (strncasecmp(name, "file", 4) == 0) { + io_opts->dev_type = CAMDD_DEV_FILE; + io_opts->dev_name = strdup(value); + if (io_opts->dev_name == NULL) { + warn("Error allocating memory"); + retval = 1; + goto bailout; + } + } else if (strncasecmp(name, "pass", 4) == 0) { + io_opts->dev_type = CAMDD_DEV_PASS; + io_opts->dev_name = strdup(value); + if (io_opts->dev_name == NULL) { + warn("Error allocating memory"); + retval = 1; + goto bailout; + } + } else if ((strncasecmp(name, "bs", 2) == 0) + || (strncasecmp(name, "blocksize", 9) == 0)) { + retval = expand_number(value, &io_opts->blocksize); + if (retval == -1) { + warn("expand_number(3) failed on %s=%s", name, + value); + retval = 1; + goto bailout; + } + } else if (strncasecmp(name, "depth", 5) == 0) { + char *endptr; + + io_opts->queue_depth = strtoull(value, &endptr, 0); + if (*endptr != '\0') { + warnx("invalid queue depth %s", value); + retval = 1; + goto bailout; + } + } else if (strncasecmp(name, "mcs", 3) == 0) { + char *endptr; + + io_opts->min_cmd_size = strtol(value, &endptr, 0); + if ((*endptr != '\0') + || ((io_opts->min_cmd_size > 16) + || (io_opts->min_cmd_size < 0))) { + warnx("invalid minimum cmd size %s", value); + retval = 1; + goto bailout; + } + } else if (strncasecmp(name, "offset", 6) == 0) { + retval = expand_number(value, &io_opts->offset); + if (retval == -1) { + warn("expand_number(3) failed on %s=%s", name, + value); + retval = 1; + goto bailout; + } + } else if (strncasecmp(name, "debug", 5) == 0) { + char *endptr; + + io_opts->debug = strtoull(value, &endptr, 0); + if (*endptr != '\0') { + warnx("invalid debug level %s", value); + retval = 1; + goto 
bailout; + } + } else { + warnx("Unrecognized parameter %s=%s", name, value); + } + } +bailout: + free(orig_tmpstr); + + return (retval); +} + +int +main(int argc, char **argv) +{ + int c; + camdd_argmask arglist = CAMDD_ARG_NONE; + int timeout = 0, retry_count = 1; + int error = 0; + uint64_t max_io = 0; + struct camdd_io_opts *opt_list = NULL; + + if (argc == 1) { + usage(); + exit(1); + } + + opt_list = calloc(2, sizeof(struct camdd_io_opts)); + if (opt_list == NULL) { + warn("Unable to allocate option list"); + error = 1; + goto bailout; + } + + while ((c = getopt(argc, argv, "C:Ehi:m:o:t:v")) != -1){ + switch (c) { + case 'C': + retry_count = strtol(optarg, NULL, 0); + if (retry_count < 0) + errx(1, "retry count %d is < 0", + retry_count); + arglist |= CAMDD_ARG_RETRIES; + break; + case 'E': + arglist |= CAMDD_ARG_ERR_RECOVER; + break; + case 'i': + case 'o': + if (((c == 'i') + && (opt_list[0].dev_type != CAMDD_DEV_NONE)) + || ((c == 'o') + && (opt_list[1].dev_type != CAMDD_DEV_NONE))) { + errx(1, "Only one input and output path " + "allowed"); + } + error = camdd_parse_io_opts(optarg, (c == 'o') ? 1 : 0, + (c == 'o') ? &opt_list[1] : &opt_list[0]); + if (error != 0) + goto bailout; + break; + case 'm': + error = expand_number(optarg, &max_io); + if (error == -1) { + warn("invalid maximum I/O amount %s", optarg); + error = 1; + goto bailout; + } + break; + case 't': + timeout = strtol(optarg, NULL, 0); + if (timeout < 0) + errx(1, "invalid timeout %d", timeout); + /* Convert the timeout from seconds to ms */ + timeout *= 1000; + arglist |= CAMDD_ARG_TIMEOUT; + break; + case 'v': + arglist |= CAMDD_ARG_VERBOSE; + break; + case 'h': + default: + usage(); + exit(1); + break; /*NOTREACHED*/ + } + } + + if ((opt_list[0].dev_type == CAMDD_DEV_NONE) + || (opt_list[1].dev_type == CAMDD_DEV_NONE)) + errx(1, "Must specify both -i and -o"); + + /* + * Set the timeout if the user hasn't specified one. 
+ */ + if (timeout == 0) + timeout = CAMDD_PASS_RW_TIMEOUT; + + error = camdd_rw(opt_list, 2, max_io, retry_count, timeout); + +bailout: + free(opt_list); + + exit(error); +} |